#!/usr/bin/python # aaec_cartoons.py # --copyright-- Copyright 2007 (C) Tranzoa, Co. All rights reserved. Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality. # --url-- http://www.tranzoa.net/tzpython/ # --email-- pycode is the name to send to. tranzoa.com is the place to send to. # --bodstamps-- # August 6, 2005 bar # November 18, 2007 bar turn on doxygen # November 27, 2007 bar insert boilerplate copyright # May 17, 2008 bar email adr # November 29, 2011 bar pyflake cleanup # May 27, 2012 bar doxygen namespace # February 23, 2023 bar get rid of has_keys # March 5, 2023 bar remove pyflakes fuss # March 5, 2023 bar future print # --eodstamps-- ## \file # \namespace tzpython.tzpython.aaec_cartoons # # # This script creates a web page that contains the newest N editorial cartoons and aaeconline.org # # from __future__ import print_function import re import urllib2 import url_getter import urls2dir # sucks in code to allow user agent to be set urls2dir # make pyflakes happy home_url = "http://aaeconline.org" urls_re = re.compile(r'.*?src\s*=\s*[\"\'](/CFC/toonimage\.cfm\?cartoonist=[^&]+&date=\d+-\d+-\d+&filename=[^\&]+)&thumb=1[\"\'].*?\s*([^<]+)', re.DOTALL + re.IGNORECASE) def get_all_info(htm) : """ Gets an array of [ name, cartoon_page_url, img_url ] for all cartoons in the page. """ retval = [] if not htm : return(retval) info = urls_re.findall(htm) if not info : return(retval) for n in info : nn = [ n[2], home_url + n[0], home_url + n[1] ] retval.append(nn) return(retval) # # %u == 1, 11, 21 ... 
#
#
new_cartoons_url    = home_url + "/cartoon/browse.cfm/Regular/?count=%u"


def get_main_page(index = 1, timeout = None) :
    """
        Fetch one "browse" page and return its text.

        'index' is substituted in to new_cartoons_url as the "count" value
        (the caller in this file steps it 1, 11, 21 ...).

        'timeout' is handed, as is, to url_getter.url_open_read_with_timeout().

        Returns None if nothing came back, if the page is only white space,
        or if get_all_info() finds no cartoons in it.
    """

    req = urllib2.Request(new_cartoons_url % ( index ))
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.5) Gecko/20031007')

    htm = url_getter.url_open_read_with_timeout(req, timeout)

    if  (not htm) or (not htm.strip()) or (not get_all_info(htm)) :
        return(None)

    return(htm)


def get_all_names_and_urls(idx = 1, print_errs = False, timeout = None) :
    """
        Return the get_all_info() array for browse page 'idx'.

        Returns an empty array if the page cannot be fetched or has no cartoons in it.
        If 'print_errs' is true, the reason is printed to stdout.
    """

    htm = get_main_page(idx, timeout)
    if  not htm :
        if  print_errs :
            print("Did not get main page")
        return([])

    info    = get_all_info(htm)
    if  (not info) and print_errs :
        print("Did not get info")

    return(info)


def _get_cfm_numer(s) :
    """ Return the run of digits at the very end of 's' (as a string), or None if 's' does not end in a digit. """

    g   = re.search(r"(\d+)$", s)
    if  g :
        return(g.group(1))

    return(None)



if  __name__ == '__main__' :

    import  os
    import  sys
    import  time

    import  replace_file
    import  TZCommandLineAtFile
    import  tzlib


    del(sys.argv[0])                        # from here on, sys.argv[0] is the first real command line argument

    TZCommandLineAtFile.expand_at_sign_command_line_files(sys.argv)


    if  False :                             # debugging aid: parse a saved page instead of fetching anything
        with open(sys.argv[0], "r") as fi :
            htm = fi.read()

        info    = get_all_info(htm)
        info.sort()
        print("len=", len(info))
        print(info)


    if  True :
        if  len(sys.argv) != 1 :
            print("Tell me an output htm file name!")
            sys.exit(101)

        htm_name    = sys.argv[0]

        hname       = None                  # name of a file listing cartoon numbers already seen (None: keep no history)
        hist        = {}
        if  hname and os.path.isfile(hname) :
            with open(hname, "r") as fi :
                hist    = fi.read()
            hist        = tzlib.make_dictionary(hist.split())

        newones     = []                    # [ anchor_name, cartoon_page_url, img_url, name ] for each cartoon not in the history
        newhist     = []                    # the cartoon numbers added to 'hist' during this run

        for i in range(1, 200, 10) :
            info    = get_all_names_and_urls(i, True)
            if  not info :
                break                       # no new cartoons

            for n in info :
                cfm = _get_cfm_numer(n[1])
                if  cfm and (cfm not in hist) :
                    hist[cfm]   = True
                    newhist.append(cfm)

                    #   the anchor name is the cartoonist's name, white space squeezed out, with the cartoon number tacked on
                    newones.append([ re.sub(r"\s+", "", n[0] + cfm), n[1], n[2], n[0] ])

            if  (not newones) or (len(newones) >= 100) :
                break                       # enough or no new ones


        if  hname and newhist :
            with open(hname, "a") as fo :   # "a" creates the file if it is not there yet
                for cfm in newhist :
                    print(cfm, file = fo)


        if  htm_name and newones :

            #
            #   NOTE(review): the copy of this file under review had lost the HTML inside the
            #   string literals below, leaving format strings with fewer %s than arguments
            #   (a TypeError at run time).  The markup here is reconstructed from what survived:
            #   the title, rows of 5 thumbnails, and a (name, name, page url, image url, text, text)
            #   argument list.  Check it against the canonical source.
            #
            thumbs_per_row  = 5

            tname           = htm_name + ".tmp"
            with open(tname, "w") as fo :
                print("<html>\n<head>\n<title>AAEC Editorial Cartoons</title>\n</head>\n<body>\n<table>", file = fo)

                tde = ""                    # what closes the current table row (nothing until a row has been opened)
                tdi = 10000                 # cells in the current row - starts huge to force a row open on the first cartoon
                for ni in newones :
                    if  tdi >= thumbs_per_row :
                        print(tde,    file = fo)
                        print("<tr>", file = fo)
                        tde = "</tr>"
                        tdi = 0
                    print('<td><a href="#%s"><img src="%s&amp;thumb=1"></a></td>' % ( ni[0], ni[2] ), file = fo)
                    tdi += 1
                print(tde, file = fo)

                print("</table>\n<hr>", file = fo)

                for ni in newones :
                    print('<p>\n<a name="%s" id="%s"></a><a href="%s"><img src="%s"></a><br>\n%s %s' % ( ni[0], ni[0], ni[1], ni[2], tzlib.safe_html(ni[3]), tzlib.safe_html(ni[1]) ), file = fo)

                print("<hr>\n<p>\nCreated: " + time.ctime() + "\n</body>\n</html>", file = fo)

            #   the page is written to a temp file and only then swapped in (the old page is kept as .bak)
            replace_file.replace_file(htm_name, tname, htm_name + ".bak")


#
#
#
# eof