#!/usr/bin/python

# cagle_cartoons.py
#       --copyright--                   Copyright 2007 (C) Tranzoa, Co. All rights reserved.    Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality.
#       --url--                         http://www.tranzoa.net/tzpython/
#       --email--                       pycode is the name to send to. tranzoa.com is the place to send to.
#       --bodstamps--
#       August 6, 2005          bar
#       November 18, 2007       bar     turn on doxygen
#       November 27, 2007       bar     insert boilerplate copyright
#       May 17, 2008            bar     email adr
#       May 27, 2012            bar     doxygen namespace
#       March 5, 2023           bar     remove pyflakes fuss
#       March 5, 2023           bar     future print
#       --eodstamps--
##      \file
#       \namespace              tzpython.cagle_cartoons
#
#
#       This script creates a web page that contains all cagle.com political cartoons.
#
#

from __future__ import print_function

import random
import re
import urllib2

import url_getter
import urls2dir # sucks in code to allow user agent to be set
urls2dir # make pyflakes happy


# NOTE(review): this pattern is not referenced anywhere in the visible text, while
# get_all_names() and image_htm_re are used below without being defined.
# Text appears to be missing between this assignment and get_htm_url() - restore
# it from the upstream file before running.
names_re = re.compile(r"\s*(?:<[^>]+>\s*)?", re.DOTALL)


def get_htm_url(PCcartoons_page) :
    """
        Search a cartoon page's text with image_htm_re and return capture group 1.

        Returns None when the page is empty/None or when the pattern does not match.

        NOTE(review): image_htm_re is not defined in the visible text, so calling
        this with a non-empty page raises NameError as the file stands.
    """
    if not PCcartoons_page :
        return(None)
    g = image_htm_re.search(PCcartoons_page)
    if not g :
        return(None)
    return(g.group(1))


# NOTE(review): the pattern text is empty. An empty pattern matches everywhere and
# has no capture group, so g.group(1) in get_image_url() raises IndexError.
# The original pattern appears to have been lost - restore it from upstream.
image_url_re = re.compile(r"", re.DOTALL)


def get_image_url(PCcartoons_page) :
    """
        Search a cartoon page's text with image_url_re and return capture group 1.

        Returns None when the page is empty/None or when the pattern does not match.
    """
    if not PCcartoons_page :
        return(None)
    g = image_url_re.search(PCcartoons_page)
    if not g :
        return(None)
    return(g.group(1))


# Group 1 captures the image file extension ("gif" or "jpg") of a caglecards link.
# NOTE(review): the pattern ends with a closing quote and '>' but has no opening
# tag text; its leading part may have been lost like the patterns above - confirm.
image_type_re = re.compile(r"http://www\.cagle\.com/caglecards/main\.asp\?image=http://www\.cagle\.com/working/\d+/[^\.]+\.(gif|jpg)\"\s*>", re.DOTALL)


def get_image_type(PCcartoons_page) :
    """
        Return the image extension ("gif" or "jpg") matched by image_type_re in the page text.

        Returns None when the page is empty/None or when the pattern does not match.
    """
    if not PCcartoons_page :
        return(None)
    g = image_type_re.search(PCcartoons_page)
    if not g :
        return(None)
    return(g.group(1))


def get_cartoon_image(url, timeout = None) :
    """
        Fetch 'url' through url_getter.url_open_read_with_timeout() using a browser-like User-Agent header.

        A falsy 'url' is returned unchanged without any request being made.
        A falsy fetch result is normalized to None; otherwise the helper's result is
        returned as-is (presumably the raw image data - confirm in url_getter).
    """
    if not url :
        return(url)
    req = urllib2.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.5) Gecko/20031007')
    img = url_getter.url_open_read_with_timeout(req, timeout)
    if not img :
        img = None
    return(img)


def PCcartoon_page_url(name) :
    """
        Build the URL of the PCcartoons .asp page for the given name.
    """
    return("http://www.cagle.com/politicalcartoons/PCcartoons/" + name + ".asp")


def get_PCcartoon_page(name, timeout = None) :
    """
        Fetch the PCcartoons page for 'name' and return its text.

        Returns None when 'name' is falsy, when the fetch yields nothing, when the
        page is only whitespace, or when get_image_url() finds no image URL in it.
    """
    if not name :
        return(None)
    url = PCcartoon_page_url(name)
    req = urllib2.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.5) Gecko/20031007')
    htm = url_getter.url_open_read_with_timeout(req, timeout)
    if htm :
        if len(htm.strip()) == 0 :
            htm = None
        elif not get_image_url(htm) :
            # a page without an image URL is treated the same as a failed fetch
            htm = None
        pass
    else :
        htm = None
    return(htm)


def get_main_page(timeout = None) :
    """
        Fetch the politicalcartoons main page and return its text.

        Returns None when the fetch yields nothing, when the page is only
        whitespace, or when get_all_names() finds no names in it.

        NOTE(review): get_all_names() is not defined in the visible text.
    """
    url = "http://www.cagle.com/politicalcartoons/main.asp"
    req = urllib2.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.5) Gecko/20031007')
    htm = url_getter.url_open_read_with_timeout(req, timeout)
    if htm :
        if len(htm.strip()) == 0 :
            htm = None
        else :
            names = get_all_names(htm)
            if (not names) or (len(names) == 0) :
                # a main page that lists no names is treated as a failed fetch
                htm = None
            pass
    else :
        htm = None
    return(htm)


def get_all_names_and_urls(print_errs = False, timeout = None) :
    """
        Gets an array of [ name, htm_url, img_url ] for all current names.

        The main page is fetched first; each name found on it then has its own
        page fetched. Names whose page or image URL cannot be obtained are left out.
        Pages are requested in shuffled order, but the returned list is sorted.

        'print_errs' turns on a printed message for each failure.
        'timeout' is passed through to the page fetchers.

        Returns an empty list when the main page cannot be fetched.
    """
    retval = []
    htm = get_main_page(timeout)
    if not htm :
        if print_errs :
            print( "Did not get main page")
    else :
        names = get_all_names(htm)
        if (not names) or (len(names) == 0) :
            if print_errs :
                print("Did not get names")
            pass
        # NOTE(review): execution continues here even after the "Did not get names"
        # branch; get_main_page() has already rejected pages without names, so this
        # presumably never sees None - confirm.
        random.shuffle(names)
        for n in names :
            htm = get_PCcartoon_page(n, timeout)
            if not htm :
                if print_errs :
                    print("Did not get htm for", n)
            else :
                htm_url = PCcartoon_page_url(n)
                img_url = get_image_url(htm)
                if not img_url :
                    if print_errs :
                        print("Did not get img url for", n)
                else :
                    ni = [ n, htm_url, img_url ]
                    retval.append(ni)
                pass
            pass
        pass
    retval.sort()
    return(retval)


# Script entry: writes a web page (file name given as the single argument) that
# lists every cartoon returned by get_all_names_and_urls().
if __name__ == '__main__' :
    import glob
    import sys
    import time

    import replace_file
    import TZCommandLineAtFile
    import tzlib

    # drop the script name so that sys.argv holds only the user's arguments
    del(sys.argv[0])

    TZCommandLineAtFile.expand_at_sign_command_line_files(sys.argv)

    # Disabled branch: runs the parsers over local files matched by the arguments
    # and, for the 50th name only, downloads and saves that cartoon's image.
    if False :
        for an in sys.argv :
            fnames = glob.glob(an)
            for fn in fnames :
                fi = open(fn, "rb")
                htm = fi.read()
                fi.close()
                if True :
                    names = get_all_names(htm)
                    print(fn, len(names))
                    li = 0
                    for n in names :
                        li += 1
                        print( li, n)
                        if li == 50 :
                            htm = get_PCcartoon_page(n)
                            if htm :
                                print(" ", get_image_type(htm), get_image_url(htm))
                                img = get_cartoon_image(get_image_url(htm))
                                fo = open(n + "." + get_image_type(htm), "wb")
                                fo.write(img)
                                fo.close()
                            pass
                        pass
                if False :
                    print(fn, get_image_url(htm))
                pass
            pass
        pass
    if True :
        # exactly one argument is required: the output file name
        if len(sys.argv) != 1 :
            print("Tell me the web page to create!")
            sys.exit(101)
        info = get_all_names_and_urls(True)
        fn = sys.argv[0]
        # the page is written to a temporary file and swapped in at the end
        tn = fn + ".tmp"
        fo = open(tn, "w")
        tde = ""
        # starts above the threshold so the first loop pass takes the 'tdi >= 5' branch
        tdi = 10000
        # NOTE(review): the quoted literals from here to the end of this branch
        # contain raw line breaks and almost no text; the HTML markup they held
        # appears to have been lost. Restore them from the upstream file.
        print("
Political Cartoons
", file = fo)
        # first pass: one entry per cartoon, with a break every 5 entries
        # (presumably table rows of thumbnails - confirm against upstream)
        for ni in info :
            if tdi >= 5 :
                print(tde, file = fo)
                print("", file = fo)
                tde = ""
                tdi = 0
            # NOTE(review): the format string has no conversion specifiers for its
            # two arguments, so '%' raises TypeError as written.
            print('' % ( ni[0], ni[2] ), file = fo)
            tdi += 1
        print(tde, file = fo)
        print("
", file = fo)
        # second pass: one section per cartoon
        for ni in info :
            # NOTE(review): five arguments but a single '%s' is visible, so '%'
            # raises TypeError as written.
            print('
%s' % ( ni[0], ni[0], ni[1], ni[2], tzlib.safe_html(ni[1]) ), file = fo)
        print("
Created: " + time.ctime() + "
", file = fo)
        fo.close()
        # presumably moves the temp file over 'fn', keeping the old page as fn + ".bak" - confirm in replace_file
        replace_file.replace_file(fn, tn, fn + ".bak")
    pass

#
#
#
# eof