#!/usr/bin/python
# cagle_cartoons.py
# --copyright-- Copyright 2007 (C) Tranzoa, Co. All rights reserved. Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality.
# --url-- http://www.tranzoa.net/tzpython/
# --email-- pycode is the name to send to. tranzoa.com is the place to send to.
# --bodstamps--
# August 6, 2005 bar
# November 18, 2007 bar turn on doxygen
# November 27, 2007 bar insert boilerplate copyright
# May 17, 2008 bar email adr
# May 27, 2012 bar doxygen namespace
# March 5, 2023 bar remove pyflakes fuss
# March 5, 2023 bar future print
# --eodstamps--
## \file
# \namespace tzpython.cagle_cartoons
#
#
# This script creates a web page that contains all cagle.com political cartoons.
#
#
from __future__ import print_function
import random
import re
import urllib2
import url_getter
import urls2dir # sucks in code to allow user agent to be set
urls2dir # make pyflakes happy
# Compiled pattern; judging by its name it is meant for extracting cartoon
# names, but nothing visible in this file references it -- TODO confirm its
# consumer.
# NOTE(review): the pattern has no capture group and every element in it is
# optional, so it can match the empty string; it looks truncated -- verify
# against the original source.
names_re = re.compile(r"\s*(?:<[^>]+>\s*)?", re.DOTALL)
def get_htm_url(PCcartoons_page):
    """
    Return group 1 of the first image_htm_re match in the given page text.

    Returns None when the page is None/empty or when the pattern does not
    match.

    NOTE(review): image_htm_re is not defined anywhere in the visible part of
    this file, so any non-empty page currently raises NameError -- confirm
    where that pattern is supposed to come from.
    """
    if PCcartoons_page:
        found = image_htm_re.search(PCcartoons_page)
        if found:
            return found.group(1)
    return None
# NOTE(review): this pattern is empty and defines no capture group, so it
# matches at the start of any text and the group(1) call in get_image_url()
# raises IndexError for every non-empty page.  The pattern text looks lost;
# restore it from the original source.
image_url_re = re.compile(r"", re.DOTALL)


def get_image_url(PCcartoons_page):
    """
    Return group 1 of the first image_url_re match in the given page text.

    Returns None when the page is None/empty or when the pattern does not
    match.
    """
    if PCcartoons_page:
        found = image_url_re.search(PCcartoons_page)
        if found:
            return found.group(1)
    return None
# Matches a caglecards "main.asp?image=..." link and captures the extension
# ("gif" or "jpg") of the image file the link names.
image_type_re = re.compile(r"http://www\.cagle\.com/caglecards/main\.asp\?image=http://www\.cagle\.com/working/\d+/[^\.]+\.(gif|jpg)\"\s*>", re.DOTALL)


def get_image_type(PCcartoons_page):
    """
    Return the image extension ("gif" or "jpg") found in a cartoon page.

    The extension is taken from the first caglecards link that
    image_type_re matches.  Returns None when the page is None/empty or
    when no such link is present.
    """
    if PCcartoons_page:
        found = image_type_re.search(PCcartoons_page)
        if found:
            return found.group(1)
    return None
def get_cartoon_image(url, timeout=None):
    """
    Fetch the data at url and return it, or None if nothing came back.

    A falsy url is handed straight back without any request being made.
    Otherwise the request is sent with a browser-style User-Agent header
    through url_getter.url_open_read_with_timeout(), which also receives
    timeout unchanged.  Any falsy result from that call becomes None.
    """
    if not url:
        return url
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.5) Gecko/20031007')
    data = url_getter.url_open_read_with_timeout(request, timeout)
    return data if data else None
def PCcartoon_page_url(name):
    """
    Build the URL of the PCcartoons .asp page for the given name.
    """
    prefix = "http://www.cagle.com/politicalcartoons/PCcartoons/"
    suffix = ".asp"
    return prefix + name + suffix
def get_PCcartoon_page(name, timeout=None):
    """
    Fetch the PCcartoons page for name and return its text, or None.

    None is returned when name is falsy, when the fetch yields nothing,
    when the page is only whitespace, or when get_image_url() finds no
    image URL in the page.  The request carries a browser-style User-Agent
    header and timeout is passed through to
    url_getter.url_open_read_with_timeout().
    """
    if not name:
        return None
    request = urllib2.Request(PCcartoon_page_url(name))
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.5) Gecko/20031007')
    page = url_getter.url_open_read_with_timeout(request, timeout)
    if not page:
        return None
    if len(page.strip()) == 0:
        return None
    if not get_image_url(page):
        return None
    return page
def get_main_page(timeout=None):
    """
    Fetch the political cartoons main page and return its text, or None.

    None is returned when the fetch yields nothing, when the page is only
    whitespace, or when get_all_names() finds no names in it.  The request
    carries a browser-style User-Agent header and timeout is passed through
    to url_getter.url_open_read_with_timeout().

    NOTE(review): get_all_names() is not defined in the visible part of
    this file -- confirm where it lives.
    """
    request = urllib2.Request("http://www.cagle.com/politicalcartoons/main.asp")
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.5) Gecko/20031007')
    page = url_getter.url_open_read_with_timeout(request, timeout)
    if not page:
        return None
    if len(page.strip()) == 0:
        return None
    found = get_all_names(page)
    if found and len(found) != 0:
        return page
    return None
def get_all_names_and_urls(print_errs=False, timeout=None):
    """
    Gets an array of [ name, htm_url, img_url ] for all current names.

    print_errs  When true, a line is printed for each thing that could not
                be fetched or parsed.
    timeout     Passed through to get_main_page() and get_PCcartoon_page().

    Returns the list sorted; it is empty when the main page or the name
    list could not be obtained.  Names whose page or image URL cannot be
    obtained are left out.
    """
    retval = []

    htm = get_main_page(timeout)
    if not htm:
        if print_errs:
            print("Did not get main page")
        return retval

    names = get_all_names(htm)
    if not names:
        if print_errs:
            print("Did not get names")
        # Stop here: handing None to random.shuffle() below would raise
        # TypeError instead of yielding the documented (empty) array.
        return retval

    # Pages are requested in random order; the result is sorted afterwards,
    # so the shuffle does not affect the order of the returned array.
    random.shuffle(names)
    for n in names:
        page = get_PCcartoon_page(n, timeout)
        if not page:
            if print_errs:
                print("Did not get htm for", n)
            continue
        img_url = get_image_url(page)
        if not img_url:
            if print_errs:
                print("Did not get img url for", n)
            continue
        retval.append([n, PCcartoon_page_url(n), img_url])

    retval.sort()
    return retval
# Command-line entry point: writes a web page of the current cartoons to the
# file named by the single command-line argument.
# NOTE(review): this section has lost its indentation, and several string
# literals below are empty or broken across lines -- the markup they held
# appears to be missing.  Restore them from the original before running.
if __name__ == '__main__' :
import glob
import sys
import time
import replace_file
import TZCommandLineAtFile
import tzlib
# Drop the script name so sys.argv holds only the user's arguments.
del(sys.argv[0])
# Hands sys.argv to TZCommandLineAtFile (presumably to expand @file
# arguments, going by the helper's name -- TODO confirm).
TZCommandLineAtFile.expand_at_sign_command_line_files(sys.argv)
# Disabled debugging path: reads the files matching each argument glob and
# lists the names get_all_names() finds in them.  For the 50th name only, the
# cartoon page and image are fetched and the image is written to
# "<name>.<image type>".
if False :
for an in sys.argv :
fnames = glob.glob(an)
for fn in fnames :
fi = open(fn, "rb")
htm = fi.read()
fi.close()
if True :
names = get_all_names(htm)
print(fn, len(names))
li = 0
for n in names :
li += 1
print( li, n)
if li == 50 :
htm = get_PCcartoon_page(n)
if htm :
print(" ", get_image_type(htm), get_image_url(htm))
img = get_cartoon_image(get_image_url(htm))
fo = open(n + "." + get_image_type(htm), "wb")
fo.write(img)
fo.close()
pass
pass
if False :
print(fn, get_image_url(htm))
pass
pass
pass
# Active path: exactly one argument, the output file name, is required.
if True :
if len(sys.argv) != 1 :
print("Tell me the web page to create!")
sys.exit(101)
info = get_all_names_and_urls(True)
fn = sys.argv[0]
# Output goes to a temporary file first; at the end replace_file.replace_file()
# is given the target name, the temporary name and a ".bak" name.
tn = fn + ".tmp"
fo = open(tn, "w")
# tde is printed at each break below and once after the first loop; it is
# empty until the first break has happened.
tde = ""
# tdi starts above the threshold of 5 so the first loop pass takes the
# "tdi >= 5" branch; after that the branch runs on every 5th entry.
tdi = 10000
print("Political Cartoons
", file = fo)
for ni in info :
if tdi >= 5 :
print(tde, file = fo)
print("
", file = fo)
tde = " "
tdi = 0
print('' % ( ni[0], ni[2] ), file = fo)
tdi += 1
print(tde, file = fo)
print("
", file = fo) for ni in info : print('
%s' % ( ni[0], ni[0], ni[1], ni[2], tzlib.safe_html(ni[1]) ), file = fo) print("
Created: " + time.ctime() + "
", file = fo) fo.close() replace_file.replace_file(fn, tn, fn + ".bak") pass # # # # eof