#!/usr/bin/python
# townhall_comics.py
# --copyright-- Copyright 2017 (C) Tranzoa, Co. All rights reserved. Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality.
# --url-- http://www.tranzoa.net/tzpython/
# --email-- pycode is the name to send to. tranzoa.com is the place to send to.
# --bodstamps--
# January 3, 2017 bar
# January 4, 2017 bar whack old images when we have enough to show
# January 6, 2017 bar don't stop on duped images
# fix image urls that start with //
# get 'em all each run (because of out-of-order images at the site)
# January 10, 2017 bar let's try a different user agent (no, the problem is thrashing of the old ones)
# February 23, 2019 bar better print next url and img url
# handle ?... in image urls
# March 27, 2019 bar the image URL changed a bit
# August 14, 2022 bar better, but not perfect, stripping of dupes
# October 14, 2022 bar update to the new web page
# January 16, 2023 bar update to the new web page
# February 27, 2023 bar python3
# March 4, 2023 bar better way to import urllib2
# --eodstamps--
## \file
# \namespace tzpython.townhall_comics
#
#
"""
Make a web page out of the latest N comics at townhall.com.
Whack old images that are too many to look at.
"""
from __future__ import print_function
import glob
import os
import re
import sys
try :
import urllib.request as urllib2
except ImportError :
import urllib2
import tzlib
import url_getter
# Install a global urllib opener whose request headers are completely empty.
# Emptying addheaders is the only way found to stop urllib from sending its
# default "User-agent" header, which the site appears to dislike.
opener = urllib2.build_opener()
opener.addheaders = [] # get rid of 'User-agent' the only way that seems to work (yes, I tried lower-casing 'Agent')
urllib2.install_opener(opener)

# Index page listing the political cartoons to scrape.
URL = "http://townhall.com/political-cartoons/"
IMAGE_RE = re.compile(r''' d :
b = d
bi = j
pass
pass
if bi >= 0 :
( tfn, timg, tshape, tis_color, tbw ) = imgs[bi]
# print("@@@@", fn, is_color, "----", tfn, tis_color, "Diff", b, tbw.size, tbw.shape)
if b < 3600 : # test had the worst dupe at 3349, 3284... and the closest non-match at 3942, 5947...
if is_color > tis_color :
# print("@@@@ replacing %s with older %s" % ( imgs[bi][0], imgs[i][0], ))
imgs[bi] = imgs[i] # replace the newer one we found with this one
fnd = True
else :
# print("@@@@ replacing %s with newer %s" % ( imgs[i][0], imgs[bi][0], ))
imgs[i] = imgs[bi] # replace the older one with the newer, more colorful one we've already found
fnd = True
pass
pass
pass
fns = list(reversed([ fn for fn, _, _, _, _ in imgs ]))
fns = list(reversed(tzlib.remove_trailing_dupes(fns)))
return(fns)
# Usage text for --help and for bad command lines.
# Takes exactly two substitutions, in order: (program basename, image count) --
# this matches the two-argument "%" calls in __main__.  (The previous text had
# three placeholders but only two arguments were ever supplied, so printing the
# help raised a TypeError.)
help_str = """
%s (options) output_htm_file_name

Make a web page with the latest comics from townhall.com.

The images are kept in local files.
The web page shows the images in reverse file date/time order.
Extra, old image files are deleted.

Options:

    --how_many N        Get only this number of images, max. (default: %u)
                        (--number and -n)

    --verbose           Be noisier.

"""
#
#
#
if __name__ == '__main__' :
    """
    python townhall_comics.py output_htm_file
    """

    import TZCommandLineAtFile

    program_name = sys.argv.pop(0)
    # Expand any "@file" arguments in-place before option parsing.
    TZCommandLineAtFile.expand_at_sign_command_line_files(sys.argv)

    ofile_name = None
    get_cnt = MAX_GET  # NOTE(review): MAX_GET is defined in a part of this file not shown here -- confirm
    verbose = 0

    # --help and friends: print usage and quit.
    while True :
        oi = tzlib.array_find(sys.argv, [ "--help", "-h", "-?", "/h", "/H", "/?" ] )
        if oi < 0 : break
        del sys.argv[oi]
        print(help_str % ( os.path.basename(program_name), get_cnt, ))
        sys.exit(254)

    # --verbose may be repeated; each occurrence raises the noise level.
    while True :
        oi = tzlib.array_find(sys.argv, [ "--verbose", "-v", ] )
        if oi < 0 : break
        del sys.argv[oi]
        verbose += 1

    # --how_many N (and aliases): cap on images fetched, clamped to >= 1.
    while True :
        oi = tzlib.array_find(sys.argv, [ "--number", "--how_many", "--how-many", "--howmany", "-n", ] )
        if oi < 0 : break
        del sys.argv[oi]
        get_cnt = max(1, int(sys.argv.pop(oi)))

    if not len(sys.argv) :
        # No output file given: show usage and quit.
        print(help_str % ( os.path.basename(program_name), get_cnt, ))
        sys.exit(254)

    if len(sys.argv) != 1 :
        print("I only understand one file name to output to!", file = sys.stderr)
        sys.exit(101)

    ofile_name = sys.argv.pop(0)

    # Image files live next to the output file; index the ones already on disk.
    # NOTE(review): IMG_FN_HDR is defined in a part of this file not shown here.
    dn = os.path.dirname(os.path.abspath(ofile_name))
    ifls = tzlib.make_dictionary([ os.path.basename(iifn) for iifn in glob.glob(os.path.join(dn, IMG_FN_HDR + "*.*")) ])

    # Column widths for the aligned, verbose progress printout.
    fnln = max([ len(fn) for fn in ifls.keys() ] + [ 13 ])
    iurlln = fnln + len(dn)

    cnt = 0
    url = URL
    # NOTE(review): get_comic_urls is defined in a part of this file not shown here.
    urls = get_comic_urls(url)
    urls = list(reversed(urls))

    # Download up to get_cnt images, skipping ones already on disk.
    while cnt < min(get_cnt, len(urls)) :
        img = urls[cnt]
        url = img
        if not img :
            if verbose :
                print("No image URL at %s !" % url)
            break
        iurlln = max(iurlln, len(img))
        # NOTE(review): image_file_name_from_url is defined in a part of this
        # file not shown here; per the comment below it presumably returns a
        # falsy value for an already-seen image -- confirm.
        ibfn = image_file_name_from_url(img)
        if not ibfn :
            if verbose :
                print("No local image file name created for %s at %s!" % ( img, url, ))
            break # we already have the image, so we've seen it, so stop now
        fnln = max(fnln, len(ibfn))
        if ibfn in ifls :
            # Already downloaded on a previous run.
            if verbose > 1 :
                print("Dupe %-*s %-*s" % ( fnln, ibfn, iurlln, img, ))
            pass
        else :
            if verbose > 1 :
                print("New %-*s %-*s" % ( fnln, ibfn, iurlln, img, ))
            iurl = tzlib.decode_html_entities(img)
            ii = url_getter.url_open_read_with_timeout(iurl)
            if not ii :
                if verbose :
                    print("Image %s not downloaded from %s!" % ( img, iurl, ))
                break
            tzlib.write_whole_binary_file(os.path.join(dn, ibfn), ii) # write the image file
            # Remember the source URL so the HTML below can link the image to it.
            ifls[ibfn] = url
        cnt += 1

    if verbose :
        print("%u image%s gotten." % ( cnt, tzlib.s_except_1(cnt), ))

    # Build the output page: images in reverse file-time order, near-duplicate
    # images removed, and files beyond twice the requested count deleted.
    cnt = 0
    htm = ""
    fns = list(ifls.keys())
    fns.sort(key = lambda a : os.path.getmtime(os.path.join(dn, a)), reverse = True) # should be sorted by regx YYYY/MM/DD but who wants to depend on their url format?
    # NOTE(review): remove_dupished_images is defined in a part of this file not shown here.
    fns = remove_dupished_images(dn, fns)
    for ibfn in fns :
        if cnt >= get_cnt * 2 :
            # Way past what we will ever show: delete the stale image file.
            tzlib.whack_file(os.path.join(dn, ibfn))
        else :
            if cnt < get_cnt :
                # NOTE(review): the HTML markup inside the template strings
                # below appears to have been stripped out of this copy of the
                # file -- the "%" argument counts no longer match the
                # placeholders.  Restore the original anchor/img templates
                # (using IMAGE_STYLE) from the source file before running.
                if tzlib.is_stringish(ifls[ibfn]) :
                    # Downloaded this run: we know the source URL, so link to it.
                    url = ifls[ibfn]
                    htm += """%s
\n""" % ( url, url, ibfn, IMAGE_STYLE, )
                else :
                    htm += """
\n""" % ( ibfn, IMAGE_STYLE, )
                pass
            cnt += 1
            pass
        pass

    if cnt :
        # NOTE(review): this page-wrapper template also appears stripped of
        # its markup -- restore from the original source file.
        htm = """\n
\n%s\n\n\n""" % htm
        tzlib.write_whole_text_file(ofile_name, htm)
        pass
    pass

#
#
#
# eof