#!/usr/bin/python

# aaec_cartoons.py
#       --copyright--                   Copyright 2007 (C) Tranzoa, Co. All rights reserved.    Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality.
#       --url--                         http://www.tranzoa.net/tzpython/
#       --email--                       pycode is the name to send to. tranzoa.com is the place to send to.
#       --bodstamps--
#       August 6, 2005          bar
#       November 18, 2007       bar     turn on doxygen
#       November 27, 2007       bar     insert boilerplate copyright
#       May 17, 2008            bar     email adr
#       November 29, 2011       bar     pyflake cleanup
#       --eodstamps--
##      \file
#
#
#       This script creates a web page that contains the newest N editorial cartoons at aaeconline.org
#
#

import  re
import  urllib2

import  url_getter
import  urls2dir                            # sucks in code to allow user agent to be set


home_url    =   "http://aaeconline.org"         # site root - the paths captured by urls_re are appended to this

#
#   Finds one cartoon entry in a browse page. The three captured groups are, in order:
#
#       1   the site-relative path of the cartoon's display page
#       2   the site-relative path of the cartoon's image (everything before the "&thumb=1" thumbnail flag)
#       3   the text of the link to the cartoonist's profile page
#
urls_re     =   re.compile(
                    # link to the cartoon's own page
                    r'<a\s+href\s*=\s*[\"\'](/cartoon/display\.cfm/[\d]+)[\"\']\s+target\s*=\s*[\"\']_parent[\"\']>'
                    # thumbnail image source
                    r'.*?src\s*=\s*[\"\'](/CFC/toonimage\.cfm\?cartoonist=[^&]+&date=\d+-\d+-\d+&filename=[^\&]+)&thumb=1[\"\']'
                    # link to the cartoonist's profile, whose text is captured
                    r'.*?<a\s+href=[\"\']/cartoonist/profile.cfm/.*?[\"\']\s+target\s*=\s*[\"\']_parent[\"\']\s*>\s*([^<]+)</a>',
                    re.DOTALL | re.IGNORECASE
                )

def get_all_info(htm) :
    """
        Pull every cartoon out of the given browse-page HTML.

        Returns a list with one [ name, cartoon_page_url, img_url ] entry per urls_re match,
        in page order. Both URLs are made absolute by prefixing home_url.
        An empty list comes back if htm is empty/None or nothing matches.
    """

    if  not htm :
        return([])

    # urls_re captures ( page path, image path, name ) - reorder so the name comes first
    return([ [ who, home_url + page_path, home_url + img_path ] for ( page_path, img_path, who ) in urls_re.findall(htm) ])



#
#   URL template for one browse page of the newest cartoons.
#
#   %u == 1, 11, 21 ...     (get_main_page() fills it in with its index argument)
#
new_cartoons_url    =   home_url + "/cartoon/browse.cfm/Regular/?count=%u"


def get_main_page(index = 1, timeout = None) :
    """
        Fetch the browse page selected by index (the value put in new_cartoons_url).

        The request is sent with a browser-like User-Agent header and timeout is handed
        straight to url_getter.url_open_read_with_timeout().

        Returns the page's HTML, or None if nothing came back, the page is only
        white space, or get_all_info() finds no cartoons in it.
    """

    request = urllib2.Request(new_cartoons_url % ( index ))
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.5) Gecko/20031007')

    page    = url_getter.url_open_read_with_timeout(request, timeout)

    if  (not page) or (not page.strip()) :
        return(None)                        # nothing usable was fetched

    if  not get_all_info(page) :
        return(None)                        # a page with no cartoons in it is treated as a failure

    return(page)



def get_all_names_and_urls(idx = 1, print_errs = False, timeout = None) :
    info    =   []

    htm     = get_main_page(idx, timeout)

    if  not htm :
        if  print_errs :                print   "Did not get main page"
    else :
        info = get_all_info(htm)

        if  (not info) or (len(info) == 0) :
            if  print_errs :            print   "Did not get info"
        pass

    return(info)




def _get_cfm_numer(s) :
    """
        Return the run of digits that ends the string s (as a string), or None if s does not end in a digit.
    """

    tail    = re.search(r"(\d+)$", s)
    return(tail.group(1) if tail else None)


if  __name__ == '__main__' :
    import  os
    import  sys
    import  time

    import  replace_file
    import  TZCommandLineAtFile
    import  tzlib

    del(sys.argv[0])

    TZCommandLineAtFile.expand_at_sign_command_line_files(sys.argv)

    if  False :
        fi = open(sys.argv[0], "r")
        htm = fi.read()
        fi.close()
        info = get_all_info(htm)
        info.sort()
        print "len=", len(info)
        print info


    if  True :

        if  len(sys.argv) != 1 :
            print "Tell me an output htm file name!"
            sys.exit(101)

        htm_name    = sys.argv[0]
        hname       = None

        hist        = {}
        if  hname and os.path.isfile(hname) :
            fi      = open(hname, "r")
            hist    = fi.read()
            fi.close()

            hist    = tzlib.make_dictionary(hist.split())


        newones = []
        newhist = []
        for i in range(1, 200, 10) :

            info = get_all_names_and_urls(i, True)

            if  len(info) == 0 :
                break                                   # no new cartoons

            for n in info :
                cfm = _get_cfm_numer(n[1])
                if  cfm and not hist.has_key(cfm) :
                    hist[cfm] = True
                    newhist.append(cfm)
                    nn        = [ re.sub(r"\s+", "", n[0] + cfm), n[1], n[2], n[0] ]
                    newones.append(nn)
                pass

            if  (len(newones) == 0) or (len(newones) >= 100) :
                break                                   # enough or no new ones
            pass



        if  hname and (len(newhist) > 0) :
            if  os.path.isfile(hname) :
                fo      = open(hname, "a")
            else :
                fo      = open(hname, "w")

            for cfm in newhist :
                print >> fo, cfm
            fo.close()

        if  htm_name and (len(newones) > 0) :

            tname = htm_name + ".tmp"

            fo  = open(tname, "w")

            tde = ""
            tdi = 10000
            print   >> fo, "<HTML><BODY><H2>AAEC Editorial Cartoons</H2><HR><HR><TABLE>"
            for ni in newones :
                if  tdi >= 5 :
                    print   >> fo, tde
                    print   >> fo, "<TR>"
                    tde     =     "</TR>"
                    tdi     =  0
                print       >> fo, '<TD><A HREF="#%s"><IMG WIDTH=196 HEIGHT=196 SRC="%s"></A></TD>' % ( ni[0], ni[2] )
                tdi        +=  1

            print   >> fo, tde
            print   >> fo, "</TABLE><P><HR><P>"


            for ni in newones :
                print   >> fo,  '<P><A HREF="#%s" NAME="%s"></A><P><HR><P><A HREF="%s"><IMG SRC="%s"> %s %s</A>' % ( ni[0], ni[0], ni[1], ni[2], tzlib.safe_html(ni[3]), tzlib.safe_html(ni[1]) )

            print       >> fo,  "<P><HR><P>Created: " + time.ctime() + " <P><HR><P></BODY></HTML>"

            fo.close()

            replace_file.replace_file(htm_name, tname, htm_name + ".bak")


        pass

    pass

#
#
#
# eof

