#!/usr/bin/python

# cagle_cartoons.py
#       --copyright--                   Copyright 2007 (C) Tranzoa, Co. All rights reserved.    Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality.
#       --url--                         http://www.tranzoa.net/tzpython/
#       --email--                       pycode is the name to send to. tranzoa.com is the place to send to.
#       --bodstamps--
#       August 6, 2005          bar
#       November 18, 2007       bar     turn on doxygen
#       November 27, 2007       bar     insert boilerplate copyright
#       May 17, 2008            bar     email adr
#       May 27, 2012            bar     doxygen namespace
#       --eodstamps--
##      \file
#       \namespace              tzpython.cagle_cartoons
#
#
#       This script creates a web page that contains all cagle.com political cartoons.
#
#

import  random
import  re
import  urllib2

import  url_getter
import  urls2dir                            # sucks in code to allow user agent to be set






# Matches a <B>-wrapped link (optionally preceded by one other tag) to a page
# under /politicalcartoons/PCcartoons/ and captures the page's base name,
# i.e. the part of the file name before ".asp".
names_re    = re.compile(
    r"<B>\s*(?:<[^>]+>\s*)?<A\s+HREF\s*=\s*[\'\"]/politicalcartoons/PCcartoons/([^\.]+)\.asp[\'\"]",
    re.DOTALL | re.IGNORECASE
)

def get_all_names(main_asp) :
    """
        Pull every cartoon page name out of the main page's HTML.

        main_asp is the text of the main page. A falsy value (None or an
        empty string) yields an empty list.

        Returns the captured names in document order, with every "PCbest2"
        entry dropped (presumably a non-cartoonist page - confirm).
    """

    if  not main_asp :
        return([])

    return([ found for found in names_re.findall(main_asp) if found != "PCbest2" ])







# Matches an <A HREF="..."> whose target is the caglecards main.asp page with an
# image= parameter pointing at a gif/jpg under /working/<digits>/.
# The whole HREF value is captured.
image_htm_re    =   re.compile(
    r"<A\s+[Hh][Rr][Ee][Ff]\s*=\s*\"(http://www\.cagle\.com/caglecards/main\.asp\?image=http://www\.cagle\.com/working/\d+/[^\.]+\.(?:gif|jpg))\"\s*>",
    re.DOTALL
)

def get_htm_url(PCcartoons_page) :
    """
        Find the caglecards page URL linked from a cartoon page.

        PCcartoons_page is the HTML text of the cartoon page.

        Returns the full HREF value of the first matching link,
        or None if the page is falsy or has no such link.
    """

    found = image_htm_re.search(PCcartoons_page) if PCcartoons_page else None

    if  found :
        return(found.group(1))

    return(None)



# Same link shape as the caglecards main.asp?image=... anchor, but only the
# image URL after "image=" (a gif/jpg under /working/<digits>/) is captured.
image_url_re    =   re.compile(
    r"<A\s+[Hh][Rr][Ee][Ff]\s*=\s*\"http://www\.cagle\.com/caglecards/main\.asp\?image=(http://www\.cagle\.com/working/\d+/[^\.]+\.(?:gif|jpg))\"\s*>",
    re.DOTALL
)

def get_image_url(PCcartoons_page) :
    """
        Find the cartoon image URL referenced by a cartoon page.

        PCcartoons_page is the HTML text of the cartoon page.

        Returns the image URL from the first matching link's image= parameter,
        or None if the page is falsy or has no such link.
    """

    found = image_url_re.search(PCcartoons_page) if PCcartoons_page else None

    if  found :
        return(found.group(1))

    return(None)



# Matches the caglecards main.asp?image=... URL (up to its closing quote and
# ">") and captures only the image file extension: "gif" or "jpg".
image_type_re   =   re.compile(
    r"http://www\.cagle\.com/caglecards/main\.asp\?image=http://www\.cagle\.com/working/\d+/[^\.]+\.(gif|jpg)\"\s*>",
    re.DOTALL
)

def get_image_type(PCcartoons_page) :
    """
        Find the file extension of the cartoon image referenced by a cartoon page.

        PCcartoons_page is the HTML text of the cartoon page.

        Returns "gif" or "jpg" from the first matching URL,
        or None if the page is falsy or has no such URL.
    """

    found = image_type_re.search(PCcartoons_page) if PCcartoons_page else None

    if  found :
        return(found.group(1))

    return(None)






def get_cartoon_image(url, timeout = None) :
    """
        Fetch the contents of an image URL.

        url is the address to fetch. A falsy url is handed straight back
        without any request being made.
        timeout is passed through to url_getter.url_open_read_with_timeout.

        Returns whatever url_getter read, or None if that result was falsy.
    """

    if  not url :
        return(url)

    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.5) Gecko/20031007')

    # Normalize any falsy result (e.g. an empty read) to None.
    return(url_getter.url_open_read_with_timeout(request, timeout) or None)



def PCcartoon_page_url(name) :
    """
        Build the URL of the cartoon page for the given page name.

        name is the page's base name (without the ".asp" extension).
    """
    page_dir    = "http://www.cagle.com/politicalcartoons/PCcartoons/"
    page_ext    = ".asp"

    return(page_dir + name + page_ext)



def get_PCcartoon_page(name, timeout = None) :
    """
        Fetch the cartoon page for the given page name.

        name is the page's base name; a falsy name returns None without
        making a request.
        timeout is passed through to url_getter.url_open_read_with_timeout.

        Returns the page text, or None when nothing was read, the text is
        only whitespace, or get_image_url() finds no image link in it.
    """

    if  not name :
        return(None)

    request = urllib2.Request(PCcartoon_page_url(name))
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.5) Gecko/20031007')

    page    = url_getter.url_open_read_with_timeout(request, timeout)

    # A page is only useful if it has content and carries an image link.
    if  (not page) or (not page.strip()) or (not get_image_url(page)) :
        return(None)

    return(page)


def get_main_page(timeout = None) :
    """
        Fetch the political cartoons main page.

        timeout is passed through to url_getter.url_open_read_with_timeout.

        Returns the page text, or None when nothing was read, the text is
        only whitespace, or get_all_names() finds no names in it.
    """

    request = urllib2.Request("http://www.cagle.com/politicalcartoons/main.asp")
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.5) Gecko/20031007')

    page    = url_getter.url_open_read_with_timeout(request, timeout)

    if  (not page) or (not page.strip()) :
        return(None)

    # A main page without any recognizable names is treated as a failed fetch.
    if  not get_all_names(page) :
        return(None)

    return(page)


def get_all_names_and_urls(print_errs = False, timeout = None) :
    """
        Gets an array of [ name, htm_url, img_url ] for all current names.
    """

    retval = []

    htm = get_main_page(timeout)

    if  not htm :
        if  print_errs :                print   "Did not get main page"
    else :
        names = get_all_names(htm)

        if  (not names) or (len(names) == 0) :
            if  print_errs :            print "Did not get names"
            pass

        random.shuffle(names)

        for n in names :
            htm = get_PCcartoon_page(n, timeout)

            if  not htm :
                if  print_errs :        print "Did not get htm for", n
            else :
                htm_url = PCcartoon_page_url(n)

                img_url = get_image_url(htm)

                if  not img_url :
                    if  print_errs :    print "Did not get img url for", n
                else :
                    ni = [ n, htm_url, img_url ]
                    retval.append(ni)
                pass
            pass

        pass

    retval.sort()

    return(retval)





# Command-line entry point: expects exactly one argument, the name of the
# web page file to create, and writes a page of all cartoons to it.
if  __name__ == '__main__' :
    import  glob
    import  sys
    import  time

    import  replace_file
    import  TZCommandLineAtFile
    import  tzlib

    # Drop the script name so sys.argv holds only the user's arguments.
    del(sys.argv[0])

    # Project helper, not visible here - presumably expands "@file" arguments
    # into sys.argv in place (confirm against TZCommandLineAtFile).
    TZCommandLineAtFile.expand_at_sign_command_line_files(sys.argv)

    # Disabled debug harness: treats each argument as a glob of saved HTML
    # files and runs the parsers above against their contents.
    if  False :
        for an in sys.argv :
            fnames = glob.glob(an)

            for fn in fnames :

                fi = open(fn, "rb")
                htm = fi.read()
                fi.close()

                # Treat the file as a main page: list its names.
                if  True :
                    names = get_all_names(htm)
                    print fn, len(names)
                    li  = 0
                    for n in names :
                        li += 1
                        print   li, n
                        # Only the 50th name is actually fetched, and its
                        # image is saved to "<name>.<gif|jpg>".
                        if  li == 50 :
                            htm = get_PCcartoon_page(n)
                            if  htm :
                                print " ", get_image_type(htm), get_image_url(htm)
                                img = get_cartoon_image(get_image_url(htm))
                                fo  = open(n + "." + get_image_type(htm), "wb")
                                fo.write(img)
                                fo.close()
                        pass
                    pass

                # Alternative check: treat the file as a cartoon page.
                if  False :
                    print fn, get_image_url(htm)

                pass
            pass
        pass

    # Normal operation: build the web page.
    if  True :

        if  len(sys.argv) != 1 :
            print "Tell me the web page to create!"
            sys.exit(101)

        # Fetch [ name, htm_url, img_url ] for every cartoon, printing errors.
        info = get_all_names_and_urls(True)

        # Write to a temporary file first; it replaces the real file at the end.
        fn  = sys.argv[0]
        tn  = fn + ".tmp"

        fo  = open(tn, "w")

        # tde is the text that closes the previous table row (nothing before
        # the first row). tdi counts cells in the current row and starts
        # above the limit so the first cartoon opens a row.
        tde = ""
        tdi = 10000
        print   >> fo, "<HTML><BODY><H2>Political Cartoons</H2><HR><HR><TABLE>"
        # Thumbnail table, 5 per row; each thumbnail links to the cartoon's
        # anchor further down the page.
        for ni in info :
            if  tdi >= 5 :
                print   >> fo, tde
                print   >> fo, "<TR>"
                tde     =     "</TR>"
                tdi     =  0
            print       >> fo, '<TD><A HREF="#%s"><IMG WIDTH=196 HEIGHT=196 SRC="%s"></A></TD>' % ( ni[0], ni[2] )
            tdi        +=  1

        print   >> fo, tde
        print   >> fo, "</TABLE><P><HR><P>"


        # One section per cartoon: a named anchor, then the image and the
        # page URL as text, both linking to the cartoon's page.
        for ni in info :
            print   >> fo,  '<P><A HREF="#%s" NAME="%s"></A><P><HR><P><A HREF="%s"><IMG SRC="%s"> %s</A>' % ( ni[0], ni[0], ni[1], ni[2], tzlib.safe_html(ni[1]) )

        print       >> fo,  "<P><HR><P>Created: " + time.ctime() + " <P><HR><P></BODY></HTML>"

        fo.close()
        # Project helper, not visible here - presumably moves the temporary
        # file into place and keeps the old page as ".bak" (confirm against
        # replace_file).
        replace_file.replace_file(fn, tn, fn + ".bak")

    pass

#
#
#
# eof
