#!/usr/bin/python

# image_from_re_and_site.py
#       --copyright--                   Copyright 2007 (C) Tranzoa, Co. All rights reserved.    Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality.
#       --url--                         http://www.tranzoa.net/tzpython/
#       --email--                       pycode is the name to send to. tranzoa.com is the place to send to.
#       --bodstamps--
#       November 28, 2007       bar
#       November 29, 2007       bar     kingfeatures (they are delayed)
#       December 18, 2007       bar     wulffmorgenthaler
#       March 11, 2008          bar     get uclick and arcamax and seattle_times_re working again
#       April 21, 2008          bar     fix seattle times again
#       May 17, 2008            bar     email adr
#       July 5, 2008            bar     comicspage changed to gocomics beta
#       July 18, 2008           bar     gocomics change
#       November 3, 2008        bar     new comics dot com
#       November 8, 2008        bar     again
#       January 26, 2009        bar     arcmax change (they have a bug in their link around the picture)
#       May 23, 2009            bar     uclick and gocomics changes
#       May 30, 2009            bar     daybyday
#       August 18, 2009         bar     new king regx
#       January 1, 2010         bar     seattle pi uses external site with direct gif files
#       January 4, 2010         bar     better error msg
#       April 18, 2010          bar     reason.com
#       April 19, 2010          bar     sfgate
#       July 23, 2010           bar     comicskingdom.net needs referer (list at http://content.comicskingdom.net/old.htaccess - e.g. host.madison.com/comics  theham.net/comics kitsapsun.com/comics dallasnews.com)
#       January 23, 2011        bar     gocomics
#       May 7, 2011             bar     arcamax change
#       May 12, 2011            bar     gocomics change
#       June 2, 2011            bar     loosen up uclick
#       --eodstamps--
##      \file
#
#
#       Get urls from a web page.
#
#       Create a web page that combines them - assuming they are images - like comics, for instance.
#
#

import  re
import  sys
import  urllib2
import  urlparse

import  tzlib
import  url_getter



# Build a urllib2 opener with an empty default-header list and install it as the
# module-wide default opener for urllib2, so that requests going through that default
# opener no longer carry urllib2's stock 'User-agent' header.
opener              = urllib2.build_opener()
opener.addheaders   = []                        # get rid of 'User-agent' the only way that seems to work (yes, I tried lower-casing 'Agent')
urllib2.install_opener(opener)




#
#       Per-site patterns for digging the image URL (or site-relative path) out of a comic's web page.
#       The capture group holds the URL/path.
#
#       Several names are assigned more than once. Only the LAST assignment of a name is live when
#       the rest of the file runs; the earlier ones are compiled and immediately replaced
#       (apparently kept as a record of how each site's markup changed over time).
#
comics_dot_com_todays_image_re  =   re.compile(r'SRC\s*=\s*"([^"]+)"\sALT\s*=\s*"Today\'s Comic"',                                                                                                                                  re.IGNORECASE)
comics_dot_com_todays_image_re  =   re.compile(r'title="Click\s+to\s+View\s+this\s+Strip\'s\s+Page"><img\s+src="([^"]+)"\s+border="0"',                                                                                             re.IGNORECASE)
comics_dot_com_todays_image_re  =   re.compile(r'<img\s+src="(http:.*?\.gif)"\s+border="0"',                                                                                                                                        re.IGNORECASE)
arcamax_dot_com_image_re        =   re.compile(r'<a\s+href\s*=\s*"(http://www.arcamax.com/.*?/\d+)"\s+target\s*=\s*"_blank">',                                                                                                      re.IGNORECASE)
arcamax_dot_com_image_re        =   re.compile(r'<img\s+src\s*=\s*"(http://www.arcamax.com/[^"]+)"',                                                                                                                                re.IGNORECASE)
arcamax_dot_com_image_re        =   re.compile(r'<img\s+src\s*=\s*"(/newspics/[^"]+)"',                                                                                                                                             re.IGNORECASE)
jewish_review_re                =   re.compile(r'<BR>\s*<TABLE\s+BORDER=0\s+CELLPADDING=3\s+cellspacing=0>\s*<TR><TD\s*BGCOLOR=BLUE><IMG\s+SRC\s*=\s*"([^"]+)"\s+border="1"></TD></TR>\s*</TABLE>\s*<P>',                           re.IGNORECASE)
# nwsource_re has two capture groups, one per alternative - only one of them is set on any given match.
nwsource_re                     =   re.compile(r'(?:<A\s+HREF="[^"]+">\s*<IMG\s+BORDER=0\s+SRC="([^"]+)"><BR>About|<IMG\s+BORDER=0\s+SRC="([^"]+)"><BR>[^<]+</A>\s+<p><form\s+action=hi.asp\s+method=get>\s+<select\s+name=date>)', re.IGNORECASE)
seattle_times_re                =   re.compile(r'"><img\s+src="([^"]+)"/?></div',                                                                                                                                                   re.IGNORECASE)
# uclick_re tolerates the site's misspelled 'ATL' attribute as well as 'ALT'.
uclick_re                       =   re.compile(r'<IMG\s+BORDER="0"\s+HEIGHT="\d+"\s+WIDTH="\d+"\s+SRC="([^"]+)"(?:\s+A(?:LT|TL)="[^"]+")?>',                                                                                        re.IGNORECASE)
comics_page_re                  =   re.compile(r'<span\s+class="description"><img\s+src="([^"]+)"></img></span>',                                                                                                                   re.IGNORECASE)
gocomics_re                     =   re.compile(r'<img\s+alt="[^"]+"\s+id="[^"]+"\s+src="([^"]+)"\s*/>\s*<div\s+id="tags">',                                                                                                         re.IGNORECASE)
gocomics_re                     =   re.compile(r'<img\s+alt="[^"]+"\s+id="[^"]+"\s+src="([^"]+)"\s*/>\s*(?:</a>)?\s*<div\s+id="tags">',                                                                                             re.IGNORECASE)
gocomics_re                     =   re.compile(r'<img\s+alt="[^"]+"\s+height="\d+"\s+src="([^"]+)"\s+width="\d+"\s*/>',                                                                                                             re.IGNORECASE)
gocomics_re                     =   re.compile(r"<img\s+src='([^']+)'\s+height='\d+'\s+width='\d+'\s+alt='[^']+'\s+",                                                                                                               re.IGNORECASE)
gocomics_re                     =   re.compile(r'<img.*?\sclass="strip"\s+src="([^\?]+)\?',                                                                                                                                         re.IGNORECASE)
# The live gocomics_re captures the src value up to (not including) any '?' query string.
gocomics_re                     =   re.compile(r'<img.*?\sclass="strip".*?src="([^\?"]+)[\?"]',                                                                                                                                     re.IGNORECASE)
creators_dot_com_re             =   re.compile(r'<div\s+class="img"\s+style="padding:10px\s+10px\s+0px\s+10px;\s*">\s*<img\s+src="([^"]+)"\s+alt=""\s+border="0">\s*</div>',                                                        re.IGNORECASE)
creators_dot_com_re             =   re.compile(r'<td\s+align="center">\s*<img\s+src="(/comics/[^"]+)"\s+alt=""(?:\s+border="0"|\s*)?>\s*</td>',                                                                                     re.IGNORECASE)
king_features_re                =   re.compile(r'<td\s+align="left"\s+valign="top"\s+width="\d+"><!--CMS\s+NAME="image"-->\s*<img\s+src=[\'"]([^\'"]+)\'>\s*<!--/CMS-->',                                                           re.IGNORECASE)
king_features_re                =   re.compile(r'<!--CMS\s+NAME="image"-->\s*<img\s+src=[\'"]([^\'"]+)\'>\s*<!--/CMS-->',                                                                                                           re.IGNORECASE)
wulffmorgenthaler_re            =   re.compile(r'<img\s+id="ctl00_content_Strip1_imgStrip"\s+class="strip"\s+src="([^"]+)"\s+alt="Strip"',                                                                                          re.IGNORECASE)
daybydaycartoon_re              =   re.compile(r'<div\s+class="cartoon">\s*<p><img\s+alt="[^"]+"\s+src="([^"]+)"',                                                                                                                  re.IGNORECASE)
# Deliberately an empty (falsy) string rather than a compiled pattern: image_url() treats a falsy regx as
# "the site argument is itself the image URL" (optionally followed by a {REFERER:...} suffix), so no page is scraped.
content_comicskingdom_net_re    =   ""
reason_re                       =   re.compile(r'<div\s+class="entry"><p><img\s+class="pic"\s+alt=""\s+height="\d+"\s+src="([^"]+)"\s+width="\d+"\s+/>',                                                                            re.IGNORECASE)
sfgate_re                       =   re.compile(r'<div\s+class="comic_main\s+clearfix"><img\s+src="([^"]+)"\s+border="0"',                                                                                                           re.IGNORECASE)


#   Disabled manual test bench: flip the condition to True to try gocomics_re against
#   a saved page (file name in sys.argv[1]), print what it captures, and quit.
if  False :
    saved_page  = tzlib.read_whole_binary_file(sys.argv[1])
    hit         = gocomics_re.search(saved_page)
    print(hit.group(1) if hit else "Not found")
    sys.exit(1)


#   Matches a string of the form  <url>{REFERER:<referer-url>}  (the {REFERER:...} part must end the string).
#   Group 1 is the text before the brace, group 2 is the text inside it.
#   image_url() uses this for sites that have no page-scraping regx: group 1 becomes the image url and group 2 the site.
referer_re                      =   re.compile(r'^(.*?)\{REFERER:([^\}]+)\}$')


def find_image_url(htm, regx) :
    """
        Given an HTML string with the image path or URL somewhere on it, return the url or path to the gif or jpg of the image.

        htm     The page text to search. None or "" gives "".
        regx    A compiled regular expression. Its capture group(s) pick out the image URL or path.

        Returns the text of the first capture group that actually captured something.
        That matters for patterns with one group per alternative (e.g. '(?:a="(..)"|b="(..)")'),
        where group 1 is None whenever a later alternative is the one that matched.

        If the pattern has no capture groups at all, the whole matched text is returned.

        Returns "" if there is nothing to search, the pattern does not match, or no group captured any text.
    """

    if  not htm :
        return("")

    g   = regx.search(htm)
    if  not g :
        return("")

    captures    = g.groups()
    if  not captures :
        return(g.group(0))                      # group-less pattern: the match itself is the answer

    for captured in captures :
        if  captured :                          # skips groups that did not participate (None) or captured nothing
            return(captured)

    return("")



def get_object(url, referer = None, timeout = None) :
    """
        Get the image or web page for the given URL.

        url         The URL to fetch. If it contains a '%' it is first run through time.strftime(),
                    so date escapes such as %Y%m%d are filled in with the current local time.
        referer     If given (truthy), sent as the 'Referer' header of the request.
        timeout     Handed straight to url_getter.url_open_read_with_timeout().

        Returns what url_getter.url_open_read_with_timeout() returns, except that any
        false value (e.g. an empty string) comes back as None.
        A false url (None or "") is returned unchanged without any network access.

        NOTE(review): every '%' triggers strftime(), so a URL carrying percent-encoded characters
        (e.g. %20) also goes through it, with platform-dependent results - confirm callers never pass such URLs.
    """

    # Imported here because the module only imports `time` inside its __main__ section;
    # without this, callers that import this file as a module would hit a NameError below.
    import  time

    if  not url :
        return(url)

    if  '%' in url :
        url = time.strftime(url)

    req     = urllib2.Request(url)
    if  referer :
        req.add_header('Referer', referer)

    return(url_getter.url_open_read_with_timeout(req, timeout) or None)




def image_url(site, regx, timeout = None) :
    """
        Get the url to the gif or jpg for the desired image from the given site.

        site        URL of the comic's web page. If it contains a '%' it is run through time.strftime()
                    so date escapes are filled in with the current local time.
        regx        Compiled regular expression used to find the image on the page, or a false value
                    (None or "") meaning there is no page to scrape: 'site' is itself the image URL,
                    optionally written as  <image-url>{REFERER:<referer-url>} .
        timeout     Passed on to get_object() when the page is fetched.

        Returns a tuple ( site, url ).

        'site' is the (date-expanded) page URL - or, in the {REFERER:...} form, the referer URL.

        'url' is
            the absolute image URL if one was found,
            ""   if the page loaded but regx did not find an image on it,
            None if the page did not load.

        Side effect: when the page loads but regx finds nothing, the page text is handed to
        tzlib.write_whole_text_file("x.z", ...) so the failing markup can be inspected afterwards.
    """

    # Imported here because the module only imports `time` inside its __main__ section;
    # without this, callers that import this file as a module would hit a NameError below.
    import  time

    if  '%' in site :
        site    = time.strftime(site)

    if  not regx :
        # Nothing to scrape: the argument already names the image.
        g       = referer_re.search(site)
        if  g :
            return(g.group(2), g.group(1))      # ( referer to use as the site, image url )
        return(site, site)

    htm     = get_object(site, timeout = timeout) or ""

    url     = find_image_url(htm, regx)
    if  url :
        return(site, urlparse.urljoin(site, url))   # pages often give a site-relative path

    if  htm :
        tzlib.write_whole_text_file("x.z", htm)     # keep the page that defeated the regx for debugging
        return(site, "")

    return(site, None)





#
#
#
if __name__ == '__main__' :
    #
    #   Create an HTML page with the images from the given urls. The page is printed to stdout.
    #
    #   Command line arguments, handled left to right:
    #
    #       site-url                        A comic's page. The regx used on it is picked by the URL's prefix
    #                                       from the table below, else the most recent --regx is used.
    #       --regx  | -r | /r   PATTERN     Regular expression (compiled case-insensitively) for the sites
    #                                       that follow and that have no entry in the table.
    #       --local | -l | /l   PREFIX      Download each image to PREFIX<n>.jpg or PREFIX<n>.gif and
    #                                       point the page at those files instead of the remote images.
    #

    #   Keep this import at module scope (not inside a function): it binds the global name `time` for the whole module.
    import  time

    import  TZCommandLineAtFile
    import  replace_file


    del(sys.argv[0])
    TZCommandLineAtFile.expand_at_sign_command_line_files(sys.argv)     # presumably expands @file arguments in place - confirm in TZCommandLineAtFile

    local   = None

    # Pull every --local option (and its value) out of the argument list. The last one given wins.
    while True :
        oi  = tzlib.array_find(sys.argv, [ "--local", '-l', '/l' ] )    # presumably index of the first such option, negative if none - confirm in tzlib
        if  oi < 0 :
            break
        option      = sys.argv[oi]
        del sys.argv[oi]
        if  oi >= len(sys.argv) :
            raise ValueError("%s needs a file name prefix after it." % option)
        local       = sys.argv.pop(oi)


    when            = int(time.time())

    print("""
<HTML>
<HEAD><TITLE>Today's Comics</TITLE></HEAD>
<BODY>
    <H2>Today's Comics</H2>
    <P><HR><HR><P>

""")

    # URL prefix -> regx for that site.
    # Every entry is tried and the last matching one wins, exactly as a sequence of independent 'if's would behave.
    site_regx_table = (
        ( "http://www.dilbert.com",             comics_dot_com_todays_image_re  ),
        ( "http://www.comics.com",              comics_dot_com_todays_image_re  ),
        ( "http://www.arcamax.com",             arcamax_dot_com_image_re        ),
        ( "http://www.jewishworldreview.com",   jewish_review_re                ),
        ( "http://seattlepi.nwsource.com",      nwsource_re                     ),
        ( "http://seattletimes.nwsource.com",   seattle_times_re                ),
        ( "http://www.seattlepi.com",           seattle_times_re                ),      # no - js builds it
        ( "http://www.uclick.com",              uclick_re                       ),
        ( "http://www.comicspage.com",          comics_page_re                  ),
        ( "http://www.gocomics.com",            gocomics_re                     ),
        ( "http://www.creators.com",            creators_dot_com_re             ),
        ( "http://www.kingfeatures.com",        king_features_re                ),
        ( "http://www.wulffmorgenthaler.com",   wulffmorgenthaler_re            ),
        ( "http://www.daybydaycartoon.com",     daybydaycartoon_re              ),
        ( "http://content.comicskingdom.net",   content_comicskingdom_net_re    ),      # falsy on purpose: the URL is the image itself
        ( "http://www.sfgate.com",              sfgate_re                       ),
        ( "http://reason.com",                  reason_re                       ),
    )

    regx        = None      # default regx, set by --regx, for sites that are not in the table

    cnt         = 0         # counts every processed argument (a --regx option included); used to number the local files

    while sys.argv :

        site    = sys.argv.pop(0)

        if  site in ( "--regx", "/r", "-r" ) :

            if  not sys.argv :
                raise ValueError("%s needs a regular expression after it." % site)
            regx    = re.compile(sys.argv.pop(0), re.IGNORECASE)

        else :

            rgx         = regx

            for ( prefix, site_rgx ) in site_regx_table :
                if  site.startswith(prefix) :
                    rgx = site_rgx

            if  site.startswith("http://reason.com") :
                # Fill the URL's strftime escapes with the date of the most recent Friday
                # (today, if today is Friday): tm_wday is 0 for Monday, so (tm_wday + 3) % 7 is the days since Friday.
                now         = time.time()
                days_back   = (time.localtime(now).tm_wday + 3) % 7
                site        = time.strftime(site, time.localtime(now - ((24 * 60 * 60) * days_back)))


            if  rgx is None :
                # The "check" the message talks about is an entry in site_regx_table above.
                raise ValueError("Put the check for the new regx just above here [%s]." % site)

            ( site, url )   = image_url(site, rgx)
            if  not url :
                if  url is None :
                    url = "no site"
                print('<A HREF="%s">%s</A> missing (%s).<P>'     % ( site, site, url ))
            else :
                if  local :
                    r   = get_object(url, referer = site)
                    if  r :
                        ext     = ".jpg"
                        if  r.startswith('GIF8') :              # both GIF87a and GIF89a files
                            ext = ".gif"

                        fname   = local + str(cnt) + ext
                        tfn     = fname + ".tmp"
                        fo      = open(tfn, "wb")
                        try :
                            fo.write(r)
                        finally :
                            fo.close()                          # do not leak the handle if the write fails
                        replace_file.replace_file(fname, tfn, fname + ".bak")   # presumably swaps the .tmp in and keeps the old file as .bak - confirm in replace_file
                        url     = fname

                print('<A HREF="%s">%s</A><P>'              % ( site, site ))
                print('<A HREF="%s"><IMG SRC="%s"></A><P>'  % ( site, url ))
            print('<P><HR><P>')

        cnt    += 1


    print("""
    <P><HR><HR>
    <SMALL>%s</SMALL>
    <HR><HR><P>
</BODY>
</HTML>
""" % ( time.asctime(time.localtime(when)) ))

#
#
#
# eof

