#!/usr/bin/python

# image_from_re_and_site.py
#       --copyright--                   Copyright 2007 (C) Tranzoa, Co. All rights reserved.    Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality.
#       --url--                         http://www.tranzoa.net/tzpython/
#       --email--                       pycode is the name to send to. tranzoa.com is the place to send to.
#       --bodstamps--
#       November 28, 2007       bar
#       November 29, 2007       bar     kingfeatures (they are delayed)
#       December 18, 2007       bar     wulffmorgenthaler
#       March 11, 2008          bar     get uclick and arcamax and seattle_times_re working again
#       April 21, 2008          bar     fix seattle times again
#       May 17, 2008            bar     email adr
#       July 5, 2008            bar     comicspage changed to gocomics beta
#       July 18, 2008           bar     gocomics change
#       --eodstamps--
##      \file
#
#
#       Get urls from a web page.
#
#       Create a web page that combines them - assuming they are images - like comics, for instance.
#
#

import  re
import  urllib2
import  urlparse

import  url_getter



#
#       Build a urllib2 opener, empty its list of default headers, and install it as
#       the process-wide default opener, so requests made through urllib2.urlopen()
#       after this module is imported do not carry urllib2's stock 'User-agent' header.
#
#       NOTE(review): presumably url_getter fetches through urllib2.urlopen() and so picks this opener up - confirm.
#
opener              = urllib2.build_opener()
opener.addheaders   = []                        # get rid of 'User-agent' the only way that seems to work (yes, I tried lower-casing 'Agent')
urllib2.install_opener(opener)




#
#       One compiled, case-insensitive regx per supported site.
#
#       Each regx is searched for in the site's page text, and its capture group surrounds the url or path of the image.
#       nwsource_re has two alternatives, each with its own capture group, so only one of its groups is set by a match.
#
comics_dot_com_todays_image_re  =   re.compile(r'SRC\s*=\s*"([^"]+)"\sALT\s*=\s*"Today\'s Comic"', re.IGNORECASE)
arcamax_dot_com_image_re        =   re.compile(r'<a\s+href\s*=\s*"(http://www.arcamax.com/.*?/\d+)"\s+target\s*=\s*"_blank">', re.IGNORECASE)
jewish_review_re                =   re.compile(r'<BR>\s*<TABLE\s+BORDER=0\s+CELLPADDING=3\s+cellspacing=0>\s*<TR><TD\s*BGCOLOR=BLUE><IMG\s+SRC\s*=\s*"([^"]+)"\s+border="1"></TD></TR>\s*</TABLE>\s*<P>', re.IGNORECASE)
nwsource_re                     =   re.compile(r'(?:<A\s+HREF="[^"]+">\s*<IMG\s+BORDER=0\s+SRC="([^"]+)"><BR>About|<IMG\s+BORDER=0\s+SRC="([^"]+)"><BR>[^<]+</A>\s+<p><form\s+action=hi.asp\s+method=get>\s+<select\s+name=date>)', re.IGNORECASE)
seattle_times_re                =   re.compile(r'"><img\s+src="([^"]+)"/?></div', re.IGNORECASE)
uclick_re                       =   re.compile(r'<IMG\s+BORDER="0"\s+HEIGHT="\d+"\s+WIDTH="\d+"\s+SRC="([^"]+)"><', re.IGNORECASE)
comics_page_re                  =   re.compile(r'<span\s+class="description"><img\s+src="([^"]+)"></img></span>', re.IGNORECASE)

#       The image may or may not be wrapped in a link, so the closing </a> is optional.
#       (An older regx without the optional </a> used to be assigned here and then immediately overwritten by this one.)
gocomics_re                     =   re.compile(r'<img\s+alt="[^"]+"\s+id="[^"]+"\s+src="([^"]+)"\s*/>\s*(?:</a>)?\s*<div\s+id="tags">', re.IGNORECASE)

creators_dot_com_re             =   re.compile(r'<div\s+class="img"\s+style="padding:10px\s+10px\s+0px\s+10px;\s*">\s*<img\s+src="([^"]+)"\s+alt=""\s+border="0">\s*</div>', re.IGNORECASE)

#       The src attribute may be quoted with either ' or ", so both the opening and the closing quote accept either.
king_features_re                =   re.compile(r'<td\s+align="left"\s+valign="top"\s+width="\d+"><!--CMS\s+NAME="image"-->\s*<img\s+src=[\'"]([^\'"]+)[\'"]>\s*<!--/CMS-->', re.IGNORECASE)

wulffmorgenthaler_re            =   re.compile(r'<img\s+id="ctl00_content_Strip1_imgStrip"\s+class="strip"\s+src="([^"]+)"\s+alt="Strip"', re.IGNORECASE)


def find_image_url(htm, regx) :
    """
        Given an HTML string with the image path or URL somewhere on it, return the url or path to the gif or jpg of the image.

        htm     The page text. None or "" is allowed.
        regx    A compiled regular expression whose capture group(s) surround the url or path.
                A regx may have alternatives, each with its own group (as nwsource_re does),
                so the first group that actually captured something is the one returned.

        Return "" if there is no page text, if the regx is not found in it, or if no group captured anything.
    """

    if  not htm :   return("")

    g   = regx.search(htm)
    if  not g :
        return("")

    #   Groups belonging to an alternative that did not take part in the match are None - skip over them.
    for found in g.groups() :
        if  found :
            return(found)

    return("")



def get_object(url, referer = None, timeout = None) :
    """
        Fetch whatever lives at the given URL - an image or a web page.

        url         Where to fetch from. A false value (None, "") is handed straight back without any fetch.
        referer     If given, sent along as the request's 'Referer' header.
        timeout     Passed through to url_getter.url_open_read_with_timeout().

        Return what url_getter.url_open_read_with_timeout() returns, or None if that is a false value.
    """

    if  url :
        request     = urllib2.Request(url)
        if  referer :
            request.add_header('Referer', referer)

        fetched     = url_getter.url_open_read_with_timeout(request, timeout)
        if  not fetched :
            return(None)                    # any empty/failed result is normalized to None

        return(fetched)

    return(url)




def image_url(site, regx, timeout = None) :
    """
        Load the given site's page and pull out the url of the gif or jpg for the desired image.

        site        URL of the page to load. A relative image path is resolved against it.
        regx        Compiled regular expression handed to find_image_url() to locate the image.
        timeout     Passed through to get_object().

        Return the absolute url of the image,
        or "" if the page loaded but the regx didn't find the image,
        or None if the site did not load.
    """

    page    = get_object(site, timeout = timeout) or ""
    found   = find_image_url(page, regx)

    if  found :
        return(urlparse.urljoin(site, found))

    if  page :
        #         tzlib.write_whole_text_file("x.z", page)
        return("")

    return(None)





#
#
#
if __name__ == '__main__' :
    """
        Create an HTML page with the images from the given urls.
    """


    import  sys
    import  time

    import  TZCommandLineAtFile
    import  tzlib
    import  replace_file


    del(sys.argv[0])
    TZCommandLineAtFile.expand_at_sign_command_line_files(sys.argv)

    local   = None

    while True :
        oi  = tzlib.array_find(sys.argv, [ "--local", '-l', '/l' ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        local       = sys.argv.pop(oi)




    when            = int(time.time())

    print """
<HTML>
<HEAD><TITLE>Today's Comics</TITLE></HEAD>
<BODY>
    <H2>Today's Comics</H2>
    <P><HR><HR><P>

"""

    regx        = None

    cnt         = 0

    while len(sys.argv) > 0 :

        site    = sys.argv.pop(0)

        if  (site == "--regx") or (site == "/r") or (site == "-r") :

            regx    = re.compile(sys.argv.pop(0), re.IGNORECASE)

        else :

            rgx         = regx

            if  site.startswith("http://www.dilbert.com") or site.startswith("http://www.comics.com") :
                rgx     = comics_dot_com_todays_image_re

            if  site.startswith("http://www.arcamax.com") :
                rgx     = arcamax_dot_com_image_re

            if  site.startswith("http://www.jewishworldreview.com") :
                rgx     = jewish_review_re

            if  site.startswith("http://seattlepi.nwsource.com") :
                rgx     = nwsource_re

            if  site.startswith("http://seattletimes.nwsource.com") :
                rgx     = seattle_times_re

            if  site.startswith("http://www.uclick.com") :
                rgx     = uclick_re

            if  site.startswith("http://www.comicspage.com") :
                rgx     = comics_page_re

            if  site.startswith("http://www.gocomics.com") :
                rgx     = gocomics_re

            if  site.startswith("http://www.creators.com") :
                rgx     = creators_dot_com_re

            if  site.startswith("http://www.kingfeatures.com") :
                rgx     = king_features_re

            if  site.startswith("http://www.wulffmorgenthaler.com") :
                rgx     = wulffmorgenthaler_re

            if  not rgx :
                raise "Put the check for the new regx just above here."

            url         = image_url(site, rgx)
            if  not url :
                if  url == None :
                    url = "no site";
                print '<A HREF="%s">%s</A> missing (%s).<P>'     % ( site, site, url )
            else :
                if  local :
                    r   = get_object(url, referer = site)
                    if  r :
                        ext     = ".jpg"
                        if  r.startswith('GIF89') :
                            ext = ".gif"

                        fname   = local + str(cnt) + ext
                        tfn     = fname + ".tmp"
                        fo      = open(tfn, "wb")
                        fo.write(r)
                        fo.close()
                        replace_file.replace_file(fname, tfn, fname + ".bak")
                        url     = fname
                    pass

                print '<A HREF="%s">%s</A><P>'              % ( site, site )
                print '<A HREF="%s"><IMG SRC="%s"></A><P>'  % ( site, url )
            print '<P><HR><P>'

        cnt    += 1


    print """
    <P><HR><HR>
    <SMALL>%s</SMALL>
    <HR><HR><P>
</BODY>
</HTML>
""" % ( time.asctime(time.localtime(when)) )

    pass

#
#
#
# eof
