#!/usr/bin/python

# bunchalinks.py
#       --copyright--                   Copyright 2008 (C) Tranzoa, Co. All rights reserved.    Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality.
#       --url--                         http://www.tranzoa.net/tzpython/
#       --email--                       pycode is the name to send to. tranzoa.com is the place to send to.
#       --bodstamps--
#       May 1, 2008             bar
#       May 17, 2008            bar     email adr
#       August 29, 2008         bar     basestring instead of StringType because of unicode strings and others
#       --eodstamps--
##      \file
#
#
#       Get a gob of links from given web pages that have 'em and keep a record of the latest ones snagged.
#
#       In practice, we'll grab 'em from places like Fark and Google news.
#
#


import  re
import  urllib
import  urllib2
import  urlparse
import  time

import  tzlib
import  url_getter


#
#       Install a urllib2 opener whose default header list is empty, so that the only
#       headers sent are the ones explicitly added to each request.
#
opener              = urllib2.build_opener()
opener.addheaders   = []                        # get rid of 'User-agent' the only way that seems to work (yes, I tried lower-casing 'Agent')
urllib2.install_opener(opener)                  # applies to every later urllib2.urlopen() in this process




class   a_url(object) :
    """
        One remembered link.

        .when   is the time stamp given for the link (the script in this file passes time.time() seconds).
        .url    is the link itself.

        Instances order by .when alone, so sorting a list of them puts the newest last.
    """

    def __init__(me, when, url) :
        me.url  = url
        me.when = when

    def __cmp__(me, other) :
        """ Compare on the time stamps only - the URLs play no part. """
        return(cmp(me.when, other.when))



class   a_link_page_regx(object) :
    """
        How to pull links out of one link page.

        .regx       is a compiled regular expression. A string given to the constructor is
                    compiled with DOTALL and IGNORECASE; anything else is stored as given.
        .unquote    is the caller's flag saying whether found links want URL-unquoting.
    """

    def __init__(me, s, unquote) :
        if  isinstance(s, basestring) :
            s       = re.compile(s, re.DOTALL | re.IGNORECASE)

        me.regx     = s
        me.unquote  = unquote




def find_urls(htm, regx) :
    """
        Given an HTML string, return an array of links.

        htm     is the page text. It may be None or empty.
        regx    is a compiled regular expression. Whatever its findall() yields is returned.

        Always returns a list - an empty one if there is no text or if nothing matches.
    """

    if  not htm :
        return([])                      # same type as the normal case, not an empty string

    return(regx.findall(htm))           # findall() gives [] by itself when there are no matches



def get_link_page(url, timeout = None) :
    """
        Fetch the link page at the given URL.

        url         is the page to get. A false value (None, "") is handed straight back.
        timeout     is passed through to url_getter.url_open_read_with_timeout().

        Returns whatever url_getter.url_open_read_with_timeout() gives for the request
        (presumably the page text - verify against url_getter), or None if that is a false value.
    """

    if  url :
        request = urllib2.Request(url)

        #   Send a browser-like User-Agent header with the request.
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.5) Gecko/20031007')

        page    = url_getter.url_open_read_with_timeout(request, timeout)
        if  not page :
            page    = None              # any empty result is reported as None

        return(page)

    return(url)




def get_urls(site, lp_regx,  timeout = None) :
    """
        Get the urls from the given site.

        site        is the URL of the link page.
        lp_regx     supplies .regx (used to find the links) and .unquote (see below).
        timeout     is passed through to get_link_page().

        Return the urls, or [] if the regx didn't find any, or None if the site did not load.

        Each found link is joined to the site URL and URL-unquoted. If lp_regx.unquote is
        set, it is unquoted a second time.
        NOTE(review): the unconditional first unquote makes lp_regx.unquote mean "unquote twice" - confirm that is intended.
    """

    htm             = get_link_page(site, timeout = timeout) or None
    found           = find_urls(htm, lp_regx.regx)

    if  not found :
        if  htm :
            return([])                  # the page loaded, but held no matching links
        return(None)                    # nothing came back for the page

    urls            = []
    for link in found :
        link        = urllib.unquote(urlparse.urljoin(site, link))
        if  lp_regx.unquote :
            link    = urllib.unquote(link)
        urls.append(link)

    return(urls)



def print_memory(mem) :
    kys     = mem.keys()
    kys.sort()

    print ";", time.asctime(time.localtime())
    print

    for host in kys :
        print "; Host", host
        ma  = mem[host]
        for m in ma :
            print " ", time.asctime(time.localtime(m.when)), m.url
        pass
    print




def count_mem_urls(mem) :
    """
        Return the total number of items in all the lists of the given dictionary.

        mem     is a dictionary whose values are lists (of remembered urls, as this file uses it).
    """

    return(sum([ len(ma) for ma in mem.values() ]))





#
#       Usage text. The command line code at the bottom of this file prints it for --help, -h, /h, /? and -?.
#
help_str    = """
python bunchalinks link_memory_file_name    Keep a file containing lots of links.

  Options:

    --max_per_site  n           Keep only a max of the given number of links per link-target site (default: 10).
    --max_age       days        Keep links only for this long, max (default: forever).
    --google_news               Get links from Google news.
    --fark                      Get links from Fark.
    --link_page     url         Get links from the given URL.   (multiple --link_page options allowed)
    --ignore        regx        Ignore matching link URLs.      (multiple --ignore    options allowed)
    --timeout       seconds     How long to wait for link pages.
    --dump                      Print all URLs from memory (after any updates - implies --count)
    --count                     Print the number of new/updated and total URLs in memory.
"""

#
#
#
if __name__ == '__main__' :
    import  cPickle
    import  os
    import  sys

    import  TZCommandLineAtFile
    import  replace_file


    del(sys.argv[0])
    TZCommandLineAtFile.expand_at_sign_command_line_files(sys.argv)

    gn_link_page    = "http://news.google.com/?output=rss"
    gn_regx         = re.compile(r"<link>http://news\.google\.com/news/url\?.*?&amp;url=([^&]+)&amp;cid=",              re.DOTALL | re.IGNORECASE)

    fark_link_page  = "http://www.fark.com"
    fark_regx       = re.compile(r"<a href\s*=\s*\"http://go\.fark\.com/cgi/fark/go\.pl\?i=[^&]+&amp;l=([^\"&>\r\n]+)", re.DOTALL | re.IGNORECASE)

    link_regx       = re.compile(r"<a [^h>]*href\s*=\s*[\'\"](http://.+?)[\'\"][\s>]",                                  re.DOTALL | re.IGNORECASE)

    link_pages      = {}

    ignores         = {
                        r"^http://[^\.]\.google.com\b"  : True,
                        r"^http://[^\.]\.fark.com\b"    : True,
                      }
    for i in ignores.keys() :
        ignores[i]  = re.compile(i, re.DOTALL | re.IGNORECASE)

    max_per_site    = 10
    max_age         = 0
    timeout         = None
    dump            = False
    count           = False


    while True :
        oi  = tzlib.array_find(sys.argv, [ "--help", '-h', '/h', '/?', '-?' ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        print help_str
        sys.exit(254)


    while True :
        oi  = tzlib.array_find(sys.argv, [ "--dump" ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        dump                        = True
        count                       = True

    while True :
        oi  = tzlib.array_find(sys.argv, [ "--count" ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        count                       = True

    while True :
        oi  = tzlib.array_find(sys.argv, [ "--fark" ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        link_pages[fark_link_page]  = a_link_page_regx(fark_regx, True)

    while True :
        oi  = tzlib.array_find(sys.argv, [ "--google_news" ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        link_pages[gn_link_page]    = a_link_page_regx(gn_regx, True)

    while True :
        oi  = tzlib.array_find(sys.argv, [ "--timeout", '-t' ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        timeout                     = int(sys.argv.pop(oi))

    while True :
        oi  = tzlib.array_find(sys.argv, [ "--max_per_site" ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        max_per_site                = int(sys.argv.pop(oi))

    while True :
        oi  = tzlib.array_find(sys.argv, [ "--max_age" ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        max_age                     = float(sys.argv.pop(oi)) * 24.0 * 60.0 * 60.0

    while True :
        oi  = tzlib.array_find(sys.argv, [ "--link_page" ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        link_pages[sys.argv.pop(oi)]    = a_link_page_regx(link_regx, False)

    while True :
        oi  = tzlib.array_find(sys.argv, [ "--ignore" ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        regx                        = sys.argv.pop(oi)
        ignores[regx]               = re.compile(regx, re.DOTALL | re.IGNORECASE)


    if  len(sys.argv) > 1 :
        print "Please tell me just a url memory file name!"
        sys.exit(101)


    urls    = []
    for url in link_pages.keys() :

        ua  = get_urls(url, link_pages[url], timeout)

        if  ua == None :
            print "Did not load:", url
        elif not ua :
            print "No URLs from:", url
        else :
            for u in ua :
                for rx in ignores.values() :
                    if  rx.search(u) :
                        u   = None
                        break
                    pass
                if  u :
                    urls.append(u)
                pass
            pass
        pass

    if  not len(sys.argv) :
        print ";", time.asctime(time.localtime())
        print
        for url in urls :
            print url

        if  count :
            print "; URL count: %u." % ( len(urls) )
        pass
    else :
        when    = time.time()
        mem     = {}                                    # arrays of a_url, keyed by host name

        fn      = sys.argv.pop(0)
        if  os.path.isfile(fn) :
            fi  = open(fn, "rb")
            try :
                mem = cPickle.load(fi)
            except cPickle.PickleError :
                print "Cannot read %s!" % fn
                sys.exit(111)
            fi.close()

        ocnt        = count_mem_urls(mem)

        for url in urls :                               # add the new urls to the memory
            host    = urlparse.urlsplit(url)[1]
            if  not mem.has_key(host) :
                mem[host]   = []
            mem[host]       = [ m for m in mem[host] if m.url != url ]                      # replace old instances of the same url by deleting the old one
            mem[host].append(a_url(when, url))

        for host in mem.keys() :
            if  max_age :
                mem[host]   = [ m for m in mem[host] if when - m.when < max_age ]

            mem[host].sort()
            mem[host]       = mem[host][-max_per_site:]                                     # strip too-many urls


        tfn = fn + ".tmp"
        fo  = open(tfn, "wb")
        cPickle.dump(mem, fo)
        fo.close()

        replace_file.replace_file(fn, tfn, fn + ".bak")

        if  dump :
            print_memory(mem)
        if  count :
            tcnt        = count_mem_urls(mem)
            print "; Total %u, was %u, updated %u." % ( tcnt, ocnt, len(urls) )
        if  dump :
            print "; eof"
        pass

    pass


#
#
#
# eof

