#!/usr/bin/python

# bunchalinks.py
#       --copyright--                   Copyright 2008 (C) Tranzoa, Co. All rights reserved.    Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality.
#       --url--                         http://www.tranzoa.net/tzpython/
#       --email--                       pycode is the name to send to. tranzoa.com is the place to send to.
#       --bodstamps--
#       May 1, 2008             bar
#       May 17, 2008            bar     email adr
#       August 29, 2008         bar     basestring instead of StringType because of unicode strings and others
#       May 27, 2012            bar     doxygen namespace
#       February 23, 2023       bar     get rid of has_keys
#       March 2, 2023           bar     python3
#       March 4, 2023           bar     better url sort
#       --eodstamps--
##      \file
#       \namespace              tzpython.bunchalinks
#
#
#       Get a gob of links from given web pages that have 'em and keep a record of the latest ones snagged.
#
#       In practice, we'll grab 'em from places like Fark and Google news.
#
#

from    __future__  import  print_function

import  re
import  time

try :
    import  urllib.request  as  urllib2
except ImportError              :
    import                      urllib2

try :
    from    urllib  import  parse   as  urlparse
except ImportError                      :
    import                              urlparse

try :
    import  urllib.parse    as  urllib      # python3 for urllib.unquote()
except  ImportError :
    import                      urllib

try :
        basestring
except  NameError :
        basestring  = str

try :
    cmp
except NameError :
    cmp     = lambda x, y : (x > y) - (x < y)

import  tzlib
import  url_getter


#   Build an opener, empty its default header list, and install it as the module-wide default opener.
#   This runs at import time, so it affects every later use of the default opener in this process.
opener              = urllib2.build_opener()
opener.addheaders   = []                        # get rid of 'User-agent' the only way that seems to work (yes, I tried lower-casing 'Agent')
urllib2.install_opener(opener)


class   a_url(object) :
    """
        One remembered link: the URL string and the time it was (last) snagged.

        me.when     time stamp stored with the link (the script stores time.time() values)
        me.url      the link itself

        NOTE(review):   The two orderings below do not agree.
                        __cmp__ compares 'when' first and then the URL.
                        __lt__  compares the URL first and then 'when'.
                        Confirm which order callers that sort these objects are meant to get.
    """

    def __init__(me, when, url) :
        me.url  = url
        me.when = when

    def __cmp__(me, om) :
        """
            Three-way compare: by time stamp, then by upper-cased URL, then by the exact URL.
        """
        result  = cmp(me.when, om.when)
        if  result :
            return(result)

        result  = cmp(me.url.upper(), om.url.upper())
        if  result :
            return(result)

        return(cmp(me.url, om.url))

    def __lt__(me, om) :
        """
            Less-than: by exact URL, with the time stamp breaking ties between equal URLs.
        """
        if  me.url < om.url :
            return(True)
        if  me.url == om.url :
            return(me.when < om.when)
        return(False)


class   a_link_page_regx(object) :
    """
        How to pull links out of one link page.

        me.regx     compiled regular expression used to find the links.
                    A pattern given as a string is compiled with DOTALL and IGNORECASE.
                    Anything else is stored as given.
        me.unquote  flag stored as given (get_urls() reads it to decide on an extra unquote pass)
    """

    def __init__(me, s, unquote) :
        if  isinstance(s, basestring) :
            s       = re.compile(s, re.IGNORECASE | re.DOTALL)

        me.regx     = s
        me.unquote  = unquote


def find_urls(htm, regx) :
    """
        Given an HTML string, return an array of links.

        htm     the page text to search. May be empty or None.
        regx    compiled regular expression whose findall() results are the links.

        Returns a list - always.
        It is empty when htm is empty/None or when the regx finds nothing,
        so callers get one type to deal with.
    """

    if  not htm :
        return([])                  # nothing to search (and regx.findall(None) would raise)

    return(regx.findall(htm))       # findall() already returns [] when there are no matches


def get_link_page(url, timeout = None) :
    """
        Fetch the page at the given URL.

        url         where to get the page. If empty/None, it is handed straight back and nothing is fetched.
        timeout     passed through to url_getter.url_open_read_with_timeout().

        The request carries an explicit browser-like 'User-Agent' header.

        Returns whatever tzlib.convert_to_unicode() makes of the fetched data.
        A falsy fetch result is turned in to None before it is converted.
    """

    if  url :
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.5) Gecko/20031007')

        page    = url_getter.url_open_read_with_timeout(request, timeout) or None
        return(tzlib.convert_to_unicode(page))

    return(url)


def get_urls(site, lp_regx,  timeout = None) :
    """
        Load the given site's page and pull the links out of it.

        site        URL of the link page. Found links are urljoin'ed against it.
        lp_regx     object with a .regx (compiled regular expression) and an .unquote flag.
        timeout     passed through to get_link_page().

        Returns a list of link URLs,
        or [] if the page loaded but the regx found nothing,
        or None if the page did not load.

        NOTE(review):   Every link is unquoted once after the urljoin.
                        When lp_regx.unquote is set, it is unquoted a second time.
                        Confirm the join-before-unquote order is right for links captured in %-escaped form.
    """

    page        = get_link_page(site, timeout = timeout) or None
    found       = find_urls(page, lp_regx.regx)

    if  not found :
        if  page :
            return([])                  # page loaded, no links in it
        return(None)                    # page did not load

    links       = []
    for link in found :
        link    = urllib.unquote(urlparse.urljoin(site, link))
        if  lp_regx.unquote :
            link    = urllib.unquote(link)
        links.append(link)

    return(links)


def print_memory(mem) :
    """
        Print the whole link memory to stdout.

        mem     dictionary of arrays of items with .when and .url, keyed by host name.

        Output is a ';' line with the current local time, a blank line,
        then for each host (in sorted order) a '; Host' line followed by one line per link
        giving the link's local time and URL, and finally a blank line.
    """

    hosts   = sorted(mem.keys())

    print(";", time.asctime(time.localtime()))
    print('')

    for host in hosts :
        print("; Host", host)
        for entry in mem[host] :
            stamp   = time.asctime(time.localtime(entry.when))
            print(" ", stamp, entry.url)

    print('')


def count_mem_urls(mem) :
    """
        Return the total number of links in the memory, summed over all hosts.

        mem     dictionary of arrays, keyed by host name.
    """

    return(sum(len(links) for links in mem.values()))


#   Usage text. The script prints this and exits when given --help, -h, /h, /? or -?.
help_str    = """
python bunchalinks link_memory_file_name    Keep a file containing lots of links.

  Options:

    --max_per_site  n           Keep only a max of the given number of links per link-target site (default: 10).
    --max_age       days        Keep links only for this long, max (default: forever).
    --google_news               Get links from Google news.
    --fark                      Get links from Fark.
    --link_page     url         Get links from the given URL.   (multiple --link_page options allowed)
    --ignore        regx        Ignore matching link URLs.      (multiple --ignore    options allowed)
    --timeout       seconds     How long to wait for link pages.
    --dump                      Print all URLs from memory (after any updates - implies --count)
    --count                     Print the number of new/updated and total URLs in memory.
"""

#
#
#
if __name__ == '__main__' :
    #
    #   Command line driver.
    #
    #       Parse the options, get the links from every requested link page, drop the ignored ones, and then either
    #           print the links                                     (no memory file name given), or
    #           merge them in to the pickled, per-host link memory  (memory file name given).
    #
    import  os
    import  sys

    import  TZCommandLineAtFile


    del(sys.argv[0])
    TZCommandLineAtFile.expand_at_sign_command_line_files(sys.argv)     # presumably expands @file arguments in place - see TZCommandLineAtFile

    gn_link_page    = "http://news.google.com/?output=rss"
    gn_regx         = re.compile(r"<link>http://news\.google\.com/news/url\?.*?&amp;url=([^&]+)&amp;cid=",              re.DOTALL | re.IGNORECASE)

    fark_link_page  = "http://www.fark.com"
    fark_regx       = re.compile(r"<a href\s*=\s*\"http://go\.fark\.com/cgi/fark/go\.pl\?i=[^&]+&amp;l=([^\"&>\r\n]+)", re.DOTALL | re.IGNORECASE)

    link_regx       = re.compile(r"<a [^h>]*href\s*=\s*[\'\"](http://.+?)[\'\"][\s>]",                                  re.DOTALL | re.IGNORECASE)

    link_pages      = {}                                # a_link_page_regx objects, keyed by link page URL

    #
    #   Links that point back in to the link sites themselves are never kept.
    #   The sub-domain part needs the '+' - without it only one-character sub-domains match.
    #   The dots in the domain names are escaped so they match only real dots.
    #
    ignores         = {}                                # compiled regexs, keyed by their pattern strings
    for pattern in (
                    r"^http://[^\.]+\.google\.com\b",
                    r"^http://[^\.]+\.fark\.com\b",
                   ) :
        ignores[pattern]    = re.compile(pattern, re.DOTALL | re.IGNORECASE)

    max_per_site    = 10
    max_age         = 0                                 # seconds. 0 means keep links forever
    timeout         = None
    dump            = False
    count           = False


    if  tzlib.array_find(sys.argv, [ "--help", '-h', '/h', '/?', '-?' ] ) >= 0 :
        print(help_str)
        sys.exit(254)


    while True :
        oi  = tzlib.array_find(sys.argv, [ "--dump" ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        dump                        = True
        count                       = True

    while True :
        oi  = tzlib.array_find(sys.argv, [ "--count" ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        count                       = True

    while True :
        oi  = tzlib.array_find(sys.argv, [ "--fark" ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        link_pages[fark_link_page]  = a_link_page_regx(fark_regx, True)

    while True :
        oi  = tzlib.array_find(sys.argv, [ "--google_news" ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        link_pages[gn_link_page]    = a_link_page_regx(gn_regx, True)

    #
    #   Options with a value: the option is deleted and its value is then popped from the same index.
    #
    while True :
        oi  = tzlib.array_find(sys.argv, [ "--timeout", '-t' ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        timeout                     = int(sys.argv.pop(oi))

    while True :
        oi  = tzlib.array_find(sys.argv, [ "--max_per_site" ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        max_per_site                = int(sys.argv.pop(oi))

    while True :
        oi  = tzlib.array_find(sys.argv, [ "--max_age" ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        max_age                     = float(sys.argv.pop(oi)) * 24.0 * 60.0 * 60.0     # days to seconds

    while True :
        oi  = tzlib.array_find(sys.argv, [ "--link_page" ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        link_pages[sys.argv.pop(oi)]    = a_link_page_regx(link_regx, False)

    while True :
        oi  = tzlib.array_find(sys.argv, [ "--ignore" ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        regx                        = sys.argv.pop(oi)
        ignores[regx]               = re.compile(regx, re.DOTALL | re.IGNORECASE)


    if  len(sys.argv) > 1 :
        print("Please tell me just a url memory/.pickle file name!")
        sys.exit(101)


    #
    #   Get the links from all the link pages, leaving out the ignored ones.
    #
    urls    = []
    for url in link_pages.keys() :

        ua  = get_urls(url, link_pages[url], timeout)

        if  ua is None :
            print("Did not load:", url)
        elif not ua :
            print("No URLs from:", url)
        else :
            for u in ua :
                if  u and not any(rx.search(u) for rx in ignores.values()) :
                    urls.append(u)
                pass
            pass
        pass

    if  not len(sys.argv) :
        #
        #   No memory file: just list what was found.
        #
        print(";", time.asctime(time.localtime()))
        print('')
        for url in urls :
            print(url)

        if  count :
            print("; URL count: %u." % ( len(urls) ))
        pass
    else :
        when    = time.time()
        mem     = {}                                    # arrays of a_url, keyed by host name

        fn      = sys.argv.pop(0)
        if  os.path.isfile(fn) :
            #   NOTE(review): presumably this unpickles the file, so the memory file must come from a trusted place.
            mem = tzlib.unpickle_file(fn)
            if  (not mem) and (not isinstance(mem, dict)) :     # an empty dictionary is a readable, but empty, memory - not a read failure
                print("Cannot read %s!" % fn)
                sys.exit(111)
            pass

        ocnt        = count_mem_urls(mem)

        for url in urls :                               # add the new urls to the memory
            host    = urlparse.urlsplit(url)[1]
            if  not (host in mem) :
                mem[host]   = []
            mem[host]       = [ m for m in mem[host] if m.url != url ]                      # replace old instances of the same url by deleting the old one
            mem[host].append(a_url(when, url))

        for host in mem.keys() :
            if  max_age :
                mem[host]   = [ m for m in mem[host] if when - m.when < max_age ]

            #
            #   Strip too-many urls, keeping the most recently snagged ones.
            #   The newest are picked with an explicit time-first key because a_url's own '<' compares the URL first,
            #   which would keep the alphabetically last links instead of the latest ones.
            #   (A max_per_site of 0 keeps everything, as [-0:] is the whole list.)
            #
            newest          = sorted(mem[host], key = lambda m : ( m.when, m.url.upper(), m.url ))[-max_per_site:]
            newest.sort()                                                                   # stored (and dumped) in a_url's own order
            mem[host]       = newest


        tzlib.pickle_file(fn, mem, tzlib.BEST_PICKLE_PROTOCOL)

        if  dump :
            print_memory(mem)
        if  count :
            tcnt        = count_mem_urls(mem)
            print("; Total %u, was %u, updated %u." % ( tcnt, ocnt, len(urls) ))
        if  dump :
            print("; eof")
        pass

    pass


#
#
#
# eof