#!/usr/bin/python

# url_getter.py
#       --copyright--                   Copyright 2007 (C) Tranzoa, Co. All rights reserved.    Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality.
#       --url--                         http://www.tranzoa.net/tzpython/
#       --email--                       pycode is the name to send to. tranzoa.com is the place to send to.
#       --bodstamps--
#       August 4, 2003          bar     spun off from another script
#       March 29, 2004          bar     comment
#       July 14, 2004           bar     url_open_read_with_timeout()
#       October 22, 2004        bar     allow explicit None in timeout value
#                                       MemoryError and call to gc
#       March 20, 2005          bar     allow arrays of urls to be given and gotten from url_open_read_with_timeout()
#       May 2, 2005             bar     able to return headers from url_open_read_with_timeout()
#       June 30, 2005           bar     AttributeError
#       December 22, 2006       bar     stop_it
#       March 8, 2007           bar     try to avoid a spurious Control Break exception, in effect, during program abort
#       November 18, 2007       bar     turn on doxygen
#       November 20, 2007       bar     comments
#       November 27, 2007       bar     insert boilerplate copyright
#       December 1, 2007        bar     use tzlib's elapsed_time for timing
#       December 25, 2007       bar     maybe have HTTPException right
#       May 17, 2008            bar     email adr
#       May 31, 2008            bar     SHOW_HITS
#       May 6, 2009             bar     show_info param and note about https
#       May 14, 2009            bar     handle a ValueError from httplib in getter.read
#       October 26, 2010        bar     handle program shutdown situation, though the problem shouldn't happen, really, if the caller joins us
#       April 18, 2011          bar     add a couple of exceptions
#       November 29, 2011       bar     pyflake cleanup
#       --eodstamps--
##      \file
#
#
#       Threaded URL getter to a file.
#
#       And timeout get URL.
#
#


import  copy
import  gc
import  httplib
import  os.path
import  socket
import  time
from    threading               import  *
from    types                   import  ListType, TupleType
import  urllib2

import  tzlib




SHOW_HITS   = False         # True makes the getters in this file print each URL just before opening it



class a_url_getter(Thread) :
    """

        Back-calling a given object's routine named '_get_url_and_file_name()', we find a 'url' to get to a 'file_name'.

        When we have downloaded the URL, we call the same object's '_got_file_from_url()'.

        We are triggered to do this through our 'trigger' Event.

        TODO:
            Get control of the download.
                Allow it to be aborted gracefully.
                Allow it to resume operation on partial downloads.

            Should this be a simple, one-shot task that just calls the Python function to copy a URL's contents to a file?

    """

    def __init__(me, thang, show_info = False) :
        """ Constructor. """

        Thread.__init__(me)

        me.thang        = thang
        me._show_info   = show_info

        me.getter       = None

        me.stop         = False;
        me.lock         = RLock()
        me.trigger      = Event()



    def run(me) :
        """ Owner object called start on us. Do the thread. """

        if  me._show_info : print "URL running"

        me.trigger.set()

        while   not me.stop :

            try :
                while not me.stop :
                    (url, file_name) = me.thang._get_url_and_file_name()

                    if  me.stop     :       break

                    if  url == None :       break;

                    if  me._show_info :     print "Getting URL",    url, "to", file_name


                    #################################################################
                                                                                    #
                    def __errme(me, file_name) :                                    #
                        if  me._show_info : print "Get to", file_name, "failed"     #
                        if  os.path.isfile(file_name) :                             #
                            try :                                                   #
                                os.remove( file_name)                               #
                            except OSError, (errno, strerror) :                     #
                                pass                                                #
                            except IOError, (errno, strerror) :                     #
                                pass                                                #
                                                                                    #
                            pass                                                    #
                                                                                    #
                        return(False)                                               #
                                                                                    #
                    #################################################################


                    ok    = True

                    try :
                        if  SHOW_HITS :
                            print "with timeout", me.url
                        me.getter   = urllib2.urlopen(url)          # urllib2 seems to pick up on 404's, which urllib doesn't appear to do
                        s = me.getter.read()
                        me.getter   = None

                        f = open(file_name + ".tmp", "wb")
                        f.write(s)
                        f.close()
                        os.rename(file_name + ".tmp", file_name)

                        if  me._show_info : print "Got URL:", url, "to", file_name
                        pass
                    except socket.error :
                        ok = __errme(me, file_name)
                    except socket.herror :
                        ok = __errme(me, file_name)
                    except socket.gaierror :
                        ok = __errme(me, file_name)
                    except         OSError, (errno, strerror) :
                        ok = __errme(me, file_name)
                    except         IOError :                        # HTTPError is raised by urllib2 on 404's. But it doesn't come with info - and somehow IOError gets triggered ok
                        ok = __errme(me, file_name)
                    except     MemoryError :                        # we've run out of memory during the read or what?
                        ok = __errme(me, file_name)
                    except  AttributeError :                        # something is very wrong with the server
                        ok = __errme(me, file_name)
                    except urllib2.URLError :
                        ok = __errme(me, file_name)
                    except urllib2.HTTPError :
                        ok = __errme(me, file_name)
                    except httplib.HTTPException :
                        ok = __errme(me, file_name)
                    except ValueError :                             # see same exception, below
                        ok = __errme(me, file_name)

                    me.getter   = None


                    if  hasattr(me.thang, '_got_file_from_url') :
                        me.thang._got_file_from_url(file_name, ok)
                    pass


                if  me._show_info : print "URL waiting"

                if  me.stop :   break

                me.trigger.wait()                                   # wait for a trigger set so that we can do our thing
                me.trigger.clear()
                if  me._show_info : print "URL get restart"
            except :
                if  hasattr(me.thang, 'dir') :
                    open(os.path.join(me.thang.dir, "urlgetexcept.txt"), "w").write("urlget except" + time.asctime())
                raise
            pass

        if  me._show_info : print "URL ended"
        pass


    def stop_it(me) :
        """ Try to stop us. Effect is not immediate. """

        me.stop = True

        me.lock.acquire()
        if  me.getter :
            me.getter.close()
        me.lock.release()

        pass





    def show_info(me, how = None) :
        """ Set/get whether we should print what is happening to sys.stdout. """

        retval = me._show_info

        if  how != None :
            if  how :
                me._show_info = True
            else :
                me._show_info = False
            pass

        return(retval)



    pass            # a_url_getter



def url_open_read_with_timeout(url, timeout = 180.0, threads = 8, with_headers = False, show_info = False) :
    """
        Get a URL contents to a string with a timeout.

        This routine can take an array of strings,
          in which case it will get the results in parallel
          and return an array of results.

        If 'with_headers' is True, then each results will be
          returned as two element arrays:
            [0] is the results,
            [1] is the headers' hash.
    """


    if  threads == None :
        threads  = 8
    threads      = max(1, threads)

    if  timeout == None :
        timeout  = 180.0

    isarray      = True
    urls         = url
    if  (not isinstance(urls, ListType)) and (not isinstance(urls, TupleType)) :
        urls     = [ url ]
        isarray  = False



    class a_url_thread(Thread) :
        """ Class of a single thread to get a URL's object. """

        def __init__(me, url) :
            """ Constructor. """

            Thread.__init__(me)

            me.url          = url
            me.contents     = None
            me.getter       = None

            me.lock         = RLock()



        def run(me) :
            """ Owner object called start on us. Do the thread. """

            try :
                try :
                    if  SHOW_HITS :
                        print "with timeout", me.url
                    me.getter   = urllib2.urlopen(me.url)       # urllib2 seems to pick up on 404's, which urllib doesn't appear to do
                    c = me.getter.read()
                    me.lock.acquire()
                    me.contents = c
                    me.headers  = me.getter.info()
                    me.getter   = None
                    me.lock.release()
                except socket.error,    msg :
                    if  show_info :
                        print "socket.error", me.url, msg
                    me.contents = None
                except socket.herror,   msg :
                    if  show_info :
                        print "socket.herror", me.url, msg
                    me.contents = None
                except socket.gaierror, msg :
                    if  show_info :
                        print "socket.gaierror", me.url, msg
                    me.contents = None
                except         OSError, (errno, strerror) :
                    if  show_info :
                        print "OSError", me.url, errno, strerror
                    me.contents = None
                except         IOError :                        # HTTPError is raised by urllib2 on 404's. But it doesn't come with info - and somehow IOError gets triggered ok
                    if  show_info :
                        print "IOError", me.url
                        # tzlib.print_exception()               # note: https: requires C:\Python??\DLLs\_ssl.pyd
                    me.contents = None
                except     MemoryError :                        # we've run out of memory during the read or what?
                    if  show_info :
                        print "MemoryError", me.url
                    me.contents = None
                except  AttributeError :                        # something is very wrong with the server
                    if  show_info :
                        print "AttributeError", me.url
                    me.contents = None
                except urllib2.URLError, msg :
                    if  show_info :
                        print "urllib2.URLError", me.url, msg
                    me.contents = None
                except urllib2.HTTPError, msg :
                    if  show_info :
                        print "urllib2.HTTPError", me.url, msg
                    me.contents = None
                except httplib.HTTPException, msg :
                    if  show_info :
                        print "httplib.HTTPException", me.url, msg
                    me.contents = None
                except ValueError :
                    if  show_info :
                        print "ValueError", me.url              # _read_chunked int(line, 16) in httplib.py ?
                    me.contents = None
                pass
            except AttributeError :                             # program shut-down
                if  socket :
                    raise
                pass

            me.getter   = None

            pass

        pass    # a_url_thread




    threads     = min(threads, len(urls))

    urls        = map(lambda url : [ url, 0.0, None, None, {} ], urls)
    dis         = range(0, len(urls))

    gcnt        = 0
    dcnt        = 0

    while   dcnt < len(urls) :

        #
        #   Fire off any fetches we can make a thread for
        #
        while (gcnt < threads) and (len(dis) > 0) :
            i       = dis.pop(0)
            u       = urls[i]

            if  u[0] == None :
                dcnt += 1                                   # well, that was fast. since he asked for nothing, he gets nothing in return
            else :
                getter  = a_url_thread(u[0])
                getter.setDaemon(True)                      # so that we can kick out of the program while the thread is running
                getter.start()
                u[1]    = tzlib.elapsed_time()
                u[2]    = getter

                # print "getting", u[0]

                gcnt += 1
            pass

        try :
            time.sleep(0.01)
        except AttributeError :
            return(None)                                    # program is probably being aborted


        #
        #   Find finished threads
        #
        cto = True                                          # check timeout on at least one of 'em
        for gi in range(0, len(urls)) :
            u       = urls[gi]

            getter  = u[2]
            if  getter     != None :

                if  len(urls) == 1 :
                    try :
                        getter.join(timeout)                # special-case a single url fetch so that timeout values are more accurate than the unix time function gives
                    except TypeError :
                        return(None)                        # program is probably being aborted
                    pass

                try :
                    getter.lock.acquire()

                    hdrs        = {}
                    r           = getter.contents
                    if  r      != None :
                        r       = "" + r

                        if  with_headers :
                            hdrs = copy.deepcopy(getter.headers)
                        pass

                    getter.lock.release()
                except TypeError :
                    gcnt    = len(urls)
                    return(None)                            # program is probably being aborted

                if  (r != None) or (cto and (tzlib.elapsed_time() - u[1] >= timeout)) or (len(urls) == 1) :      # too bad this isn't elapsed time, but there it is
                    if  len(urls)  != 1 :
                        try :
                            getter.join(0.0)                # special-case a single url fetch so that timeout values are more accurate than the unix time function gives
                        except TypeError :
                            return(None)                    # program is probably being aborted
                        pass

                    u[2]    = None
                    gcnt   -= 1

                    del(getter)
                    getter  = None

                    u[3]    = r                             # the results (or None, if it's a timeout) are now stored
                    u[4]    = hdrs                          # and the HTTP headers, if any and if they are desired are now stored
                    dcnt   += 1

                    # print "  got", u[0]

                else :
                    cto     = False                         # we've checked one timeout, so the rest aren't timed out (well, if they are, it's right now), 'cause they started later than this one

                pass

            pass

        pass

    gc.collect()                                            # the only way to clean up the threads

    if  with_headers :
        retval = map(lambda u : [ u[3], u[4] ], urls)       # snag results and headers
    else :
        retval = map(lambda u : u[3], urls)                 # snag all the results, which have been stored in the [3] element ([0]==url [1]=time [2]=getter-when-actively-retrieving [3]=results [4]=headers, if wanted

    if  not isarray :
        retval = retval[0]                                  # return the single request's results

    return(retval)




#
#
#
if __name__ == '__main__' :
    import  sys
    import  re

    import  TZCommandLineAtFile

    del(sys.argv[0])

    TZCommandLineAtFile.expand_at_sign_command_line_files(sys.argv)

    if  len(sys.argv) != 2 :

        print "Tell me a URL and file name"

    elif re.match(r"^[\d\.]+$", sys.argv[1]) :

        s = url_open_read_with_timeout(sys.argv[0], float(sys.argv[1]));

        print s

    else :

        class _a_thang :
            def __init__(me, url, file_name) :
                """ Constructor. """
                me.url       = url
                me.file_name = file_name
                me.stop      = False

            def _get_url_and_file_name(me) :
                url       = me.url
                file_name = me.file_name
                me.url    = me.file_name = None
                return( ( url, file_name ) )


            def _got_file_from_url(me, file_name, ok = True) :
                print "Got", file_name
                me.stop   = True




        me     = _a_thang(sys.argv[0], sys.argv[1])

        getter = a_url_getter(me, show_info = True)
        getter.setDaemon(True)                                     # so that we can kick out of the program while the thread is running

        getter.start()

        getter.trigger.set()

        while not me.stop :
            time.sleep(0.1)

        getter.lock.acquire()
        getter.stop = True
        getter.trigger.set()
        getter.lock.release()

        getter.trigger.set()
        getter.join(0.35)

        print "Bye, bye"


    pass



##      Public things.
#       Python only pays attention to the lower case name, '__all__', when it does a 'from ... import *'.
__all__ = [
            'a_url_getter',
            'url_open_read_with_timeout',
          ]
__ALL__ = __all__           # the upper case name this file has always had, kept in case anything reads it




#
#
#
# eof

