#!/usr/bin/python

# url_getter.py
#       --copyright--                   Copyright 2007 (C) Tranzoa, Co. All rights reserved.    Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality.
#       --url--                         http://www.tranzoa.net/tzpython/
#       --email--                       pycode is the name to send to. tranzoa.com is the place to send to.
#       --bodstamps--
#       August 4, 2003          bar     spun off from another script
#       March 29, 2004          bar     comment
#       July 14, 2004           bar     url_open_read_with_timeout()
#       October 22, 2004        bar     allow explicit None in timeout value
#                                       MemoryError and call to gc
#       March 20, 2005          bar     allow arrays of urls to be given and gotten from url_open_read_with_timeout()
#       May 2, 2005             bar     able to return headers from url_open_read_with_timeout()
#       June 30, 2005           bar     AttributeError
#       December 22, 2006       bar     stop_it
#       March 8, 2007           bar     try to avoid a spurious Control Break exception, in effect, during program abort
#       November 18, 2007       bar     turn on doxygen
#       November 20, 2007       bar     comments
#       November 27, 2007       bar     insert boilerplate copyright
#       December 1, 2007        bar     use tzlib's elapsed_time for timing
#       December 25, 2007       bar     maybe have HTTPException right
#       May 17, 2008            bar     email adr
#       May 31, 2008            bar     SHOW_HITS
#       May 6, 2009             bar     show_info param and note about https
#       May 14, 2009            bar     handle a ValueError from httplib in getter.read
#       October 26, 2010        bar     handle program shutdown situation, though the problem shouldn't happen, really, if the caller joins us
#       April 18, 2011          bar     add a couple of exceptions
#       November 29, 2011       bar     pyflake cleanup
#       May 27, 2012            bar     doxygen namespace
#       June 10, 2012           bar     don't use time.time() defaulted
#       August 12, 2012         bar     put name in thread
#       January 19, 2013        bar     clean up some
#       February 19, 2013       bar     epydoc indentation corrected
#       May 28, 2014            bar     put thread id in threads
#       January 18, 2016        bar     except syntax change
#       July 7, 2018            bar     pyflakes
#       February 27, 2023       bar     python3
#       --eodstamps--
##      \file
#       \namespace              tzpython.url_getter
#
#
#       Threaded URL getter to a file.
#
#       And timeout get URL.
#
#


from    __future__  import  print_function

import  copy
import  gc
try :
    import  http.client as  httplib
except ImportError          :
    import                  httplib
import  os.path
import  socket
import  threading
import  time
try :
    import  urllib.request  as  urllib2
except ImportError              :
    import                      urllib2

import  tzlib


#   Debugging aid: when True, the fetch code prints a "with timeout ..." line just before it opens a URL.
SHOW_HITS   = False


class a_url_getter(threading.Thread) :
    """
        Worker thread that downloads URLs to files on behalf of an owner object ('thang').

        Back-calling the owner's '_get_url_and_file_name()', we find a 'url' to get to a 'file_name'.
        A 'url' of None means there is nothing (more) to get right now.

        When a download attempt has finished, whether it worked or not, we call the owner's
        '_got_file_from_url(file_name, ok)' - if the owner has such a routine.

        After running out of work we sleep until our 'trigger' threading.Event is set.

        If the thread dies from an unexpected exception and the owner has a 'dir' attribute,
        a marker file named "urlgetexcept.txt" is written to that directory before the exception is re-raised.

        TODO:
            - Get control of the download.
                - Allow it to be aborted gracefully.
                - Allow it to resume operation on partial downloads.
            - Should this be a simple, one-shot task that just calls the Python function to copy a URL's contents to a file?

    """

    #   Exceptions that mean "this one download failed" (as opposed to "this thread is broken").
    #       IOError/OSError   - includes what urllib2 raises on 404's and the like
    #       MemoryError       - ran out of memory during the read
    #       AttributeError    - something is very wrong with the server
    #       ValueError        - httplib can raise this from inside read()
    _GET_ERRORS     = (
                        socket.error,
                        socket.herror,
                        socket.gaierror,
                        OSError,
                        IOError,
                        MemoryError,
                        AttributeError,
                        urllib2.URLError,
                        urllib2.HTTPError,
                        httplib.HTTPException,
                        ValueError,
                      )


    def __init__(me, thang, show_info = False) :
        """
            Constructor.

            'thang'     is the owner object that hands us work and is told about results.
            'show_info' tells whether to print progress messages.
        """

        threading.Thread.__init__(me, name = __file__ + '.a_url_getter')

        me.tid          = None                  # thread id, known once the thread is running
        me.thang        = thang
        me._show_info   = show_info

        me.getter       = None                  # the open urllib2 response while a download is active

        me.stop         = False
        me.lock         = threading.RLock()
        me.trigger      = threading.Event()


    def _forget_failed_file(me, file_name) :
        """ A download to 'file_name' failed. Say so, if asked to, and remove 'file_name' if it exists. """

        if  me._show_info : print("Get to", file_name, "failed")

        if  os.path.isfile(file_name) :
            try :
                os.remove(file_name)
            except (OSError, IOError) :
                pass                            # best effort - the file may be in use or already gone
            pass
        pass


    def _fetch_to_file(me, url, file_name) :
        """
            Download 'url' to 'file_name' by way of a '.tmp' file that is renamed when fully written.

            Return True if the file was gotten, False if any of the expected failures happened.
        """

        tmp_name    = file_name + ".tmp"

        try :
            if  SHOW_HITS :
                print("with timeout", url)

            getter      = urllib2.urlopen(url)  # urllib2 seems to pick up on 404's, which urllib doesn't appear to do
            me.getter   = getter                # let stop_it() see (and close) the active download
            s           = getter.read()
            me.getter   = None

            with open(tmp_name, "wb") as f :
                f.write(s)
            os.rename(tmp_name, file_name)

            if  me._show_info : print("Got URL:", url, "to", file_name)

            return(True)

        except me._GET_ERRORS :
            me._forget_failed_file(file_name)
            return(False)

        finally :
            me.getter   = None


    def run(me) :
        """ Owner object called start on us. Do the thread. """

        me.tid          = tzlib.get_tid()
        if  me._show_info : print("URL running")

        me.trigger.set()

        while   not me.stop :

            try :
                while not me.stop :
                    (url, file_name) = me.thang._get_url_and_file_name()

                    if  me.stop or (url is None) :
                        break                   # told to quit, or the owner has nothing for us to do

                    if  me._show_info :     print("Getting URL",    url, "to", file_name)

                    ok  = me._fetch_to_file(url, file_name)

                    if  hasattr(me.thang, '_got_file_from_url') :
                        me.thang._got_file_from_url(file_name, ok)
                    pass


                if  me._show_info : print("URL waiting")

                if  me.stop :   break

                me.trigger.wait()                                   # wait for a trigger set so that we can do our thing
                me.trigger.clear()
                if  me._show_info : print("URL get restart")
            except :
                #   Leave a marker for the owner to find, then let the exception kill the thread.
                if  hasattr(me.thang, 'dir') :
                    with open(os.path.join(me.thang.dir, "urlgetexcept.txt"), "w") as f :
                        f.write("urlget except" + time.asctime(time.localtime(time.time())))
                    pass
                raise
            pass

        if  me._show_info : print("URL ended")
        pass


    def stop_it(me) :
        """
            Try to stop us. Effect is not immediate.

            Sets the 'stop' flag and closes the active download, if there is one.
            The lock is released even if closing the download raises an exception.
        """

        me.stop = True

        with me.lock :
            getter  = me.getter                 # snapshot: the thread may set me.getter to None at any time
            if  getter is not None :
                getter.close()
            pass

        pass


    def show_info(me, how = None) :
        """
            Set/get whether we should print what is happening to sys.stdout.

            Return the setting as it was before this call.
            If 'how' is not None, its truth value becomes the new setting.
        """

        retval = me._show_info

        if  how is not None :
            me._show_info = bool(how)

        return(retval)


    pass            # a_url_getter


def url_open_read_with_timeout(url, timeout = 180.0, threads = 8, with_headers = False, show_info = False) :
    """
        Get a URL contents to a string with a timeout.

        This routine can take an array of strings,
        in which case it will get the results in parallel
        and return an array of results.

        If 'with_headers' is True, then each results will be returned as two element arrays:
            - [0] is the results,
            - [1] is the headers' hash.

        Parameters:
            - 'url'             a single URL or a list-ish collection of them. A None URL yields a None result.
            - 'timeout'         seconds allowed for each fetch. None means the default of 180.0.
            - 'threads'         maximum number of fetches in flight at once. None means the default of 8.
            - 'with_headers'    whether to return the HTTP headers with each result.
            - 'show_info'       whether to print a line describing each failed fetch.

        Returns:
            The content for a single URL, or a list of contents in the same order as the given URLs.
            A fetch that failed or timed out has None for its content (and {} for its headers).
            None is returned, instead, if the program appears to be shutting down under us.

        A fetch that times out is abandoned, not stopped. Its daemon thread is left to finish on its own.
    """


    if  threads is None :
        threads  = 8
    threads      = max(1, threads)

    if  timeout is None :
        timeout  = 180.0

    isarray      = True
    urls         = url
    if  not tzlib.is_listish(urls) :
        urls     = [ url ]
        isarray  = False


    class   a_hit(object) :
        """ The state of one requested URL. """
        def __init__(me, url) :
            me.url      = url           #: the url
            me.when     = 0.0           #: when the hit was started
            me.getter   = None          #: the a_url_thread while the fetch is in flight, otherwise None
            me.content  = None          #: None or a string with the returned content
            me.headers  = {}            #: the HTTP result headers
        #   a_hit


    class a_url_thread(threading.Thread) :
        """ Class of a single thread to get a URL's object. """

        def __init__(me, url) :
            """ Constructor. """

            threading.Thread.__init__(me, name = __file__ + '.a_url_thread')

            me.tid          = None
            me.url          = url
            me.contents     = None                              # set, under the lock, only when the whole read has worked
            me.headers      = None                              # set together with contents
            me.getter       = None

            me.daemon       = True                              # so that we can kick out of the program while the thread is running

            me.lock         = threading.RLock()


        def _failed(me, what, *details) :
            """ The fetch failed. Say so, if asked to, and make sure there is no result. """

            if  show_info :
                print(what, me.url, *details)
            me.contents = None


        def run(me)         :
            """ Owner object called start on us. Do the thread. """

            me.tid          = tzlib.get_tid()
            try :
                #
                #   The handlers run from the most specific exception to the most general one.
                #   Otherwise the OSError/IOError family members would hide their own sub-classes (URLError, HTTPError, ...).
                #   The socket handlers are first so that a wiped-out 'socket' module (program shut-down) is what trips the outer handler.
                #
                try :
                    if  SHOW_HITS :
                        print("with timeout", me.url)
                    me.getter   = urllib2.urlopen(me.url)       # urllib2 seems to pick up on 404's, which urllib doesn't appear to do
                    c           = me.getter.read()
                    h           = me.getter.info()              # gotten before the lock is taken so a failure here can't leave the lock held
                    with me.lock :
                        me.contents = c
                        me.headers  = h
                        me.getter   = None
                    pass
                except socket.gaierror          as msg :
                    me._failed("socket.gaierror", msg)
                except socket.herror            as msg :
                    me._failed("socket.herror", msg)
                except urllib2.HTTPError        as msg :        # raised by urllib2 on 404's and the like
                    me._failed("urllib2.HTTPError", msg)
                except urllib2.URLError         as msg :
                    me._failed("urllib2.URLError", msg)
                except socket.error             as msg :
                    me._failed("socket.error", msg)
                except         OSError          as e :
                    me._failed("OSError", e.errno, e.strerror)
                except         IOError :                        # note: https: requires C:\Python??\DLLs\_ssl.pyd
                    me._failed("IOError")
                except     MemoryError :                        # we've run out of memory during the read or what?
                    me._failed("MemoryError")
                except  AttributeError :                        # something is very wrong with the server
                    me._failed("AttributeError")
                except httplib.HTTPException    as msg :
                    me._failed("httplib.HTTPException", msg)
                except ValueError :                             # _read_chunked int(line, 16) in httplib.py ?
                    me._failed("ValueError")
                pass
            except AttributeError :                             # program shut-down
                if  socket :
                    raise
                pass

            me.getter   = None

        pass    # a_url_thread


    threads     = min(threads, len(urls))

    urls        = [ a_hit(u) for u in urls ]
    nxt         = 0                                         # index of the next url to start fetching

    gcnt        = 0                                         # how many fetches are in flight
    dcnt        = 0                                         # how many urls are done (gotten, failed, or timed out)

    while   dcnt < len(urls) :

        #
        #   Fire off any fetches we can make a thread for
        #
        while (gcnt < threads) and (nxt < len(urls)) :
            u       = urls[nxt]
            nxt    += 1

            if  u.url is None :
                dcnt   += 1                                 # well, that was fast. since he asked for nothing, he gets nothing in return
            else :
                getter      = a_url_thread(u.url)
                getter.start()
                u.when      = tzlib.elapsed_time()
                u.getter    = getter

                # print("getting", u.url)

                gcnt += 1
            pass

        try :
            time.sleep(0.01)
        except AttributeError :
            return(None)                                    # program is probably being aborted


        #
        #   Find finished threads
        #
        cto = True                                          # check timeout on at least one of 'em
        for u in urls :
            getter  = u.getter
            if  getter is None :
                continue                                    # not started yet, or already done

            if  len(urls) == 1 :
                try :
                    getter.join(timeout)                    # special-case a single url fetch so that timeout values are more accurate than the unix time function gives
                except TypeError :
                    return(None)                            # program is probably being aborted
                pass

            try :
                with getter.lock :                          # contents and headers are set together under this lock
                    hdrs        = {}
                    r           = getter.contents
                    if  r is not None :
                        if  not tzlib.is_stringish(r) :
                            r   = str(r)

                        if  with_headers :
                            hdrs = copy.deepcopy(getter.headers)
                        pass
                    pass
            except TypeError :
                return(None)                                # program is probably being aborted

            if  (r is not None) or (cto and (tzlib.elapsed_time() - u.when >= timeout)) or (len(urls) == 1) :
                if  len(urls)  != 1 :
                    try :
                        getter.join(0.0)                    # don't wait: the thread is either finished or being abandoned
                    except TypeError :
                        return(None)                        # program is probably being aborted
                    pass

                u.getter    = None
                gcnt       -= 1

                getter      = None                          # drop our reference to the thread

                u.content   = r                             # the results (or None, if it's a timeout) are now stored
                u.headers   = hdrs                          # and the HTTP headers, if any and if they are desired are now stored
                dcnt       += 1

                # print("  got", u.url)

            else :
                cto     = False                             # we've checked one timeout, so the rest aren't timed out (well, if they are, it's right now), 'cause they started later than this one

            pass

        pass

    gc.collect()                                            # the only way to clean up the threads

    if  with_headers :
        retval = [ [ u.content, u.headers ] for u in urls ] # snag results and headers
    else :
        retval = [   u.content              for u in urls ] # snag just the results

    if  not isarray :
        retval = retval[0]                                  # return the single request's results

    return(retval)


#
#
#
if  __name__ == '__main__' :
    #
    #   Command line test/demo:
    #
    #       url  seconds        get the URL with a timeout and print the contents
    #       url  file_name      get the URL to the file using an a_url_getter thread
    #
    import  sys
    import  re

    import  TZCommandLineAtFile

    del(sys.argv[0])

    TZCommandLineAtFile.expand_at_sign_command_line_files(sys.argv)


    #   The second argument is a timeout only if it both looks like a number and converts to one ("1.2.3" is a file name).
    timeout = None
    if  (len(sys.argv) == 2) and re.match(r"^[\d\.]+$", sys.argv[1]) :
        try :
            timeout = float(sys.argv[1])
        except ValueError :
            timeout = None


    if  len(sys.argv) != 2 :

        print("Tell me a URL and file name")

    elif timeout is not None :

        s   = url_open_read_with_timeout(sys.argv[0], timeout)
        s   = tzlib.convert_to_unicode(s)
        print(s)

    else :

        class _a_thang :
            """ Minimal owner object for an a_url_getter: hands out one URL/file name pair and notes when it has been gotten. """

            def __init__(me, url, file_name) :
                """ Constructor. """
                me.url       = url
                me.file_name = file_name
                me.stop      = False

            def _get_url_and_file_name(me) :
                """ Return the ( url, file_name ) to get - once. After that, return ( None, None ). """
                url       = me.url
                file_name = me.file_name
                me.url    = me.file_name = None
                return( ( url, file_name ) )


            def _got_file_from_url(me, file_name, ok = True) :
                """ The getter is done with the file, so the main loop can quit waiting. """
                print("Got", file_name)
                me.stop   = True


        me     = _a_thang(sys.argv[0], sys.argv[1])

        getter = a_url_getter(me, show_info = True)
        getter.daemon = True                                       # so that we can kick out of the program while the thread is running

        getter.start()

        getter.trigger.set()

        while not me.stop :
            time.sleep(0.1)

        with getter.lock :
            getter.stop = True
            getter.trigger.set()                                   # wake the thread so that it sees the stop flag

        getter.join(0.35)

        print("Bye, bye")


##      Public things.
#
#       NOTE(review): Python's "from module import *" consults a list named '__all__' (lower case).
#       This upper-case '__ALL__' is an ordinary module variable and does not restrict what a star-import brings in.
#       Confirm whether the spelling is intentional before changing it - renaming would change what star-imports export.
#
__ALL__ = [
            'a_url_getter',
            'url_open_read_with_timeout',
          ]


#
#
#
# eof