#!/usr/bin/python

# urls2dir.py
#       --copyright--                   Copyright 2007 (C) Tranzoa, Co. All rights reserved.    Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality.
#       --url--                         http://www.tranzoa.net/tzpython/
#       --email--                       pycode is the name to send to. tranzoa.com is the place to send to.
#       --bodstamps--
#       January 8, 2005         bar
#       June 30, 2005           bar     put the user-agent trick in the code
#       June 2, 2006            bar     extension options
#       November 18, 2007       bar     turn on doxygen
#       November 27, 2007       bar     insert boilerplate copyright
#       May 17, 2008            bar     email adr
#       August 20, 2009         bar     --delay
#       August 21, 2009         bar     --outdir
#       May 27, 2012            bar     doxygen namespace
#       November 14, 2012       bar     allow strftime directories
#       --eodstamps--
##      \file
#       \namespace              tzpython.urls2dir
#
#
#       Get a bunch of URLs to a unix-time-named directory.
#
#

import  os
import  time
import  urllib2



#
#       Module-level side effect (runs on import, too):
#       Build a urllib2 opener, empty its default header list, and install it
#       as the process-wide opener so later urllib2 requests made through the
#       default opener do not carry urllib2's own 'User-agent' header.
#
opener            = urllib2.build_opener()
opener.addheaders = []                  # get rid of 'User-agent' the only way that seems to work (yes, I tried lower-casing 'Agent')
urllib2.install_opener(opener)



def timed_dir(when = None, dir_name = None) :
    """
        Return the name of the directory that output for time 'when' goes to.

        when        Unix time in seconds. If None, the current time is used, truncated to an integer.
        dir_name    Base directory name or a strftime pattern. None is treated as the empty string.

        If dir_name contains a '%' it is taken to be a time.strftime() pattern and is
        expanded using the UTC (time.gmtime) form of 'when'. The expansion is the result.

        Otherwise the result is dir_name joined with "d" followed by the value of 'when'.
    """
    if  when is None :                      # identity test: 0 is a legitimate time and must not be replaced
        when        = int(time.time())

    dir_name        = dir_name or ""
    if  '%' in dir_name :
        return(time.strftime(dir_name, time.gmtime(when)))

    return(os.path.join(dir_name, "d" + str(when)))



def output_file_name(data_dir, fname_part, when, ext = None) :
    """
        Return the path of the output file for one fetched item.

        data_dir    Directory the file goes in.
        fname_part  Base of the file name.
        when        Time stamp appended to the base name after an underscore (converted with str()).
        ext         File name extension, including any leading dot.
                    None means ".txt".
                    The special value "ORIGINAL" means: use the extension that
                    os.path.splitext() finds on fname_part (possibly empty).

        The result is data_dir joined with fname_part + '_' + when + extension.
    """
    if  ext is None :
        ext  = ".txt"
    elif ext == "ORIGINAL" :
        ext  = os.path.splitext(fname_part)[1]

    return(os.path.join(data_dir, fname_part + '_' + str(when) + ext))





#
#
#
if __name__ == '__main__' :
    """

        Get some URLs to a directory named for the current Unix time.
        Put the time in the output file names, too.

    """


    import  re
    import  sys
    import  urllib2

    import  TZCommandLineAtFile
    import  tzlib
    import  url_getter


    del(sys.argv[0])
    TZCommandLineAtFile.expand_at_sign_command_line_files(sys.argv)

    when        = int(time.time())
    data_dir    = None
    ext         = None
    delay       = 0.0
    verbose     = 1                 # be compatible with pre-verbose version

    while True :
        oi  = tzlib.array_find(sys.argv, [ "--ext", '-e', '-x', ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        ext     = sys.argv.pop(oi)


    while True :
        oi  = tzlib.array_find(sys.argv, [ "--delay", '-d', ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        delay   = float(sys.argv.pop(oi))


    while True :
        oi  = tzlib.array_find(sys.argv, [ "--outdir", '-o', ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        data_dir    = sys.argv.pop(oi)

    while True :
        oi  = tzlib.array_find(sys.argv, [ "--quiet", '-q', ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        verbose    -= 1

    while True :
        oi  = tzlib.array_find(sys.argv, [ "--verbose", '-v', ] )
        if  oi < 0 :    break
        del sys.argv[oi]
        verbose    += 1



    data_dir    = timed_dir(when, data_dir)

    while   len(sys.argv) > 0 :


        url     = sys.argv.pop(0)

        fname   = url
        fname   = re.sub(r"^.+://",                                 "",  fname)
        fname   = re.sub(r"[\:\\\/\^\?\*\|\<\>\{\}\[\]\%\(\)\=\&]", "_", fname)

        req     = urllib2.Request(url)
#       req.add_header('User-Agent',       'Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.5) Gecko/20031007')
#       req.add_header('Accept',           'text/html,image/png,image/jpeg,image/gif,image/bmp,image/jpg')
#       req.add_header('Accept-Language',  'en-us,en')
#       req.add_header('Accept-Charset',   'ISO-8859-1,utf-8')

        req.add_header('User-Agent',        'Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.7.8) Gecko/20050511 Firefox/1.0.4')
        req.add_header('Accept',            'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5')
        req.add_header('Accept-Language',   'en-us,en;q=0.5')
#       req.add_header('Accept-Encoding',   'gzip,deflate')
        req.add_header('Accept-Charset',    'ISO-8859-1,utf-8;q=0.7,*;q=0.7')
#       req.add_header('Keep-Alive',        '300')
#       req.add_header('Connection',        'keep-alive')


        content = url_getter.url_open_read_with_timeout(req)

        if  content == None :

            if  verbose >= 0 :
                print "No results from", url
            pass

        else :

            fname   = output_file_name(data_dir, fname, when, ext)

            if  verbose > 0 :
                print fname, url

            if  True :
                if  not os.path.isdir(data_dir) :
                    os.makedirs(data_dir)

                fo      = open(fname, "wb")
                fo.write(content)
                fo.close()

            pass

        time.sleep(delay)

    pass

#
#
#
# eof
