#!/usr/bin/python

# GoogleSearch.py
#       --copyright--                   Copyright 2007 (C) Tranzoa, Co. All rights reserved.    Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality.
#       --url--                         http://www.tranzoa.net/tzpython/
#       --email--                       pycode is the name to send to. tranzoa.com is the place to send to.
#       --bodstamps--
#       December 17, 2003       bar
#       October 22, 2004        bar     allow timeouts and use url_getter
#       January 11, 2005        bar     page count
#                                       able to change the requested number of results
#       January 12, 2005        bar     allow the query to be uncleaned
#                                       command line options
#       January 13, 2005        bar     allow retries on None counts
#       February 12, 2005       bar     multi-word queries
#                                       --with_word
#                                       --quoted
#                                       --paired
#       March 3, 2005           bar     images
#       March 11, 2005          bar     --help
#       March 19, 2005          bar     fixed page count when image count is 1
#                                       ensure correct number_of_results
#       March 20, 2005          bar     test some things
#       April 12, 2005          bar     find_any_type_google_urls
#       November 18, 2007       bar     turn on doxygen
#       November 27, 2007       bar     insert boilerplate copyright
#       May 17, 2008            bar     email adr
#       May 9, 2009             bar     update the unused FF user agent
#                                       --lang
#                                       better page count regx
#       May 10, 2009            bar     --delay
#       May 16, 2009            bar     remove dupes from page count words
#       November 6, 2009        bar     combo
#       November 29, 2011       bar     pyflake cleanup
#       --eodstamps--
##      \file
#
#
#       Search Google for something.
#
#       Run this with a search string in the command line parameter. It will print up to 100 Google result URLs.
#
#

import  copy
import  random
import  re
import  time

import  tzlib
import  urllib
import  urllib2

import  url_getter


# Install, as urllib2's default opener, an opener whose default header list is empty,
# so that the only headers on a request are the ones added to it explicitly
# (see get_googlish_request below).
opener            = urllib2.build_opener()
opener.addheaders = []                  # get rid of 'User-agent' the only way that seems to work (yes, I tried lower-casing 'Agent')
urllib2.install_opener(opener)



def get_googlish_request(req, timeout = None, browser_like = False) :
    """
        Add this module's HTTP headers to a request and fetch it.

        req             A request object with an add_header(name, value) method
                        (callers in this file pass urllib2.Request instances).
        timeout         Handed unchanged to url_getter.url_open_read_with_timeout().
        browser_like    False (the default) sends the small 'TZGGBrowser' header set.
                        True sends the Firefox 3.0 style header set instead. That set
                        used to sit in an unreachable 'if False' branch.

        Returns whatever url_getter.url_open_read_with_timeout() returns.
        Callers in this file treat None as "no data".
    """

    if  browser_like :
        headers = (
                    ( 'User-Agent',       'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10 (.NET CLR 3.5.30729)' ),
                    ( 'Accept',           'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,image/jpeg,image/gif;q=0.2,*/*;q=0.1' ),
                    # These two were already disabled in the original browser header set:
                    # ( 'Accept-Language',  'en-us,en;q=0.5' ),
                    # ( 'Accept-Encoding',  'gzip,deflate' ),
                    ( 'Accept-Charset',   'ISO-8859-1,utf-8;q=0.7,*;q=0.7' ),
                    ( 'Keep-Alive',       '300' ),
                    ( 'Connection',       'keep-alive' ),
                  )
    else :
        headers = (
                    ( 'User-Agent',       'TZGGBrowser/00.01 Graph/01.00 Text/01.00 Gen/01.00' ),
                    ( 'Accept',           'text/html,image/png,image/jpeg,image/gif,image/bmp,image/jpg' ),
                    ( 'Accept-Language',  'en-us,en' ),
                    ( 'Accept-Charset',   'ISO-8859-1,utf-8' ),
                  )

    for name, value in headers :
        req.add_header(name, value)

    return(url_getter.url_open_read_with_timeout(req, timeout))




class a_google_html_querier :
    """
        Build Google web/image search requests and scrape the returned HTML.

        The object remembers the most recent results page (results_html) and the
        most recent things extracted from it (urls, image_info).

        Each image_info entry is an 8-item list:

            [ thumbnail URL, thumbnail width, thumbnail height, thumbnail data,
              image URL,     image width,     image height,     image data     ]

        The two data items are None until resolve_thumbnails() / resolve_images()
        fetch them.
    """

    # A result link:  <p class=g> ... <a href=URL>
    are         = re.compile(r"<p\s+class=g>(?:.*?)<a\s+href=\"?([^>\"]+?)\"?\s*>",                 re.DOTALL | re.IGNORECASE)
    # URLs whose tail makes them look like an HTML page or a site root.
    html_only   = re.compile(r"(?:\.html?|/|\.com|\.mil|\.edu|\.net|\.org|\.biz|\.tv|\.de|\.uk|\.fr)$",         re.IGNORECASE)

    # An image result. Groups: image URL, image height, image width, thumbnail path, thumbnail width, thumbnail height.
    iurlre      = re.compile(r"<a\s+href=/imgres\?imgurl=([^\&]+)\&imgrefurl=[^\&]+\&h=(\d+)\&w=(\d+)\&sz=.*?<img\s+src=(/images\?q=tbn\:[^ ]+)\s+width=(\d+)\s+height=(\d+)>.*?",    re.DOTALL | re.IGNORECASE)

    r1          = re.compile(r".*?<p\s+class=g>",                                                   re.DOTALL | re.IGNORECASE)     # not used by the methods of this class
    r2          = re.compile(r"<blockquote.*?</blockquote>",                                        re.DOTALL | re.IGNORECASE)     # indented sub-results
    r3          = re.compile(r"Result(\&nbsp;|\s+)Page:(\&nbsp;|\s+)<\/font><td>.*",                re.DOTALL | re.IGNORECASE)     # not used by the methods of this class

    rcchrs      = re.compile(r"[\"\:]",                                                             re.DOTALL | re.IGNORECASE)     # double quote and colon
    zchrs       = re.compile(r"[\'\`]",                                                             re.DOTALL | re.IGNORECASE)     # characters that are deleted outright
    schrs       = re.compile(r"[\!\@\#\$\%\^\&\*\(\)\-\_\+\=\\\|\{\}\[\]\;\'\,\.\<\>\?\/\`\~]",     re.DOTALL | re.IGNORECASE)     # characters that become a space

    # The "... of about <b>N</b> for <b>" hit count line.
    page_cnt    = re.compile(r"</b>\s*of\s+(?:about\s+)?<b>\s*([\d,]+)\s*</b>\s+(?:<b>[^<]+</b>\s+)?(?:pages\s+)?for\s+<b>",  re.IGNORECASE)


    def __init__(me, host = None) :
        """
            host is the search host name. None means "www.google.com".
        """
        if  host is None :
            host                = "www.google.com"

        me.host                 = host

        me.results_html         = ""                    # most recent results page
        me.urls                 = []                    # most recent URLs pulled from a results page
        me.image_info           = []                    # most recent image entries (layout in the class docstring)

        me.safe                 = False
        me.lang                 = ""
        me.delay                = 0.0
        me.prev_et              = tzlib.elapsed_time()  # time stamp of the previous request, used to pace queries

        me.number_of_results    = 100


    def set_safe_search(me, how = True) :
        """
            Turn safe search on or off (None means on). Returns the previous setting.
        """
        if  how is None :
            how     = True

        ov          = me.safe
        me.safe     = how

        return(ov)


    #   Language codes, as noted on May 9, 2009. The request sends them as "lr=lang_<code>".
    #
    #       af Afrikaans              ar Arabic                   hy Armenian       be Belarusian
    #       bg Bulgarian              ca Catalan                  zh-CN Chinese (Simplified)
    #       zh-TW Chinese (Traditional)                           hr Croatian       cs Czech
    #       da Danish                 nl Dutch                    en English        eo Esperanto
    #       et Estonian               tl Filipino                 fi Finnish        fr French
    #       de German                 el Greek                    iw Hebrew         hu Hungarian
    #       is Icelandic              id Indonesian               it Italian        ja Japanese
    #       ko Korean                 lv Latvian                  lt Lithuanian     no Norwegian
    #       fa Persian                pl Polish                   pt Portuguese     ro Romanian
    #       ru Russian                sr Serbian                  sk Slovak         sl Slovenian
    #       es Spanish                sw Swahili                  sv Swedish        th Thai
    #       tr Turkish                uk Ukrainian                vi Vietnamese

    def set_language(me, lang) :
        """
            Set the language code put after "lang_" in the request (None or "" means no restriction).
            Returns the previous setting.
        """
        ov          = me.lang
        me.lang     = lang or ""

        return(ov)


    def set_delay(me, delay = 0.0) :
        """
            Set the target number of seconds between requests (None means 0.0). Returns the previous setting.
        """
        ov          = me.delay
        me.delay    = delay or 0.0

        return(ov)



    def clean_query(me, q) :
        """
            Clean up every weird character except double quotes and colon. ( This organization needs to be re-thought !!!! )

            Control characters and the lower-case words and/or/not are removed,
            quote-ish characters are deleted, other punctuation becomes a space
            and the ends are stripped.
        """
        q   = re.sub(r"[\000-\037]",      "", q)
        q   = re.sub(r"\b(and|or|not)\b", "", q)

        #                                                                           latin 1 chars -> ascii equivalents????

        q   = a_google_html_querier.zchrs.sub("",  q)
        q   = a_google_html_querier.schrs.sub(" ", q)

        return(q.strip())


    def really_clean_query(me, q) :
        """
            clean_query() plus double quotes and colons turned in to spaces.
        """
        q   = me.clean_query(q)
        q   = a_google_html_querier.rcchrs.sub(" ", q)                              # space out double quotes and colon

        return(q.strip())


    def set_number_of_results(me, n = None) :
        """
            Set how many results to ask for. A false value (None, 0, "") leaves it alone.
            Returns the previous setting.
        """
        ov  = me.number_of_results

        if  n :
            me.number_of_results = int(n)

        return(ov)


    def _do_query(me, q, timeout = None, clean_the_query = True, qs = "search") :
        """
            Run one query and return the results HTML ("" if the query is empty or nothing came back).

            q                   The query string (None is treated as "").
            timeout             Passed to get_googlish_request().
            clean_the_query     Run q through clean_query() first (None means True).
            qs                  The path on the host: "search" or "images".

            Side effects: results_html and urls are reset, number_of_results is
            clamped to 1..100, and the call may sleep to honour the delay setting.
        """

        if  clean_the_query is None :
            clean_the_query  = True

        me.results_html = ""
        me.urls         = []

        if  q is None :
            q  = ""

        if  clean_the_query :
            q  = me.clean_query(q)

        if  len(q) :

            params = urllib.urlencode( {'q': q } )

            me.number_of_results = max(me.number_of_results,   1)                   # note that google no longer takes num=-1
            me.number_of_results = min(me.number_of_results, 100)

            safe                 = "&safe=off"
            if  me.safe :
                safe             = "&safe=on"
            lang                 = ""
            if  me.lang :
                lang             = "&lr=lang_%s" % me.lang
            req                  = urllib2.Request("http://%s/%s?ie=UTF-8%s&oe=UTF-8&num=%u%s&%s" % ( me.host, qs, safe, me.number_of_results, lang, params ) )

            # Pace the requests: aim for a randomized delay around me.delay, less the time already gone by since the previous request.
            d                    = random.gauss(me.delay, me.delay / 4.0) - (tzlib.elapsed_time() - me.prev_et)
            if  d > 0.0 :
                time.sleep(d)
            me.prev_et           = tzlib.elapsed_time()

            me.results_html      = get_googlish_request(req, timeout)
            if  me.results_html is None :
                me.results_html  = ""

        return(copy.copy(me.results_html))



    def do_query(me, q, timeout = None, clean_the_query = True) :
        """
            Run a web search. See _do_query() for the parameters and the return value.
        """
        return(me._do_query(q, timeout, clean_the_query, "search"))


    def do_image_query(me, q, timeout = None, clean_the_query = True) :
        """
            Run an image search. See _do_query() for the parameters and the return value.
        """
        return(me._do_query(q, timeout, clean_the_query, "images"))


    def find_any_type_google_urls(me, results_html = None) :
        """
            Return the list of result URLs in the results page, whatever they point to.

            results_html, if given, replaces the remembered results page.
            A separate copy of the list is kept in me.urls.
        """

        if  results_html is not None :
            me.results_html =  results_html

        retval = []

        if  me.results_html is not None :
            stripped = a_google_html_querier.r2.sub("", me.results_html)            # get rid of the indented, sub-items
            retval   = a_google_html_querier.are.findall(stripped)                  # extract out the results' URLs

        me.urls    = list(retval)

        return(retval)


    def find_google_urls(me, results_html = None) :
        """
            Like find_any_type_google_urls(), but keep only URLs whose tail looks like an HTML page or a site root.
        """

        if  results_html is not None :
            me.results_html =  results_html

        # toss all but obvious HTM/HTML documents
        retval  = [ u for u in me.find_any_type_google_urls() if a_google_html_querier.html_only.search(u) ]

        me.urls = list(retval)

        return(retval)


    def page_count(me, results_html = None) :
        """
            Return the hit count shown on the results page as an integer, or None if no count is found.

            results_html, if given, replaces the remembered results page.
        """

        if  results_html is not None :
            me.results_html =  results_html

        if  me.results_html is None :
            return(None)

        g   = a_google_html_querier.page_cnt.search(me.results_html)
        if  not g :
            return(None)

        ns  = g.group(1).replace(",", "")
        if  not ns :                                                                # the match was nothing but commas
            return(None)

        return(int(ns))                                                             # Python 2 int() moves up to long by itself for big counts



    def find_image_info(me, results_html = None) :
        """
            Return the image entries found in an image-search results page.

            results_html, if given, replaces the remembered results page.
            Each entry has the 8-item layout given in the class docstring, with
            both data items None. A deep copy of the list is kept in me.image_info.
        """

        if  results_html is not None :
            me.results_html  =  results_html

        retval               = []

        if  me.results_html is not None :

            # Each hit is ( image URL, image height, image width, thumbnail path, thumbnail width, thumbnail height ).
            for ii in a_google_html_querier.iurlre.findall(me.results_html) :
                url          = "http://" + "images.google.com" + ii[3]
                iurl         = ii[0]
                #                thumbnail:  URL  width       height      data  real image:  URL   width       height      data
                retval.append( [             url, int(ii[4]), int(ii[5]), None,              iurl, int(ii[2]), int(ii[1]), None ] )

        me.image_info        = copy.deepcopy(retval)

        return(retval)


    def _resolve_images_info(me, image_info, bi, timeout = None) :
        """
            Fetch the missing data items for one half of every image entry.

            image_info      Entries to work on. None means the remembered me.image_info.
            bi              Index of the URL to fetch: 0 for the thumbnail, 4 for the real image.
                            The fetched data goes in item bi + 3.
            timeout         Passed to get_googlish_request() (None, the default, is what used to be passed implicitly).

            Returns a new list of entries. A deep copy is kept in me.image_info.
            Entries whose data item is already set are not fetched again.
        """
        if  image_info is not None :
            me.image_info    = image_info

        retval               = []

        if  me.image_info is not None :

            for ii in me.image_info :
                ii              = list(ii)                                          # don't change the caller's entry
                if  ii[bi + 3] is None :
                    ii[bi + 3]  =  get_googlish_request(urllib2.Request(ii[bi]), timeout)

                retval.append(ii)

        me.image_info        = copy.deepcopy(retval)

        return(retval)


    def resolve_thumbnails(me, image_info = None, timeout = None) :
        """
            Fetch the thumbnail data for every entry that lacks it. See _resolve_images_info().
        """
        return(me._resolve_images_info(image_info, 0, timeout))

    def resolve_images(me, image_info = None, timeout = None) :
        """
            Fetch the real image data for every entry that lacks it. See _resolve_images_info().
        """
        return(me._resolve_images_info(image_info, 4, timeout))




if  __name__ == '__main__' :
    #
    #   Command line driver: parse the switches, run each query (or read saved
    #   results pages) and print the URLs, hit counts and/or image details found.
    #
    import  sys

    if  len(sys.argv) < 2 :

        print("Tell me a search string.")

    else :

        import  os.path

        import  TZCommandLineAtFile

        program_name      = sys.argv.pop(0)
        TZCommandLineAtFile.expand_at_sign_command_line_files(sys.argv)         # helper lives elsewhere; presumably expands @file arguments in place


        def _take_flag(names) :
            """ Remove every occurrence of a switch from sys.argv. Return True if there was at least one. """
            found   = False
            while True :
                oi  = tzlib.array_find(sys.argv, names)
                if  oi < 0 :
                    return(found)
                del sys.argv[oi]
                found = True


        def _pop_value(names) :
            """ Remove one occurrence of a switch and the argument after it. Return the argument, or None if the switch is absent. """
            oi      = tzlib.array_find(sys.argv, names)
            if  oi < 0 :
                return(None)
            name    = sys.argv.pop(oi)
            if  oi >= len(sys.argv) :
                sys.exit("Missing value after " + name)                         # used to be a bare IndexError
            return(sys.argv.pop(oi))


        def _take_value(names, convert = None) :
            """
                Remove every occurrence of a switch and its argument from sys.argv.
                Return the (converted) value from the last occurrence handled, or None if the switch is absent.
            """
            value   = None
            while True :
                raw = _pop_value(names)
                if  raw is None :
                    return(value)
                if  convert is None :
                    value = raw
                else :
                    value = convert(raw)
                pass


        host              = 'www.google.com'
        # host            = 'localhost'


        if  any(tzlib.array_find(sys.argv, opt) >= 0 for opt in ( "--help", "-h", "-?" )) :
            print("""
GoogleSearch        Exercise the logic.

--no_clean              Don't clean up the query.
--all_urls              Don't filter out all but HTML urls.
--show_count            Print the page count.
--no_show_url           Don't print the URLs found.
--show_image_info       Print image information.
--get_thumb_size        Get the thumbnail image sizes.
--get_image_size        Get the target image sizes.
--words                 Search for each individual word         (implies --no_clean)
--quoted                Quote the query                         (implies --no_clean)
--paired                Search words are given in pairs         (i.e.    --combo 2 )
--combo         count   Search words are given in count-groups  (implies --no_clean)
--results_count cnt     Set the maximum results value   (1..100)
--safe                  Set 'safe' search mode.
--lang          code    Restrict results to one language (e.g. en, de, zh-CN).
--out_file_name name    Print the output to the given file.
--timeout       seconds Set the web-bit timeout.
--retries       cnt     Set the number of retries.
--delay         n.n     Minimum delay in seconds between each query.
--with_word     word    Set a word to include in every search.
--html_file     name    Use the given file as Google-results.

""")
            sys.exit(254)


        #
        #   The switches are pulled out in the same order as before, because
        #   later ones override earlier ones (--combo over --paired,
        #   --results_count over --no_show_url).
        #
        cleanq            = None                                                # None lets the querier use its default (clean the query)
        if  _take_flag("--no_clean") :
            cleanq        = False

        all_urls          = _take_flag("--all_urls")
        show_page_count   = _take_flag("--show_count")

        show_urls         = True
        result_cnt        = None
        if  _take_flag("--no_show_url") :
            show_urls     = False
            result_cnt    = 1                                                   # presumably because the URLs are not wanted, ask for just one

        show_image_info   = _take_flag("--show_image_info")
        get_thumb_size    = _take_flag("--get_thumb_size")
        get_image_size    = _take_flag("--get_image_size")

        words             = _take_flag("--words")
        quoted            = _take_flag("--quoted")
        if  words or quoted :
            cleanq        = False

        combo             = 1
        if  _take_flag("--paired") :
            combo         = 2
            cleanq        = False

        n                 = _take_value("--combo", int)
        if  n is not None :
            combo         = max(n, 1)                                           # 0 used to blow up the '%' test below
            cleanq        = False

        n                 = _take_value("--results_count", int)
        if  n is not None :
            result_cnt    = min(max(n, 1), 100)                                 # note that google no longer takes num=-1

        safe              = _take_flag("--safe")
        lang              = _take_value( [ "--language", "--lang", "-l", ] ) or ""
        out_file_name     = _take_value("--out_file_name")
        timeout           = _take_value("--timeout", int)

        retries           = 0
        n                 = _take_value("--retries", int)
        if  n is not None :
            retries       = n

        delay             = _take_value("--delay", float)

        with_word         = _take_value("--with_word")
        if  with_word is not None :
            if  (" " in with_word) and (with_word[0] != "\"") :
                with_word = "\"" + with_word + "\""                             # a phrase: quote it
            with_word     = " +" + with_word



        googler = a_google_html_querier(host)

        googler.set_number_of_results(result_cnt)
        googler.set_safe_search(safe)
        googler.set_language(lang)
        googler.set_delay(delay)



        def _do_it(q, html, width = 20) :
            """
                Pull the wanted information out of a results page and print it.

                q is the label printed next to the page count, html is the results
                page and width is the label column width so far (None means 20).
                Returns the possibly-widened column width.
            """

            if  width is None :
                width = 20

            if  all_urls :
                googler.find_any_type_google_urls(html)
            else :
                googler.find_google_urls(html)

            if  show_urls :
                print("\n".join(googler.urls))

            if  show_page_count :
                cnt     = googler.page_count()
                if  cnt is None :
                    cnt = 0

                width   = max(width, len(q))

                print("%-*s %15lu" % ( width, q, cnt ))

                sys.stdout.flush()


            if  show_image_info :

                image_info = googler.find_image_info(html)

                if  get_thumb_size  :   image_info = googler.resolve_thumbnails(image_info)
                if  get_image_size  :   image_info = googler.resolve_images(image_info)

                for ii in image_info :
                    tlen                            = 0
                    if  ii[3] is not None : tlen    = len(ii[3])
                    ilen                            = 0
                    if  ii[7] is not None : ilen    = len(ii[7])
                    print("Bytes=%06u/%08u size=%05ux%05u tsize=%03ux%03u" % ( ilen, tlen, ii[5], ii[6], ii[1], ii[2] ))
                    print("   url=%s" % ( ii[4] ))
                    print("  turl=%s" % ( ii[0] ))
                pass

            return(width)



        def _hit_it(q) :
            """
                Run the query, retrying while no page count can be found in the results.
                The last results page is also written to out_file_name, if that is set.
                Returns the last results page (None if no attempt was made).
            """

            html      = None

            for rcnt in range(0, retries + 1) :
                if  show_image_info :
                    html  = googler.do_image_query(q, timeout, cleanq)
                else :
                    html  = googler.do_query(      q, timeout, cleanq)

                if  googler.page_count() is not None :
                    break
                pass

            if  html and (out_file_name is not None) :
                with open(out_file_name, "wb") as fo :
                    fo.write(html)

            return(html)



        width = None


        # Saved results pages given with --html_file are handled one by one, before any real queries.
        while True :
            fn      = _pop_value("--html_file")
            if  fn is None :
                break
            with open(fn) as fi :
                html = fi.read()
            width   = _do_it(fn, html, width)


        if  len(sys.argv) % combo :
            print("Uneven number of queries: %d %% %d" % ( len(sys.argv), combo ))
            sys.exit(112)


        if  (combo < 2) and show_page_count :
            seen    = set()
            for i in range(len(sys.argv) - 1, -1, -1) :                         # backwards, so deleting is safe and the last copy of a dupe stays
                if  sys.argv[i] in seen :
                    del(sys.argv[i])                                            # remove dupes
                else :
                    seen.add(sys.argv[i])
                pass
            pass

        qc      = []                                                            # the words collected so far for the current combo group
        while len(sys.argv) > 0 :

            q   = sys.argv.pop(0)

            if  q[0:1] == '-' :
                print("Did you mean for " + q + " to be a command line parameter?")

            if  (not words) and (not quoted) and (combo < 2) and os.path.isfile(q) :

                with open(q) as fi :
                    html = fi.read()                                            # read a test file or something

            else :
                html    = None

                if  words :
                    q   = q.strip()

                    # NOTE(review): this first test only passes when q has no spaces, so the
                    # split() never splits anything and the branch gives the same "+q" as the
                    # final else. It may have been meant to be ">= 0" (split a multi-word
                    # argument in to +word +word). Behavior is kept as it was - confirm.
                    if  (not quoted) and (q.find(" ") < 0) :
                        q = "+" + " +".join(q.split())
                    elif (q.find(" ") >= 0) and (q[0] != "\"") :
                        q = "+\"" + q + "\""
                    else :
                        q = "+" + q
                    pass


                qc.append(q)
                if  len(qc) >= combo :
                    if  with_word :
                        qc.append(with_word)

                    q       = " ".join(qc)
                    if  quoted :
                        q   = '+"' + q + '"'

                    html    = _hit_it(q)

                    qc      = []

                pass

            if  q and html :
                width = _do_it(q, html, width)
            pass

        pass

    pass


#
#
#


# The names this module means to offer to its users.
# NOTE(review): "from module import *" only looks at the lower-case name __all__,
# so, spelled like this, the list is an ordinary module attribute and does not
# limit what a star-import brings in. Confirm whether lower-case __all__ was
# intended before changing it - doing so would narrow what existing star-imports see.
__ALL__ = [
            'get_googlish_request',

            'a_google_html_querier',

          ]


#
#
#
# eof

