#!/usr/bin/python

# net_lyrics.py
#       --copyright--                   Copyright 2007 (C) Tranzoa, Co. All rights reserved.    Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality.
#       --url--                         http://www.tranzoa.net/tzpython/
#       --email--                       pycode is the name to send to. tranzoa.com is the place to send to.
#       --bodstamps--
#       March 1, 2005           bar
#       March 2, 2005           bar     typo
#       March 4, 2005           bar     replace 0x92 with single quotes
#                                       put \n at the end of the lyrics
#                                       lyricsdownload.com
#       March 5, 2005           bar     rename a routine
#                                       new sites
#                                       more counts
#                                       write out files we are unable to parse
#       March 10, 2005          bar     lyred
#       March 12, 2005          bar     change the meaning of .host to be the snagged host
#                                       file io
#                                       <PRE stuff
#       March 13, 2005          bar     more things to ignore
#       March 23, 2005          bar     filter lyrics that match certain things
#       November 18, 2007       bar     turn on doxygen
#       November 27, 2007       bar     insert boilerplate copyright
#       May 17, 2008            bar     email adr
#       November 29, 2011       bar     pyflake cleanup
#       May 27, 2012            bar     doxygen namespace
#       --eodstamps--
##      \file
#       \namespace              tzpython.net_lyrics
#
#
#       Get the lyrics for a song from the net.
#
#       TODO:
#
#           two different ways to get the lyrics/artist/title must be in arrays, not in separate items
#
#           musicsonglyrics - get the artist and album better
#
#

import  os
import  random
import  re
import  string
import  sys
import  time
import  urllib2
import  urlparse

import  GoogleSearch
import  tzlib
import  url_getter


#
#   Names this module means to offer to other modules.
#
#   NOTE(review): Python only honors the lower-case name "__all__" for "from module import *".
#                 As spelled here the list is an ordinary variable and does not limit what is exported.
#                 Renaming it would change what star-imports see, so confirm every listed name exists first.
#
__ALL__ = [
            'a_lyrics_getter',
            'a_lyrics_from_file',
            'get_lyrics_from_htm', 'could_be_lyrics',
            'ignore_these_lyrics',
            'all_hosts_counts', 'known_hosts_counts', 'found_hosts_counts',
          ]


#
#   Parsers for lyrics files: a run of "Name:value" header lines, one line that is skipped, then the lyrics (everything left).
#
#       file_re         Artist, Album, Title, Host, Match_host, Reported_artist, Reported_album, Reported_title, When, lyrics
#       file_01_re      as file_re,    but with no Match_host line          (presumably an older file layout)
#       file_00_re      as file_01_re, but with no Reported_album line      (presumably an older layout still)
#
#   Header names match in any case. The lyrics group spans newlines.
#
file_re         =   re.compile(r"Artist:([^\n]*)\nAlbum:([^\n]*)\nTitle:([^\n]*)\nHost:([^\n]*)\nMatch_host:([^\n]*)\nReported_artist:([^\n]*)\nReported_album:([^\n]*)\nReported_title:([^\n]*)\nWhen:([^\n]*)\n[^\n]*\n(.+)",     re.IGNORECASE | re.DOTALL)
file_01_re      =   re.compile(r"Artist:([^\n]*)\nAlbum:([^\n]*)\nTitle:([^\n]*)\nHost:([^\n]*)\nReported_artist:([^\n]*)\nReported_album:([^\n]*)\nReported_title:([^\n]*)\nWhen:([^\n]*)\n[^\n]*\n(.+)",                          re.IGNORECASE | re.DOTALL)
file_00_re      =   re.compile(r"Artist:([^\n]*)\nAlbum:([^\n]*)\nTitle:([^\n]*)\nHost:([^\n]*)\nReported_artist:([^\n]*)\nReported_title:([^\n]*)\nWhen:([^\n]*)\n[^\n]*\n(.+)",                                                   re.IGNORECASE | re.DOTALL)


#
#   Text that is, apart from surrounding white space, a single <PRE ...> ... </PRE> element. Group 1 is the trimmed inside.
#   NOTE(review): "[^>]+" needs at least one character after "<PRE", so a bare "<PRE>" does not match - confirm that is wanted.
#
pre_re          =   re.compile(r"\s*<PRE[^>]+>\s*(.+?)\s*</PRE\s*>\s*$", re.IGNORECASE | re.DOTALL)

#
#   Opening, closing, and self-closing <br>, <p>, and <i> tags (any case, no attributes).
#
de_br_re        =   re.compile(r"</?br\s*/?>",  re.IGNORECASE)
de_p_re         =   re.compile(r"</?p\s*/?>",   re.IGNORECASE)
de_i_re         =   re.compile(r"</?i\s*/?>",   re.IGNORECASE)

#
#   A run of two or more line feeds, and every individual line (group 1 is the line's text).
#
de_lb_re        =   re.compile(r"\n{2,}",       re.IGNORECASE)
strip_line_re   =   re.compile(r"^(.*)$",       re.MULTILINE)


#
#   Build a urllib2 opener, empty its default header list, and install it as urllib2's opener.
#   Done at import time, so it affects every later urllib2 use in the process.
#
opener              = urllib2.build_opener()
opener.addheaders   = []                                                    # get rid of 'User-agent' the only way that seems to work (yes, I tried lower-casing 'Agent')
urllib2.install_opener(opener)



#
#   Three separate, initially empty tallies. Nothing in this part of the file fills them in.
#   Presumably they are keyed by host name - verify against the code that updates them.
#
all_hosts_counts    = dict()
known_hosts_counts  = dict()
found_hosts_counts  = dict()



#
#   Table of per-site screen-scraping rules.
#
#   Every entry is a list of exactly 6 items (the loop right after this table exits the program on any other length):
#
#       [0]     site name string (a host name, or a host name plus a path)
#       [1]     an integer (-1, 0, or 1 in this table). Its meaning is not evident from the table - TODO: document it.
#       [2]     regex source: the part of the page of interest     (labelled "all of it" in the dubba.com entry)
#       [3]     regex source: the song title                       (per the dubba.com labels)
#       [4]     regex source: the artist                           (per the dubba.com labels)
#       [5]     regex source: the lyrics                           (per the dubba.com labels)
#
#   Items [2] through [5] are replaced by compiled regexes (DOTALL and IGNORECASE) by the loop right after this table.
#
#   A site may be listed more than once, each time with different regexes.
#
lyrics_hosts      = [
                        [ 'lyricsfreak.com',
                            -1,
                            r"<td\s+colspan=\"2\"\s+style=\"BACKGROUND-COLOR:\s+#CAE7FF;\s+BORDER-LEFT:\s+(.*)",
                            r"class=\"blue\s+none\">(.*?)\s+Lyrics</a></td>",
                            r"class=\"none\">([^<]+)</a>\s+<font\s+color=\"#70B9EF\">\&#8250;</font>\s+<a\s+href=\"[^\"]+\"\s+class=\"blue\s+none\">",
                            r"PADDING-BOTTOM:\s+50px;\s+PADDING-TOP:\s+30px;\s+COLOR:\s+#525252;\">(.*?)</div>",
                        ],



                        [ 'lyrics007.com',
                            -1,
                            r"document\.write\(\'<a\s+href=\"/other/rank\.php\?id=[^\"]+\">Rank</a>\'\)\s*</script>\s*<strong>(.*)",
                            r"\s+-\s+(.*?)Song\s+Lyrics</strong><br>",
                            r"<strong>(.*?)\s+-\s+.*?Lyrics</strong>",
                            r"Lyrics</strong><br><br>(.*?)<br><br>The\s+hottest\s+songs",
                        ],



                        [ 'lyricsdownload.com',
                            -1,
                            r"jpg\"\s+height=\"23\">(.*)width=\"469\"\s+id=\"table3\">",
                            r"<font\s+face=\"Verdana\"\s+size=\"2\"\s+color=\"\#CBCBCB\">\s*[^<]+</font></a>\s*\-\s+(.*?)\s+Lyrics\s*</font></b>",
                            r"<font\s+face=\"Verdana\"\s+size=\"2\"\s+color=\"\#CBCBCB\">\s*([^<]+)</font></a>",
                            r"</a> </center><br>(.*?)(?:<a\s+href=\"|<br><br><br>\s*<table\b)",
                        ],



                        [ 'lyricsxp.com',
                            -1,
                            r"(</table><h1\s+title=\".*?)<p><strong>",
                            r"<h1\s+title=\"[^\"]+\"><B>(.*?)\s*Lyrics</b></h1>",
                            r"<B>Artist:\s+</B><b><a\s+href=\"[^\"]+\"\s+title=\"[^\"]+\"\s*>([^<]+)</a></b></h1>",
                            r"</script><br><br>.*?<BR><BR>(.*?)<a\s+href=",
                        ],

                        [ 'lyricsxp.com',
                            -1,
                            r"(</table><h1\s+title=\".*?)<p><strong>",
                            r"<h1\s+title=\"[^\"]+\"><B>(.*?)\s*Lyrics</b></h1>",
                            r"<B>Artist:\s+</B><b><a\s+href=\"[^\"]+\"\s+title=\"[^\"]+\"\s*>([^<]+)</a></b></h1>",
                            r"</script><br><br>(.*?)<a\s+href=",
                        ],



                        [ 'dubba.com' ,
                            -1,                                                                          # collapse \n+ to \n ?
                            r"bordercolorlight=\"\#FFFFCC\"(.*?)</font></b>",                               # all of it
                            r"<font\s+face=\"Arial\"\s+size=\"6\">([^<]+)<br>\s*</font>",                   # title
                            r"<font\s+face=\"Arial\"\s+size=\"5\">~([^<]+)</font>",                         # artist
                            r"<font\s+size=\"2\"\s+face=\"Arial\"><br>\s*<br>\s*(.*)",                      # lyrics
                        ],



                        [ 'lyrics.ly',
                            -1,
                            r"<table\s+width=600\s+cellpadding=0\s+cellspacing=0\s+align=center><tr><td\s+valign=top>(.*?)</div></td></tr></table>",
                            r"</td></tr><tr><td\s+colspan=3\s+bgcolor=C0C0C0\s+align=center><b><span\s+class=medium>([^<]+)</span>",
                            r"</span>\s+Lyrics</b><br>\(by\s+<span\s+class=medium>([^<]+)</span>\)</td></tr>",
                            r"else\s+\{this\.style\.zoom=1;zoomed=0;\}\">(.*)",
                        ],



                        [ 'elyrics.net',                                                                    # also successfully matches nomorelyrics.net !
                            -1,
                            r"</td><td\s+valign=top\s+width=45\%\s+style=\"font-size\:11px;\">(.*?)</td></tr></table><p>",
                            r"<input\s+type=\"hidden\"\s+name=\"song\"\s+value=\"([^\"]+)\">",
                            r"<input\s+type=\"hidden\"\s+name=\"band\"\s+value=\"([^\"]+)\">",
                            r"<!--.*?-->(.*?)<!--.*?-->",
                        ],

                        [ 'nomorelyrics.net',                                                               # also successfully matches elyrics.net !
                            -1,
                            r"</td><td\s+valign=top\s+width=45\%\s+style=\"font-size\:11px;\">(.*?)</td></tr></table><p>",
                            r"<input\s+type=\"hidden\"\s+name=\"song\"\s+value=\"([^\"]+)\">",
                            r"<input\s+type=\"hidden\"\s+name=\"band\"\s+value=\"([^\"]+)\">",
                            r"<!--.*?-->(.*?)<!--.*?-->",
                        ],

                        [ 'nomorelyrics.net',
                            -1,
                            r"<td\s+width=\"700\"\s+valign=\"top\"><b><font\s+class=\"title2\">(.+?)<font\s+class=\"storytitle4\">",
                            r"^.+?---([^<]+)</font>",
                            r"^(.+?)---",
                            r"</font></b>(.+)",
                        ],



                        [ 'allthelyrics.com',
                            -1,
                            r"<table\s+border=\"0\"\s+cellspacing=\"0\"\s+cellpadding=\"5\"\s+width=\"100\%\">(.*)</font></ul>",
                            r"<td><font\s+face=\"Arial\"><h1>(.*?)\s+lyrics</h1></font></td>",
                            r"<a\s+class=\"breakout\"\s+href=\"[^\"]+\"\s+title=\"[^\"]+\">(.*?)\s+lyrics</a>",
                            r"<ul><br><font\s+class=\"lyricsbody\">(.*)",
                        ],



                        [ 'reallyrics.com',
                            -1,
                            r"<table\s+border=\"\s*1\"\s+cellpadding=\"0\"\s+cellspacing=\"0\"\s+width=\"81\%\">(.*)",
                            r"<td\s+class\s*=\s*HeadTitle>Song</td>\s*<td\s+class\s*=\s*HeadMatter>([^<]+)</td>",
                            r"<td\s+class\s*=\s*HeadTitle>Artist</td>\s*<td\s+class\s*=\s*HeadMatter>([^<]+)</td>",
                            r"<td\s+class\s*=\s*TDSONG>(.*?)</td>\s*</tr>\s*</table>",
                        ],



                        [ 'lyricskeeper.com',
                            -1,
                            r"<td\s+width=\"100\%\"\s+align=\"left\"\s+valign=\"top\"\s+class=\"main\">(.*)<center><a\s+href=\"",
                            r"<br/><h4>Lyrics\s+-\s+([^<]+)</h4><br/><br/>",
                            r"<br/><h4>Artist\s+-\s+([^<]+)</h4>",
                            r"<font\s+class=\"main-text\">(.*)</font>",
                        ],



                        [ 'lyricsondemand.com',
                            -1,
                            r"(class=\"NoUnder\".*)<td\s+valign=top\s+align=center\s+width=268>",
                            r"class=\"NoUnderPlain\"\s+onMouseOver=\"window\.status=\' \';\s*return\s+true\"\s+onMouseOut=\"window\.status=\' \';\s*return\s+true\"\s*>(.*?)\s*Lyrics</a><br></font>",
                            r"class=\"NoUnder\"\s+onMouseOver=\"window\.status=\' \';\s*return\s+true\"\s+onMouseOut=\"window\.status=\' \';\s*return\s+true\"\s*>(.*?)\s*Lyrics</a></b><br>",
                            r"<p><font\s+size=\"2\"\s+face=\"Verdana\">(.*?)</font>",
                        ],

                        [ 'lyricsondemand.com',
                            1,
                            r"(class=\"NoUnder\".*)<td\s+valign=top\s+align=center\s+width=268>",
                            r"class=\"NoUnderPlain\"\s+onMouseOver=\"window\.status=\' \';\s*return\s+true\"\s+onMouseOut=\"window\.status=\' \';\s*return\s+true\"\s*>(.*?)\s*Lyrics</a><br></font>",
                            r"class=\"NoUnder\"\s+onMouseOver=\"window\.status=\' \';\s*return\s+true\"\s+onMouseOut=\"window\.status=\' \';\s*return\s+true\"\s*>(.*?)\s*Lyrics</a></b><br>",
                            r"<pre><font\s+size=\"2\"\s+face=\"Verdana\">(.*?)</pre>",
                        ],



                        [ 'lyricscafe.com',
                            -1,
                            r"<!--\s+startprint\s+-->(.*)<!--\s+stopprint\s+-->",
                            r"<b>\s*(.*?)\s*::.*?</b>",
                            r"<b>.*?::\s*(.*?)\s*</b>",
                            r"class=\"NormalText\">.*?<br>\s+<br>\s*(.+?)\s*</td>",
                        ],

                        [ 'lyricscafe.com',
                            -1,
                            r"<!--\s+startprint\s+-->(.*)<!--\s+stopprint\s+-->",
                            r"<b>\s*(.*?)\s*::.*?</b>",
                            r"<b>.*?::\s*(.*?)\s*</b>",
                            r"class=\"NormalText\">(.+?)<br><br>\s+</td>",
                        ],

                        [ 'lyricscafe.com',
                            -1,
                            r"<!--\s+startprint\s+-->(.*?)<!--\s+stopprint\s+-->",
                            r"class=\"Red\">\s*([^<]+)<br>",
                            r"<p\s+align=\"center\"><B>\s*(.*?)\s*Lyrics</B><BR>",
                            r"class=\"Red\">.*?<br>\s*<br>(.+)",
                        ],



                        [ 'arelyrics.com',
                            -1,
                            r"<td\s+align=\"right\">Artist:</td>(.+?)</td></tr></table>",
                            r"<td\s+align=\"right\">Title:</td>\s*<td>([^<]+)</td>",
                            r"<td><a\s+href=\"/artist/[^\"]+\">(.+?)\s*Lyrics</a></td>",
                            r"<hr\s+color[^>]+>(.+?)</span>",
                        ],


                        [ 'lyred.com',
                            -1,
                            r"<DIV\s+CLASS=\"ramik\">(.+)<P\s+class=endvypisbuy\s+align=\"center\"><script",
                            r"<A\s+STYLE=\"font-weight\s+:\s+normal;\s+font-style:\s+italic;\s*\"\s+HREF=\"[^\"]+\">\s*(.+?)\s+lyrics</A>",
                            r"<H1\s+class=\"velkost13\"><A\s+HREF=\"[^\"]+\">(.+?)\s*<b>",
                            r"</DIV></DIV><BR><P>(.+)",
                        ],


                        #
                        #
                        #
                        #   Why not insert new ones here? Less likely to dupe the zero [1] value from the entry below.
                        #
                        #
                        #


                        [ 'tonsoflyrics.com',
                            0,
                            r"<div\s+class=top_body>(.+?)</table>",
                            r".+Lyrics\">(.+?)\s*Lyrics</a>\s+</div>",
                            r".+class=hm\s+href=\"[^\"]+\"\s+title=\"[^\"]+\"\s*>(.+?)\s*Lyrics</a>.*?class=hm\s+.+?Lyrics\">[^<]+</a>\s+</div>",
                            r"<pre\s+class=lyrics>(.*)</pre>",
                        ],



#                       [ 'www.geocities.com/bestlyr_x/lyrics/',
#                           -1,
#                           r"<p><h1><b>Lyr<span>\.</span><big>X</big></h1></b></p>(.*)</nobr></font>",
#                           r"<p><h2><u><b>([^<]+)</b></u></h2></p>",
#                           r"<h3>([^<]+)<br></h3>",
#                           r"<font\s+face=\"[^>]+>(.*)",
#                       ],

                        [ 'www.geocities.com/bestlyr_x/lyrics/',
                            -1,
                            r"<!--\s+preceding\s+code\s+added\s+by\s+server\.\s+PLEASE\s+REMOVE\s+-->\s+<nobr>\s+.*?(<p><h2><u><b>[^<]+</b></u></h2></p>\s+<h3>[^<]+<br></h3>\s+<font\s+face=\"orchid\",\"Foxglove\",\"Copperplate\s+Gothic\s+Bold\",\"Times\s+New\s+Roman\">.*)</nobr></font>",
                            r"<p><h2><u><b>([^<]+)</b></u></h2></p>",
                            r"<h3>([^<]+)<br></h3>",
                            r"<font\s+face=\"[^>]+>(.*)",
                        ],



                        [ 'musicsonglyrics.com',
                            -1,
                            r"<td\s+width=\'100\%\'\s+bgcolor=\'#6699FF\'><font\s+color=\'#FFFFFF\'>(.*)",
                            r".*?\s+lyrics\s+::\s+(.*?)\s+lyrics</font></td>",
                            r"(.*?)\s+lyrics\s+::\s+[^<]+</font></td>",
                            r"</font></td></tr><tr><td\s+width=\'100\%\'>(.*?)</td></tr></table>",
                        ],

                        [ 'musicsonglyrics.com',
                            -1,
                            r"</td></tr></table></center></td>(.*?)</td></tr></table></div></td>",
                            r"<font\s+color=\s*[\'\"][^\'\"']+[\'\"]\s*>([^<>]+?)\s+lyrics</font>",             # gets both artist and song
                            r"<font\s+color=\s*[\'\"][^\'\"']+[\'\"]\s*>([^<>]+?)\s+lyrics</font>",             # gets both artist and song
                            r"<p\s+align=[\'\"]justify[\'\"]\s*>(.*)",
                        ],

                        [ 'musicsonglyrics.com',
                            -1,
                            r"</td></tr><center><tr><td>(.*?)</td></tr></table></div></td>",
                            r"<h5>\s*.+?\s+-\s+(.+)\s*lyrics</h5>",
                            r"<h5>\s*(.+?)\s+-\s+.+\s*lyrics</h5>",
                            r"<p\s+align=[\'\"]justify[\'\"]\s*>(.*)",
                        ],

                        [ 'musicsonglyrics.com',
                            -1,
                            r"</td></tr></table></td></tr></table></div></td></tr>(.*)</span></td></tr></table></center></div></td></tr>",
                            r"<b>([^<>]+?)\s+lyrics</b>",                                                       # gets both artist and song
                            r"<b>([^<>]*?)\s+lyrics</b>",                                                       # gets both artist and song
                            r"<span\s+style=\'font-size:\s+9pt\'>(.*)",
                        ],
                    ]







#
#   Check the shape of every lyrics_hosts entry, then swap its regex source strings (items 2 and up) for compiled regexes.
#
#   A comma left out between two adjacent string literals makes Python join them into one string,
#   so the entry comes up short of 6 items. That is what the message is about.
#
for h in lyrics_hosts :
    if  len(h) != 6 :
        print("Put the commas in after the regxes!")
        sys.exit(101)

    #   slice assignment changes the entry in place, so lyrics_hosts still holds the same list objects
    h[2:] = [ re.compile(rx, re.DOTALL | re.IGNORECASE) for rx in h[2:] ]




def get_request(req, timeout = None) :
    """
        Put our standard browser-like headers on the given request and go fetch it.

        'req' must have an add_header(name, value) method (callers in this file pass a urllib2.Request).
        'timeout' is handed, untouched, to url_getter.url_open_read_with_timeout().

        Returns whatever url_getter.url_open_read_with_timeout() returns.
    """

    headers =   (
                    ( 'User-Agent',       'LYBrowser/00.01 Graph/01.00 Text/01.00 Gen/01.00' ),
                    ( 'Accept',           'text/html'                                        ),
                    ( 'Accept-Language',  'en-us,en'                                         ),
                    ( 'Accept-Charset',   'ISO-8859-1,utf-8'                                 ),
                )

    for name, value in headers :
        req.add_header(name, value)

    return(url_getter.url_open_read_with_timeout(req, timeout))



def _build_strip_before_likely() :
    """
        Build the ordered list of compiled regexes that could_be_lyrics() replaces with "<>"
        before it goes looking for long, tag-free runs of text.

        The order of the returned list is the order the substitutions are applied in.

        All regexes are DOTALL + IGNORECASE except the "comma-separated list of names" one,
        which must stay case-sensitive because it keys on capitalized words.
    """

    flags               =   re.DOTALL + re.IGNORECASE

    #   Attributes that are stripped when they have a quoted (double or single) value.
    quoted_attributes   =   [
                                "ALINK",        "ALT",          "ALLOWTRANSPARENCY",    "BACKGROUND",
                                "BGCOLOR",      "BORDER",       "CLASS",                "FRAMEBORDER",
                                "HEIGHT",       "HREF",         "HSPACE",               "LEFTMARGIN",
                                "LINK",         "MARGINHEIGHT", "MARGINWIDTH",          "NAME",
                                "ONCLICK",      "ONMOUSEOUT",   "ONMOUSEOVER",          "RIGHTMARGIN",
                                "SRC",          "SCROLLING",    "STYLE",                "TARGET",           # SCROLLING used to be misspelled "SROLLING" and so never matched
                                "TITLE",        "TOPMARGIN",    "TYPE",                 "VALIGN",
                                "VALUE",        "VLINK",        "VSPACE",               "WIDTH",
                            ]

    #   Attributes that are stripped when they have an unquoted, numeric value.
    numeric_attributes  =   [
                                "BORDER",       "FRAMEBORDER",  "HEIGHT",               "HSPACE",
                                "MARGINHEIGHT", "MARGINWIDTH",  "SCROLLING",            "VSPACE",
                                "WIDTH",
                            ]

    #   CSS property names.
    style_properties    =   [
                                "font-family",
                                "font-size",
                                "font-weight",
                                "scrollbar-base-color",
                                "scrollbar-face-color",
                                "text-decoration",
                            ]

    patterns            =   [
                                r"<meta[^>]+?>",

                                r"<script.+?</script\b",
                                r"<title.+?</title\b",

                                r"http:",

                                #   r"\{[^}]+\}" is purposely not here: {name of performer} is used to say who is singing/talking
                                r"<style>.*?</style>",                                  # alternate way to get rid of a lot of css stuff

                                r"\%20",
                            ]

    patterns           +=   [ r"\b%s\s*=\s*\"[^\"]+\"" % name for name in quoted_attributes  ]     # NAME="value"
    patterns           +=   [ r"\b%s\s*=\s*\'[^\']+\'" % name for name in quoted_attributes  ]     # NAME='value'

    patterns           +=   [ r"\.(cgi|html|php)\b" ]                                               # web stuff

    patterns           +=   [ r"\b%s\s*=\s*\d+"       % name for name in numeric_attributes ]      # NAME=123
    patterns           +=   [ r"\b%s\b"               % name for name in style_properties   ]

    patterns           +=   [
                                #
                                #   HTML entity stuff
                                #
                                r"\&Atilde;\&copy;",

                                #
                                #   User messages and such
                                #
                                r"\bproperty\s+and\s+copyright\b",
                                r"\beducational\s+purposes\s+and\s+personal\s+use\s+only\b",
                                r"\ball\s+other\s+song\s+lyrics\b",
                                r"\bfollow\s+the\s+link\.\s",
                                r"\bappearing\s+on\s+this\s+site\b",
                                r"\bthe\s+material\s+in\s+question\b",                  # used to be "in\+question", which only matched a literal "in+question"
                                r"\byou\s+have\s+specific\s+requirements\b",
                                r"\bprovided\s+for\s+educational\s+purposes\b",
                                r"\bdesign\s+and\s+layout\s+copyright\b",
                                r"\bAll\s+Rights\s+Reserved\b",
                                r"\bsubmitted\s+and\s+corrected\s+by\s+users\b",
                                r"\bIf\s+you\s+have\s+any\s+lyrics\,\s",
                                r"\bcollection\,\s+other\s+music\s+lyrics\b",
                                r"\bsearch\s+service\s+to\s+find\s+music\s+lyrics\.\s",
                                r"\blist\s+of\s+lyrics\s+our\s+visitors\b",
                                r"\bmusic\s+lyrics\s+extended\s+index\b",
                                r"\bapproving\s+corrections\s+on\s+lyrics\b",
                                r"\bAll\s+lyrics\s+are\s+provided\s+for\b",
                                r"\ball\s+music\s+genres\s+and\s+a\s+lot\b",
                                r"\bOriginal\s+Non-remixed\b",
                                r"\bLive\s+At\s+The\s+Isle\s+Of\s+Wight\s+Festival\b",
                                r"\b\d\d\d\d-\d\d\d\d\b",

                                #
                                #   Month day, year
                                #
                                r"\b\w{3,9}\s+\d{1,2}(?:th|st|rd|nd)?(?:\s+|\,(?:\s*\&nbsp;\s*))\d\d\d\d\b",

                                #
                                #   Dead singers (month day, year helps with this, too)
                                #
                                r"\bsix\s+passengers\.\s+She\s+died\b",
                                r"\bcar\s+crash\.\s+TLC\s+was\b",
                            ]

    compiled            =   [ re.compile(pattern, flags) for pattern in patterns ]

    #
    #   Comma-separated lists of names (case-sensitive on purpose: no IGNORECASE)
    #
    compiled.append(re.compile(r"(?:(?:\s+[A-Z]\w+){2,3}\,){6,}", re.DOTALL))

    #
    #   Special-case a couple of dead singers (month day, year and some special words above help with this, too)
    #
    #   Why can't the "b" be added at the end? Or more be put at the start ????
    #
    compiled.append(re.compile(r"><br>\s+In\s+Memory\s+of\s+[^<]+?\(?\d\d\d\d-\d\d\d\d\)?<br>.{25,400}<div\s+align=\"center\">\s+<center>\s+<table\s+", flags))

    return(compiled)


strip_before_likely =   _build_strip_before_likely()

#   A run of 100 or more things, each of which is a non-angle-bracket character, a <br> tag, or a <p> or </p> tag.
likely_lyrics_re    =   re.compile(r"((?:[^<>]|<br\s*/?>|</?p\s*/?>){100,})",   re.DOTALL | re.IGNORECASE)


def could_be_lyrics(htm) :
    """
        Find the lyrics-like chunks of text in the given html text.

        Returns a list of the chunks, or None if there are none.
        If there are more than one in the file, then the file is probably a listing of lyrics samples.
    """

    #   Knock out things that do not show up inside lyrics by making them "<>", which 'likely_lyrics_re' can't match across.
    for stripper in strip_before_likely :
        htm     = stripper.sub("<>", htm)

    #   Disclaimers and such tend to be spaced way out to the right, so squeeze runs of blanks down to one space.
    #   CR and LF are left alone so that the output stays readable.
    htm         = re.sub(r"[ \t\f\v]+", ' ', htm)

    chunks      = likely_lyrics_re.findall(htm)

    return(chunks or None)                              # an empty list means "nothing found", which callers get as None




def _strip_each_line(s) :
    """
        Strip leading whitespace from the whole string, then replace every match of
        'strip_line_re' with the stripped text of that match's first group.

        NOTE(review): 'strip_line_re' is defined elsewhere in this file - presumably it matches one line at a time. Confirm.
    """

    return(strip_line_re.sub(lambda found : found.group(1).strip(), s.lstrip()))







#
#   Whole "lyrics" texts (stripped and lower-cased) that really mean that there are no lyrics.
#   Only the keys matter. The values are all 1.
#
nothing_found   = {
                        "(collection of sound clips)" : 1,
                        "(instrumental)" : 1,
                        "*instrumental*" : 1,
                        "[an error occurred while processing this directive]" : 1,
                        "dub track" : 1,
                        "instrumental song" : 1,
                        "instrumental song." : 1,
                        "instrumental" : 1,
                        "no lyrics" : 1,
                        "no text" : 1,
                        "this artist has requested that the lyrics to thier songs be removed from the internet, and we respect this decision." : 1,     # "thier" (sic) - must match the page's text
                        "we're sorry but there are no lyrics available for this song." : 1,
                  }


def ignore_these_lyrics(lyrics) :
    """
        Tell whether the given "lyrics" text is really a no-lyrics placeholder or junk.

        The text is stripped and lower-cased, then it's ignored (True is returned) if:
            it is, in whole, one of the 'nothing_found' keys, or
            it starts with "music:", "<font ", "<!-- " or "we haven't lyrics of this song".

        Returns True if the lyrics should be thrown away, False otherwise.
    """

    lyrics = lyrics.strip().lower()

    if  lyrics in nothing_found :                       # 'in' rather than the deprecated has_key(), which Python 3 dropped
        return(True)

    #
    #   NOTE(review): re.match() only anchors the start and none of these has an end anchor,
    #                 so the trailing .{0,250} does not limit how long the ignored text may be.
    #                 That is how it has always behaved, so it is left alone. Confirm it's what is wanted.
    #
    junk_starts = (
                    r"music:\s*.{0,250}",
                    r"<font\s+.{0,250}",
                    r"<!--\s+.{0,250}",
                    r"we haven't lyrics of this song.{0,250}",
                  )

    for junk_start in junk_starts :
        if  re.match(junk_start, lyrics, re.DOTALL) :
            return(True)

    return(False)





def get_lyrics_from_htm(htm) :
    """
        Try each entry of 'lyrics_hosts', in order, against the given html text and
        pull out the lyrics, title and artist from the first entry that yields usable lyrics.

        For each entry 'h':

            h[2]    is searched in 'htm'. Its group 1 is the section that the other regexes are applied to.
            h[5]    is searched in that section. Its group 1 is the raw lyrics.
            h[3]    is searched in that section for the title  (group 1).
            h[4]    is searched in that section for the artist (group 1).
            h[1]    is a number whose sign picks how newlines in the raw lyrics are treated (see below).

        Lyrics that clean up to nothing, or that ignore_these_lyrics() rejects, are dropped and the next entry is tried.

        Returns the tuple ( lyrics, title, artist ).
        'lyrics' is None if no entry yielded lyrics. Otherwise it ends with a single "\\n".
        'title' and 'artist' are None if they were not found.
    """

    artist = None
    title  = None
    lyrics = None

    for h in lyrics_hosts :

        # print h[0]

        # NOTE(review): RuntimeError is presumably what the regex engine raised on running out of recursion in older Pythons - confirm. Either way, it's treated as "no match".
        try :
            g   = h[2].search(htm)
        except RuntimeError :
            g   = None
        if  g :
            sh  = g.group(1)                                # the section of the page that the lyrics/title/artist regexes look at

            # print h[0]

            try :
                lg  = h[5].search(sh)
            except RuntimeError :
                lg   = None
            if  lg and (lg.lastindex > 0) :                 # matched, and at least one group took part in the match

                lyrics = lg.group(1)

                # NOTE(review): tzlib.decode_html_entities() is not visible here - presumably the ' ' is what unknown entities become - confirm
                lyrics = tzlib.decode_html_entities(lyrics, ' ')

                # Normalize CR LF and lone CR line ends to LF
                lyrics = re.sub(r"\r\n", "\n",          lyrics)
                lyrics = re.sub(r"\r",   "\n",          lyrics)

                # If the lyrics match 'pre_re' (defined elsewhere - presumably a <PRE> wrapper), keep only its group 1 and remember that we did
                pre                 = False
                if  pre_re.match(lyrics) :
                    pre             = True
                    lyrics          = pre_re.sub(r'\1', lyrics)

                # h[1] <  0 (and not 'pre') : newlines in the source mean nothing, so runs of them become one space
                # h[1] >  0 (or 'pre')      : newlines in the source are real, so lines are stripped and blank runs are squeezed
                # h[1] == 0 (and not 'pre') : newlines are left as they are
                if  (  h[1] < 0) and (not pre) :
                    lyrics = re.sub(r"\n+", " ",        lyrics)
                elif  (h[1] > 0) or       pre  :
                    lyrics = _strip_each_line(lyrics)
                    if  not pre :
                        lyrics = re.sub(r"\n{2}", "\n", lyrics)
                    lyrics = re.sub(r"\n{3,}", "\n\n",  lyrics)


                # Replace markup matched by the de_*_re regexes (defined elsewhere - judging by the names: <br>, <p>, <i>, and some line-break-ish thing) with newlines or nothing
                lyrics = de_br_re.sub("\n",             lyrics)
                lyrics = de_p_re.sub("\n\n",            lyrics)
                lyrics = de_i_re.sub("",                lyrics)
                lyrics = de_lb_re.sub("\n\n",           lyrics)

                lyrics = _strip_each_line(lyrics)

                # Never more than one blank line in a row
                lyrics = re.sub(r"\n{3,}", "\n\n",      lyrics)

                # 0x92 is replaced with a plain single quote
                lyrics = lyrics.replace("\x92", "'").strip()

                if  (len(lyrics) == 0) or ignore_these_lyrics(lyrics):

                    lyrics = None                           # nothing usable from this entry - go on to the next one

                else :

                    lyrics     += "\n"

                    try :
                        title   = h[3].search(sh)
                    except RuntimeError :
                        title   = None
                    if  title   : title  = tzlib.decode_html_entities(title.group(1),  ' ')

                    try :
                        artist  = h[4].search(sh)
                    except RuntimeError :
                        artist  = None
                    if  artist  : artist = tzlib.decode_html_entities(artist.group(1), ' ')

                    break                                   # first entry with usable lyrics wins

                pass

            pass
        pass

    return( ( lyrics, title, artist ) )


class a_lyrics_getter :
    """
        Look up the lyrics of a song through a Google query and hold the result.

        The object keeps what was asked for (qartist, qalbum, qtitle), what the
        page that yielded the lyrics reported (artist, album, title), the lyrics
        themselves and some bookkeeping. It can save itself to, and load itself
        from, a simple "Name: value" text file (write_file / read_file).
    """

    def __init__(me) :
        """ Start out with every field empty. """

        me.host                 = ""        # network location of the most recently examined result URL
        me.match_host           = ""        # the lyrics_hosts pattern ( h[0] ) found in the URL that yielded the lyrics

        me.qartist              = ""        # what the caller asked for
        me.qalbum               = ""
        me.qtitle               = ""

        me.artist               = ""        # what the lyrics page reported
        me.album                = ""
        me.title                = ""

        me.lyrics               = ""

        me.when                 = ""        # time.asctime() stamp set when lyrics are found

        me.write_htm_path       = ""        # if set (and do_fast), directory to save the page the lyrics came from
        me.write_bad_htm_path   = ""        # if set, directory to save pages that could not be parsed

        me.do_fast              = True      # stop looking at result URLs as soon as lyrics are found

        me.file_up_to_date      = True      # False after read_file() loads one of the older file formats

        pass


    def _save_htm(me, dir_path, host, url, htm) :
        """
            Save a fetched page in a new, randomly named file under dir_path.

            The file is named net_lyrics_NNNNN.htm and begins with an HTML comment
            giving the host and URL the page came from. Random names are tried
            until one is found that does not exist yet. Exactly one file is written.

            Returns the name of the file written.
        """

        while True :
            fn  = os.path.join(dir_path, "net_lyrics_%05u.htm" % ( random.randint(0, 99999) ) )
            if  not os.path.exists(fn) :
                fo  = open(fn, "wb")
                try :
                    fo.write("<!-- %s\r\n     %s\r\n  -->\r\n" % ( host, url ) )
                    fo.write(htm)
                finally :
                    fo.close()

                return(fn)                                                  # one file per page - do not keep looping

            pass

        pass


    def get_lyrics(me, title = None, artist = None, album = None, timeout = None) :
        """
            Search the net for the lyrics of a song.

            Google is queried for 'lyrics "title" "artist"'. Each result URL is
            counted in all_hosts_counts. URLs containing one of the lyrics_hosts
            patterns are fetched and handed to get_lyrics_from_htm() until a page
            yields lyrics.

            title       song title. Nothing is done if it is empty or None.
            artist      artist name (None is treated as "").
            album       only remembered in qalbum - it is not part of the query.
            timeout     passed on to the Google query and to the page fetches.

            On success artist, title, lyrics, when and match_host are filled in
            and album is cleared. When do_fast is set the search stops right
            there and, if write_htm_path is set, the page is saved. When do_fast
            is not set, the remaining result URLs are still counted, so host
            ends up naming the last URL examined.

            If nothing is found, the distinct pages that could not be parsed
            (collected only if write_bad_htm_path is set) are saved to
            write_bad_htm_path when could_be_lyrics() finds exactly one
            candidate in them.

            Returns True if lyrics were found, otherwise False.
        """

        retval      = False

        if  artist is None :    artist  = ""
        if  album  is None :    album   = ""
        if  title  is None :    title   = ""

        if  not len(title) :
            return(retval)

        me.qartist  = artist
        me.qalbum   = album
        me.qtitle   = title

        googler     = GoogleSearch.a_google_html_querier()

        q           = "lyrics \"%s\" \"%s\"" % ( title, artist )

        for rcnt in range(0, 2) :                                           # ask up to twice, stopping once there is a page count
            googler.do_query(q, timeout, False)

            if  googler.page_count() is not None :
                break
            pass

        googler.find_google_urls()

        bad_htms    = []

        for url in googler.urls :

            me.host = urlparse.urlparse(url)[1]
            all_hosts_counts[me.host] = all_hosts_counts.get(me.host, 0) + 1

            if  not retval :
                for h in lyrics_hosts :
                    if  url.find(h[0]) >= 0 :
                        # print "trying", url

                        known_hosts_counts[me.host] = known_hosts_counts.get(me.host, 0) + 1

                        req = urllib2.Request(url)

                        htm = get_request(req, timeout)

                        if  htm :

                            ( l, t, a ) = get_lyrics_from_htm(htm)
                            if  l :

                                found_hosts_counts[me.host] = found_hosts_counts.get(me.host, 0) + 1

                                me.artist       = a
                                me.album        = ""
                                me.title        = t
                                me.lyrics       = l

                                me.when         = time.asctime(time.localtime())
                                me.match_host   = h[0]

                                retval          = True

                                if  me.do_fast :

                                    if  me.write_htm_path :
                                        me._save_htm(me.write_htm_path, me.host, url, htm)

                                    break

                                pass

                            elif me.write_bad_htm_path :
                                bad_htms.append( [ me.host, url, htm ] )
                            pass
                        pass

                    if  retval and me.do_fast :
                        break
                    pass
                pass

            if  retval and me.do_fast :
                break

            pass

        if  not retval :
            htms    = {}
            for bad in bad_htms :
                htms[tzlib.blkcrc32(0, bad[2])] = bad                       # strip duplicates (keyed by the page's CRC)

            for bad in htms.values() :
                ll  = could_be_lyrics(bad[2])
                if  ll and (len(ll) == 1) :
                    me._save_htm(me.write_bad_htm_path, bad[0], bad[1], bad[2])
                pass
            pass

        return(retval)


    def write_file(me, file_name = None) :
        """
            Write the query, the reported values and the lyrics to file_name.

            The file holds one "Name: value" line per field, an empty line, and
            then the stripped lyrics followed by a newline. Nothing is done if
            file_name is empty or None. Sets file_up_to_date after writing.
        """

        if  file_name :

            fo = open(file_name, "wb")
            try :
                fo.write("Artist: %s\n"             % ( me.qartist )                    )
                fo.write("Album:  %s\n"             % ( me.qalbum )                     )
                fo.write("Title:  %s\n"             % ( me.qtitle )                     )

                fo.write("Host:   %s\n"             % ( me.host )                       )
                fo.write("Match_host: %s\n"         % ( me.match_host )                 )

                fo.write("Reported_artist: %s\n"    % ( me.artist )                     )
                fo.write("Reported_album:  %s\n"    % ( me.album )                      )
                fo.write("Reported_title:  %s\n"    % ( me.title )                      )

                fo.write("When:   %s\n"             % ( me.when  )                      )

                fo.write("\n")

                fo.write(me.lyrics.strip())

                fo.write("\n")
            finally :
                fo.close()

            me.file_up_to_date = True

        pass


    def read_file(me, file_name = None) :
        """
            Load the fields from a file.

            Three layouts are tried, newest first: file_re (all fields),
            file_01_re (no match_host) and file_00_re (no match_host and no
            reported album). Every captured value is stripped of surrounding
            white space. file_up_to_date is set only for the newest layout,
            and cleared for the two older ones.

            Returns True if the file matched one of the layouts, otherwise
            False (also when file_name is empty or None).
        """

        if  not file_name :
            return(False)

        fi      = open(file_name, "rb")
        try :
            lyrics  = fi.read()
        finally :
            fi.close()

        g = file_re.match(lyrics)
        if  g :
            ( me.qartist, me.qalbum, me.qtitle, me.host, me.match_host, me.artist, me.album, me.title, me.when, me.lyrics ) = [ s.strip() for s in g.groups() ]
            me.file_up_to_date = True

            return(True)

        g = file_01_re.match(lyrics)
        if  g :
            ( me.qartist, me.qalbum, me.qtitle, me.host,                me.artist, me.album, me.title, me.when, me.lyrics ) = [ s.strip() for s in g.groups() ]
            me.file_up_to_date = False

            return(True)

        g = file_00_re.match(lyrics)
        if  g :
            ( me.qartist, me.qalbum, me.qtitle, me.host,                me.artist,           me.title, me.when, me.lyrics ) = [ s.strip() for s in g.groups() ]
            me.file_up_to_date = False

            return(True)

        return(False)



def a_lyrics_from_file(file_name) :
    """
        Load a lyrics file into a new a_lyrics_getter.

        Returns the loaded object, or None if the file could not be parsed
        by read_file() or if its lyrics are rejected by ignore_these_lyrics().
    """

    me = a_lyrics_getter()

    if  not me.read_file(file_name) :
        return(None)

    # The filter must look at lyrics that were actually read. It used to be
    # consulted only after a failed read, when me.lyrics is still empty.
    if  ignore_these_lyrics(me.lyrics) :                                    # don't give 'em old files that should be purged
        return(None)

    return(me)





#
#       Command line use:
#
#           Arguments ending in .htm or .html are file name patterns (globbed). Each file is
#           run through get_lyrics_from_htm() and the result is printed, with a note if
#           could_be_lyrics() disagrees about whether the page has lyrics.
#
#           Any other argument is used as the title to search the net for.
#
#       The prints are single-argument, parenthesized forms so that they read the same
#       whether "print" is a statement or a function.
#
if  __name__ == '__main__' :

    if  len(sys.argv) < 2 :

        print("Tell me a title artist search string (or a .htm file).")

    else :
        import  glob

        import  TZCommandLineAtFile

        del(sys.argv[0])

        TZCommandLineAtFile.expand_at_sign_command_line_files(sys.argv)

        while   len(sys.argv) > 0 :

            ss  = sys.argv.pop(0)

            if  ss.endswith(".htm") or ss.endswith(".html") :

                for fn in glob.glob(ss) :

                    fi      = open(fn, "rb")
                    try :
                        htm = fi.read()
                    finally :
                        fi.close()                                          # don't leave the file open until garbage collection

                    (lyrics, title, artist) = get_lyrics_from_htm(htm)

                    banner  = "------ %s  ------- %s" % ( artist, title )

                    print("")
                    print(fn)
                    print(banner)
                    print(lyrics)
                    print(banner)

                    ply =   could_be_lyrics(htm)
                    if  (ply and not lyrics) or (lyrics and not ply) :
                        # ply may be empty or None here (lyrics parsed, but no candidates found)
                        print("Has (or not, if it should) lyrics: %s %s" % ( len(ply or []), ply ))
                    pass
                pass
            else :
                lg = a_lyrics_getter()

                if  lg.get_lyrics(ss) :
                    heading = "%s - %s" % ( lg.artist, lg.title )

                    print(heading)
                    print(lg.lyrics)
                    print(heading)
                pass
        pass

    pass

#
#
#
# eof
