#!/usr/bin/python # net_lyrics.py # --copyright-- Copyright 2007 (C) Tranzoa, Co. All rights reserved. Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality. # --url-- http://www.tranzoa.net/tzpython/ # --email-- pycode is the name to send to. tranzoa.com is the place to send to. # --bodstamps-- # March 1, 2005 bar # March 2, 2005 bar typo # March 4, 2005 bar replace 0x92 with single quotes # pur \n at the end of the lyrics # lyricsdownload.com # March 5, 2005 bar rename a routine # new sites # more counts # write out files we are unable to parse # March 10, 2005 bar lyred # March 12, 2005 bar change the meaning of .host to be the snagged host # file io #
]+>\s*(.+?)\s*\s*$", re.IGNORECASE + re.DOTALL) de_br_re = re.compile(r"?br\s*/?>", re.IGNORECASE) de_p_re = re.compile(r"?p\s*/?>", re.IGNORECASE) de_i_re = re.compile(r"?i\s*/?>", re.IGNORECASE) de_lb_re = re.compile(r"\n{2,}", re.IGNORECASE) strip_line_re = re.compile(r"^(.*)$", re.MULTILINE) opener = urllib2.build_opener() opener.addheaders = [] # get rid of 'User-agent' the only way that seems to work (yes, I tried lower-casing 'Agent') urllib2.install_opener(opener) all_hosts_counts = {} known_hosts_counts = {} found_hosts_counts = {} lyrics_hosts = [ [ 'lyricsfreak.com', -1, r"
", r"", r"", r"(.*?)", ], [ 'nomorelyrics.net', # also successfully matches elyrics.net ! -1, r"
", r"", r"", r"(.*?)", ], [ 'nomorelyrics.net', -1, r"
(.*?)\s+lyrics | ",
r"(.*?)\s+lyrics",
r"
Song | \s*([^<]+) | ", r"Artist | \s*([^<]+) | ", r"(.*?) | \s*\s*
(.*?)", ], [ 'lyricsondemand.com', 1, r"(class=\"NoUnder\".*)
(.*?)", ], [ 'lyricscafe.com', -1, r"(.*)", r"\s*(.*?)\s*::.*?", r".*?::\s*(.*?)\s*", r"class=\"NormalText\">.*?
\s*(.*?)\s*Lyrics
",
r"class=\"Red\">.*?
\s*
(.+)",
],
[ 'arelyrics.com',
-1,
r"
.*?", re.DOTALL + re.IGNORECASE), # alternate way to get rid of a lot of css stuff
re.compile(r"\%20", re.DOTALL + re.IGNORECASE),
#
#
# Attributes with double-quoted values
#
#
re.compile(r"\bALINK\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bALT\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bALLOWTRANSPARENCY\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bBACKGROUND\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bBGCOLOR\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bBORDER\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bCLASS\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bFRAMEBORDER\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bHEIGHT\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bHREF\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bHSPACE\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bLEFTMARGIN\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bLINK\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bMARGINHEIGHT\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bMARGINWIDTH\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bNAME\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bONCLICK\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bONMOUSEOUT\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bONMOUSEOVER\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bRIGHTMARGIN\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bSRC\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
# SCROLLING="..." (double-quoted value); attribute name spelled as in the unquoted-numeric pattern.
re.compile(r"\bSCROLLING\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bSTYLE\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bTARGET\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bTITLE\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bTOPMARGIN\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bTYPE\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bVALIGN\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bVALUE\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bVLINK\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bVSPACE\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
re.compile(r"\bWIDTH\s*=\s*\"[^\"]+\"", re.DOTALL + re.IGNORECASE),
#
#
# Attributes with single-quoted values
#
#
re.compile(r"\bALINK\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bALT\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bALLOWTRANSPARENCY\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bBACKGROUND\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bBGCOLOR\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bBORDER\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bCLASS\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bFRAMEBORDER\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bHEIGHT\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bHREF\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bHSPACE\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bLEFTMARGIN\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bLINK\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bMARGINHEIGHT\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bMARGINWIDTH\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bNAME\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bONCLICK\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bONMOUSEOUT\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bONMOUSEOVER\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bRIGHTMARGIN\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bSRC\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
# SCROLLING='...' (single-quoted value); attribute name spelled as in the unquoted-numeric pattern.
re.compile(r"\bSCROLLING\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bSTYLE\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bTARGET\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bTITLE\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bTOPMARGIN\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bTYPE\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bVALIGN\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bVALUE\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bVLINK\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bVSPACE\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
re.compile(r"\bWIDTH\s*=\s*\'[^\']+\'", re.DOTALL + re.IGNORECASE),
#
#
# Web stuff
#
#
re.compile(r"\.(cgi|html|php)\b", re.DOTALL + re.IGNORECASE),
#
#
# Attributes with unquoted, numeric values
#
#
re.compile(r"\bBORDER\s*=\s*\d+", re.DOTALL + re.IGNORECASE),
re.compile(r"\bFRAMEBORDER\s*=\s*\d+", re.DOTALL + re.IGNORECASE),
re.compile(r"\bHEIGHT\s*=\s*\d+", re.DOTALL + re.IGNORECASE),
re.compile(r"\bHSPACE\s*=\s*\d+", re.DOTALL + re.IGNORECASE),
re.compile(r"\bMARGINHEIGHT\s*=\s*\d+", re.DOTALL + re.IGNORECASE),
re.compile(r"\bMARGINWIDTH\s*=\s*\d+", re.DOTALL + re.IGNORECASE),
re.compile(r"\bSCROLLING\s*=\s*\d+", re.DOTALL + re.IGNORECASE),
re.compile(r"\bVSPACE\s*=\s*\d+", re.DOTALL + re.IGNORECASE),
re.compile(r"\bWIDTH\s*=\s*\d+", re.DOTALL + re.IGNORECASE),
#
#
# Style stuff
#
#
re.compile(r"\bfont-family\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\bfont-size\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\bfont-weight\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\bscrollbar-base-color\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\bscrollbar-face-color\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\btext-decoration\b", re.DOTALL + re.IGNORECASE),
#
#
# HTML entity stuff
#
#
re.compile(r"\Ã\©", re.DOTALL + re.IGNORECASE),
#
#
# User messages and such
#
#
re.compile(r"\bproperty\s+and\s+copyright\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\beducational\s+purposes\s+and\s+personal\s+use\s+only\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\ball\s+other\s+song\s+lyrics\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\bfollow\s+the\s+link\.\s", re.DOTALL + re.IGNORECASE),
re.compile(r"\bappearing\s+on\s+this\s+site\b", re.DOTALL + re.IGNORECASE),
# Phrase "the material in question"; words separated by any run of whitespace, like the sibling phrase patterns.
re.compile(r"\bthe\s+material\s+in\s+question\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\byou\s+have\s+specific\s+requirements\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\bprovided\s+for\s+educational\s+purposes\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\bdesign\s+and\s+layout\s+copyright\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\bAll\s+Rights\s+Reserved\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\bsubmitted\s+and\s+corrected\s+by\s+users\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\bIf\s+you\s+have\s+any\s+lyrics\,\s", re.DOTALL + re.IGNORECASE),
re.compile(r"\bcollection\,\s+other\s+music\s+lyrics\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\bsearch\s+service\s+to\s+find\s+music\s+lyrics\.\s", re.DOTALL + re.IGNORECASE),
re.compile(r"\blist\s+of\s+lyrics\s+our\s+visitors\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\bmusic\s+lyrics\s+extended\s+index\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\bapproving\s+corrections\s+on\s+lyrics\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\bAll\s+lyrics\s+are\s+provided\s+for\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\ball\s+music\s+genres\s+and\s+a\s+lot\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\bOriginal\s+Non-remixed\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\bLive\s+At\s+The\s+Isle\s+Of\s+Wight\s+Festival\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\b\d\d\d\d-\d\d\d\d\b", re.DOTALL + re.IGNORECASE),
#
#
# Month day, year
#
#
re.compile(r"\b\w{3,9}\s+\d{1,2}(?:th|st|rd|nd)?(?:\s+|\,(?:\s*\ \s*))\d\d\d\d\b", re.DOTALL + re.IGNORECASE),
#
#
# Dead singers (month day, year helps with this, too)
#
#
re.compile(r"\bsix\s+passengers\.\s+She\s+died\b", re.DOTALL + re.IGNORECASE),
re.compile(r"\bcar\s+crash\.\s+TLC\s+was\b", re.DOTALL + re.IGNORECASE),
#
#
# Comma-separated lists of names
#
#
re.compile(r"(?:(?:\s+[A-Z]\w+){2,3}\,){6,}", re.DOTALL),
#
#
# Special-case a couple of dead singers (month day, year and some special words above help with this, too)
#
#
# Why can't the "b" be added at the end? Or more be put at the start ????
re.compile(r">
\s+In\s+Memory\s+of\s+[^<]+?\(?\d\d\d\d-\d\d\d\d\)?
.{25,400}