#!/usr/bin/python

# MusicMetaData.py
#       --copyright--                   Copyright 2007 (C) Tranzoa, Co. All rights reserved.    Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality.
#       --url--                         http://www.tranzoa.net/tzpython/
#       --email--                       pycode is the name to send to. tranzoa.com is the place to send to.
#       --bodstamps--
#       May 12, 2003            bar
#       May 13, 2003            bar     print_file
#                                       savage_beast_id
#       May 14, 2003            bar     handle "correct" XML - fix an assert or something
#       May 15, 2003            bar     double-underscore privates
#       May 16, 2003            bar     id_idx()
#       May 27, 2003            bar     savage_beast_id_idx()
#       June 9, 2003            bar     print song
#                                       matching_savage_beast_ids
#       June 11, 2003           bar     dupe_of_id
#       June 17, 2003           bar     allow the escape routine to take an optional escape pattern parameter
#       June 18, 2003           bar     convert_music_info_to_tokens() moved here from PJbayes.py
#                                       use the numeric_tokens logic to group file sizes into 16 buckets in tokenizing the songs' metadata
#                                         (in practice, it does worse with it turned on, so I put a default of off)
#       June 19, 2003           bar     force input to escaping routines to be strings
#                                       tell the numeric tokenizer to index the token names
#       July 10, 2003           bar     allow caller to specify that the tokens aren't prefixed in convert_music_info_to_tokens()
#                                       allow caller to specify XML tags whose tokens should be ignored by convert_music_info_to_tokens()
#       August 5, 2003          bar     allow main program to merge a particular key from one file to another and output 'em to a third file
#       December 9, 2003        bar     shuffle the single output file
#       September 11, 2004      bar     allow id to be numeric in print_song
#       March 2, 2005           bar     gracefully handle empty files when reading/parsing
#                                       don't write out empty fields
#       November 18, 2007       bar     turn on doxygen
#       November 27, 2007       bar     insert boilerplate copyright
#       May 12, 2008            bar     don't use id as a variable name
#       May 17, 2008            bar     email adr
#       November 15, 2008       bar     egad! I've been making default params as [] and {}
#       May 12, 2011            bar     go to utf8 and entity encoding - incompatible with old code and data
#       May 27, 2012            bar     doxygen namespace
#       --eodstamps--
##      \file
#       \namespace              tzpython.MusicMetaData
#
#
#       This module reads a really simple "XML" text file.
#
#       Actually, the file looks like this:
#
#       <id_info>
#         <sub_tag>maybe some text with ignored leading and trailing white space</sub_tag>
#         <another_sub_tag>maybe some text with ignored leading and trailing white space</another_sub_tag>
#         <yet_another_sub_tag>maybe some text with ignored leading and trailing white space</yet_another_sub_tag>
#         </id_info>
#
#       This logic is really pretty forgiving ... and hard-coded for one level of depth.
#         For instance, text outside the inner tags is ignored.
#         As is the closing </id_info> tag.
#
#

import  os
import  re
import  string

import  numeric_tokens
import  tzlib


#
#
#   Pythonism: "from MusicMetaData import *" imports only the names listed in __all__ below.
#
#
__all__ = [
            # escaping / unescaping of field text
            "escape_str",
            "unescape_str",


            # factory for a song dictionary with every standard key present
            "empty_song",


            # tokenizing of song meta data
            "convert_music_info_to_tokens",


            # reading
            "parse_file",

            # writing (print_file_header and print_file_trailer are not exported)
            "print_song",
            "print_file",

            # searching a list of songs
            "id_idx",
            "savage_beast_id_idx",
          ]


def __esc_chr(s) :
    """
        re.sub() callback: turn the matched character into a decimal
        numeric character reference, e.g. "<" becomes "&#60;".

        s is the regular expression match object. Only the whole match
        (group 0) is used, and it must be exactly one character long
        because it is handed to ord().
    """
    return("&#%d;" % ord(s.group(0)))

def escape_str(s, esc_pattern = ur"[\000-\037\%\&\<\>\177-\377\u0100-\uffff]") :
    if  not isinstance(s, basestring) :
        s   = str(s)
    try     :
        s   = unicode(s, 'utf8')
    except  TypeError :
        pass
    except  UnicodeDecodeError :
        pass
    # print "@@@@", [ [ ord(c), str(c) ] for c in s ], repr(s)
    try     :
        s   = s.encode('utf8')
    except UnicodeDecodeError :
        s   = s.decode('latin1')
        s   = s.encode('utf8')
    s   = re.sub(esc_pattern, __esc_chr, s)
    return(s)

def unescape_str(s) :
    """
        Undo the escaping found in a stored field value.

        Two escape forms are handled, in this order:

          1. "%XX" - two hex digits after a percent sign. Each one is
             replaced by the UTF-8 encoding of that code point.
          2. HTML entities - handed to tzlib.decode_html_entities().

        Afterwards the text is converted toward a unicode string on a
        best-effort basis. If none of the conversions work, whatever
        string is in hand at that point is returned.

        Raises ValueError if a "%" is not followed by two hex digits.

        NOTE(review): the statement order inside the try blocks matters.
        A partially completed conversion is kept when a later step fails.
    """
    # everything after the first piece begins with the two hex digits that followed a "%"
    ss = str(s).split("%")
    for i in range(1, len(ss)) :
        ss[i] = unichr(int(ss[i][0:2], 16)).encode('utf8') + ss[i][2:]
    ss  = "".join(ss)
    try :
        ss  = tzlib.decode_html_entities(ss)            # produces string of latin1 - ugh
    except  ( UnicodeEncodeError, UnicodeDecodeError ) :
        # retry with the bytes interpreted as latin1 text
        ss  = ss.decode('latin1')
        ss  = tzlib.decode_html_entities(ss)

    try :
        # re-interpret the latin1 code points as UTF-8 bytes
        ss  = ss.encode('latin1')
        ss  = unicode(ss, 'utf8')
    except  TypeError :
        pass
    except  ( UnicodeEncodeError, UnicodeDecodeError ) :
        try :
            # last resort: treat what we have as UTF-8 bytes
            ss  = ss.decode('utf8')
            ss  = unicode(ss)
        except  ( UnicodeEncodeError, UnicodeDecodeError ) :
            pass
        pass
    return(ss)


def empty_song() :
    """
        Return a new song dictionary in which every standard field is
        present and set to the empty string.

        A fresh dictionary is built on every call, so callers may fill it in freely.
    """
    field_names = (
                    'id',

                    'full_id',
                    'clip_id',
                    'quick_id',

                    'file_name',

                    'audio_url',

                    'artist',
                    'title',
                    'album',
                    'disc',
                    'track',
                    'genre',

                    'year',
                    'date',

                    'duration',

                    'file_size',

                    'savage_beast_id',
                    'matching_savage_beast_ids',

                    'dupe_of_id',
                  )

    return(dict.fromkeys(field_names, ""))



def convert_music_info_to_tokens(md, simplify_numeric = False, no_prefixes = False, ignore_tags = None) :
    """
        Convert an array of MusicMetaData music data to a map, keyed by ID, containing a text token array.

        md                  Sequence of song dictionaries. Each must have an 'id' key,
                            and every value must be a string (it is measured with len()
                            and split on white space).
        simplify_numeric    If true, the tokens are run through a numeric_tokens
                            translator (learn() on every song, then translate()) so
                            that numeric tokens are grouped into 16 buckets per token
                            "name". md is walked twice in that case.
        no_prefixes         If true, tokens are the bare lower-cased words. Otherwise
                            each word is prefixed with its tag name and "~".
        ignore_tags         Container of tag names whose values are skipped. Anything
                            supporting "in" works (dictionary, list, tuple, set).
                            None means no tags are ignored.

        Returns a dictionary mapping song id to its token list. If two songs
        share an id, the later one wins.

        NOTE(review): with prefixes on, each token goes through str(), which under
        Python 2 will presumably raise UnicodeEncodeError for non-ASCII unicode values.
    """

    if  ignore_tags is None :
        ignore_tags = ()

    me  = {}

    # the translator is only needed (and so only built) when numeric simplification is asked for
    nt  = None
    if  simplify_numeric :
        nt = numeric_tokens.a_numeric_token_translator(16, 1)

    for song in md :
        tokens  = []

        for tag in song.keys() :
            val = song[tag]
            if  len(val) and (tag not in ignore_tags) :
                words = re.split(r"\s+", val)

                if  not no_prefixes :
                    words = [ str(tag + "~" + w) for w in words ]

                tokens.extend([ w.lower() for w in words ])

        if  nt is not None :
            nt.learn(tokens)

        me[song['id']] = tokens

    if  nt is not None :
        # fix up numeric tokens so that there are only 16 different tokens by each "name".
        # The "name" is the non-numeric, left part of the token.
        translated = set()
        for song in md :
            sid = song['id']
            if  sid not in translated :                 # a duplicated id must not be translated twice
                me[sid] = nt.translate(me[sid])
                translated.add(sid)

    return(me)




def parse_file(f, meta_tag = "<id_info>") :
    """
        Read a music meta data file and return a list of song dictionaries.

        f           A file name, or an object with a read() method (an open
                    file or any file-like object). A file opened here is
                    closed again before returning. A file object that is
                    passed in is left open.
        meta_tag    The text that starts each song record.

        The text is split on meta_tag. Within each piece, every
        <tag>text</tag> pair becomes song[tag] = text, with trailing white
        space removed and the text run through unescape_str(). Anything
        outside such pairs is ignored. A piece with no pairs (for example
        an empty file) produces no song. Each song starts out as
        empty_song(), so the standard keys are always present.

        Raises AssertionError if an opening tag is closed by a different tag.
    """

    if  hasattr(f, 'read') :
        text    = f.read()
    else :
        fi      = open(f)
        try :
            text = fi.read()
        finally :
            fi.close()

    chunks      = text.split(meta_tag)

    # The flags are given to re.compile() because the 4th positional parameter of re.sub() is a count, not flags.
    chunks[0]   = re.compile(r"^\s*<music_list>\s*",    re.IGNORECASE            ).sub("", chunks[0])
    chunks[-1]  = re.compile(r"\s*</music_list>\s*$",   re.IGNORECASE | re.DOTALL).sub("", chunks[-1])

    pat     = re.compile(r"<([^\/>][^>]*)>\s*([^<]*)<\/([^>]+)>")

    retval  = []

    for chunk in chunks :
        tag_val_tags = pat.findall(chunk)
        if  not tag_val_tags :
            continue

        song = empty_song()

        for ( open_tag, val, close_tag ) in tag_val_tags :
            if  open_tag != close_tag :
                # raised explicitly so that the check survives python -O
                raise AssertionError("tv02 %s %s %s" % ( open_tag, val, close_tag ))

            song[open_tag] = unescape_str(val.rstrip())

        retval.append(song)

    return(retval)



def id_idx(me, sid) :
    """
        Look for the first song in the array 'me' whose 'id' equals sid.
        Return its array index.
        Return None if there is no such song or if sid is empty.
    """
    if  not sid :
        return(None)

    for ( idx, song ) in enumerate(me) :
        if  song['id'] == sid :
            return(idx)

    return(None)




def savage_beast_id_idx(me, sid) :
    """
        Look for the first song in the array 'me' whose 'savage_beast_id' equals sid.
        Return its array index.
        Return None if there is no such song or if sid is empty.
    """
    if  not sid :
        return(None)

    for ( idx, song ) in enumerate(me) :
        if  song['savage_beast_id'] == sid :
            return(idx)

    return(None)




from types import FileType


def print_file_header(f) :
    """
        Write the opening <music_list> line, followed by one blank line, to f.
    """
    print     >> f, "<music_list>\n"                    # the embedded newline supplies the blank line

def print_file_trailer(f) :
    """
        Write one blank line, followed by the closing </music_list> line, to f.
    """
    print     >> f
    print     >> f, "</music_list>"


def print_song(song, f = None) :
    """
        Write one song as an <id_info> record.

        song    Dictionary of tag name to value.
        f       File object to write to. None means standard output.

        Tags are written in sorted order, one per line, with the value run
        through escape_str(). Tags whose value is false (for example an
        empty string) are not written. A blank line follows the record.
    """
    print     >> f, "<id_info>"

    for tag in sorted(song.keys()) :
        val = song[tag]
        if  val :
            print >> f, "".join(( "  <", tag, ">", escape_str(val), "</", tag, ">" ))

    print     >> f, "  </id_info>"
    print     >> f



def print_file(f, songs) :
    """
        Write songs to a music meta data file.

        f       A file name, or an object with a write() method (an open
                file or any file-like object).
        songs   Sequence of song dictionaries. Each must have a non-empty 'id'.

        When f is a file name, the file is created, the <music_list> header
        and trailer are written around the songs, and the file is closed
        again - even if writing a song fails.

        When f is already a file object, only the song records are written.
        The header, the trailer and closing the file are left to the caller.

        Raises AssertionError if a song's id is empty.
    """
    opened  = not hasattr(f, 'write')

    if  opened :
        f   = open(f, "w")

    try :
        if  opened :
            print_file_header(f)

        for s in songs :
            if  not len(str(s['id'])) :
                # raised explicitly so that the check survives python -O
                raise AssertionError("song has an empty id: %r" % ( s, ))
            print_song(s, f)

        if  opened :
            print_file_trailer(f)
    finally :
        if  opened :
            f.close()




#
#
#
if __name__ == '__main__':

    import sys


    if  len(sys.argv) <= 1 :

        print "Tell me a music meta data file to parse!"

    elif (len(sys.argv) == 4) or (len(sys.argv) == 5) :                 # combine two files into a third

        if  os.path.exists(sys.argv[3]) :
            print "No. Please whack", sys.argv[3], "so that I can create a new one by combining", sys.argv[1], "and", sys.argv[2]
        else :
            md1 = parse_file(sys.argv[1])
            md2 = parse_file(sys.argv[2])
            for i in range(0, len(md2)) :
                s2 = md2[i]
                sid = s2['id']
                s1  = id_idx(md1, sid)
                if  s1 == None :
                    print "No ID", sid, "in", sys.argv[1]
                else :
                    s1 = md1[s1]
                    if  len(sys.argv) == 5 :
                        k   = sys.argv[4]
                        if  s2.has_key(k) and ((not s1.has_key(k)) or (not len(s1[k]))) :
                            s1[k] = s2[k]
                        pass
                    else :
                        for k in s2.keys() :
                            if  (not s1.has_key(k)) or (not len(s1[k])) :
                                s1[k] = s2[k]
                            pass
                        pass
                    pass
                pass

            print_file(sys.argv[3], md1)
        pass

    else :
        import  random

        for fn in sys.argv[1:] :
            md = parse_file(fn)

            if  len(md) :
                print_song(md[0])

            random.shuffle(md);

            print_file("x.y", md)

#
#
#
# eof
