#!/usr/bin/python

# MusicMetaData.py
#       --copyright--                   Copyright 2007 (C) Tranzoa, Co. All rights reserved.    Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality.
#       --url--                         http://www.tranzoa.net/tzpython/
#       --email--                       pycode is the name to send to. tranzoa.com is the place to send to.
#       --bodstamps--
#       May 12, 2003            bar
#       May 13, 2003            bar     print_file
#                                       savage_beast_id
#       May 14, 2003            bar     handle "correct" XML - fix an assert or something
#       May 15, 2003            bar     double-underscore privates
#       May 16, 2003            bar     id_idx()
#       May 27, 2003            bar     savage_beast_id_idx()
#       June 9, 2003            bar     print song
#                                       matching_savage_beast_ids
#       June 11, 2003           bar     dupe_of_id
#       June 17, 2003           bar     allow the escape routine to take an optional escape pattern parameter
#       June 18, 2003           bar     convert_music_info_to_tokens() moved here from PJbayes.py
#                                       use the numeric_tokens logic to group file sizes into 16 buckets in tokenizing the songs' metadata
#                                         (in practice, it does worse with it turned on, so I put a default of off)
#       June 19, 2003           bar     force input to escaping routines to be strings
#                                       tell the numeric tokenizer to index the token names
#       July 10, 2003           bar     allow caller to specify that the tokens aren't prefixed in convert_music_info_to_tokens()
#                                       allow caller to specify XML tags whose tokens should be ignored by convert_music_info_to_tokens()
#       August 5, 2003          bar     allow main program to merge a particular key from one file to another and output 'em to a third file
#       December 9, 2003        bar     shuffle the single output file
#       September 11, 2004      bar     allow id to be numeric in print_song
#       March 2, 2005           bar     gracefully handle empty files when reading/parsing
#                                       don't write out empty fields
#       November 18, 2007       bar     turn on doxygen
#       November 27, 2007       bar     insert boilerplate copyright
#       May 12, 2008            bar     don't use id as a variable name
#       May 17, 2008            bar     email adr
#       November 15, 2008       bar     egad! I've been making default params as [] and {}
#       May 12, 2011            bar     go to utf8 and entity encoding - incompatible with old code and data
#       May 27, 2012            bar     doxygen namespace
#       February 23, 2023       bar     get rid of has_keys
#       March 5, 2023           bar     future print
#       March 7, 2023           bar     untested python3
#       --eodstamps--
##      \file
#       \namespace              tzpython.MusicMetaData
#
#
#       This module reads a really simple "XML" text file.
#
#       Actually, the file looks like this:
#
#       <id_info>
#         <sub_tag>maybe some text with ignored leading and trailing white space</sub_tag>
#         <another_sub_tag>maybe some text with ignored leading and trailing white space</another_sub_tag>
#         <yet_another_sub_tag>maybe some text with ignored leading and trailing white space</yet_another_sub_tag>
#         </id_info>
#
#       This logic is really pretty forgiving ... and hard-coded for one level of depth.
#         For instance, text outside the inner tags is ignored.
#         As is the closing </id_info> tag.
#
#

from    __future__  import  print_function

import  os
import  re
import  sys
import  string


import  numeric_tokens
import  tzlib


#
#
#   Pythonism: if the guy says "from MusicMetaData import *", he gets these names, only.
#
#
__all__ = [
            "escape_str",
            "unescape_str",


            "empty_song",


            "convert_music_info_to_tokens",


            "parse_file",

            "print_song",
            "print_file",

            "id_idx",
            "savage_beast_id_idx",
          ]


def __esc_chr(s) :
    """
        re.sub() callback: turn the matched text into a decimal numeric character reference.

        s is the match object. The whole match (group 0) is handed to ord(),
        so the pattern is expected to match exactly one character.
        Returns a string of the form "&#NNN;".
    """
    code_point  = ord(s.group(0))
    return("&#" + str(code_point) + ";")

# String-literal prefix used when building the default escape pattern through eval():
# Python 2 needs a "u" prefix to get a unicode literal, Python 3 literals need nothing.
UR          = "u" if sys.version_info[0] < 3 else ""

def escape_str(s, esc_pattern = eval(UR + '"[\\000-\\037\\%\\&\\<\\>\\177-\\377\u0100-\uffff]"')) :
    """
        Return s with every character matching esc_pattern replaced by "&#NNN;" (decimal character code).

        s               The value to escape. Anything tzlib.is_stringish() rejects is run through str() first.
        esc_pattern     Regular expression selecting the characters to escape.
                        The default is built once, at definition time, and covers control characters,
                        '%', '&', '<', '>' and everything from \\177 up through \\uffff.

        The text is passed through tzlib.convert_to_unicode() before substitution
        (presumably so the pattern sees characters rather than encoded bytes - confirm against tzlib).
    """
    text    = s if tzlib.is_stringish(s) else str(s)
    text    = tzlib.convert_to_unicode(text)
    # print("@@@@", [ [ ord(c), str(c) ] for c in text ], repr(text))
    return(re.sub(esc_pattern, __esc_chr, text))

def unescape_str(s) :
    """
        Decode an escaped field value read from a meta data file.

        Two layers are undone, in this order:
          - "%XX" sequences: the two characters after each '%' are parsed as hex and replaced by chr() of that value.
            (int() raises ValueError if they are not hex digits.)
          - Whatever tzlib.decode_html_entities() decodes (presumably the "&#NNN;" forms escape_str() writes - confirm against tzlib).

        Under Python 2 the result is finally passed through tzlib.convert_to_utf8().
    """
    text        = tzlib.convert_to_unicode(s)

    # The first piece had no '%' in front of it; every later piece starts with the two hex digits that followed one.
    pieces      = text.split("%")
    text        = "".join(pieces[:1] + [ chr(int(piece[0:2], 16)) + piece[2:] for piece in pieces[1:] ])

    try :
        text    = tzlib.decode_html_entities(text)      # produces string of latin1 - ugh (March 7, 2023 nope, it wants and returns unicode)
    except  ( UnicodeEncodeError, UnicodeDecodeError ) :
        # Fall back to treating the data as latin1 bytes and try the entity decode again.
        text    = tzlib.decode_html_entities(text.decode('latin1'))

    if  sys.version_info[0] < 3 :
        text    = tzlib.convert_to_utf8(text)

    return(text)


def empty_song() :
    """
        Return a new song dictionary holding every known field, each set to the empty string.

        A fresh dictionary is built on every call, so callers may modify the result freely.
    """
    field_names = (
        'id',

        'full_id',
        'clip_id',
        'quick_id',

        'file_name',

        'audio_url',

        'artist',
        'title',
        'album',
        'disc',
        'track',
        'genre',

        'year',
        'date',

        'duration',

        'file_size',

        'savage_beast_id',
        'matching_savage_beast_ids',

        'dupe_of_id',
    )

    return(dict.fromkeys(field_names, ""))


def convert_music_info_to_tokens(md, simplify_numeric = False, no_prefixes = False, ignore_tags = None) :
    """
        Convert an array of MusicMetaData music data to a map, keyed by ID, containing a text token array.

        md                  Iterable of song dictionaries (as parse_file() returns). Each must have an 'id' key.
        simplify_numeric    If true, every song's tokens are fed to a numeric_tokens translator (built with 16, 1)
                            and then replaced by what its translate() returns.
        no_prefixes         If false (the default), each token is prefixed with its field name and "~".
        ignore_tags         Container of field names whose values must not produce tokens. None means "ignore nothing".

        Returns a dictionary mapping each song's 'id' to a list of lower-cased tokens.
        Field values are split on runs of white space; empty fields contribute nothing.
        Songs sharing an 'id' overwrite each other - the last one wins.
    """

    if  ignore_tags is None :
        ignore_tags =  {}

    me = {}

    nt = numeric_tokens.a_numeric_token_translator(16, 1)

    for song in md :
        sid    = song['id']
        tokens = []

        for v in song.keys() :
            if  len(song[v]) and not (v in ignore_tags) :
                t = re.split(r"\s+", song[v])

                if  not no_prefixes :
                    t = [ str(v + "~" + tok) for tok in t ]

                # Use the str method: string.lower() no longer exists in Python 3.
                tokens.extend(tok.lower() for tok in t)
            pass

        # print(tokens)

        if  simplify_numeric :  nt.learn(tokens)

        me[sid] = tokens


    if  simplify_numeric :
        # Second pass: the translator has now seen every song, so it can map the numbers.
        for song in md :
            sid     = song['id']

            me[sid] = nt.translate(me[sid])             # fix up numeric tokens so that there are only 16 different tokens by each "name". The "name" is the non-numeric, left part of the token.
        pass

    return(me)


def parse_file(f, meta_tag = "<id_info>") :
    """
        Parse a music meta data file into a list of song dictionaries.

        f           Either an object with a read() method (read, but not closed, here)
                    or the name of a file to open, read and close.
        meta_tag    The opening tag that starts each song's record.

        Returns a list with one dictionary for each record that contains at least one
        <tag>value</tag> pair. Each dictionary starts out as empty_song() and then gets
        the record's tags stored in it, with values right-stripped and run through unescape_str().
        Empty input yields an empty list.

        Text outside the inner tags is ignored, as is an enclosing <music_list> ... </music_list> (any letter case).
        Asserts if an opening tag's name does not equal its closing tag's name.
    """

    if  hasattr(f, 'read') :
        fd  = f.read()
    else    :
        with open(f) as fi :
            fd  = fi.read()

    fd      = fd.split(meta_tag)

    if  len(fd) != 0    :
        # The flags must be passed by keyword: the 4th positional parameter of re.sub() is the replacement count.
        # (The patterns contain no '.', so re.DOTALL is not needed.)
        fd[0]   = re.sub(r"^\s*<music_list>\s*",  "", fd[0],  flags = re.IGNORECASE)
        fd[-1]  = re.sub(r"\s*</music_list>\s*$", "", fd[-1], flags = re.IGNORECASE)

    # group 1: opening tag name, group 2: text (leading white space skipped), group 3: closing tag name
    pat     = re.compile(r"<([^\/>][^>]*)>\s*([^<]*)<\/([^>]+)>")

    retval  = []

    for t in fd :
        if  len(t) :
            # print("[" + t + "]")
            tag_val_tags = pat.findall(t)

            if  len(tag_val_tags) :
                song = empty_song()

                for tv in tag_val_tags :
                    if  tv[0] != tv[2] :
                        print("tv02", tv[0], tv[1], tv[2])
                        assert (tv[0] == tv[2])

                    v = unescape_str(tv[1].rstrip())

                    # print(tv[0] + ":" + v)

                    song[tv[0]] = v

                retval.append(song)

    return(retval)


def id_idx(me, sid) :
    """
        Look up a song by its 'id' field.

        me is the array of song dictionaries, sid the ID to look for.
        Returns the array index of the first song whose 'id' equals sid.
        Returns None if sid is empty (false) or if no song has that ID.
    """
    if  not sid :
        return(None)

    for idx, song in enumerate(me) :
        if  song['id'] == sid :
            return(idx)

    return(None)


def savage_beast_id_idx(me, sid) :
    """
        Look up a song by its 'savage_beast_id' field.

        me is the array of song dictionaries, sid the Savage Beast ID to look for.
        Returns the array index of the first song whose 'savage_beast_id' equals sid.
        Returns None if sid is empty (false) or if no song has that ID.
    """
    if  not sid :
        return(None)

    matches = ( idx for idx, song in enumerate(me) if song['savage_beast_id'] == sid )

    return(next(matches, None))


def print_file_header(f) :
    """
        Write the opening of a music list to f: the <music_list> tag, then a blank line.
    """
    for line in ( "<music_list>", '' ) :
        print(line, file = f)

def print_file_trailer(f) :
    """
        Write the end of a music list to f: a blank line, then the </music_list> tag.
    """
    for line in ( '', "</music_list>" ) :
        print(line, file = f)


def print_song(song, f = None) :
    """
        Write one song as an <id_info> record.

        song    Dictionary of field name -> value. Fields are written in sorted key order,
                one per line, each value passed through escape_str(). Fields whose value is false (e.g. "") are left out.
        f       File object to write to. None (the default) makes print() use standard output.

        The record is followed by one blank line.
    """
    print("<id_info>",      file = f)

    for key in sorted(song.keys()) :
        val = song[key]
        if  not val :
            continue                                    # don't write out empty fields

        open_tag    = "  <" + key + ">"
        print(open_tag + escape_str(val) + "</" + key + ">", file = f)

    print("  </id_info>",   file = f)
    print('',               file = f)


def print_file(f, songs) :
    """
        Write songs out as <id_info> records.

        f       Either an object with a write() method, or the name of a file to create.
                  - Given a writable object, only the song records are written and the object is left open.
                  - Given a file name, the file is opened for writing, the records are wrapped in
                    <music_list> ... </music_list>, and the file is closed again - even if writing fails part way.
        songs   Iterable of song dictionaries. Each must have a non-empty 'id' (asserted).
    """
    opened      = not hasattr(f, 'write')
    if  opened  :
        f       = open(f, "w")

    try :
        if  opened  :
            print_file_header(f)

        for s in songs :
            assert len(str(s['id']))
            print_song(s, f)

        # Only reached when every song was written: a failed run leaves the file without its trailer.
        if  opened  :
            print_file_trailer(f)
    finally :
        # Close only what was opened here; never the caller's file object.
        if  opened  :
            f.close()

    pass


#
#
#
if __name__ == '__main__':

    #
    #   Command line use:
    #       no arguments                    complain
    #       base other output [key]         lay the fields of 'other' over the empty/missing fields of same-ID songs in 'base' and write 'output'
    #                                         (with a key given, only that one field is merged)
    #       anything else                   for each file named: print its first song, then write its songs, shuffled, to "x.y"
    #
    argc    = len(sys.argv)

    if  argc <= 1 :

        print("Tell me a music meta data file to parse!")

    elif argc in ( 4, 5 ) :                                             # combine two files into a third

        base_fn, other_fn, out_fn   = sys.argv[1:4]

        if  os.path.exists(out_fn) :
            print("No. Please whack", out_fn, "so that I can create a new one by combining", base_fn, "and", other_fn)
        else :
            md1 = parse_file(base_fn)
            md2 = parse_file(other_fn)

            for s2 in md2 :
                sid     = s2['id']
                idx     = id_idx(md1, sid)
                if  idx is None :
                    print("No ID", sid, "in", base_fn)
                    continue

                s1      = md1[idx]

                # Either just the one key named on the command line, or every key the second file's song has.
                if  argc == 5 :
                    merge_keys  = [ sys.argv[4] ]
                else :
                    merge_keys  = list(s2.keys())

                for k in merge_keys :
                    # Only fill in what the first file lacks - never overwrite a value it already has.
                    if  (k in s2) and ((k not in s1) or (not len(s1[k]))) :
                        s1[k]   = s2[k]

            print_file(out_fn, md1)

    else :
        import  random

        for fn in sys.argv[1:] :
            md  = parse_file(fn)

            if  len(md) :
                print_song(md[0])

            random.shuffle(md)

            print_file("x.y", md)

#
#
#
# eof