#!/usr/bin/python

# string_array_dictionary.py
#       --copyright--                   Copyright 2007 (C) Tranzoa, Co. All rights reserved.    Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality.
#       --url--                         http://www.tranzoa.net/tzpython/
#       --email--                       pycode is the name to send to. tranzoa.com is the place to send to.
#       --bodstamps--
#       June 23, 2006           bar
#       November 18, 2007       bar     turn on doxygen
#       November 27, 2007       bar     insert boilerplate copyright
#       May 17, 2008            bar     email adr
#       August 29, 2008         bar     basestring instead of StringType because of unicode strings and others
#       November 29, 2011       bar     pyflake cleanup
#       --eodstamps--
##      \file
#
#
#       Implement a dictionary keyed by arrays of strings.
#
#       It has an optimization that allows the 'longest_match_length' function.
#
#


import  gc
import  re


test_str        = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"    \
                  "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"    \
                  " !\"#$%&'()*+,-./0123456789:;<=>?"                                   \
                  "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_"                                    \
                  "`abcdefghijklmnopqrstuvwxyz{|}~"                                    \
                  "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"    \
                  "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"    \
                  "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"    \
                  "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"    \
                  "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"    \
                  "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"    \
                  "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"    \
                  "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff"    \
                  ""                                                                    \
                  "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"    \
                  "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"    \
                  " !\"#$%&'()*+,-./0123456789:;<=>?"                                   \
                  "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_"                                    \
                  "`abcdefghijklmnopqrstuvwxyz{|}~"                                    \
                  "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"    \
                  "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"    \
                  "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"    \
                  "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"    \
                  "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"    \
                  "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"    \
                  "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"    \
                  "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff"    \
                  ""




class a_string_array_dictionary :
    """
        This structure maintains a dictionary keyed by string arrays.

        The values are given by the caller in the 'set' routint.
    """


    def _split(me, s) :
        return( [ s for s in me.split_regx.split(s) if len(s) > 0 ] )


    def _join(me, sa, idx = 0, ln = None) :

        if  ln   == None :
            ln    = len(sa) - idx

        if  (idx == 0) and (ln == len(sa)) :
            return(me.join_str.join(sa))

        return(me.join_str.join(sa[idx:idx + ln]))




    def refresh(me) :
        """
            Should be called after the dictionary is pruned so that lookup speed does not degrade.
        """

        me.lns  = {}
        for s in me.dct.iterkeys() :
            sa              = me._split(s)
            me.lns[sa[0]]   = max(me.lns.get(sa[0], 0), len(sa))

        gc.collect()



    def __init__(me, split_regx = None, join_str = None) :

        if  split_regx == None :
            split_regx  = re.compile(r"\s", re.DOTALL)
        if  isinstance(split_regx, basestring) :
            split_regx  = re.compile(split_regx)
        me.split_regx   = split_regx

        if  join_str   == None :
            join_str    = " "
        me.join_str     = join_str

        ta              = me._split(test_str)
        ts              = me._join(ta, 0, len(ta))
        if  ts != me._join(me._split(ts)) :
            raise "Imcompatible split and join values"

        me.dct          = {}

        me.refresh()


    def __len__(me) :
        return(len(me.dct))



    def longest_match_length(me, sa, idx = 0, ln = None) :
        """
            Return the longest known string array in sa[idx:].
        """

        if  ln == None :
            ln  = len(sa) - idx
        ln      = min(ln, len(sa) - idx)

        if  ln <= 0 :
            return(0)

        ln      = max(me.lns.get(sa[idx], 0), ln)

        while ln > 0 :

            s   = me._join(sa[idx : idx + ln])

            if  me.dct.has_key(s) :
                break

            ln -= 1

        return(ln)



    def get_value(me, sa, dflt = None, idx = 0, ln = None) :
        if  ln == None :
            ln  = len(sa) - idx
        ln      = min(ln, len(sa) - idx)

        if  ln <= 0 :
            return(dflt)

        s   = me._join(sa, idx, ln)

        return(me.dct.get(s, dflt))



    def set(me, sa, value = None, idx = 0, ln = None) :
        if  ln == None :
            ln  = len(sa) - idx
        ln      = min(ln, len(sa) - idx)

        if  ln <= 0 :
            raise "Setting zero length sa"

        s               = me._join(sa, idx, ln)

        me.dct[s]       = value

        me.lns[sa[idx]] = max(me.lns.get(sa[idx], 0), ln)



    pass




if  __name__ == '__main__' :

    me  = a_string_array_dictionary()

    ta  = [ "a" ]
    me.set(ta, 0)

    ta.append("b")
    me.set(ta, 1)

    me.refresh()

    print "get_value", me.get_value(ta)

    taa = ta
    taa.append("c")

    print "longest len abc", me.longest_match_length(taa)

    me.set(taa, 5)

    print "get_value abc", me.get_value(ta)
    print "longest len abc", me.longest_match_length(taa)

    me.refresh()

    print "get_value abc", me.get_value(ta)
    print "longest len abc", me.longest_match_length(taa)


    print "len", len(me)

    pass



__ALL__ = [
            'a_string_array_dictionary',
          ]



#
#
#
# eof

