#!/usr/bin/python

# numeric_tokens.py
#       --copyright--                   Copyright 2007 (C) Tranzoa, Co. All rights reserved.    Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality.
#       --url--                         http://www.tranzoa.net/tzpython/
#       --email--                       pycode is the name to send to. tranzoa.com is the place to send to.
#       --bodstamps--
#       June 18, 2003           bar
#       June 19, 2003           bar     use_indices
#       November 18, 2007       bar     turn on doxygen
#       November 27, 2007       bar     insert boilerplate copyright
#       May 17, 2008            bar     email adr
#       November 29, 2011       bar     pyflake cleanup
#       May 27, 2012            bar     doxygen namespace
#       --eodstamps--
##      \file
#       \namespace              tzpython.numeric_tokens
#
#
#       Learn a set of some-numeric tokens.
#
#       Then be able to translate/simplify the tokens into roughly equal-sized groups of 'em.
#
#
#       Example:
#
#           Say you have the tokens a1.3 a1.7 a2.5 a3.9
#             And want them to be divided into two groups.
#
#             This logic will do so, putting a1.3 and a1.7 into one group and the other two in the other group.
#             It will do so by translating the first two into a0 and the other two into a1 (or into something appropriate).
#
#       We allow multiple sets of tokens. The names of the sets are the characters to the left of the trailing numbers.
#       An empty "name" is ok. That is, all-numeric tokens belong to their own "set" of tokens.
#       The names are case-sensitive.
#
#


import  bisect
import  re
from    types                   import  ListType, TupleType


class a_numeric_token_translator :
    #
    #   Learn sets of tokens that end in numbers ("a1.3", "42", ...), then
    #   translate such tokens into roughly-equal-sized groups: each output
    #   token is the set's "name" plus the index of the group its number
    #   lands in.
    #

    def __init__(me, group_count = 16, use_indices = 0) :
        """Set up an empty translator.

        group_count     how many groups each token set is divided into
        use_indices     if true, the Nth occurrence of a name within one
                        learn()/translate() call is treated as its own set
        """
        me.group_count = group_count
        me.use_indices = use_indices

        me.token_sets  = {}             # set-name -> list of learned numeric values

        me.learned     = None           # set-name -> sorted group-boundary values (built lazily by translate())

        # group(1) is the token's "name" (possibly empty), group(2) its trailing number
        me.regx        = re.compile(r"\s*(.*?)(\d*\.?\d+)\s*$")



    def learn(me, tokens, one_of_each = 0) :
        """Record the numeric values of the given token (or list/tuple of tokens).

        Tokens without a trailing number are silently ignored.
        If 'one_of_each' is true, only the first token of each name is recorded.
        """

        if  not isinstance(tokens, (list, tuple)) :
            tokens = [ tokens ]

        seen_name = {}

        for t in tokens :
            t  = str(t)

            rg = me.regx.search(t)
            if  rg :
                name            = rg.group(1)

                count           = seen_name.get(name, 0) + 1
                seen_name[name] = count

                if  me.use_indices :
                    name = name + '~' + str(count) + '~'        # make each occurrence its own set

                # note: 'count' is taken before any use_indices rename, so
                # one_of_each works even with use_indices on (the old code
                # KeyError'd when both flags were set)
                if  not one_of_each or (count <= 1) :

                    num = float(rg.group(2))

                    me.token_sets.setdefault(name, []).append(num)      # record the numeric token

                    me.learned = None   # remember that translate() must rebuild the groups



    def translate(me, tokens) :
        """Translate token(s) into grouped tokens. Accepts a single token or
        a list/tuple and returns the same shape (None passes through).

        Tokens without a trailing number, or whose set was never learned,
        pass through unchanged. When a set holds more members than
        group_count, a token may expand into two adjacent group tokens
        (fuzzy matching), so an output list can be longer than its input.
        """

        if  tokens is None :
            return None

        if  me.learned is None :
            # lazily (re)build the group boundary values for each token set
            me.learned = {}
            for name, ts in me.token_sets.items() :
                ts.sort()

                # pick group_count-1 evenly spaced boundary values from the sorted values
                # ('//' keeps the index an int under both old and new division rules)
                # this needs to fix dupes and runs!!!!
                me.learned[name] = [ ts[(len(ts) * i) // me.group_count] for i in range(1, me.group_count) ]

        is_array = 1
        if  not isinstance(tokens, (list, tuple)) :
            tokens   = [ tokens ]
            is_array = 0

        seen_name = {}

        otokes    = []
        for t in tokens :
            t  = str(t)

            rg = me.regx.search(t)
            if  not rg :
                otokes.append(t)                # not a numeric token - pass it through
                continue

            name            = rg.group(1)

            count           = seen_name.get(name, 0) + 1
            seen_name[name] = count

            if  me.use_indices :
                name = name + '~' + str(count) + '~'

            num = float(rg.group(2))            # the numeric value from the token

            if  name not in me.learned :
                otokes.append(t)                # unknown token set - pass it through
            else :
                n = bisect.bisect(me.learned[name], num)        # which group the number falls in

                if  len(me.token_sets[name]) <= me.group_count :
                    otokes.append(name + str(n))
                else :
                    #
                    #   Fuzzy mapping. E.g. if group_count is 5, then group
                    #   indices 0..4 lead to this mapping:
                    #
                    #       0 -> 0
                    #       1 -> 0 and 1
                    #       2 -> 1 and 2
                    #       3 -> 2 and 3
                    #       4 -> 3
                    #
                    #   So, for example, a number in the 2'th bucket will map to
                    #   tokens that match numbers in the 1'th, 2'th, and 3'th buckets.
                    #
                    if  n != 0 :
                        otokes.append(name + str(n - 1))
                    if  n < me.group_count - 1 :
                        otokes.append(name + str(n))

        if  not is_array :
            # bug fix: previously returned the original, untranslated token
            return otokes[0] if otokes else None

        return otokes


    pass


#
#
#       Give a bunch of numeric tokens on the command line and we'll translate 'em.
#
#
if __name__ == '__main__' :
    import  sys

    def _doit(s, n) :
        tt = a_numeric_token_translator(n)
        tt.learn(s)
        print n, s, tt.translate(s)


    if  (len(sys.argv) < 2) or (sys.argv[1] != "-t") :
        _doit(sys.argv, 3)
    else :
        _doit([ 1, 2, 3                   ], 3)
        _doit([ 3, 2, 1                   ], 3)
        _doit([ "a3", "b2", "c1"          ], 3)
        _doit([ "a3", "b2", "c1", "b2"    ], 3)
        _doit([ "a3", "b2", "c1", "b4"    ], 3)
        _doit([ 1, 2, 3, 4, 5, 6, 7, 8, 9 ], 3)
        _doit([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9 ], 3)
        _doit([ 1, 2, 3, 4, 5                                        ], 3)
        _doit([ "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9", "b1", "b2", "b3" ], 3)

        print "Should be:"
        print """

3 [1, 2, 3] ['0', '1', '2']
3 [3, 2, 1] ['2', '1', '0']
3 ['a3', 'b2', 'c1'] ['a2', 'b2', 'c2']
3 ['a3', 'b2', 'c1', 'b2'] ['a2', 'b2', 'c2', 'b2']
3 ['a3', 'b2', 'c1', 'b4'] ['a2', 'b1', 'c2', 'b2']
3 [1, 2, 3, 4, 5, 6, 7, 8, 9] ['0', '0', '0', '0', '1', '0', '1', '0', '1', '1', '1', '1']
3 [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9] ['0', '0', '0', '0', '1', '0', '1', '0', '1', '1', '1', '1', '0', '0', '0', '0', '1', '0', '1', '0', '1', '1', '1', '1']
3 [1, 2, 3, 4, 5] ['0', '0', '1', '0', '1', '1', '1']
3 ['a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9', 'b1', 'b2', 'b3'] ['a0', 'a0', 'a0', 'a0', 'a1', 'a0', 'a1', 'a0', 'a1', 'a1', 'a1', 'a1', 'b0', 'b1', 'b2']

        """

    pass


#
#
#
# eof
