#!/usr/bin/python

# TZBayesClassifier.py
#       --copyright--                   Copyright 2007 (C) Tranzoa, Co. All rights reserved.    Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality.
#       --url--                         http://www.tranzoa.net/tzpython/
#       --email--                       pycode is the name to send to. tranzoa.com is the place to send to.
#       --bodstamps--
#       May 21, 2006            bar
#       May 22, 2006            bar     change NUMBER_OF_DISTINCT_TOKENS from 10 to 23 and back
#       May 24, 2006            bar     a number of fixes
#       May 25, 2006            bar     UNIQUE_TOKEN_PROBABILITY
#                                       2nd sort key is high to low, token counts
#       June 10, 2006           bar     play around with settings
#       June 11, 2006           bar     comments
#       June 12, 2006           bar     experiment with multi-stage classify logic
#                                       correct() and wrong()
#       June 16, 2006           bar     MAX_ZERO_GOOD_TOKEN_CNT
#       November 18, 2007       bar     turn on doxygen
#       November 27, 2007       bar     insert boilerplate copyright
#       May 17, 2008            bar     email adr
#       --eodstamps--
##      \file
#
#
#       Simple Bayes logic.
#
#


import  copy
import  gc

import  tzlib
import  tgcmsg


class   a_category :

    def __init__(me, name) :

        me.name             = name

        me.token_cnts       = {}
        me.token_cnt        = 0

        me.thing_cnt        = 0



    def learn(me, tokens) :

        tcnts   = me.token_cnts
        for t in tokens :
            if  len(t) >= TZBayesClassifier.MININUM_TOKEN_LENGTH :
                tcnts[t]        = tcnts.get(t, 0) + 1
            pass

        me.token_cnt       += len(tokens)

        me.thing_cnt       += 1



    pass



class   a_token_probability :

    def __init__(me) :

        me.extreme_probability  = 0.0
        me.extreme_cname        = ""
        me.category_vals        = {}


    pass




class   TZBayesClassifier :

    NUMBER_OF_DISTINCT_TOKENS   = 23                # how many tokens we actually care about (this many of the most extreme tokens are considered, only)
    MINIMUM_TOKEN_COUNT         =  3                # tokens must be seen at least this many times for them to be noticed (except if there are a rare number of docs)

    MININUM_TOKEN_LENGTH        =  3                # tokens shorter than this are ignored

    UNKNOWN_WORD_PROBABILITY    = 0.001             # limit on probability

    UNIQUE_TOKEN_PROBABILITY    = 0.999             # probability of a symbol that's only in documents in one category

    MININUM_TOKENS_TO_CONSIDER  = 1                 # don't ignore tokens ('cause they don't seem important) if there are fewer than this number of 'em

    MAX_ZERO_GOOD_TOKEN_CNT     = 2                 # any good_token_cnts value equal or under this value is zero



    def __init__(me) :

        me.categories           = {}
        me.probs                = None
        me.token_cnts           = {}

        me._use_good_token_cnts = False
        me.good_token_cnts      = {}

        pass



    def _modify_tokens(me, tokens) :

        #
        #
        #   Here is where we tweak the tokens, if we want to, though the outside caller should really be doing this sort of thing.
        #   Remember to replace the array, though, as it's the caller's memory, not ours.
        #
        #

        return(tokens)



    def learn(me, cname, tokens) :

        me.probs                    = None                          # signal that pre-processing must be done before any classifying

        tokens                      = me._modify_tokens(tokens)

        tcnts                       = me.token_cnts
        for t in tokens :
            if  len(t) >= TZBayesClassifier.MININUM_TOKEN_LENGTH :
                tcnts[t]                = tcnts.get(t, 0) + 1
            pass

        if  not me.categories.has_key(cname) :
            me.categories[cname]    = a_category(cname)

        me.categories[cname].learn(tokens)




    def forget_good_token_cnts(me) :
        me.good_token_cnts  = {}


    def use_good_token_cnts(me, how = None) :
        ov  = me._use_good_token_cnts
        if  (how == True) or (how == False) :
            me._use_good_token_cnts = how

        return(ov)


    def correct(me, cname, tokens) :

        for t in tokens :
            me.good_token_cnts[t]   = me.good_token_cnts.get(t, 0) + 1

        pass


    def wrong(me, cname, tokens) :

        for t in tokens :
            me.good_token_cnts[t]   = me.good_token_cnts.get(t, 0) - 1

        pass


    def good_token_counts(me) :
        """
            Return a copy of the dictionary of "good tokens".
            Keys are the tokens.
            Values are the number of times the token has been used to correct()'ly identify a token array.
        """

        return(copy.copy(me.good_token_cnts))



    def write_good_tokens_to_file(me, file_name) :
        gta = [ [ c, t ] for t, c in me.good_token_counts().iteritems() ]
        gta.sort()
        gta.reverse()
        tgcmsg.amp_messages_to_file(file_name, gta)




    def _make_probs_and_token_cnts(me) :

        if  me.probs == None :

            min_cnt         = TZBayesClassifier.MINIMUM_TOKEN_COUNT

            me.probs        = {}                                        # key is token, value is a_token_probability

            for cname in me.categories.iterkeys() :
                cat         = me.categories[cname]
                min_cnt     = min(min_cnt, max(cat.token_cnts))
                me.probs.update(cat.token_cnts)                         # set the keys for all known tokens, values will be thrown away

            if  min_cnt > 1 :
                for t in me.probs.keys() :
                    if  me.token_cnts[t] < min_cnt :
                        del(me.probs[t])
                    pass
                pass

            for t in me.probs.iterkeys() :
                me.probs[t]     = None                                  # will be replaced by a_token_probability

            gc.collect()


            #
            #
            #   Note: looks like it goes pretty quickly to taking a gig when the len(me.probs) is 147k and there are 20+ categories
            #
            #

            for t in me.probs.iterkeys() :
                tprob           = a_token_probability()                 # make the value

                ccnt            = 0
                for cname, cat in me.categories.iteritems() :

                    cnt         = cat.token_cnts.get(t, 0)

                    if  cnt    != 0 :
                        ccnt   += 1

                        tc          = cat.thing_cnt or 1000000

                        tprob.category_vals[cname] = cnt / float(tc)    # normalize the count by dividing by the category's thing count

                    pass

                tsum            = sum(tprob.category_vals.values())

                middle          = 1.0 / float(ccnt)

                for cname in tprob.category_vals.keys() :
                    tprob.category_vals[cname]     /= tsum              # figure the probability (counts) relative to the others, 0..1 (that is, what fraction of the total counts this category has)

                    if  tprob.category_vals[cname] != 0.0 :             # for purposes of finding important tokens, ignore those categories that don't have a count at all

                        if  middle == 1.0 :
                            p                       = TZBayesClassifier.UNIQUE_TOKEN_PROBABILITY        # multiplying by tsum (to factor in that this will be here no matter what percentage of docs in the 1 category have the token) hurts
                        else :
                            p                       = abs(middle - tprob.category_vals[cname])
                        if  tprob.extreme_probability <= p :
                            tprob.extreme_probability  = p
                            tprob.extreme_cname        = cname

                        # print "%-30s %-36s %9.7f %9.7f %4.2f %9.7f" % ( t, cname, tprob.category_vals[cname], tsum, middle, p )
                    pass

                me.probs[t] = tprob

            pass

        pass



    def _tokens_sorted_by_max_probability(me, tokens, only_cnames) :

        toke_max_probs  = {}

        if  (only_cnames != None) and (len(only_cnames) > 0) :
            for t in tokens :
                if  me.probs.has_key(t) and only_cnames.has_key(me.probs[t].extreme_cname) :
                    toke_max_probs[t]   = me.probs[t].extreme_probability
                pass
            pass

        if  len(toke_max_probs) < TZBayesClassifier.MININUM_TOKENS_TO_CONSIDER :
            # print "Giving up"
            for t in tokens :
                if  me.probs.has_key(t) :
                    toke_max_probs[t]   = me.probs[t].extreme_probability
                pass
            pass



        def _cmp_tok(a, b) :
            if  me._use_good_token_cnts :
                d   = cmp(min(TZBayesClassifier.MAX_ZERO_GOOD_TOKEN_CNT, me.good_token_cnts.get(b, 0)), min(TZBayesClassifier.MAX_ZERO_GOOD_TOKEN_CNT, me.good_token_cnts.get(a, 0)))
                if  d != 0 :    return(d)

            d   = cmp(toke_max_probs[b], toke_max_probs[a])
            if  d != 0 :    return(d)

            # d   = cmp(me.token_cnts[b], me.token_cnts[a])
            # if  d != 0 :    return(d)

            d   = cmp(len(b), len(a))
            if  d != 0 :    return(d)

            d   = cmp(a.lower(), b.lower())
            if  d != 0 :    return(d)

            return(cmp(a, b))

        tokens = toke_max_probs.keys()
        tokens.sort(_cmp_tok)

        return(tokens)



    def classify_all(me, tokens, only_cnames = None, distinct_tokens = NUMBER_OF_DISTINCT_TOKENS) :

        if  only_cnames == None :
            only_cnames     = {}

        me._make_probs_and_token_cnts()

        tokens              = me._modify_tokens(tokens)

        tokens              = me._tokens_sorted_by_max_probability(tokens, only_cnames)[0 : distinct_tokens]

        cat_probs           = []
        for cname in me.categories.keys() :

            likely          = 1.0
            unlikely        = 1.0

            for t in tokens :

                p           = me.probs[t].category_vals.get(cname, TZBayesClassifier.UNKNOWN_WORD_PROBABILITY)
                p           = max(p,        TZBayesClassifier.UNKNOWN_WORD_PROBABILITY)
                p           = min(p, 1.0 -  TZBayesClassifier.UNKNOWN_WORD_PROBABILITY)

                likely     *=   p
                unlikely   *= (1.0 - p)

            if  len(tokens) != 0 :
                likely      = pow(likely,   1.0 / len(tokens))
                unlikely    = pow(unlikely, 1.0 / len(tokens))

            np              = 0.0
            if  likely + unlikely != 0.0 :
                np          = (1 + ((likely - unlikely) / (likely + unlikely))) / 2.0

            cat_probs.append( [ np, cname ] )

        cat_probs.sort()
        cat_probs.reverse()

        return( ( cat_probs, tokens ) )



    def classify(me, tokens) :

        only_cnames = {}

        cat_cnt     = len(me.categories)

        first_name  = None

        distinct_tokens = TZBayesClassifier.NUMBER_OF_DISTINCT_TOKENS

        while True :
            ( cat_probs, deciding_tokens )          = me.classify_all(tokens, only_cnames, distinct_tokens)

            if  len(cat_probs)  <  3 :
                break

            if  cat_probs[0][0] >= 0.100 :
                break

            #
            #
            #   Both of these things do not seem to really help.
            #
            #

            break

            if  True :

                if  distinct_tokens >= len(tokens) :
                    if  cat_probs[0][0] == 0.0 :
                        first_name      =  None
                        ( cat_probs, deciding_tokens )  = me.classify_all(tokens, {}, TZBayesClassifier.NUMBER_OF_DISTINCT_TOKENS)
                    else :
                        print "Plenty of distinct_tokens", distinct_tokens, len(tokens)
                    break

                if  distinct_tokens == TZBayesClassifier.NUMBER_OF_DISTINCT_TOKENS :
                    first_name      =  cat_probs[0][1]

                distinct_tokens     = min(distinct_tokens * 3 / 2, len(tokens))

            else :

                #
                #
                #   Find the point in the list of categories where the slope of the curve of p's changes,
                #   then go around again,
                #   only this time checking only tokens that are high probs for the categories that seem most likely.
                #
                #

                p1  = cat_probs[0][0] - cat_probs[1][0]
                p2  = cat_probs[1][0] - cat_probs[2][0]

                if  p1 > p2 :                           # if the curve of probabilities is not an S curve, then kick out
                    break                               #   (or more exactly, if the best catagory is better than the 2nd best by more than the 2nd best is better than the 3rd, kick out)

                pi      = 3
                while (pi < cat_cnt) and (pi < len(cat_probs)) :
                    p1  = p2                            # find the point in the curve of probabilities where diminishing returns come in to play (the 2nd derivative goes negative - slope angle starts levelling out)
                    p2  = cat_probs[pi - 1][0] - cat_probs[pi][0]

                    if  p1 > p2 :
                        break

                    pi += 1

                if  pi >= cat_cnt :
                    break                               # stop if there is an ever-steepening plunge in probabilities to the end of the possible categories

                only_cnames         = tzlib.make_dictionary(map(lambda cp : cp[1], cat_probs[0:pi]))        # now go around, considering only those categories that seem real likely

                cat_cnt             = len(only_cnames) - 1

                if  first_name == None :
                    first_name      = cat_probs[0][1]

                pass

            print "Trying", len(only_cnames), only_cnames, distinct_tokens

            pass


        cname                       = ""
        p                           = 0.0
        if  len(cat_probs) > 0 :
            cname                   = cat_probs[0][1]
            p                       = cat_probs[0][0]


        if  (first_name != None) and (first_name != cname) :
            print "Picked", cname, "over", first_name, p, distinct_tokens, len(tokens)
            pass


        return( ( cname, p, deciding_tokens ) )



    pass


if  __name__ == '__main__' :

    c       = TZBayesClassifier()

    if  True :
        tokes   = [ 'x', 'y', 'z' ]     # note: that these other tokens are ham makes it more likely that neutral tokens are spam
        c.learn("ham", tokes)
        c.learn("ham", tokes)
        c.learn("ham", tokes)
        c.learn("ham", tokes)

    tokes   = [ 'ABC' ]
    c.learn("ham",  tokes)
    c.learn("SPAM", tokes)

    tokes   = [ 'tok', 'another', 'etc', 'ham'  ]
    c.learn("ham", tokes)

    is_spam = "SPAM"
    tokes   = [ 'tok', 'another', 'etc', 'spam' ]
    c.learn(is_spam, tokes)



    tokes       = [ 'tok', 'etc', 'ham'  ]
    ( t, p )    = c.classify(tokes)
    print "Ham", t, p


    tokes       = [ 'tok', 'etc', 'spam'  ]
    ( t, p )    = c.classify(tokes)
    print   "Spam", t, p

    tokes       = [ 'tok', 'etc', 'spam', 'spam' ]
    ( t, p )    = c.classify(tokes)
    print   "Double Spam", t, p

    tokes       = [ 'ABC' ]
    ( t, p )    = c.classify(tokes)
    print   "Neutral", t, p




    pass



__ALL__ = [
            "TZBayesClassifier",
          ]


#
#
#
# eof

