#!/usr/bin/python

# word_freqs.py
#       --copyright--                   Copyright 2007 (C) Tranzoa, Co. All rights reserved.    Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality.
#       --url--                         http://www.tranzoa.net/tzpython/
#       --email--                       pycode is the name to send to. tranzoa.com is the place to send to.
#       --bodstamps--
#       June 23, 2006           bar
#       November 18, 2007       bar     turn on doxygen
#       November 27, 2007       bar     insert boilerplate copyright
#       December 1, 2007        bar     use tzlib's elapsed_time for timing
#       May 17, 2008            bar     email adr
#       May 25, 2010            bar     tz_os_priority
#       November 29, 2011       bar     pyflake cleanup
#       May 27, 2012            bar     doxygen namespace
#       March 5, 2023           bar     future print
#       April 13, 2023          bar     listize keys values items
#       --eodstamps--
##      \file
#       \namespace              tzpython.word_freqs
#
#
#       Given some text files, compute the word frequencies.
#
#
#


from    __future__  import  print_function

import  glob
import  os
import  re
import  sys

import  TZCommandLineAtFile
import  tzlib


class a_thang :
    """
    Parsed command line for the word frequency script.

    Constructing an instance consumes sys.argv (the program name and every
    recognized option are removed from it) and sets these attributes:

        do_subdirs      True if --subdirs / -s was given.
        show_top_n      Integer given after --top / -t (default 50).
        fnames          Dictionary updated with tzlib.make_dictionary() of the
                        absolute path of every file matching the remaining
                        (possibly wild-carded) command line arguments.
        base_amb_name   Only set when do_subdirs is True: the file name part
                        of the argument most recently processed.
    """

    def __init__(me) :
        """
        Parse sys.argv and remember the files to process in me.fnames.

        Prints usage and exits with status 254 if a help option is given.
        Raises whatever int() / list.pop() raise if --top / -t is not
        followed by an integer.
        """

        def do_em(me, s) :
            """ Remember every file matching the (possibly wild-carded) name, s. """
            for fname in glob.glob(s) :
                me.fnames.update(tzlib.make_dictionary(os.path.abspath(os.path.normpath(fname))))
            pass


        def do_dir(me, s, names) :
            """ Remember the files in directory, s, that match me.base_amb_name. 'names' is not used. """
            do_em(me, os.path.abspath(os.path.join(s, me.base_amb_name)))


        program_name        = sys.argv.pop(0)
        TZCommandLineAtFile.expand_at_sign_command_line_files(sys.argv)


        if  tzlib.array_find(sys.argv, [ "--help", "-h", "-H", "-?", "/h", "/H", "/?" ] ) >= 0 :
            #   The % must be applied to the string, not to print()'s return value (None).
            print("""
python %s (options) ambiguous_file(s)...

Compute word frequencies for given text files.

""" % os.path.basename(program_name))
            sys.exit(254)


        me.do_subdirs       =   False
        me.show_top_n       =   50

        me.fnames           =   {}


        #   Strip every occurrence of the sub-directory option.
        while True :
            oi  = tzlib.array_find(sys.argv, [ "--subdirs", "-s" ] )
            if  oi < 0 :    break
            del sys.argv[oi]
            me.do_subdirs   = True


        #   Strip every occurrence of the top-N option along with its value. The last one given wins.
        while True :
            oi  = tzlib.array_find(sys.argv, [ "--top", "-t" ] )
            if  oi < 0 :    break
            del sys.argv[oi]
            me.show_top_n   = int(sys.argv.pop(oi))


        #
        #   Remember all the file names to do in me.fnames
        #
        for s in sys.argv :

            if  s[0:1] == "-" :
                #   Unknown options are only warned about - they are still treated as file names.
                print(  "Did you mean this to be a file or directory:", s)

            if me.do_subdirs :
                ( s, me.base_amb_name ) = os.path.split(os.path.normpath(s))
                if  s == "" :   s = "."
                if  not os.path.isdir(s) :
                    do_dir(me, s, [])
                else :
                    #   os.path.walk() exists only in Python 2. os.walk() visits the same
                    #   directories (the top one included) under both Python 2 and 3.
                    for dir_name, sub_dirs, file_names in os.walk(s) :
                        do_dir(me, dir_name, sub_dirs + file_names)
                    pass
                pass

            else :
                do_em(me, s)

            pass


        pass


    pass


#   Any one of these characters separates words.
split_re    = re.compile(r"[\s.,;:!\?\/\[\]\(\)\{\}\+\|\\\=\&]")
#   Any character that is not a letter, dash, apostrophe or underscore is stripped from a word.
no_syms_re  = re.compile(r"[^a-z\-\'_]", re.DOTALL + re.IGNORECASE)

def read_words_from_file(fname) :
    """
    Return the list of lower-cased words in the file named fname, in file order.

    The file's text is split at white space and the punctuation in split_re.
    Characters matching no_syms_re are then removed from each piece and
    pieces left empty are dropped.

    The file is read as bytes and decoded as UTF-8, with undecodable bytes
    replaced, so arbitrary files never raise a decoding error.

    Raises whatever open() / read() raise if the file cannot be read.
    """

    with open(fname, "rb") as fi :
        fd  = fi.read()

    #   The regular expressions are text patterns, so they need text, not bytes (Python 3 raises TypeError otherwise).
    fd      = fd.decode("utf-8", "replace").lower()

    words   = ( no_syms_re.sub("", w) for w in split_re.split(fd) )

    return([ w for w in words if len(w) > 0 ])


class a_words_metrics :
    """
    Word frequency metrics for a list of words.

    Attributes set by the constructor:

        word_cnt            Number of words given.
        unique_word_cnt     Number of distinct words given.
        unique_words_ratio  unique_word_cnt / word_cnt  (1.0 if no words were given).
        power_law_ratio     Average, over the examined top words, of the next most
                            frequent word's count divided by the word's own count
                            (1.0 if no such pair was examined).
    """


    TOP_WORD_LIMIT          = 15        # default limit on how many of the most frequent words are examined
    TOP_WORD_PERCENTAGE     = 20        # default limit, as a percentage of the unique words, on how many are examined


    def __init__(me, words, top_word_cnt = TOP_WORD_LIMIT, top_word_percentage = TOP_WORD_PERCENTAGE, print_words = False) :
        """
        Compute the metrics for the given words.

        words                   Sequence of words (hashable values).
        top_word_cnt            Examine no more than this many of the most frequent words.
        top_word_percentage     Examine no more than this percentage of the unique words.
        print_words             If True, print each examined word and its count, most frequent first.
        """

        word_cnts           = {}

        for w in words :
            word_cnts[w]    = word_cnts.get(w, 0) + 1

        wrds                = list(word_cnts.keys())

        #   Most frequent first. cmp() and comparison-function sorts are Python 2 only, so sort by key.
        #   The sort is stable, reverse included, so words with equal counts keep their relative order.
        wrds.sort(key = lambda w : word_cnts[w], reverse = True)

        twc     = min(top_word_cnt, int((len(wrds) * top_word_percentage) / 100.0), len(wrds))

        tratio  = 0.0
        trc     = 0
        for tc in range(twc) :
            wc  = word_cnts[wrds[tc]]

            if  print_words :
                print("%-24s %u" % ( wrds[tc], wc ))

            #   The least frequent word of all has no following word to compare against.
            if  tc + 1 < len(wrds) :
                trc    += 1
                tratio += (float(word_cnts[wrds[tc + 1]]) / float(wc))
            pass


        #
        #   Set our attribute values
        #

        me.word_cnt                 = len(words)
        me.unique_word_cnt          = len(wrds)

        me.unique_words_ratio       = 1.0
        if  me.word_cnt > 0 :
            me.unique_words_ratio   = float(me.unique_word_cnt) / float(me.word_cnt)

        me.power_law_ratio          = 1.0
        if  trc > 0 :
            me.power_law_ratio      = (tratio / float(trc))                         # average ratio of consecutive counts

        pass


    pass


if  __name__ == '__main__' :

    #
    #   Script entry point: print the word frequency metrics of each file
    #   named on the command line, followed by totals and per-file averages.
    #

    import  tz_os_priority


    #   Parses (and consumes) sys.argv. The files to read end up as the keys of me.fnames.
    me = a_thang()

    tz_os_priority.set_proc_to_idle_priority()

    start_time  = tzlib.elapsed_time()

    total_files_read        = 0
    total_word_cnt          = 0
    total_uword_cnt         = 0
    total_ratio             = 0.0
    total_uratio            = 0.0

    #   Iterating a dictionary yields its keys under both Python 2 and 3 (iterkeys() is Python 2 only).
    for fname in me.fnames :

        words               = read_words_from_file(fname)

        #   Files without any words are not reported and are not counted in the totals.
        if  len(words) > 0 :

            print('')
            print(fname)

            total_files_read   += 1
            total_word_cnt     += len(words)

            metrics             = a_words_metrics(words, me.show_top_n, 100, print_words = True)

            total_uword_cnt    += metrics.unique_word_cnt

            print(  "Words:             %u"    % ( metrics.word_cnt           ))
            print(  "Unique words:      %u"    % ( metrics.unique_word_cnt    ))
            print(  "Unique word ratio: %5.2f" % ( metrics.unique_words_ratio ))
            print(  "Power law factor:  %5.2f" % ( metrics.power_law_ratio    ))

            total_uratio       += metrics.unique_words_ratio
            total_ratio        += metrics.power_law_ratio

        pass


    print('')
    print(  "Time taken", tzlib.elapsed_time() - start_time)

    #   Don't divide by zero if no file had any words. All the totals are zero then, so the averages print as zero.
    file_cnt    = float(max(total_files_read, 1))

    print(  "Files:                       %9u"   % (      total_files_read ))
    print(  "Words:                       %9u"   % (      total_word_cnt   ))
    print(  "Words per file:              %9u"   % ( int((total_word_cnt   / file_cnt) + 0.5) ))
    print(  "Unique words per file:       %9u"   % ( int((total_uword_cnt  / file_cnt) + 0.5) ))
    print(  "Average unique words ration: %9.2f" % (      total_uratio     / file_cnt ))
    print(  "Average power law factor:    %9.2f" % (      total_ratio      / file_cnt ))

    pass


#
#   Presumably meant to list this module's public names.
#
#   NOTE(review): Python only honors the lower case spelling, __all__, for
#   "from module import *". Spelled in upper case, this is an ordinary module
#   variable with no effect on star imports - confirm whether __all__ was intended.
#
__ALL__ = [
            'read_words_from_file',

            'a_words_metrics',
          ]


#
#
#
# eof