#!/usr/bin/python

# word_gram.py
#       --copyright--                   Copyright 2010 (C) Tranzoa, Co. All rights reserved.    Warranty: You're free and on your own here. This code is not necessarily up-to-date or of public quality.
#       --url--                         http://www.tranzoa.net/tzpython/
#       --email--                       pycode is the name to send to. tranzoa.com is the place to send to.
#       --bodstamps--
#       February 26, 2011       bar
#       February 27, 2011       bar     max_grams
#       May 27, 2012            bar     doxygen namespace
#       June 2, 2013            bar     proper interpretor
#       November 7, 2017        bar     maxint->maxsize
#       --eodstamps--
##      \file
#       \namespace              tzpython.word_gram
#
#
#       Output word N-grams from given text file(s).
#
#

import  random


def learn(grams, pa, w) :
    """
    Slide the word window forward by one word and count the gram it now holds.

    grams   Dictionary of gram text (words joined by single spaces) to times seen.
            Updated in place.
    pa      List holding the current window of words. The oldest word is dropped
            and w is put on the end, in place.
    w       The next word.
    """
    pa.pop(0)                           # the oldest word falls out of the window
    pa.append(w)
    gram_key    = " ".join(pa)
    if  gram_key in grams :
        grams[gram_key]    += 1
    else :
        grams[gram_key]     = 1


def delearn(grams, m)  :
    """
    Forget the least-seen grams once the table has outgrown its limit.

    Nothing happens while grams holds m or fewer entries. Once it holds more,
    only the m // 2 most frequently seen grams are kept, which leaves room for
    the table to grow again before the next purge. Among grams with equal
    counts, which ones survive is random.

    grams   Dictionary of gram text to times seen. Modified in place.
    m       Largest number of grams the table may hold before it is cut back.
    """
    if  len(grams) <= m :
        return

    # A real list is needed: dictionary key views can be neither shuffled nor sorted in place.
    kys = list(grams)

    # Shuffle first so that the stable sort below breaks ties between equal counts randomly.
    random.shuffle(kys)
    kys.sort(key = grams.get, reverse = True)       # most-seen grams first

    # Integer division so the slice index is an int on every Python version.
    for k in kys[m // 2:] :
        del grams[k]




# Usage text printed for the help options.
# It has six %-placeholders which the caller fills, in order, with:
#   the program's base name, the default gram size,
#   the begin word, the end word, and then the begin word and end word again.
# NOTE(review): the command line parser also accepts --encoding, which is not listed here.
help_str    = """
%s (options) input_file(s)_with_words output_file
Options:
    --gram          #               Output # grams.                        (default: %d)
    --lower_case                    Do things case-insensitive.
    --max_grams     #               Set the maximum number of grams.       (default: all)
    --output        file_name       Write to given output file             (default: last file name given)
    --begin         word            Use the given "word" as the begin word (default: %s)
    --end           word            Use the given "word" as the end   word (default: %s)
    --no_begin_end                  Don't put out grams at begin/end of input text lines. (implies non --flow)
    --flow                          All words are in an unending flow. No --begin and --end token words.

Write out word N-grams from given text file(s).
    Input files' lines beginning with (white-space)semi-colon are ignored.
    Unless told otherwise, text line beginning and end are N-grammed with %s and %s.
"""

#
#
#
if __name__ == '__main__' :
    #
    #   Command line driver:
    #       Parse the options out of sys.argv.
    #       Count the word N-grams in every input file.
    #       Write the grams and their counts, sorted by gram text, to the output file.
    #
    #   Exit codes: 254 help shown, 101 no output file name or no files match an
    #   input name, 102 no input file name.
    #

    import  os
    import  re
    import  sys

    import  TZCommandLineAtFile
    import  tzlib
    import  output_files


    def _switch_given(names) :
        """ Strip every occurrence of a value-less switch from sys.argv. Return True if there was one. """
        given   = False
        while True :
            oi  = tzlib.array_find(sys.argv, names)
            if  oi < 0 :
                return given
            del sys.argv[oi]
            given   = True


    def _option_values(names) :
        """
        Yield the argument that follows each occurrence of an option,
        stripping both the option and its argument from sys.argv.
        Lazy, so the caller handles each value before the next one is looked for.
        """
        while True :
            oi  = tzlib.array_find(sys.argv, names)
            if  oi < 0 :
                return
            del sys.argv[oi]
            yield sys.argv.pop(oi)


    program_name    = sys.argv.pop(0)
    TZCommandLineAtFile.expand_at_sign_command_line_files(sys.argv)

    ofile_name      = ""
    encoding        = 'utf8'
    gram            = 2
    begin           = "<BEGIN>"
    end             = "<END>"
    be              = True
    lc              = False
    max_grams       = sys.maxsize

    #   Help is handled before any other option so that it shows the built-in defaults.
    if  _switch_given([ "--help", "-h", "-?", "/?", "/h", "/H" ]) :
        print(help_str % ( os.path.basename(program_name), gram, begin, end, begin, end, ))
        sys.exit(254)

    if  _switch_given([ "--lower_case", "--lowercase", "--lower-case", "--lower", "--low", "-l", ]) :
        lc          = True

    #   NOTE(review): the encoding is taken off the command line but nothing below uses it.
    for value in _option_values([ "--encoding", "--enc", ]) :
        encoding    = value

    for value in _option_values([ "--gram", "-g", ]) :
        gram        = max(2, int(value))

    for value in _option_values([ "--max_grams", "--maxgrams", "--max-grams", "--max", "-m", ]) :
        #   The number may carry a one-letter multiplier suffix.
        #   The suffixes are checked one after the other, in this order, and the last one matched sets the multiplier.
        m           = 1
        v           = value.strip().lower()
        for suffix, scale in ( ( 'h', 100 ), ( 't', 1000 ), ( 'm', 1000000 ), ( 'b', 1000000000 ), ) :
            if  v.endswith(suffix) :
                v   = v[:-1].strip()
                m   = scale
            pass
        max_grams   = max(2, int(v) * m)

    for value in _option_values([ "--output", "--out", "-o", ]) :
        ofile_name  = value

    for value in _option_values([ "--begin", "-b", ]) :
        begin       = value

    for value in _option_values([ "--end", "-e", ]) :
        end         = value

    #   Flowing text has no begin/end token words. This is done after --begin and --end so that it wins.
    if  _switch_given([ "--flowing", "--flow", "-f", ]) :
        begin   = end   = ""

    if  _switch_given([ "--no_begin_end", "--nobeginend", "--no-begin-end", "--nbe", "-n", ]) :
        be          = False


    #   Without --output, the last remaining argument is the output file name.
    if  not ofile_name :
        if  not len(sys.argv) :
            sys.stderr.write("No output file name given!\n")
            sys.exit(101)
        ofile_name  = sys.argv.pop()

    if  not len(sys.argv) :
        sys.stderr.write("No input file name given!\n")
        sys.exit(102)

    #   Expand each input name. A file named more than once is only read once.
    afns    = set()
    for afn in sys.argv :
        fns = tzlib.ambiguous_file_list(afn, True)
        if  not len(fns) :
            #   An error like the others, so it goes to stderr like the others.
            sys.stderr.write("No files in [%s]!\n" % afn)
            sys.exit(101)
        afns.update(fns)

    fns         = sorted(afns)

    split_words = re.compile(r"\s+").split

    grams       = {}
    pa          = [ begin ] * gram
    for fn in fns :
        with open(fn, "rt") as fi :                     # closed even if something below raises
            for ln in fi :
                ln  = ln.strip()
                if  (not ln) or (ln[0] == ';') :
                    continue                            # blank line or comment line

                if  lc :
                    ln  = ln.lower()

                wa  = split_words(ln)

                bi  = 0
                if  not be :
                    #   No begin grams: prime the window with the line's own first words
                    #   (plus one filler that the first learn() pushes out).
                    bi  = gram - 1
                    pa  = [ begin or end ] + wa[:bi]
                elif begin or end :
                    #   Each line starts from a window full of begin words.
                    pa  = [ begin or end ] * gram
                #   Otherwise the text is flowing: the window carries over from the line before.

                for w in wa[bi:] :
                    learn(grams, pa, w)

                #   Run the end word through the window so the line's end is N-grammed,
                #   but only when begin/end grams are wanted.
                #   (This test used to be inverted: end grams came out only under --no_begin_end.)
                if  be and (end or begin) :
                    for _ in range(gram - 1) :
                        learn(grams, pa, end or begin)
                    pass

                delearn(grams, max_grams)
            pass
        pass

    fo      = output_files.a_file(ofile_name)
    fo.write(";     %d-grams: %u\n\n" % ( gram, len(grams)) )

    for k in sorted(grams) :
        fo.write("%s %u\n" % ( k, grams[k]) )

    fo.close()
#
#
#
# eof
