#!/usr/bin/env python

import sys
import argparse
import codecs
import os
from collections import defaultdict

this_folder = os.path.dirname(os.path.realpath(__file__))

# This updates the load path to ensure that the local site-packages directory
# can be used to load packages (e.g. a locally installed copy of lxml).
sys.path.append(os.path.join(this_folder, 'site-packages/pre_install'))

from VUKafParserPy import KafParser
from lxml import etree

__desc = 'VUA property tagger'
__last_edited = '20may2014'
__version = '1.0'

###
__module_dir = os.path.dirname(__file__)
max_ngram = 1
verbose = False
##

########################################
## Format of the lexicon file: one tab-separated entry per line.
# lemma<TAB>pos<TAB>aspect
# lemma<TAB>pos<TAB>aspect
########################################
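## For illustration, entries could look like the lines below (the lemmas and
## aspect labels here are invented examples, not taken from any shipped lexicon):
# kamer<TAB>N<TAB>sleeping_comfort
# ontbijt<TAB>N<TAB>breakfast
# personeel<TAB>N<TAB>staff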
def loadAspects(my_lang, this_file=None):
    my_aspects = {}
    if this_file is not None:
        aspects_filename = this_file
    else:
        filename = "{0}.txt".format(my_lang)
        # Fall back to the module folder when no --path is given.
        lexicon_path = arguments.path if arguments.path is not None else __module_dir
        aspects_filename = os.path.join(lexicon_path, filename)
    if not os.path.exists(aspects_filename):
        print >> sys.stderr, 'ERROR: file with aspects for the language', my_lang, 'not found in', aspects_filename
    else:
        fic = codecs.open(aspects_filename, 'r', 'utf-8')
        for line in fic:
            fields = line.strip().split('\t')
            if len(fields) == 3:
                lemma, pos, aspect = fields
                my_aspects[lemma] = aspect
        fic.close()
    return aspects_filename, my_aspects


########################################
###### MAIN ########

argument_parser = argparse.ArgumentParser(description='Tags a text with properties at lemma level')
argument_parser.add_argument("--no-time", action="store_false", default=True, dest="my_time_stamp",
                             help="Do not include a timestamp in the KAF header")
argument_parser.add_argument("--lexicon", action="store", default=None, dest="lexicon",
                             help="Force the use of this lexicon file")
argument_parser.add_argument("--path", action="store", default=None, dest="path",
                             help="Path to the folder where the aspect lexicons are stored")
arguments = argument_parser.parse_args()

if sys.stdin.isatty():
    ## Not reading from a pipe: there is nothing to process.
    print >> sys.stderr, 'Input stream required.'
    print >> sys.stderr, 'Example usage: cat myUTF8file.kaf.xml |', sys.argv[0]
    print >> sys.stderr, sys.argv[0] + ' -h for help'
    sys.exit(-1)

## Load the tree and the list of terms with their ids
my_data = []
try:
    my_kaf_tree = KafParser(sys.stdin)
except Exception as e:
    # Errors go to stderr so they never pollute the KAF output on stdout.
    print >> sys.stderr, 'Error parsing input. Input is required to be KAF'
    print >> sys.stderr, str(e)
    sys.exit(2)

## Get the language from the KAF file
my_lang = my_kaf_tree.getLanguage()

my_aspects_filename = my_aspects = None
if arguments.lexicon is None:
    if my_lang not in ['nl', 'en', 'de', 'fr', 'it', 'es']:
        print >> sys.stderr, 'Error in the language specified in your KAF. The language is', my_lang, \
            'and possible values for this module are nl (Dutch), en (English), es (Spanish),', \
            'fr (French), it (Italian) or de (German)'
        sys.exit(1)
    my_aspects_filename, my_aspects = loadAspects(my_lang)
else:
    my_aspects_filename, my_aspects = loadAspects(my_lang, this_file=arguments.lexicon)

if verbose:
    print >> sys.stderr, 'Loaded', len(my_aspects), 'aspects from', my_aspects_filename

for term in my_kaf_tree.getTerms():
    my_data.append((term.getLemma(), term.getId()))
if verbose:
    print >> sys.stderr, 'Number of terms in the kaf file:', len(my_data)

current_token = 0
uniq_aspects = defaultdict(list)
while current_token < len(my_data):
    for tam_ngram in range(1, max_ngram + 1):
        # Build an n-gram of size tam_ngram beginning at current_token and
        # look it up (lowercased) in the aspect lexicon.
        if current_token + tam_ngram <= len(my_data):
            ngram = ' '.join(lemma for lemma, _ in my_data[current_token:current_token + tam_ngram])
            aspect = my_aspects.get(ngram.lower(), None)
            if aspect is not None:
                list_of_ids = [term_id for _, term_id in my_data[current_token:current_token + tam_ngram]]
                uniq_aspects[aspect].append((list_of_ids, ngram))
    current_token += 1

## Generate the property layer through the parser
for aspect, list_of_lists in uniq_aspects.items():
    for list_of_ids, str_text in list_of_lists:
        my_kaf_tree.add_property(aspect, list_of_ids, str_text)

my_kaf_tree.addLinguisticProcessor(__desc, __last_edited + '_' + __version, 'features', arguments.my_time_stamp)
my_kaf_tree.saveToFile(sys.stdout)
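## Example pipeline invocation (illustrative; 'property_tagger.py' and the
## lexicon folder name are placeholders for the local installation):
##   cat myUTF8file.kaf.xml | python property_tagger.py --path aspect_lexicons/ > tagged.kaf.xml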