#!/usr/bin/env python

import sys
import getopt
import codecs
import os
import tempfile
from subprocess import Popen, PIPE
import shutil
import glob
import logging

from convert_penn_to_kaf import convert_penn_to_kaf

## Last changes
# 23dec2013 --> adapted output to KAF

this_name = 'Stanford German constituency parser trained on NEGRA corpus'
version = '1.3'
last_modified = '17jan2014'
this_layer = 'constituents'

__module_folder__ = os.path.dirname(__file__)

# This updates the load path to ensure that the local site-packages directory
# can be used to load packages (e.g. a locally installed copy of lxml).
sys.path.append(os.path.join(__module_folder__, 'site-packages/pre_build'))
sys.path.append(os.path.join(__module_folder__, 'site-packages/pre_install'))

from VUKafParserPy import KafParser
from lxml import etree

## CONFIGURATION FOR THE STANFORD PARSER ##
STANFORD_HOME = os.path.join(__module_folder__, 'vendor', 'stanford-parser')
STANFORD_MEM = '3g'
STANFORD_GERMAN_OPTS = ('-hMarkov 1 -vMarkov 2 -vSelSplitCutOff 300 -uwm 1 '
                        '-unknownSuffixSize 2 -nodeCleanup 2 -encoding UTF-8')
STANFORD_PARSER_OPTS = '-tokenized -escaper edu.stanford.nlp.process.PTBEscapingProcessor'
STANFORD_GRAMMAR = 'edu/stanford/nlp/models/lexparser/germanPCFG.ser.gz'
############################################

logging.basicConfig(stream=sys.stderr,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    level=logging.DEBUG)

## MAIN ##
if not sys.stdin.isatty():
    ## READING FROM A PIPE
    pass
else:
    print >>sys.stderr, 'Input stream required in KAF format, at least with the text layer.'
    print >>sys.stderr, 'The language encoded in the KAF has to be German, otherwise it will raise an error.'
    print >>sys.stderr, 'Example usage: cat myUTF8file.kaf.xml |', sys.argv[0]
    sys.exit(-1)

my_time_stamp = True
try:
    opts, args = getopt.getopt(sys.argv[1:], "", ["no-time"])
    for opt, arg in opts:
        if opt == "--no-time":
            my_time_stamp = False
except getopt.GetoptError:
    pass

logging.debug('Starting stanford parser for German text')
logging.debug('Loading and parsing KAF file ...')
my_kaf = KafParser(sys.stdin)

lang = my_kaf.getLanguage()
if lang != 'de':
    print >>sys.stderr, 'ERROR! Language is', lang, 'and must be "de" (German)'
    sys.exit(-1)

logging.debug('Extracting sentences from the KAF')

# Map every token id to the id and lemma of the term that spans it.
termid_for_token = {}
lemma_for_termid = {}
for term in my_kaf.getTerms():
    lemma_for_termid[term.getId()] = term.getLemma()
    tokens_id = term.get_list_span()
    for token_id in tokens_id:
        termid_for_token[token_id] = term.getId()

# Group the tokens (and the matching term ids) into sentences, using the
# sentence id attached to each token to detect sentence boundaries.
sentences = []
current_sent = []
previous_sent = None
term_ids = []
current_sent_tid = []
for token, sent, token_id in my_kaf.getTokens():
    if sent != previous_sent and previous_sent is not None:
        sentences.append(current_sent)
        current_sent = [token]
        term_ids.append(current_sent_tid)
        current_sent_tid = [termid_for_token[token_id]]
    else:
        current_sent.append(token)
        current_sent_tid.append(termid_for_token[token_id])
    previous_sent = sent

if len(current_sent) != 0:
    sentences.append(current_sent)
    term_ids.append(current_sent_tid)

logging.debug('Calling to Stanford parser for GERMAN in ' + STANFORD_HOME)

# Create a temp file with one tokenized sentence per line as parser input.
tmp_file = tempfile.NamedTemporaryFile(mode='w', delete=False)
for sentence in sentences:
    for token in sentence:
        tmp_file.write(token.encode('utf-8') + ' ')
    tmp_file.write('\n')
tmp_file.close()

######################################
cmd = 'java -Xmx"' + STANFORD_MEM + '" -cp "' + STANFORD_HOME + '/*:" edu.stanford.nlp.parser.lexparser.LexicalizedParser -maxLength "1000"'
cmd += ' -tLPP "edu.stanford.nlp.parser.lexparser.NegraPennTreebankParserParams" ' + STANFORD_GERMAN_OPTS + ' ' + STANFORD_PARSER_OPTS
cmd += ' -outputFormat "oneline" -outputFormatOptions "markHeadNodes" -sentences newline -loadFromSerializedFile ' + STANFORD_GRAMMAR
cmd += ' ' + tmp_file.name

stanford_process = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
# communicate() consumes both pipes and waits for the process to exit;
# calling wait() first could deadlock once a pipe buffer fills up.
stanford_output, stanford_error = stanford_process.communicate()
logging.debug('Stanford parser finished with code: ' + str(stanford_process.returncode))
logging.debug('STANFORD LOG: ' + stanford_error)

# Convert every Penn-style tree (one per output line) into a KAF
# constituency tree and collect them under a single <constituency> element.
const = etree.Element('constituency')
cnt_t = cnt_nt = cnt_edge = 0
for num_sent, str_tree in enumerate(stanford_output.splitlines()):
    list_term_ids_for_sentence = term_ids[num_sent]
    tree_obj, cnt_t, cnt_nt, cnt_edge = convert_penn_to_kaf(str_tree, list_term_ids_for_sentence, logging,
                                                            lemma_for_termid, cnt_t, cnt_nt, cnt_edge)
    const.append(tree_obj)

my_kaf.tree.getroot().append(const)
my_kaf.addLinguisticProcessor(this_name, version + '_' + last_modified, this_layer, my_time_stamp)
my_kaf.saveToFile(sys.stdout)
os.remove(tmp_file.name)
sys.exit(0)
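
# Example invocation (a sketch; the filename "stanford_parser_de.py" is
# hypothetical). The KAF document is read from stdin and the enriched KAF,
# with the constituency layer appended, is written to stdout; --no-time
# presumably suppresses the timestamp in the linguistic-processor header:
#
#   cat myUTF8file.kaf.xml | python stanford_parser_de.py --no-time > parsed.kaf.xml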