#!/usr/bin/env python

import sys
import getopt
import os

this_folder = os.path.dirname(os.path.realpath(__file__))

# This updates the load path to ensure that the local site-packages directory
# can be used to load packages (e.g. a locally installed copy of lxml).
sys.path.append(os.path.join(this_folder, 'site-packages/pre_install'))

import codecs
import tempfile
import shutil
import glob
import logging
from subprocess import Popen, PIPE

from lxml import etree

from VUKafParserPy import KafParser
from convert_penn_to_kaf import convert_penn_to_kaf_with_numtokens

## LAST CHANGES ##
# 20-dec-2013: modified to generate KAF output
# 15-jan-2014: the order in the Alpino XML does not match the order of the tokens,
#              so the "begin" attribute in the XML is used to know the token number of each word

last_modified = '21Jan2014'
version = "1.4"
this_name = 'alpino kaf constituency parser'
this_layer = 'constituents'

#### The ALPINO_HOME environment variable must point to your local Alpino installation
os.environ['SP_CTYPE'] = 'utf8'
os.environ['SP_CSETLEN'] = '212'
ALPINO_HOME = os.environ['ALPINO_HOME']

logging.basicConfig(stream=sys.stderr, format='%(asctime)s - %(levelname)s - %(message)s', level=logging.DEBUG)

__module_dir = os.path.dirname(__file__)


## Convert an Alpino XML node to Penn Treebank bracketed format
def node_to_penn(node):
    children = node.getchildren()
    if len(children) == 0:
        word = node.get('word', None)
        if word is not None:
            # The "begin" attribute gives the number of the token
            word = word.replace('(', '-LRB-')
            word = word.replace(')', '-RRB-')
            num_token = node.get('begin')
            word = num_token + '#' + word
            if node.get('rel') == 'hd':
                head = '=H'
            else:
                head = ''
            return '(' + node.get('pos') + head + ' ' + word.encode('utf-8') + ')'
        else:
            return ''
    else:
        penn = '(' + node.get('cat') + ' '
        for n in children:
            penn += node_to_penn(n)
        penn += ')'
        return penn


def xml_to_penn(filename):
    ## Under certain conditions a known Alpino bug declares the encoding of the XML
    ## as iso-8859-1, while the real encoding is UTF-8, so we force UTF-8 here.
    parser = etree.XMLParser(encoding='UTF-8')
    #parser = etree.XMLParser(encoding='ISO-8859-1')
    tree = etree.parse(filename, parser)
    penn = node_to_penn(tree.find('node'))
    return penn


if not sys.stdin.isatty():
    ## READING FROM A PIPE
    pass
else:
    print>>sys.stderr, 'Input stream required in KAF format, with at least the text layer.'
    print>>sys.stderr, 'The language encoded in the KAF has to be Dutch, otherwise an error will be raised.'
    print>>sys.stderr, 'Example usage: cat myUTF8file.kaf.xml |', sys.argv[0]
    sys.exit(-1)

my_time_stamp = True
try:
    opts, args = getopt.getopt(sys.argv[1:], "", ["no-time"])
    for opt, arg in opts:
        if opt == "--no-time":
            my_time_stamp = False
except getopt.GetoptError:
    pass

logging.debug('Loading and parsing KAF file ...')
my_kaf = KafParser(sys.stdin)

lang = my_kaf.getLanguage()
if lang != 'nl':
    print>>sys.stderr, 'ERROR! Language is', lang, 'and must be nl (Dutch)'
    sys.exit(-1)

logging.debug('Extracting sentences from the KAF')
sentences = []
current_sent = []
term_ids = []
current_sent_tid = []

lemma_for_termid = {}
termid_for_token = {}

# Map every token id to its term id and remember the lemma of each term
for term in my_kaf.getTerms():
    lemma_for_termid[term.getId()] = term.getLemma()
    tokens_id = term.get_list_span()
    for token_id in tokens_id:
        termid_for_token[token_id] = term.getId()

# Group the tokens (and their term ids) into sentences
previous_sent = None
for token, sent, token_id in my_kaf.getTokens():
    ## Skip tokens that have no term linked to them
    if token_id not in termid_for_token:
        continue

    if sent != previous_sent and previous_sent is not None:
        sentences.append(current_sent)
        current_sent = [token]
        term_ids.append(current_sent_tid)
        current_sent_tid = [termid_for_token[token_id]]
    else:
        current_sent.append(token)
        current_sent_tid.append(termid_for_token[token_id])
    previous_sent = sent

if len(current_sent) != 0:
    sentences.append(current_sent)
    term_ids.append(current_sent_tid)


out_folder_alp = tempfile.mkdtemp()
logging.debug('Calling the Alpino parser in ' + ALPINO_HOME)
logging.debug('Temporary folder: ' + out_folder_alp)

## CALL TO ALPINO
alpino_bin = os.path.join(ALPINO_HOME, 'bin', 'Alpino')
cmd = alpino_bin + ' end_hook=xml -flag treebank ' + out_folder_alp + ' -parse'
alpino_pro = Popen(cmd, stdout=PIPE, stdin=PIPE, stderr=PIPE, shell=True)

# Feed Alpino one sentence per line, escaping its bracket metacharacters
for sentence in sentences:
    for token in sentence:
        token = token.replace('[', '\\[')
        token = token.replace(']', '\\]')
        token = token.replace('|', '\\|')
        alpino_pro.stdin.write(token.encode('utf-8') + ' ')
    alpino_pro.stdin.write('\n')
    #print>>sys.stderr
alpino_pro.stdin.close()

error_log = alpino_pro.stderr.read()
#print>>sys.stderr, alpino_pro.stderr.read()

# As we are not reading the stdout of the process, if we don't wait for it to finish
# the parent will keep running before Alpino has completed and we will get empty XML files.
# (If the parent reads from stdout or stderr, it already blocks until the child is done.)
alpino_pro.wait()

## There should be as many XML files as sentences in the KAF
const = etree.Element('constituency')

#for xml_file in glob.glob(os.path.join(out_folder_alp, '*.xml')):
cnt_t = cnt_nt = cnt_edge = 0
some_error = False
for num_sent in range(len(sentences)):
    xml_file = os.path.join(out_folder_alp, str(num_sent + 1) + '.xml')
    if os.path.exists(xml_file):
        logging.debug('Converting Alpino XML to Penn Treebank, sentence num ' + str(num_sent + 1))
        penn_str = xml_to_penn(xml_file)
        tree_node, cnt_t, cnt_nt, cnt_edge = convert_penn_to_kaf_with_numtokens(penn_str, term_ids[num_sent], logging,
                                                                                lemma_for_termid, cnt_t, cnt_nt, cnt_edge)
    else:
        tree_node = etree.Element('tree')   # empty tree for a sentence Alpino could not parse
        some_error = True
    const.append(tree_node)

if some_error:
    print>>sys.stderr, 'POSSIBLE ERROR', error_log
    value = -1
else:
    value = 0

# Append the constituency layer, register this linguistic processor and write the KAF to stdout
my_kaf.tree.getroot().append(const)
my_kaf.addLinguisticProcessor(this_name, version + '_' + last_modified, this_layer, my_time_stamp)
my_kaf.saveToFile(sys.stdout)

logging.debug('Number of sentences in the input KAF: ' + str(len(sentences)))
logging.debug('PROCESS DONE')

## Remove temporary stuff
shutil.rmtree(out_folder_alp)
#print out_folder_alp
sys.exit(value)