# coding=utf-8
""" The kaf version of the graph builder

"""

__author__ = 'Josu Bermudez <josu.bermudez@deusto.es>'


from ..graph.graph_builder import BaseGraphBuilder
from ..graph.xutils import GraphWrapper
from ..resources import tree
from ..resources.dictionaries import verbs, pronouns
from ..resources.tagset import ner_tags, pos_tags, constituent_tags


from collections import defaultdict, deque
from operator import itemgetter
import logging
READER = None


class KafAndTreeGraphBuilder(BaseGraphBuilder):
    """Extract the info from KAF documents and TreeBank."""
    kaf_document_property = "kaf"
    kaf_id_property = "kaf_id"
    kaf_offset_property = "offset"

    def __init__(self, reader_name, secure_tree=True, logger=logging.getLogger("GraphBuilder")):
        super(KafAndTreeGraphBuilder, self).__init__()
        if reader_name == "NAF":
            import pynaf
            reader = pynaf
            self.document_reader = reader.NAFDocument
        elif reader_name == "KAF":
            import pykaf
            reader = pykaf
            self.document_reader = reader.KafDocument
        else:
            raise Exception("Unknown Reader")
        global READER
        READER = reader
        self.secure_tree = secure_tree
        self.logger = logger
        self.syntax_count = 0
        self.leaf_count = 0
        self.kaf = None
        self.sentence_order = 0
        self.utterance = -1
        self.speakers = []
        self.terms_pool = []
        self.term_by_id = dict()
        self.term_by_word_id = dict()
        self.entities_by_word = defaultdict(set)
        self.entities = list()
        self.max_utterance = 1
        self.doc_type = "unknown"
        self._sentences = None
        self.graph_utils = None

    def set_graph(self, graph):
        """ Set the graph where this builders works
        @param graph: The graph target of this builder
        """
        super(self.__class__, self).set_graph(graph)
        self.graph_utils = SyntacticTreeUtils(graph)

    def get_graph_utils(self):
        """  Returns a object that provide complex relation finder for graph nodes.
        @return: The utility object
        """
        return self.graph_utils

    def process_document(self, graph, document):
        """ Get a document and prepare the graph an the graph builder to sentence by sentence
         processing of the document.
        @param graph: The graph where the kaf info is loaded
        @param document: A tuple that contains (the KAF,Sentences or none, speakers or none)
        """
        self.graph = graph
        # Counter to order the sentences inside a text. A easier way to work with sentences that order
        self.sentence_order = 1
        if document[1]:
            self._sentences = document[1].strip().split("\n")
        # If speaker is None store None otherwise store split speaker file
        if document[2]:
            # Remove the blank lines and split
            self.speakers = []
            current_speaker = None
            self.max_utterance = -1
            for line in document[2].split("\n"):
                if line == "":
                    continue
                self.speakers.append(line)
                if current_speaker != line:
                    self.max_utterance += 1
                    current_speaker = line

            # A doc is a conversation if exist two o more speakers in it
            if self.max_utterance > 1:
                self.doc_type = "conversation"
            else:
                self.doc_type = "article"
        else:
            self.speakers = []
            self.max_utterance = 1
            self.doc_type = "article"
        self.utterance = -1
        self.parse_kaf(kaf_string=document[0].strip())

    def get_sentences(self):
        """ Get the sentences of the document.
        @return: A list of trees(kaf nodes or Penn-treebank strings)
        """
        if self._sentences:
            return self._sentences
        else:
            return self.kaf.get_constituency_trees()

    def parse_kaf(self, kaf_string):
        """ Parse al tha kaf info tho the graph except of sentence parsing.
        @param kaf_string:
        """
        self.terms_pool = []
        # Store original kaf for further recreation
        self.kaf = self.document_reader(input_stream=kaf_string)
        GraphWrapper.set_graph_property(self.graph, self.kaf_document_property, self.kaf)
        self.set_terms(self.kaf)
        self.set_entities(self.kaf)
        self.set_dependencies(self.kaf)

    def process_sentence(self, graph, sentence, root_index, sentence_namespace):
        """Add to the graph the morphological, syntactical and dependency info contained in the sentence.

        :param graph: The graph where the kaf info is loaded
        :param sentence: the sentence to parse
        :param sentence_namespace: prefix added to all nodes ID strings.
        :param root_index: The index of the root node
        """
        self.graph = graph
        sentence_id = sentence_namespace
        sentence_label = sentence_namespace

        # Sentence Root
        sentence_root_node = self.add_sentence(root_index=root_index, sentence_form="", sentence_label=sentence_label,
                                               sentence_id=sentence_id)
        sentence_root_node["graph"] = graph
        sentence_root_node["sentence_order"] = self.sentence_order

        first_constituent = self.parse_syntax(sentence=sentence, syntactic_root=sentence_root_node)

        # copy the properties to the root
        if first_constituent != sentence_root_node:
            sentence_root_node["lemma"] = first_constituent["lemma"]
            sentence_root_node["form"] = first_constituent["form"]
            sentence_root_node["span"] = first_constituent["span"]
            sentence_root_node["ord"] = first_constituent["ord"]
            sentence_root_node["begin"] = first_constituent["begin"]
            sentence_root_node["end"] = first_constituent["end"]

        self.sentence_order += 1
        # Statistics
        self.statistics_sentence_up()
        #self.show_graph()
        # Return the generated context graph
        return sentence_root_node

    def set_entities(self, kaf):
        """ Extract the entities of the kaf and add to the file

        @param kaf: The kaf file manager
        """
        # A dict of entities that contains a list of references. A reference is a list of terms.
        self.entities_by_word = defaultdict(set)
        self.entities = list()
        for kaf_entity in kaf.get_entities():
            entity_type = kaf_entity.attrib["type"]
            entity_id = kaf_entity.attrib[READER.NAMED_ENTITY_ID_ATTRIBUTE]
            for reference in kaf.get_entity_references(kaf_entity):
                # Fetch terms
                entity_terms = sorted(
                    [self.term_by_id[term.attrib["id"]] for term in kaf.get_entity_reference_span(reference)],
                    key=itemgetter("ord"))
                # attach 's if exist
                next_term = self.term_by_id.get("t{0}".format(int(entity_terms[-1]["id"][1:]) + 1))
                if next_term and next_term["form"] == "'s":
                    entity_terms.append(next_term)
                    # Convert ID into terms
                # Build form
                form = self.expand_node(entity_terms)
                # Build the entity
                label = "{0} | {1}".format(form, entity_type)
                entity = self.add_named_entity(entity_type=entity_type, entity_id=entity_id, label=label)
                # Set the other attributes
                entity["begin"] = entity_terms[0]["begin"]
                entity["end"] = entity_terms[-1]["end"]
                entity["form"] = form
                entity["ord"] = entity_terms[0]["span"][0], entity_terms[-1]["span"][1]
                entity["span"] = entity["ord"]

                # Link words_ids to mention as word
                for term in entity_terms:
                    self.link_word(entity, term)
                # Index the entity by its first word
                first_word_id = entity_terms[0]["id"]
                if first_word_id in self.entities_by_word:
                    self.entities_by_word[first_word_id].append(entity)
                else:
                    self.entities_by_word[first_word_id] = [entity]

    def set_terms(self, kaf):
        """ Extract the terms of the kaf and add to the graph

        @param kaf: The kaf file manager
        """
        # Words
        kaf_words = dict([
            (kaf_word.attrib[READER.WORD_ID_ATTRIBUTE], kaf_word) for kaf_word in kaf.get_words()])
        # Terms
        self.term_by_id = dict()
        self.term_by_word_id = dict()
        prev_speaker = "NO"
        inside_utterance = deque()
        inside_plain_quotes = False
        for term in kaf.get_terms():
            term_id = term.attrib[READER.TERM_ID_ATTRIBUTE]
            # Fetch the words of the term values
            term_words = sorted((kaf_words[word.attrib["id"]] for word in kaf.get_terms_words(term)),
                                key=lambda x: x.attrib[self.kaf_offset_property])
            # Build term attributes
            form = self.expand_kaf_word(term_words)
            order = int(term_words[0].attrib[READER.WORD_ID_ATTRIBUTE][1:]), \
                int(term_words[-1].attrib[READER.WORD_ID_ATTRIBUTE][1:])
            span = order
            begin = int(term_words[0].attrib[self.kaf_offset_property])
            end = int(term_words[-1].attrib[self.kaf_offset_property]) + int(term_words[-1].attrib["length"]) - 1
            # We want pennTreeBank tagging no kaf tagging
            pos = term.attrib[READER.MORPHOFEAT_ATTRIBUTE]
            kaf_id = "{0}#{1}".format(term_id, "|".join([word.attrib[READER.WORD_ID_ATTRIBUTE] for word in term_words]))
            # Clear unicode problems
            if isinstance(form, unicode):
                form = form.encode("utf8")
            try:
                lemma = term.attrib[READER.LEMMA_ATTRIBUTE]
                if lemma == "-":
                    raise KeyError
            except KeyError:
                lemma = form
            if isinstance(lemma, unicode):
                lemma = lemma.encode("utf8")

            label = "\n".join((form, pos, lemma, term_id))
            #Create word node
            word_node = self.add_word(
                form=form, node_id=term_id, label=label, lemma=lemma, pos=pos, order=order,
                begin=begin, end=end)
            word_node["span"] = span
            word_node[self.kaf_id_property] = kaf_id
            word_node["prev_speaker"] = prev_speaker
            if self.speakers:
                speaker = self.speakers.pop(0)
                if prev_speaker != speaker:
                    if prev_speaker != "NO":
                        self.utterance += 1
            else:
                speaker = "PER{0}".format(self.utterance)
            if not speaker or speaker == "-":
                speaker = "PER{0}".format(self.utterance)

            if prev_speaker != speaker:
                    prev_speaker = speaker

            # Manage Quotation
            # TODO improve  nested quotation
            if form == "``" or (form == '"' and not inside_plain_quotes):
                self.max_utterance += 1
                inside_utterance.append(self.max_utterance)
                if form == '"':
                    inside_plain_quotes = True
            elif form == "''" or (form == '"' and inside_plain_quotes):
                if form == '"':
                    inside_plain_quotes = False
                try:
                    inside_utterance.pop()
                except IndexError:
                    self.logger.warning("Unbalanced quotes")

            if len(inside_utterance):
                word_node["utterance"] = inside_utterance[-1]
                word_node["speaker"] = "PER{0}".format(inside_utterance[-1])
                word_node["quoted"] = True
            else:
                word_node["speaker"] = speaker
                word_node["utterance"] = self.utterance
                word_node["quoted"] = False

            word_node["doc_type"] = self.doc_type
            # Store term
            # ONLY FOR STANFORD DEPENDENCIES IN KAF
            for word in term_words:
                self.term_by_word_id[word.attrib[READER.WORD_ID_ATTRIBUTE]] = word_node
            self.term_by_id[term_id] = word_node
            self.terms_pool.append(word_node)
            self.statistics_word_up()
        self.leaf_count = 0

    def set_dependencies(self, kaf):
        """ Extract the dependencies of the kaf and add to the graph

        @param kaf: The kaf file manager
        """
        for dependency in kaf.get_dependencies():
            dependency_from = dependency.attrib[READER.DEPENDENCY_FROM_ATTRIBUTE]
            dependency_to = dependency.attrib[READER.DEPENDENCY_TO_ATTRIBUTE]
            dependency_type = dependency.attrib[READER.DEPENDENCY_FUNCTION_ATTRIBUTE]
            #IFS For STANFORD DEPENDENCIES IN KAF
            if dependency_from[0] == "w":
                dependency_from = self.term_by_word_id[dependency_from]
            else:
                dependency_from = self.term_by_id[dependency_from]
            if dependency_to[0] == "w":
                dependency_to = self.term_by_word_id[dependency_to]
            else:
                dependency_to = self.term_by_id[dependency_to]
            self.link_dependency(dependency_from, dependency_to, dependency_type)

    def iterate_syntax(self, syntactic_tree, parent, syntactic_root):
        """ Walk recursively over the syntax tree and add their info to the graph.
        @param syntactic_tree: The subtree to process
        @param parent: The parent node of the subtree
        @param syntactic_root: The syntactic root node of all the tree
        @return: The element created from the top of the subtree
        """
        # Aux functions
        def syntax_leaf_process(parent_node, leaf):
            """ Process a final node of the tree
            @param parent_node: The upside node of the element
            @param leaf: The node to process
            @return: The word that correspond to the leaf.
            """
            # the tree node is a leaf
            # Get the text of the tree to obtain more attributes
            self.leaf_count += 1
            text_leaf = leaf.node
            #treebank_word = leaf[0]
            is_head = "=H" in text_leaf or "-H" in text_leaf
            # Get the word node pointed by the leaf
            try:
                word_node = self.terms_pool.pop(0)
                self.last_word = word_node
            except IndexError:
                word_node = self.last_word
            # Word is mark as head
            if is_head:
                self.set_head(parent_node, word_node)
            # Word is mark as Named Entity
            if "|" in text_leaf:
                self.set_ner(constituent=word_node, ner_type=text_leaf.split("|")[-1])
            #Link the word to the node
            self.link_syntax_terminal(parent=parent_node, terminal=word_node)
            #link the word to the sentence
            self.link_root(sentence=syntactic_root, element=word_node)
            self.link_word(sentence=syntactic_root, word=word_node)
            # Enlist entities that appears in the phrase
            for mention in self.entities_by_word.get(word_node["id"], []):
                            self.add_mention_of_named_entity(sentence=syntactic_root, mention=mention)
            return word_node

        def syntax_branch_process(parent_node, branch):
            """ Process a intermediate node of the tree
            @param parent_node: The upside node of the element
            @param branch: The node to process
            @return: The constituent created from the top of the branch
            """
            # Create a node for this element
            label = branch.node
            # constituent is mark as head
            head = "=H" in label or "-H" in label
            tag = label.replace("=H", "").replace("-H", "")
            # Constituent is mark as ner
            if "|" in label:
                ner = label.split("|")[-1]
            else:
                ner = ner_tags.no_ner

            tag = tag.split("|")[0]
            order = self.syntax_count

            new_constituent = self.add_constituent(node_id="C{0}".format(order), sentence=syntactic_root, tag=tag,
                                                   order=order, label=label)
            self.set_ner(new_constituent, ner)
            self.syntax_count += 1
            # Process the children
            children = [
                self.iterate_syntax(
                    syntactic_tree=child, parent=new_constituent, syntactic_root=syntactic_root)
                for child in branch]
            children.sort(key=itemgetter("ord"))

            # Link the child with their parent (The actual processed node)
            self.link_syntax_non_terminal(parent=parent_node, child=new_constituent)
            if head:
                self.set_head(parent_node, new_constituent)
            head_word = self.get_head_word(new_constituent)

            content_text = self.expand_node(children)
            new_constituent["tree"] = branch
            new_constituent["label"] = (" | ".join((content_text, tag)))
            new_constituent["lemma"] = content_text
            new_constituent["form"] = content_text

            new_constituent["doc_type"] = self.doc_type
            new_constituent["utterance"] = head_word["utterance"]
            new_constituent["quoted"] = head_word["quoted"]
            new_constituent["begin"] = children[0]["begin"]
            new_constituent["end"] = children[-1]["end"]
            new_constituent["ord"] = (children[0]["span"][0], children[-1]["span"][1])
            new_constituent["span"] = new_constituent["ord"]

            # Add in tree named entities to entities in graph
            if constituent_tags.ner_constituent(tag):
                self.add_mention_of_named_entity(sentence=syntactic_root, mention=new_constituent)
                new_constituent["constituent"] = new_constituent

            return new_constituent

        # Determine if the syntactic tree Node is as branch or a leaf
        if len(syntactic_tree) > 1 or not (
                isinstance(syntactic_tree[0], str) or isinstance(syntactic_tree[0], unicode)):
            constituent_or_word = syntax_branch_process(parent_node=parent, branch=syntactic_tree)
            self.syntax_count += 1
        else:
            constituent_or_word = syntax_leaf_process(parent_node=parent, leaf=syntactic_tree)
        return constituent_or_word

    def parse_syntax_kaf(self, sentence, syntactic_root):
        """ Add the syntax info from a KAF tree node

        @param sentence: The KAF tree element
        @param syntactic_root: The sentence node
        @return: the syntax root node or the first constituent
        """
        constituents_by_id = dict()
        root = None
        root_head = None
        node_process_list = []
        for non_terminal in self.kaf.get_contituent_tree_non_terminals(sentence):
            constituent_id = non_terminal.attrib["id"]
            tag = non_terminal.attrib["label"]
            order = self.syntax_count
            self.syntax_count += 1
            constituent = self.add_constituent(
                node_id=constituent_id, sentence=syntactic_root, tag=tag, order=order, label=tag)
            constituent["ner"] = ner_tags.no_ner
            if constituent_tags.root(tag):
                root = constituent
            constituents_by_id[constituent_id] = constituent
        constituents = constituents_by_id.values()
        terminals = self.kaf.get_contituent_tree_terminals(sentence)
        terminals_words = dict()
        for terminal in terminals:
            terminal_id = terminal.attrib["id"]
            node_process_list.append(terminal_id)
            terminals_words[terminal_id] = [
                self.term_by_id[target_term.attrib["id"]]
                for target_term in self.kaf.get_contituent_terminal_words(terminal)]
        edges_by_departure_node = {}
        edges_list = self.kaf.get_contituent_tree_edges(sentence)
        for edge in edges_list:
            edges_by_departure_node[edge.attrib["from"]] = edge

        if self.secure_tree:
            node_process_list = [edge.attrib["from"] for edge in edges_list]
            node_process_list.reverse()

        while len(node_process_list):
            edge = edges_by_departure_node[node_process_list.pop(0)]
            # The edges have a down-top direction
            target_id = edge.attrib["to"]
            source_id = edge.attrib["from"]
            target = constituents_by_id[target_id]
            # select link type in base of the source node type
            if target != root and not self.secure_tree:
                if target_id not in node_process_list:
                    node_process_list.append(target_id)
            if source_id.startswith("n"):
                source = constituents_by_id[source_id]
                if target == root:
                    if root_head is None or edge.attrib.get("head", False):
                        root_head = source            
                else:
                    self.link_syntax_non_terminal(parent=target, child=source)
                    # Set the head of the constituent
                    if edge.attrib.get("head", False):
                        try:
                            self.set_head(parent=target, head=source)
                        except Exception as ex:
                            self.logger.warning("Error setting a head: Source %s ID#%s Target %s ID#%s Error: %s",
                                              target_id, target, source_id, source, ex)
            else:
                node_process_list.append(target_id)
                source = terminals_words[source_id]
                if len(source) == 1 and target["tag"] == source[0]["pos"]:
                    word = source[0]
                    self.link_root(sentence=syntactic_root, element=word)
                    nexus_constituent = constituents_by_id[target_id]
                    constituents_by_id[target_id] = word
                    self.remove(nexus_constituent)
                    constituents.remove(nexus_constituent)
                    self.link_word(sentence=syntactic_root, word=word)
                    # Enlist entities that appears in the phrase
                    for mention in self.entities_by_word.get(word["id"], []):
                        self.add_mention_of_named_entity(sentence=syntactic_root, mention=mention)
                else:
                    for word in source:
                        self.link_root(sentence=syntactic_root, element=word)
                        self.link_syntax_terminal(parent=target, terminal=word)
                        self.link_word(sentence=syntactic_root, word=word)
                        # Enlist entities that appears in the phrase
                        for mention in self.entities_by_word.get(word["id"], []):
                            self.add_mention_of_named_entity(sentence=syntactic_root, mention=mention)
                        # Set the head of the constituent
                    self.set_head(target, source[-1])

        # Build constituent child based values
        for constituent in constituents:
            if constituent == root:
                continue
            children = self.get_words(constituent)
            children.sort(key=itemgetter("ord"))
            head_word = self.get_head_word(constituent)
            content_text = self.expand_node(children)
            constituent["doc_type"] = self.doc_type
            constituent["utterance"] = head_word["utterance"]
            constituent["quoted"] = head_word["quoted"]
            constituent["label"] = (" | ".join((content_text, constituent["tag"])))
            constituent["lemma"] = self.expand_node_lemma(children)
            constituent["form"] = content_text
            constituent["begin"] = children[0]["begin"]
            constituent["end"] = children[-1]["end"]
            constituent["ord"] = (children[0]["span"][0], children[-1]["span"][1])
            constituent["span"] = constituent["ord"]

        # link the tree with the root
        if root_head is None:
            self.logger.warning("No ROOT found, used the first constituent, sentence: %s",
                                syntactic_root["sentence_order"])
            root_head = constituents[0]

        self.link_syntax_non_terminal(parent=syntactic_root, child=root_head)
        # Set the head of the constituent
        self.set_head(syntactic_root, root_head)
        return root_head

    def parse_syntax(self, sentence, syntactic_root):
        """ Parse the syntax of the sentence.

        @param sentence:  The sentence
        @param syntactic_root:
        @return: The upper node of the syntax tree.
        """
        # Convert the syntactic tree
        if type(sentence) is str:
            # Is a plain Penn-tree
            sentence = self.clean_penn_tree(sentence)
            syntactic_tree = tree.Tree(sentence)
            # Call to the recursive function
            return self.iterate_syntax(
                syntactic_tree=syntactic_tree, parent=syntactic_root, syntactic_root=syntactic_root)
        else:
            # Is a kaf tree
            return self.parse_syntax_kaf(sentence=sentence, syntactic_root=syntactic_root)

    # AUX FUNCTIONS
    @staticmethod
    def expand_kaf_word(words):
        """ Rebuild the text form from a list of kaf words
        @param words: a list of KAF words
        @return: the form of all words separated by comas.
        """
        text = " ".join([word.text for word in words])
        return text.strip()

    @staticmethod
    def expand_node(terms):
        """ Rebuild the from of a element
        @param terms: The ordered term lsit of this element
        @return: The form of the element
        """
        text = " ".join([term["form"] for term in terms])
        return text.strip()

    @staticmethod
    def expand_node_lemma(terms):
        """ Rebuild the lemma of a element
        @param terms: The ordered term list of this element
        @return: The form of the element
        """
        text = " ".join([term["lemma"] for term in terms])
        return text.strip()

    @staticmethod
    def clean_penn_tree(penn_tree):
        """ Clean from the tree all knows problems
        @param penn_tree: the plain tree
        @return: cleaned tree
        """
        penn_tree = penn_tree.strip()
        return penn_tree


class SyntacticTreeUtils():
    """ A collection of functions that resolves complex operations in the graph representation of syntactic tree

    """
    def __init__(self, graph, logger=logging.getLogger("GraphBuilder")):
        self.graph = graph
        self.graph_builder = GraphWrapper.get_graph_property(self.graph, 'graph_builder')
        self.graph.graph["utils"] = self
        self.logger = logger

    def skip_root(self, sentence_root):
        """Get the first chunk of the sentence (usually S) Skip al ROOT nodes,created by the parser o the graph builder.
        Skip all the dummy roots crated by the parsers/graph builder.
        :param sentence_root: The syntactic tree root node.
        """
        chunk = sentence_root
        while chunk and (chunk["tag"] == self.graph_builder.root_pos):
            children = self.graph_builder.get_syntactic_children(chunk)[0]
            if len(children) > 1:
                return chunk
            chunk = children[0]
        return chunk

# Allocation of Named Entities
    def get_span_constituent(self, sentence, span):
        """ Try to fit a span (a group of sequential words) into a existing constituent.

        :param sentence: The sentence where the word must be allocated.
        :param span: The list of word that must be allocated.
        """
        nodes = self.graph_builder.get_syntactic_children(sentence)
        while nodes:
            node = nodes.pop()
            node_span = node["span"]
            if node_span == span:
                return node
            children = self.graph_builder.get_syntactic_children(node)
            if not (node_span[0] > span[0] or node_span[-1] < span[-1]):
                nodes.extend(children)
        return None

    def get_plausible_head_word(self, words):
        """ Get a Head word for the NE that preserves the head coherence.

        Find the words of the NE that are heads. If more than one are head use the head assign rules( NP cases) with
        the head word to select the head. If no head is contained in the bag of word use every word instead of head
        words.

        #  head word assignment preferences for NP cases:
        # "NN", "NNP", "NNPS", "NNS", "NX", "JJR", "POS"

        @param words: A list of words
        @return A word
        """

        head_words = [word for word in words if self.graph_builder.is_head(word)]
        for pos in pos_tags.head_rules:
            for word in head_words:
                if word["pos"] == pos:
                    return word

        for pos in pos_tags.head_rules:
            for word in words:
                if word["pos"] == pos:
                    return word
        return words[0]

    def get_plausible_constituent(self, head):
        """ Get the highest NP that has the same head.

        Get the constituent that complains these restriction:
            + Have the same terminal head.
            + Is NP.
            + Is the highest NP of the first chain of NPs.

        If no valid NP is found use the constituent of the head.

        # Source StanfordCoreNLP::MentionExtractor.Java::Class:MentionExtractor:Arrage
        :param head: the terminal head that must be the head of the constituent.
        """

        base_constituent = head
        constituent = self.graph_builder.get_syntactic_parent(head)
        valid_constituent = None
        # Climb until head chain is broken
        while constituent and self.graph_builder.is_head(constituent) and \
                self.graph_builder.get_head(constituent)["id"] == head["id"]:
            # If is a valid constituent store it
            if constituent_tags.noun_phrases("tag" in constituent and constituent["tag"]):
                valid_constituent = constituent
            # If already have a valid constituent and valid constituent chain is broken, Stop the search
            elif valid_constituent:
                break
                # Climb
            constituent = self.graph_builder.get_syntactic_parent(constituent)
            # Fallback constituent
        if not valid_constituent:
            valid_constituent = base_constituent
        return valid_constituent

    def allocate_named_entities(self, entity, sentence):
        """ Try to set a terminal head and a constituent of a named entity.

         The constituent and the head is used to order the mention in the sieve searching.
        :param sentence: The sentence where the word must be allocated.
        :param entity: The named entity that must be allocated.
        """
        # Find a plausible terminal head
        entity_span = entity["span"]
        # Try to find a constituent that fit the mention span
        valid_constituent = self.get_span_constituent(sentence, entity_span)
        if valid_constituent:
            head = self.graph_builder.get_head(valid_constituent)
            constituent = valid_constituent
        else:
            # Use the head finder
            head = self.get_plausible_head_word(self.graph_builder.get_words(entity))
            if not head:
                self.logger.warning("Unable to fit NE: %s", entity)
                head = self.graph_builder.get_words(entity)[-1]
            # With the artificial Terminal head find a plausible NP constituent
            constituent = self.get_plausible_constituent(head)

        self.graph_builder.set_head(entity, head)
        # Used in speaker sieve, The head word is the relevant info source
        entity["constituent"] = constituent
        entity["doc_type"] = head["doc_type"]
        entity["utterance"] = head["utterance"]
        entity["quoted"] = head["quoted"]

        self.graph_builder.link_root(entity, self.graph_builder.get_root(constituent))
        return constituent

# Syntax complex relations
    def is_role_appositive(self, first_constituent, second_constituent):
        """Check if are in a role appositive construction.

        @param first_constituent:
        @param second_constituent:
        @return: True or False
        """
        if not self.graph_builder.same_sentence(second_constituent, first_constituent):
            return False
        # If candidate or mention are NE use their constituent
        if first_constituent["type"] == "named_entity":
            first_constituent = first_constituent["constituent"]
        if second_constituent["type"] == "named_entity":
            second_constituent = second_constituent["constituent"]

        # The Candidate is headed by a noun.
        candidate_head = self.graph_builder.get_head_word(first_constituent)
        if not candidate_head or not pos_tags.all_nouns(candidate_head["pos"]):
            return False
            # The Candidate appears as a modifier of a NP
        candidate_syntactic_father = self.graph_builder.get_syntactic_parent(first_constituent)
        if not constituent_tags.noun_phrases(candidate_syntactic_father["tag"]):
            return False
            # The NP whose head is the mention
        return second_constituent["id"] == self.graph_builder.get_head(candidate_syntactic_father)["id"]

    def is_appositive_construction_child(self, constituent):
        """ Check if the mention is in a appositive construction.

        "NP=m1 < (NP=m2 $.. (/,/ $.. NP=m3))";
        "NP=m1 < (NP=m2 $.. (/,/ $.. (SBAR < (WHNP < WP|WDT=m3))))";
        "/^NP(?:-TMP|-ADV)?$/=m1 < (NP=m2 $- /^,$/ $-- NP=m3 !$ CC|CONJP)";
        "/^NP(?:-TMP|-ADV)?$/=m1 < (PRN=m2 < (NP < /^NNS?|CD$/ $-- /^-LRB-$/ $+ /^-RRB-$/))";

        @param constituent: The mention to check
        """
        if constituent["type"] == "named_entity":
            constituent = constituent["constituent"]

        # TODO Improve the precision
        parent = self.graph_builder.get_syntactic_parent(constituent)
        if not constituent_tags.noun_phrases(parent["tag"]):
            return False
        siblings = self.graph_builder.get_syntactic_sibling(constituent)
        for sibling in siblings:
            if "pos" in sibling and pos_tags.conjunction(sibling["pos"]):
                return False
        if len(siblings) >= 3:
            first_child = siblings[0]
            second_child = siblings[1]
            if ("tag" in first_child) and constituent_tags.noun_phrases(first_child["tag"]) and\
                    second_child["form"] == ",":
                if len(siblings) == 3:
                    return True
                elif len(siblings) > 3:
                    return siblings[3]["form"] == ","
        else:
            return False

    def is_predicative_nominative(self, constituent):
        """ Check if the constituent is a predicate in a predicative nominative mention

        Stanford check for the relation:
        # "S < (NP=m1 $.. (VP < ((/VB/ < /^(am|are|is|was|were|'m|'re|'s|be)$/) $.. NP=m2)))";
        # "S < (NP=m1 $.. (VP < (VP < ((/VB/ < /^(be|been|being)$/) $.. NP=m2))))";

        @param constituent: The mention to check
        """
        # The constituent is in a VP that start with a copulative verb
        parent = self.graph_builder.get_syntactic_parent(constituent)
        if constituent_tags.verb_phrases(parent.get("tag", False)):
            for child in self.graph_builder.get_syntactic_children(parent):
                if child["span"] < constituent["span"]:
                    if pos_tags.verbs(child.get("pos", False)):
                        if child["form"] in verbs.copulative:
                            return True
        return False

    def is_bare_plural(self, constituent):
        """ Check if the constituent is Bare plural.
        @param constituent: The constituent to check
        @return: Boolean
        """
        span = constituent["span"]
        return (span[0] - span[1] == 0) and \
            pos_tags.bare_plurals(self.graph_builder.get_constituent_words(constituent)[0]["pos"])

    def is_enumeration(self, constituent):
        """ Check if the constituent is a enumeration.
        @param constituent: The constituent to check
        @return: True or False
        """
        coordination = False
        np_pre_coordination = False
        for children in self.graph_builder.get_syntactic_children(constituent):
            children_tag = children.get("tag", "")
            if constituent_tags.noun_phrases(children_tag):
                if coordination:
                    return True
                else:
                    np_pre_coordination = True
            else:
                children_pos = children.get("pos")
                if children_pos != None:
                    if pos_tags.conjunction(children_pos) and np_pre_coordination:
                        coordination = True
        return False

        # Mention sub tree : maneja los casos con and
        #        enumerationPattern = r"NP < (NP=tmp $.. (/,|CC/ $.. NP))";
        #        tgrepPattern = re.compile(enumerationPattern)
        #        match= tgrepPattern.matcher(this.mentionSubTree)
        #        while  m.find():
        #            if(this.mentionSubTree==m.getNode("tmp") and this.spanToString().toLowerCase().contains(" and ")):
        #                number = Number.PLURAL

    def is_relative_pronoun(self, first_constituent, second_constituent):
        """ Check if tho constituents are in relative pronoun construction. Also mark they.
        @param first_constituent:
        @param second_constituent:
        @return: Boolean
        """

        #NP < (NP=m1 $.. (SBAR < (WHNP < WP|WDT=m2)))
        if not self.graph_builder.same_sentence(first_constituent, second_constituent):
            return False
        if second_constituent["form"].lower() not in pronouns.relative:
            return False
        if first_constituent["span"] > second_constituent["span"]:
            return False
        enclosing_np = self.graph_builder.get_syntactic_parent(first_constituent)

        upper = self.graph_builder.get_syntactic_parent(second_constituent)
        while upper and (upper["type"] != self.graph_builder.root_type):
            if self.graph_builder.is_inside(upper["span"], enclosing_np["span"]):
                upper = self.graph_builder.get_syntactic_parent(upper)
            elif upper["id"] == enclosing_np["id"]:
                #TODO check path element
                return True
            else:
                return False

        return False
        # return set(filter(lambda X: X["tag"] in
        #                             constituent_tags.subordinated, mention.in_neighbours())).intersection(
        #     set(filter(lambda X: X["tag"] in
        #                          constituent_tags.subordinated, candidate.out_neighbours())))

    @staticmethod
    def check_sibling_property(base_index, siblings, _property, check_function):
        """ Return the forward siblings and return the first that fulfill the property
        @param base_index: The index of the original sibling
        @param siblings: The ordered list of siblings
        @param _property: The name of the property to check
        @param check_function: The function used to check the property
        @return: the sibling that fulfill the property and its index.
        """
        constituent = None
        index = 0
        for index, sibling in enumerate(siblings[base_index:]):
            if _property in sibling and check_function(sibling[_property]):
                constituent = sibling
                break
        return constituent, index

    def pleonastic_it(self, mention):
        #TODO improve the multi language
        """ Determine if the mention is pleonastic.
        @param mention: THe mention to check
            "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@VP < (VBN $.. @S|SBAR))))"
            // in practice, go with this one (best results)

            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (ADJP $.. (/S|SBAR/))))"
            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (ADJP < (/S|SBAR/))))"

            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (NP < /S|SBAR/)))"
            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (NP $.. ADVP $.. /S|SBAR/)))"

            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (VP < (VBN $.. /S|SBAR/))))))"

            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (ADJP $.. (/S|SBAR/))))))"
                // extraposed. OK 1/2 correct; need non-adverbial case
            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (ADJP < (/S|SBAR/))))))"
                // OK: 3/3 good matches on dev; but 3/4 wrong on WSJ

            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (NP < /S|SBAR/)))))"
            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (NP $.. ADVP $.. /S|SBAR/)))))"

            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:seems|appears|means|follows)/) $.. /S|SBAR/))"

            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:turns|turned)/) $.. PRT $.. /S|SBAR/))"
        """
        # NP < (PRP=m1) $..
        # Is a "it" pronoun
        if mention["form"].lower() in pronouns.pleonastic:
            return False
        constituent = mention.get('constituent', mention)
        father = self.graph_builder.get_syntactic_parent(constituent)
        # Is a child of a NP
        if not constituent_tags.noun_phrases(father["form"]):
            return False

        # Have a (next) sibling that is a VP
        wrapper_np_siblings = self.graph_builder.get_syntactic_children(father)
        wrapper_np_index = wrapper_np_siblings.index(father)

        #(VP <
        verb_phrase, verb_phrase_index = self.check_sibling_property(
            wrapper_np_index, wrapper_np_siblings, "tag", constituent_tags.verb_phrases)

        if not verb_phrase:
            return False

        #((/^V.*/ < /^(?:is|was|become|became)/)
        vp_constituents = self.graph_builder.get_syntactic_children(verb_phrase)
        valid_verb, valid_verb_index = self.check_sibling_property(
            0, vp_constituents, "form", lambda x: x in verbs.pleonastic_verbs)

        if not valid_verb:
            #((/^V.*/ < /^(?:seems|appears|means|follows)/)
            alternative_a_valid_verb, alternative_a_valid_verb_index = self.check_sibling_property(
                0, vp_constituents, "form", lambda x: x in verbs.alternative_a_pleonastic_verbs)
            if alternative_a_valid_verb:
                #  $.. /S|SBAR/)
                sbar, sbar_index = self.check_sibling_property(
                    vp_constituents, alternative_a_valid_verb_index, "tag", constituent_tags.simple_or_sub_phrase)
                return sbar
            alternative_b_valid_verb, alternative_b_valid_verb_index = self.check_sibling_property(
                0, vp_constituents, "form", lambda x: x in verbs.alternative_b_pleonastic_verbs)
            if alternative_b_valid_verb:
                # $.. PRT $.. /S|SBAR/))
                particle, particle_index = self.check_sibling_property(
                    vp_constituents, alternative_b_valid_verb_index, "tag", constituent_tags.particle_constituents)
                sbar, sbar_index = self.check_sibling_property(
                    vp_constituents, particle_index, "tag", constituent_tags.simple_or_sub_phrase)
                return sbar

        #(MD $.. (VP < ((/^V.*/ < /^(?:be|become)/)
        if not valid_verb:
            auxiliar_verb, auxiliar_verb_index = self.check_sibling_property(
                0, vp_constituents, "pos", pos_tags.modals)
            if auxiliar_verb:
                verb_phrase, verb_phrase_index = self.check_sibling_property(
                    wrapper_np_index, wrapper_np_siblings, "tag", constituent_tags.verb_phrases)
                vp_constituents = self.graph_builder.get_syntactic_children(verb_phrase)
                valid_verb, valid_verb_index = self.check_sibling_property(
                    0, vp_constituents, "form", lambda x: x in verbs.pleonastic_verbs)

        if not valid_verb:
            return False

        constituents_pri, constituents_pri_index = ([], 0)
        constituents_sec, constituents_sec_index = ([], 0)

        #$.. (@VP < (VBN $.. @S|SBAR))))
        second_verb, second_verb_index = self.check_sibling_property(
            valid_verb_index, vp_constituents, "tag", constituent_tags.verb_phrases)
        if second_verb:
            children = self.graph_builder.get_syntactic_children(second_verb)
            verb_form = self.check_sibling_property(
                0, children, "pos", constituent_tags.past_participle_verb)
            if verb_form:
                constituents_pri, constituents_pri_index = self.check_sibling_property(
                    0, children, "pos", constituent_tags.past_participle_verb)

        # $.. (ADJP $.. (/S|SBAR/))))
        # $.. (ADJP < (/S|SBAR/))))
        adjectival_phrase, adjectival_phrase_index = self.check_sibling_property(
            valid_verb_index, vp_constituents, "tag", constituent_tags.adjectival_prhases)
        if second_verb:
            children = self.graph_builder.get_syntactic_children(adjectival_phrase)
            constituents_pri, constituents_pri_index = (vp_constituents, adjectival_phrase_index)
            constituents_sec, constituents_sec_index = (children, 0)

        # $.. (NP < /S|SBAR/)))
        # $.. (NP $.. ADVP $.. /S|SBAR/)))
        noun_phrase, noun_phrase_index = self.check_sibling_property(
            valid_verb_index, vp_constituents, "tag", constituent_tags.noun_phrases)
        if second_verb:
            children = self.graph_builder.get_syntactic_children(noun_phrase)
            constituents_pri, constituents_pri_index = (0, children)
            adverbial_phrase, adverbial_phrase_index = self.check_sibling_property(
                noun_phrase_index, vp_constituents, "tag", constituent_tags.noun_phrases)
            if adjectival_phrase:
                constituents_sec, constituents_sec_index = (vp_constituents, adverbial_phrase_index)

        sbar1, sbar1_index = self.check_sibling_property(
            constituents_pri_index, constituents_pri, "tag", constituent_tags.simple_or_sub_phrase)
        sbar2, sbar1_index = self.check_sibling_property(
            constituents_sec_index, constituents_sec, "tag", constituent_tags.simple_or_sub_phrase)

        return sbar1 or sbar2