#!/usr/bin/env python

import sys
import codecs
import csv
import os
from operator import itemgetter

#from VUA_pylib.lexicon import MPQA_subjectivity_lexicon


def get_first_term_id(token_data, term_data, this_ids):
    ## Returns the id of the term whose span starts earliest in the token stream
    vector_tid_pos = []
    for tid in this_ids:
        span_token = term_data[tid][2]
        min_token_pos = min(token_data[tok_id][2] for tok_id in span_token)
        vector_tid_pos.append((tid, min_token_pos))
    vector_tid_pos.sort(key=itemgetter(1))
    return vector_tid_pos[0][0]


def get_mapping_from_lexicon(token_ids, lexicon):
    # Create an index mapping character offset --> token id
    idx = 0
    my_map = {}
    text = ' '
    for token, tid in token_ids:
        for c in token:
            my_map[idx] = tid
            idx += 1
        text += token + ' '
        idx += 1

    ####
    all_extracted = []   # List of (ids, polarity) tuples
    for substring, polarity in lexicon.items():
        current_found = 0
        while True:
            start = text.find(' ' + substring + ' ', current_found)
            if start == -1:
                break
            end = start + len(substring)
            current_found = end
            # text is shifted one position with respect to my_map (it starts
            # with an extra space), so my_map over range(start, end) covers
            # exactly the characters of the matched substring
            ids = set(my_map[myidx] for myidx in range(start, end) if myidx in my_map)
            if len(ids) != 0:
                all_extracted.append((ids, polarity))

    final_selected = {}
    # If w15 has been selected first, for instance, (w14,w15,w16) will not be
    # selected later: shorter matches take precedence over longer overlapping ones
    for ids, polarity in sorted(all_extracted, key=lambda t: len(t[0])):
        already_selected = False
        for this_id in ids:
            if this_id in final_selected:
                already_selected = True
        if not already_selected:
            for this_id in ids:
                final_selected[this_id] = polarity
    return final_selected
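
# A minimal usage sketch for get_mapping_from_lexicon (the tokens and the
# lexicon below are made-up examples, not taken from any real corpus):
#
#   token_ids = [('not', 'w1'), ('very', 'w2'), ('good', 'w3')]
#   lexicon = {'very good': 'positive', 'good': 'positive'}
#   get_mapping_from_lexicon(token_ids, lexicon)
#   # --> {'w3': 'positive'}: the shorter match 'good' claims w3 first,
#   #     so the longer overlapping 'very good' span is skipped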

def load_propagation_lexicon(propagation_lex_filename):
    ## Creates a lexicon (map) [lemma] --> polarity
    ## Expects ';'-separated lines with the polarity in field 2 and the
    ## lemma in field 4 (0-based)
    propagated_lexicon = {}
    if not os.path.exists(propagation_lex_filename):
        print>>sys.stderr, 'The propagated lexicon on', propagation_lex_filename, 'does not exist'
    else:
        fic = open(propagation_lex_filename, 'r')
        for line in fic:
            line = line.decode('utf-8').rstrip()
            tokens = line.split(';')
            lemma = tokens[4]
            polarity = tokens[2]
            propagated_lexicon[lemma] = polarity
        fic.close()
    return propagated_lexicon


def extract_features_from_kaf_naf_file(knaf_obj, out_file=None, log_file=None, include_class=True,
                                       accepted_opinions=None, exp_lex=None, tar_lex=None,
                                       propagation_lex_filename=None):
    labels = []
    polarities_found_and_skipped = []
    separator = '\t'
    restore_out = None

    log_on = False
    if log_file is not None:
        log_desc = codecs.open(log_file, 'w', encoding='UTF-8')
        log_on = True

    if out_file is not None:
        restore_out = sys.stdout
        sys.stdout = open(out_file, 'a')

    if log_on:
        print>>log_desc, 'Extracting features from', knaf_obj.get_filename()

    ###########################
    ## EXTRACTING TOKENS ######
    ###########################
    token_data = {}   ## token_data['w_1'] = ('house', 's_1', 0)  (text, sentence id, position)
    tokens_in_order = []
    num_token = 0
    tokens_ids = []
    for token_obj in knaf_obj.get_tokens():
        token = token_obj.get_text()
        s_id = token_obj.get_sent()
        w_id = token_obj.get_id()
        tokens_ids.append((token, w_id))
        token_data[w_id] = (token, s_id, num_token)
        tokens_in_order.append(w_id)
        num_token += 1

    if log_on:
        print>>log_desc, '  Number of tokens: ', len(tokens_in_order)
    ###########################

    # Lexicons from the training data
    mapping_wid_polarity = {}
    if exp_lex is not None:
        mapping_wid_polarity = get_mapping_from_lexicon(tokens_ids, exp_lex)

    mapping_wid_aspect = {}
    if tar_lex is not None:
        mapping_wid_aspect = get_mapping_from_lexicon(tokens_ids, tar_lex)

    propagated_lex = {}
    if propagation_lex_filename is not None:
        # Lexicon of [lemma] ==> polarity
        propagated_lex = load_propagation_lexicon(propagation_lex_filename)

    ###########################
    ## EXTRACTING TERMS #######
    ###########################
    term_data = {}          # term_data[term_id] = (term_lemma, term_pos, term_span, polarity)
    term_for_token = {}
    sentence_for_term = {}
    for term_obj in knaf_obj.get_terms():
        term_id = term_obj.get_id()
        term_lemma = term_obj.get_lemma()
        term_pos = term_obj.get_morphofeat()
        # If there is no morphofeat feature, we try to get the pos from the 'pos' attribute
        if term_pos is None:
            term_pos = term_obj.get_pos()
            if term_pos is not None:
                term_pos = term_pos.split(' ')[0]   # Only the first element of the pos string
            else:
                term_pos = 'unknown'
        term_span = term_obj.get_span().get_span_ids()
        sentiment = term_obj.get_sentiment()
        polarity = None
        if sentiment is not None:
            polarity = sentiment.get_polarity()
            if polarity is None:
                polarity = sentiment.get_modifier()
        if polarity is None:
            polarity = '-'
        term_data[term_id] = (term_lemma, term_pos, term_span, polarity)
        for tok_id in term_span:
            term_for_token[tok_id] = term_id
            if tok_id in token_data:
                sentence_id = token_data[tok_id][1]
                sentence_for_term[term_id] = sentence_id
            else:
                sentence_for_term[term_id] = '0'

    if log_on:
        print>>log_desc, '  Number of terms loaded: ' + str(len(term_data))
    ###########################

    ###########################
    # EXTRACTING ENTITIES FOR EACH TERM
    ###########################
    entity_for_term = {}
    for ent_obj in knaf_obj.get_entities():
        ent_type = ent_obj.get_type()
        for reference_obj in ent_obj.get_references():
            for span_obj in reference_obj:
                for t_id in span_obj.get_span_ids():
                    entity_for_term[t_id] = ent_type
    if log_on:
        print>>log_desc, 'Entities:' + str(entity_for_term)

    ###########################
    # EXTRACTING PROPERTIES FOR EACH TERM
    ###########################
    property_for_term = {}
    for prop_obj in knaf_obj.get_properties():
        prop_type = prop_obj.get_type()
        for reference_obj in prop_obj.get_references():
            for span_obj in reference_obj:
                for t_id in span_obj.get_span_ids():
                    property_for_term[t_id] = prop_type
    if log_on:
        print>>log_desc, 'Properties:' + str(property_for_term)

    ###########################
    # EXTRACTING CLASS FOR EACH TERM
    ###########################
    class_for_term_id = {}
    if include_class:
        for opinion in knaf_obj.get_opinions():
            ## Opinion expression
            opinion_id = opinion.get_id()
            opinion_exp = opinion.get_expression()
            exp_type = ''
            exp_strength = ''
            exp_ids = []
            if opinion_exp is not None:
                exp_type = opinion_exp.get_polarity()
                exp_strength = opinion_exp.get_strength()
                span = opinion_exp.get_span()
                if span is not None:
                    exp_ids = span.get_span_ids()

            ## Opinion holder
            opinion_hol = opinion.get_holder()
            hol_ids = []
            if opinion_hol is not None:
                span = opinion_hol.get_span()
                if span is not None:
                    hol_ids = span.get_span_ids()

            ## Opinion target
            opinion_tar = opinion.get_target()
            tar_ids = []
            if opinion_tar is not None:
                span = opinion_tar.get_span()
                if span is not None:
                    tar_ids = span.get_span_ids()

            ############################
            if accepted_opinions is not None:
                if exp_type in accepted_opinions:
                    # Get the mapped label
                    mapped_type = accepted_opinions[exp_type]
                else:
                    # This opinion won't be considered
                    polarities_found_and_skipped.append(exp_type)
                    continue
            else:
                mapped_type = exp_type

            if log_on:
                print>>log_desc, '  Opinion', opinion_id
                print>>log_desc, '    Expression:'
                print>>log_desc, '      ids:', exp_ids
                print>>log_desc, '      terms:', [term_data[i][0] for i in exp_ids]

            if len(exp_ids) != 0:
                first_term_id = get_first_term_id(token_data, term_data, exp_ids)
                for t_id in exp_ids:
                    if t_id == first_term_id:
                        bio_prefix = 'B-'
                    else:
                        bio_prefix = 'I-'
                    class_for_term_id[t_id] = bio_prefix + mapped_type

            if log_on:
                print>>log_desc, '    Target:'
                print>>log_desc, '      ids:', tar_ids
                print>>log_desc, '      terms:', [term_data[i][0] for i in tar_ids]

            if len(tar_ids) != 0:
                first_term_id = get_first_term_id(token_data, term_data, tar_ids)
                for t_id in tar_ids:
                    if t_id == first_term_id:
                        bio_prefix = 'B-'
                    else:
                        bio_prefix = 'I-'
                    class_for_term_id[t_id] = bio_prefix + 'target'
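
    # Illustration of the BIO encoding built in this section (hypothetical
    # term ids, not from a real file): for an opinion expression spanning
    # terms t7 and t8 whose polarity maps to 'positive', class_for_term_id
    # ends up as
    #     {'t7': 'B-positive', 't8': 'I-positive'}
    # Terms outside any expression/target/holder span receive the default
    # 'O' label when the output is written below.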

            if log_on:
                print>>log_desc, '    Holder:'
                print>>log_desc, '      ids:', hol_ids
                print>>log_desc, '      terms:', [term_data[i][0] for i in hol_ids]

            if len(hol_ids) != 0:
                first_term_id = get_first_term_id(token_data, term_data, hol_ids)
                for t_id in hol_ids:
                    if t_id == first_term_id:
                        bio_prefix = 'B-'
                    else:
                        bio_prefix = 'I-'
                    class_for_term_id[t_id] = bio_prefix + 'holder'
    ##############

    #my_mpqa_subj_lex = MPQA_subjectivity_lexicon()

    ## WRITE TO THE OUTPUT
    prev_sent = None
    for token_id in tokens_in_order:
        token, sentence_id, num_token = token_data[token_id]
        term_id = term_for_token.get(token_id, None)
        # This is required for wrong KAF files that contain missing terms
        # (tokens not linked with terms)
        if term_id is not None:
            data = term_data.get(term_id, None)
            if data is not None:
                term_lemma, term_pos, term_span, polarity = data
                entity = entity_for_term.get(term_id, '-')
                term_property = property_for_term.get(term_id, '-')
                this_class = class_for_term_id.get(term_id, 'O')

                '''
                # Mpqa subjectivity from the mpqa corpus
                mpqa_type = mpqa_pol = '-'
                if my_mpqa_subj_lex is not None:
                    mpqa_data = my_mpqa_subj_lex.get_type_and_polarity(token, term_pos)
                    if mpqa_data is not None:
                        mpqa_type, mpqa_pol = mpqa_data
                '''

                ## Constituency features
                constituency_extractor = knaf_obj.get_constituency_extractor()
                feature_phrase = 'XXX'
                if constituency_extractor is not None:
                    this_phrase, subsumed_together = constituency_extractor.get_deepest_phrase_for_termid(term_id)
                    if this_phrase is not None:
                        feature_phrase = this_phrase
                ######################

                ## Expression polarity from the domain lexicon
                polarity_from_domain = mapping_wid_polarity.get(token_id, '-')

                ## Polarity from the propagated lexicon
                polarity_from_propagation = propagated_lex.get(term_lemma, '-')

                ## Target (aspect) from the training lexicon
                aspect_from_domain = mapping_wid_aspect.get(token_id, '-')

                ########################################################################
                ## FEATURE GENERATION
                ########################################################################
                labels = ['sentence_id', 'token_id', 'token', 'lemma', 'pos', 'term_id',
                          'pol/mod', 'poldomain', 'aspect_training']
                features = [sentence_id, token_id, token, term_lemma, term_pos, term_id,
                            polarity, polarity_from_domain, aspect_from_domain]

                labels.extend(['entity', 'property', 'phrase_type', 'propagation_polarity', 'y'])
                features.extend([entity, term_property, feature_phrase, polarity_from_propagation, this_class])
                ########################################################################

                if prev_sent is not None and sentence_id != prev_sent:
                    print>>sys.stdout   # Break line between sentences
                print>>sys.stdout, (separator.join(features)).encode('utf-8')
                prev_sent = sentence_id

    print>>sys.stdout   # Last break line required for crfsuite

    ## Restoring
    if log_on:
        print>>log_desc
        log_desc.close()

    if restore_out is not None:
        sys.stdout.close()
        sys.stdout = restore_out

    return labels, separator, polarities_found_and_skipped
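

if __name__ == '__main__':
    # Minimal usage sketch (an assumption, not part of the original module):
    # it presumes a parser object exposing the interface used above
    # (get_tokens, get_terms, get_opinions, ...), such as the KafNafParser
    # class from the KafNafParserPy library. The output and log paths are
    # hypothetical.
    from KafNafParserPy import KafNafParser
    knaf_obj = KafNafParser(sys.argv[1])
    labels, separator, skipped = extract_features_from_kaf_naf_file(knaf_obj,
                                                                    out_file='features.crf',
                                                                    log_file='features.log')
    print>>sys.stderr, 'Labels:', separator.join(labels)
    print>>sys.stderr, 'Polarities found and skipped:', skipped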