require_relative 'term' require_relative 'opinion' module Opener class OpinionDetectorBasic ## # Class that detects opinions in a given input KAF file. # class Processor attr_accessor :document, :timestamp, :opinion_strength, :pretty ## # @param [String|IO] file The KAF file/input to process. # @param [Hash] options. Options for timestamp and including strength to # opinions. # @param [TrueClass|FalseClass] pretty Enable pretty formatting, disabled # by default due to the performance overhead. # def initialize(file, options = {}) @document = Nokogiri.XML file @timestamp = options[:timestamp] @opinion_strength = options[:opinion_strength] @pretty = options[:pretty] || false raise 'Error parsing input. Input is required to be KAF' unless is_kaf? end ## # Processes the input and returns the new KAF output. # @return [String] # def process add_opinions_layer index = 1 opinions.each do |opinion| add_opinion(opinion, index) index += 1 end add_linguistic_processor pretty ? pretty_print(document) : document.to_xml end def language @language ||= document.at_xpath('KAF').attr('xml:lang') end def terms @terms ||= document.xpath('KAF/terms/term').map do |term| Term.new(term, document, language) end end def opinions unless @opinions set_accumulated_strength apply_modifiers apply_conjunctions ## # Initialize opinions with their expressions. # @opinions = terms.map do |term| if term.is_expression? && term.accumulated_strength != 0 Opinion.new(term) end end.compact ## # Obtain targets for each opinion. # @opinions.each do |opinion| opinion.obtain_targets(sentences) end ## # Obtain holders for each opinion. # @opinions.each do |opinion| opinion.obtain_holders(sentences, language) end end @opinions end ## # Remove the opinions layer from the KAF file if it exists and add a new # one. def add_opinions_layer existing = document.at_xpath('KAF/opinions') existing.remove if existing new_node('opinions', 'KAF') end ## # Adds the entire opinion in the KAF file. # def add_opinion(opinion, index) opinion_node = new_node("opinion", "KAF/opinions") opinion_node['oid'] = "o#{index.to_s}" unless opinion.holders.empty? opinion_holder_node = new_node("opinion_holder", opinion_node) add_opinion_element(opinion_holder_node, opinion.holders) end opinion_target_node = new_node("opinion_target", opinion_node) unless opinion.target_ids.empty? add_opinion_element(opinion_target_node, opinion.target_ids) end expression_node = new_node("opinion_expression", opinion_node) expression_node['polarity'] = opinion.polarity expression_node['strength'] = opinion.strength.to_s add_opinion_element(expression_node, opinion.ids) end ## # Method for adding opinion holders, targets and expressions. # def add_opinion_element(node, ids) lemmas = terms.select{|t| ids.include?(t.id)}.map(&:lemma).join(" ") comment = Nokogiri::XML::Comment.new(document, "#{lemmas}") node.add_child comment span_node = new_node("span", node) ids.each do |id| target_node = new_node("target", span_node) target_node['id'] = id.to_s end end ## # Add linguistic processor layer with basic information # (version, timestamp, description etc) in the KAF file. # def add_linguistic_processor description = 'Basic opinion detector with Pos' last_edited = '13may2015' version = '2.0' node = new_node('linguisticProcessors', 'KAF/kafHeader') node['layer'] = 'opinions' lp_node = new_node('lp', node) lp_node['version'] = "#{last_edited}-#{version}" lp_node['name'] = description if timestamp format = '%Y-%m-%dT%H:%M:%S%Z' lp_node['timestamp'] = Time.now.strftime(format) else lp_node['timestamp'] = '*' end end ## # Format the output document properly. # # TODO: this should be handled by Oga in a nice way. # # @return [String] # def pretty_print(document) doc = REXML::Document.new document.to_xml doc.context[:attribute_quote] = :quote out = "" formatter = REXML::Formatters::Pretty.new formatter.compact = true formatter.write(doc, out) out.strip end ## # Get terms grouped by sentence. # def sentences @sentences ||= terms.group_by{|t| t.sentence} end protected ## # The strength of a term depends heavily on the type of the previous # one. For example if the previous one is a shifter, it needs # to be multiplied. If it's an intensifier, it needs to be # added (or subtracted depending on the strength of the previous # term) etc. # def set_accumulated_strength symbol = :+ terms_count = terms.count terms.each_with_index do |term, i| if i+1 < terms_count if terms[i+1].is_shifter? if term.accumulated_strength != 0 terms[i+1].accumulated_strength *= term.accumulated_strength terms[i+1].list_ids += term.list_ids term.use = false symbol = terms[i+1].accumulated_strength > 0 ? :+ : :- else symbol = :* end elsif terms[i+1].is_intensifier? terms[i+1].accumulated_strength = term.accumulated_strength.send(symbol, terms[i+1].accumulated_strength) term.use = false symbol = terms[i+1].accumulated_strength > 0 ? :+ : :- if term.accumulated_strength != 0 terms[i+1].list_ids += term.list_ids end else symbol = terms[i+1].accumulated_strength >= 0 ? :+ : :- end end end end ## # Apply strength to the next term after a shifter or intensifier. # def apply_modifiers terms_count = terms.count terms.each_with_index do |term, i| if i+1 < terms_count if term.use && (term.is_shifter? || term.is_intensifier?) terms[i+1].accumulated_strength *= term.accumulated_strength terms[i+1].list_ids += term.list_ids term.use = false end end end end ## # Ignore conjunctions when applying strength. # def apply_conjunctions terms_count = terms.count i = 0 while i < terms_count if terms[i].use && terms[i].accumulated_strength != 0 used = [i] list_ids = terms[i].list_ids strength = terms[i].accumulated_strength terms[i].use = false j = i+1 while true if j >= terms_count break end if terms[j].is_conjunction terms[j].use = false j += 1 elsif terms[j].use && terms[j].accumulated_strength != 0 list_ids += terms[j].list_ids used << j terms[j].use = false strength += terms[j].accumulated_strength j += 1 else break end end last_used = used.last terms[last_used].accumulated_strength = strength terms[last_used].list_ids = list_ids terms[last_used].use = true i = j end i += 1 end end ## # Creates a new node in the KAF file. # def new_node(tag, parent) if parent.is_a?(String) parent_node = document.at_xpath(parent) else parent_node = parent end node = Nokogiri::XML::Element.new(tag, document) parent_node.add_child node node end ## # Check if input is a KAF file. # @return [Boolean] # def is_kaf? !!document.at_xpath('KAF') end end end end