lib/opener/stanza/tokenizer_pos.rb in opener-chained-daemon-3.1.0 vs lib/opener/stanza/tokenizer_pos.rb in opener-chained-daemon-3.1.1

- old
+ new

@@ -50,35 +50,37 @@ w_index = 0 tokens.map{ |t| t.reverse! } if RTL_LANGUAGES.include? kaf.language tokens.each_with_index do |sentence, s_index| - text = nil - misc = nil + miscs = {} sentence.each_with_index do |word| w_index += 1 - # fallback to previous token due to MWT - text = word['text'] || text - misc = word['misc'] || misc - next if misc.nil? + # save misc for later usage in an MWT case + if word['id'].is_a? Array + miscs['id'].each { |id| miscs[id] = word['misc'] } + next + end + misc = word['misc'] || miscs[word['id']] offset = misc.match(/start_char=(\d+)|/)[1].to_i length = misc.match(/end_char=(\d+)/)[1].to_i - offset u_pos = word['upos'] - next if u_pos.nil? # MWT pos = POS[u_pos] - raise "Didn't find a map for #{u_pos}" if pos.nil? + Rollbar.scoped({ input: input, params: params, tokens: tokens }) do + raise "Didn't find a map for #{u_pos}" + end if pos.nil? type = if POS_OPEN.include? pos then 'open' else 'close' end params = { wid: w_index, sid: s_index + 1, tid: w_index, para: 1, offset: offset, length: length, - text: text, + text: word['text'], lemma: word['lemma'], morphofeat: u_pos, pos: pos, type: type, }