lib/opener/stanza/tokenizer_pos.rb in opener-chained-daemon-3.1.0 vs lib/opener/stanza/tokenizer_pos.rb in opener-chained-daemon-3.1.1
- old
+ new
@@ -50,35 +50,37 @@
w_index = 0
tokens.map{ |t| t.reverse! } if RTL_LANGUAGES.include? kaf.language
tokens.each_with_index do |sentence, s_index|
- text = nil
- misc = nil
+ miscs = {}
sentence.each_with_index do |word|
w_index += 1
- # fallback to previous token due to MWT
- text = word['text'] || text
- misc = word['misc'] || misc
- next if misc.nil?
+ # save misc for later usage in an MWT case
+ if word['id'].is_a? Array
+ miscs['id'].each { |id| miscs[id] = word['misc'] }
+ next
+ end
+ misc = word['misc'] || miscs[word['id']]
offset = misc.match(/start_char=(\d+)|/)[1].to_i
length = misc.match(/end_char=(\d+)/)[1].to_i - offset
u_pos = word['upos']
- next if u_pos.nil? # MWT
pos = POS[u_pos]
- raise "Didn't find a map for #{u_pos}" if pos.nil?
+ Rollbar.scoped({ input: input, params: params, tokens: tokens }) do
+ raise "Didn't find a map for #{u_pos}"
+ end if pos.nil?
type = if POS_OPEN.include? pos then 'open' else 'close' end
params = {
wid: w_index,
sid: s_index + 1,
tid: w_index,
para: 1,
offset: offset,
length: length,
- text: text,
+ text: word['text'],
lemma: word['lemma'],
morphofeat: u_pos,
pos: pos,
type: type,
}