lib/opener/stanza/tokenizer_pos.rb in opener-chained-daemon-3.0.7 vs lib/opener/stanza/tokenizer_pos.rb in opener-chained-daemon-3.1.0

- old
+ new

@@ -6,10 +6,13 @@
       VERSION = '1.0'
 
       BASE_URL = ENV['STANZA_SERVER']
       LANGUAGES_CACHE = Opener::ChainedDaemon::LanguagesCache.new
 
+      RTL_LANGUAGES = [ "ar", "ara", "arc", "ae", "ave", "egy", "he", "heb", "nqo", "pal", "phn", "sam",
+        "syc", "syr", "fa", "per", "fas", "ku", "kur", "ur", "urd" ]
+
       POS = {
         'DET'  => 'D',
         'ADJ'  => 'G',
         'NOUN' => 'N',
         'VERB' => 'V',
@@ -44,18 +47,27 @@
         raise Core::UnsupportedLanguageError, kaf.language if response.status == 406
         raise response.body if response.status >= 400
 
         tokens = JSON.parse response.body
         w_index = 0
+
+        tokens.map{ |t| t.reverse! } if RTL_LANGUAGES.include? kaf.language
         tokens.each_with_index do |sentence, s_index|
+          text = nil
+          misc = nil
           sentence.each_with_index do |word|
             w_index += 1
-            misc = word['misc']
+            # fallback to previous token due to MWT
+            text = word['text'] || text
+            misc = word['misc'] || misc
+            next if misc.nil?
+
             offset = misc.match(/start_char=(\d+)|/)[1].to_i
             length = misc.match(/end_char=(\d+)/)[1].to_i - offset
 
             u_pos = word['upos']
+            next if u_pos.nil? # MWT
             pos = POS[u_pos]
             raise "Didn't find a map for #{u_pos}" if pos.nil?
 
             type = if POS_OPEN.include? pos then 'open' else 'close' end
             params = {
@@ -63,10 +75,10 @@
               sid: s_index + 1,
               tid: w_index,
               para: 1,
               offset: offset,
               length: length,
-              text: word['text'],
+              text: text,
               lemma: word['lemma'],
               morphofeat: u_pos,
               pos: pos,
               type: type,
             }
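
The fallbacks added in the second hunk deal with Stanza's multi-word tokens (MWT): a contraction such as Spanish "del" comes back as a range token that carries the character offsets in 'misc', followed by syntactic sub-words ("de", "el") that carry 'upos' and 'lemma' but no offsets. The sketch below walks such a sentence with the same fallback logic; the response shape and the "del" example are assumptions for illustration, not taken from the gem's fixtures. The RTL_LANGUAGES change is separate: for right-to-left languages each sentence's word list is reversed in place before this loop runs.

    # Assumed shape of one Stanza sentence containing an MWT ("del" = "de" + "el"):
    # the range token has 'misc' offsets but no 'upos'; the sub-words have
    # 'upos'/'lemma' but no 'misc'.
    sentence = [
      { 'id' => [1, 2], 'text' => 'del', 'misc' => 'start_char=0|end_char=3' },
      { 'id' => 1, 'text' => 'de', 'upos' => 'ADP', 'lemma' => 'de' },
      { 'id' => 2, 'text' => 'el', 'upos' => 'DET', 'lemma' => 'el' },
    ]

    text = nil
    misc = nil
    sentence.each do |word|
      # Same fallback as in the diff: remember the last non-nil text/misc so
      # the sub-words inherit the range token's offsets.
      text = word['text'] || text
      misc = word['misc'] || misc
      next if misc.nil?

      offset = misc.match(/start_char=(\d+)/)[1].to_i
      length = misc.match(/end_char=(\d+)/)[1].to_i - offset
      next if word['upos'].nil? # the range token itself carries no POS, so it is skipped

      puts "text=#{word['text']} upos=#{word['upos']} offset=#{offset} length=#{length}"
    end
    # => text=de upos=ADP offset=0 length=3
    # => text=el upos=DET offset=0 length=3

Both sub-words end up pointing at the same span of the input, which is why the word loop can emit them without offsets of their own.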