lib/opener/stanza/tokenizer_pos.rb in opener-chained-daemon-3.0.7 vs lib/opener/stanza/tokenizer_pos.rb in opener-chained-daemon-3.1.0
- old
+ new
@@ -6,10 +6,13 @@
VERSION = '1.0'
BASE_URL = ENV['STANZA_SERVER']
LANGUAGES_CACHE = Opener::ChainedDaemon::LanguagesCache.new
+ RTL_LANGUAGES = [ "ar", "ara", "arc", "ae", "ave", "egy", "he", "heb", "nqo", "pal", "phn", "sam",
+ "syc", "syr", "fa", "per", "fas", "ku", "kur", "ur", "urd" ]
+
POS = {
'DET' => 'D',
'ADJ' => 'G',
'NOUN' => 'N',
'VERB' => 'V',
@@ -44,18 +47,27 @@
raise Core::UnsupportedLanguageError, kaf.language if response.status == 406
raise response.body if response.status >= 400
tokens = JSON.parse response.body
w_index = 0
+
+ tokens.map{ |t| t.reverse! } if RTL_LANGUAGES.include? kaf.language
tokens.each_with_index do |sentence, s_index|
+ text = nil
+ misc = nil
sentence.each_with_index do |word|
w_index += 1
- misc = word['misc']
+ # fallback to previous token due to MWT
+ text = word['text'] || text
+ misc = word['misc'] || misc
+ next if misc.nil?
+
offset = misc.match(/start_char=(\d+)|/)[1].to_i
length = misc.match(/end_char=(\d+)/)[1].to_i - offset
u_pos = word['upos']
+ next if u_pos.nil? # MWT
pos = POS[u_pos]
raise "Didn't find a map for #{u_pos}" if pos.nil?
type = if POS_OPEN.include? pos then 'open' else 'close' end
params = {
@@ -63,10 +75,10 @@
sid: s_index + 1,
tid: w_index,
para: 1,
offset: offset,
length: length,
- text: word['text'],
+ text: text,
lemma: word['lemma'],
morphofeat: u_pos,
pos: pos,
type: type,
}
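
Note (not part of the diff above): the new code falls back to the previous token's "text" and "misc" because Stanza emits multi-word tokens (MWTs) as a container row that carries the surface form and character offsets, followed by sub-word rows that carry only "upos"/"lemma". Below is a minimal standalone sketch of that fallback; the sample sentence, hash contents, and printed output are illustrative assumptions, not the gem's actual data.

    # Hypothetical Stanza-style sentence: Spanish "del" expands to "de" + "el".
    sentence = [
      { 'text' => 'del', 'misc' => 'start_char=0|end_char=3' }, # MWT row: surface form + offsets, no upos
      { 'upos' => 'ADP', 'lemma' => 'de' },                     # sub-word: no text/misc of its own
      { 'upos' => 'DET', 'lemma' => 'el' },                     # sub-word: no text/misc of its own
    ]

    text = nil
    misc = nil
    sentence.each do |word|
      text = word['text'] || text   # fall back to the MWT row's surface form
      misc = word['misc'] || misc   # fall back to the MWT row's offsets
      next if misc.nil?

      offset = misc.match(/start_char=(\d+)/)[1].to_i
      length = misc.match(/end_char=(\d+)/)[1].to_i - offset
      next if word['upos'].nil?     # the MWT container row itself produces no token

      puts "#{word['upos']} #{text} offset=#{offset} length=#{length}"
    end
    # => ADP del offset=0 length=3
    # => DET del offset=0 length=3

Both sub-words inherit the container's text and offsets, which is what allows the tokenizer to keep emitting one KAF word form per sub-word without losing character positions.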