lib/opener/stanza/tokenizer_pos.rb in opener-chained-daemon-3.0.7 vs lib/opener/stanza/tokenizer_pos.rb in opener-chained-daemon-3.1.0
- old
+ new
@@ -6,10 +6,13 @@
VERSION = '1.0'
BASE_URL = ENV['STANZA_SERVER']
LANGUAGES_CACHE = Opener::ChainedDaemon::LanguagesCache.new
+ RTL_LANGUAGES = [ "ar", "ara", "arc", "ae", "ave", "egy", "he", "heb", "nqo", "pal", "phn", "sam",
+ "syc", "syr", "fa", "per", "fas", "ku", "kur", "ur", "urd" ]
+
POS = {
'DET' => 'D',
'ADJ' => 'G',
'NOUN' => 'N',
'VERB' => 'V',
@@ -44,18 +47,27 @@
raise Core::UnsupportedLanguageError, kaf.language if response.status == 406
raise response.body if response.status >= 400
tokens = JSON.parse response.body
w_index = 0
+
+ tokens.map{ |t| t.reverse! } if RTL_LANGUAGES.include? kaf.language
tokens.each_with_index do |sentence, s_index|
+ text = nil
+ misc = nil
sentence.each_with_index do |word|
w_index += 1
- misc = word['misc']
+ # fallback to previous token due to MWT
+ text = word['text'] || text
+ misc = word['misc'] || misc
+ next if misc.nil?
+
offset = misc.match(/start_char=(\d+)|/)[1].to_i
length = misc.match(/end_char=(\d+)/)[1].to_i - offset
u_pos = word['upos']
+ next if u_pos.nil? # MWT
pos = POS[u_pos]
raise "Didn't find a map for #{u_pos}" if pos.nil?
type = if POS_OPEN.include? pos then 'open' else 'close' end
params = {
@@ -63,10 +75,10 @@
sid: s_index + 1,
tid: w_index,
para: 1,
offset: offset,
length: length,
- text: word['text'],
+ text: text,
lemma: word['lemma'],
morphofeat: u_pos,
pos: pos,
type: type,
}
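
Note (not part of the diff above): the new code falls back to the previous token's "text" and "misc" because Stanza emits multi-word tokens (MWTs) as a container row that carries the surface form and character offsets, followed by sub-word rows that carry only "upos"/"lemma". Below is a minimal standalone sketch of that fallback; the sample sentence, hash contents, and printed output are illustrative assumptions, not the gem's actual data.

    # Hypothetical Stanza-style sentence: Spanish "del" expands to "de" + "el".
    sentence = [
      { 'text' => 'del', 'misc' => 'start_char=0|end_char=3' }, # MWT row: surface form + offsets, no upos
      { 'upos' => 'ADP', 'lemma' => 'de' },                     # sub-word: no text/misc of its own
      { 'upos' => 'DET', 'lemma' => 'el' },                     # sub-word: no text/misc of its own
    ]

    text = nil
    misc = nil
    sentence.each do |word|
      text = word['text'] || text   # fall back to the MWT row's surface form
      misc = word['misc'] || misc   # fall back to the MWT row's offsets
      next if misc.nil?

      offset = misc.match(/start_char=(\d+)/)[1].to_i
      length = misc.match(/end_char=(\d+)/)[1].to_i - offset
      next if word['upos'].nil?     # the MWT container row itself produces no token

      puts "#{word['upos']} #{text} offset=#{offset} length=#{length}"
    end
    # => ADP del offset=0 length=3
    # => DET del offset=0 length=3

Both sub-words inherit the container's text and offsets, which is what allows the tokenizer to keep emitting one KAF word form per sub-word without losing character positions.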