module Opener module Stanza class TokenizerPos DESC = 'Tokenizer / POS by Stanza' VERSION = '1.0' BASE_URL = ENV['STANZA_SERVER'] LANGUAGES_CACHE = Opener::ChainedDaemon::LanguagesCache.new RTL_LANGUAGES = [ "ar", "ara", "arc", "ae", "ave", "egy", "he", "heb", "nqo", "pal", "phn", "sam", "syc", "syr", "fa", "per", "fas", "ku", "kur", "ur", "urd" ] POS = { 'DET' => 'D', 'ADJ' => 'G', 'NOUN' => 'N', 'VERB' => 'V', 'AUX' => 'V', 'ADV' => 'A', 'CCONJ' => 'J', 'PUNCT' => '.', 'ADP' => 'P', 'PRON' => 'Q', 'PROPN' => 'R', 'PART' => 'P', 'NUM' => 'O', 'X' => 'O', 'SYM' => 'O', 'SCONJ' => 'P', 'INTJ' => 'O', } POS_OPEN = %w[N R G V A O] def run input, params raise 'missing Stanza server' if ENV['STANZA_SERVER'].blank? kaf = KAF::Document.from_xml input prod = params[:cache_keys][:environment] != 'staging' if prod and !LANGUAGES_CACHE.get.include?(kaf.language) raise Core::UnsupportedLanguageError.new kaf.language end input = kaf.raw input = input.gsub(/\,[^\ ]/, ', ') response = Faraday.post BASE_URL, {lang: kaf.language, input: input}.to_query raise Core::UnsupportedLanguageError, kaf.language if response.status == 406 raise response.body if response.status >= 400 tokens = JSON.parse response.body w_index = 0 miscs = {} tokens.each_with_index do |t, i| miscs[i] ||= {} t.each do |word| word['id'].is_a?(Array) && word['id'].each do |id| puts id miscs[i][id] = word['misc'] end end end tokens.map{ |t| t.reverse! } if RTL_LANGUAGES.include? kaf.language tokens.each_with_index do |sentence, s_index| sentence.each_with_index do |word| w_index += 1 # save misc for later usase in a MWT case next if word['id'].is_a? Array misc = word['misc'] || miscs[s_index][word['id']] Rollbar.scoped({ input: input, params: params, tokens: tokens, word: word }) do raise 'Missing misc' end if misc.nil? offset = misc.match(/start_char=(\d+)|/)[1].to_i length = misc.match(/end_char=(\d+)/)[1].to_i - offset u_pos = word['upos'] pos = POS[u_pos] raise "Didn't find a map for #{u_pos}" if pos.nil? type = if POS_OPEN.include? pos then 'open' else 'close' end params = { wid: w_index, sid: s_index + 1, tid: w_index, para: 1, offset: offset, length: length, text: word['text'], lemma: word['lemma'], morphofeat: u_pos, pos: pos, type: type, } kaf.add_word_form params kaf.add_term params end end kaf.add_linguistic_processor DESC, "#{VERSION}", 'text', timestamp: true kaf.to_xml end end end end