lib/pragmatic_tokenizer/processor.rb in pragmatic_tokenizer-0.1.12 vs lib/pragmatic_tokenizer/processor.rb in pragmatic_tokenizer-0.2.0

- old
+ new

@@ -18,11 +18,13 @@ shift_underscore(text) shift_asterisk(text) shift_at_symbol(text) convert_dbl_quotes(text) convert_sgl_quotes(text) - tokens = separate_full_stop(text.squeeze(' ').split.map { |t| convert_sym_to_punct(t.downcase) }) + shift_beginning_hyphen(text) + shift_ending_hyphen(text) + tokens = separate_full_stop(text.squeeze(' ').split.map { |t| convert_sym_to_punct(t) }) separate_other_ending_punc(tokens) end private @@ -56,10 +58,18 @@ def shift_upsidedown_exclamation(text) text.gsub!(/¡/, ' ¡ ') || text end + def shift_ending_hyphen(text) + text.gsub!(/-\s+/, ' - ') || text + end + + def shift_beginning_hyphen(text) + text.gsub!(/\s+-/, ' - ') || text + end + def shift_special_quotes(text) text.gsub!(/«/, ' « ') || text text.gsub!(/»/, ' » ') || text text.gsub!(/„/, ' „ ') || text text.gsub!(/“/, ' “ ') || text @@ -102,31 +112,35 @@ text.gsub!(/(\.\.+)/o) { ' ' + $1 + ' ' } || text text.gsub!(/(…+)/o) { ' ' + $1 + ' ' } || text end def separate_full_stop(tokens) - abbr = {} - @language::ABBREVIATIONS.each do |i| - abbr[i] = true - end - cleaned_tokens = [] - tokens.each_with_index do |_t, i| - if tokens[i + 1] && tokens[i] =~ /\A(.+)\.\z/ - w = $1 - unless abbr[w.downcase] || w =~ /\A[a-z]\z/i || - w =~ /[a-z](?:\.[a-z])+\z/i - cleaned_tokens << w - cleaned_tokens << '.' - next + if @language.eql?(Languages::English) || @language.eql?(Languages::Common) + abbr = {} + @language::ABBREVIATIONS.each do |i| + abbr[i] = true + end + cleaned_tokens = [] + tokens.each_with_index do |_t, i| + if tokens[i + 1] && tokens[i] =~ /\A(.+)\.\z/ + w = $1 + unless abbr[w.downcase] || w =~ /\A[a-z]\z/i || + w =~ /[a-z](?:\.[a-z])+\z/i + cleaned_tokens << w + cleaned_tokens << '.' + next + end end + cleaned_tokens << tokens[i] end - cleaned_tokens << tokens[i] + if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ + cleaned_tokens[-1] = $1 + cleaned_tokens.push '.' + end + cleaned_tokens + else + tokens.flat_map { |t| t =~ /\.\z/ && !@language::ABBREVIATIONS.include?(Unicode::downcase(t.split(".")[0])) && t.length > 2 ? t.split(".").flatten + ["."] : t } end - if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ - cleaned_tokens[-1] = $1 - cleaned_tokens.push '.' - end - cleaned_tokens end def separate_other_ending_punc(tokens) cleaned_tokens = [] tokens.each do |a| \ No newline at end of file