lib/pragmatic_tokenizer/processor.rb in pragmatic_tokenizer-0.1.12 vs lib/pragmatic_tokenizer/processor.rb in pragmatic_tokenizer-0.2.0
- old
+ new
@@ -18,11 +18,13 @@
shift_underscore(text)
shift_asterisk(text)
shift_at_symbol(text)
convert_dbl_quotes(text)
convert_sgl_quotes(text)
- tokens = separate_full_stop(text.squeeze(' ').split.map { |t| convert_sym_to_punct(t.downcase) })
+ shift_beginning_hyphen(text)
+ shift_ending_hyphen(text)
+ tokens = separate_full_stop(text.squeeze(' ').split.map { |t| convert_sym_to_punct(t) })
separate_other_ending_punc(tokens)
end
private
@@ -56,10 +58,18 @@
# Pads every inverted exclamation mark (¡) with one space on each side,
# editing +text+ in place.
#
# @param text [String] mutable string to edit
# @return [String] the same +text+ object, whether or not anything matched
def shift_upsidedown_exclamation(text)
  # gsub! answers nil when nothing was replaced, so its result is ignored
  # and the receiver itself is handed back.
  text.gsub!(/¡/, ' ¡ ')
  text
end
# Added in 0.2.0 (the leading "+" on each line is the diff marker).
# Replaces every hyphen that is followed by one or more whitespace
# characters -- the hyphen together with that whitespace run -- with ' - ',
# editing `text` in place.
# gsub! returns nil when nothing matched, so `|| text` makes the expression
# evaluate to `text` in either case.
# A hyphen with no whitespace after it (for example as the final character
# of `text`) does not match /-\s+/ and is left untouched.
+ def shift_ending_hyphen(text)
+ text.gsub!(/-\s+/, ' - ') || text
+ end
+
# Added in 0.2.0 (the leading "+" on each line is the diff marker).
# Replaces every run of whitespace that is immediately followed by a hyphen
# -- the whitespace together with the hyphen -- with ' - ', editing `text`
# in place.
# gsub! returns nil when nothing matched, so `|| text` makes the expression
# evaluate to `text` in either case.
# A hyphen with no whitespace before it (for example as the first character
# of `text`) does not match /\s+-/ and is left untouched.
+ def shift_beginning_hyphen(text)
+ text.gsub!(/\s+-/, ' - ') || text
+ end
+
def shift_special_quotes(text)
text.gsub!(/«/, ' « ') || text
text.gsub!(/»/, ' » ') || text
text.gsub!(/„/, ' „ ') || text
text.gsub!(/“/, ' “ ') || text
@@ -102,31 +112,35 @@
text.gsub!(/(\.\.+)/o) { ' ' + $1 + ' ' } || text
text.gsub!(/(…+)/o) { ' ' + $1 + ' ' } || text
end
# Splits a trailing full stop off tokens into its own '.' token, keeping
# abbreviations intact.
#
# This span is a diff: lines starting with "-" are the 0.1.12 body, lines
# starting with "+" are the 0.2.0 body, unmarked lines are common to both.
#
# 0.1.12: one code path for every language (the "-" lines).
# 0.2.0:  the same code path is kept only when @language is
#         Languages::English or Languages::Common; every other language
#         goes through the flat_map one-liner in the `else` branch.
#
# English/Common path, per token that has a successor and ends in '.':
# the stem before the period is kept attached to its period when
#   * the downcased stem is listed in @language::ABBREVIATIONS, or
#   * the stem is a single letter a-z (case-insensitive), or
#   * the stem ends in a letter followed by one or more ".letter" groups;
# otherwise the stem and '.' are pushed as two separate tokens.
# The last token has no successor, so it is handled after the loop.
#
# tokens  - Array of String tokens.
# Returns an Array of tokens with separated full stops.
def separate_full_stop(tokens)
- abbr = {}
- @language::ABBREVIATIONS.each do |i|
- abbr[i] = true
- end
- cleaned_tokens = []
- tokens.each_with_index do |_t, i|
- if tokens[i + 1] && tokens[i] =~ /\A(.+)\.\z/
- w = $1
- unless abbr[w.downcase] || w =~ /\A[a-z]\z/i ||
- w =~ /[a-z](?:\.[a-z])+\z/i
- cleaned_tokens << w
- cleaned_tokens << '.'
- next
+ if @language.eql?(Languages::English) || @language.eql?(Languages::Common)
+ abbr = {}
+ @language::ABBREVIATIONS.each do |i|
+ abbr[i] = true
+ end
+ cleaned_tokens = []
+ tokens.each_with_index do |_t, i|
+ if tokens[i + 1] && tokens[i] =~ /\A(.+)\.\z/
+ w = $1
+ unless abbr[w.downcase] || w =~ /\A[a-z]\z/i ||
+ w =~ /[a-z](?:\.[a-z])+\z/i
+ cleaned_tokens << w
+ cleaned_tokens << '.'
+ next
+ end
end
+ cleaned_tokens << tokens[i]
end
- cleaned_tokens << tokens[i]
# Final token: split its period whenever the text before it ends in a word
# character; no abbreviation lookup is applied here.
+ if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/
+ cleaned_tokens[-1] = $1
+ cleaned_tokens.push '.'
+ end
+ cleaned_tokens
+ else
# Other languages: a token ending in '.', longer than 2 characters, whose
# first dot-separated piece (Unicode-downcased) is not an abbreviation is
# replaced by its dot-separated pieces followed by a single ".".
# NOTE(review): split(".") discards interior periods, so "a.b." would come
# out as "a", "b", "." -- confirm this is intended.
# NOTE(review): for a token made only of periods, split(".")[0] is nil and
# the length guard is evaluated last, so Unicode::downcase is handed nil --
# confirm how the Unicode library reacts.
+ tokens.flat_map { |t| t =~ /\.\z/ && !@language::ABBREVIATIONS.include?(Unicode::downcase(t.split(".")[0])) && t.length > 2 ? t.split(".").flatten + ["."] : t }
end
- if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/
- cleaned_tokens[-1] = $1
- cleaned_tokens.push '.'
- end
- cleaned_tokens
end
def separate_other_ending_punc(tokens)
cleaned_tokens = []
tokens.each do |a|
\ No newline at end of file