lib/pragmatic_tokenizer/processor.rb in pragmatic_tokenizer-0.1.9 vs lib/pragmatic_tokenizer/processor.rb in pragmatic_tokenizer-0.1.10
- old
+ new
@@ -12,10 +12,11 @@
shift_upsidedown_exclamation(text)
shift_ellipse(text)
shift_special_quotes(text)
shift_colon(text)
shift_bracket(text)
+ shift_semicolon(text)
convert_dbl_quotes(text)
convert_sgl_quotes(text)
tokens = separate_full_stop(text.squeeze(' ').split.map { |t| convert_sym_to_punct(t.downcase) })
separate_other_ending_punc(tokens)
end
@@ -70,9 +71,13 @@
!(/\A\d+/ == text.partition(':').last[0]) &&
!(/\A\d+/ == text.partition(':').first[-1])
# Ignore web addresses
text.gsub!(/(?<=[http|https]):(?=\/\/)/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[":"]) || text
text.gsub!(/:/o, ' :') || text
+ end
+
# Pads every semicolon in +text+ with one space on each side so that it can be
# split off as its own token. The substitution uses gsub!, so +text+ itself is
# modified in place.
#
# Returns +text+ in all cases: gsub! returns nil when no semicolon was found,
# and the trailing `|| text` turns that nil back into the (unchanged) string.
#
# The /o flag only affects patterns containing interpolation; this pattern has
# none, so it does not change what is matched.
+ def shift_semicolon(text)
+ text.gsub!(/([;])/o) { ' ' + $1 + ' ' } || text
end
# Isolates each ellipsis (a run of three or more consecutive dots) by padding
# it with one space on either side. The rewrite is applied to +text+ in place.
#
# Always returns +text+, whether or not an ellipsis was present.
def shift_ellipse(text)
  changed = text.gsub!(/(\.\.\.+)/o) do
    dots = Regexp.last_match(1)
    " #{dots} "
  end
  # gsub! reports "nothing matched" as nil; hand back the untouched string then.
  return text if changed.nil?

  changed
end
\ No newline at end of file