lib/pragmatic_tokenizer/processor.rb in pragmatic_tokenizer-0.1.9 vs lib/pragmatic_tokenizer/processor.rb in pragmatic_tokenizer-0.1.10

- old
+ new

@@ -12,10 +12,11 @@
   shift_upsidedown_exclamation(text)
   shift_ellipse(text)
   shift_special_quotes(text)
   shift_colon(text)
   shift_bracket(text)
+  shift_semicolon(text)
   convert_dbl_quotes(text)
   convert_sgl_quotes(text)
   tokens = separate_full_stop(text.squeeze(' ').split.map { |t| convert_sym_to_punct(t.downcase) })
   separate_other_ending_punc(tokens)
 end
@@ -70,9 +71,13 @@
   !(/\A\d+/ == text.partition(':').last[0]) && !(/\A\d+/ == text.partition(':').first[-1])
   # Ignore web addresses
   text.gsub!(/(?<=[http|https]):(?=\/\/)/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[":"]) || text
   text.gsub!(/:/o, ' :') || text
+end
+
+def shift_semicolon(text)
+  text.gsub!(/([;])/o) { ' ' + $1 + ' ' } || text
 end
 def shift_ellipse(text)
   text.gsub!(/(\.\.\.+)/o) { ' ' + $1 + ' ' } || text
 end
\ No newline at end of file