lib/pragmatic_tokenizer/processor.rb in pragmatic_tokenizer-0.2.1 vs lib/pragmatic_tokenizer/processor.rb in pragmatic_tokenizer-0.2.2

- old
+ new

@@ -20,10 +20,14 @@
 shift_at_symbol(text)
 convert_dbl_quotes(text)
 convert_sgl_quotes(text)
 shift_beginning_hyphen(text)
 shift_ending_hyphen(text)
- tokens = separate_full_stop(text.squeeze(' ').split.map { |t| convert_sym_to_punct(t) })
+ tokens = separate_full_stop(text.squeeze(' ')
+ .split
+ .flat_map { |t| (t[0] == '‚' || t[0] == ',') && t.length > 1 ? t.split(/(,|‚)/).flatten : t }
+ .flat_map { |t| (t[-1] == '’' || t[-1] == "'") && t.length > 1 ? t.split(/(’|')/).flatten : t }
+ .map { |t| convert_sym_to_punct(t) })
 separate_other_ending_punc(tokens)
 end
 
 private
\ No newline at end of file