Sha256: 3bb983b21a7e2dab9996c745e1873c073f191807557b774a8973834bc68638d5
Contents?: true
Size: 1014 Bytes
Versions: 6
Compression:
Stored size: 1014 Bytes
Contents
# -*- encoding : utf-8 -*-

module PragmaticTokenizer
  # Separates true full stops from the tokens they trail, while leaving
  # periods that are part of an abbreviation attached to their token.
  #
  # NOTE(review): relies on +Unicode::downcase+ being loaded elsewhere
  # (presumably the `unicode` gem -- confirm against the gemspec).
  class FullStopSeparator
    # A token made of at least one character followed by a trailing period;
    # capture 1 is everything before that period.
    TRAILING_FULL_STOP = /\A(.+)\.\z/
    # A stem that is exactly one letter a-z (case-insensitive), e.g. "J".
    SINGLE_LETTER = /\A[a-z]\z/i
    # A stem ending in period-separated letters, e.g. "U.S.A" or "e.g".
    DOTTED_LETTERS = /[a-z](?:\.[a-z])+\z/i
    # A final token whose trailing period directly follows a word character;
    # capture 1 is everything before that period.
    FINAL_FULL_STOP = /\A(.*\w)\.\z/

    private_constant :TRAILING_FULL_STOP, :SINGLE_LETTER,
                     :DOTTED_LETTERS, :FINAL_FULL_STOP

    attr_reader :tokens, :abbreviations

    # @param tokens [Array] tokens to scan; accessed by index
    # @param abbreviations [Enumerable] abbreviations, compared against the
    #   downcased token stem (the token without its trailing period)
    def initialize(tokens:, abbreviations:)
      @tokens = tokens
      @abbreviations = abbreviations
    end

    # Builds a new token list in which sentence-ending periods are tokens of
    # their own. The receiver's +tokens+ array is not modified.
    #
    # A token that is followed by another token and ends in a period is split
    # into its stem and '.', unless the stem is a known abbreviation, a single
    # letter, or ends in dotted letters. The last element of the result is
    # split whenever it ends in a word character plus a period; the
    # abbreviation list is not consulted for it.
    #
    # @return [Array] the tokens with full stops separated
    def separate
      lookup = abbreviations.each_with_object({}) do |abbreviation, known|
        known[abbreviation] = true
      end

      separated = []
      tokens.each_with_index do |token, index|
        # Only tokens with a successor are candidates here; the final token
        # is handled by split_final_full_stop below.
        stem = full_stop_stem(token, lookup) if tokens[index + 1]
        if stem
          separated << stem << '.'
        else
          separated << token
        end
      end

      split_final_full_stop(separated)
    end

    private

    # Returns the part of +token+ before its trailing period when that period
    # is a true full stop, or nil when the token must be kept as it is.
    def full_stop_stem(token, lookup)
      return unless token =~ TRAILING_FULL_STOP

      # Capture the stem before the next regexp match overwrites the match data.
      stem = Regexp.last_match(1)
      return if lookup[Unicode::downcase(stem)]
      return if stem =~ SINGLE_LETTER || stem =~ DOTTED_LETTERS

      stem
    end

    # Splits a trailing period off the last element of +separated+ in place
    # and returns +separated+.
    def split_final_full_stop(separated)
      final = separated.last
      if final && final =~ FINAL_FULL_STOP
        separated[-1] = Regexp.last_match(1)
        separated << '.'
      end
      separated
    end
  end
end
Version data entries
6 entries across 6 versions & 1 rubygems