Sha256: 1983cb3d8654185c424c8b62285ad3a19be7785bcdd6eefd0103c7a40bfcff3f
Contents?: true
Size: 1.48 KB
Versions: 3
Compression:
Stored size: 1.48 KB
Contents
# -*- encoding : utf-8 -*-
module PragmaticTokenizer
  # Separates true full stops from the tokens they are attached to while
  # leaving periods that are part of an abbreviation in place.
  #
  # A token followed by another token is split into its stem and a
  # standalone '.' unless the stem is a listed abbreviation, a single ASCII
  # letter, or ends in a dotted letter sequence such as "u.s.a". The final
  # token is only checked against the abbreviation list before being split.
  class FullStopSeparator
    # A token made of at least one character followed by a closing period.
    TRAILING_PERIOD = /\A(.+)\.\z/
    # The final token: anything ending in a word character plus a period.
    FINAL_PERIOD = /\A(.*\w)\.\z/
    # A stem that is exactly one ASCII letter (an initial such as "j").
    SINGLE_LETTER = /\A[a-z]\z/i
    # A stem ending in period-separated single letters (such as "u.s.a").
    DOTTED_LETTERS = /[a-z](?:\.[a-z])+\z/i

    attr_reader :tokens, :abbreviations, :downcase

    # @param tokens [Array<String>] tokens to scan for trailing full stops
    # @param abbreviations [#each] abbreviations written without their
    #   trailing period; presumably lower-case, since stems are lower-cased
    #   before the lookup when +downcase+ is false -- confirm with callers
    # @param downcase [Boolean] when true, stems are looked up as-is; when
    #   false, they are lower-cased with Unicode.downcase before the lookup
    def initialize(tokens:, abbreviations:, downcase:)
      @tokens = tokens
      @abbreviations = abbreviations
      @downcase = downcase
    end

    # Builds a new token list in which sentence-ending periods are separate
    # '.' tokens. The +tokens+ array itself is not modified.
    #
    # @return [Array<String>] the tokens with true full stops split off
    def separate
      known = abbreviation_lookup
      cleaned_tokens = []
      tokens.each_with_index do |token, index|
        # Only tokens that have a successor are split here; the final token
        # follows its own rule in split_final_period.
        stem = tokens[index + 1] && detachable_stem(token, known)
        if stem
          cleaned_tokens << stem << '.'
        else
          cleaned_tokens << token
        end
      end
      split_final_period(cleaned_tokens, known)
      cleaned_tokens
    end

    private

    # Indexes the abbreviations in a hash so each lookup is a key check.
    def abbreviation_lookup
      abbreviations.each_with_object({}) { |entry, lookup| lookup[entry] = true }
    end

    # Returns the text in the form used for abbreviation lookups.
    def normalize(text)
      downcase ? text : Unicode.downcase(text)
    end

    # Returns the token without its trailing period when that period is a
    # full stop, or nil when the token has to be kept intact.
    def detachable_stem(token, known)
      return unless token =~ TRAILING_PERIOD
      stem = Regexp.last_match(1)
      return if known.key?(normalize(stem))
      return if stem =~ SINGLE_LETTER || stem =~ DOTTED_LETTERS
      stem
    end

    # Splits the period off the last cleaned token, in place, unless that
    # token is a listed abbreviation or does not end in "<word char>.".
    def split_final_period(cleaned_tokens, known)
      final = cleaned_tokens[-1]
      return unless final
      return if known.key?(normalize(final).chomp('.'))
      return unless final =~ FINAL_PERIOD
      cleaned_tokens[-1] = Regexp.last_match(1)
      cleaned_tokens.push('.')
    end
  end
end
Version data entries
3 entries across 3 versions & 1 rubygems