Sha256: 1983cb3d8654185c424c8b62285ad3a19be7785bcdd6eefd0103c7a40bfcff3f

Contents?: true

Size: 1.48 KB

Versions: 3

Compression:

Stored size: 1.48 KB

Contents

# -*- encoding : utf-8 -*-

module PragmaticTokenizer
  # Separates true full stops from the tokens they are attached to while
  # leaving alone periods that belong to an abbreviation.
  #
  # A token followed by another token keeps its trailing period when its
  # stem (the token minus that period) is a listed abbreviation, a single
  # ASCII letter ("a."), or ends in a dotted letter sequence ("u.s.").
  # The final token keeps its trailing period only when its stem is a
  # listed abbreviation.
  class FullStopSeparator
    # Any token made of at least one character followed by a period;
    # group 1 captures the stem.
    PERIOD_TERMINATED = /\A(.+)\.\z/
    # A stem consisting of exactly one ASCII letter.
    SINGLE_LETTER = /\A[a-z]\z/i
    # A stem ending in ASCII letters joined by periods, e.g. "u.s".
    DOTTED_INITIALISM = /[a-z](?:\.[a-z])+\z/i
    # A final token whose last character before the period is a word
    # character (\w); group 1 captures the stem.
    FINAL_FULL_STOP = /\A(.*\w)\.\z/

    attr_reader :tokens, :abbreviations, :downcase

    # @param tokens [Array<String>] tokens to scan for full stops
    # @param abbreviations [Enumerable<String>] abbreviations written
    #   without their trailing period; compared against the stem as-is when
    #   +downcase+ is truthy and against +Unicode.downcase(stem)+ otherwise
    # @param downcase [Boolean] when truthy, stems are looked up verbatim;
    #   when falsy, stems are passed through +Unicode.downcase+ first
    def initialize(tokens:, abbreviations:, downcase:)
      @tokens = tokens
      @abbreviations = abbreviations
      @downcase = downcase
    end

    # Builds a new token list in which every sentence-ending period is its
    # own '.' token. The receiver's +tokens+ array is not modified.
    #
    # @return [Array<String>] the tokens with full stops split off
    def separate
      known = abbreviation_lookup
      cleaned_tokens = split_interior_tokens(known)
      split_final_token(cleaned_tokens, known)
      cleaned_tokens
    end

    private

    # Indexes the abbreviations in a hash so each membership test is a
    # single lookup instead of a scan.
    def abbreviation_lookup
      abbreviations.each_with_object({}) do |abbreviation, lookup|
        lookup[abbreviation] = true
      end
    end

    # Copies the tokens, splitting the trailing period off every token that
    # is followed by another token and is not protected by keeps_period?.
    def split_interior_tokens(known)
      tokens.each_with_index.each_with_object([]) do |(token, index), cleaned|
        # Only tokens that have a successor are examined here; the last
        # token is copied untouched and handled by split_final_token.
        stem = tokens[index + 1] && token =~ PERIOD_TERMINATED ? Regexp.last_match(1) : nil
        if stem && !keeps_period?(stem, known)
          # A literal '.' (not a shared constant) so every emitted period
          # is a distinct string object.
          cleaned << stem << '.'
        else
          cleaned << token
        end
      end
    end

    # Splits the period off the last cleaned token unless its stem is a
    # listed abbreviation. Single letters and dotted sequences are NOT
    # protected in final position.
    def split_final_token(cleaned_tokens, known)
      last = cleaned_tokens[-1]
      return unless last && last =~ FINAL_FULL_STOP

      stem = Regexp.last_match(1)
      return if known[lookup_key(stem)]

      cleaned_tokens[-1] = stem
      cleaned_tokens.push '.'
    end

    # True when the period after +stem+ belongs to the token itself.
    def keeps_period?(stem, known)
      known[lookup_key(stem)] || stem =~ SINGLE_LETTER || stem =~ DOTTED_INITIALISM
    end

    # The form of +stem+ that is compared against the abbreviation list.
    def lookup_key(stem)
      downcase ? stem : Unicode.downcase(stem)
    end
  end
end

Version data entries

3 entries across 3 versions & 1 rubygems

Version Path
pragmatic_tokenizer-3.0.0 lib/pragmatic_tokenizer/full_stop_separator.rb
pragmatic_tokenizer-1.6.0 lib/pragmatic_tokenizer/full_stop_separator.rb
pragmatic_tokenizer-1.5.1 lib/pragmatic_tokenizer/full_stop_separator.rb