Sha256: 5ee535112ce3600c4e833b36fe98b2953b5b5c68e9f92a9e19ab9ff216330762

Contents?: true

Size: 1.51 KB

Versions: 2

Compression:

Stored size: 1.51 KB

Contents

# -*- encoding : utf-8 -*-

module PragmaticTokenizer
  # Separates true full stops from the tokens they are attached to while
  # leaving periods that are part of an abbreviation in place.
  #
  # A trailing period is kept attached to a token when the text before it is
  # a known abbreviation. For tokens that are followed by another token it is
  # also kept when the text before it is a single ASCII letter (e.g. an
  # initial such as "j.") or ends in a dotted letter sequence (e.g. "u.s.a.").
  class FullStopSeparator
    # A token with at least one character followed by a trailing period.
    ENDS_WITH_DOT = /\A(.+)\.\z/
    # Same, but the period must directly follow a word character. This is the
    # stricter rule applied to the very last token. `[[:word:]]` is used
    # instead of `\w` because `\w` only matches ASCII in Ruby, which left
    # final tokens such as "café." unsplit.
    FINAL_ENDS_WITH_DOT = /\A(.*[[:word:]])\.\z/
    # Exactly one ASCII letter (case-insensitive).
    SINGLE_LETTER = /\A[a-z]\z/i
    # Text ending in ASCII letters joined by periods, e.g. "u.s.a".
    DOTTED_LETTERS = /[a-z](?:\.[a-z])+\z/i

    attr_reader :tokens, :abbreviations, :downcase

    # @param tokens [Array<String>] tokens to scan for trailing full stops
    # @param abbreviations [#each] known abbreviations, without the trailing
    #   period
    # @param downcase [Boolean] when truthy, tokens are looked up in
    #   +abbreviations+ exactly as given (presumably they were already
    #   lower-cased by the caller -- confirm); otherwise each candidate is
    #   lower-cased with UnicodeCaseConverter before the lookup
    def initialize(tokens:, abbreviations:, downcase:)
      @tokens = tokens
      @abbreviations = abbreviations
      @downcase = downcase
    end

    # Builds a new token list in which sentence-ending periods are emitted as
    # their own "." tokens. The input +tokens+ array is not modified.
    #
    # @return [Array<String>] the tokens with full stops separated
    def separate
      known = abbreviation_lookup
      result = []
      tokens.each_with_index do |token, index|
        # The last token is deliberately skipped here (no successor) and is
        # handled by the stricter rule in #split_final_token.
        stem = tokens[index + 1] && detachable_stem(token, known)
        if stem
          result << stem << '.'
        else
          result << token
        end
      end
      split_final_token(result, known)
      result
    end

    private

    # Hash keyed by abbreviation so each membership test is a single lookup
    # instead of a scan over the whole collection.
    def abbreviation_lookup
      abbreviations.each_with_object({}) { |entry, table| table[entry] = true }
    end

    # Returns +text+ in the form used as a key into the abbreviation table.
    def normalize(text)
      downcase ? text : UnicodeCaseConverter.downcase(text)
    end

    # Returns the part of +token+ before its trailing period when that period
    # should become a separate token, or nil when the token must stay intact.
    def detachable_stem(token, known)
      return unless token =~ ENDS_WITH_DOT
      stem = Regexp.last_match(1)
      return if known[normalize(stem)]
      return if stem =~ SINGLE_LETTER || stem =~ DOTTED_LETTERS
      stem
    end

    # Splits the trailing period off the last entry of +result+ in place,
    # unless that entry (minus the period) is a known abbreviation.
    def split_final_token(result, known)
      final = result.last
      return unless final && final =~ FINAL_ENDS_WITH_DOT
      stem = Regexp.last_match(1)
      return if known.key?(normalize(final).chomp('.'))
      result[-1] = stem
      result << '.'
    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
pragmatic_tokenizer-2.2.1 lib/pragmatic_tokenizer/full_stop_separator.rb
pragmatic_tokenizer-2.2.0 lib/pragmatic_tokenizer/full_stop_separator.rb