Sha256: 3bb983b21a7e2dab9996c745e1873c073f191807557b774a8973834bc68638d5

Contents?: true

Size: 1014 Bytes

Versions: 6

Compression:

Stored size: 1014 Bytes

Contents

# -*- encoding : utf-8 -*-

module PragmaticTokenizer
  # Splits sentence-ending periods off tokens while leaving periods that
  # belong to an abbreviation attached to their token.
  class FullStopSeparator
    attr_reader :tokens, :abbreviations

    # @param tokens [Array<String>] the tokens to scan, in order
    # @param abbreviations [Enumerable<String>] abbreviations written without
    #   their trailing period; they are looked up by the downcased token stem,
    #   so entries are presumably expected to be lowercase (verify with caller)
    def initialize(tokens:, abbreviations:)
      @tokens = tokens
      @abbreviations = abbreviations
    end

    # Builds a new token list in which full stops are separate "." tokens.
    #
    # A token that is followed by another token and ends in a period is split
    # into its stem and "." unless the stem
    # * is a known abbreviation (compared after Unicode::downcase),
    # * is a single letter a-z (case-insensitive), e.g. an initial, or
    # * ends in dot-separated letters such as "U.S.A".
    #
    # The last element of the result is handled separately: if it ends in a
    # word character followed by a period, that period is always split off,
    # without consulting the abbreviation list.
    #
    # @return [Array<String>] a new array; the array in +tokens+ is not modified
    def separate
      lookup = abbreviations.each_with_object({}) { |entry, seen| seen[entry] = true }
      output = []

      tokens.each_with_index do |token, position|
        # Only tokens that have a successor are candidates here; the final
        # token is dealt with after the loop.
        stem = Regexp.last_match(1) if tokens[position + 1] && token =~ /\A(.+)\.\z/

        if stem.nil? || lookup[Unicode::downcase(stem)] || stem =~ /\A[a-z]\z/i ||
           stem =~ /[a-z](?:\.[a-z])+\z/i
          output << token
        else
          output.push(stem, '.')
        end
      end

      final = output.last
      if final && final =~ /\A(.*\w)\.\z/
        output[-1] = Regexp.last_match(1)
        output << '.'
      end

      output
    end
  end
end

Version data entries

6 entries across 6 versions & 1 rubygem

Version Path
pragmatic_tokenizer-1.1.2 lib/pragmatic_tokenizer/full_stop_separator.rb
pragmatic_tokenizer-1.1.1 lib/pragmatic_tokenizer/full_stop_separator.rb
pragmatic_tokenizer-1.1.0 lib/pragmatic_tokenizer/full_stop_separator.rb
pragmatic_tokenizer-1.0.2 lib/pragmatic_tokenizer/full_stop_separator.rb
pragmatic_tokenizer-1.0.1 lib/pragmatic_tokenizer/full_stop_separator.rb
pragmatic_tokenizer-1.0.0 lib/pragmatic_tokenizer/full_stop_separator.rb