Sha256: 2a15f8da799716fd55f5e6cb1db648af8d7a97d76718fc1d6ef48844e70235bf

Contents?: true

Size: 1.15 KB

Versions: 1

Compression:

Stored size: 1.15 KB

Contents

# frozen_string_literal: true

# Author::    Lucas Carlson  (mailto:lucas@rufy.com)
# Copyright:: Copyright (c) 2005 Lucas Carlson
# License::   LGPL

require 'set'

require_relative 'tokenizer/whitespace'
require_relative 'token_filter/stopword'
require_relative 'token_filter/stemmer'

module ClassifierReborn
  module Hasher
    module_function

    # Return a Hash of strings => ints. Each word in the string is stemmed,
    # interned, and indexes to its frequency in the document.
    def word_hash(str, enable_stemmer = true,
                  tokenizer: Tokenizer::Whitespace,
                  token_filters: [TokenFilter::Stopword])
      if token_filters.include?(TokenFilter::Stemmer)
        unless enable_stemmer
          token_filters.reject! do |token_filter|
            token_filter == TokenFilter::Stemmer
          end
        end
      else
        token_filters << TokenFilter::Stemmer if enable_stemmer
      end
      words = tokenizer.call(str)
      token_filters.each do |token_filter|
        words = token_filter.call(words)
      end
      d = Hash.new(0)
      words.each do |word|
        d[word.intern] += 1
      end
      d
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
classifier-reborn-2.3.0 lib/classifier-reborn/extensions/hasher.rb