Sha256: 627523871b9841cd0706bbe61024385aa0a1b1794a226d3bd809f26c8710b347
Contents?: true
Size: 1.28 KB
Versions: 1
Compression:
Stored size: 1.28 KB
Contents
# frozen_string_literal: true

# Author::    Lucas Carlson  (mailto:lucas@rufy.com)
# Copyright:: Copyright (c) 2005 Lucas Carlson
# License::   LGPL

# Set backs the loaded stopword lists; it is only autoloaded from Ruby 3.2 on.
require 'set'

module ClassifierReborn
  module TokenFilter
    # This filter removes stopwords in the language, from given tokens.
    #
    # Stopword lists are plain-text files named after the language and
    # containing whitespace-separated words. They are looked up in every
    # directory of STOPWORDS_PATH, first match wins.
    module Stopword
      # Directories searched for stopword files, in priority order.
      # Deliberately left unfrozen: add_custom_stopword_path prepends to it.
      STOPWORDS_PATH = [File.expand_path('../../../../data/stopwords', __dir__)]

      # Create a lazily-loaded hash of stopword data.
      #
      # On first access of a language, the first matching file found in
      # STOPWORDS_PATH is read and cached as a Set of words. When no file
      # matches, an empty Array is cached instead. The language may be given
      # as a String or a Symbol; the key is cached exactly as it was given.
      STOPWORDS = Hash.new do |hash, language|
        # Build the candidate name once with to_s so that Symbol languages work
        # for both the existence check and the read.
        file = STOPWORDS_PATH
               .map { |path| File.join(path, language.to_s) }
               .find { |candidate| File.file?(candidate) } # skip directories: they cannot be read
        hash[language] = file ? Set.new(File.read(file).force_encoding('utf-8').split) : []
      end

      @language = 'en'

      module_function

      # Removes stopwords of the current language from +tokens+.
      #
      # A token is dropped only when it reports <tt>maybe_stopword?</tt> and is
      # either at most two characters long or listed in the stopword data of
      # the current language.
      #
      # tokens:: enumerable of tokens responding to +maybe_stopword?+ and +length+
      #
      # Returns an Array with the remaining tokens.
      def call(tokens)
        tokens.reject do |token|
          token.maybe_stopword? &&
            (token.length <= 2 || STOPWORDS[@language].include?(token))
        end
      end

      # Add custom path to a new stopword file created by user.
      #
      # The path is prepended, so it takes precedence over the existing ones.
      # Languages already cached in STOPWORDS are not reloaded by this call.
      def add_custom_stopword_path(path)
        STOPWORDS_PATH.unshift(path)
      end

      # Changes the language of stopwords
      def language=(language)
        @language = language
      end
    end
  end
end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
classifier-reborn-2.3.0 | lib/classifier-reborn/extensions/token_filter/stopword.rb |