Sha256: 5f13d5a23dc91a8075e4611c89e4abf8669fc1fd3712ea8c4677b85baa2b1e28
Contents?: true
Size: 1.29 KB
Versions: 2
Compression:
Stored size: 1.29 KB
Contents
require "set"

module Semantic
  # Turns a raw string into a list of lower-cased word tokens, optionally
  # removing stop words and stemming each remaining token.
  class Parser
    # @param options [Hash] behaviour switches:
    #   :filter_stop_words - when truthy, the stop list is loaded and its
    #                        words are removed by #remove_stop_words
    #   :stem_words        - when truthy, tokens are replaced by the result
    #                        of calling #stem on them
    #   :locale            - basename of the stop file to load; defaults to 'en'
    # @raise [SystemCallError] if stop word filtering is requested and the
    #   stop file cannot be read
    def initialize(options = {})
      # English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
      # TODO: nicer way to reference stop file location?
      @filter_stop_words = options[:filter_stop_words]
      @stem_words = options[:stem_words]
      locale = options[:locale] || 'en'

      # The stop file is only touched when filtering was actually requested.
      return unless @filter_stop_words

      stop_file = File.expand_path("../../resources/#{locale}.stop", File.dirname(__FILE__))
      @stopwords = Set.new(File.read(stop_file).split)
    end

    # Cleans and tokenises +string+, drops stop words and stems what is left
    # (each step only when enabled through the constructor options).
    #
    # @param string [String] raw text
    # @return [Array<String>] the surviving tokens
    def tokenise_and_filter(string)
      # Stop words are stored unstemmed, so they have to be matched against
      # the raw tokens: once stemmed, a stop word may no longer equal its
      # entry in the stop list and would slip through the filter.
      words = remove_stop_words(tokenise(string))
      words = stem(words)
      # Second pass keeps the historical behaviour of also dropping tokens
      # whose stemmed form is itself a stop word.
      remove_stop_words(words)
    end

    # remove any nasty grammar tokens from string
    #
    # Deletes every ".", collapses each run of whitespace into a single
    # space and lower-cases the result.
    #
    # @param string [String]
    # @return [String] a new, cleaned string
    def clean(string)
      string.delete(".").gsub(/\s+/, " ").downcase
    end

    # stop words are common words which have no search value
    #
    # @param list [Array<String>] tokens to filter
    # @return [Array<String>] +list+ without stop words, or +list+ itself
    #   when stop word filtering is disabled
    def remove_stop_words(list)
      return list unless @filter_stop_words

      list.reject { |word| @stopwords.include?(word) }
    end

    # Cleans +string+, splits it on whitespace and, when stemming is enabled,
    # stems every token. No stop word filtering happens here.
    #
    # @param string [String] raw text
    # @return [Array<String>] tokens, stemmed if :stem_words was set
    def tokenise_and_stem(string)
      stem(tokenise(string))
    end

    private

    # Splits the cleaned form of +string+ into whitespace-separated tokens.
    def tokenise(string)
      clean(string).split(" ")
    end

    # Stems every word when stemming is enabled, otherwise returns +words+
    # untouched. String#stem is not defined in this file; it has to be
    # provided by the surrounding environment (presumably a stemmer
    # library -- confirm against the gem's dependencies).
    def stem(words)
      @stem_words ? words.map(&:stem) : words
    end
  end
end
Version data entries
2 entries across 2 versions & 1 rubygems
Version | Path |
---|---|
rsemantic-0.2.1 | lib/semantic/parser.rb |
rsemantic-0.2.0 | lib/semantic/parser.rb |