Sha256: 5f13d5a23dc91a8075e4611c89e4abf8669fc1fd3712ea8c4677b85baa2b1e28

Contents?: true

Size: 1.29 KB

Versions: 2

Compression:

Stored size: 1.29 KB

Contents

require "set"

module Semantic
  # Turns a raw string into a list of normalised word tokens, optionally
  # stemming each token and dropping stop words.
  class Parser

    # Build a parser.
    #
    # options - Hash of optional settings:
    #           :filter_stop_words - when truthy, the stop word file for the
    #                                locale is loaded and remove_stop_words
    #                                drops every word found in it.
    #           :stem_words        - when truthy, each token is replaced by
    #                                the result of calling #stem on it.
    #                                String#stem is not defined in this file;
    #                                it is presumably supplied by a stemmer
    #                                library loaded elsewhere (confirm).
    #           :locale            - basename of the stop word file to load
    #                                (default: 'en').
    #
    # Raises Errno::ENOENT when stop word filtering is requested and no stop
    # file exists for the locale.
    def initialize(options = {})
      # English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
      # TODO: nicer way to reference stop file location?
      @filter_stop_words = options[:filter_stop_words]
      @stem_words        = options[:stem_words]
      locale             = options[:locale] || 'en'
      @stopwords         = Set.new

      return unless @filter_stop_words

      stop_file = "#{File.dirname(__FILE__)}/../../resources/#{locale}.stop"
      @stopwords = Set.new(File.read(stop_file).split)
    end

    # Tokenise +string+, stem the tokens (if enabled) and remove stop words
    # (if enabled).
    #
    # Returns an Array of String tokens.
    def tokenise_and_filter(string)
      # The stop list holds unstemmed words, so it has to be applied before
      # stemming: a stop word whose stem differs from the word itself would
      # otherwise never match the list and leak into the result.
      words = remove_stop_words(tokenise(string))

      # Second pass drops words whose stem is itself a stop word.
      remove_stop_words(stem_all(words))
    end

    # Normalise +string+ for tokenising: strip every period, collapse each run
    # of whitespace into a single space and downcase. Other punctuation is
    # left in place.
    #
    # Returns a new String; the argument is not modified.
    def clean(string)
      string.delete(".").gsub(/\s+/, " ").downcase
    end

    # stop words are common words which have no search value
    #
    # Returns +list+ itself when stop word filtering is disabled, otherwise a
    # new Array without the words present in the stop list.
    def remove_stop_words(list)
      return list unless @filter_stop_words

      list.reject { |word| @stopwords.include?(word) }
    end

    # Clean +string+, split it on whitespace and, when stemming is enabled,
    # stem every token. No stop word filtering is applied here.
    #
    # Returns an Array of String tokens.
    def tokenise_and_stem(string)
      stem_all(tokenise(string))
    end

    private

    # Clean +string+ and split it into whitespace-separated tokens.
    def tokenise(string)
      clean(string).split(" ")
    end

    # Stem each word when stemming is enabled; otherwise return +words+ as is.
    def stem_all(words)
      @stem_words ? words.map(&:stem) : words
    end

  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
rsemantic-0.2.1 lib/semantic/parser.rb
rsemantic-0.2.0 lib/semantic/parser.rb