Sha256: 9c31ce3e71d680d68055f4248c3e60f5bb411648112dbea2e0a9dbd82122c879

Contents?: true

Size: 1.39 KB

Versions: 22

Compression:

Stored size: 1.39 KB

Contents

module DataCatalog

  # Helpers that turn free-form strings into a list of search keywords:
  # lowercased, stripped of a little punctuation, de-duplicated, and with
  # common stopwords removed.
  class Search

    # Characters replaced by a space before splitting into tokens.
    REMOVE = %r([!,;]).freeze

    # Words dropped from token lists by {unstop}. Entries are lowercase
    # because {tokens} downcases its input before splitting.
    STOP_WORDS = %w(
      a
      about
      an
      and
      are
      as
      at
      be
      by
      data
      for
      from
      how
      in
      is
      it
      of
      on
      or
      set
      that
      the
      this
      to
      was
      what
      when
      where
      who
      will
      with
    ).freeze

    # Returns an array of strings, tokenized with stopwords removed.
    #
    # @param [<String>] array
    #   An array of strings (nil is treated as an empty array)
    #
    # @return [<String>]
    #   Unique lowercase tokens, in order of first appearance, without
    #   any entry of STOP_WORDS
    def self.process(array)
      unstop(tokenize(array))
    end

    # Tokenize an array of strings.
    #
    # @param [<String>] array
    #   An array of strings; nil elements contribute no tokens, and a nil
    #   array yields an empty result
    #
    # @return [<String>]
    #   Unique tokens, in order of first appearance
    def self.tokenize(array)
      # Array() lets nil (or a single bare string) be handled the same way
      # a nil element already is, instead of raising NoMethodError.
      Array(array).flat_map { |x| tokens(x) }.uniq
    end

    # Tokenize a string, removing extra characters too.
    #
    # The string is downcased, every character matched by REMOVE becomes a
    # space, and a period is dropped only when it is followed by a space
    # (or ends the string), so tokens such as "v1.2" keep their inner dot.
    #
    # @param [String] s
    #   The string to split; nil or false yields an empty array
    #
    # @return [<String>]
    def self.tokens(s)
      return [] unless s

      # The appended space makes a trailing period match the "\. " pattern.
      "#{s} ".downcase.
        gsub(REMOVE, ' ').
        gsub(%r(\. ), ' ').
        split(' ')
    end

    # Remove stopwords from an array of strings.
    #
    # @param [<String>] array
    #   An array of strings
    #
    # @return [<String>]
    #   A new array without any entry of STOP_WORDS
    def self.unstop(array)
      array - STOP_WORDS
    end

  end

end

Version data entries

22 entries across 22 versions & 1 rubygems

Version Path
sinatra_resource-0.4.1 examples/datacatalog/model_helpers/search.rb
sinatra_resource-0.4.0 examples/datacatalog/model_helpers/search.rb