Sha256: 7c0a858c903172c55b6dcc04665b361957d61fbfe03444ae38534a1a090a5a42

Contents?: true

Size: 1.37 KB

Versions: 3

Compression:

Stored size: 1.37 KB

Contents

module DataCatalog

  # Helpers that turn free-form strings into a de-duplicated list of
  # lowercase search terms with common stopwords removed.
  class Search

    # Characters that are replaced by a space before splitting into tokens.
    REMOVE = %r([!,;]).freeze

    # A period counts as punctuation only when whitespace follows it, so
    # values such as "3.5" or "example.com" stay intact.
    TRAILING_PERIOD = %r(\.(?=\s)).freeze

    # Words that are dropped from search terms by {unstop}.
    STOP_WORDS = %w(
      a
      about
      an
      and
      are
      as
      at
      be
      by
      data
      for
      from
      how
      in
      is
      it
      of
      on
      or
      set
      that
      the
      this
      to
      was
      what
      when
      where
      who
      will
      with
    ).freeze

    # Returns an array of strings, tokenized with stopwords removed.
    #
    # @param [<String>] array
    #   An array of strings (nil is treated as an empty array)
    #
    # @return [<String>]
    def self.process(array)
      unstop(tokenize(array))
    end

    # Tokenize an array of strings. Duplicate tokens are removed; the
    # first occurrence determines the position in the result.
    #
    # @param [<String>] array
    #   An array of strings (nil is treated as an empty array; nil
    #   elements contribute no tokens)
    #
    # @return [<String>]
    def self.tokenize(array)
      # Array() turns nil into [] so callers need no guard of their own.
      Array(array).flat_map { |x| tokens(x) }.uniq
    end

    # Tokenize a string, removing extra characters too.
    #
    # The string is lowercased, the characters matched by REMOVE are
    # replaced by spaces, periods followed by whitespace (or ending the
    # string) are dropped, and the result is split on whitespace.
    #
    # @param [String] s
    #   The string to tokenize (nil or false yields an empty array)
    #
    # @return [<String>]
    def self.tokens(s)
      return [] unless s

      # The appended space lets a period at the very end of the string
      # match TRAILING_PERIOD as well.
      "#{s} ".downcase.
        gsub(REMOVE, ' ').
        gsub(TRAILING_PERIOD, ' ').
        split(' ')
    end

    # Remove stopwords from an array of strings.
    #
    # @param [<String>] array
    #   An array of strings
    #
    # @return [<String>]
    def self.unstop(array)
      array - STOP_WORDS
    end

  end

end

Version data entries

3 entries across 3 versions & 1 rubygems

Version Path
sinatra_resource-0.4.24 examples/datacatalog/model_helpers/search.rb
sinatra_resource-0.4.23 examples/datacatalog/model_helpers/search.rb
sinatra_resource-0.4.22 examples/datacatalog/model_helpers/search.rb