Sha256: ccd0e0f0eababa820b7a96fd2aed7b63bf408b99e84a72a7f0cd54fa51b18e66
Contents?: true
Size: 1.99 KB
Versions: 1
Compression:
Stored size: 1.99 KB
Contents
# coding: utf-8

# A token.
#
# @note We can add more filters from Solr and stem using Porter's Snowball.
#
# @see https://github.com/aurelian/ruby-stemmer
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StopFilterFactory
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.WordDelimiterFilterFactory
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.SynonymFilterFactory
module TfIdfSimilarity
  class Token < String
    # Returns whether the string should be treated as a token.
    #
    # The string is rejected only when it is non-empty and consists entirely
    # of digits, control characters, punctuation and whitespace.
    #
    # @note Some implementations ignore one and two-letter words.
    # @note The pattern requires at least one character, so an empty string
    #   does not match it and is therefore reported as valid.
    #
    # @return [Boolean] whether the string is a token
    def valid?
      self !~ %r{
        \A
        (
          \d          | # number
          [[:cntrl:]] | # control character
          [[:punct:]] | # punctuation
          [[:space:]]   # whitespace
        )+
        \z
      }x
    end

    # Returns a lowercase copy of the string.
    #
    # When the UnicodeUtils constant is defined, lowercasing is delegated to
    # UnicodeUtils.downcase. Otherwise accented Latin capitals are mapped to
    # their lowercase forms with a fixed transliteration table before calling
    # String#downcase, so that interpreters whose String#downcase only handles
    # ASCII still lowercase those letters.
    #
    # @return [Token] a lowercase string
    #
    # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.LowerCaseFilterFactory
    def lowercase_filter
      lowercased =
        if defined?(UnicodeUtils)
          UnicodeUtils.downcase(self)
        else
          tr(
            "ÀÁÂÃÄÅĀĂĄÇĆĈĊČÐĎĐÈÉÊËĒĔĖĘĚĜĞĠĢĤĦÌÍÎÏĨĪĬĮĴĶĹĻĽĿŁÑŃŅŇŊÒÓÔÕÖØŌŎŐŔŖŘŚŜŞŠŢŤŦÙÚÛÜŨŪŬŮŰŲŴÝŶŸŹŻŽ",
            "àáâãäåāăąçćĉċčðďđèéêëēĕėęěĝğġģĥħìíîïĩīĭįĵķĺļľŀłñńņňŋòóôõöøōŏőŕŗřśŝşšţťŧùúûüũūŭůűųŵýŷÿźżž"
          ).downcase
        end
      self.class.new(lowercased)
    end

    # Returns a string with no English possessive or periods in acronyms.
    #
    # Every period is removed first, then a single trailing possessive suffix
    # is stripped. Both "'s" and "'S" are recognised, so the result does not
    # depend on whether the token was lowercased beforehand.
    #
    # @return [Token] a string with no English possessive or periods in acronyms
    #
    # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
    def classic_filter
      without_periods = delete('.')
      # \z anchors at the very end of the string, so only a suffix is removed.
      self.class.new(without_periods.sub(/'[sS]\z/, ''))
    end
  end
end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
tf-idf-similarity-0.1.4 | lib/tf-idf-similarity/token.rb |