# coding: utf-8 require 'unicode_utils' class TfIdfSimilarity::Document # An optional document identifier. attr_reader :id # The document's text. attr_reader :text # The document's tokenized text. attr_reader :tokens # The number of times each term appears in the document. attr_reader :term_counts # The document size, in terms. attr_reader :size # @param [String] text the document's text # @param [Hash] opts optional arguments # @option opts [String] :id a string to identify the document # @option opts [Array] :tokens the document's tokenized text def initialize(text, opts = {}) @text = text @id = opts[:id] || object_id @tokens = opts[:tokens] @term_counts = Hash.new 0 process end # @return [Array] the set of the document's terms with no duplicates def terms term_counts.keys end # @param [String] term a term # @return [Float] the square root of the term count # # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html def term_frequency(term) Math.sqrt term_counts[term].to_i end alias_method :tf, :term_frequency private # Tokenize the text and counts terms. def process tokenize(text).each do |word| token = TfIdfSimilarity::Token.new word if token.valid? @term_counts[token.lowercase_filter.classic_filter.to_s] += 1 end end @size = term_counts.values.reduce(:+) end # Tokenizes a text, respecting the word boundary rules from Unicode’s Default # Word Boundary Specification. # # If a tokenized text was provided at the document's initialization, those # tokens will be returned without additional processing. # # @param [String] text a text # @return [Enumerator] a token enumerator # # @note We should evaluate the tokenizers by {http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf Google} # or {http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.UAX29URLEmailTokenizerFactory Solr}. # # @see http://unicode.org/reports/tr29/#Default_Word_Boundaries # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory def tokenize(text) @tokens || UnicodeUtils.each_word(text) end end