lib/classifier/lsi.rb in classifier-1.3.4 vs lib/classifier/lsi.rb in classifier-1.3.5

- old
+ new

@@ -2,57 +2,57 @@
 # Copyright:: Copyright (c) 2005 David Fayram II
 # License::   LGPL

 begin
   raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
-
+
   require 'gsl' # requires http://rb-gsl.rubyforge.org/
   require 'classifier/extensions/vector_serialize'
   $GSL = true
-
+
 rescue LoadError
   warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
-  require 'classifier/extensions/vector'
+  require 'classifier/extensions/vector'
 end
-
+
 require 'classifier/lsi/word_list'
 require 'classifier/lsi/content_node'
 require 'classifier/lsi/summary'

 module Classifier
-
+
   # This class implements a Latent Semantic Indexer, which can search, classify and cluster
   # data based on underlying semantic relations. For more information on the algorithms used,
   # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
   class LSI
-
+
     attr_reader :word_list
     attr_accessor :auto_rebuild
-
+
     # Create a fresh index.
     # If you want to call #build_index manually, use
     #   Classifier::LSI.new :auto_rebuild => false
     #
     def initialize(options = {})
       @auto_rebuild = true unless options[:auto_rebuild] == false
       @word_list, @items = WordList.new, {}
       @version, @built_at_version = 0, -1
     end
-
+
     # Returns true if the index needs to be rebuilt. The index needs
     # to be built after all information is added, but before you start
     # using it for search, classification and cluster detection.
     def needs_rebuild?
       (@items.keys.size > 1) && (@version != @built_at_version)
     end
-
-    # Adds an item to the index. item is assumed to be a string, but
+
+    # Adds an item to the index. item is assumed to be a string, but
     # any item may be indexed so long as it responds to #to_s or if
-    # you provide an optional block explaining how the indexer can
+    # you provide an optional block explaining how the indexer can
     # fetch fresh string data. This optional block is passed the item,
     # so the item may only be a reference to a URL or file name.
-    #
+    #
     # For example:
     #   lsi = Classifier::LSI.new
     #   lsi.add_item "This is just plain text"
     #   lsi.add_item("/home/me/filename.txt") { |x| File.read x }
     #   ar = ActiveRecordObject.find( :all )
@@ -63,211 +63,211 @@
       @items[item] = ContentNode.new(clean_word_hash, *categories)
       @version += 1
       build_index if @auto_rebuild
     end

-    # A less flexible shorthand for add_item that assumes
+    # A less flexible shorthand for add_item that assumes
     # you are passing in a string with no categories. item
-    # will be duck typed via to_s.
+    # will be duck typed via to_s.
     #
     def <<( item )
       add_item item
     end
-
+
     # Returns the categories for a given indexed item. You are free to add and remove
     # items from this as you see fit. It does not invalidate the index to change its categories.
     def categories_for(item)
       return [] unless @items[item]
       return @items[item].categories
     end

-    # Removes an item from the database, if it is indexed.
+    # Removes an item from the database, if it is indexed.
     #
     def remove_item( item )
       if @items.keys.include? item
         @items.delete item
         @version += 1
       end
     end
-
-    # Returns an array of items that are indexed.
+
+    # Returns an array of items that are indexed.
     def items
       @items.keys
     end
-
+
     # Returns the categories for a given indexed item. You are free to add and remove
     # items from this as you see fit. It does not invalidate the index to change its categories.
     def categories_for(item)
       return [] unless @items[item]
       return @items[item].categories
     end

     # This function rebuilds the index if needs_rebuild? returns true.
     # For very large document spaces, this indexing operation may take some
-    # time to complete, so it may be wise to place the operation in another
-    # thread.
+    # time to complete, so it may be wise to place the operation in another
+    # thread.
     #
     # As a rule, indexing will be fairly swift on modern machines until
-    # you have well over 500 documents indexed, or have an incredibly diverse
-    # vocabulary for your documents.
+    # you have well over 500 documents indexed, or have an incredibly diverse
+    # vocabulary for your documents.
     #
     # The optional parameter "cutoff" is a tuning parameter. When the index is
-    # built, a certain number of s-values are discarded from the system. The
+    # built, a certain number of s-values are discarded from the system. The
     # cutoff parameter tells the indexer how many of these values to keep.
     # A value of 1 for cutoff means that no semantic analysis will take place,
     # turning the LSI class into a simple vector search engine.
     def build_index( cutoff=0.75 )
       return unless needs_rebuild?
       make_word_list
-
+
       doc_list = @items.values
       tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
-
+
       if $GSL
         tdm = GSL::Matrix.alloc(*tda).trans
         ntdm = build_reduced_matrix(tdm, cutoff)
-        ntdm.size[1].times do |col|
+        ntdm.size[1].times do |col|
           vec = GSL::Vector.alloc( ntdm.column(col) ).row
           doc_list[col].lsi_vector = vec
           doc_list[col].lsi_norm = vec.normalize
         end
       else
         tdm = Matrix.rows(tda).trans
         ntdm = build_reduced_matrix(tdm, cutoff)
-
+
         ntdm.row_size.times do |col|
           doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
           doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
         end
       end
-
+
       @built_at_version = @version
     end
-
+
     # This method returns max_chunks entries, ordered by their average semantic rating.
     # Essentially, the average distance of each entry from all other entries is calculated,
     # and the highest are returned.
     #
     # This can be used to build a summary service, or to provide more information about
     # your dataset's general content. For example, if you were to use categorize on the
-    # results of this data, you could gather information on what your dataset is generally
+    # results of this data, you could gather information on what your dataset is generally
     # about.
     def highest_relative_content( max_chunks=10 )
       return [] if needs_rebuild?
-
+
       avg_density = Hash.new
       @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1]} }
-
+
       avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1]
     end

-    # This function is the primitive that find_related and classify
+    # This function is the primitive that find_related and classify
     # build upon. It returns an array of 2-element arrays. The first element
     # of this array is a document, and the second is its "score", defining
     # how "close" it is to other indexed items.
-    #
+    #
     # These values are somewhat arbitrary, having to do with the vector space
     # created by your content, so the magnitude is interpretable but not always
-    # meaningful between indexes.
+    # meaningful between indexes.
     #
     # The parameter doc is the content to compare. If that content is not
-    # indexed, you can pass an optional block to define how to create the
-    # text data. See add_item for examples of how this works.
+    # indexed, you can pass an optional block to define how to create the
+    # text data. See add_item for examples of how this works.
     def proximity_array_for_content( doc, &block )
       return [] if needs_rebuild?
-
+
       content_node = node_for_content( doc, &block )
-      result =
+      result =
         @items.keys.collect do |item|
           if $GSL
             val = content_node.search_vector * @items[item].search_vector.col
           else
             val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
           end
           [item, val]
         end
       result.sort_by { |x| x[1] }.reverse
-    end
-
+    end
+
     # Similar to proximity_array_for_content, this function takes similar
     # arguments and returns a similar array. However, it uses the normalized
-    # calculated vectors instead of their full versions. This is useful when
+    # calculated vectors instead of their full versions. This is useful when
     # you're trying to perform operations on content that is much smaller than
     # the text you're working with. search uses this primitive.
     def proximity_norms_for_content( doc, &block )
       return [] if needs_rebuild?
-
+
       content_node = node_for_content( doc, &block )
-      result =
+      result =
         @items.keys.collect do |item|
           if $GSL
             val = content_node.search_norm * @items[item].search_norm.col
           else
             val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
           end
           [item, val]
         end
       result.sort_by { |x| x[1] }.reverse
-    end
-
+    end
+
     # This function allows for text-based search of your index. Unlike other functions
     # like find_related and classify, search only takes short strings. It will also ignore
-    # factors like repeated words. It is best for short, Google-like search terms.
-    # A search will first prioritize lexical relationships, then semantic ones.
+    # factors like repeated words. It is best for short, Google-like search terms.
+    # A search will first prioritize lexical relationships, then semantic ones.
     #
     # While this may seem backwards compared to the other functions that LSI supports,
     # it is actually the same algorithm, just applied on a smaller document.
     def search( string, max_nearest=3 )
       return [] if needs_rebuild?
       carry = proximity_norms_for_content( string )
       result = carry.collect { |x| x[0] }
       return result[0..max_nearest-1]
     end
-
+
     # This function takes content and finds other documents
    # that are semantically "close", returning an array of documents sorted
     # from most to least relevant.
-    # max_nearest specifies the number of documents to return. A value of
-    # 0 means that it returns all the indexed documents, sorted by relevance.
+    # max_nearest specifies the number of documents to return. A value of
+    # 0 means that it returns all the indexed documents, sorted by relevance.
     #
-    # This is particularly useful for identifying clusters in your document space.
+    # This is particularly useful for identifying clusters in your document space.
     # For example, you may want to identify several "What's Related" items for weblog
     # articles, or find paragraphs that relate to each other in an essay.
     def find_related( doc, max_nearest=3, &block )
-      carry =
+      carry =
         proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
       result = carry.collect { |x| x[0] }
       return result[0..max_nearest-1]
     end
-
-    # This function uses a voting system to categorize documents, based on
-    # the categories of other documents. It uses the same logic as the
+
+    # This function uses a voting system to categorize documents, based on
+    # the categories of other documents. It uses the same logic as the
     # find_related function to find related documents, then returns the
-    # most obvious category from this list.
+    # most obvious category from this list.
     #
-    # cutoff signifies the number of documents to consider when classifying
-    # text. A cutoff of 1 means that every document in the index votes on
+    # cutoff signifies the number of documents to consider when classifying
+    # text. A cutoff of 1 means that every document in the index votes on
     # what category the document is in. This may not always make sense.
     #
     def classify( doc, cutoff=0.30, &block )
       icutoff = (@items.size * cutoff).round
       carry = proximity_array_for_content( doc, &block )
       carry = carry[0..icutoff-1]
       votes = {}
       carry.each do |pair|
         categories = @items[pair[0]].categories
-        categories.each do |category|
+        categories.each do |category|
           votes[category] ||= 0.0
-          votes[category] += pair[1]
+          votes[category] += pair[1]
         end
       end
-
+
       ranking = votes.keys.sort_by { |x| votes[x] }
       return ranking[-1]
     end
-
+
     # Prototype, only works on indexed documents.
     # I have no clue if this is going to work, but in theory
     # it's supposed to.
     def highest_ranked_stems( doc, count=3 )
       raise "Requested stem ranking on non-indexed content!" unless @items[doc]
@@ -287,12 +287,12 @@
         s[ord] = 0.0 if s[ord] < s_cutoff
       end
       # Reconstruct the term document matrix, only with reduced rank
       u * ($GSL ? GSL::Matrix : ::Matrix).diag( s ) * v.trans
     end
-
-    def node_for_content(item, &block)
+
+    def node_for_content(item, &block)
       if @items[item]
         return @items[item]
       else
         clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
@@ -300,13 +300,13 @@
         unless needs_rebuild?
           cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
         end
       end
-
+
       return cn
     end
-
+
     def make_word_list
       @word_list = WordList.new
       @items.each_value do |node|
         node.word_hash.each_key { |key| @word_list.add_word key }
       end
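
Since the differences between 1.3.4 and 1.3.5 shown above are whitespace cleanups, the public API behaves identically in both versions. A minimal usage sketch of add_item, the << shorthand, and the block form for lazily fetched content; the sentences, category symbols, and file path are invented for illustration, and it assumes the classifier gem (rb-gsl optional) is installed:

  require 'classifier'

  lsi = Classifier::LSI.new
  lsi.add_item "Dogs are loyal, friendly companions.", :pets
  lsi.add_item "Cats are quiet, independent companions.", :pets
  lsi.add_item "Ruby is a dynamic programming language.", :programming

  # Shorthand for add_item with no categories; the item is duck typed via #to_s.
  lsi << "Python emphasizes readability."

  # Block form: the item is only a reference, and the block is called with it
  # at indexing time to produce the actual text.
  lsi.add_item("/home/me/filename.txt") { |path| File.read(path) }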
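With auto_rebuild left on (the default), every add_item call triggers a full SVD rebuild, which becomes expensive during bulk loads. A sketch of the manual pattern described in the initialize comment, where documents stands in for any enumerable of strings:

  lsi = Classifier::LSI.new :auto_rebuild => false
  documents.each { |text| lsi.add_item text }
  lsi.build_index         # one rebuild; the default cutoff of 0.75 keeps 75% of the s-values
  # lsi.build_index(1.0)  # keep every s-value: no rank reduction, plain vector search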
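Reusing the index from the first sketch: proximity_array_for_content is the scored primitive, find_related wraps its raw scores, and search works from the normalized vectors, as the comments above describe. Return values depend on the indexed corpus, so the ones noted here are only indicative:

  pairs = lsi.proximity_array_for_content "Dogs are loyal, friendly companions."
  # => [[item, score], ...] sorted closest-first; an indexed document
  #    typically scores highest against itself.

  lsi.search("dog companion", 2)                               # up to 2 nearest items
  lsi.find_related("Dogs are loyal, friendly companions.", 1)  # excludes the doc itself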
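classify tallies weighted votes from the nearest documents, so the effective number of voters is @items.size * cutoff, rounded. A self-contained sketch with invented sentences and categories:

  lsi = Classifier::LSI.new
  lsi.add_item "The sky is clear and blue today.",     :weather
  lsi.add_item "Heavy rain is expected this evening.", :weather
  lsi.add_item "The senate passed the budget bill.",   :politics

  # icutoff = (3 * 0.50).round = 2, so the two closest documents vote,
  # each weighted by its proximity score; :weather is the likely winner.
  lsi.classify "Clouds and a little sun all week.", 0.50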