lib/classifier/lsi.rb in classifier-1.3.4 vs lib/classifier/lsi.rb in classifier-1.3.5

- old
+ new

@@ -2,57 +2,57 @@
 # Copyright:: Copyright (c) 2005 David Fayram II
 # License::   LGPL

 begin
   raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
-
+
   require 'gsl' # requires http://rb-gsl.rubyforge.org/
   require 'classifier/extensions/vector_serialize'
   $GSL = true
-
+
 rescue LoadError
   warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
-  require 'classifier/extensions/vector'
+  require 'classifier/extensions/vector'
 end
-
+
 require 'classifier/lsi/word_list'
 require 'classifier/lsi/content_node'
 require 'classifier/lsi/summary'

 module Classifier
-
+
   # This class implements a Latent Semantic Indexer, which can search, classify and cluster
   # data based on underlying semantic relations. For more information on the algorithms used,
   # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
   class LSI
-
+
     attr_reader :word_list
     attr_accessor :auto_rebuild
-
+
     # Create a fresh index.
     # If you want to call #build_index manually, use
     #   Classifier::LSI.new :auto_rebuild => false
     #
     def initialize(options = {})
       @auto_rebuild = true unless options[:auto_rebuild] == false
       @word_list, @items = WordList.new, {}
       @version, @built_at_version = 0, -1
     end
-
+
     # Returns true if the index needs to be rebuilt. The index needs
     # to be built after all information is added, but before you start
     # using it for search, classification and cluster detection.
     def needs_rebuild?
       (@items.keys.size > 1) && (@version != @built_at_version)
     end
-
-    # Adds an item to the index. item is assumed to be a string, but
+
+    # Adds an item to the index. item is assumed to be a string, but
     # any item may be indexed so long as it responds to #to_s or if
-    # you provide an optional block explaining how the indexer can
+    # you provide an optional block explaining how the indexer can
     # fetch fresh string data. This optional block is passed the item,
     # so the item may only be a reference to a URL or file name.
-    #
+    #
     # For example:
     #   lsi = Classifier::LSI.new
     #   lsi.add_item "This is just plain text"
     #   lsi.add_item("/home/me/filename.txt") { |x| File.read x }
     #   ar = ActiveRecordObject.find( :all )
@@ -63,211 +63,211 @@
       @items[item] = ContentNode.new(clean_word_hash, *categories)
       @version += 1
       build_index if @auto_rebuild
     end

-    # A less flexible shorthand for add_item that assumes
+    # A less flexible shorthand for add_item that assumes
     # you are passing in a string with no categories. item
-    # will be duck typed via to_s.
+    # will be duck typed via to_s.
     #
     def <<( item )
       add_item item
     end
-
+
     # Returns the categories for a given indexed item. You are free to add and remove
     # items from this as you see fit. It does not invalidate the index to change its categories.
     def categories_for(item)
       return [] unless @items[item]
       return @items[item].categories
     end

-    # Removes an item from the database, if it is indexed.
+    # Removes an item from the database, if it is indexed.
     #
     def remove_item( item )
       if @items.keys.include? item
         @items.delete item
         @version += 1
       end
     end
-
-    # Returns an array of items that are indexed.
+
+    # Returns an array of items that are indexed.
     def items
       @items.keys
     end
-
+
     # Returns the categories for a given indexed item. You are free to add and remove
     # items from this as you see fit. It does not invalidate the index to change its categories.
     def categories_for(item)
       return [] unless @items[item]
       return @items[item].categories
     end

     # This function rebuilds the index if needs_rebuild? returns true.
     # For very large document spaces, this indexing operation may take some
-    # time to complete, so it may be wise to place the operation in another
-    # thread.
+    # time to complete, so it may be wise to place the operation in another
+    # thread.
     #
     # As a rule, indexing will be fairly swift on modern machines until
-    # you have well over 500 documents indexed, or have an incredibly diverse
-    # vocabulary for your documents.
+    # you have well over 500 documents indexed, or have an incredibly diverse
+    # vocabulary for your documents.
     #
     # The optional parameter "cutoff" is a tuning parameter. When the index is
-    # built, a certain number of s-values are discarded from the system. The
+    # built, a certain number of s-values are discarded from the system. The
     # cutoff parameter tells the indexer how many of these values to keep.
     # A value of 1 for cutoff means that no semantic analysis will take place,
     # turning the LSI class into a simple vector search engine.
     def build_index( cutoff=0.75 )
       return unless needs_rebuild?
       make_word_list
-
+
       doc_list = @items.values
       tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
-
+
       if $GSL
         tdm = GSL::Matrix.alloc(*tda).trans
         ntdm = build_reduced_matrix(tdm, cutoff)
-        ntdm.size[1].times do |col|
+        ntdm.size[1].times do |col|
           vec = GSL::Vector.alloc( ntdm.column(col) ).row
           doc_list[col].lsi_vector = vec
           doc_list[col].lsi_norm = vec.normalize
         end
       else
         tdm = Matrix.rows(tda).trans
         ntdm = build_reduced_matrix(tdm, cutoff)
-
+
         ntdm.row_size.times do |col|
           doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
           doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
         end
       end
-
+
       @built_at_version = @version
     end
-
+
     # This method returns max_chunks entries, ordered by their average semantic rating.
     # Essentially, the average distance of each entry from all other entries is calculated,
     # and the highest are returned.
     #
     # This can be used to build a summary service, or to provide more information about
     # your dataset's general content. For example, if you were to use categorize on the
-    # results of this data, you could gather information on what your dataset is generally
+    # results of this data, you could gather information on what your dataset is generally
     # about.
     def highest_relative_content( max_chunks=10 )
       return [] if needs_rebuild?
-
+
       avg_density = Hash.new
       @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1]} }
-
+
       avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1]
     end

-    # This function is the primitive that find_related and classify
+    # This function is the primitive that find_related and classify
     # build upon. It returns an array of 2-element arrays. The first element
     # of this array is a document, and the second is its "score", defining
     # how "close" it is to other indexed items.
-    #
+    #
     # These values are somewhat arbitrary, having to do with the vector space
     # created by your content, so the magnitude is interpretable but not always
-    # meaningful between indexes.
+    # meaningful between indexes.
     #
     # The parameter doc is the content to compare. If that content is not
-    # indexed, you can pass an optional block to define how to create the
-    # text data. See add_item for examples of how this works.
+    # indexed, you can pass an optional block to define how to create the
+    # text data. See add_item for examples of how this works.
     def proximity_array_for_content( doc, &block )
       return [] if needs_rebuild?
-
+
       content_node = node_for_content( doc, &block )
-      result =
+      result =
         @items.keys.collect do |item|
           if $GSL
             val = content_node.search_vector * @items[item].search_vector.col
           else
             val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
           end
           [item, val]
         end
       result.sort_by { |x| x[1] }.reverse
-    end
-
+    end
+
     # Similar to proximity_array_for_content, this function takes similar
     # arguments and returns a similar array. However, it uses the normalized
-    # calculated vectors instead of their full versions. This is useful when
+    # calculated vectors instead of their full versions. This is useful when
     # you're trying to perform operations on content that is much smaller than
     # the text you're working with. search uses this primitive.
     def proximity_norms_for_content( doc, &block )
       return [] if needs_rebuild?
-
+
       content_node = node_for_content( doc, &block )
-      result =
+      result =
         @items.keys.collect do |item|
           if $GSL
             val = content_node.search_norm * @items[item].search_norm.col
           else
             val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
           end
           [item, val]
         end
       result.sort_by { |x| x[1] }.reverse
-    end
-
+    end
+
     # This function allows for text-based search of your index. Unlike other functions
     # like find_related and classify, search only takes short strings. It will also ignore
-    # factors like repeated words. It is best for short, Google-like search terms.
-    # A search will first prioritize lexical relationships, then semantic ones.
+    # factors like repeated words. It is best for short, Google-like search terms.
+    # A search will first prioritize lexical relationships, then semantic ones.
     #
     # While this may seem backwards compared to the other functions that LSI supports,
     # it is actually the same algorithm, just applied on a smaller document.
     def search( string, max_nearest=3 )
       return [] if needs_rebuild?
       carry = proximity_norms_for_content( string )
       result = carry.collect { |x| x[0] }
       return result[0..max_nearest-1]
     end
-
+
     # This function takes content and finds other documents
    # that are semantically "close", returning an array of documents sorted
     # from most to least relevant.
-    # max_nearest specifies the number of documents to return. A value of
-    # 0 means that it returns all the indexed documents, sorted by relevance.
+    # max_nearest specifies the number of documents to return. A value of
+    # 0 means that it returns all the indexed documents, sorted by relevance.
     #
-    # This is particularly useful for identifying clusters in your document space.
+    # This is particularly useful for identifying clusters in your document space.
     # For example, you may want to identify several "What's Related" items for weblog
     # articles, or find paragraphs that relate to each other in an essay.
     def find_related( doc, max_nearest=3, &block )
-      carry =
+      carry =
         proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
       result = carry.collect { |x| x[0] }
       return result[0..max_nearest-1]
     end
-
-    # This function uses a voting system to categorize documents, based on
-    # the categories of other documents. It uses the same logic as the
+
+    # This function uses a voting system to categorize documents, based on
+    # the categories of other documents. It uses the same logic as the
     # find_related function to find related documents, then returns the
-    # most obvious category from this list.
+    # most obvious category from this list.
     #
-    # cutoff signifies the number of documents to consider when classifying
-    # text. A cutoff of 1 means that every document in the index votes on
+    # cutoff signifies the number of documents to consider when classifying
+    # text. A cutoff of 1 means that every document in the index votes on
     # what category the document is in. This may not always make sense.
     #
     def classify( doc, cutoff=0.30, &block )
       icutoff = (@items.size * cutoff).round
       carry = proximity_array_for_content( doc, &block )
       carry = carry[0..icutoff-1]
       votes = {}
       carry.each do |pair|
         categories = @items[pair[0]].categories
-        categories.each do |category|
+        categories.each do |category|
           votes[category] ||= 0.0
-          votes[category] += pair[1]
+          votes[category] += pair[1]
         end
       end
-
+
       ranking = votes.keys.sort_by { |x| votes[x] }
       return ranking[-1]
     end
-
+
     # Prototype, only works on indexed documents.
     # I have no clue if this is going to work, but in theory
     # it's supposed to.
     def highest_ranked_stems( doc, count=3 )
       raise "Requested stem ranking on non-indexed content!" unless @items[doc]
@@ -287,12 +287,12 @@
         s[ord] = 0.0 if s[ord] < s_cutoff
       end
       # Reconstruct the term document matrix, only with reduced rank
       u * ($GSL ? GSL::Matrix : ::Matrix).diag( s ) * v.trans
     end
-
-    def node_for_content(item, &block)
+
+    def node_for_content(item, &block)
       if @items[item]
         return @items[item]
       else
         clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
@@ -300,13 +300,13 @@
         unless needs_rebuild?
           cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
         end
       end
-
+
       return cn
     end
-
+
     def make_word_list
       @word_list = WordList.new
       @items.each_value do |node|
         node.word_hash.each_key { |key| @word_list.add_word key }
       end
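
Since the differences between 1.3.4 and 1.3.5 shown above are whitespace cleanups, the public API behaves identically in both versions. A minimal usage sketch of add_item, the << shorthand, and the block form for lazily fetched content; the sentences, category symbols, and file path are invented for illustration, and it assumes the classifier gem (rb-gsl optional) is installed:

  require 'classifier'

  lsi = Classifier::LSI.new
  lsi.add_item "Dogs are loyal, friendly companions.", :pets
  lsi.add_item "Cats are quiet, independent companions.", :pets
  lsi.add_item "Ruby is a dynamic programming language.", :programming

  # Shorthand for add_item with no categories; the item is duck typed via #to_s.
  lsi << "Python emphasizes readability."

  # Block form: the item is only a reference, and the block is called with it
  # at indexing time to produce the actual text.
  lsi.add_item("/home/me/filename.txt") { |path| File.read(path) }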
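With auto_rebuild left on (the default), every add_item call triggers a full SVD rebuild, which becomes expensive during bulk loads. A sketch of the manual pattern described in the initialize comment, where documents stands in for any enumerable of strings:

  lsi = Classifier::LSI.new :auto_rebuild => false
  documents.each { |text| lsi.add_item text }
  lsi.build_index         # one rebuild; the default cutoff of 0.75 keeps 75% of the s-values
  # lsi.build_index(1.0)  # keep every s-value: no rank reduction, plain vector search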
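Reusing the index from the first sketch: proximity_array_for_content is the scored primitive, find_related wraps its raw scores, and search works from the normalized vectors, as the comments above describe. Return values depend on the indexed corpus, so the ones noted here are only indicative:

  pairs = lsi.proximity_array_for_content "Dogs are loyal, friendly companions."
  # => [[item, score], ...] sorted closest-first; an indexed document
  #    typically scores highest against itself.

  lsi.search("dog companion", 2)                               # up to 2 nearest items
  lsi.find_related("Dogs are loyal, friendly companions.", 1)  # excludes the doc itself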
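classify tallies weighted votes from the nearest documents, so the effective number of voters is @items.size * cutoff, rounded. A self-contained sketch with invented sentences and categories:

  lsi = Classifier::LSI.new
  lsi.add_item "The sky is clear and blue today.",     :weather
  lsi.add_item "Heavy rain is expected this evening.", :weather
  lsi.add_item "The senate passed the budget bill.",   :politics

  # icutoff = (3 * 0.50).round = 2, so the two closest documents vote,
  # each weighted by its proximity score; :weather is the likely winner.
  lsi.classify "Clouds and a little sun all week.", 0.50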