require File.dirname(__FILE__) + '/../test_helper'

# Unit tests for Classifier::LSI: indexing, rebuild tracking, single and
# multiple classification, recategorization, search, Marshal round-tripping,
# stem ranking and String#summary.
#
# The expected orderings and categories below are tied to the exact fixture
# strings defined in #setup; changing a fixture is likely to change results.
class LSITest < Test::Unit::TestCase
  # Builds the fixture sentences shared by every test.
  def setup
    # We repeat principal words to help weight them.
    # This test is rather delicate, since this system is mostly noise.
    @str1 = "This text deals with dogs. Dogs."
    @str2 = "This text involves dogs too. Dogs! "
    @str3 = "This text revolves around cats. Cats."
    @str4 = "This text also involves cats. Cats!"
    @str5 = "This text involves birds. Birds."
    @str6 = "Is it about dogs or birds?"
    @str7 = "Is it about birds or cats?"
    @str8 = "I would prefer a bird over thousand cats or dogs because birds are smaller."
  end

  # Items appended with << leave the index built, and find_related returns
  # the expected neighbours of @str1.
  def test_basic_indexing
    lsi = Classifier::LSI.new
    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
    assert !lsi.needs_rebuild?

    # Note that the closest match to str1 is str2, even though it is not
    # the closest text match.
    assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
  end

  # With :auto_rebuild => false the index reports needs_rebuild? after items
  # are added, and stops doing so once build_index has been called.
  def test_not_auto_rebuild
    lsi = Classifier::LSI.new :auto_rebuild => false
    lsi.add_item @str1, "Dog"
    lsi.add_item @str2, "Dog"
    assert lsi.needs_rebuild?
    lsi.build_index
    assert !lsi.needs_rebuild?
  end

  # A single indexed item is not enough to classify: classify returns nil and
  # classify_multiple returns an empty array.
  def test_basic_categorizing_with_too_small_dataset
    lsi = Classifier::LSI.new
    lsi.add_item @str2, "Dog"

    # assert_nil rather than assert_equal(nil, ...): the latter is deprecated
    # under Minitest and assert_nil states the intent directly.
    assert_nil lsi.classify( @str1 )
    assert_equal [], lsi.classify_multiple( @str3 )
  end

  # classify returns a single category for each probe sentence.
  def test_basic_categorizing
    lsi = Classifier::LSI.new
    lsi.add_item @str2, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str5, "Bird"

    assert_equal "Dog", lsi.classify( @str1 )
    assert_equal "Cat", lsi.classify( @str3 )
    assert_equal "Bird", lsi.classify( @str5 )
    assert_equal "Dog", lsi.classify( @str6 )
    assert_equal "Bird", lsi.classify( @str7 )
    assert_equal "Bird", lsi.classify( @str8 )
  end

  # classify_multiple returns every matching category for sentences that
  # mention more than one animal.
  def test_multiple_categorizing
    lsi = Classifier::LSI.new
    lsi.add_item @str1, "Dog"
    lsi.add_item @str2, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str5, "Bird"

    assert_equal ["Dog", "Bird"], lsi.classify_multiple( @str6 )
    assert_equal ["Cat", "Bird"], lsi.classify_multiple( @str7 )
    assert_equal ["Bird"], lsi.classify_multiple( @str8 )
  end

  # Reverse of test_multiple_categorizing: the indexed items carry several
  # categories each and the probes are the single-animal sentences.
  def test_multiple_categorizing_reverse
    lsi = Classifier::LSI.new
    lsi.add_item @str1, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str6, "Dog", "Bird", "Flying"
    lsi.add_item @str7, "Cat", "Bird"
    lsi.add_item @str8, "Bird", "Dog", "Cat"

    assert_equal ["Dog"], lsi.classify_multiple( @str2 )
    assert_equal ["Cat", "Bird"], lsi.classify_multiple( @str5 )

    # Test with a word unknown alone.
    assert_equal "Bird", lsi.classify( "Bird!" )
    assert_equal ["Bird", "Dog", "Cat"], lsi.classify_multiple( "Bird!" )
  end

  # Trains an LSI index and a Bayes classifier on the same data and checks
  # that they disagree on a deliberately misleading sentence.
  def test_external_classifying
    lsi = Classifier::LSI.new
    bayes = Classifier::Bayes.new :categories => ['Dog', 'Cat', 'Bird']
    lsi.add_item @str1, "Dog" ; bayes.train_dog @str1
    lsi.add_item @str2, "Dog" ; bayes.train_dog @str2
    lsi.add_item @str3, "Cat" ; bayes.train_cat @str3
    lsi.add_item @str4, "Cat" ; bayes.train_cat @str4
    lsi.add_item @str5, "Bird" ; bayes.train_bird @str5

    # We're talking about dogs. Even though the text matches the corpus on
    # cats better. Dogs have more semantic weight than cats. So bayes
    # will fail here, but the LSI recognizes content.
    tricky_case = "This text revolves around dogs."
    assert_equal "Dog", lsi.classify( tricky_case )
    assert_not_equal "Dog", bayes.classify( tricky_case )
  end

  # Mutating the array returned by categories_for changes later
  # classifications without flagging the index for a rebuild.
  def test_recategorize_interface
    lsi = Classifier::LSI.new
    lsi.add_item @str1, "Dog"
    lsi.add_item @str2, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str5, "Bird"

    tricky_case = "This text revolves around dogs."
    assert_equal "Dog", lsi.classify( tricky_case )

    # Recategorize as needed.
    lsi.categories_for(@str1).clear.push "Cow"
    lsi.categories_for(@str2).clear.push "Cow"

    assert !lsi.needs_rebuild?
    assert_equal "Cow", lsi.classify( tricky_case )
  end

  # search returns the indexed items in the expected order for a two-word
  # query and for a single-keyword query.
  def test_search
    lsi = Classifier::LSI.new
    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }

    # Searching by content and text, note that @str2 comes up first, because
    # both "dog" and "involve" are present. But, the next match is @str1 instead
    # of @str4, because "dog" carries more weight than involves.
    assert_equal( [@str2, @str1, @str4, @str5, @str3],
                  lsi.search("dog involves", 100) )

    # Keyword search shows how the space is mapped out in relation to
    # dog when magnitude is removed. Note the relations. We move from dog
    # through involve and then finally to other words.
    assert_equal( [@str1, @str2, @str4, @str5, @str3],
                  lsi.search("dog", 5) )
  end

  # An index that has been through Marshal.dump / Marshal.load answers
  # search and find_related the same way as the original.
  def test_serialize_safe
    lsi = Classifier::LSI.new
    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }

    lsi_md = Marshal.dump lsi
    lsi_m = Marshal.load lsi_md

    # Expected value (the original index) first, actual (the round-tripped
    # copy) second, so a failure message labels the two sides correctly.
    assert_equal lsi.search("cat", 3), lsi_m.search("cat", 3)
    assert_equal lsi.find_related(@str1, 3), lsi_m.find_related(@str1, 3)
  end

  # highest_ranked_stems returns the expected stem symbols for @str1.
  def test_keyword_search
    lsi = Classifier::LSI.new
    lsi.add_item @str1, "Dog"
    lsi.add_item @str2, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str5, "Bird"

    assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1)
  end

  # String#summary(2) over the joined fixtures yields two sentences joined
  # by " [...] ".
  def test_summary
    assert_equal "This text involves dogs too [...] This text also involves cats",
                 [@str1, @str2, @str3, @str4, @str5].join.summary(2)
  end
end