require 'helper'
require 'shoulda'

# Global flag handed to every FuzzyMatch instance below as its :log option.
$log = false

# Drives FuzzyMatch's improver (+match+ and +check+) with aircraft-name
# fixtures, combining tightenings, identities, blockings, positives and
# negatives in various ways.
#
# Every test mutates the instance variables prepared in #setup and then asks
# #ltd for a matcher; #ltd builds the FuzzyMatch lazily from whatever those
# variables hold at that moment.
class TestFuzzyMatchConvoluted < Test::Unit::TestCase
  # Builds fresh fixtures before each test and drops any memoized matcher.
  def setup
    clear_ltd

    # Read by #ltd; set explicitly (to the value an unset ivar would have)
    # so that running with warnings enabled stays quiet.
    @blocking_only = nil

    # dh 8 400
    @a_needle = ['DE HAVILLAND CANADA DHC8400 Dash 8']
    @a_haystack = ['DEHAVILLAND DEHAVILLAND DHC8-400 DASH-8']

    # dh 88
    @b_needle = ['ABCDEFG DH88 HIJKLMNOP']

    # dh 89
    @c_haystack = ['ABCDEFG DH89 HIJKLMNOP']

    # dh 8 200
    @d_needle = ['DE HAVILLAND CANADA DHC8200 Dash 8']
    @d_haystack = ['BOMBARDIER DEHAVILLAND DHC8-200Q DASH-8']
    @d_lookalike = ['ABCD DHC8200 Dash 8']

    # Rows of [regexp-as-string, description]; the same pattern is used once
    # as a tightening and once as an identity.
    @t_1 = [
      '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i',
      'good tightening for de havilland'
    ]
    @r_1 = [
      '/(dh)c?-?(\d{0,2})-?(\d{0,4})(?:.*?)(dash|\z)/i',
      'good identity for de havilland'
    ]

    @needle = [
      @a_needle,
      @b_needle,
      ['DE HAVILLAND DH89 Dragon Rapide'],
      ['DE HAVILLAND CANADA DHC8100 Dash 8 (E9, CT142, CC142)'],
      @d_needle,
      ['DE HAVILLAND CANADA DHC8300 Dash 8'],
      ['DE HAVILLAND DH90 Dragonfly']
    ]
    @haystack = [
      @a_haystack,
      @c_haystack,
      @d_haystack,
      ['DEHAVILLAND DEHAVILLAND DHC8-100 DASH-8'],
      ['DEHAVILLAND DEHAVILLAND TWIN OTTER DHC-6']
    ]

    # Option lists start empty; individual tests push rows onto them.
    @tightenings = []
    @identities = []
    @blockings = []
    @positives = []
    @negatives = []
  end

  # Forgets the memoized matcher so the next #ltd call rebuilds it from the
  # current fixture state.
  def clear_ltd
    @_ltd = nil
  end

  # Returns the memoized FuzzyMatch, constructing it on first use from the
  # current haystack and option instance variables.
  def ltd
    @_ltd ||= FuzzyMatch.new(
      @haystack,
      :tightenings   => @tightenings,
      :identities    => @identities,
      :blockings     => @blockings,
      :positives     => @positives,
      :negatives     => @negatives,
      :blocking_only => @blocking_only,
      :log           => $log
    )
  end

  should "optionally only pay attention to things that match blockings" do
    assert_equal @a_haystack, ltd.improver.match(@a_needle)

    # With blocking_only on and no blockings defined, nothing is returned.
    clear_ltd
    @blocking_only = true
    assert_nil ltd.improver.match(@a_needle)

    # @blocking_only is still true here; only the blocking list changes.
    clear_ltd
    @blockings.push(['/dash/i'])
    assert_equal @a_haystack, ltd.improver.match(@a_needle)
  end

  # the example from the readme, considerably uglier here
  should "check a simple table" do
    @haystack = [ 'seamus', 'andy', 'ben' ]
    @positives = [
      [ 'seamus', 'Mr. Seamus Abshere' ]
    ]
    needle = [ 'Mr. Seamus Abshere', 'Sr. Andy Rossmeissl', 'Master BenT' ]

    assert_nothing_raised do
      ltd.improver.check needle
    end
  end

  should "treat a String as a full record if passed through" do
    dash = 'DHC8-400'
    b747 = 'B747200/300'
    dc9 = 'DC-9-10'
    haystack_records = [ dash, b747, dc9 ]
    simple_ltd = FuzzyMatch.new(haystack_records, :log => $log)

    assert_equal dash, simple_ltd.improver.match('DeHavilland Dash-8 DHC-400')
    assert_equal b747, simple_ltd.improver.match('Boeing 747-300')
    assert_equal dc9, simple_ltd.improver.match('McDonnell Douglas MD81/DC-9')
  end

  should "call it a mismatch if you hit a blank positive" do
    @positives.push([@a_needle[0], ''])

    assert_raises(FuzzyMatch::Improver::Mismatch) do
      ltd.improver.match @a_needle
    end
  end

  should "call it a false positive if you hit a blank negative" do
    @negatives.push([@a_needle[0], ''])

    assert_raises(FuzzyMatch::Improver::FalsePositive) do
      ltd.improver.match @a_needle
    end
  end

  should "have a false match without blocking" do
    # @d_needle will be our victim
    @haystack.push @d_lookalike
    @tightenings.push @t_1

    assert_equal @d_lookalike, ltd.improver.match(@d_needle)
  end

  should "do blocking if the needle matches a block" do
    # @d_needle will be our victim
    @haystack.push @d_lookalike
    @tightenings.push @t_1
    @blockings.push(['/(bombardier|de ?havilland)/i'])

    assert_equal @d_haystack, ltd.improver.match(@d_needle)
  end

  should "treat blocks as exclusive" do
    @haystack = [ @d_needle ]
    @tightenings.push @t_1
    @blockings.push(['/(bombardier|de ?havilland)/i'])

    assert_nil ltd.improver.match(@d_lookalike)
  end

  should "only use identities if they stem from the same regexp" do
    @identities.push @r_1
    @identities.push([ '/(cessna)(?:.*?)(citation)/i' ])
    @identities.push([ '/(cessna)(?:.*?)(\d\d\d)/i' ])
    x_needle = [ 'CESSNA D-333 CITATION V']
    x_haystack = [ 'CESSNA D-333' ]
    @haystack.push x_haystack

    assert_equal x_haystack, ltd.improver.match(x_needle)
  end

  should "use the best score from all of the tightenings" do
    x_needle = ["BOEING 737100"]
    x_haystack = ["BOEING BOEING 737-100/200"]
    x_haystack_wrong = ["BOEING BOEING 737-900"]
    @haystack.push x_haystack
    @haystack.push x_haystack_wrong
    @tightenings.push(['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i'])
    @tightenings.push(['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i'])

    assert_equal x_haystack, ltd.improver.match(x_needle)
  end

  should "compare using prefixes if tightened key is shorter than correct match" do
    x_needle = ["BOEING 720"]
    x_haystack = ["BOEING BOEING 720-000"]
    x_haystack_wrong = ["BOEING BOEING 717-200"]
    @haystack.push x_haystack
    @haystack.push x_haystack_wrong
    @tightenings.push @t_1
    @tightenings.push(['/(7\d)(7|0)-?\d{1,3}\/(\d\d\d)/i'])
    @tightenings.push(['/(7\d)(7|0)-?(\d{1,3}|[A-Z]{0,3})/i'])

    assert_equal x_haystack, ltd.improver.match(x_needle)
  end

  should "use the shortest original input" do
    x_needle = ['De Havilland DHC8-777 Dash-8 Superstar']
    x_haystack = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar']
    x_haystack_long = ['DEHAVILLAND DEHAVILLAND DHC8-777 DASH-8 Superstar/Supernova']
    # The longer candidate is pushed first, ahead of the expected answer.
    @haystack.push x_haystack_long
    @haystack.push x_haystack
    @tightenings.push @t_1

    assert_equal x_haystack, ltd.improver.match(x_needle)
  end

  should "perform lookups needle to haystack" do
    assert_equal @a_haystack, ltd.improver.match(@a_needle)
  end

  should "succeed if there are no checks" do
    assert_nothing_raised do
      ltd.improver.check @needle
    end
  end

  should "succeed if the positive checks just work" do
    @positives.push([ @a_needle[0], @a_haystack[0] ])

    assert_nothing_raised do
      ltd.improver.check @needle
    end
  end

  should "fail if positive checks don't work" do
    @positives.push([ @d_needle[0], @d_haystack[0] ])

    assert_raises(FuzzyMatch::Improver::Mismatch) do
      ltd.improver.check @needle
    end
  end

  should "succeed if proper tightening is applied" do
    @positives.push([ @d_needle[0], @d_haystack[0] ])
    @tightenings.push @t_1

    assert_nothing_raised do
      ltd.improver.check @needle
    end
  end

  # NOTE(review): the tightenings here come from a remote URL, so this test
  # presumably needs network access and depends on the spreadsheet's current
  # contents -- confirm before relying on it in CI.
  should "use a Google Docs spreadsheet as a source of tightenings" do
    @positives.push([ @d_needle[0], @d_haystack[0] ])
    @tightenings = RemoteTable.new(
      :url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv',
      :headers => false
    )

    # sabshere 9/30/10 this shouldn't raise anything
    # but the tightenings have been changed... we should be using test-only tightenings, not production ones
    # assert_nothing_raised do
    assert_raises(FuzzyMatch::Improver::Mismatch) do
      ltd.improver.check @needle
    end
  end

  should "fail if negative checks don't work" do
    @negatives.push([ @b_needle[0], @c_haystack[0] ])

    assert_raises(FuzzyMatch::Improver::FalsePositive) do
      ltd.improver.check @needle
    end
  end

  should "do inline checking" do
    @negatives.push([ @b_needle[0], @c_haystack[0] ])

    assert_raises(FuzzyMatch::Improver::FalsePositive) do
      ltd.improver.match @b_needle
    end
  end

  should "fail if negative checks don't work, even with tightening" do
    @negatives.push([ @b_needle[0], @c_haystack[0] ])
    @tightenings.push @t_1

    assert_raises(FuzzyMatch::Improver::FalsePositive) do
      ltd.improver.check @needle
    end
  end

  should "succeed if proper identity is applied" do
    @negatives.push([ @b_needle[0], @c_haystack[0] ])
    @positives.push([ @d_needle[0], @d_haystack[0] ])
    @identities.push @r_1

    assert_nothing_raised do
      ltd.improver.check @needle
    end
  end
end