lib/taxamatch_rb/base.rb in taxamatch_rb-1.1.0 vs lib/taxamatch_rb/base.rb in taxamatch_rb-1.1.1

- old
+ new

@@ -1,33 +1,39 @@ module Taxamatch - # Matches name strings of scientific names + class Base + def initialize @parser = Taxamatch::Atomizer.new @dlm = DamerauLevenshtein end + + # takes two scientific names and returns true + # if names match and false if they don't def taxamatch(str1, str2, return_boolean = true) preparsed_1 = @parser.parse(str1) preparsed_2 = @parser.parse(str2) - match = taxamatch_preparsed(preparsed_1, preparsed_2) - return_boolean ? (!!match && match["match"]) : match + match = taxamatch_preparsed(preparsed_1, preparsed_2) rescue nil + return_boolean ? (!!match && match['match']) : match end + # takes two hashes of parsed scientific names, analyses them and + # returns back this function is useful when species strings are preparsed. def taxamatch_preparsed(preparsed_1, preparsed_2) result = nil if preparsed_1[:uninomial] && preparsed_2[:uninomial] result = match_uninomial(preparsed_1, preparsed_2) - elsif preparsed_1[:genus] && preparsed_2[:genus] + end + if preparsed_1[:genus] && preparsed_2[:genus] result = match_multinomial(preparsed_1, preparsed_2) end - if result && result["match"] - result["match"] = match_authors(preparsed_1, preparsed_2) + if result && result['match'] + result['match'] = match_authors(preparsed_1, preparsed_2) == -1 ? + false : true end - result - rescue StandardError - nil + return result end def match_uninomial(preparsed_1, preparsed_2) match_genera(preparsed_1[:uninomial], preparsed_2[:uninomial]) end @@ -45,74 +51,74 @@ total_length += preparsed_1[:infraspecies][0][:string].size + preparsed_2[:infraspecies][0][:string].size match_hash = match_matches(gen_match, sp_match, infrasp_match) elsif (preparsed_1[:infraspecies] && !preparsed_2[:infraspecies]) || (!preparsed_1[:infraspecies] && preparsed_2[:infraspecies]) - match_hash = { "match" => false, - "edit_distance" => 5, - "phonetic_match" => false } + match_hash = { 'match' => false, + 'edit_distance' => 5, + 'phonetic_match' => false } total_length += preparsed_1[:infraspecies] ? preparsed_1[:infraspecies][0][:string].size : preparsed_2[:infraspecies][0][:string].size else match_hash = match_matches(gen_match, sp_match) end - match_hash.merge({ "score" => - (1 - match_hash["edit_distance"]/(total_length/2)) }) + match_hash.merge({ 'score' => + (1 - match_hash['edit_distance']/(total_length/2)) }) match_hash end def match_genera(genus1, genus2, opts = {}) genus1_length = genus1[:normalized].size genus2_length = genus2[:normalized].size opts = { with_phonetic_match: true }.merge(opts) min_length = [genus1_length, genus2_length].min unless opts[:with_phonetic_match] - genus1[:phonetized] = "A" - genus2[:phonetized] = "B" + genus1[:phonetized] = 'A' + genus2[:phonetized] = 'B' end match = false ed = @dlm.distance(genus1[:normalized], genus2[:normalized], 1, 3) #TODO put block = 2 - return { "edit_distance" => ed, - "phonetic_match" => false, - "match" => false } if ed/min_length.to_f > 0.2 - return { "edit_distance" => ed, - "phonetic_match" => true, - "match" => true } if genus1[:phonetized] == genus2[:phonetized] + return { 'edit_distance' => ed, + 'phonetic_match' => false, + 'match' => false } if ed/min_length.to_f > 0.2 + return { 'edit_distance' => ed, + 'phonetic_match' => true, + 'match' => true } if genus1[:phonetized] == genus2[:phonetized] match = true if ed <= 3 && (min_length > ed * 2) && (ed < 2 || genus1[0] == genus2[0]) - { "edit_distance" => ed, "match" => match, "phonetic_match" => false } + { 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false } end def match_species(sp1, sp2, opts = {}) sp1_length = sp1[:normalized].size sp2_length = sp2[:normalized].size opts = { with_phonetic_match: true }.merge(opts) min_length = [sp1_length, sp2_length].min unless opts[:with_phonetic_match] - sp1[:phonetized] = "A" - sp2[:phonetized] = "B" + sp1[:phonetized] = 'A' + sp2[:phonetized] = 'B' end sp1[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp1[:phonetized] sp2[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp2[:phonetized] match = false ed = @dlm.distance(sp1[:normalized], sp2[:normalized], 1, 4) #TODO put block 4 - return { "edit_distance" => ed, - "phonetic_match" => false, - "match" => false } if ed/min_length.to_f > 0.3334 - return {"edit_distance" => ed, - "phonetic_match" => true, - "match" => true} if sp1[:phonetized] == sp2[:phonetized] + return { 'edit_distance' => ed, + 'phonetic_match' => false, + 'match' => false } if ed/min_length.to_f > 0.3334 + return {'edit_distance' => ed, + 'phonetic_match' => true, + 'match' => true} if sp1[:phonetized] == sp2[:phonetized] match = true if ed <= 4 && (min_length >= ed * 2) && (ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) && (ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3]) - { "edit_distance" => ed, "match" => match, "phonetic_match" => false } + { 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false } end def match_authors(preparsed_1, preparsed_2) p1 = { normalized_authors: [], years: [] } p2 = { normalized_authors: [], years: [] } @@ -128,27 +134,29 @@ end au1 = p1[:normalized_authors] au2 = p2[:normalized_authors] yr1 = p1[:years] yr2 = p2[:years] - return true if au1.empty? || au2.empty? + return 0 if au1.empty? || au2.empty? score = Taxamatch::Authmatch.authmatch(au1, au2, yr1, yr2) - score == 0 ? false : true + score == 0 ? -1 : 1 end def match_matches(genus_match, species_match, infraspecies_match = nil) match = species_match if infraspecies_match - match["edit_distance"] += infraspecies_match["edit_distance"] - match["match"] &&= infraspecies_match["match"] - match["phonetic_match"] &&= infraspecies_match["phonetic_match"] + match['edit_distance'] += infraspecies_match['edit_distance'] + match['match'] &&= infraspecies_match['match'] + match['phonetic_match'] &&= infraspecies_match['phonetic_match'] end - match["edit_distance"] += genus_match["edit_distance"] - if match["edit_distance"] > (infraspecies_match ? 6 : 4) - match["match"] = false + match['edit_distance'] += genus_match['edit_distance'] + if match['edit_distance'] > (infraspecies_match ? 6 : 4) + match['match'] = false end - match["match"] &&= genus_match["match"] - match["phonetic_match"] &&= genus_match["phonetic_match"] + match['match'] &&= genus_match['match'] + match['phonetic_match'] &&= genus_match['phonetic_match'] match end + end end +