# encoding: UTF-8
require 'spec_helper'

describe 'Atomizer' do
  before(:all) do
    @parser = Taxamatch::Atomizer.new
  end

  it 'should parse uninomials' do
    @parser.parse('Betula').should == { :all_authors => [], :all_years => [],
      :canonical_form => "Betula", :uninomial => { :string => "Betula",
      :normalized => 'BETULA', :phonetized => "BITILA", :authors => [],
      :years => [], :normalized_authors => [] } }
    @parser.parse('Ærenea Lacordaire, 1872').should == {
      :all_authors => ["LACORDAIRE"], :all_years => [1872],
      :canonical_form => "Aerenea", :uninomial => { :string => "Aerenea",
        :normalized => "AERENEA", :phonetized => "ERINIA",
        :authors => ["Lacordaire"], :years => [1872],
        :normalized_authors => ["LACORDAIRE"] } }
  end

  it 'should parse binomials' do
    @parser.parse('Leœptura laetifica Dow, 1913').should == {
      :all_authors => ["DOW"], :all_years => [1913],
      :canonical_form => "Leoeptura laetifica", :genus => {
      :string => "Leoeptura", :normalized => "LEOEPTURA",
      :phonetized => "LIPTIRA", :authors => [], :years => [],
      :normalized_authors => []}, :species => {
      :string => "laetifica", :normalized => "LAETIFICA",
      :phonetized => "LITIFICA", :authors => ["Dow"],
      :years => [1913], :normalized_authors => ["DOW"] } }
  end

  it 'should parse trinomials' do
    @parser.parse('Hydnellum scrobiculatum zonatum ' +
                  '(Banker) D. Hall et D.E. Stuntz 1972').should ==  {
      :all_authors => ["BANKER", "D HALL", "D E STUNTZ"], :all_years => [1972],
      :canonical_form => "Hydnellum scrobiculatum zonatum", :genus=>{
      :string => "Hydnellum", :normalized => "HYDNELLUM",
      :phonetized => "HIDNILIM", :authors => [], :years => [],
      :normalized_authors => [] }, :species => { :string => "scrobiculatum",
      :normalized => "SCROBICULATUM", :phonetized => "SCRABICILATA",
      :authors => [], :years => [], :normalized_authors => [] },
      :infraspecies => [{ :string => "zonatum", :normalized => "ZONATUM",
      :phonetized => "ZANATA", :authors => ["Banker", "D. Hall", "D.E. Stuntz"],
      :years => [1972], :normalized_authors => ["BANKER", "D HALL",
      "D E STUNTZ"] }] }
  end

  it 'should normalize years to integers' do
    future_year = Time.now.year + 10
    @parser.parse("Hydnellum scrobiculatum Kern #{future_year} " +
                  "zonatum (Banker) D. Hall et D.E. Stuntz 1972?").should == {
      :all_authors => ["KERN", "BANKER", "D HALL", "D E STUNTZ"],
      :all_years => [1972],
      :canonical_form => "Hydnellum scrobiculatum zonatum", :genus => {
      :string => "Hydnellum", :normalized => "HYDNELLUM",
      :phonetized => "HIDNILIM", :authors => [], :years => [],
      :normalized_authors => [] }, :species => { :string => "scrobiculatum",
      :normalized => "SCROBICULATUM", :phonetized => "SCRABICILATA",
      :authors => ["Kern"], :years => [], :normalized_authors => ["KERN"] },
      :infraspecies => [{ :string => "zonatum", :normalized => "ZONATUM",
      :phonetized => "ZANATA", :authors =>
      ["Banker", "D. Hall", "D.E. Stuntz"], :years => [1972],
      :normalized_authors => ["BANKER", "D HALL", "D E STUNTZ"] }] }
  end

  it 'should normalize names with abbreviated genus after cf.' do
    @parser.parse('Unio cf. U. alba').should == { :all_authors => [],
      :all_years => [], :canonical_form => "Unio",
      :genus => { :string => "Unio", :normalized => "UNIO",
      :phonetized => "UNIA", :authors => [], :years => [],
      :normalized_authors => [] } }
  end

  it 'should parse names which broke it before' do
    ['Parus caeruleus species complex',
     'Euxoa nr. idahoensis sp. 1clay',
     'Cetraria islandica ? islandica',
     'Buteo borealis ? ventralis'].each do |n|
      res = @parser.parse(n)
      res.class.should == Hash
      res.empty?.should be_false
    end
  end
end


describe 'Taxamatch::Normalizer' do
  it 'should normalize  strings' do
    Taxamatch::Normalizer.normalize('abcd').should == 'ABCD'
    Taxamatch::Normalizer.normalize('Leœptura').should == 'LEOEPTURA'
    Taxamatch::Normalizer.normalize('Ærenea').should == 'AERENEA'
    Taxamatch::Normalizer.normalize('Fallén').should == 'FALLEN'
    Taxamatch::Normalizer.normalize('Fallé€n').should == 'FALLE?N'
    Taxamatch::Normalizer.normalize('Fallén привет').should == 'FALLEN ??????'
    Taxamatch::Normalizer.normalize('Choriozopella trägårdhi').should ==
      'CHORIOZOPELLA TRAGARDHI'
    Taxamatch::Normalizer.normalize('×Zygomena').should == 'xZYGOMENA'
  end

  it 'should normalize words' do
    Taxamatch::Normalizer.normalize_word('L-3eœ|pt[ura$').should ==
      'L-3EOEPTURA'
  end
end

describe 'Taxamatch::Base' do
  before(:all) do
    @tm = Taxamatch::Base.new
  end

  it 'should get txt tests' do
    test_file = File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt'
    read_test_file(test_file, 4) do |y|
      if y
        y[2] = y[2] == 'true' ? true : false
        res = @tm.taxamatch(y[0], y[1], false)
        # puts "%s, %s, %s, %s" % [y[0], y[1], y[2], y[3]]
        res['match'].should == y[2]
        res['edit_distance'].should == y[3].to_i
      end
    end
  end

  it 'should work with names that cannot be parsed' do
    res = @tm.taxamatch('Quadraspidiotus ostreaeformis MacGillivray, 1921',
                        'Quadraspidiotus ostreaeformis Curtis)')
    res = false
  end

  it 'should compare genera' do
    # edit distance 1 always match
    g1 = make_taxamatch_hash 'Plantago'
    g2 = make_taxamatch_hash 'Plantagon'
    @tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
      'edit_distance' => 1, 'match' => true }
    # edit_distance above threshold does not math
    g1 = make_taxamatch_hash 'Plantago'
    g2 = make_taxamatch_hash 'This shouldnt match'
    @tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
      'match' => false, 'edit_distance' => 4 }
    # phonetic_match matches
    g1 = make_taxamatch_hash 'Plantagi'
    g2 = make_taxamatch_hash 'Plantagy'
    @tm.match_genera(g1, g2).should == { 'phonetic_match' => true,
      'edit_distance' => 1, 'match' => true }
    @tm.match_genera(g1, g2, :with_phonetic_match => false).should == {
      'phonetic_match' => false, 'edit_distance' => 1, 'match' => true }
    # distance 1 in first letter also matches
    g1 = make_taxamatch_hash 'Xantheri'
    g2 = make_taxamatch_hash 'Pantheri'
    @tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
      'edit_distance' => 1, 'match' => true }
    # phonetic match tramps everything
    g1 = make_taxamatch_hash 'Xaaaaantheriiiiiiiiiiiiiii'
    g2 = make_taxamatch_hash 'Zaaaaaaaaaaaantheryyyyyyyy'
    @tm.match_genera(g1, g2).should == { 'phonetic_match' => true,
      'edit_distance' => 4, 'match' => true }
    @tm.match_genera(g1, g2, :with_phonetic_match => false).should == {
      'phonetic_match' => false, 'edit_distance' => 4, 'match' => false }
    # same first letter and distance 2 should match
    g1 = make_taxamatch_hash 'Xaaaantherii'
    g2 = make_taxamatch_hash 'Xaaaantherrr'
    @tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
      'match' => true, 'edit_distance' => 2 }
    # First letter is the same and distance is 3 should match, no phonetic match
    g1 = make_taxamatch_hash 'Xaaaaaaaaaaantheriii'
    g2 = make_taxamatch_hash 'Xaaaaaaaaaaantherrrr'
    @tm.match_genera(g1, g2).should ==
      { 'phonetic_match' => false, 'match' => true, 'edit_distance' => 3 }
    # Should not match if one of words is shorter than 2x edit
    # distance and distance is 2 or 3
    g1 = make_taxamatch_hash 'Xant'
    g2 = make_taxamatch_hash 'Xanthe'
    @tm.match_genera(g1, g2).should ==  { 'phonetic_match' => false,
      'match' => false, 'edit_distance' => 2 }
    # Should not match if edit distance > 3 and no phonetic match
    g1 = make_taxamatch_hash 'Xantheriiii'
    g2 = make_taxamatch_hash 'Xantherrrrr'
    @tm.match_genera(g1, g2).should ==  { 'phonetic_match' => false,
      'match' => false, 'edit_distance' => 4 }
  end

  it 'should compare species' do
    # Exact match
    s1 = make_taxamatch_hash 'major'
    s2 = make_taxamatch_hash 'major'
    @tm.match_species(s1, s2).should ==  { 'phonetic_match' => true,
      'match' => true, 'edit_distance' => 0 }
    @tm.match_species(s1, s2, :with_phonetic_match => false).should == {
      'phonetic_match' => false, 'match' => true, 'edit_distance' => 0 }
    # Phonetic match always works
    s1 = make_taxamatch_hash 'xanteriiieeeeeeeeeeeee'
    s2 = make_taxamatch_hash 'zantereeeeeeeeeeeeeeee'
    @tm.match_species(s1, s2).should ==  { 'phonetic_match' => true,
      'match' => true, 'edit_distance' => 4 }
    @tm.match_species(s1, s2, :with_phonetic_match => false).should ==
      { 'phonetic_match' => false, 'match' => false, 'edit_distance' => 4 }
    # Phonetic match works with different endings
    s1 = make_taxamatch_hash 'majorum'
    s2 = make_taxamatch_hash 'majoris'
    @tm.match_species(s1, s2).should ==  {
      'phonetic_match' => true, 'match' => true, 'edit_distance' => 2 }
    @tm.match_species(s1, s2, :with_phonetic_match => false).should ==
      { 'phonetic_match' => false, 'match' => true, 'edit_distance' => 2 }
    # Distance 4 matches if first 3 chars are the same
    s1 = make_taxamatch_hash 'majjjjorrrrr'
    s2 = make_taxamatch_hash 'majjjjoraaaa'
    @tm.match_species(s1, s2).should ==
      { 'phonetic_match' => false, 'match' => true, 'edit_distance' => 4 }
    # Should not match if Distance 4 matches and first 3 chars are not the same
    s1 = make_taxamatch_hash 'majorrrrr'
    s2 = make_taxamatch_hash 'marorraaa'
    @tm.match_species(s1, s2).should == {
      'phonetic_match' => false, 'match' => false, 'edit_distance' => 4 }
    # Distance 2 or 3 matches if first 1 char is the same
    s1 = make_taxamatch_hash 'moooorrrr'
    s2 = make_taxamatch_hash 'mooooraaa'
    @tm.match_species(s1, s2).should == { 'phonetic_match' => false,
      'match' => true, 'edit_distance' => 3 }
    # Should not match if Distance 2 or 3 and first 1 char is not the same
    s1 = make_taxamatch_hash 'morrrr'
    s2 = make_taxamatch_hash 'torraa'
    @tm.match_species(s1, s2).should == {
      'phonetic_match' => false, 'match' => false, 'edit_distance' => 3 }
    # Distance 1 will match anywhere
    s1 = make_taxamatch_hash 'major'
    s2 = make_taxamatch_hash 'rajor'
    @tm.match_species(s1, s2).should == {
      'phonetic_match' => false, 'match' => true, 'edit_distance' => 1 }
    # Will not match if distance 3 and length is less then twice
    # of the edit distance
    s1 = make_taxamatch_hash 'marrr'
    s2 = make_taxamatch_hash 'maaaa'
    @tm.match_species(s1, s2).should == {
      'phonetic_match' => false, 'match' => false, 'edit_distance' => 3 }
  end

  it 'should match matches' do
    # No trobule case
    gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 1 }
    smatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 1 }
    @tm.match_matches(gmatch, smatch).should ==
      { 'phonetic_match' => true, 'edit_distance' => 2, 'match' => true }
    # Will not match if either genus or sp. epithet dont match
    gmatch = { 'match' => false,
      'phonetic_match' => false, 'edit_distance' => 1 }
    smatch = { 'match' => true,
      'phonetic_match' => true, 'edit_distance' => 1 }
    @tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
      'edit_distance' => 2, 'match' => false }
    gmatch = { 'match' => true, 'phonetic_match' => true,
      'edit_distance' => 1 }
    smatch = { 'match' => false, 'phonetic_match' => false,
      'edit_distance' => 1 }
    @tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
      'edit_distance' => 2, 'match' => false }
    # Should not match if binomial edit distance > 4
    # NOTE: EVEN with full phonetic match
    gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 3 }
    smatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 2 }
    @tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => true,
      'edit_distance' => 5, 'match' => false }
    # Should not have phonetic match if one of the components
    # does not match phonetically
    gmatch = { 'match' => true,
      'phonetic_match' => false, 'edit_distance' => 1 }
    smatch = { 'match' => true,
      'phonetic_match' => true, 'edit_distance' => 1 }
    @tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
      'edit_distance' => 2, 'match' => true }
    gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 1 }
    smatch = { 'match' => true,
      'phonetic_match' => false, 'edit_distance' => 1 }
    @tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
      'edit_distance' => 2, 'match' => true }
    # edit distance should be equal the sum of of edit distances
    gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 2 }
    smatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 2 }
    @tm.match_matches(gmatch, smatch).should == {
      'phonetic_match'=>true, 'edit_distance'=>4, 'match'=>true }
  end

  it 'should return only boolean values' do
    @tm.taxamatch("AJLJljljlj", "sls").should_not be_nil
    @tm.taxamatch('Olsl','a')
  end

  it "should not match authors from different parts of name" do
    parser = Taxamatch::Atomizer.new
    t = Taxamatch::Base.new
    n1 = parser.parse "Betula Linnaeus"
    n2 = parser.parse "Betula alba Linnaeus"
    n3 = parser.parse "Betula alba alba Linnaeus"
    n4 = parser.parse "Betula alba L."
    n5 = parser.parse "Betula alba"
    n6 = parser.parse "Betula olba"
    n7 = parser.parse "Betula alba Linnaeus alba"
    n8 = parser.parse "Betula alba Linnaeus alba Smith"
    n9 = parser.parse "Betula alba Smith alba L."
    n10 = parser.parse "Betula Linn."
    # if one authorship is empty, return 0
    t.match_authors(n1, n5).should == 0
    t.match_authors(n5, n1).should == 0
    t.match_authors(n5, n6).should == 0
    # if authorship matches on different levels ignore
    t.match_authors(n7, n3).should == 0
    t.match_authors(n8, n3).should == -1
    t.match_authors(n2, n8).should == 0
    t.match_authors(n1, n2).should == 0
    # match on infraspecies level
    t.match_authors(n9, n3).should == 1
    # match on species level
    t.match_authors(n2, n4).should == 1
    # match on uninomial level
    t.match_authors(n1, n10).should == 1
  end


  describe 'Taxamatch::Authmatch' do
    before(:all) do
      @am = Taxamatch::Authmatch
    end

    it 'should calculate score' do
      res = @am.authmatch(['Linnaeus', 'Muller'], ['L'], [], [1788])
      res.should == 90
      res = @am.authmatch(['Linnaeus'],['Kurtz'], [], [])
      res.should == 0
      # found all authors, same year
      res = @am.authmatch(['Linnaeus', 'Muller'],
                          ['Muller', 'Linnaeus'], [1766], [1766])
      res.should == 100
      # all authors, 1 year diff
      res = @am.authmatch(['Linnaeus', 'Muller'],
                          ['Muller', 'Linnaeus'], [1767], [1766])
      res.should == 54
      # year is not counted in
      res = @am.authmatch(['Linnaeus', 'Muller'],
                          ['Muller', 'Linnaeus'], [1767], [])
      res.should == 94
      # found all authors on one side, same year
      res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'],
                          ['Muller', 'Linnaeus'], [1767], [1767])
      res.should == 91
      # found all authors on one side, 1 year diff
      res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'],
                          ['Muller', 'Linnaeus'], [1766], [1767])
      res.should == 51
      # found all authors on one side, year does not count
      res = @am.authmatch(['Linnaeus', 'Muller'],
                          ['Muller', 'Linnaeus', 'Kurtz'], [1766], [])
      res.should == 90
      # found some authors
      res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'],
                          ['Muller', 'Kurtz', 'Stepanov'], [1766], [])
      res.should == 67
      # if year does not match or not present no match for previous case
      res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'],
                          ['Muller', 'Kurtz', 'Stepanov'], [1766], [1765])
      res.should == 0
    end

    it 'should compare years' do
      @am.compare_years([1882],[1880]).should == 2
      @am.compare_years([1882],[]).should == nil
      @am.compare_years([],[]).should == 0
      @am.compare_years([1788,1798], [1788,1798]).should be_nil
    end

    it 'should remove duplicate authors' do
      # Li submatches Linnaeus and it its size 3 is big enought to remove
      # Linnaeus Muller is identical
      res = @am.remove_duplicate_authors(['Lin', 'Muller'],
                                         ['Linnaeus', 'Muller'])
      res.should == [[], []]
      # same in different order
      res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
                                         ['Linn', 'Muller'])
      res.should == [[], []]
      # auth Li submatches Linnaeus, but Li size less then 3
      # required to remove Linnaeus
      res = @am.remove_duplicate_authors(['Dem', 'Li'],
                                         ['Linnaeus', 'Stepanov'])
      res.should == [["Dem"], ["Linnaeus", "Stepanov"]]
      # fuzzy match
      res = @am.remove_duplicate_authors(['Dem', 'Lennaeus'],
                                         ['Linnaeus', 'Stepanov'])
      res.should == [["Dem"], ["Stepanov"]]
      res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
                                         ['L', 'Kenn'])
      res.should == [['Linnaeus', 'Muller'], ['Kenn']]
      res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
                                         ['Muller', 'Linnaeus', 'Kurtz'])
      res.should == [[],['Kurtz']]
    end

    it 'should fuzzy match authors' do
      res = @am.fuzzy_match_authors('L', 'Muller')
      res.should be_false
    end

  end

end