# Specs for NameSpotter driven through both of its client back ends
# (NetiNetiClient and TaxonFinderClient). Fixture texts are loaded from the
# "files" directory next to this spec via the `read` helper defined at the
# bottom of the group.
describe "NameSpotter" do
  subject { NameSpotter }

  let(:neti) { subject.new(subject::NetiNetiClient.new) }
  let(:tf) { subject.new(subject::TaxonFinderClient.new) }
  let(:clients) { [neti, tf] }

  describe ".version" do
    it "returns version" do
      expect(subject.version).to match(/\d+\.\d+\.\d+/)
    end
  end

  describe ".english?" do
    let(:eng) { read("english.txt") }
    let(:eng2) { read("journalofentomol13pomo_0018.txt") }
    let(:eng3) { read("journalofentomol13pomo_0063.txt") }

    it "detects english" do
      # Repeated on purpose; presumably guards against unstable detection
      # between calls -- confirm before reducing the count.
      100.times do
        expect(subject.english?(eng)).to be true
        expect(subject.english?(eng2)).to be true
        expect(subject.english?(eng3)).to be false
      end
    end
  end

  describe ".new" do
    it "works" do
      expect(neti).to be_kind_of NameSpotter
      expect(tf).to be_kind_of NameSpotter
    end
  end

  describe "#find" do
    context "empty text" do
      it "returns empty list" do
        clients.each do |c|
          expect(c.find(nil)).to eq({ names: [] })
          expect(c.find(nil, 'json')).to eq "{\"names\":[]}"
          # NOTE(review): the expected XML is only two newlines here; confirm
          # this matches what the XML formatter really emits for no names.
          expect(c.find(nil, "xml")).to eq "\n\n"
          expect(c.find('', 'json')).to eq "{\"names\":[]}"
          expect(c.find('', "xml")).to eq "\n\n"
        end
      end
    end

    context "text without sci names" do
      let(:text) { "one two three, no scientific names" }

      it "returns empty list" do
        clients.each do |c|
          expect(c.find(text)).to eq({ names: [] })
        end
      end
    end

    context "text with one sci name" do
      let(:text) { "Pardosa moesta" }

      it "returns one name" do
        clients.each do |c|
          expect(c.find(text)[:names].size).to eq 1
        end
      end
    end

    context "text with several names" do
      let(:text) do
        "Some text that has Betula\n alba and Mus musculus " \
        "and \neven B. alba and even M. mus-\nculus and " \
        "unicoded name Aranea röselii. Also it has name " \
        "unknown before: Varanus bitatawa"
      end

      let(:text2) do
        "Some another text that has Xysticus \ncanadensis and " \
        "Pardosa moesta and \neven X. canadensis and even " \
        "P. mo-\nesta."
      end

      # Runs +client+ over +input+ and returns only the scientific names.
      def sci_names(client, input)
        client.find(input)[:names].map { |n| n[:scientificName] }
      end

      it "returns names" do
        expect(sci_names(neti, text)).to eq [
          "Betula alba", "Mus musculus", "B. alba",
          "Aranea röselii", "Varanus bitatawa"
        ]
        expect(sci_names(tf, text)).to eq [
          "Betula alba", "Mus musculus", "B[etula] alba",
          "Aranea röselii", "Varanus"
        ]
      end

      it "forgets previous searches" do
        expect(sci_names(neti, text)).to eq [
          "Betula alba", "Mus musculus", "B. alba",
          "Aranea röselii", "Varanus bitatawa"
        ]
        expect(sci_names(tf, text)).to eq [
          "Betula alba", "Mus musculus", "B[etula] alba",
          "Aranea röselii", "Varanus"
        ]
        # A second, unrelated text must not be affected by the first search.
        expect(sci_names(neti, text2)).to eq [
          'Xysticus canadensis', 'Pardosa moesta', 'X. canadensis'
        ]
        expect(sci_names(tf, text2)).to eq [
          'Xysticus canadensis', 'Pardosa moesta', 'X[ysticus] canadensis'
        ]
      end
    end

    context "offsets" do
      let(:text3) do
        "\r\r\n>':¥/. \r\nA text with multibyte characters " \
        "नेति नेति: Some text that has Betula\n alba and " \
        "Mus musculus and \neven B. alba and even M. " \
        "mus-\nculus. Also it has name " \
        "unknown before: Varanus bitatawa species"
      end

      let(:text4) do
        "We have to be sure that Betula\n alba and " \
        "PSEUDOSCORPIONIDA and ×Inkea which is not " \
        "Passeriformes. We also have another hybrid Passiflora " \
        "×rosea and Aranea röselii and capitalized ARANEA " \
        "RÖSELII and Pardosa\n moesta f. moesta Banks, 1892 " \
        "all get their offsets"
      end

      let(:text5) { read "journalofentomol13pomo_0063.txt" }

      it "return correct names with multibyte chars" do
        # this test depends on netineti tornado server, not on
        # namespotter itself. Go and fix that!
        # the issue and the fix: https://github.com/mbl-cli/NetiNeti/pull/1
        res = neti.find(text3)[:names]
        expect_offsets_to_match_verbatim(text3, res)
      end

      it "returns offset for all names" do
        res = neti.find(text4)
        tf_res = tf.find(text4)

        expect(res).to eq(
          { names: [
            { verbatim: "Betula\n alba",
              scientificName: "Betula alba",
              offsetStart: 24, offsetEnd: 35 },
            { verbatim: "Passiflora ×rosea",
              scientificName: "Passiflora ×rosea",
              offsetStart: 126, offsetEnd: 142 },
            { verbatim: "Aranea röselii",
              scientificName: "Aranea röselii",
              offsetStart: 148, offsetEnd: 161 },
            { verbatim: "Pardosa\n moesta",
              scientificName: "Pardosa moesta",
              offsetStart: 198, offsetEnd: 212 }
          ] }
        )

        expect(tf_res).to eq(
          { names: [
            { verbatim: "Betula alba",
              scientificName: "Betula alba",
              offsetStart: 24, offsetEnd: 35 },
            { verbatim: "PSEUDOSCORPIONIDA",
              scientificName: "Pseudoscorpionida",
              offsetStart: 41, offsetEnd: 57 },
            { verbatim: "Passeriformes.",
              scientificName: "Passeriformes",
              offsetStart: 83, offsetEnd: 96 },
            { verbatim: "Passiflora ×rosea",
              scientificName: "Passiflora rosea",
              offsetStart: 126, offsetEnd: 142 },
            { verbatim: "Aranea röselii",
              scientificName: "Aranea röselii",
              offsetStart: 148, offsetEnd: 161 },
            { verbatim: "ARANEA",
              scientificName: "Aranea",
              offsetStart: 179, offsetEnd: 184 },
            { verbatim: "Pardosa moesta f. moesta",
              scientificName: "Pardosa moesta f. moesta",
              offsetStart: 198, offsetEnd: 222 }
          ] }
        )
      end

      it "makes offsets in order with netineti" do
        res = neti.find(text5)
        offsets = res[:names].map { |n| n[:offsetStart] }
        # Compare against the sorted copy; comparing the list with itself
        # would pass regardless of ordering.
        expect(offsets).to eq offsets.sort
        expect(offsets[0]).to eq 67
      end
    end
  end

  context "abbreviations" do
    let(:text) do
      "Pardosa moesta Banks, 1892 is one spider, Schizocosa " \
      "ocreata Keyserling, 1887 is a second and a third is " \
      "Schizocosa saltatrix borealis. The abbreviations are P. " \
      "moesta, S. ocreata, and S. saltatrix borealis is the third."
    end

    let(:text2) do
      "Pardosa moesta! If we encounter Pardosa moesta and then P.modica " \
      "another name I know is Xenopus laevis and also P.moesta. Again " \
      "without space TaxonFinder should find both. And Plantago major foreva"
    end

    let(:text3) do
      "What happens another called P. (LYCOSIDAE) is the species?"
    end

    it "ignores abbreviated genus before family for TaxonFinder" do
      res = tf.find(text3)
      expect(res[:names].size).to be 1
      expect(res).to eq(
        { names: [
          { verbatim: "(LYCOSIDAE)",
            scientificName: "Lycosidae",
            offsetStart: 32, offsetEnd: 42 }
        ] }
      )
    end

    it "preserves TaxonFinder expansions" do
      tf_res = tf.find(text)
      expect(tf_res).to eq(
        { names: [
          { verbatim: "Pardosa moesta",
            scientificName: "Pardosa moesta",
            offsetStart: 0, offsetEnd: 13 },
          { verbatim: "Schizocosa ocreata",
            scientificName: "Schizocosa ocreata",
            offsetStart: 42, offsetEnd: 59 },
          { verbatim: "Schizocosa saltatrix borealis",
            scientificName: "Schizocosa saltatrix borealis",
            offsetStart: 105, offsetEnd: 133 },
          { verbatim: "P. moesta",
            scientificName: "P[ardosa] moesta",
            offsetStart: 158, offsetEnd: 166 },
          { verbatim: "S. ocreata",
            scientificName: "S[chizocosa] ocreata",
            offsetStart: 169, offsetEnd: 178 },
          { verbatim: "S. saltatrix borealis",
            scientificName: "S[chizocosa] saltatrix borealis",
            offsetStart: 185, offsetEnd: 205 }
        ] }
      )
    end

    it "recognizes abbreviations no space (TF)" do
      res = tf.find(text2)
      expect(res).to eq(
        { names: [
          { verbatim: "Pardosa moesta",
            scientificName: "Pardosa moesta",
            offsetStart: 0, offsetEnd: 13 },
          { verbatim: "Pardosa moesta",
            scientificName: "Pardosa moesta",
            offsetStart: 32, offsetEnd: 45 },
          { verbatim: "P.modica",
            scientificName: "P[ardosa] modica",
            offsetStart: 56, offsetEnd: 63 },
          { verbatim: "Xenopus laevis",
            scientificName: "Xenopus laevis",
            offsetStart: 88, offsetEnd: 101 },
          { verbatim: "P.moesta",
            scientificName: "P[ardosa] moesta",
            offsetStart: 112, offsetEnd: 119 },
          { verbatim: "Plantago major",
            scientificName: "Plantago major",
            offsetStart: 176, offsetEnd: 189 }
        ] }
      )
      expect_offsets_to_match_verbatim(text2, res[:names])
    end
  end

  context "capitalization" do
    # this is a problem we are aware of
    let(:text) do
      "We need to make sure that Ophioihrix nidis and " \
      "OPHTOMVXIDAE and also Ophiocynodus and especially " \
      "ASTÉROCHEMIDAE and definitely STFROPHVTIDAE and may be " \
      "Asleronyx excavata should all be capitalized correctly"
    end

    it "does not change capitalization" do
      res = neti.find(text)
      expect(res).to eq(
        { names: [
          { verbatim: "Ophioihrix nidis",
            scientificName: "Ophioihrix nidis",
            offsetStart: 26, offsetEnd: 41 },
          { verbatim: "OPHTOMVXIDAE",
            scientificName: "OPHTOMVXIDAE",
            offsetStart: 47, offsetEnd: 58 },
          { verbatim: "Ophiocynodus",
            scientificName: "Ophiocynodus",
            offsetStart: 70, offsetEnd: 81 },
          { verbatim: "ASTÉROCHEMIDAE",
            scientificName: "ASTÉROCHEMIDAE",
            offsetStart: 98, offsetEnd: 111 },
          { verbatim: "STFROPHVTIDAE",
            scientificName: "STFROPHVTIDAE",
            offsetStart: 128, offsetEnd: 140 },
          { verbatim: "Asleronyx excavata",
            scientificName: "Asleronyx excavata",
            offsetStart: 153, offsetEnd: 170 }
        ] }
      )
    end
  end

  context "OCR errors" do
    let(:pipe) do
      "We need to make sure that Oph|oihrix nidis and " \
      "OPHTOMVX|DAE will not break results"
    end

    it "substitutes | with l" do
      res = neti.find(pipe)
      expect(res).to eq(
        { names: [
          { verbatim: "Ophloihrix nidis",
            scientificName: "Ophloihrix nidis",
            offsetStart: 26, offsetEnd: 41 }
        ] }
      )
    end
  end

  context "extremely nexted infraspecies" do
    let(:text) do
      "If we encounter Plantago major it is ok, but if it is " \
      "Plantago quercus quercus quercus quercus quercus quercus " \
      "quercus quercus quercus quercus quercus quercus quercus " \
      "quercus, something is probably not right. However we take " \
      "Plantago quercus quercus quercus quercus quercus by some " \
      "strange reason. Well, the reason is this kind of thing -- " \
      "Pardosa moesta var. moesta f. moesta or something like that"
    end

    it "stops at five infraspecies levels" do
      res = tf.find(text)
      expect(res).to eq(
        { names: [
          { verbatim: "Plantago major",
            scientificName: "Plantago major",
            offsetStart: 16, offsetEnd: 29 },
          { verbatim: "Plantago quercus quercus quercus quercus quercus",
            scientificName: "Plantago quercus quercus quercus quercus quercus",
            offsetStart: 225, offsetEnd: 272 },
          { verbatim: "Pardosa moesta var. moesta f. moesta",
            scientificName: "Pardosa moesta var. moesta f. moesta",
            offsetStart: 340, offsetEnd: 375 }
        ] }
      )
    end
  end

  context "nested names" do
    let(:text) do
      "What happens another called Pardosa moesta (Araneae: Lycosidae) is " \
      "the species?"
    end

    it "(TF) handles nested names in one cycle" do
      res = tf.find(text)
      expect(res).to eq(
        { names: [
          { verbatim: "Pardosa moesta",
            scientificName: "Pardosa moesta",
            offsetStart: 29, offsetEnd: 42 },
          { verbatim: "(Araneae:",
            scientificName: "Araneae",
            offsetStart: 44, offsetEnd: 52 },
          { verbatim: "Lycosidae)",
            scientificName: "Lycosidae",
            offsetStart: 54, offsetEnd: 63 }
        ] }
      )
    end
  end

  context "diacritics" do
    let(:text) { "Mactra triangula Renieri. Fissurella nubécula Linnó." }

    it "finds names with diacrictics" do
      res = tf.find(text)
      expect(res[:names].size).to be 2
      expect(res).to eq(
        { names: [
          { verbatim: "Mactra triangula",
            scientificName: "Mactra triangula",
            offsetStart: 0, offsetEnd: 15 },
          { verbatim: "Fissurella nubécula",
            scientificName: "Fissurella nubécula",
            offsetStart: 26, offsetEnd: 44 }
        ] }
      )
    end
  end

  # Asserts that, for every found name, slicing +text+ with the inclusive
  # range offsetStart..offsetEnd yields exactly the reported verbatim string.
  def expect_offsets_to_match_verbatim(text, names)
    names.each do |name|
      found_name = text[name[:offsetStart]..name[:offsetEnd]]
      expect(found_name).to eq name[:verbatim]
    end
  end

  # Returns the contents of the fixture +file+ located in the "files"
  # directory that sits next to this spec.
  def read(file)
    File.read(File.join(__dir__, "files", file))
  end
end