describe "NameSpotter" do
# The class itself is the subject, so class-level methods are called as
# `subject.version`, `subject.english?`, etc.
subject { NameSpotter }
# One NameSpotter instance per client class.
let(:neti) { subject.new(subject::NetiNetiClient.new) }
let(:tf) { subject.new(subject::TaxonFinderClient.new) }
# Both instances, for examples that must hold regardless of the client.
let(:clients) { [neti, tf] }
# The reported version must contain a dotted three-part number.
describe ".version" do
  it "returns version" do
    version = subject.version
    expect(version).to match(/\d+\.\d+\.\d+/)
  end
end
# Language detection is checked against three fixture files read from disk.
describe ".english?" do
  let(:eng) { read("english.txt") }
  let(:eng2) { read("journalofentomol13pomo_0018.txt") }
  let(:eng3) { read("journalofentomol13pomo_0063.txt") }

  it "detects english" do
    # Each fixture is paired with the verdict expected for it.
    samples = [[eng, true], [eng2, true], [eng3, false]]
    # Repeated 100 times: the verdict must be the same on every call.
    100.times do
      samples.each do |sample, verdict|
        expect(subject.english?(sample)).to be verdict
      end
    end
  end
end
# Construction with either client must yield a NameSpotter instance.
describe ".new" do
  it "works" do
    [neti, tf].each do |spotter|
      expect(spotter).to be_a_kind_of(NameSpotter)
    end
  end
end
describe "#find" do
# nil and "" are both treated as empty input by every client.
context "empty text" do
  it "returns empty list" do
    empty_json = "{\"names\":[]}"
    empty_xml = "\n\n"
    clients.each do |client|
      # Without a format argument the result is a plain hash.
      expect(client.find(nil)).to eq({ names: [] })
      [nil, ''].each do |blank|
        expect(client.find(blank, 'json')).to eq empty_json
        expect(client.find(blank, "xml")).to eq empty_xml
      end
    end
  end
end
# Plain prose without any scientific name must produce no hits.
context "text without sci names" do
  let(:text) { "one two three, no scientific names" }

  it "returns empty list" do
    nothing_found = { names: [] }
    clients.each { |client| expect(client.find(text)).to eq(nothing_found) }
  end
end
# A text consisting of a single binomial must yield exactly one name
# from every client.
context "text with one sci name" do
  let(:text) { "Pardosa moesta" }

  # The description states what is asserted: one name, not an empty list.
  it "returns one name" do
    clients.each do |c|
      expect(c.find(text)[:names].size).to eq 1
    end
  end
end
# Texts with several names, including abbreviated genera and names broken
# across lines.
context "text with several names" do
  let(:text) do
    "Some text that has Betula\n alba and Mus musculus " \
    "and \neven B. alba and even M. mus-\nculus and " \
    "unicoded name Aranea röselii. Also it has name " \
    "unknown before: Varanus bitatawa"
  end
  let(:text2) do
    "Some another text that has Xysticus \ncanadensis and " \
    "Pardosa moesta and \neven X. canadensis and even " \
    "P. mo-\nesta."
  end
  # Scientific names each client is expected to report for `text`.
  let(:neti_names) do
    ["Betula alba", "Mus musculus",
     "B. alba", "Aranea röselii", "Varanus bitatawa"]
  end
  let(:tf_names) do
    ["Betula alba", "Mus musculus",
     "B[etula] alba", "Aranea röselii",
     "Varanus"]
  end

  # Runs a search and reduces the result to its list of scientific names.
  def scientific_names(finder, content)
    finder.find(content)[:names].map { |found| found[:scientificName] }
  end

  it "returns names" do
    expect(scientific_names(neti, text)).to eq neti_names
    expect(scientific_names(tf, text)).to eq tf_names
  end

  it "forgets previous searches" do
    # Search `text` first so that each client has an earlier search behind it.
    expect(scientific_names(neti, text)).to eq neti_names
    expect(scientific_names(tf, text)).to eq tf_names
    # The second search must report only the names found in `text2`.
    expect(scientific_names(neti, text2)).to eq ['Xysticus canadensis',
                                                 'Pardosa moesta',
                                                 'X. canadensis']
    expect(scientific_names(tf, text2)).to eq ['Xysticus canadensis',
                                               'Pardosa moesta',
                                               'X[ysticus] canadensis']
  end
end
# Checks the offsetStart/offsetEnd values reported for each found name.
context "offsets" do
  let(:text3) do
    "\r\r\n>':¥/. \r\nA text with multibyte characters " \
    "नेति नेति: Some text that has Betula\n alba and " \
    "Mus musculus and \neven B. alba and even M. " \
    "mus-\nculus. Also it has name " \
    "unknown before: Varanus bitatawa species"
  end
  let(:text4) do
    "We have to be sure that Betula\n alba and " \
    "PSEUDOSCORPIONIDA and ×Inkea which is not " \
    "Passeriformes. We also have another hybrid Passiflora " \
    "×rosea and Aranea röselii and capitalized ARANEA " \
    "RÖSELII and Pardosa\n moesta f. moesta Banks, 1892 " \
    "all get their offsets"
  end
  let(:text5) { read "journalofentomol13pomo_0063.txt" }

  it "return correct names with multibyte chars" do
    # This test depends on the NetiNeti tornado server, not on
    # NameSpotter itself. The issue and the fix:
    # https://github.com/mbl-cli/NetiNeti/pull/1
    res = neti.find(text3)[:names]
    # Slicing the input by the reported offsets must give back the
    # verbatim string.
    res.each do |name|
      verbatim = name[:verbatim]
      found_name = text3[name[:offsetStart]..name[:offsetEnd]]
      expect(found_name).to eq verbatim
    end
  end

  it "returns offset for all names" do
    res = neti.find(text4)
    tf_res = tf.find(text4)
    expect(res).to eq({names: [
      {verbatim: "Betula\n alba", scientificName: "Betula alba",
       offsetStart: 24, offsetEnd: 35},
      {verbatim: "Passiflora ×rosea", scientificName: "Passiflora ×rosea",
       offsetStart: 126, offsetEnd: 142},
      {verbatim: "Aranea röselii", scientificName: "Aranea röselii",
       offsetStart: 148, offsetEnd: 161},
      {verbatim: "Pardosa\n moesta", scientificName: "Pardosa moesta",
       offsetStart: 198, offsetEnd: 212}
    ]})
    expect(tf_res).to eq({names: [
      {verbatim: "Betula alba", scientificName: "Betula alba",
       offsetStart: 24, offsetEnd: 35},
      {verbatim: "PSEUDOSCORPIONIDA",
       scientificName: "Pseudoscorpionida", offsetStart: 41,
       offsetEnd: 57},
      {verbatim: "Passeriformes.", scientificName: "Passeriformes",
       offsetStart: 83, offsetEnd: 96},
      {verbatim: "Passiflora ×rosea", scientificName: "Passiflora rosea",
       offsetStart: 126, offsetEnd: 142},
      {verbatim: "Aranea röselii", scientificName: "Aranea röselii",
       offsetStart: 148, offsetEnd: 161},
      {verbatim: "ARANEA", scientificName: "Aranea", offsetStart: 179,
       offsetEnd: 184},
      {verbatim: "Pardosa moesta f. moesta", scientificName:
       "Pardosa moesta f. moesta", offsetStart: 198, offsetEnd: 222}
    ]})
  end

  it "makes offsets in order with netineti" do
    res = neti.find(text5)
    offsets = res[:names].map { |n| n[:offsetStart] }
    # Compare against a sorted copy: comparing the list with itself can
    # never fail and would not check the ordering at all.
    expect(offsets).to eq offsets.sort
    expect(offsets.first).to eq 67
  end
end
end
# TaxonFinder handling of abbreviated genus names ("P. moesta", "P.moesta").
context "abbreviations" do
  let(:text) do
    "Pardosa moesta Banks, 1892 is one spider, Schizocosa " \
    "ocreata Keyserling, 1887 is a second and a third is " \
    "Schizocosa saltatrix borealis. The abbreviations are P. " \
    "moesta, S. ocreata, and S. saltatrix borealis is the third."
  end
  let(:text2) do
    "Pardosa moesta! If we encounter Pardosa moesta and then P.modica " \
    "another name I know is Xenopus laevis and also P.moesta. Again " \
    "without space TaxonFinder should find both. And Plantago major foreva"
  end
  let(:text3) do
    "What happens another called P. (LYCOSIDAE) is the species?"
  end

  # Builds one expected entry of the :names list returned by #find.
  def entry(verbatim, scientific_name, first, last)
    { verbatim: verbatim, scientificName: scientific_name,
      offsetStart: first, offsetEnd: last }
  end

  it "ignores abbreviated genus before family for TaxonFinder" do
    found = tf.find(text3)
    expect(found[:names].size).to be 1
    expect(found).to eq(
      { names: [entry("(LYCOSIDAE)", "Lycosidae", 32, 42)] }
    )
  end

  it "preserves TaxonFinder expansions" do
    # Abbreviated genera are expected back in expanded "P[ardosa]" form.
    expanded = [
      entry("Pardosa moesta", "Pardosa moesta", 0, 13),
      entry("Schizocosa ocreata", "Schizocosa ocreata", 42, 59),
      entry("Schizocosa saltatrix borealis",
            "Schizocosa saltatrix borealis", 105, 133),
      entry("P. moesta", "P[ardosa] moesta", 158, 166),
      entry("S. ocreata", "S[chizocosa] ocreata", 169, 178),
      entry("S. saltatrix borealis",
            "S[chizocosa] saltatrix borealis", 185, 205)
    ]
    expect(tf.find(text)).to eq({ names: expanded })
  end

  it "recognizes abbreviations no space (TF)" do
    found = tf.find(text2)
    expect(found).to eq(
      { names: [
        entry("Pardosa moesta", "Pardosa moesta", 0, 13),
        entry("Pardosa moesta", "Pardosa moesta", 32, 45),
        entry("P.modica", "P[ardosa] modica", 56, 63),
        entry("Xenopus laevis", "Xenopus laevis", 88, 101),
        entry("P.moesta", "P[ardosa] moesta", 112, 119),
        entry("Plantago major", "Plantago major", 176, 189)
      ] }
    )
    # Each verbatim string must equal the slice of the input at its
    # reported offsets.
    found[:names].each do |name|
      slice = text2[name[:offsetStart]..name[:offsetEnd]]
      expect(slice).to eq name[:verbatim]
    end
  end
end
context "capitalization" do
  # This is a problem we are aware of.
  let(:text) do
    "We need to make sure that Ophioihrix nidis and " \
    "OPHTOMVXIDAE and also Ophiocynodus and especially " \
    "ASTÉROCHEMIDAE and definitely STFROPHVTIDAE and may be " \
    "Asleronyx excavata should all be capitalized correctly"
  end

  it "does not change capitalization" do
    # Every name is expected back exactly as written, so verbatim and
    # scientificName are the same string in each entry.
    unchanged = [
      ["Ophioihrix nidis", 26, 41],
      ["OPHTOMVXIDAE", 47, 58],
      ["Ophiocynodus", 70, 81],
      ["ASTÉROCHEMIDAE", 98, 111],
      ["STFROPHVTIDAE", 128, 140],
      ["Asleronyx excavata", 153, 170]
    ].map do |name, first, last|
      { verbatim: name, scientificName: name,
        offsetStart: first, offsetEnd: last }
    end
    expect(neti.find(text)).to eq({ names: unchanged })
  end
end
# Input containing "|" where an "l" was most likely intended.
context "OCR errors" do
  let(:pipe) do
    "We need to make sure that Oph|oihrix nidis and " \
    "OPHTOMVX|DAE will not break results"
  end

  it "substitutes | with l" do
    # Both verbatim and scientificName carry the substituted spelling.
    repaired = "Ophloihrix nidis"
    only_hit = { verbatim: repaired, scientificName: repaired,
                 offsetStart: 26, offsetEnd: 41 }
    expect(neti.find(pipe)).to eq({ names: [only_hit] })
  end
end
# TaxonFinder must not accept an arbitrarily long chain of infraspecific
# epithets: the 14-word "Plantago quercus ..." run is expected to be
# skipped, while the 6-word one is reported.
context "extremely nested infraspecies" do
  let(:text) do
    "If we encounter Plantago major it is ok, but if it is " \
    "Plantago quercus quercus quercus quercus quercus quercus " \
    "quercus quercus quercus quercus quercus quercus quercus " \
    "quercus, something is probably not right. However we take " \
    "Plantago quercus quercus quercus quercus quercus by some " \
    "strange reason. Well, the reason is this kind of thing -- " \
    "Pardosa moesta var. moesta f. moesta or something like that"
  end

  it "stops at five infraspecies levels" do
    res = tf.find(text)
    expect(res).to eq(
      {names: [
        {verbatim: "Plantago major", scientificName: "Plantago major",
         offsetStart: 16, offsetEnd: 29},
        {verbatim: "Plantago quercus quercus quercus quercus quercus",
         scientificName: "Plantago quercus quercus quercus quercus quercus",
         offsetStart: 225, offsetEnd: 272},
        {verbatim: "Pardosa moesta var. moesta f. moesta",
         scientificName: "Pardosa moesta var. moesta f. moesta",
         offsetStart: 340, offsetEnd: 375}]}
    )
  end
end
# A species name followed by a parenthesised "Order: Family" pair.
context "nested names" do
  let(:text) do
    "What happens another called Pardosa moesta (Araneae: Lycosidae) is " \
    "the species?"
  end

  it "(TF) handles nested names in one cycle" do
    # Verbatim strings keep the surrounding punctuation; scientificName
    # does not.
    expected_names = [
      { verbatim: "Pardosa moesta", scientificName: "Pardosa moesta",
        offsetStart: 29, offsetEnd: 42 },
      { verbatim: "(Araneae:", scientificName: "Araneae",
        offsetStart: 44, offsetEnd: 52 },
      { verbatim: "Lycosidae)", scientificName: "Lycosidae",
        offsetStart: 54, offsetEnd: 63 }
    ]
    expect(tf.find(text)).to eq({ names: expected_names })
  end
end
# Names containing accented characters must be found, with offsets that
# cover the accented character.
context "diacritics" do
  let(:text) { "Mactra triangula Renieri. Fissurella nubécula Linnó." }

  it "finds names with diacritics" do
    res = tf.find(text)
    expect(res[:names].size).to be 2
    expect(res).to eq(
      {names: [
        {verbatim: "Mactra triangula", scientificName: "Mactra triangula",
         offsetStart: 0, offsetEnd: 15},
        {verbatim: "Fissurella nubécula",
         scientificName: "Fissurella nubécula",
         offsetStart: 26, offsetEnd: 44}]}
    )
  end
end
# Returns the contents of a fixture stored in the "files" directory that
# sits next to this spec file.
def read(file)
  fixture_path = File.join(__dir__, "files", file)
  File.read(fixture_path)
end
end