lib/biodiversity/parser/scientific_name_clean.rb in dimus-biodiversity-0.0.13 vs lib/biodiversity/parser/scientific_name_clean.rb in dimus-biodiversity-0.0.15

- old
+ new

@@ -37,10 +37,15 @@ a.value + " × " + b.value end def canonical a.canonical + " × " + b.canonical end + + def pos + a.pos.merge(b.pos) + end + def details {:hybrid => {:scientific_name1 => a.details, :scientific_name2 => b.details}} end end @@ -70,10 +75,14 @@ def canonical a.canonical end + def pos + a.pos + end + def details {:hybrid => {:scientific_name1 => a.details, :scientific_name2 => "?"}} end end @@ -219,10 +228,14 @@ end def canonical a.canonical end + + def pos + a.pos.merge(b.pos).merge(d.pos) + end def details a.details.merge(b.details).merge(c.details(d)).merge({:name_part_verbatim => a.text_value, :auth_part_verbatim => (b.text_value + " " + c.text_value + " " + d.text_value).gsub(/\s{2,}/, ' ').strip}) end end @@ -263,10 +276,14 @@ end def canonical a.canonical end + + def pos + a.pos.merge(c.pos) + end def details a.details.merge(b.details(c)).merge({:name_part_verbatim => a.text_value, :auth_part_verbatim => (b.text_value + " " + c.text_value).gsub(/\s{2,}/, ' ').strip}) end end @@ -306,10 +323,15 @@ a.value + " " + b.value + " " + c.value end def canonical a.canonical end + + def pos + a.pos.merge(b.pos) + end + def details a.details.merge(b.details).merge(c.details).merge({:name_part_verbatim => a.text_value, :auth_part_verbatim => (b.text_value + " " + c.text_value).gsub(/\s{2,}/, ' ').strip}) end end @@ -340,10 +362,15 @@ a.value + " " + b.value end def canonical a.canonical end + + def pos + a.pos.merge(b.pos) + end + def details a.details.merge(b.details).merge({:name_part_verbatim => a.text_value, :auth_part_verbatim => b.text_value.gsub(/\s{2,}/, ' ')}) end end @@ -376,10 +403,14 @@ def canonical a.canonical end + def pos + a.pos.merge(b.pos) + end + def details a.details.merge(b.details).merge({:is_valid => false}).merge({:name_part_verbatim => a.text_value, :auth_part_verbatim => b.text_value.gsub(/\s{2,}/, ' ')}) end end @@ -762,10 +793,15 @@ (a.value + " " + b.value + " " + c.value + " " + d.value).gsub(/\s+/,' ') end def canonical (a.canonical + " " + c.canonical).gsub(/\s+/,' ') end + + def pos + a.pos.merge(b.pos).merge(c.pos).merge(d.pos) + end + def details a.details.merge(c.details).merge({:species_authors=>b.details, :subspecies_authors => d.details}).merge({:name_part_verbatim => a.text_value, :auth_part_verbatim => (b.text_value + " " + c.text_value + " " + d.text_value).gsub(/\s{2,}/, ' ')}) end end @@ -906,11 +942,15 @@ module AuthorsPart1 def value a.value + " " + b.value end - + + def pos + a.pos.merge(b.pos) + end + def details a.details.merge(b.details) end end @@ -939,10 +979,14 @@ module AuthorsPart3 def value a.value + " ex " + b.value end + def pos + a.pos.merge(b.pos) + end + def details {:revised_name_authors => {:revised_authors => a.details[:authors], :authors => b.details[:authors]}} end end @@ -962,10 +1006,15 @@ module AuthorsPart5 def value a.value + " " + b.value end + + def pos + a.pos.merge(b.pos) + end + def details a.details.merge(b.details) end end @@ -1094,10 +1143,15 @@ module SimpleAuthorsPart1 def value a.value + " " + b.value end + + def pos + a.pos.merge(b.pos) + end + def details a.details.merge(b.details) end end @@ -1187,10 +1241,15 @@ module OriginalAuthorsNamesFull1 def value "(" + a.value + " " + b.value + ")" end + + def pos + a.pos.merge(b.pos) + end + def details {:orig_authors => a.details[:authors], :year => b.details[:year]} end end @@ -1218,10 +1277,15 @@ module OriginalAuthorsNamesFull3 def value "(" + a.value + ")" end + + def pos + a.pos + end + def details {:orig_authors => a.details[:authors]} end end @@ -1242,10 +1306,15 @@ module OriginalAuthorsNamesFull5 def value "(" + a.value + ")" end + + def pos + a.pos + end + def details {:orig_authors => a.details[:authors]} end end @@ -1273,10 +1342,15 @@ module OriginalAuthorsNamesFull7 def value "(" + a.value + ")" end + + def pos + a.pos + end + def details {:orig_authors => a.details[:authors]} end end @@ -1548,10 +1622,14 @@ module OriginalAuthorsRevisedName1 def value "(" + a.value + ")" end + def pos + a.pos + end + def details {:original_revised_name_authors => a.details[:revised_name_authors]} end end @@ -1620,10 +1698,15 @@ module AuthorsRevisedName1 def value a.value + " ex " + b.value end + + def pos + a.pos.merge(b.pos) + end + def details {:revised_name_authors =>{:revised_authors => a.details[:authors], :authors => b.details[:authors]}} end end @@ -1688,10 +1771,15 @@ module AuthorsNamesFull1 def value a.value + " " + b.value end + + def pos + a.pos.merge(b.pos) + end + def details {:authors => {:names => a.details[:authors][:names]}.merge(b.details)} end end @@ -1765,10 +1853,15 @@ module UnknownAuth0 def value text_value end + + def pos + {interval.begin => ['unknown_author', interval.end]} + end + def details {:authors => "unknown"} end end @@ -1901,10 +1994,14 @@ module AuthorsNames1 def value sep.apply(a,b) end + def pos + sep.pos(a,b) + end + def details sep.details(a,b) end end @@ -1965,11 +2062,15 @@ def apply(a,b) sep = text_value.strip sep = " et" if ["&","and","et"].include? sep a.value + sep + " " + b.value end - + + def pos(a,b) + a.pos.merge(b.pos) + end + def details(a,b) {:authors => {:names => a.details[:authors][:names] + b.details[:authors][:names]}} end end @@ -2062,11 +2163,15 @@ module AuthorName1 def value a.value + " " + b.value end - + + def pos + a.pos.merge(b.pos) + end + def details {:authors => {:names => [value]}} end end @@ -2125,19 +2230,30 @@ module AuthorWord0 def value text_value.strip end + + def pos + {interval.begin => ['author_word', 1], (interval.begin + 2) => ['author_word', 2], (interval.begin + 5) => ['author_word', 2]} + end + def details {:authors => {:names => [value]}} end end module AuthorWord1 def value text_value.strip end + + def pos + #cheating because there are several words in some of them + {interval.begin => ['author_word', interval.end]} + end + def details {:authors => {:names => [value]}} end end @@ -2146,10 +2262,15 @@ module AuthorWord3 def value text_value.gsub(/\s+/, " ").strip end + + def pos + {interval.begin => ['author_word', interval.end]} + end + def details {:authors => {:names => [value]}} end end @@ -2570,10 +2691,15 @@ a.value + " " + b.value + " " + c.value end def canonical a.canonical end + + def pos + a.pos + end + def details a.details.merge(b.details).merge(c.details) end end @@ -2601,10 +2727,14 @@ end def canonical a.canonical + b.canonical end + def pos + a.pos.merge(b.pos) + end + def details a.details.merge(b.details) end end @@ -2634,10 +2764,14 @@ def canonical value end + def pos + a.pos.merge({b.interval.begin => ['subspecies', b.interval.end]}) + end + def details a.details.merge({:subspecies => {:rank => "n/a", :value =>b.value}}) end end @@ -2794,10 +2928,14 @@ def canonical a.canonical + b.canonical end + def pos + a.pos.merge(b.pos) + end + def details c = a.details[:subspecies] + b.details_subspecies a.details.merge({:subspecies => c, :is_valid => false}) end end @@ -2866,10 +3004,14 @@ sel.apply(a) end def canonical sel.canonical(a) end + + def pos + {a.interval.begin => ['subspecies', a.interval.end]} + end def details sel.details(a) end def details_subspecies details[:subspecies] @@ -3631,10 +3773,15 @@ "× " + a.value + " " + b.value end def canonical a.value + " " + b.value end + + def pos + {a.interval.begin => ['genus', a.interval.end], b.interval.begin => ['species', b.interval.end]} + end + def details {:genus => a.value, :species => b.value, :cross => 'before'} end end @@ -3657,10 +3804,15 @@ "× " + a.value end def canonical a.value end + + def pos + {a.interval.begin => ['uninomial', a.interval.end]} + end + def details {:uninomial => a.value, :cross => 'before'} end end @@ -3691,10 +3843,15 @@ a.value + " × " + b.value end def canonical a.value + " " + b.value end + + def pos + {a.interval.begin => ['genus', a.interval.end], b.interval.begin => ['species', b.interval.end]} + end + def details {:genus => a.value, :species => b.value, :cross => 'inside'} end end @@ -3725,10 +3882,15 @@ a.value + " " + b.value + " " + c.value end def canonical a.value + " " + c.value end + + def pos + {a.interval.begin => ['genus', a.interval.end]}.merge(b.pos).merge({c.interval.begin => ['subspecies', c.interval.end]}) + end + def details {:genus => a.value, :subgenus => b.details, :species => c.value} end end @@ -3752,10 +3914,14 @@ end def canonical value end + def pos + {a.interval.begin => ['genus', a.interval.end], b.interval.begin => ['species', b.interval.end]} + end + def details {:genus => a.value, :species => b.value} end end @@ -3932,10 +4098,15 @@ module Subgenus1 def value "(" + a.value + ")" end + + def pos + {a.interval.begin => ['subgenus', a.interval.end]} + end + def details a.value end end @@ -4117,17 +4288,21 @@ end module CapLatinWord1 def value - a.text_value + b.value + (a.value rescue a.text_value) + b.value end def canonical value end + def pos + {a.interval.begin => ['uninomial', a.interval.end]} + end + def details {:uninomial => value} end end @@ -4141,17 +4316,21 @@ end end module CapLatinWord3 def value - a.text_value + b.value + (a.value rescue a.text_value) + b.value end def canonical value end + def pos + {a.interval.begin => ['uninomial',b.interval.end]} + end + def details {:uninomial => value} end end @@ -4162,10 +4341,14 @@ def canonical value end + def pos + {interval.begin => ['uninomial', interval.end]} + end + def details {:uninomial => value} end end @@ -4177,29 +4360,41 @@ return cached end i0 = index i1, s1 = index, [] - if input.index(Regexp.new('[A-ZŒÆ]'), index) == index - r2 = instantiate_node(SyntaxNode,input, index...(index + 1)) + i2 = index + if input.index(Regexp.new('[A-Z]'), index) == index + r3 = instantiate_node(SyntaxNode,input, index...(index + 1)) @index += 1 else - r2 = nil + r3 = nil end + if r3 + r2 = r3 + else + r4 = _nt_cap_digraph + if r4 + r2 = r4 + else + self.index = i2 + r2 = nil + end + end s1 << r2 if r2 - r3 = _nt_latin_word - s1 << r3 - if r3 + r5 = _nt_latin_word + s1 << r5 + if r5 if input.index("?", index) == index - r4 = instantiate_node(SyntaxNode,input, index...(index + 1)) + r6 = instantiate_node(SyntaxNode,input, index...(index + 1)) @index += 1 else terminal_parse_failure("?") - r4 = nil + r6 = nil end - s1 << r4 + s1 << r6 end end if s1.last r1 = instantiate_node(SyntaxNode,input, i1...index, s1) r1.extend(CapLatinWord0) @@ -4209,245 +4404,257 @@ r1 = nil end if r1 r0 = r1 else - i5, s5 = index, [] - if input.index(Regexp.new('[A-ZŒÆ]'), index) == index - r6 = instantiate_node(SyntaxNode,input, index...(index + 1)) + i7, s7 = index, [] + i8 = index + if input.index(Regexp.new('[A-Z]'), index) == index + r9 = instantiate_node(SyntaxNode,input, index...(index + 1)) @index += 1 else - r6 = nil + r9 = nil end - s5 << r6 - if r6 - r7 = _nt_latin_word - s5 << r7 + if r9 + r8 = r9 + else + r10 = _nt_cap_digraph + if r10 + r8 = r10 + else + self.index = i8 + r8 = nil + end end - if s5.last - r5 = instantiate_node(SyntaxNode,input, i5...index, s5) - r5.extend(CapLatinWord2) - r5.extend(CapLatinWord3) + s7 << r8 + if r8 + r11 = _nt_latin_word + s7 << r11 + end + if s7.last + r7 = instantiate_node(SyntaxNode,input, i7...index, s7) + r7.extend(CapLatinWord2) + r7.extend(CapLatinWord3) else - self.index = i5 - r5 = nil + self.index = i7 + r7 = nil end - if r5 - r0 = r5 + if r7 + r0 = r7 else - i8 = index + i12 = index if input.index("Ca", index) == index - r9 = instantiate_node(SyntaxNode,input, index...(index + 2)) + r13 = instantiate_node(SyntaxNode,input, index...(index + 2)) @index += 2 else terminal_parse_failure("Ca") - r9 = nil + r13 = nil end - if r9 - r8 = r9 - r8.extend(CapLatinWord4) + if r13 + r12 = r13 + r12.extend(CapLatinWord4) else if input.index("Ea", index) == index - r10 = instantiate_node(SyntaxNode,input, index...(index + 2)) + r14 = instantiate_node(SyntaxNode,input, index...(index + 2)) @index += 2 else terminal_parse_failure("Ea") - r10 = nil + r14 = nil end - if r10 - r8 = r10 - r8.extend(CapLatinWord4) + if r14 + r12 = r14 + r12.extend(CapLatinWord4) else if input.index("Ge", index) == index - r11 = instantiate_node(SyntaxNode,input, index...(index + 2)) + r15 = instantiate_node(SyntaxNode,input, index...(index + 2)) @index += 2 else terminal_parse_failure("Ge") - r11 = nil + r15 = nil end - if r11 - r8 = r11 - r8.extend(CapLatinWord4) + if r15 + r12 = r15 + r12.extend(CapLatinWord4) else if input.index("Ia", index) == index - r12 = instantiate_node(SyntaxNode,input, index...(index + 2)) + r16 = instantiate_node(SyntaxNode,input, index...(index + 2)) @index += 2 else terminal_parse_failure("Ia") - r12 = nil + r16 = nil end - if r12 - r8 = r12 - r8.extend(CapLatinWord4) + if r16 + r12 = r16 + r12.extend(CapLatinWord4) else if input.index("Io", index) == index - r13 = instantiate_node(SyntaxNode,input, index...(index + 2)) + r17 = instantiate_node(SyntaxNode,input, index...(index + 2)) @index += 2 else terminal_parse_failure("Io") - r13 = nil + r17 = nil end - if r13 - r8 = r13 - r8.extend(CapLatinWord4) + if r17 + r12 = r17 + r12.extend(CapLatinWord4) else if input.index("Io", index) == index - r14 = instantiate_node(SyntaxNode,input, index...(index + 2)) + r18 = instantiate_node(SyntaxNode,input, index...(index + 2)) @index += 2 else terminal_parse_failure("Io") - r14 = nil + r18 = nil end - if r14 - r8 = r14 - r8.extend(CapLatinWord4) + if r18 + r12 = r18 + r12.extend(CapLatinWord4) else if input.index("Ix", index) == index - r15 = instantiate_node(SyntaxNode,input, index...(index + 2)) + r19 = instantiate_node(SyntaxNode,input, index...(index + 2)) @index += 2 else terminal_parse_failure("Ix") - r15 = nil + r19 = nil end - if r15 - r8 = r15 - r8.extend(CapLatinWord4) + if r19 + r12 = r19 + r12.extend(CapLatinWord4) else if input.index("Lo", index) == index - r16 = instantiate_node(SyntaxNode,input, index...(index + 2)) + r20 = instantiate_node(SyntaxNode,input, index...(index + 2)) @index += 2 else terminal_parse_failure("Lo") - r16 = nil + r20 = nil end - if r16 - r8 = r16 - r8.extend(CapLatinWord4) + if r20 + r12 = r20 + r12.extend(CapLatinWord4) else if input.index("Oa", index) == index - r17 = instantiate_node(SyntaxNode,input, index...(index + 2)) + r21 = instantiate_node(SyntaxNode,input, index...(index + 2)) @index += 2 else terminal_parse_failure("Oa") - r17 = nil + r21 = nil end - if r17 - r8 = r17 - r8.extend(CapLatinWord4) + if r21 + r12 = r21 + r12.extend(CapLatinWord4) else if input.index("Ra", index) == index - r18 = instantiate_node(SyntaxNode,input, index...(index + 2)) + r22 = instantiate_node(SyntaxNode,input, index...(index + 2)) @index += 2 else terminal_parse_failure("Ra") - r18 = nil + r22 = nil end - if r18 - r8 = r18 - r8.extend(CapLatinWord4) + if r22 + r12 = r22 + r12.extend(CapLatinWord4) else if input.index("Ty", index) == index - r19 = instantiate_node(SyntaxNode,input, index...(index + 2)) + r23 = instantiate_node(SyntaxNode,input, index...(index + 2)) @index += 2 else terminal_parse_failure("Ty") - r19 = nil + r23 = nil end - if r19 - r8 = r19 - r8.extend(CapLatinWord4) + if r23 + r12 = r23 + r12.extend(CapLatinWord4) else if input.index("Ua", index) == index - r20 = instantiate_node(SyntaxNode,input, index...(index + 2)) + r24 = instantiate_node(SyntaxNode,input, index...(index + 2)) @index += 2 else terminal_parse_failure("Ua") - r20 = nil + r24 = nil end - if r20 - r8 = r20 - r8.extend(CapLatinWord4) + if r24 + r12 = r24 + r12.extend(CapLatinWord4) else if input.index("Aa", index) == index - r21 = instantiate_node(SyntaxNode,input, index...(index + 2)) + r25 = instantiate_node(SyntaxNode,input, index...(index + 2)) @index += 2 else terminal_parse_failure("Aa") - r21 = nil + r25 = nil end - if r21 - r8 = r21 - r8.extend(CapLatinWord4) + if r25 + r12 = r25 + r12.extend(CapLatinWord4) else if input.index("Ja", index) == index - r22 = instantiate_node(SyntaxNode,input, index...(index + 2)) + r26 = instantiate_node(SyntaxNode,input, index...(index + 2)) @index += 2 else terminal_parse_failure("Ja") - r22 = nil + r26 = nil end - if r22 - r8 = r22 - r8.extend(CapLatinWord4) + if r26 + r12 = r26 + r12.extend(CapLatinWord4) else if input.index("Zu", index) == index - r23 = instantiate_node(SyntaxNode,input, index...(index + 2)) + r27 = instantiate_node(SyntaxNode,input, index...(index + 2)) @index += 2 else terminal_parse_failure("Zu") - r23 = nil + r27 = nil end - if r23 - r8 = r23 - r8.extend(CapLatinWord4) + if r27 + r12 = r27 + r12.extend(CapLatinWord4) else if input.index("La", index) == index - r24 = instantiate_node(SyntaxNode,input, index...(index + 2)) + r28 = instantiate_node(SyntaxNode,input, index...(index + 2)) @index += 2 else terminal_parse_failure("La") - r24 = nil + r28 = nil end - if r24 - r8 = r24 - r8.extend(CapLatinWord4) + if r28 + r12 = r28 + r12.extend(CapLatinWord4) else if input.index("Qu", index) == index - r25 = instantiate_node(SyntaxNode,input, index...(index + 2)) + r29 = instantiate_node(SyntaxNode,input, index...(index + 2)) @index += 2 else terminal_parse_failure("Qu") - r25 = nil + r29 = nil end - if r25 - r8 = r25 - r8.extend(CapLatinWord4) + if r29 + r12 = r29 + r12.extend(CapLatinWord4) else if input.index("As", index) == index - r26 = instantiate_node(SyntaxNode,input, index...(index + 2)) + r30 = instantiate_node(SyntaxNode,input, index...(index + 2)) @index += 2 else terminal_parse_failure("As") - r26 = nil + r30 = nil end - if r26 - r8 = r26 - r8.extend(CapLatinWord4) + if r30 + r12 = r30 + r12.extend(CapLatinWord4) else if input.index("Ba", index) == index - r27 = instantiate_node(SyntaxNode,input, index...(index + 2)) + r31 = instantiate_node(SyntaxNode,input, index...(index + 2)) @index += 2 else terminal_parse_failure("Ba") - r27 = nil + r31 = nil end - if r27 - r8 = r27 - r8.extend(CapLatinWord4) + if r31 + r12 = r31 + r12.extend(CapLatinWord4) else - self.index = i8 - r8 = nil + self.index = i12 + r12 = nil end end end end end @@ -4463,12 +4670,12 @@ end end end end end - if r8 - r0 = r8 + if r12 + r0 = r12 else self.index = i0 r0 = nil end end @@ -4725,10 +4932,63 @@ node_cache[:valid_name_letters][start_index] = r0 return r0 end + module CapDigraph0 + def value + 'Ae' + end + end + + module CapDigraph1 + def value + 'Oe' + end + end + + def _nt_cap_digraph + start_index = index + if node_cache[:cap_digraph].has_key?(index) + cached = node_cache[:cap_digraph][index] + @index = cached.interval.end if cached + return cached + end + + i0 = index + if input.index("Æ", index) == index + r1 = instantiate_node(SyntaxNode,input, index...(index + 1)) + r1.extend(CapDigraph0) + @index += 1 + else + terminal_parse_failure("Æ") + r1 = nil + end + if r1 + r0 = r1 + else + if input.index("Œ", index) == index + r2 = instantiate_node(SyntaxNode,input, index...(index + 1)) + r2.extend(CapDigraph1) + @index += 1 + else + terminal_parse_failure("Œ") + r2 = nil + end + if r2 + r0 = r2 + else + self.index = i0 + r0 = nil + end + end + + node_cache[:cap_digraph][start_index] = r0 + + return r0 + end + module Digraph0 def value 'ae' end end @@ -4746,25 +5006,27 @@ @index = cached.interval.end if cached return cached end i0 = index - if input.index(Regexp.new('[æ]'), index) == index + if input.index("æ", index) == index r1 = instantiate_node(SyntaxNode,input, index...(index + 1)) r1.extend(Digraph0) @index += 1 else + terminal_parse_failure("æ") r1 = nil end if r1 r0 = r1 else - if input.index(Regexp.new('[œ]'), index) == index + if input.index("œ", index) == index r2 = instantiate_node(SyntaxNode,input, index...(index + 1)) r2.extend(Digraph1) @index += 1 else + terminal_parse_failure("œ") r2 = nil end if r2 r0 = r2 else @@ -4842,10 +5104,15 @@ module Year0 def value text_value.strip end + + def pos + {interval.begin => ['year', interval.end]} + end + def details {:year => value} end end @@ -4905,9 +5172,14 @@ module YearWithCharacter1 def value a.text_value end + + def pos + {interval.begin => ['year', interval.end]} + end + def details {:year => value} end end