# encoding: UTF-8
require "unicode_utils"

# Treetop grammar for well-formed scientific names.
#
# Every rule attaches Ruby accessors to the syntax node it produces:
#   value      normalized string form of the matched text
#   canonical  the name without authorship or annotations
#   pos        hash of start offset => [element type, end offset]
#   hybrid     whether the node represents a hybrid
#   details    nested hash (or array of hashes) describing the parsed parts
# Helper nodes (ranks, annotations, separators) expose the same names but take
# the neighbouring node as an argument.
grammar ScientificNameClean

  # Entry point: a name surrounded by optional whitespace. Collapses runs of
  # whitespace in the assembled strings and always returns details as an array.
  rule root
    space a:scientific_name_5 space {
      def value
        a.value.gsub(/\s{2,}/, " ").strip
      end

      def canonical
        a.canonical.gsub(/\s{2,}/, " ").strip
      end

      def pos
        a.pos
      end

      def hybrid
        a.hybrid
      end

      def details
        found = a.details
        found.is_a?(Array) ? found : [found]
      end

      def parser_run
        1
      end
    }
  end

  # Hybrid formula whose second part is only a species (the genus is copied
  # from the first part), or a name followed by a taxon concept reference.
  rule scientific_name_5
    a:multinomial_name space_hard hybrid_character space_hard b:species {
      def value
        a.value + " × " + b.value
      end

      def canonical
        a.canonical + " × " + b.canonical
      end

      def pos
        a.pos.merge(b.pos)
      end

      def hybrid
        true
      end

      def details
        [a.details, b.details.merge(:genus => a.details[:genus])]
      end
    }
    /
    a:scientific_name_1 space b:taxon_concept_rank space c:authorship {
      def value
        a.value + " " + b.apply(c)
      end

      def canonical
        a.canonical
      end

      def pos
        a.pos.merge(c.pos)
      end

      def hybrid
        a.hybrid
      end

      def details
        a.details.merge(b.details(c))
      end
    }
    /
    scientific_name_4
  end

  # Hybrid formula of two full names, or of a name and an unknown ("?") part.
  rule scientific_name_4
    a:scientific_name_1 space hybrid_character space b:scientific_name_1 {
      def value
        a.value + " × " + b.value
      end

      def canonical
        a.canonical + " × " + b.canonical
      end

      def pos
        a.pos.merge(b.pos)
      end

      def hybrid
        true
      end

      def details
        [a.details, b.details]
      end
    }
    /
    a:scientific_name_1 space hybrid_character space [\?]? {
      def value
        a.value + " × ?"
      end

      def canonical
        a.canonical
      end

      def pos
        a.pos
      end

      def hybrid
        true
      end

      def details
        [a.details, "?"]
      end
    }
    /
    scientific_name_3
  end

  # Name prefixed with a hybrid character.
  rule scientific_name_3
    a:hybrid_character space b:scientific_name_2 {
      def value
        a.value + " " + b.value
      end

      def canonical
        b.canonical
      end

      def pos
        b.pos
      end

      def hybrid
        true
      end

      def details
        b.details
      end
    }
    /
    scientific_name_2
  end

  # Name followed by one or more status words.
  rule scientific_name_2
    a:scientific_name_1 space b:status_part {
      def value
        a.value + " " + b.value
      end

      def canonical
        a.canonical
      end

      def pos
        a.pos
      end

      def hybrid
        a.hybrid rescue false
      end

      def details
        a.details.merge(b.details)
      end
    }
    /
    scientific_name_1
  end

  rule scientific_name_1
    multiuninomial_name
    /
    multinomial_name
    /
    uninomial_name
  end

  # One or more status words; details holds them joined with single spaces.
  rule status_part
    a:status_word space b:status_part {
      def value
        a.value + " " + b.value
      end

      def details
        {:status => value}
      end
    }
    /
    status_word
  end

  # A latin word terminated by a period.
  rule status_word
    latin_word [\.] {
      def value
        text_value.strip
      end

      def details
        {:status => value}
      end
    }
    #/
    #latin_word
  end

  # Catch-all for a tail that cannot be parsed; contributes nothing to the
  # value or canonical form but records its text and position.
  rule unparsed
    .+ space {
      def value
        ""
      end

      def hybrid
        false
      end

      def canonical
        ""
      end

      def pos
        {interval.begin => ["unparsed", interval.end]}
      end

      def details
        {:unparsed => text_value}
      end
    }
  end

  # Genus, optional infragenus, optional identification annotation, species
  # and optional infraspecific epithets. The annotation is optional in most
  # alternatives, so its helper methods are only called when it matched.
  rule multinomial_name
    a:genus space b:infragenus space aid:annotation_identification? space c:species space_hard d:infraspecies_mult {
      def value
        a.value + " " + b.value + " " + c.value + " " + d.value
      end

      def canonical
        a.canonical + " " + c.canonical + " " + d.canonical
      end

      def pos
        a.pos.merge(b.pos).merge(c.pos).merge(d.pos)
      end

      def hybrid
        c.hybrid rescue false
      end

      def details
        a.details.merge(b.details).merge(c.details).merge(d.details)
      end
    }
    /
    a:genus space b:infragenus space aid:annotation_identification? space c:species space aid:annotation_identification space d:infraspecies_mult {
      def value
        a.value + " " + b.value + " " + c.value + " " + d.value
      end

      def canonical
        a.canonical + " " + c.canonical + " " + d.canonical
      end

      def pos
        a.pos.merge(b.pos).merge(c.pos).merge(d.pos)
      end

      def hybrid
        c.hybrid rescue false
      end

      def details
        a.details.merge(b.details).merge(c.details).merge(d.details)
      end
    }
    /
    a:genus space b:infragenus space aid:annotation_identification? space c:species {
      def value
        if defined?(aid.apply)
          a.value + " " + b.value + aid.apply(c)
        else
          a.value + " " + b.value + " " + c.value
        end
      end

      def canonical
        if defined?(aid.apply)
          a.canonical + aid.canonical(c)
        else
          a.canonical + " " + c.canonical
        end
      end

      def pos
        if defined?(aid.apply)
          a.pos.merge(b.pos).merge(aid.pos(c))
        else
          a.pos.merge(b.pos).merge(c.pos)
        end
      end

      def hybrid
        c.hybrid rescue false
      end

      def details
        if defined?(aid.apply)
          # aid.details returns a hash; aid.apply returns a string and
          # cannot be merged into the details hash.
          a.details.merge(b.details).merge(aid.details(c))
        else
          a.details.merge(b.details).merge(c.details)
        end
      end
    }
    /
    a:genus space aid:annotation_identification? space b:species space_hard c:infraspecies_mult {
      def value
        a.value + " " + b.value + " " + c.value
      end

      def canonical
        a.canonical + " " + b.canonical + " " + c.canonical
      end

      def pos
        a.pos.merge(b.pos).merge(c.pos)
      end

      def hybrid
        b.hybrid rescue false
      end

      def details
        a.details.merge(b.details).merge(c.details)
      end
    }
    /
    a:genus space aid:annotation_identification? space b:species {
      def value
        if defined?(aid.apply)
          a.value + aid.apply(b)
        else
          a.value + " " + b.value
        end
      end

      def canonical
        if defined?(aid.apply)
          a.canonical + aid.canonical(b)
        else
          a.canonical + " " + b.canonical
        end
      end

      def pos
        if defined?(aid.apply)
          a.pos.merge(aid.pos(b))
        else
          a.pos.merge(b.pos)
        end
      end

      def hybrid
        b.hybrid rescue false
      end

      def details
        if defined?(aid.apply)
          a.details.merge(aid.details(b))
        else
          a.details.merge(b.details)
        end
      end
    }
    /
    a:genus space aid:annotation_identification space b:unparsed {
      def value
        a.value + aid.apply(b)
      end

      def canonical
        a.canonical + aid.canonical(b)
      end

      def pos
        a.pos.merge(aid.pos(b))
      end

      def hybrid
        false
      end

      def details
        a.details.merge(aid.details(b))
      end
    }
  end

  # Two uninomials joined by a rank; only the first one is canonical.
  rule multiuninomial_name
    a:uninomial_name space b:rank_uninomial space c:uninomial_name {
      def value
        a.value + " " + b.value + " " + c.value
      end

      def canonical
        a.canonical
      end

      def hybrid
        false
      end

      def pos
        a.pos.merge(b.pos(c))
      end

      def details
        a.details.merge(b.details(c))
      end
    }
  end

  # One or more infraspecific epithets; details[:infraspecies] is always an
  # array with one entry per epithet.
  rule infraspecies_mult
    a:infraspecies space b:infraspecies_mult {
      def value
        a.value + " " + b.value
      end

      def canonical
        a.canonical + " " + b.canonical
      end

      def pos
        a.pos.merge(b.pos)
      end

      def details
        head = a.details[:infraspecies]
        tail = b.details[:infraspecies]
        # Array() is not usable here: it would turn a hash into key/value pairs.
        head = [head] unless head.is_a?(Array)
        tail = [tail] unless tail.is_a?(Array)
        a.details.merge(:infraspecies => head + tail)
      end
    }
    /
    infraspecies {
      def details
        inner = super
        if inner[:annotation_identification]
          {:infraspecies => [{:annotation_identification => inner[:annotation_identification], :ignored => inner[:ignored]}]}
        else
          {:infraspecies => [inner[:infraspecies]]}
        end
      end
    }
  end

  # Infraspecific epithet with optional authorship.
  rule infraspecies
    a:infraspecies_string space b:authorship {
      def value
        a.value + " " + b.value
      end

      def canonical
        a.canonical
      end

      def pos
        a.pos.merge(b.pos)
      end

      def details
        {:infraspecies => a.details[:infraspecies].merge(b.details)}
      end
    }
    /
    infraspecies_string
  end

  # Infraspecific epithet preceded by a rank, by an identification
  # annotation, or by nothing (rank is then reported as "n/a").
  rule infraspecies_string
    sel:rank space a:species_word {
      def value
        sel.apply(a)
      end

      def canonical
        sel.canonical(a)
      end

      def pos
        sel.pos(a)
      end

      def details
        sel.details(a)
      end
    }
    /
    aid:annotation_identification space a:species_word ![\.] {
      def value
        aid.apply(a)
      end

      def canonical
        aid.canonical(a)
      end

      def pos
        # The singleton method runs with the epithet node as self, so its own
        # interval is used for both ends of the span.
        def a.pos
          {interval.begin => ["infraspecies", interval.end]}
        end
        aid.pos(a)
      end

      def details
        def a.details
          {:infraspecies => {:string => value, :rank => "n/a"}}
        end
        aid.details(a)
      end
    }
    /
    a:species_word ![\.] {
      def value
        a.value
      end

      def canonical
        value
      end

      def pos
        {interval.begin => ["infraspecies", interval.end]}
      end

      def details
        {:infraspecies => {:string => value, :rank => "n/a"}}
      end
    }
  end

  # Taxon concept marker; both spellings are normalized to "sec.".
  rule taxon_concept_rank
    ("sec."/"sensu.") {
      def value
        "sec."
      end

      def apply(a)
        " " + value + " " + a.value
      end

      def details(a = nil)
        {:taxon_concept => a.details}
      end
    }
  end

  # Infraspecific rank marker. Some markers include a trailing space, which is
  # excluded from the value and from the reported end offset.
  rule rank
    ("morph."/"f.sp."/"B "/"ssp."/"ssp "/"mut."/"nat "/"nothosubsp."/"convar."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"var "/"subsp."/"subsp "/"subf."/"race "/"forma."/"forma "/"fma."/"fma "/"form."/"form "/"fo."/"fo "/"f."/"α"/"ββ"/"β"/"γ"/"δ"/"ε"/"φ"/"θ"/"μ"/"a."/"b."/"c."/"d."/"e."/"g."/"k."/"****"/"**"/"*")
    {
      def value
        text_value.strip
      end

      def apply(a)
        " " + text_value.strip + " " + a.value
      end

      def canonical(a)
        " " + a.value
      end

      def pos(a)
        interval_end = text_value.end_with?(" ") ? interval.end - 1 : interval.end
        {interval.begin => ["infraspecific_type", interval_end], a.interval.begin => ["infraspecies", a.interval.end]}
      end

      def details(a = nil)
        {:infraspecies => {:string => (a.value rescue nil), :rank => text_value.strip}}
      end
    }
  end

  # Rank marker placed between two uninomials.
  rule rank_uninomial
    ("sect."/"sect "/"subsect."/"subsect "/"trib."/"trib "/"subtrib."/"subtrib "/"ser."/"ser "/"subgen."/"subgen "/"fam."/"fam "/"subfam."/"subfam "/"supertrib."/"supertrib ")
    {
      def value
        text_value.strip
      end

      def pos(uni)
        {interval.begin => ["rank_uninomial", interval.end], uni.interval.begin => ["uninomial", uni.interval.end]}
      end

      def details(uni)
        {:rank_uninomials => value, :uninomial2 => uni.details[:uninomial]}
      end
    }
  end

  # Species epithet with optional authorship.
  rule species
    a:species_string space b:authorship {
      def value
        a.value + " " + b.value
      end

      def canonical
        a.canonical
      end

      def hybrid
        a.hybrid rescue false
      end

      def pos
        a.pos.merge(b.pos)
      end

      def details
        {:species => a.details[:species].merge(b.details)}
      end
    }
    /
    species_string
  end

  rule species_string
    species_word {
      def canonical
        value
      end

      def pos
        {interval.begin => ["species", interval.end]}
      end

      def hybrid
        false
      end

      def details
        {:species => {:string => value}}
      end
    }
    /
    species_word_hybrid
  end

  # Parenthesized infrageneric name; excluded from the canonical form by the
  # rules that use it.
  rule infragenus
    left_paren space a:(cap_latin_word/capped_dotted_char) space right_paren {
      def value
        "(" + a.value + ")"
      end

      def canonical
        a.value
      end

      def pos
        {a.interval.begin => ["infragenus", a.interval.end]}
      end

      def details
        {:infragenus => {:string => a.value}}
      end
    }
  end

  # Genus (full or abbreviated) that is not followed by an author prefix word
  # and an author word.
  rule genus
    a:(abbreviated_genus/uninomial_string) !(space_hard author_prefix_word space_hard author_word) {
      def value
        a.value
      end

      def pos
        {a.interval.begin => ["genus", a.interval.end]}
      end

      def canonical
        a.value
      end

      def details
        {:genus => {:string => a.value}}
      end
    }
  end

  # One to three letters followed by a period.
  # NOTE(review): the class [\\.] also accepts a backslash, while status_word
  # uses [\.] -- confirm whether the backslash is intended.
  rule abbreviated_genus
    [A-Z] [a-z]? [a-z]? [\\.] space {
      def value
        text_value.strip
      end

      def canonical
        value
      end

      def pos
        {interval.begin => ["abbreviated_genus", interval.end]}
      end

      def details
        {:abbreviated_genus => {:string => value}}
      end
    }
  end

  # Single-word name with optional infragenus and/or authorship.
  rule uninomial_name
    a:uninomial_string space b:infragenus space c:simple_authorship {
      def value
        a.value + " " + b.value + " " + c.value
      end

      def canonical
        a.canonical
      end

      def pos
        a.pos.merge(b.pos).merge(c.pos)
      end

      def hybrid
        false
      end

      def details
        {:uninomial => a.details[:uninomial].merge(b.details).merge(c.details)}
      end
    }
    /
    a:uninomial_string space b:infragenus {
      def value
        a.value + " " + b.value
      end

      def canonical
        a.canonical
      end

      def pos
        a.pos.merge(b.pos)
      end

      def hybrid
        false
      end

      def details
        {:uninomial => a.details[:uninomial].merge(b.details)}
      end
    }
    /
    a:uninomial_string space_hard b:authorship {
      def value
        a.value + " " + b.value
      end

      def canonical
        a.canonical
      end

      def pos
        a.pos.merge(b.pos)
      end

      def hybrid
        false
      end

      def details
        {:uninomial => a.details[:uninomial].merge(b.details)}
      end
    }
    /
    uninomial_string
  end

  rule uninomial_string
    (cap_latin_word_pair/cap_latin_word) {
      def canonical
        value
      end

      def pos
        {interval.begin => ["uninomial", interval.end]}
      end

      def hybrid
        false
      end

      def details
        {:uninomial => {:string => value}}
      end
    }
  end

  # Authorship: optional parenthesized basionym authors, combination authors
  # and an optional "ex" author team.
  rule authorship
    a:basionym_authorship_with_parenthesis space b:simple_authorship ","? space c:ex_authorship {
      def value
        a.value + " " + b.value + " " + c.value
      end

      def pos
        a.pos.merge(b.pos).merge(c.pos)
      end

      def details
        val = {:authorship => text_value.strip, :combinationAuthorTeam => b.details[:basionymAuthorTeam], :basionymAuthorTeam => a.details[:basionymAuthorTeam]}
        val[:combinationAuthorTeam].merge!(c.details)
        val
      end
    }
    /
    a:basionym_authorship_with_parenthesis space b:simple_authorship {
      def value
        a.value + " " + b.value
      end

      def pos
        a.pos.merge(b.pos)
      end

      def details
        {:authorship => text_value.strip, :combinationAuthorTeam => b.details[:basionymAuthorTeam], :basionymAuthorTeam => a.details[:basionymAuthorTeam]}
      end
    }
    /
    basionym_authorship_with_parenthesis
    /
    a:simple_authorship ","? space b:ex_authorship {
      def value
        a.value + " " + b.value
      end

      def pos
        a.pos.merge(b.pos)
      end

      def details
        val = a.details
        val[:authorship] = text_value.strip
        val[:basionymAuthorTeam].merge!(b.details)
        val
      end
    }
    /
    simple_authorship
  end

  # Authorship wrapped in parentheses.
  # NOTE(review): the first alternative stores the team text under
  # :author_team while the others use :authorTeam -- confirm which key
  # consumers expect before unifying.
  rule basionym_authorship_with_parenthesis
    left_paren space a:authors_names space right_paren space [,]? space b:year {
      def value
        "(" + a.value + " " + b.value + ")"
      end

      def pos
        a.pos.merge(b.pos)
      end

      def details
        {
          :authorship => text_value,
          :basionymAuthorTeam => {:author_team => text_value}.merge(a.details).merge(b.details)
        }
      end
    }
    /
    left_paren space a:simple_authorship ","? space b:ex_authorship space right_paren {
      def value
        "(" + a.value + " " + b.value + ")"
      end

      def pos
        a.pos.merge(b.pos)
      end

      def details
        val = a.details
        val[:basionymAuthorTeam].merge!(b.details)
        val[:authorship] = text_value.strip
        val
      end
    }
    /
    left_paren space a:simple_authorship space right_paren {
      def value
        "(" + a.value + ")"
      end

      def pos
        a.pos
      end

      def details
        val = a.details
        val[:authorship] = text_value
        val
      end
    }
    /
    left_paren space a:"?" space right_paren {
      def value
        "(?)"
      end

      def pos
        {a.interval.begin => ["unknown_author", a.interval.end]}
      end

      def details
        {:authorship => text_value, :basionymAuthorTeam => {:authorTeam => text_value, :author => ["?"]}}
      end
    }
  end

  # "ex"/"in" followed by an author team.
  rule ex_authorship
    ex_sep space b:simple_authorship {
      def value
        " ex " + b.value
      end

      def pos
        b.pos
      end

      def details
        {:exAuthorTeam => {:authorTeam => b.text_value.strip}.merge(b.details[:basionymAuthorTeam])}
      end
    }
  end

  # Author names with an optional year. The first alternative also consumes a
  # trailing "non <authors> <year>" clause, which is not reported.
  rule simple_authorship
    a:authors_names space [,]? space b:year? [,]? space "non" space authors_names space [,]? space year {
      # The year is optional here; when it did not match, b is a plain node
      # without value, pos or details, so each accessor checks first.
      def value
        defined?(b.value) ? a.value + " " + b.value : a.value
      end

      def pos
        defined?(b.pos) ? a.pos.merge(b.pos) : a.pos
      end

      def details
        details_with_arg(:basionymAuthorTeam)
      end

      def details_with_arg(authorTeamType = "basionymAuthorTeam")
        team = { :authorTeam => a.text_value.strip }.merge(a.details)
        team = team.merge(b.details) if defined?(b.details)
        {
          :authorship => text_value,
          authorTeamType.to_sym => team
        }
      end
    }
    /
    a:authors_names space [,]? space b:year {
      def value
        a.value + " " + b.value
      end

      def pos
        a.pos.merge(b.pos)
      end

      def details
        details_with_arg(:basionymAuthorTeam)
      end

      def details_with_arg(authorTeamType = "basionymAuthorTeam")
        {
          :authorship => text_value,
          authorTeamType.to_sym => {
            :authorTeam => a.text_value.strip
          }.merge(a.details).merge(b.details)
        }
      end
    }
    /
    authors_names {
      def details
        res = details_with_arg(:basionymAuthorTeam)
        res[:basionymAuthorTeam].merge!(super)
        res
      end

      def details_with_arg(authorTeamType = "basionymAuthorTeam")
        {
          :authorship => text_value,
          authorTeamType.to_sym => {
            :authorTeam => text_value,
          }
        }
      end
    }
  end

  # One or more author names joined by separators.
  rule authors_names
    a:author_name space sep:author_separator space b:authors_names {
      def value
        sep.apply(a,b)
      end

      def pos
        sep.pos(a,b)
      end

      def details
        sep.details(a,b)
      end
    }
    /
    author_name
    /
    unknown_auth
  end

  # Placeholder author designations; all are reported as "unknown".
  rule unknown_auth
    ("auct."/"auct"/"hort."/"hort"/"anon."/"anon"/"ht."/"ht") !latin_word {
      def value
        text_value
      end

      def pos
        {interval.begin => ["unknown_author", interval.end]}
      end

      def details
        {:author => ["unknown"]}
      end
    }
  end

  rule ex_sep
    ("ex"/"in") &[\s]
  end

  # Separator between author names; "&", "and" and "et" are rendered as " &".
  # NOTE(review): the previous text listed "&" twice; the second copy could
  # never match and was dropped. If one of them was meant to be another
  # spelling of the ampersand, restore it.
  rule author_separator
    ("&"/","/"and"/"et") {
      def apply(a,b)
        sep = text_value.strip
        sep = " &" if ["&", "and", "et"].include?(sep)
        a.value + sep + " " + b.value
      end

      def pos(a,b)
        a.pos.merge(b.pos)
      end

      def details(a,b)
        {:author => a.details[:author] + b.details[:author]}
      end
    }
  end

  # Author name with an optional postfix word ("f."/"filius").
  rule author_name
    space a:author_name_without_postfix space b:author_postfix_word space !latin_word {
      def value
        a.value + " " + b.value
      end

      def pos
        a.pos.merge(b.pos)
      end

      def details
        {:author => [value]}
      end
    }
    /
    author_name_without_postfix
  end

  rule author_name_without_postfix
    space a:author_prefix_word space b:author_name {
      def value
        a.value + " " + b.value
      end

      def pos
        a.pos.merge(b.pos)
      end

      def details
        {:author => [value]}
      end
    }
    /
    a:author_word space b:author_name {
      def value
        a.value + " " + b.value
      end

      def pos
        a.pos.merge(b.pos)
      end

      def details
        {:author => [value]}
      end
    }
    /
    author_word
  end

  # A single word of an author name, including a few fixed multi-word forms.
  rule author_word
    "A S. Xu" {
      def value
        text_value.strip
      end

      def pos
        # End offsets are absolute input positions, as in every other pos
        # hash of this grammar: "A" spans 0-1, "S." 2-4 and "Xu" 5-7
        # relative to the start of the match.
        start = interval.begin
        {start => ["author_word", start + 1], (start + 2) => ["author_word", start + 4], (start + 5) => ["author_word", start + 7]}
      end

      def details
        {:author => [value]}
      end
    }
    /
    ("arg."/"et al.\{\?\}"/"et al."/"et al") {
      def value
        text_value.strip
      end

      def pos
        #cheating because there are several words in some of them
        {interval.begin => ["author_word", interval.end]}
      end

      def details
        {:author => [value]}
      end
    }
    /
    ("Å"/"Ö"/"Á"/"Ø"/"Ô"/"Š"/"Ś"/"Č"/"Ķ"/"Ł"/"É"/"Ž"/[A-W]/[Y-Z]) [^0-9\[\]\(\)\s&,]* {
      def value
        # Runs of three or more uppercase letters are converted to title case.
        text_value.gsub(/([\p{Lu}]{3,})/) do |match|
          UnicodeUtils.titlecase(match)
        end
      end

      def pos
        {interval.begin => ["author_word", interval.end]}
      end

      def details
        {:author => [value]}
      end
    }
    /
    "X" [^0-9\[\]\(\)\s&,]+ {
      def value
        text_value
      end

      def pos
        {interval.begin => ["author_word", interval.end]}
      end

      def details
        {:author => [value]}
      end
    }
    /
    author_prefix_word
  end

  # Lowercase particle of an author name; must be followed by whitespace.
  rule author_prefix_word
    space ("ab"/"af"/"bis"/"da"/"der"/"des"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
      def value
        text_value
      end

      def pos
        #cheating because there are several words in some of them
        {interval.begin => ["author_word", interval.end]}
      end
    }
  end

  rule author_postfix_word
    ("f."/"filius") {
      def value
        text_value.strip
      end

      def pos
        {interval.begin => ["author_word", interval.end]}
      end
    }
  end

  # Two capitalized words joined by a hyphen; the hyphen is dropped and the
  # second word is lowercased.
  rule cap_latin_word_pair
    a:cap_latin_word "-" b:cap_latin_word {
      def value
        a.value + b.value.downcase
      end
    }
  end

  # Capitalized latin word, optionally followed by "?", plus a fixed list of
  # two-letter names.
  rule cap_latin_word
    a:([A-Z]/cap_digraph) b:latin_word "?" {
      def value
        # A plain [A-Z] node has no value method; only cap_digraph does.
        (a.respond_to?(:value) ? a.value : a.text_value) + b.value
      end
    }
    /
    a:([A-Z]/cap_digraph) b:latin_word {
      def value
        (a.respond_to?(:value) ? a.value : a.text_value) + b.value
      end
    }
    /
    a:("AE"/"OE") b:latin_word {
      def value
        a.text_value[0..0] + "e" + b.value
      end
    }
    /
    ("Ca"/"Ea"/"Ge"/"Ia"/"Io"/"Ix"/"Lo"/"Oa"/"Ra"/"Ty"/"Ua"/"Aa"/"Ja"/"Zu"/"La"/"Qu"/"As"/"Ba") {
      def value
        text_value
      end
    }
  end

  rule capped_dotted_char
    [A-Z] "." {
      def value
        text_value
      end
    }
  end

  # Species epithet preceded by a hybrid marker; the marker is kept out of
  # the canonical form, position and details.
  rule species_word_hybrid
    a:multiplication_sign space b:species_word {
      def value
        a.value + " " + b.value
      end

      def canonical
        b.value
      end

      def hybrid
        true
      end

      def pos
        {b.interval.begin => ["species", b.interval.end]}
      end

      def details
        {:species => {:string => b.value}}
      end
    }
    /
    a:"X" space b:species_word {
      def value
        "× " + b.value
      end

      def canonical
        b.value
      end

      def hybrid
        true
      end

      def pos
        {b.interval.begin => ["species", b.interval.end]}
      end

      def details
        {:species => {:string => b.value}}
      end
    }
    /
    a:"x" space_hard b:species_word {
      def value
        "× " + b.value
      end

      def canonical
        b.value
      end

      def hybrid
        true
      end

      def pos
        {b.interval.begin => ["species", b.interval.end]}
      end

      def details
        {:species => {:string => b.value}}
      end
    }
  end

  # Identification annotation. The first group suppresses the following
  # element (reported under :ignored); "cf" keeps it. Some markers include a
  # trailing space, which is excluded from the reported end offset.
  rule annotation_identification
    ("sp.nr."/"sp. nr."/"nr."/"nr "/"sp.aff."/"sp. aff."/"sp."/"sp "/"species"/"spp."/"spp "/"aff."/"aff "/"monst."/"? ") {
      def value
        text_value.strip
      end

      def apply(sp)
        ""
      end

      def canonical(sp)
        ""
      end

      def pos(sp)
        interval_end = text_value.end_with?(" ") ? interval.end - 1 : interval.end
        {interval.begin => ["annotation_identification", interval_end]}
      end

      def details(sp)
        {:annotation_identification => value, :ignored => sp.details}
      end
    }
    /
    ("cf."/"cf ") {
      def value
        text_value.strip
      end

      def apply(sp)
        " " + value + " " + sp.value
      end

      def canonical(sp)
        " " + sp.canonical
      end

      def pos(sp)
        interval_end = text_value.end_with?(" ") ? interval.end - 1 : interval.end
        {interval.begin => ["annotation_identification", interval_end]}.merge(sp.pos)
      end

      def details(sp)
        {:annotation_identification => value, :species => sp.details}
      end
    }
  end

  # Epithet word. A leading number listed in the table is spelled out as a
  # latin prefix; any other number is kept, followed by a hyphen.
  rule species_word
    a:[0-9]+ "-"? b:latin_word {
      def value
        num = {"1" => "uni", "2" => "du", "3" => "tri", "4" => "quadri", "5" => "quinque", "6" => "hexa", "7" => "septem", "8" => "octo", "9" => "novem", "10" => "decem", "11" => "undecim", "12" => "duodec", "13" => "tredec", "14" => "quattuordec", "15" => "quinquadec", "16" => "hexadec", "17" => "septendec", "18" => "octodec", "19" => "novemdec", "20" => "viginti", "21" => "unviginti", "22" => "duodeviginti", "23" => "triviginti", "24" => "quattuorviginti", "25" => "quinquatviginti", "26" => "hexaviginti", "27" => "septenviginti", "28" => "octoviginti", "29" => "novemviginti", "30" => "triginta", "38" => "trigintaocto", "100" => "centi"}
        digits = a.text_value
        prefix = num[digits] || digits + "-"
        prefix + b.value
      end
    }
    /
    latin_word
  end

  # Lowercase word of at least two letters; may contain hyphens and an
  # apostrophe after a single leading letter.
  rule latin_word
    a:valid_name_letters "-" b:latin_word {
      def value
        a.value + "-" + b.value
      end
    }
    /
    a:valid_name_letter "'" b:latin_word {
      def value
        a.value + "'" + b.value
      end
    }
    /
    a:valid_name_letter b:valid_name_letters {
      def value
        a.value + b.value
      end
    }
  end

  rule valid_name_letters
    [a-zëæœ]+ {
      def value
        # Ligatures are expanded. We normalize ë as well: it is legal in the
        # botanical code, but normalizing it is beneficial for
        # reconciliation purposes.
        text_value.gsub(/[æœë]/, "æ" => "ae", "œ" => "oe", "ë" => "e")
      end
    }
  end

  rule valid_name_letter
    [a-zëæœ] {
      def value
        case text_value
        when "æ" then "ae"
        when "œ" then "oe"
        when "ë" then "e"
        else text_value
        end
      end
    }
  end

  rule cap_digraph
    "Æ" {
      def value
        "Ae"
      end
    }
    /
    "Œ" {
      def value
        "Oe"
      end
    }
  end

  # Year, optionally wrapped in parentheses.
  rule year
    b:left_paren space a:(year_number_with_character/year_number) space c:right_paren {
      def value
        a.value
      end

      def pos
        a.pos
      end

      def details
        a.details
      end
    }
    /
    year_number_with_character
    /
    year_number
  end

  # Year followed by a letter; the letter is dropped from the value but
  # included in the reported position.
  rule year_number_with_character
    a:year_number [a-zA-Z] {
      def value
        a.text_value
      end

      def pos
        {interval.begin => ["year", interval.end]}
      end

      def details
        {:year => value}
      end
    }
  end

  # Four-character year; the last digit may be followed or replaced by "?".
  rule year_number
    [12] [7890] [0-9] ([0-9] [\?]?/"?") {
      def value
        text_value
      end

      def pos
        {interval.begin => ["year", interval.end]}
      end

      def details
        {:year => value}
      end
    }
  end

  rule left_paren
    "("
  end

  rule right_paren
    ")"
  end

  # Any accepted hybrid marker, normalized to the multiplication sign.
  rule hybrid_character
    ("x"/"X") {
      def value
        "×"
      end
    }
    /
    multiplication_sign
  end

  rule multiplication_sign
    ("×"/"*") {
      def value
        "×"
      end
    }
  end

  # Optional whitespace.
  rule space
    [\s]*
  end

  # Mandatory whitespace.
  rule space_hard
    [\s]+
  end

end