require 'mechanize' require 'reverse_adoc' require 'vcr' require 'date' require 'fileutils' require_relative 'asciimath' VCR.configure do |c| c.cassette_library_dir = 'cassettes' c.hook_into :webmock end module Bipm module Data module Importer CONSIDERATIONS = { /(?:having(?: regard)?|ayant|acceptant|concerne|referring|se référant|vu la|agissant conformément)/i => "having / having regard", /(?:noting|took note|note[sd]?|observing|observant que|taking note|takes note|constatant|constate|that|notant|notant que|note également|(?:prend|prenant) (?:acte|note))/i => "noting", /(?:recognizing|recognizes|reconnaissant|reconnaît|acting in accordance|conformément à)/i => "recognizing", /(?:acknowledging|accept(?:s|ed|ing|e)|admet|entendu|empowered by|habilité par)/i => "acknowledging", /(?:(?:further )?recall(?:ing|s)|rappelant|rappelle)/i => "recalling / further recalling", /(?:re-?affirm(?:ing|s)|réaffirme)/i => "reaffirming", /(?:consid(?:ering|érant|ère|ers|ered)|après examen|estime|is of the opinion|examinera)/i => "considering", /(?:taking into account|(prend|prenant) en considération|taking into consideration|tenant compte)/i => "taking into account", "pursuant to" => "pursuant to", /(?:bearing in mind)/i => "bearing in mind", /(?:emphasizing|soulignant)/i => "emphasizing", "concerned" => "concerned" } ACTIONS = { /(?:adopts|adopted?|convient d'adopter)/ => "adopts", /(?:thanks|thanked|expresse[sd](?:[ -]| its )appreciation|appréciant|pays tribute|rend hommage|remercie)/i => "thanks / expresses-appreciation", /(?:approu?ve[ds]?|approuv[ae]nt|approving|entérine|agreed?|supported|soutient|exprime son accord|n'est pas d'accord|convient)/i => "approves", /(?:d[eé]cid(?:e[ds]?|é)|ratifies?|judges|d[ée]clares?|d[ée]finition|sanction(?:s|ne))/i => "decides", /(?:The unit of length is|Supplementary units|Principl?es|Les Délégués des États|Les v\u{9C}ux ou propositions)/i => "decides", # MISC - like declares/defines /(?:L'unité de longueur|Unités supplémentaires|New candle|New lumen|Definitions of|Cubic decimetre|Clarification of|Revision of)/i => "decides", # MISC - like declares/defines /(?:Unit of force|Définitions des|Décimètre cube|Étalons secondaires|Unité spéciale|Efficacités lumineuses)/i => "decides", # MISC - like declares/defines /(?:Unité de force|(?:Joule|Watt|Volt|Ohm|Amp[eè]re|Coulomb|Farad|Henry|Weber) \(unité?|Bougie nouvelle|Lumen nouveau)/i => "decides", # MISC - like declares/defines /(?:Les unités photométriques|\(A\) D[eé]finitions|The photometric units|will (?:provide|circulate|issue|identify|notify|contact|review))/i => "decides", # MISC - like declares/defines /(?:Appendix 1 of the|L'Annexe 1 de la|increased|a (?:examiné|préparé)|transmettra|fournira|increased|developed a document|prendra contact)/i => "decides", # MISC - like declares/defines /(?:asks|asked|souhaite|souhaiterait)/i => "asks", /(?:further )?invit(?:[ée][ds]?|era)|renouvelle en conséquence|convient d'inviter/i => "invites / further invites", /(?:resolve[sd]?)/i => "resolves", /(?:confirms|confirmed?|confirme que)/i => "confirms", /(?:welcome[sd]?|accueille favorablement|salue)/i => "welcomes", /(?:recomm(?:ends|ande|ended)|endorsed)/i => "recommends", /(?:requests?|requested|demande(?:ra)?|requiert)/i => "requests", /(?:congratulate[sd]?|félicite)/i => "congratulates", /(?:instructs|instructed)/i => "instructs", /(?:urges|prie instamment)/i => "urges", /(?:appoints|(?:re)?appointed|granted|reconduit|commended|élit|nomme|elected|autorise|authorized|empowers|charged?)/i => "appoints", /(?:donne|habilite|nominated|Pendant la période|voted|established a \w+ task group|gave the \w+ \w+ the authority)/i => "appoints", /(?:convient d'éablir|transfère|confie|établit|Étant donné que trois sièges|As there will be three vacancies)/i => "appoints", /(?:La Recommandation 1 du Groupe|Recommendation 1 of the ad hoc)/i => "appoints", /(?:resolve[sd]? further)/i => "resolves further", /(?:calls upon|draws the attention|attire l'attention|lance un appel)/i => "calls upon", /(?:encourages?d?|espère|propose[ds]?)/i => "encourages", /(?:affirms|reaffirming|réaffirmant|states|remarks|remarques)/i => "affirms / reaffirming", } PREFIX=/(?:(?:The|Le) CIPM |La Conférence |unanimously |would |a |sont |will |were |did not |strongly |The Conference |and |et |has |renouvelle sa |renews its |further |and further |abrogates the |abroge la |En ce qui |après avoir |\.\.\.\n+)?/i SUFFIX=/ (?:that|que)\b|(?: (?:the |that |le |que les )?((?:[A-Z]|national|laboratoires).{0,80}?)(?: to)?\b|)/ module Common def replace_links ps, res ps.css('a[href]').each do |a| href = a.attr('href') # Account for some mistakes from an upstream document href = href.gsub(%r"\A/jen/", '/en/') href = href.gsub(%r"\A/en/CGPM/jsp/", '/en/CGPM/db/') href = case href when %r'\A/(\w{2})/CGPM/db/(\d+)/(\d+)/(#.*)?\z', %r'\A/jsp/(\w{2})/ViewCGPMResolution\.jsp\?CGPM=(\d+)&RES=(\d+)(#.*)?\z' "cgpm-resolution:#{$1}/#{$2}/#{$3}#{$4}" when %r'\A/(\w{2})/CIPM/db/(\d+)/(\d+)/(#.*)?\z' "cipm-resolution:#{$1}/#{$2}/#{$3}#{$4}" when %r'\A/(\w{2})/committees/cipm/meeting/([0-9()I]+).html(#.*)?\z' "cipm-decisions:#{$1}/#{$2}#{$3}" else URI(res.uri).merge(href).to_s # Relative -> absolute end a.set_attribute('href', href) end end def replace_centers ps centers = ps.css('center').to_a while centers.length > 0 center = centers.first current = center mycenters = [center] loop do break unless current.next while Nokogiri::XML::Text === current.next current = current.next break if current.text.strip != '' end break unless current.next break unless current.next.name == "center" current = current.next mycenters << current end centers -= mycenters if mycenters.length > 1 newtable = Nokogiri::HTML::Builder.new do |doc| doc.table { mycenters.each do |i| doc.tr { doc.td { doc << i.inner_html } } end } end.to_html mycenters.first.replace newtable mycenters[1..-1].each &:remove end end # Remove the remaining centers ps.css('center').each do |i| i.replace i.inner_html end end def format_message part AsciiMath.asciidoc_extract_math( ReverseAdoc.convert(part).strip.gsub(" ", ' ') ) end def ng_to_string ps ps.inner_html.encode('utf-8').gsub("\r", '').gsub(%r'','') end def parse_resolution res, res_id, date, type = :cgpm # Reparse the document after fixing upstream syntax fixed_body = res.body.gsub(" [date], "title" => ng.at_css(".txt12pt .SousTitre").text.strip.gsub(/\*\Z/, ''), "identifier" => res_id, "url" => res.uri.to_s, "reference" => nil, "approvals" => [{ "type" => "affirmative", "degree" => "unanimous", "message" => "Unanimous" }], "considerations" => [], "actions" => [], } if refs.length > 0 r["reference"] = res.uri.merge(refs.first.attr('href')).to_s else r.delete("reference") end ps = case type when :cgpm ng.css('td.txt12pt:not([align])') when :cipm ng.css('td.txt12pt td.txt12pt') end #binding.pry if ps.count != 1 # Replace links Common.replace_links(ps, res) # Replace a group of centers (> 1) with a table Common.replace_centers(ps) doc = Common.ng_to_string(ps) # doc = AsciiMath.html_to_asciimath(doc) parts = doc.split(/(\n(?:

)?.*?<\/b>|

(?:après examen |après avoir entendu )|having noted that |decides to define |décide de définir |conformément à l'invitation|acting in accordance with|recommande que les résultats|(?:considers|recommends) that|estime que|declares<\/p>| v, "date_effective" => date, "message" => Common.format_message(part), } end end && next ACTIONS.any? do |k,v| if parse =~ /\A#{PREFIX}#{k}\b/i r["actions"] << prev = { "type" => v, "date_effective" => date, "message" => Common.format_message(part), } end end && next if parse =~ /\A(?:Appendix |Annexe |\()(\d+)/ r["appendices"] ||= [] r["appendices"] << prev = { "identifier" => $1.to_i, "message" => Common.format_message(part), } next end if parse =~ /\A(becquerel|gray, symbol)/ prev["message"] += "\n" + Common.format_message(part) next end next if parse =~ /\A(|\[Cliquer ici\]|Click here)\z/ r["x-unparsed"] ||= [] r["x-unparsed"] << parse #ReverseAdoc.convert(part).strip end %w[considerations actions].each do |type| map = type == 'actions' ? ACTIONS : CONSIDERATIONS r[type] = r[type].map do |i| islist = false kk = nil if map.any? { |k,v| (i["message"].split("\n").first =~ /\A\s*(\*?)(#{PREFIX}#{k})\1?(#{SUFFIX})\1?\s*\z/i) && (kk = k) } prefix = $2 suffix = $3 subject = $4 listmarker = nil listitems = [] if (i["message"].split(/(?= 1 end end if subject #p subject r['subject'] ||= [] r['subject'] << subject end if islist suffix = suffix.strip suffix = nil if suffix == '' listitems.map do |li| i.merge 'message' => [prefix, suffix, li].compact.join(" ") end else i end end.flatten end if r['subject'] r['subject'] = r['subject'].uniq.join(" and ") end r end extend self end end end end