#!/usr/bin/env ruby

# Crawls the CGPM resolution listings on bipm.org (English and French),
# fetches every meeting page and each resolution linked from it, and writes
# one YAML file per meeting to:
#
#   data/cgpm/meetings-en/meeting-NN.yml
#   data/cgpm/meetings-fr/meeting-NN.yml
#
# Every HTTP request is wrapped in a VCR cassette.

require_relative '../lib/bipm-data-importer'

BASE_DIR = "data"

a = Mechanize.new

# Listing pages, one per language.
meetings_en = VCR.use_cassette 'cgpm-meetings' do
  a.get "https://www.bipm.org/en/committees/cg/cgpm/cgpm-resolutions"
end

meetings_fr = VCR.use_cassette 'cgpm-meetings-fr' do
  a.get "https://www.bipm.org/fr/committees/cg/cgpm/cgpm-resolutions"
end

# Start from a clean slate. The un-suffixed "meetings" directory is removed
# as well, although this script only ever writes the -en/-fr directories.
FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings"
FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings-fr"
FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings-en"

[['en', meetings_en], ['fr', meetings_fr]].each do |meeting_lang, meetings|
  # Cassette names carry a suffix for French only; output directories are
  # suffixed for both languages. Neither depends on the individual meeting.
  meeting_lang_sfx = (meeting_lang == 'fr') ? "-fr" : ""
  meeting_lang_sfx_dir = (meeting_lang == 'fr') ? "-fr" : "-en"
  out_dir = "#{BASE_DIR}/cgpm/meetings#{meeting_lang_sfx_dir}"

  urls = meetings.css('div.publications__content').map do |option|
    url = option.at_css('a').attr('href')
    url = url.gsub('/web/guest/', "/#{meeting_lang}/")
    # Keep the first 8 "/"-separated segments. For an absolute
    # https://www.bipm.org/<lang>/committees/cg/cgpm/<meeting>/... link this
    # cuts the URL back to the meeting page itself.
    url.split('/').first(8).join('/')
  end.uniq

  urls.each do |url|
    # Leading integer of the last path segment, e.g. "8-1933" => 8.
    meeting_id = url.split('/').last.to_i

    meeting = VCR.use_cassette("cgpm-meeting-#{meeting_id}#{meeting_lang_sfx}") { a.get url }

    title = meeting.at_css('h1.session__title, .journal-content-article h1').text.strip
    date = Bipm::Data::Importer::Common.extract_date(
      meeting.at_css('p.session__date, .journal-content-article h2').text
    )
    pdf = Bipm::Data::Importer::Common.extract_pdf(meeting, meeting_lang)

    h = {
      "metadata" => {
        "title" => title,
        "identifier" => meeting_id,
        "date" => date.to_s,
        "source" => "BIPM - Pavillon de Breteuil",
        "url" => meeting.uri.to_s
      }
    }

    h["pdf"] = pdf if pdf

    # NOTE(review): the href pattern was lost in the reviewed copy of this
    # file (it read `%r)`, which does not parse). It is reconstructed here
    # from the ".../resolution-<n>" URLs used below - confirm against the
    # upstream script.
    resolutions = meeting.links_with(href: %r{/resolution-}).map(&:href)

    # A mistake on the website: resolution 5 is listed 4 times...
    # https://www.bipm.org/fr/committees/cg/cgpm/8-1933
    # NOTE(review): this replacement list points at the /en/ pages even though
    # it only applies to the French crawl - confirm this is intended.
    if meeting_id == 8 && meeting_lang == 'fr' && resolutions.sort.uniq != resolutions.sort
      resolutions = (1..15).map do |i|
        "https://www.bipm.org/en/committees/cg/cgpm/8-1933/resolution-#{i}"
      end
    end

    h["resolutions"] = resolutions.map do |res_link|
      # Third "-"-separated piece of the link, e.g.
      # ".../8-1933/resolution-5" => 5; 0 when the link has fewer pieces.
      res_id = (res_link.split('-')[2] || 0).to_i
      res_link = res_link.gsub('/web/guest/', "/#{meeting_lang}/")

      res = VCR.use_cassette("cgpm-resolution-#{meeting_id}-#{res_id}#{meeting_lang_sfx}") { a.get res_link }

      Bipm::Data::Importer::Common.parse_resolution(res, res_id, date, :cgpm, meeting_lang, "resolution?")
    end

    FileUtils.mkdir_p(out_dir)
    File.write("#{out_dir}/meeting-#{format('%02d', meeting_id)}.yml", YAML.dump(h))
  end
end