#!/usr/bin/env ruby
# frozen_string_literal: true

# Crawls the BNFC index page on NHS Evidence, fetches every page linked from
# its main content area, and writes the collected graph as JSON to the file
# named by the first command-line argument.
#
# Usage: <this script> OUTPUT_FILE
#
# Output shape (a single JSON object):
#   bnf_date - text of the "#logo span" node on the index page
#   crawled  - local time at which the crawl of the linked pages began
#   entries  - object keyed by page URI; each value may contain
#                title    - stripped text of "article header h1"
#                headings - h2 text => HTML of that h2's parent element
#                children - URIs of crawled pages whose last breadcrumb
#                           link points at this entry

require 'json'
require 'net/http'
require 'nokogiri'

# Downloads +uri+ and parses the response body as an HTML document.
# The HTTP status is not inspected, so an error page is parsed like any other.
def fetch_document(uri)
  Nokogiri::HTML.parse(Net::HTTP.get_response(uri).body)
end

output_path = ARGV[0]
# Fail before crawling rather than after it: without a path the final write
# would raise and the whole crawl would be wasted.
abort "usage: #{File.basename($PROGRAM_NAME)} OUTPUT_FILE" unless output_path

root_uri = URI.parse('https://www.evidence.nhs.uk/formulary/bnfc/current')
root_doc = fetch_document(root_uri)

# Read href values as plain Strings and drop anchors that have none, so that
# URI.join is never handed nil or a Nokogiri attribute object.
hrefs = root_doc.css('div#main a').map { |link| link['href'] }.compact
target_uris = hrefs.map { |href| URI.join(root_uri, href) }

bnf_graph = {
  bnf_date: root_doc.css('#logo span').text,
  crawled: Time.now.strftime('%Y-%m-%dT%H:%M:%S%z'),
  entries: {}
}
entries = bnf_graph[:entries]

total = target_uris.length
target_uris.each.with_index(1) do |target_uri, position|
  puts "#{position}/#{total} — #{target_uri}"

  # The entry may already exist if an earlier page named this one as its parent.
  entry = (entries[target_uri] ||= {})
  page = fetch_document(target_uri)

  title = page.css('article header h1 > text()')
  puts title
  entry[:title] = title.text.strip

  page.css('h2').each do |h2|
    # :headings is only created for pages that actually contain an h2.
    entry[:headings] ||= {}
    entry[:headings][h2.text.strip] = h2.parent.to_s
    puts "\t#{h2.text}"
  end

  # The last breadcrumb link is treated as this page's parent. Pages without
  # a usable breadcrumb link are kept in the graph but attached to no parent.
  parent_link = page.css('div.breadcrumb a').last
  parent_href = parent_link && parent_link['href']
  next unless parent_href

  parent_uri = URI.join(target_uri, parent_href)
  parent_entry = (entries[parent_uri] ||= {})
  (parent_entry[:children] ||= []) << target_uri
end

# Block form closes the file even if JSON serialization raises.
File.open(output_path, 'w') { |file| file.puts bnf_graph.to_json }