Sha256: a522dfccc5297b3675919a570e138a484b1953b5f62d6345479af626f73776a8

Contents?: true

Size: 1.34 KB

Versions: 1

Compression:

Stored size: 1.34 KB

Contents

#!/usr/bin/env ruby

require 'json'
require 'net/http'
require 'nokogiri'

# Fetch the BNFC index page, read the text of the span inside #logo (stored
# later as the crawl's :bnf_date), and collect the absolute URI of every link
# inside div#main; those URIs are the pages crawled further down.
root_uri = URI.parse('https://www.evidence.nhs.uk/formulary/bnfc/current')
root_response = Net::HTTP.get_response(root_uri)
# Net::HTTP.get_response neither follows redirects nor raises on error
# statuses, so stop here instead of parsing an error page as if it were the
# index and silently producing an empty crawl.
unless root_response.is_a?(Net::HTTPSuccess)
  abort "Failed to fetch #{root_uri}: #{root_response.code} #{root_response.message}"
end

root_ng = Nokogiri::HTML.parse(root_response.body)
date_node = root_ng.css('#logo span')
date = date_node.text
# Select only anchors that actually carry an href: an anchor without one
# (e.g. a named anchor) would hand nil to URI.join, which raises.
link_nodes = root_ng.css('div#main a[href]')
target_uris = link_nodes.map do |ln|
  # Resolve relative hrefs against the index page's own URI.
  URI.join(root_uri, ln['href'])
end

# Crawl every target page and build the graph that is later written as JSON:
#   :bnf_date - text taken from the index page's logo span
#   :crawled  - timestamp taken when the crawl starts
#   :entries  - one hash per URI with :title, :headings (h2 text => markup of
#               the h2's parent element) and :children (URIs whose last
#               breadcrumb link points at this entry)
bnf_graph = { bnf_date: date, crawled: Time.now.strftime('%Y-%m-%dT%H:%M:%S%z'), entries: {} }
total = target_uris.length
target_uris.each_with_index do |tu, i|
  puts "#{i}/#{total} — #{tu}"
  # The entry may already exist if an earlier page named this URI as its parent.
  bnf_graph[:entries][tu] ||= {}
  g = bnf_graph[:entries][tu]
  response = Net::HTTP.get_response(tu)
  ng = Nokogiri::HTML.parse(response.body)
  # Only the direct text nodes of the h1, not text nested in child elements.
  title = ng.css('article header h1 > text()')
  puts title
  g[:title] = title.text.strip
  h2s = ng.css('h2')
  h2s.each do |h2|
    # :headings is created lazily, so pages without any h2 have no such key.
    g[:headings] ||= {}
    section = h2.parent
    g[:headings][h2.text.strip] = section.to_s
    puts "\t#{h2.text}"
  end

  # The last breadcrumb link is treated as this page's parent. A page with no
  # breadcrumb links (or a breadcrumb anchor lacking an href) has no parent to
  # record; skip it rather than crash and lose the whole crawl.
  breadcrumbs = ng.css('div.breadcrumb a')
  last_crumb = breadcrumbs.last
  parent_href = last_crumb && last_crumb['href']
  next unless parent_href

  parent_uri = URI.join(tu, parent_href)
  parent_entry = (bnf_graph[:entries][parent_uri] ||= {})
  (parent_entry[:children] ||= []) << tu
end

# Write the crawl result as a single line of JSON to the path given as the
# first command-line argument. The block form of File.open flushes and closes
# the handle when the block exits, including when serialisation or the write
# raises, so no explicit flush/close is needed.
File.open(ARGV[0], 'w') do |out_file|
  out_file.puts bnf_graph.to_json
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
BNF-0.0.1 bin/crawl_bnf