#!/usr/bin/env ruby
# Merge Wikipedia titles and abstracts into the WK links of a MeSH ASCII file.
#
# Reads the gzipped MeSH descriptor file at data/mesh_data_2014/d2014.wikipedia.bin.gz,
# collects the WK (Wikipedia link) lines per record, looks each link up in a
# Wikipedia abstracts XML dump given as ARGV[0], and re-emits the records with
# the WK lines serialised as JSON (now including title and abstract).
require_relative '../lib/MESH'
require 'json'
require 'nokogiri'
require 'zlib'

# SAX handler for the Wikipedia abstracts dump: for each <doc>, if its <url>
# matches a collected WK link, copy the title and abstract onto that link.
class Wikidoc < Nokogiri::XML::SAX::Document

  def initialize(links_by_uri)
    super()
    @links_by_uri = links_by_uri
  end

  def start_element(name, attrs = [])
    @elements ||= []
    @elements.push(name)
    if name == 'doc'
      @current_title = ''
      @current_abstract = ''
      @current_url = ''
    end
  end

  def characters(string)
    # SAX parsers may deliver text in several chunks, so append rather than assign.
    if @elements.last == 'title'
      @current_title << string
    elsif @elements.last == 'abstract'
      @current_abstract << string
    elsif @elements.last == 'url'
      @current_url << string
    end
  end

  def end_element(name)
    if name == 'doc' && @links_by_uri[@current_url]
      STDERR.print '.'
      @links_by_uri[@current_url].each do |l|
        l[:title] = @current_title.gsub(/^Wikipedia: /, '')
        l[:abstract] = @current_abstract
      end
    elsif name == 'doc'
      STDERR.print '-'
    end
    @elements.pop
  end

end

abort "Usage: #{$PROGRAM_NAME} <wikipedia-abstracts.xml>" if ARGV[0].nil?

filename = File.expand_path('../../data/mesh_data_2014/d2014.wikipedia.bin.gz', __FILE__)
gzipped_file = File.open(filename)
file = Zlib::GzipReader.new(gzipped_file)

unique_id = nil
mh = nil
wikipedia_links = []
by_uri = {}   # Wikipedia URL => array of WK link hashes that point at it
headings = [] # one hash per MeSH record, in file order

# First pass: read the gzipped MeSH ASCII file and collect UI, MH and WK lines.
file.each_line do |line|
  case
    when line.match(/^\*NEWRECORD$/)
      unless unique_id.nil?
        headings << { ui: unique_id, mh: mh, wikipedia_links: wikipedia_links }
        wikipedia_links.each do |wl|
          by_uri[wl[:link]] ||= []
          by_uri[wl[:link]] << wl
        end
        wikipedia_links = []
        unique_id = nil
        mh = nil
      end
    when matches = line.match(/^UI = (.*)/)
      unique_id = matches[1]
    when matches = line.match(/^MH = (.*)/)
      mh = matches[1]
    when matches = line.match(/^WK = (.*)/)
      # WK lines hold "score;link" with an optional trailing ";image".
      score, link, image = matches[1].split(';')
      hash = { score: score, link: link.strip }
      hash[:image] = image.strip unless image.nil?
      wikipedia_links << hash
  end
end

# Flush the final record, in case the file does not end with a *NEWRECORD line
# (a no-op if it does).
unless unique_id.nil?
  headings << { ui: unique_id, mh: mh, wikipedia_links: wikipedia_links }
  wikipedia_links.each do |wl|
    by_uri[wl[:link]] ||= []
    by_uri[wl[:link]] << wl
  end
end

# Second pass: stream the Wikipedia abstracts dump and fill in titles/abstracts.
parser = Nokogiri::XML::SAX::Parser.new(Wikidoc.new(by_uri))
parser.parse(File.open(ARGV[0]))

# Re-emit the records, with each WK link serialised as JSON.
headings.each do |h|
  puts '*NEWRECORD'
  puts "UI = #{h[:ui]}"
  puts "MH = #{h[:mh]}"
  h[:wikipedia_links].each do |wl|
    puts "WK = #{wl.to_json}"
  end
  puts ''
end
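
# A minimal sketch of the round trip, assuming the gzipped input follows the
# "score;link[;image]" WK layout the regex above expects (all values below are
# hypothetical). An input record such as:
#
#   *NEWRECORD
#   UI = D000000
#   MH = Example Heading
#   WK = 0.95;http://en.wikipedia.org/wiki/Example;Example.png
#
# would, if the abstracts dump given as ARGV[0] contains a <doc> whose <url>
# matches that link, be re-emitted roughly as:
#
#   *NEWRECORD
#   UI = D000000
#   MH = Example Heading
#   WK = {"score":"0.95","link":"http://en.wikipedia.org/wiki/Example","image":"Example.png","title":"Example","abstract":"..."}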