# coding: utf-8

require "fileutils"

module ONIX
  # A utility class that processes the code list XSD from the ONIX spec and
  # creates a set of TSV files. The generated files are used by this library
  # to make hashes of the code lists available to users.
  #
  # NOTE(review): this file does not require Nokogiri itself; it is presumably
  # loaded by the surrounding library before this class is used -- confirm.
  #
  class CodeListExtractor

    # Creates a new extractor.
    #
    # filename - path to a copy of the code lists file from the spec
    #            (e.g. ONIX_BookProduct_CodeLists.xsd).
    #
    # Raises ArgumentError if filename is not an existing regular file.
    #
    def initialize(filename)
      raise ArgumentError, "#{filename} not found" unless File.file?(filename)

      @filename = filename
    end

    # Generates one TSV file per code list in the given directory. Creates
    # the directory (and any missing parents) if it doesn't exist and will
    # overwrite existing files.
    #
    # Each file is named after the list number, zero-padded to three digits
    # (list 7 => "007.tsv").
    #
    def run(dir)
      # mkdir_p is a no-op for an existing directory, so no guard is needed.
      FileUtils.mkdir_p(dir)

      each_list do |number, data|
        File.write(File.join(dir, format("%03d.tsv", number)), data)
      end
    end

    private

    # Raw contents of the XSD file, read once and cached.
    def data
      @data ||= File.read(@filename)
    end

    # The parsed XSD, cached. Namespaces are stripped (once, at parse time)
    # so the xpath queries in this class can use bare element names.
    def document
      @document ||= Nokogiri::XML(data).tap do |doc|
        doc.remove_namespaces! unless doc.namespaces.empty?
      end
    end

    # Yields (list_number, tsv_string) for every simpleType whose name
    # contains "List<digits>" with a number greater than zero. simpleType
    # elements with no name, or with a non-list name, are skipped.
    def each_list
      document.xpath("//simpleType").each do |node|
        number = node["name"].to_s[/List(\d+)/, 1].to_i
        next unless number > 0

        # Format the node we already hold instead of searching the whole
        # document again by name.
        yield number, rows_for(node.xpath("./restriction/enumeration"))
      end
    end

    # Returns the TSV data for the list with the given number, looked up by
    # name ("List<num>"). Returns an empty string if there is no such list.
    def list_data(num)
      rows_for(document.xpath("//simpleType[@name='List#{num}']/restriction/enumeration"))
    end

    # Builds the TSV text for a collection of enumeration nodes: one line per
    # node, holding the code value followed by the text of its first and last
    # documentation elements. With a single documentation element both
    # columns carry the same text; missing pieces become empty strings.
    def rows_for(enumerations)
      enumerations.map { |enum|
        docs = enum.xpath("./annotation/documentation")
        "#{enum["value"]}\t#{docs.first&.text.to_s}\t#{docs.last&.text.to_s}\n"
      }.join
    end
  end
end