require "stepmod/utils/stepmod_definition_converter"
require "stepmod/utils/bibdata"
require "stepmod/utils/concept"
require "glossarist"
require "securerandom"
require "pathname"

ReverseAdoc.config.unknown_tags = :bypass

module Stepmod
  module Utils
    # Walks a STEPmod directory, reads every non-withdrawn document listed in
    # its `repository_index.xml`, and collects the terms (concepts) found in
    # those documents and in the schema description files they point to.
    #
    # Entry point: `TermsExtractor.call(stepmod_dir, stdout)`.
    class TermsExtractor
      # TODO: we may want a command line option to override this in the future
      ACCEPTED_STAGES = %w(IS DIS FDIS TS).freeze

      # Value of the `status` attribute that marks a repository index entry as
      # one to be ignored.
      WITHDRAWN_STATUS = "withdrawn".freeze

      # Repository index element name => [directory, document file name]
      # (both relative to the STEPmod directory / the entry's own directory).
      DOCUMENT_SOURCES = {
        "module" => ["modules", "module.xml"],
        "resource_doc" => ["resource_docs", "resource.xml"],
        "business_object_model" => ["business_object_models",
                                    "business_object_model.xml"],
        "application_protocol" => ["application_protocols",
                                   "application_protocol.xml"],
      }.freeze
      private_constant :DOCUMENT_SOURCES

      attr_reader :stepmod_path,
                  :stepmod_dir,
                  :general_concepts,
                  :resource_concepts,
                  :parsed_bibliography,
                  :encountered_terms,
                  :cvs_mode,
                  :part_concepts,
                  :part_resources,
                  :part_modules,
                  :stdout

      # Convenience wrapper: builds an extractor and runs it.
      #
      # @param stepmod_dir [String] path to the STEPmod directory
      # @param stdout [#puts] destination for log lines
      # @return [Array] see {#call}
      def self.call(stepmod_dir, stdout = $stdout)
        new(stepmod_dir, stdout).call
      end

      # @param stepmod_dir [String] path to the STEPmod directory; it is
      #   resolved with `realpath`, so it must exist
      # @param stdout [#puts] destination for log lines
      def initialize(stepmod_dir, stdout)
        @stdout = stdout
        @stepmod_dir = stepmod_dir
        @stepmod_path = Pathname.new(stepmod_dir).realpath
        @general_concepts = Glossarist::Collection.new
        @resource_concepts = Glossarist::Collection.new
        @parsed_bibliography = []
        @part_concepts = []
        @part_resources = []
        @part_modules = []
        @encountered_terms = {}
      end

      # Writes a message to the configured output, prefixed with the tool name.
      def log(message)
        stdout.puts "[stepmod-utils] #{message}"
      end

      # Tells whether the part number of `bibdata` belongs to the set of parts
      # whose concepts are collected per part instead of in the shared
      # collections.
      #
      # @param bibdata [#part] bibliographic data of the current document
      # @return [Boolean]
      def term_special_category(bibdata)
        case bibdata.part.to_i
        # `when` compares with `===`; a Range must be listed bare (not wrapped
        # in an Array) for an Integer to match it.
        when 41..47, 51, 56..112
          true
        else
          false
        end
      end

      # Runs the extraction over the whole STEPmod directory.
      #
      # @return [Array] `[general_concepts, resource_concepts,
      #   parsed_bibliography, part_concepts, part_resources, part_modules]`
      def call
        # If we are using the stepmod CVS repository, provide the revision number per file
        @cvs_mode = if Dir.exist?(stepmod_path.join("CVS"))
                      require "ptools"
                      # ptools provides File.which
                      File.which("cvs")
                    end

        log "INFO: STEPmod directory set to #{stepmod_dir}."

        if cvs_mode
          log "INFO: STEPmod directory is a " \
              "CVS repository and will detect revisions."
          log "INFO: [CVS] Detecting file revisions can be slow, " \
              "please be patient!"
        else
          log "INFO: STEPmod directory is not a CVS repository, " \
              "skipping revision detection."
        end

        log "INFO: Detecting paths..."

        repo_index = Nokogiri::XML(
          File.read(stepmod_path.join("repository_index.xml")),
        ).root

        files = []
        # Add the paths of modules, resource_docs, business_object_models and
        # application_protocols that are not withdrawn and exist on disk.
        DOCUMENT_SOURCES.each do |element, (directory, filename)|
          repo_index.xpath("//#{element}").each do |x|
            next if x["status"] == WITHDRAWN_STATUS

            path = Pathname.new(
              "#{stepmod_dir}/#{directory}/#{x['name']}/#{filename}",
            )
            files << path if File.exist? path
          end
        end

        files.sort!.uniq!
        process_term_files(files)

        [
          general_concepts,
          resource_concepts,
          parsed_bibliography,
          part_concepts,
          part_resources,
          part_modules,
        ]
      end

      private

      # Parses every document in `files` and fills the collections exposed by
      # the readers of this class.
      #
      # @param files [Array<Pathname>] document XML files to process
      def process_term_files(files)
        # schema name => path of the document that first referenced it
        parsed_schema_names = {}

        files.each do |file_path|
          file_path = file_path.realpath
          fpath = file_path.relative_path_from(stepmod_path)

          log "INFO: Processing XML file #{fpath}"
          current_document = Nokogiri::XML(File.read(file_path)).root

          bibdata = nil
          begin
            bibdata = Stepmod::Utils::Bibdata.new(document: current_document)
          rescue StandardError
            log "WARNING: Unknown file #{fpath}, skipped"
            next
          end

          unless ACCEPTED_STAGES.include? bibdata.doctype
            log "INFO: skipped #{bibdata.docid} as it is not " \
                "one of (#{ACCEPTED_STAGES.join(', ')})."
            next
          end

          if bibdata.part.to_s.empty?
            log "FATAL: missing `part` attribute: #{fpath}"
            log "INFO: skipped #{bibdata.docid} as it is missing `part` attribute."
            next
          end

          revision_string = detect_revision_string(fpath)

          # read definitions
          current_part_concepts = Glossarist::Collection.new
          current_document.xpath("//definition")
            .each.with_index(1) do |definition, definition_index|
            term_id = definition["id"]
            unless term_id.nil?
              if encountered_terms[term_id]
                log "FATAL: Duplicated term with id: #{term_id}, #{fpath}"
              end
              encountered_terms[term_id] = true
            end

            # Assume that definition is located in clause 3 of the ISO document
            # in order. We really don't have a good reference here.
            ref_clause = "3.#{definition_index}"

            concept = Stepmod::Utils::Concept.parse(
              definition,
              reference_anchor: bibdata.anchor,
              reference_clause: ref_clause,
              # NOTE(review): `fpath` is a Pathname, so `+` joins path
              # components (a "/" is inserted before the revision text) rather
              # than concatenating strings — confirm this is intended.
              file_path: fpath + revision_string,
            )
            next unless concept

            if term_special_category(bibdata)
              # log "INFO: this part is special"
              find_or_initialize_concept(current_part_concepts, concept)
            else
              # log "INFO: this part is generic"
              find_or_initialize_concept(general_concepts, concept)
            end

            parsed_bibliography << bibdata
          end

          current_part_resources = Glossarist::Collection.new
          current_part_modules_arm = {}
          current_part_modules_mim = {}

          log "INFO: FILE PATH IS #{file_path}"
          case file_path.to_s
          when /resource.xml$/
            log "INFO: Processing resource.xml for #{file_path}"
            process_resource_schemas(
              current_document,
              file_path,
              bibdata,
              parsed_schema_names,
              current_part_resources,
            )
          when /module.xml$/
            log "INFO: Processing module.xml for #{file_path}"
            # Assumption: every schema is only linked by a single module document.
            # puts current_document.xpath('//module').length
            schema_name = current_document.xpath("//module").first["name"]
            if parsed_schema_names[schema_name]
              log "ERROR: We have encountered this schema before: " \
                  "#{schema_name} from path " \
                  "#{parsed_schema_names[schema_name]}, " \
                  "now at #{file_path}"
              # Skips the remainder of this file, including the bookkeeping
              # at the bottom of the loop.
              next
            end
            parsed_schema_names[schema_name] = file_path

            current_part_modules_arm = collect_module_concepts(
              schema_name, "arm_descriptions.xml", bibdata
            )
            current_part_modules_mim = collect_module_concepts(
              schema_name, "mim_descriptions.xml", bibdata
            )
          end

          log "INFO: Completed processing XML file #{fpath}"
          if current_part_concepts.to_a.empty?
            log "INFO: Skipping #{fpath} (#{bibdata.docid}) " \
                "because it contains no concepts."
          elsif current_part_concepts.to_a.length < 3
            log "INFO: Skipping #{fpath} (#{bibdata.docid}) " \
                "because it only has #{current_part_concepts.to_a.length} terms."

            # Too few terms for a part of its own: move them to the shared
            # collection.
            current_part_concepts.to_a.each do |x|
              general_concepts.store(x)
            end
          else
            part_concepts << [bibdata, current_part_concepts]
          end

          unless current_part_resources.to_a.empty?
            part_resources << [bibdata, current_part_resources]
          end

          if (current_part_modules_arm.to_a.size +
              current_part_modules_mim.to_a.size).positive?
            part_modules << [bibdata, current_part_modules_arm,
                             current_part_modules_mim]
          end
        end
      end

      # Builds the revision comment appended to the file path of each concept.
      # Outside CVS mode, or when `cvs status` prints nothing, the
      # "not detected" text is returned.
      #
      # @param fpath [Pathname] document path relative to the STEPmod directory
      # @return [String]
      def detect_revision_string(fpath)
        revision_string = "\n// CVS: revision not detected"
        return revision_string unless cvs_mode

        # Run `cvs status` to find out version
        log "INFO: Detecting CVS revision..."
        Dir.chdir(stepmod_path) do
          status = `cvs status #{fpath}`

          unless status.empty?
            status_lines = status.split(/\n/)
            working_rev = status_lines.grep(/Working revision:/)
              .first.match(/revision:\s+(.+)$/)[1]
            repo_rev = status_lines.grep(/Repository revision:/)
              .first.match(/revision:\t(.+)\t/)[1]
            log "INFO: CVS working rev (#{working_rev}), " \
                "repo rev (#{repo_rev})"
            revision_string =
              "\n// CVS working rev: (#{working_rev}), repo rev (#{repo_rev})\n" +
              "// CVS: revision #{working_rev == repo_rev ? 'up to date' : 'differs'}"
          end
        end

        revision_string
      end

      # Reads `resources/<schema>/descriptions.xml` for every `schema` element
      # of a resource document and stores the parsed concepts either in
      # `current_part_resources` (special parts) or in `resource_concepts`.
      # A schema already recorded in `parsed_schema_names` is logged and
      # skipped.
      #
      # @param current_document [Nokogiri::XML::Node] root of the resource doc
      # @param file_path [Pathname] absolute path of the resource document
      # @param bibdata [Stepmod::Utils::Bibdata]
      # @param parsed_schema_names [Hash] schema name => first referencing path
      #   (updated in place)
      # @param current_part_resources [Glossarist::Collection] updated in place
      def process_resource_schemas(current_document, file_path, bibdata,
                                   parsed_schema_names, current_part_resources)
        # Assumption: every schema is only linked by a single resource_docs document.
        current_document.xpath("//schema").each do |schema_node|
          schema_name = schema_node["name"]
          if parsed_schema_names[schema_name]
            log "ERROR: We have encountered this schema before: " \
                "#{schema_name} from path " \
                "#{parsed_schema_names[schema_name]}, now at #{file_path}"
            next
          end
          parsed_schema_names[schema_name] = file_path

          Dir["#{stepmod_path}/resources/#{schema_name}/descriptions.xml"]
            .each do |description_xml_path|
            log "INFO: Processing resources schema #{description_xml_path}"
            description_document = Nokogiri::XML(
              File.read(description_xml_path),
            ).root

            description_document.xpath("//ext_description")
              .each do |ext_description|
              # log "INFO: Processing linkend[#{ext_description['linkend']}]"
              concept = Stepmod::Utils::Concept.parse(
                ext_description,
                reference_anchor: bibdata.anchor,
                reference_clause: nil,
                file_path: Pathname.new(description_xml_path)
                             .relative_path_from(stepmod_path),
              )
              next unless concept

              if term_special_category(bibdata)
                # log "INFO: this part is special"
                find_or_initialize_concept(current_part_resources, concept)
              else
                # log "INFO: this part is generic"
                find_or_initialize_concept(resource_concepts, concept)
              end

              parsed_bibliography << bibdata
            end
          end
        end
      end

      # Parses `modules/<schema_name>/<descriptions_file>` (if present) and
      # groups the resulting concepts by the first dot-separated segment of
      # each description's `linkend` attribute.
      #
      # @param schema_name [String] module name
      # @param descriptions_file [String] e.g. "arm_descriptions.xml"
      # @param bibdata [Stepmod::Utils::Bibdata]
      # @return [Hash{String => Glossarist::Collection}] empty when the file
      #   does not exist or yields no concepts
      def collect_module_concepts(schema_name, descriptions_file, bibdata)
        collections = {}
        description_xml_path =
          "#{stepmod_path}/modules/#{schema_name}/#{descriptions_file}"
        log "INFO: Processing modules schema #{description_xml_path}"
        return collections unless File.exist?(description_xml_path)

        description_document = Nokogiri::XML(
          File.read(description_xml_path),
        ).root

        description_document.xpath("//ext_description").each do |ext_description|
          linkend_schema = ext_description["linkend"].split(".").first
          concept = Stepmod::Utils::Concept.parse(
            ext_description,
            reference_anchor: bibdata.anchor,
            reference_clause: nil,
            file_path: Pathname.new(description_xml_path)
                         .relative_path_from(stepmod_path),
          )
          next unless concept

          collections[linkend_schema] ||= Glossarist::Collection.new
          find_or_initialize_concept(collections[linkend_schema], concept)
          parsed_bibliography << bibdata
        end

        collections
      end

      # Stores a new Glossarist::Concept with a random UUID in `collection`
      # and attaches `localized_concept` to whatever `store` returns. Despite
      # the name, no lookup of an existing concept is performed here.
      def find_or_initialize_concept(collection, localized_concept)
        concept = collection
          .store(Glossarist::Concept.new(id: SecureRandom.uuid))
        concept.add_l10n(localized_concept)
      end
    end
  end
end