require "stepmod/utils/stepmod_definition_converter"
require "stepmod/utils/bibdata"
require "stepmod/utils/concept"
require "glossarist"
require "pathname"
require "securerandom"
require "expressir"
require "expressir/express/parser"
require "indefinite_article"

ReverseAdoc.config.unknown_tags = :bypass

module Stepmod
  module Utils
    # Reads the documents listed in a STEPmod repository index and collects
    # the terms they define (XML `definition` elements) together with one
    # concept per entity found in the annotated EXPRESS schemas of resource
    # and module documents.
    class TermsExtractor
      # TODO: we may want a command line option to override this in the future
      ACCEPTED_STAGES = %w(IS DIS FDIS TS).freeze
      WITHDRAWN_STATUS = "withdrawn".freeze

      # Index element name => [directory under the STEPmod root, document file name].
      INDEXED_DOCUMENTS = {
        "module" => ["modules", "module.xml"],
        "resource_doc" => ["resource_docs", "resource.xml"],
        "business_object_model" => ["business_object_models", "business_object_model.xml"],
        "application_protocol" => ["application_protocols", "application_protocol.xml"],
      }.freeze
      private_constant :INDEXED_DOCUMENTS

      attr_reader :stepmod_path,
                  :stepmod_dir,
                  :index_path,
                  :general_concepts,
                  :resource_concepts,
                  :parsed_bibliography,
                  :encountered_terms,
                  :part_concepts,
                  :part_resources,
                  :part_modules,
                  :stdout

      # Convenience entry point: builds an extractor and runs it.
      #
      # @param stepmod_dir [String] root directory of the STEPmod checkout
      # @param index_path [String] path to the repository index XML
      # @param stdout [IO] stream that receives log lines
      # @return [Array] see {#call}
      def self.call(stepmod_dir, index_path, stdout = $stdout)
        new(stepmod_dir, index_path, stdout).call
      end

      def initialize(stepmod_dir, index_path, stdout)
        @stdout = stdout
        @stepmod_dir = stepmod_dir
        @stepmod_path = Pathname.new(stepmod_dir).realpath
        @index_path = Pathname.new(index_path).to_s
        @general_concepts = Glossarist::Collection.new
        @resource_concepts = Glossarist::Collection.new
        @parsed_bibliography = []
        @part_concepts = []
        @part_resources = []
        @part_modules = []
        @encountered_terms = {}
      end

      # Writes a prefixed log line to the configured output stream.
      def log(message)
        stdout.puts "[stepmod-utils] #{message}"
      end

      # Tells whether the document's part number is one whose terms are kept
      # in per-part collections instead of the shared general collections.
      #
      # @return [Boolean] true for parts 41-47, 51 and 56-112
      def term_special_category(bibdata)
        case bibdata.part.to_i
        when 41..47, 51, 56..112
          # NOTE: the range used to be written as `[56..112]`, an Array that
          # `case` never matches against an Integer.
          true
        else
          false
        end
      end

      # Locates every non-withdrawn document listed in the repository index,
      # processes each one, and returns the accumulated results.
      #
      # @return [Array] general_concepts, resource_concepts,
      #   parsed_bibliography, part_concepts, part_resources, part_modules
      def call
        log "INFO: STEPmod directory set to #{stepmod_dir}."
        log "INFO: Detecting paths..."

        repo_index = Nokogiri::XML(File.read(@index_path)).root

        files = INDEXED_DOCUMENTS.flat_map do |tag, (directory, filename)|
          indexed_document_paths(repo_index, tag, directory, filename)
        end

        process_term_files(files.sort.uniq)

        [
          general_concepts,
          resource_concepts,
          parsed_bibliography,
          part_concepts,
          part_resources,
          part_modules,
        ]
      end

      private

      # Paths of the existing, non-withdrawn documents of one index element type.
      def indexed_document_paths(repo_index, tag, directory, filename)
        repo_index.xpath("//#{tag}")
          .reject { |node| node["status"] == WITHDRAWN_STATUS }
          .map { |node| Pathname.new("#{stepmod_dir}/#{directory}/#{node['name']}/#{filename}") }
          .select { |path| File.exist?(path) }
      end

      # Processes each document XML file: extracts its `definition` terms and,
      # for resource.xml / module.xml files, the entities of the annotated
      # EXPRESS schemas they point to. Results are appended to the
      # accumulator attributes of this object.
      def process_term_files(files)
        parsed_schema_names = {}

        files.each do |file_path|
          file_path = file_path.realpath
          fpath = file_path.relative_path_from(stepmod_path)

          log "INFO: Processing XML file #{fpath}"
          current_document = Nokogiri::XML(File.read(file_path)).root

          bibdata = nil
          begin
            bibdata = Stepmod::Utils::Bibdata.new(document: current_document)
          rescue StandardError
            log "WARNING: Unknown file #{fpath}, skipped"
            next
          end

          unless ACCEPTED_STAGES.include?(bibdata.doctype)
            log "INFO: skipped #{bibdata.docid} as it is not " \
                "one of (#{ACCEPTED_STAGES.join(', ')})."
            next
          end

          if bibdata.part.to_s.empty?
            log "FATAL: missing `part` attribute: #{fpath}"
            log "INFO: skipped #{bibdata.docid} as it is missing `part` attribute."
            next
          end

          special_part = term_special_category(bibdata)

          # read definitions
          current_part_concepts = Glossarist::Collection.new
          extract_definition_concepts(
            current_document,
            bibdata,
            fpath,
            special_part ? current_part_concepts : general_concepts,
          )

          current_part_resources = Glossarist::Collection.new
          current_part_modules_arm = {}
          current_part_modules_mim = {}

          log "INFO: FILE PATH IS #{file_path}"
          case file_path.to_s
          when /resource.xml$/
            log "INFO: Processing resource.xml for #{file_path}"

            resource_target = special_part ? current_part_resources : resource_concepts
            current_document.xpath("//schema").each do |schema_node|
              schema_name = schema_node["name"]
              next unless claim_schema(parsed_schema_names, schema_name, file_path)

              collect_resource_concepts(schema_name, bibdata, resource_target)
            end
          when /module.xml$/
            log "INFO: Processing module.xml for #{file_path}"
            # Assumption: every schema is only linked by a single module document.
            schema_name = current_document.xpath("//module").first["name"]

            # A repeated schema skips the remainder of this file entirely.
            next unless claim_schema(parsed_schema_names, schema_name, file_path)

            module_dir = "#{stepmod_path}/modules/#{schema_name}"
            current_part_modules_arm =
              collect_module_concepts("#{module_dir}/arm_annotated.exp", bibdata)
            current_part_modules_mim =
              collect_module_concepts("#{module_dir}/mim_annotated.exp", bibdata)
          end

          log "INFO: Completed processing XML file #{fpath}"

          term_count = current_part_concepts.to_a.length
          if term_count.zero?
            log "INFO: Skipping #{fpath} (#{bibdata.docid}) " \
                "because it contains no concepts."
          elsif term_count < 3
            # Too few terms for a per-part collection: fold them into the
            # general collection instead.
            log "INFO: Skipping #{fpath} (#{bibdata.docid}) " \
                "because it only has #{term_count} terms."
            current_part_concepts.to_a.each do |x|
              general_concepts.store(x)
            end
          else
            part_concepts << [bibdata, current_part_concepts]
          end

          unless current_part_resources.to_a.empty?
            part_resources << [bibdata, current_part_resources]
          end

          unless current_part_modules_arm.empty? && current_part_modules_mim.empty?
            part_modules << [bibdata, current_part_modules_arm, current_part_modules_mim]
          end
        end
      end

      # Revision suffix appended to the source path of extracted terms.
      # The Git SHA is the same for every file of the checkout, so `git` is
      # run only once and the result is reused.
      #
      # @return [String] "" when `git rev-parse HEAD` prints nothing
      def revision_string
        @revision_string ||= begin
          log "INFO: Detecting Git SHA..."
          git_sha = Dir.chdir(stepmod_path) { `git rev-parse HEAD` }
          git_sha.empty? ? "" : "\n// Git: SHA #{git_sha}"
        end
      end

      # Parses every `definition` element of the document into a concept and
      # stores it in +collection+. Duplicate term ids are logged, not skipped.
      def extract_definition_concepts(document, bibdata, fpath, collection)
        revision = revision_string

        document.xpath("//definition").each_with_index do |definition, index|
          term_id = definition["id"]
          unless term_id.nil?
            if encountered_terms[term_id]
              log "FATAL: Duplicated term with id: #{term_id}, #{fpath}"
            end
            encountered_terms[term_id] = true
          end

          # Assume that definition is located in clause 3 of the ISO document
          # in order. We really don't have a good reference here.
          concept = Stepmod::Utils::Concept.parse(
            definition,
            reference_anchor: bibdata.anchor,
            reference_clause: "3.#{index + 1}",
            # NOTE(review): fpath is a Pathname, so `+` performs path joining
            # rather than string concatenation -- confirm this is what
            # Concept.parse expects.
            file_path: fpath + revision,
          )
          next unless concept

          find_or_initialize_concept(collection, concept)
          parsed_bibliography << bibdata
        end
      end

      # Records that +schema_name+ is being processed from +file_path+.
      #
      # @return [Boolean] false (after logging an error) when the schema was
      #   already recorded from an earlier file
      def claim_schema(parsed_schema_names, schema_name, file_path)
        if parsed_schema_names[schema_name]
          log "ERROR: We have encountered this schema before: " \
              "#{schema_name} from path " \
              "#{parsed_schema_names[schema_name]}, now at #{file_path}"
          return false
        end

        parsed_schema_names[schema_name] = file_path
        true
      end

      # Adds one concept per entity of the first schema in a resource's
      # annotated EXPRESS file to +collection+. Does nothing when the file
      # is missing.
      def collect_resource_concepts(schema_name, bibdata, collection)
        exp_annotated_path =
          "#{stepmod_path}/resources/#{schema_name}/#{schema_name}_annotated.exp"

        log "INFO: Processing resources schema #{exp_annotated_path}"
        return unless File.exist?(exp_annotated_path)

        repo = Expressir::Express::Parser.from_file(exp_annotated_path)
        schema = repo.schemas.first
        domain = "resource: #{schema.id}"

        schema.entities.each do |entity|
          concept = build_entity_concept(entity, domain, bibdata, exp_annotated_path)
          find_or_initialize_concept(collection, concept)
          parsed_bibliography << bibdata
        end
      end

      # Builds concepts for every entity of every schema in a module's
      # annotated EXPRESS file (ARM or MIM).
      #
      # @return [Hash] schema id => Glossarist::Collection; empty when the
      #   file does not exist
      def collect_module_concepts(exp_annotated_path, bibdata)
        concepts_by_schema = {}

        log "INFO: Processing modules schema #{exp_annotated_path}"
        return concepts_by_schema unless File.exist?(exp_annotated_path)

        repo = Expressir::Express::Parser.from_file(exp_annotated_path)
        repo.schemas.each do |schema|
          domain = "application module: #{schema.id}"

          schema.entities.each do |entity|
            concept = build_entity_concept(entity, domain, bibdata, exp_annotated_path)
            concepts_by_schema[schema.id] ||= Glossarist::Collection.new
            find_or_initialize_concept(concepts_by_schema[schema.id], concept)
            parsed_bibliography << bibdata
          end
        end

        concepts_by_schema
      end

      # Creates the localized concept for one EXPRESS entity. The recorded
      # file path is that of the EXPRESS file the entity was read from,
      # relative to the STEPmod root.
      def build_entity_concept(entity, domain, bibdata, exp_annotated_path)
        old_definition = entity.remarks.first
        reference_anchor = bibdata.anchor
        # Entities carry no clause reference, so the id ends with a bare ".".
        reference_clause = nil

        Stepmod::Utils::Concept.new(
          designations: [entity.id],
          definition: old_definition,
          converted_definition: generate_entity_definition(entity, domain, old_definition),
          id: "#{reference_anchor}.#{reference_clause}",
          reference_anchor: reference_anchor,
          reference_clause: reference_clause,
          file_path: Pathname.new(exp_annotated_path)
            .relative_path_from(stepmod_path),
          language_code: "en",
        )
      end

      # Stores a new concept with a random UUID in +collection+ and attaches
      # +localized_concept+ to it.
      def find_or_initialize_concept(collection, localized_concept)
        concept_id = SecureRandom.uuid
        concept = collection.store(Glossarist::Concept.new(id: concept_id))
        concept.add_l10n(localized_concept)
      end

      # Decides how much text to keep when folding +next_paragraph+ into the
      # text accumulated so far.
      #
      # @return [String] the first sentence of +full_paragraph+ when its first
      #   line contains one; otherwise +full_paragraph+, extended by
      #   +next_paragraph+ when that looks like a list or a continuation
      def combine_paragraphs(full_paragraph, next_paragraph)
        # If full_paragraph already contains a period, extract that.
        first_sentence = full_paragraph.match(/\A(?<inner_first>[^\n]*?\.)\s/)
        return first_sentence[:inner_first] if first_sentence

        # If full_paragraph ends with a period, this is the last.
        return full_paragraph if full_paragraph =~ /\.\s*\Z/

        # If next_paragraph is a list, or a continuation of a list
        if next_paragraph.start_with?("*", "which", "that")
          return full_paragraph + "\n\n" + next_paragraph
        end

        full_paragraph
      end

      # Shortens a definition to its leading sentence (plus any list that
      # directly continues it) and removes `//` comment lines.
      def trim_definition(definition)
        # Unless the first paragraph ends with "between" and is followed by a
        # list, don't split
        paragraphs = definition.to_s.split("\n\n")
        return "" if paragraphs.empty?

        first_paragraph, *following = paragraphs
        # A lone paragraph is still run through combine_paragraphs once.
        following = [""] if following.empty?

        combined = following.inject(first_paragraph) do |acc, paragraph|
          combine_paragraphs(acc, paragraph)
        end

        # Remove comments until end of line. The lookahead leaves the
        # terminating newline in place so that consecutive comment lines are
        # all removed.
        "#{combined}\n".gsub(/\n\/\/[^\n]*(?=\n)/, "").strip

        # # TODO: If the definition contains a list immediately after the first paragraph, don't split
        # return definition if definition =~ /\n\* /

        # unless (
        #   first_paragraph =~ /between:?\s*\Z/ ||
        #   first_paragraph =~ /include:?\s*\Z/ ||
        #   first_paragraph =~ /of:?\s*\Z/ ||
        #   first_paragraph =~ /[:;]\s*\Z/
        #   ) &&
        #   definition =~ /\n\n\*/

        #   # Only taking the first paragraph of the definition
        #   first_paragraph
        # end
      end

      # rubocop:disable Layout/LineLength
      # Produces the AsciiDoc term entry for an EXPRESS entity: a generated
      # one-line definition, followed by the trimmed original remark as a
      # NOTE block when there is one.
      #
      # @return [String] "" when +entity+ is nil
      def generate_entity_definition(entity, domain, old_definition)
        return "" if entity.nil?

        article = entity.id.indefinite_article
        entity_text =
          if entity.subtype_of.size.zero?
            "entity data type that represents #{article} **#{entity.id}** entity"
          else
            supertypes = entity.subtype_of.map(&:id).join("** and **")
            "entity data type that is a type of **#{supertypes}** that represents #{article} **#{entity.id}** entity"
          end

        definition = <<~DEFINITION
          === #{entity.id}
          domain:[#{domain}]

          #{entity_text}

        DEFINITION

        unless old_definition.nil? || old_definition.blank?
          old_definition = trim_definition(old_definition)

          definition << <<~OLD_DEFINITION
            [NOTE]
            --
            #{old_definition.strip}
            --
          OLD_DEFINITION
        end

        # We no longer add Notes and Examples to the extracted terms
        # definition + format_remark_items(entity.remark_items)

        definition
      end

      # Formats the remarks of the `__note` and `__example` remark items as
      # AsciiDoc blocks, notes first.
      def format_remark_items(remark_items)
        notes = remark_items.find { |i| i.id == "__note" }&.remarks
        examples = remark_items.find { |i| i.id == "__example" }&.remarks

        format_remarks(notes, "NOTE", "--") + format_remarks(examples, "example", "====")
      end
      # rubocop:enable Layout/LineLength

      # Wraps each remark in an AsciiDoc block labelled +remark_item_name+
      # and delimited by +remark_item_symbol+.
      #
      # @return [String] "" when +remarks+ is nil
      def format_remarks(remarks, remark_item_name, remark_item_symbol)
        return "" if remarks.nil?

        remarks.map do |remark|
          <<~REMARK

            [#{remark_item_name}]
            #{remark_item_symbol}
            #{remark}
            #{remark_item_symbol}
          REMARK
        end.join
      end
    end
  end
end