require "metanorma-utils"
require "digest"

module Metanorma
  module Standoc
    # Clean-up passes over the parsed XML document: bookmark-to-id folding,
    # concept/related reference rewriting, and identifier normalisation.
    #
    # Relies on names defined outside this file: `refid?`, `term_expr`,
    # `extract_localities`, and the `@log` instance variable.
    module Cleanup
      # Reports whether no non-blank text precedes the first child element.
      #
      # @param elem [#children] node whose children are scanned in order
      # @return [Boolean] false if a text child containing a non-whitespace
      #   character comes before any element child; true otherwise (including
      #   when there are no element children at all)
      def empty_text_before_first_element(elem)
        elem.children.each do |c|
          return false if c.text? && /\S/.match(c.text)
          return true if c.element?
        end
        true
      end

      # Tidies the leading text node of +elem+: if it carries non-blank text,
      # a single space at the start of each line is dropped; if it is all
      # whitespace, the node is removed. Does nothing when the first child is
      # not a text node, or when +elem+ has no children (which happens when a
      # just-removed bookmark was the only child).
      #
      # @param elem [#children] node to tidy
      # @return [void]
      def strip_initial_space(elem)
        first = elem.children.first
        return unless first&.text?

        if /\S/.match?(first.text)
          first.content = first.text.gsub(/^ /, "")
        else
          first.remove
        end
      end

      # Runs bookmark folding for list items and definition terms.
      #
      # @param xmldoc [#xpath] document to process
      # @return [void]
      def bookmark_cleanup(xmldoc)
        li_bookmark_cleanup(xmldoc)
        dt_bookmark_cleanup(xmldoc)
      end

      # Moves the +id+ of +bookmark+ onto +elem+, removing the bookmark from
      # the tree and tidying the whitespace it leaves at the start of its
      # former parent.
      #
      # @param elem node receiving the id
      # @param bookmark bookmark node to remove
      # @return [void]
      def bookmark_to_id(elem, bookmark)
        parent = bookmark.parent
        elem["id"] = bookmark.remove["id"]
        strip_initial_space(parent)
      end

      # For every +li+ whose first child element is a +p+ that itself starts
      # with a +bookmark+ (no non-blank text before it), folds the bookmark
      # into the +li+'s id.
      #
      # @param xmldoc [#xpath] document to process
      # @return [void]
      def li_bookmark_cleanup(xmldoc)
        xmldoc.xpath("//li[descendant::bookmark]").each do |x|
          next unless x.at("./*[1][local-name() = 'p']/"\
                           "*[1][local-name() = 'bookmark']") &&
            empty_text_before_first_element(x.elements[0])

          bookmark_to_id(x, x.elements[0].elements[0])
        end
      end

      # Same as {#li_bookmark_cleanup} for +dt+, additionally handling a
      # +bookmark+ that is the first child element of the +dt+ itself.
      #
      # @param xmldoc [#xpath] document to process
      # @return [void]
      def dt_bookmark_cleanup(xmldoc)
        xmldoc.xpath("//dt[descendant::bookmark]").each do |x|
          if x.at("./*[1][local-name() = 'p']/"\
                  "*[1][local-name() = 'bookmark']") &&
              empty_text_before_first_element(x.elements[0])
            bookmark_to_id(x, x.elements[0].elements[0])
          elsif x.at("./*[1][local-name() = 'bookmark']") &&
              empty_text_before_first_element(x)
            bookmark_to_id(x, x.elements[0])
          end
        end
      end

      # Processes every +concept+ without a +termxref+ child: drops an empty
      # +refterm+ child, then rewrites the concept's reference.
      #
      # @param xmldoc [#xpath] document to process
      # @return [void]
      def concept_cleanup(xmldoc)
        xmldoc.xpath("//concept[not(termxref)]").each do |x|
          term = x.at("./refterm")
          term&.remove if term&.text&.empty?
          concept_cleanup1(x)
        end
      end

      # Rewrites one concept-like element according to its +key+ attribute,
      # then deletes that attribute. A key containing ":" goes to the termbase
      # handler; a key accepted by +refid?+ (defined elsewhere) goes to the
      # eref handler; anything else goes to the xref handler.
      #
      # @param elem element carrying a +key+ attribute
      # @return [void]
      def concept_cleanup1(elem)
        # Children whose combined text is blank are discarded up front.
        elem.children.remove if elem&.children&.text&.strip&.empty?
        key_extract_locality(elem)
        if /:/.match?(elem["key"])
          concept_termbase_cleanup(elem)
        elsif refid?(elem["key"])
          concept_eref_cleanup(elem)
        else
          concept_xref_cleanup(elem)
        end
        elem.delete("key")
      end

      # Processes every +related+ without a +termxref+ child: replaces its
      # +refterm+ child with the result of +term_expr+ (defined elsewhere),
      # then rewrites the reference as for concepts.
      #
      # @param xmldoc [#xpath] document to process
      # @return [void]
      def related_cleanup(xmldoc)
        xmldoc.xpath("//related[not(termxref)]").each do |x|
          term = x.at("./refterm")
          # NOTE(review): the bare interpolation followed by an empty literal
          # suggests wrapper markup may have been lost from this string;
          # confirm against the upstream source.
          term.replace("#{term_expr(term.children.to_xml)}"\
                       "")
          concept_cleanup1(x)
        end
      end

      # When +key+ contains a comma, appends everything after the first comma
      # as a child of +elem+ and truncates +key+ at that comma.
      #
      # @param elem element carrying a +key+ attribute
      # @return [void]
      def key_extract_locality(elem)
        return unless /,/.match?(elem["key"])

        # NOTE(review): concept_eref_cleanup later looks up a "locality"
        # child, so this literal presumably wrapped the text in markup that
        # is no longer present; confirm against the upstream source.
        elem.add_child("#{elem['key'].sub(/^[^,]+,/, '')}")
        elem["key"] = elem["key"].sub(/,.*$/, "")
      end

      # Handles a key of the form "termbase:key": detaches any +xrefrender+
      # child and re-adds its children's XML to +elem+.
      #
      # @param elem element carrying a +key+ attribute
      # @return [void]
      def concept_termbase_cleanup(elem)
        t = elem&.at("./xrefrender")&.remove&.children
        # NOTE(review): termbase and key are computed but unused, and the
        # leading literal is empty; markup referencing them was presumably
        # lost. Confirm against the upstream source before relying on this.
        termbase, key = elem["key"].split(/:/, 2)
        elem.add_child(%() + "#{t&.to_xml}")
      end

      # Handles a plain cross-reference key: detaches any +xrefrender+ child
      # and re-adds its children's XML to +elem+.
      #
      # @param elem element carrying a +key+ attribute
      # @return [void]
      def concept_xref_cleanup(elem)
        t = elem&.at("./xrefrender")&.remove&.children
        # NOTE(review): presumably this wrapped the content in reference
        # markup built from elem["key"]; confirm against upstream.
        elem.add_child(%(#{t&.to_xml}))
      end

      # Handles a bibliographic reference key: detaches +xrefrender+ and
      # +locality+ children, re-adds the locality content, runs
      # +extract_localities+ (defined elsewhere) on the last child element,
      # and appends the rendered content to that element when present.
      #
      # @param elem element carrying a +key+ attribute
      # @return [void]
      def concept_eref_cleanup(elem)
        t = elem&.at("./xrefrender")&.remove&.children&.to_xml
        l = elem&.at("./locality")&.remove&.children&.to_xml
        # NOTE(review): the following lines operate on elem.elements[-1] as
        # if a wrapper element had just been added; the literal presumably
        # lost that markup. Confirm against the upstream source.
        elem.add_child "#{l}"
        extract_localities(elem.elements[-1])
        elem.elements[-1].add_child(t) if t
      end

      # Normalises an xref target. A target of the form "prefix#suffix" has
      # each half sanitised separately, replacing every character in the
      # Metanorma::Utils::NAMECHAR class with "_", so the "#" separator
      # survives. Any other target is passed to Metanorma::Utils.to_ncname.
      #
      # @param str [String] raw target
      # @return [String] normalised target
      def to_xreftarget(str)
        m = /^(?<pref>[^#]+)#(?<suff>.+)$/.match(str)
        return Metanorma::Utils::to_ncname(str) unless m

        pref = m[:pref].gsub(%r([#{Metanorma::Utils::NAMECHAR}])o, "_")
        suff = m[:suff].gsub(%r([#{Metanorma::Utils::NAMECHAR}])o, "_")
        "#{pref}##{suff}"
      end

      # XPath selecting every attribute that is normalised by
      # {#anchor_cleanup1}.
      IDREF = "//*/@id | //review/@from | //review/@to | "\
              "//callout/@target | //citation/@bibitemid | "\
              "//eref/@bibitemid".freeze

      # Runs all identifier normalisation passes in order.
      #
      # @param elem [#xpath] document or subtree to process
      # @return [void]
      def anchor_cleanup(elem)
        anchor_cleanup1(elem)
        xreftarget_cleanup(elem)
        contenthash_id_cleanup(elem)
      end

      # Rewrites every attribute matched by IDREF through
      # Metanorma::Utils.to_ncname, logging each value that changed.
      #
      # @param elem [#xpath] document or subtree to process
      # @return [void]
      def anchor_cleanup1(elem)
        elem.xpath(IDREF).each do |s|
          orig = s.value
          ret = Metanorma::Utils::to_ncname(orig)
          next if ret == orig

          s.value = ret
          # Log a childless copy of the owning element to keep messages short.
          output = s.parent.dup
          output.children.remove
          @log.add("Anchors", s.parent,
                   "normalised identifier in #{output} from #{orig}")
        end
      end

      # Rewrites every xref target through {#to_xreftarget}, logging each
      # value that changed.
      #
      # @param elem [#xpath] document or subtree to process
      # @return [void]
      def xreftarget_cleanup(elem)
        elem.xpath("//xref/@target").each do |s|
          orig = s.value
          ret = to_xreftarget(orig)
          next if ret == orig

          s.value = ret
          # Log a childless copy of the owning element to keep messages short.
          output = s.parent.dup
          output.children.remove
          @log.add("Anchors", s.parent,
                   "normalised identifier in #{output} from #{orig}")
        end
      end

      # Tests whether +str+ has the shape "_" followed by 8-4-4-4-12
      # lowercase hexadecimal digits separated by hyphens.
      #
      # @param str [String, nil]
      # @return [Boolean]
      def guid?(str)
        /^_[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/
          .match?(str)
      end

      # Replaces GUID-shaped ids with content hashes and updates references
      # to them.
      #
      # @param doc [#xpath] document to process
      # @return [void]
      def contenthash_id_cleanup(doc)
        ids = contenthash_id_make(doc)
        contenthash_id_update_refs(doc, ids)
      end

      # Replaces each GUID-shaped +id+ with a hash of the element's path and
      # text.
      #
      # @param doc [#xpath] document to process
      # @return [Hash{String=>String}] map from old id to new id
      def contenthash_id_make(doc)
        doc.xpath("//*[@id]").each_with_object({}) do |x, m|
          old_id = x["id"]
          next unless guid?(old_id)

          new_id = contenthash(x)
          m[old_id] = new_id
          x["id"] = new_id
        end
      end

      # Rewrites referencing attributes whose value is a key of +ids+.
      #
      # @param doc [#xpath] document to process
      # @param ids [Hash{String=>String}] map from old id to new id
      # @return [void]
      def contenthash_id_update_refs(doc, ids)
        [%w(review from), %w(review to), %w(callout target),
         %w(eref bibitemid), %w(citation bibitemid), %w(xref target),
         %w(xref to)].each do |tag, attr|
          doc.xpath("//#{tag}").each do |x|
            x[attr] = ids[x[attr]] if ids[x[attr]]
          end
        end
      end

      # Builds a GUID-shaped identifier from the MD5 digest of the element's
      # path and text content.
      #
      # @param elem [#path, #text]
      # @return [String] "_" followed by the digest grouped 8-4-4-4-12
      def contenthash(elem)
        Digest::MD5.hexdigest("#{elem.path}////#{elem.text}")
          .sub(/^(.{8})(.{4})(.{4})(.{4})(.{12})$/, "_\\1-\\2-\\3-\\4-\\5")
      end

      # Renames every +passthrough-inline+ element to +passthrough+ and
      # filters the content of those elements and of every +identifier+
      # element through {#select_odd_chars}.
      #
      # @param doc [#xpath] document to process
      # @return [void]
      def passthrough_cleanup(doc)
        doc.xpath("//passthrough-inline").each do |node|
          node.name = "passthrough"
          node.children = select_odd_chars(node.children.to_xml)
        end
        doc.xpath("//identifier").each do |node|
          node.children = select_odd_chars(node.children.to_xml)
        end
      end

      private

      # skip ZWNJ inserted to prevent regexes operating in asciidoctor
      #
      # Removes a U+200C that directly follows a punctuation character other
      # than "&".
      def select_odd_chars(text)
        text.gsub(/(?!&)([[:punct:]])\u200c/, "\\1")
      end
    end
  end
end