module Asciidoctor module Standoc module Cleanup def empty_text_before_first_element(x) x.children.each do |c| return false if c.text? and /\S/.match(c.text) return true if c.element? end true end def strip_initial_space(x) if x.children[0].text? if !/\S/.match(x.children[0].text) x.children[0].remove else x.children[0].content = x.children[0].text.gsub(/^ /, "") end end end def bookmark_cleanup(xmldoc) xmldoc.xpath("//li[descendant::bookmark]").each do |x| if x&.elements&.first&.name == "p" && x&.elements&.first&.elements&.first&.name == "bookmark" if empty_text_before_first_element(x.elements[0]) x["id"] = x.elements[0].elements[0].remove["id"] strip_initial_space(x.elements[0]) end end end end # extending localities to cover ISO referencing LOCALITY_REGEX_STR = <<~REGEXP.freeze ^((?section|clause|part|paragraph|chapter|page| table|annex|figure|example|note|formula|list|time|anchor| locality:[^ \\t\\n\\r:,;=]+)(\\s+|=) (?[^"][^ \\t\\n,:-]*|"[^"]+") (-(?[^"][^ \\t\\n,:-]*|"[^"]"))?| (?whole|locality:[^ \\t\\n\\r:,;=]+))(?[,:;]?)\\s* (?.*)$ REGEXP LOCALITY_RE = Regexp.new(LOCALITY_REGEX_STR.gsub(/\s/, ""), Regexp::IGNORECASE | Regexp::MULTILINE) def tq(x) x.sub(/^"/, "").sub(/"$/, "") end def extract_localities(x) f = x&.children&.first or return f.text? or return head = f.remove.text tail = x&.children&.remove extract_localities1(x, head) tail and x << tail end def extract_localities1(x, text) b = x.add_child("").first if LOCALITY_RE.match text while (m = LOCALITY_RE.match text) ref = m[:ref] ? "#{tq m[:ref]}" : "" refto = m[:to] ? "#{tq m[:to]}" : "" loc = m[:locality]&.downcase || m[:locality2]&.downcase b.add_child("#{ref}#{refto}") text = m[:text] b = x.add_child("").first if m[:punct] == ";" end x.add_child(text) if text end def xref_to_eref(x) x["bibitemid"] = x["target"] x["citeas"] = @anchors&.dig(x["target"], :xref) || @log.add("Crossreferences", x, "#{x['target']} does not have a corresponding anchor ID "\ "in the bibliography!") x.delete("target") extract_localities(x) unless x.children.empty? end def xref_cleanup(xmldoc) xmldoc.xpath("//xref").each do |x| if refid? x["target"] x.name = "eref" xref_to_eref(x) else x.delete("type") end end end def quotesource_cleanup(xmldoc) xmldoc.xpath("//quote/source | //terms/source").each do |x| xref_to_eref(x) end end def origin_cleanup(xmldoc) xmldoc.xpath("//origin/concept[termref]").each do |x| x.replace(x.children) end xmldoc.xpath("//origin").each do |x| x["citeas"] = @anchors&.dig(x["bibitemid"], :xref) || @log.add("Crossreferences", x, "#{x['bibitemid']} does not have a corresponding anchor "\ "ID in the bibliography!") extract_localities(x) unless x.children.empty? end end def concept_cleanup(xmldoc) xmldoc.xpath("//concept").each do |x| x.delete("term") if x["term"].empty? if /:/.match(x["key"]) then concept_termbase_cleanup(x) elsif refid? x["key"] then concept_eref_cleanup(x) else concept_xref_cleanup(x) end x.delete("key") end end def concept_termbase_cleanup(x) text = x&.children&.first&.remove&.text termbase, key = x["key"].split(/:/, 2) x.add_child(%() + "#{text}") end def concept_xref_cleanup(x) text = x&.children&.first&.remove&.text x.add_child(%(#{text})) end def concept_eref_cleanup(x) x.children = "#{x.children.to_xml}" extract_localities(x.first_element_child) end NAMECHAR = "\u0000-\u0022\u0024\u002c\u002f\u003a-\u0040\\u005b-\u005e"\ "\u0060\u007b-\u00b6\u00b8-\u00bf\u00d7\u00f7\u037e\u2000-\u200b"\ "\u200e-\u203e\u2041-\u206f\u2190-\u2bff\u2ff0-\u3000".freeze #"\ud800-\uf8ff\ufdd0-\ufdef\ufffe-\uffff".freeze NAMESTARTCHAR = "\\u002d\u002e\u0030-\u0039\u00b7\u0300-\u036f"\ "\u203f-\u2040".freeze def to_ncname(s) start = s[0] ret1 = %r([#{NAMECHAR}#]).match(start) ? "_" : (%r([#{NAMESTARTCHAR}#]).match(start) ? "_#{start}" : start) ret2 = s[1..-1] || "" ret = (ret1 || "") + ret2.gsub(%r([#{NAMECHAR}#]), "_") ret end module_function :to_ncname def to_xreftarget(s) return to_ncname(s) unless /^[^#]+#.+$/.match(s) /^(?[^#]+)#(?.+)$/ =~ s pref = pref.gsub(%r([#{NAMECHAR}]), "_") suff = suff.gsub(%r([#{NAMECHAR}]), "_") "#{pref}##{suff}" end IDREF = "//*/@id | //review/@from | //review/@to | "\ "//callout/@target | //citation/@bibitemid | //eref/@bibitemid".freeze def anchor_cleanup(x) anchor_cleanup1(x) xreftarget_cleanup(x) end def anchor_cleanup1(x) x.xpath(IDREF).each do |s| if (ret = to_ncname(s.value)) != (orig = s.value) s.value = ret output = s.parent.dup output.children.remove @log.add("Anchors", s.parent, "normalised identifier in #{output} "\ "from #{orig}") end end end def xreftarget_cleanup(x) x.xpath("//xref/@target").each do |s| if (ret = to_xreftarget(s.value)) != (orig = s.value) s.value = ret output = s.parent.dup output.children.remove @log.add("Anchors", s.parent, "normalised identifier in #{output} "\ "from #{orig}") end end end end end end