module Metanorma
  module Standoc
    # Text-level cleanup helpers: joining converter output into one string
    # and normalising quotation marks in the resulting XML document.
    #
    # The methods here read two instance variables that this module does not
    # set itself and that the including object must provide:
    # * @c           - responds to #decode and #encode (used for character
    #                  entities; presumably an HTMLEntities coder - confirm)
    # * @smartquotes - truthy when straight quotes are to be made typographic
    module Cleanup
      # Joins the (possibly nested) array of output lines into one string,
      # stripping trailing whitespace from every line.
      #
      # NOTE(review): the two substitutions below were reconstructed from a
      # copy of this file in which "<...>" spans had been stripped, leaving
      # only `text.gsub(/\s+` ... `([^<]*)` ... `}mx) { @c.decode($1) } end`.
      # Confirm the tag names and the formats attribute against upstream.
      #
      # @param result [Array] lines of text, arbitrarily nested
      # @return [String] the joined, cleaned-up text
      def textcleanup(result)
        text = result.flatten.map { |l| l.sub(/\s*\Z/, "") } * "\n"
        # drop whitespace separating a footnote from the text it follows
        text = text.gsub(/\s+<fn /, "<fn ")
        # replace a metanorma passthrough wrapper by its entity-decoded
        # content; /x lets the pattern ignore literal whitespace
        %w(passthrough passthrough-inline).each do |v|
          text.gsub!(%r{<#{v}\s+formats="metanorma">([^<]*)</#{v}>}mx) do
            @c.decode($1)
          end
        end
        text
      end

      # Passes every date element to Metanorma::Utils::endash_date, then
      # either smartens quotes (when @smartquotes is set) or reverts
      # typographic apostrophes to straight ones.
      #
      # @param xmldoc [Nokogiri::XML::Node] document, modified in place
      def smartquotes_cleanup(xmldoc)
        xmldoc.xpath("//date").each { |d| Metanorma::Utils::endash_date(d) }
        if @smartquotes then smartquotes_cleanup1(xmldoc)
        else dumbquote_cleanup(xmldoc)
        end
      end

      # Smart-quote pipeline: first move quotes that are separated from
      # their text by an XML element, then convert straight quotes.
      def smartquotes_cleanup1(xmldoc)
        uninterrupt_quotes_around_xml(xmldoc)
        dumb2smart_quotes(xmldoc)
      end

      # "abc<tag/>", def => "abc",<tag/> def
      #
      # For each text node directly preceded by an element, and not
      # excluded by #uninterrupt_quotes_around_xml_skip, moves the quote
      # that follows the element to the text in front of the element.
      def uninterrupt_quotes_around_xml(xmldoc)
        xmldoc.traverse do |n|
          next unless n.text? && n&.previous&.element?
          next if uninterrupt_quotes_around_xml_skip(n)

          uninterrupt_quotes_around_xml1(n.previous)
        end
      end

      # Names of elements inside which quotes are left untouched.
      IGNORE_QUOTES_ELEMENTS =
        %w(pre tt sourcecode stem asciimath figure bibdata passthrough
           identifier presentation-metadata semantic-metadata).freeze

      # Decides whether the text node elem is to be skipped by
      # #uninterrupt_quotes_around_xml. The node is processed (false is
      # returned) only when all of the following hold:
      # * its text starts with a straight single or double quote;
      # * no ancestor of the preceding element is in IGNORE_QUOTES_ELEMENTS
      #   (the [1..-2] slice drops the empty leading path segment and the
      #   preceding element itself);
      # * the preceding element either has blank text and is not one of the
      #   tags listed in #empty_tag_with_text_content?, or is one of the
      #   IGNORE_TEXT_ELEMENTS.
      #
      # @param elem [Nokogiri::XML::Node] text node preceded by an element
      # @return [Boolean] true when the node must be skipped
      def uninterrupt_quotes_around_xml_skip(elem)
        !(/\A['"]/.match?(elem.text) &&
          elem.previous.path.gsub(/\[\d+\]/, "").split(%r{/})[1..-2]
          .intersection(IGNORE_QUOTES_ELEMENTS).empty? &&
          ((elem.previous.text.strip.empty? &&
            !empty_tag_with_text_content?(elem.previous)) ||
           ignoretext?(elem.previous)))
      end

      # Moves a leading quote, plus any punctuation directly after it, from
      # the first text node following elem to the end of the first text
      # node preceding elem. Nothing happens unless the preceding text ends
      # in a non-space character and the moved run is followed by
      # whitespace or the end of the following text.
      #
      # @param elem [Nokogiri::XML::Node] element interrupting the quote
      def uninterrupt_quotes_around_xml1(elem)
        prev = elem.at(".//preceding::text()[1]") or return
        /\S\Z/.match?(prev.text) or return
        foll = elem.at(".//following::text()[1]")
        m = /\A(["'][[:punct:]]*)(\s|\Z)/
          .match(@c.decode(foll&.text)) or return
        foll.content = foll.text.sub(/\A(["'][[:punct:]]*)/, "")
        prev.content = "#{prev.text}#{m[1]}"
      end

      # Names of elements that #ignoretext? reports as ignorable.
      IGNORE_TEXT_ELEMENTS = %w(index fn).freeze

      # @return [Boolean] whether elem's name is in IGNORE_TEXT_ELEMENTS
      def ignoretext?(elem)
        IGNORE_TEXT_ELEMENTS.include? elem.name
      end

      # @return [Boolean] whether elem's name is one of the block-level
      #   element names at which #dumb2smart_quotes resets its
      #   preceding-text context
      def block?(elem)
        %w(title name variant-title clause figure annex example introduction
           foreword acknowledgements note li th td dt dd p quote label
           abstract preferred admitted related deprecates field-of-application
           usage-info expression pronunciation grammar-value domain
           definition termnote termexample modification description
           newcontent floating-title tab).include? elem.name
      end

      # @return [Boolean] whether elem is an eref, xref, termref or link;
      #   the callers treat these as standing for text even when they have
      #   no text content of their own
      def empty_tag_with_text_content?(elem)
        %w(eref xref termref link).include? elem.name
      end

      # Converts straight quotes to typographic ones in every text node
      # that has no ancestor in IGNORE_QUOTES_ELEMENTS. The text of the
      # previously converted node is tracked so that a node-initial quote
      # can be recognised as following other text.
      #
      # @param xmldoc [Nokogiri::XML::Node] document, modified in place
      def dumb2smart_quotes(xmldoc)
        prev = ""
        xmldoc.traverse do |x|
          # a block element starts afresh: forget the preceding text
          block?(x) and prev = ""
          # a cross-reference counts as preceding non-space text
          empty_tag_with_text_content?(x) and prev = "dummy"
          x.text? or next

          ancestors = x.path.gsub(/\[\d+\]/, "").split(%r{/})[1..-2]
          ancestors.intersection(IGNORE_QUOTES_ELEMENTS).empty? or next
          dumb2smart_quotes1(x, prev)
          prev = x.text
        end
      end

      # Smartens one text node. Nodes without any of the characters or
      # sequences matched by the first pattern are left alone. A quote at
      # the very start of the node that follows non-space text is replaced
      # before the node text is passed to Metanorma::Utils::smartformat.
      #
      # NOTE(review): a leading double quote becomes U+201D (right quote)
      # while a leading single quote becomes U+2018 (left quote) - confirm
      # that this asymmetry is intended.
      #
      # @param curr [Nokogiri::XML::Node] text node to rewrite
      # @param prev [String] text seen immediately before curr
      def dumb2smart_quotes1(curr, prev)
        /[-'"(<>]|\.\.|\dx/.match?(curr.text) or return

        /\A["']/.match?(curr.text) && prev.match?(/\S\Z/) and
          curr.content = curr.text.sub(/\A"/, "”").sub(/\A'/, "‘")
        curr.replace(Metanorma::Utils::smartformat(curr.text))
      end

      # Replaces U+2019 by a straight apostrophe wherever it stands between
      # an alphanumeric character and a letter, then re-encodes the node
      # text with @c.encode using :basic and :hexadecimal. Only text nodes
      # containing U+2019 are touched.
      #
      # @param xmldoc [Nokogiri::XML::Node] document, modified in place
      def dumbquote_cleanup(xmldoc)
        xmldoc.traverse do |n|
          next unless n.text? && /\u2019/.match?(n.text)

          n.replace(@c.encode(
                      @c.decode(n.text)
                      .gsub(/(?<=\p{Alnum})\u2019(?=\p{Alpha})/, "'"),
                      :basic, :hexadecimal
                    ))
        end
      end
    end
  end
end