# frozen_string_literal: true

module Dphil
  # Public: Wrapper around a TEI XML source string. The source is parsed
  # lazily with Nokogiri (see #xml); the transformation methods (#crop,
  # #crop_each, #reject, #subst) each return new TeiXML instances rather than
  # modifying the receiver's stored source.
  class TeiXML
    using ::Ragabash::Refinements

    # Public: Initialize a TeiXML object.
    #
    # source - the raw XML source; must respond to #strip. A source that is
    #          empty after stripping whitespace is replaced by an empty string.
    def initialize(source)
      source = %() if source.strip.empty?
      @raw_xml = source
    end

    # Public: Return the parsed document, parsing the raw source on first use
    # and memoizing the result.
    #
    # The source is parsed in strict mode, the document encoding is set to
    # UTF-8, namespaces are removed, and whitespace is normalized (see
    # #xml_normalize!).
    #
    # NOTE(review): `noent` turns entity substitution ON; Nokogiri documents
    # this option as unsafe for untrusted input. Confirm that sources handed to
    # this class are trusted.
    #
    # Returns the parsed Nokogiri document.
    # Raises RuntimeError if the source is not well-formed XML.
    def xml
      @xml ||= begin
        xml = Nokogiri::XML(@raw_xml) { |config| config.strict.noent }
        xml.encoding = "UTF-8"
        xml.remove_namespaces!
        xml_normalize!(xml)
      rescue Nokogiri::XML::SyntaxError => e
        raise "TEIDocument (source: #{@raw_xml}) caught exception: #{e}"
      end
    end

    # Public: Serialize the parsed (normalized) document.
    #
    # Returns the XML as a String.
    def to_xml
      xml.to_xml
    end
    alias to_s to_xml

    # Public: Check whether the document lacks any non-whitespace text.
    #
    # Returns true if no text node has content after whitespace normalization.
    def empty?
      xml.xpath("//text()[normalize-space()]").empty?
    end

    # Public: Return a portion of the document as a new document.
    #
    # All nodes matching the expression are placed together in one new
    # document, prefixed by the preceding pb and lb elements found for them.
    #
    # expr - a CSS selector or XPath expression
    #
    # Returns a new document.
    def crop(expr)
      cropped_document(xml.search(expr))
    end

    # Public: Return each matching portion of the document as its own new
    # document, each prefixed by its nearest preceding pb and lb elements.
    #
    # expr - a CSS selector or XPath expression
    #
    # Returns an Array of new documents, one per matching node.
    def crop_each(expr)
      xml.search(expr).map { |segment| cropped_document(segment) }
    end

    # Public: Remove elements from the document based on CSS selector.
    # Any pb or lb elements inside a removed element are kept in its place.
    #
    # expr - a CSS selector or XPath expression
    #
    # Returns a new document.
    def reject(expr)
      source = xml.dup
      source.search(expr).each do |node|
        node.replace(node.search("pb, lb"))
      end
      self.class.new(source.to_xml)
    end

    # Public: Substitute elements from the document based on CSS selector with
    # ID-based token text-nodes of the form " {{name:id}} ". Any pb or lb
    # elements inside a substituted element are kept after the token.
    #
    # expr       - a CSS selector or XPath expression
    # subst_text - an optional text identifier used in place of the element
    #              name; runs of whitespace in it are replaced by "_"
    #
    # Returns a new document.
    def subst(expr, subst_text = nil)
      # Work on a copy of the parsed document, as #reject does. (This used to
      # call `parsed_xml`, which is not defined on this class.)
      source = xml.dup
      subst_text = subst_text.to_s.gsub(/\s+/, "_") unless subst_text.nil?

      source.search(expr).each do |node|
        set = Nokogiri::XML::NodeSet.new(source)
        escaped_text = ":#{node.attribute('id').to_s.gsub(/\s+/, '_')}"
        text_content = "#{subst_text || node.name}#{escaped_text}"
        set << Nokogiri::XML::Text.new(" {{#{text_content}}} ", source)
        node.replace(set + node.search("pb, lb"))
      end
      self.class.new(source.to_xml)
    end

    private

    # Build a new document from a segment, prefixed by the pb and lb elements
    # that precede it.
    #
    # segment - a node or node set taken from the parsed document.
    #
    # Returns a new document.
    def cropped_document(segment)
      pb = page_of(segment)
      lb = line_of(segment)
      source = <<~EOS
        #{pb&.to_xml}#{lb&.to_xml}
        #{segment.to_xml}
      EOS
      self.class.new(source)
    end

    # Get the nearest preceding pb element.
    #
    # node - node (or node set) in the document to start the search from.
    #
    # Returns the result of the XPath query (empty when there is no match).
    def page_of(node)
      node.xpath("preceding::*[name() = 'pb'][1]")
    end

    # Get the nearest preceding lb element.
    #
    # node - node (or node set) in the document to start the search from.
    #
    # Returns the result of the XPath query (empty when there is no match).
    def line_of(node)
      node.xpath("preceding::*[name() = 'lb'][1]")
    end

    # Normalize (mostly) whitespace in the XML, modifying the document in
    # place.
    #
    # In every text node, each run of whitespace, together with any
    # whitespace, ".", "-", "\", "/" or "_" characters directly following it,
    # is collapsed into a single space. Childless add, del, mod, unclear and g
    # elements are then removed.
    #
    # doc - the parsed document to normalize.
    #
    # Returns the same document.
    def xml_normalize!(doc)
      doc.search("//text()").each do |text_node|
        text_node.content = text_node.content.gsub(%r{\s+[\s\.\-\\\/\_]*}, " ")
      end

      # Remove empty modification tags.
      doc.search(
        "//add[not(node())]|" \
        "//del[not(node())]|" \
        "//mod[not(node())]|" \
        "//unclear[not(node())]|" \
        "//g[not(node())]"
      ).remove
      doc
    end
  end
end