# frozen_string_literal: true
require 'nokogiri'
require 'addressable'
module Epuber
class Compiler
require_relative 'file_finders/normal'
class XHTMLProcessor
# Raised by `resolved_link_to_file` when a link can't be turned into a URI.
class UnparseableLinkError < StandardError
end
# Method for parsing incomplete XML, supports multiple root elements
#
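# @warning Because of the nature of XML, when the input string doesn't contain a root element, it will create
# its own called `body`, since it will be used in the next steps.
#
# @param [String] text input XHTML text
# @param [String, nil] file_path path of the parsed file, attached to the document and to reported problems
#
# @return [Array] two-element array: the parsed Nokogiri::XML::Document and the list of parse errors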
#
def self.xml_doc_from_str_with_errors(text, file_path = nil)
  # Work on a copy: the prolog is cut out of the string in place below and the
  # caller's string (possibly frozen) must stay untouched.
  text = text.dup

  if /\A[\n\r ]+(<\?xml)/ =~ text
    UI.warning('XML header must be at the beginning of document',
               location: Epuber::Location.new(path: file_path, lineno: 1))
    text = text.lstrip
  end

  # Cut out the XML declaration, it has to be placed before the synthetic root element.
  xml_header = nil
  if (match = /\A\s*(<\?xml[^>]*\?>)/.match(text))
    xml_header = match[1]
    text[match.begin(1)...match.end(1)] = ''
  end

  # Cut out all DOCTYPE declarations for the same reason. The pattern has to start
  # with `<!DOCTYPE`; a pattern that matches a bare `>` would strip every tag end
  # from the document.
  doctypes = []
  doctype_pattern = /<!DOCTYPE[^>]*>\n*/
  while (match = doctype_pattern.match(text))
    doctypes << match[0].strip
    text[match.begin(0)...match.end(0)] = ''
  end

  before = ([xml_header] + doctypes).compact.join("\n")
  before += "\n" unless before.empty?

  parse_options = Nokogiri::XML::ParseOptions::DEFAULT_XML |
                  Nokogiri::XML::ParseOptions::NOERROR | # to silence any errors or warnings printing into console
                  Nokogiri::XML::ParseOptions::NOWARNING |
                  Nokogiri::XML::ParseOptions::NOENT

  # The content is wrapped into a synthetic <root> element so a fragment with
  # several top-level elements (or with none at all) is still parseable.
  doc = Nokogiri::XML("#{before}<root>#{text}</root>", file_path, nil, parse_options)
  text_for_errors = before + text
  doc.encoding = 'UTF-8'
  doc.file_path = file_path

  errors = doc.errors.map do |e|
    Problem.new(:error, e.message, text_for_errors, line: e.line, column: e.column, file_path: file_path)
  end

  # Replace the synthetic wrapper with the real root when there is exactly one
  # candidate, otherwise reuse the wrapper itself under a meaningful name.
  root_node = doc.root
  root_elements = root_node.children.select { |a| a.element? || a.comment? }
  if root_elements.count == 1
    doc.root = root_elements.first
  elsif (html = root_node.at_css('html'))
    doc.root = html
  elsif root_node.at_css('body').nil?
    root_node.node_name = 'body'
  else
    root_node.node_name = 'html'
  end

  [doc, errors]
end
# Same as `xml_doc_from_str_with_errors`, but hands back only the parsed
# document and drops the list of errors.
#
# @param [String] text input XHTML text
# @param [String, nil] file_path path of the parsed file
#
def self.xml_document_from_string(text, file_path = nil)
  xml_doc_from_str_with_errors(text, file_path).first
end
# Method to add all missing items in XML root
#
# Required items:
# - html (with all namespaces and other attributes)
# - body
# - head (with title)
#
# @param [Nokogiri::XML::Document] xhtml_doc input XML document to work with
# @param [String] title title of this document, since this is required by EPUB specification
# @param [Epuber::Version] epub_version version of result EPUB
#
# @return nil
#
def self.add_missing_root_elements(xhtml_doc, title, epub_version)
  xhtml_ns = 'http://www.w3.org/1999/xhtml'
  ops_ns = 'http://www.idpf.org/2007/ops'

  # 1) body: either append an empty one to <html> or wrap the current root with it
  if xhtml_doc.at_css('body').nil?
    current_root = xhtml_doc.root
    if current_root.node_name == 'html'
      current_root << xhtml_doc.create_element('body')
    else
      current_root.surround_with_element('body')
    end
  end

  # 2) html: wrap the root when it is missing, otherwise only fill in missing namespaces
  html = xhtml_doc.at_css('html')
  if html.nil?
    attrs = { 'xmlns' => xhtml_ns }
    attrs['xmlns:epub'] = ops_ns if epub_version >= 3
    html = xhtml_doc.root.surround_with_element('html', attrs)
  elsif html.namespaces.empty?
    html['xmlns'] = xhtml_ns
    html['xmlns:epub'] = ops_ns if epub_version >= 3
  end

  # 3) head: built with title (and charset meta for EPUB 3) and placed as first child of <html>
  if xhtml_doc.at_css('html > head').nil?
    head = xhtml_doc.create_element('head')
    head << xhtml_doc.create_element('title', title)
    head << xhtml_doc.create_element('meta', charset: 'utf-8') if epub_version >= 3.0

    first_child = html.children.first
    if first_child
      first_child.before(head)
    else
      html << head
    end
  end

  # 4) EPUB 2 only: replace the internal subset with the XHTML 1.1 doctype
  # https://github.com/IDPF/epubcheck/issues/631
  if epub_version < 3.0
    xhtml_doc.internal_subset&.remove
    xhtml_doc.create_internal_subset('html', '-//W3C//DTD XHTML 1.1//EN', 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd')
  end
end
# Method for adding style sheets with links, method will not add duplicate items
#
# @param [Nokogiri::XML::Document] xhtml_doc input XML document to work with
# @param [Array] styles links to files
#
# @return nil
#
def self.add_styles(xhtml_doc, styles)
  head = xhtml_doc.at_css('html > head')
  # hrefs of the style sheets which are already linked from the head
  already_linked = head.css('link[rel="stylesheet"]').map { |link| link['href'] }

  (styles - already_linked).each do |href|
    head << xhtml_doc.create_element('link', href: href, rel: 'stylesheet', type: 'text/css')
  end
end
# Method for adding scripts with links, method will not add duplicate items
#
# @param [Nokogiri::XML::Document] xhtml_doc input XML document to work with
# @param [Array] scripts links to files
#
# @return nil
#
def self.add_scripts(xhtml_doc, scripts)
  head = xhtml_doc.at_css('html > head')
  # sources of the script elements which are already present in the head
  already_linked = head.css('script').map { |script| script['src'] }

  (scripts - already_linked).each do |src|
    head << xhtml_doc.create_element('script', src: src, type: 'text/javascript')
  end
end
# Adds viewport meta tag to head of some document, but only if there is not some existing tag
#
# @param [Nokogiri::XML::Document] xhtml_doc
# @param [Epuber::Size] viewport_size
#
def self.add_viewport(xhtml_doc, viewport_size)
  head = xhtml_doc.at_css('html > head')

  # an existing viewport meta tag always wins, nothing is added then
  if head.at_css("meta[name='viewport']").nil?
    content = "width=#{viewport_size.width},height=#{viewport_size.height}"
    head << xhtml_doc.create_element('meta', name: 'viewport', content: content)
  end
end
# Method which will resolve path to file from pattern
#
# @param [String] path pattern or path of the file
# @param [Symbol | Array] groups groups of the searched file, for example :image when searching
# for a file referenced from an <img> tag
# @param [String] file_path path to file from which is searching for other file
# @param [Epuber::Compiler::FileFinders::Abstract] file_finder finder for searching for files
#
# @raise UnparseableLinkError, FileFinders::FileNotFoundError, FileFinders::MultipleFilesFoundError
#
# @return [URI] resolved path to file or remote web page
#
def self.resolved_link_to_file(path, groups, file_path, file_finder)
  raise FileFinders::FileNotFoundError.new(path, file_path) if path.empty?

  begin
    uri = URI(path)
  rescue URI::InvalidURIError
    begin
      # second attempt: percent-encode the link and parse it again
      uri = URI(Addressable::URI.encode(path))
    rescue URI::InvalidURIError, Addressable::URI::InvalidURIError
      # Addressable reports unparseable input with its own error class, so both
      # have to be translated, otherwise callers rescuing UnparseableLinkError
      # would never see the failure
      raise UnparseableLinkError, "Unparseable link `#{path}`"
    end
  end

  # link to the current document
  return uri if path == '#'

  # skip uri with scheme (links to web pages)
  return uri unless uri.scheme.nil?

  # skip empty path (link to a fragment within the current document)
  return uri if uri.path.empty? && !uri.fragment.nil? && !uri.fragment.empty?

  # everything else points to a file, let the finder replace the pattern with the real path
  uri.path = file_finder.find_file(uri.path, groups: groups, context_path: file_path)
  uri
end
# Resolves all links to files in XHTML document and returns the valid and resolved versions
#
# @param [Nokogiri::XML::Document] xhtml_doc input XML document to work with
# @param [String] tag_name CSS selector for tag
# @param [String] attribute_name name of attribute
# @param [Symbol | Array] groups groups of the searched file, for example :image when searching
# for a file referenced from an <img> tag
# @param [String] file_path path to file from which is searching for other file
# @param [Epuber::Compiler::FileFinder] file_finder finder for searching for files
#
# @return [Array] resolved links
#
def self.resolve_links_for(xhtml_doc, tag_name, attribute_name, groups, file_path, file_finder)
  selector = "#{tag_name}[#{attribute_name}]"

  xhtml_doc.css(selector).each_with_object([]) do |node, resolved|
    value = node[attribute_name]
    # values starting with dollar sign are left untouched here
    next if value.nil? || value.start_with?('$')

    begin
      link = resolved_link_to_file(value, groups, file_path, file_finder)
      resolved << link
      node[attribute_name] = link.to_s
    rescue UnparseableLinkError, FileFinders::FileNotFoundError, FileFinders::MultipleFilesFoundError => e
      # only warn, the node keeps its original value and is left out of the result
      UI.warning(e.to_s, location: node)
    end
  end
end
# Resolves all links to files in XHTML document and returns the valid and resolved versions
#
# @param [Nokogiri::XML::Document] xhtml_doc input XML document to work with
# @param [String] file_path path to file from which is searching for other file
# @param [Epuber::Compiler::FileFinder] file_finder finder for searching for files
#
# @return [Array] resolved links
#
def self.resolve_links(xhtml_doc, file_path, file_finder)
  # anchors first, image map areas second, both use the href attribute and :text group
  ['a', 'map > area'].flat_map do |selector|
    resolve_links_for(xhtml_doc, selector, 'href', :text, file_path, file_finder)
  end
end
# @param [Nokogiri::XML::Document] xhtml_doc input XML document to work with
#
# @return [Bool]
#
def self.using_javascript?(xhtml_doc)
  # one script element anywhere in the document is enough
  first_script = xhtml_doc.at_css('script')
  !first_script.nil?
end
# Tells whether any `src` attribute or any `href` of a `link` element contains
# a value in the form `scheme://...`.
#
# @param [Nokogiri::XML::Document] xhtml_doc input XML document to work with
#
# @return [Bool]
#
def self.using_remote_resources?(xhtml_doc)
  remote = %r{^[^:/?#]+://.*}

  xhtml_doc.css('[src]').any? { |node| remote.match?(node['src']) } ||
    xhtml_doc.css('link[href]').any? { |node| remote.match?(node['href']) }
end
# @param [Nokogiri::XML::Document] xhtml_doc input XML document to work with
#
# @return [Bool]
#
def self.using_mathml?(xhtml_doc)
  # the `math` prefix is bound to the MathML namespace only for this query
  namespaces = { 'math' => 'http://www.w3.org/1998/Math/MathML' }
  math_node = xhtml_doc.at_css('math|math', namespaces)
  !math_node.nil?
end
# Calls `add_namespace` with the MathML namespace URI on every node matched by
# the `math` CSS selector.
#
# NOTE(review): the prefix passed to `add_namespace` is the string 'xmlns'; Nokogiri
# documents `nil` as the prefix of a default namespace, so confirm that this call
# produces the intended declaration.
#
# @param [Nokogiri::XML::Document] xhtml_doc input XML document to work with
#
def self.resolve_mathml_namespace(xhtml_doc)
  mathml_ns = 'http://www.w3.org/1998/Math/MathML'
  xhtml_doc.css('math').each { |node| node.add_namespace('xmlns', mathml_ns) }
end
# @param [Nokogiri::XML::Document] xhtml_doc
# @param [String] file_path path of referring file
# @param [FileResolver] file_resolver
#
# @return nil
#
def self.resolve_images(xhtml_doc, file_path, file_resolver)
  # `src` of every `img` element is resolved within the :image group
  selector = 'img'
  attribute = 'src'
  resolve_resources_in(selector, attribute, :image, xhtml_doc, file_path, file_resolver)
end
# @param [Nokogiri::XML::Document] xhtml_doc
# @param [String] file_path path of referring file
# @param [FileResolver] file_resolver
#
# @return nil
#
def self.resolve_scripts(xhtml_doc, file_path, file_resolver)
  # `src` of every `script` element is resolved within the :script group
  selector = 'script'
  attribute = 'src'
  resolve_resources_in(selector, attribute, :script, xhtml_doc, file_path, file_resolver)
end
# @param [Nokogiri::XML::Document] xhtml_doc
# @param [String] file_path path of referring file
# @param [FileResolver] file_resolver
#
# @return nil
#
def self.resolve_stylesheets(xhtml_doc, file_path, file_resolver)
  # `href` of every stylesheet `link` element is resolved within the :style group
  selector = 'link[rel="stylesheet"]'
  attribute = 'href'
  resolve_resources_in(selector, attribute, :style, xhtml_doc, file_path, file_resolver)
end
# Shared implementation of `resolve_images`, `resolve_scripts` and `resolve_stylesheets`.
#
# For every node matching the CSS query the value of the attribute is passed to
# `SourceFile.resolve_relative_file` and replaced with its result; nodes without
# the attribute and values for which nothing is returned are left as they are.
#
# @param [String] node_css_query CSS selector of nodes to process
# @param [String] attribute_name name of the attribute holding the path
# @param [Symbol] resource_group group passed to the resolving as `group`
# @param [Nokogiri::XML::Document] xhtml_doc
# @param [String] file_path path of referring file
# @param [FileResolver] file_resolver
#
def self.resolve_resources_in(node_css_query, attribute_name, resource_group, xhtml_doc, file_path, file_resolver)
  source_file = Compiler::FileTypes::SourceFile

  xhtml_doc.css(node_css_query).each do |node|
    original = node[attribute_name]
    next if original.nil?

    resolved = source_file.resolve_relative_file(file_path, original, file_resolver,
                                                 group: resource_group, location: node)
    node[attribute_name] = resolved if resolved
  end
end
# @param [Nokogiri::XML::Document] xhtml_doc input XML document to work with
# @return [Array] list of nodes with global ids
#
def self.find_global_ids_nodes(xhtml_doc)
  # global ids are marked with leading dollar sign
  selector = '[id^="$"]'
  xhtml_doc.css(selector)
end
# @param [Nokogiri::XML::Document] xhtml_doc input XML document to work with
# @return [Array] list of global ids (without dollar signs)
#
def self.find_global_ids(xhtml_doc)
  find_global_ids_nodes(xhtml_doc).map do |node|
    identifier = node['id']
    # drop the first character (the dollar sign marker)
    identifier.slice(1..-1)
  end
end
# @param [Nokogiri::XML::Document] xhtml_doc input XML document to work with
# @return [Array] list of nodes with global links
#
def self.find_global_links_nodes(xhtml_doc)
  # global links are marked with leading dollar sign
  selector = '[href^="$"]'
  xhtml_doc.css(selector)
end
# @param [Nokogiri::XML::Document] xhtml_doc input XML document to work with
# @return [Array] list of global links (without dollar signs)
#
def self.find_global_links(xhtml_doc)
  find_global_links_nodes(xhtml_doc).map do |node|
    target = node['href']
    # drop the first character (the dollar sign marker)
    target.slice(1..-1)
  end
end
end
end
end