# encoding=utf-8 module Polytexnic module Postprocessor module Html # Converts Tralics XML output to HTML. def xml_to_html(xml) doc = Nokogiri::XML(xml) emphasis(doc) boldface(doc) small_caps(doc) typewriter(doc) skips(doc) verbatim(doc) code(doc) metacode(doc) quote(doc) verse(doc) itemize(doc) enumerate(doc) item(doc) remove_errors(doc) set_ids(doc) chapters_and_sections(doc) subsection(doc) subsubsection(doc) headings(doc) sout(doc) kode(doc) filepath(doc) codelistings(doc) backslash_break(doc) spaces(doc) asides(doc) center(doc) title(doc) doc = smart_single_quotes(doc) tex_logos(doc) restore_literal(doc) restore_inline_verbatim(doc) make_cross_references(doc) hrefs(doc) graphics_and_figures(doc) images_and_imageboxes(doc) tables(doc) math(doc) frontmatter(doc) mainmatter(doc) footnotes(doc) table_of_contents(doc) convert_to_html(doc) end private # Handles output of \emph{} and \textit{}. def emphasis(doc) doc.xpath('//hi[@rend="it"]').each do |node| node.name = 'em' node.remove_attribute('rend') end end # Handles output of \textbf{}. def boldface(doc) doc.xpath('//hi[@rend="bold"]').each do |node| node.name = 'strong' node.remove_attribute('rend') end end # Handles output of \textsc{}. def small_caps(doc) doc.xpath('//hi[@rend="sc"]').each do |node| node.name = 'span' node['class'] = 'sc' node.remove_attribute('rend') end end # Handles \bigskip, etc. def skips(doc) doc.xpath('//p[@spacebefore]').each do |node| node['style'] = "margin-top: #{node['spacebefore']}" node.remove_attribute('spacebefore') end end # Handles output of \texttt{}. def typewriter(doc) doc.xpath('//hi[@rend="tt"]').each do |node| node.name = 'span' node['class'] = 'tt' node.remove_attribute('rend') end end # Handles verbatim and Verbatim environments. # \begin{verbatim} # # \end{verbatim} # and # \begin{Verbatim} # # \end{Verbatim} # Note that verbatim is a built-in LaTeX environment, whereas # Verbatim is loaded by the Verbatim package (and used by the # code environment). 
def verbatim(doc)
  # Both spellings get identical treatment; lowercase is processed first.
  %w[verbatim Verbatim].each do |tag|
    doc.xpath("//#{tag}").each do |node|
      node.name = 'pre'
      node['class'] = 'verbatim'
    end
  end
end

# Handles code environments.
#   \begin{code}
#   ...
#   \end{code}
def code(doc)
  doc.xpath('//code').each do |node|
    node.name = 'div'
    node['class'] = 'code'
  end
end

# Handles metacode environments.
#   \begin{metacode}
#   ...
#   \end{metacode}
def metacode(doc)
  doc.xpath('//metacode').each do |node|
    node.name = 'div'
    node['class'] = 'code'
  end
end

# Handles math environments.
# Included are
#   \begin{equation}
#   ...
#   \end{equation}
# and all the AMS-LaTeX variants defined in
# Preprocessor#math_environments.
# We also handle inline/display math of the form \(x\) and \[y\].
def math(doc)
  # Display math: numbered environments first, then starred ones.
  ['equation', 'equation*'].each do |textype|
    equation_divs(doc, textype)
  end

  # Paragraphs with noindent
  # (flagged by equation_divs; see the comment there).
  doc.xpath('//p[@noindent="true"]').each do |node|
    node['class'] = 'noindent'
    node.remove_attribute('noindent')
  end

  # inline math
  doc.xpath('//inline').each do |node|
    node.name = 'span'
    node.content = literal_cache[node.content.strip]
    node['class'] = 'inline_math'
    clean_node node, ['textype', 'type']
  end
end

# Converts every texmath node of the given textype found inside an
# equation element into a div with class "equation" holding the cached
# literal math, and lets that div replace its parent element.
def equation_divs(doc, textype)
  doc.xpath(%(//equation//texmath[@textype="#{textype}"])).each do |node|
    node.name = 'div'
    node['class'] = 'equation'
    node.content = literal_cache[node.content.strip] + "\n"
    clean_node node, ['textype', 'type']
    node.parent.replace(node)
    begin
      # Mimic default Tralics behavior of giving paragraph tags after
      # math a 'noindent' class. This allows the HTML to be styled with
      # CSS in a way that replicates the default behavior of LaTeX, where
      # math can be included in a paragraph. In such a case, paragraphs
      # are indented by default, but text after math environments isn't
      # indented. In HTML, including a math div inside a p tag is illegal,
      # so the next best thing is to add a 'noindent' class to the p tag
      # following the math. Most documents won't use this, as the HTML
      # convention is not to indent paragraphs anyway, but we want to
      # support that case for completeness (mainly because Tralics does).
      next_paragraph = node.next_sibling
      next_paragraph['noindent'] = 'true'
    rescue
      # Deliberately best-effort: the math may not be followed by anything.
      nil
    end
  end
end

# Handles frontmatter (if any).
# The frontmatter is numbered 0 so that chapter_number can resolve nodes
# inside it.
def frontmatter(doc)
  doc.xpath('//frontmatter').each do |node|
    node.name = 'div'
    node['id'] = 'frontmatter'
    node['data-number'] = 0
  end
end

# Handles mainmatter.
# The mainmatter wrapper is dissolved: its children are appended to its
# parent and the wrapper itself is removed.
def mainmatter(doc)
  doc.xpath('//mainmatter').each do |node|
    node.parent << node.children
    node.remove
  end
end

# Processes and places footnotes.
# Footnotes are grouped by chapter number, each group is rendered as a
# list, and finally every in-text footnote is rewritten as a linked
# reference.
def footnotes(doc)
  footnotes = Hash.new { |h, k| h[k] = [] }
  doc.xpath('//note[@place="foot"]').each do |footnote|
    footnotes[chapter_number(footnote)] << footnote
  end
  # Handle chapters 1 through n-1.
  doc.xpath('//div[@class="chapter"]').each_with_index do |chapter, i|
    make_footnotes(footnotes, i, chapter)
  end
  # Place the footnotes for Chapter n (if any).
  final_chapter_number = doc.xpath('//div[@class="chapter"]').length
  make_footnotes(footnotes, final_chapter_number)
  rewrite_contents(footnotes)
end

# Returns a unique CSS id for the footnotes of a given chapter.
def footnotes_id(chapter_number)
  "cha-#{chapter_number}_footnotes"
end

# Returns a unique CSS id for footnote n in given chapter.
def footnote_id(chapter_number, n)
  "cha-#{chapter_number}_footnote-#{n}"
end

# Returns the href needed to link to footnote n.
def footnote_href(chapter_number, n)
  "##{footnote_id(chapter_number, n)}"
end

# Returns a unique CSS id for the footnote reference.
def footnote_ref_id(chapter_number, n)
  "cha-#{chapter_number}_footnote-ref-#{n}"
end

# Returns the href needed to link to reference for footnote n.
def footnote_ref_href(chapter_number, n)
  "##{footnote_ref_id(chapter_number, n)}"
end

# Builds and places the footnote list for the given chapter number.
# Does nothing (and returns nil) when that chapter has no footnotes.
# When chapter is nil the list goes at the end of the document.
def make_footnotes(footnotes, previous_chapter_number, chapter = nil)
  chapter_footnotes = footnotes[previous_chapter_number]
  return if chapter_footnotes.nil? || chapter_footnotes.empty?
  footnotes_node = footnotes_list(footnotes, previous_chapter_number)
  place_footnotes(footnotes_node, previous_chapter_number, chapter)
end

# Returns a list of footnotes ready for placement.
def footnotes_list(footnotes, chapter_number)
  # Any footnote gives access to the owning document. Prefer one from the
  # requested chapter and fall back to the first footnote of any chapter,
  # so an empty leading bucket in the hash cannot cause a nil dereference.
  sample = footnotes[chapter_number].first ||
           footnotes.values.map(&:first).compact.first
  doc = sample.document
  # For symbolic footnotes, we want to suppress numbers, which can be
  # done in CSS, but it doesn't work in many EPUB & MOBI readers.
  # As a kludge, we switch to ul in this case, which looks nicer.
  list_type = footnote_symbols? ? 'ul' : 'ol'
  footnotes_node = Nokogiri::XML::Node.new(list_type, doc)
  footnotes_node['class'] = 'footnotes'
  footnotes_node['class'] += ' nonumbers' if footnote_symbols?
  footnotes[chapter_number].each_with_index do |footnote, i|
    n = i + 1
    note = Nokogiri::XML::Node.new('li', doc)
    note['id'] = footnote_id(chapter_number, n)
    reflink = Nokogiri::XML::Node.new('a', doc)
    reflink['class'] = 'arrow'
    reflink.content = "↑"
    reflink['href'] = footnote_ref_href(chapter_number, n)
    html = "#{footnote.inner_html} #{reflink.to_xhtml}"
    html = "#{fnsymbol(i)} #{html}" if footnote_symbols?
    note.inner_html = html
    footnotes_node.add_child note
  end
  footnotes_node
end

# Places footnotes for Chapter n-1 just before Chapter n.
# With no chapter given, the wrapper is appended to the last top-level
# child of the document instead.
def place_footnotes(footnotes_node, chapter_number, chapter = nil)
  doc = footnotes_node.document
  footnotes_wrapper_node = Nokogiri::XML::Node.new('div', doc)
  footnotes_wrapper_node['id'] = footnotes_id(chapter_number)
  footnotes_wrapper_node.add_child footnotes_node
  if chapter.nil?
    doc.children.last.add_child(footnotes_wrapper_node)
  else
    chapter.add_previous_sibling(footnotes_wrapper_node)
  end
end

# Rewrites contents of each footnote with its corresponding number.
def rewrite_contents(footnotes)
  footnotes.each do |chapter_number, chapter_footnotes|
    chapter_footnotes.each_with_index do |node, i|
      n = i + 1
      node.name = 'sup'
      clean_node node, %w{place id id-text data-tralics-id data-number}
      node['id'] = footnote_ref_id(chapter_number, n)
      node['class'] = 'footnote'
      link = Nokogiri::XML::Node.new('a', node.document)
      link['href'] = footnote_href(chapter_number, n)
      content = footnote_symbols? ? fnsymbol(i) : n.to_s
      link.content = content
      node.inner_html = link
      # Add an inter-sentence space if appropriate.
      # A footnote that opens its parent has no previous sibling; treat
      # that as "not at the end of a sentence" instead of raising.
      before = node.previous_sibling
      previous_character = before.nil? ? nil : before.content[-1]
      end_of_sentence = %w[. ! ?].include?(previous_character)
      after = node.next_sibling
      end_of_paragraph = after.nil? || after.content.strip.empty?
      if end_of_sentence && !end_of_paragraph
        space = Nokogiri::XML::Node.new('span', node.document)
        space['class'] = 'intersentencespace'
        node['class'] += ' intersentence'
        node.add_next_sibling(space)
      end
    end
  end
end

# Returns the nth footnote symbol for use in non-numerical footnotes.
# By using the modulus operator %, we arrange to loop around to the
# front if the number of footnotes exceeds the number of symbols.
def fnsymbol(n)
  symbols = %w[* † ‡ § ¶ ‖ ** †† ‡‡]
  symbols[n % symbols.size]
end

# Returns the chapter number for a given node.
# Every node is inside some div that has a 'data-number' attribute,
# so recursively search the parents to find it.
# Then return the first number in the value, e.g., "1" in "1.2".
def chapter_number(node)
  number = node['data-number']
  if number && !number.empty?
    number.split('.').first.to_i
  else
    chapter_number(node.parent)
  end
end

# Handles logos for TeX and LaTeX.
def tex_logos(doc)
  doc.xpath('//TeX').each do |node|
    node.replace(Nokogiri::XML::fragment(tex))
  end
  doc.xpath('//LaTeX').each do |node|
    node.replace(Nokogiri::XML::fragment(latex))
  end
end

# Returns HTML for a nicely styled TeX logo.
# NOTE(review): the literal below contains no markup, although it is
# parsed as an XML fragment and hrefs looks for a 'texhtml' class; the
# styling tags may have been lost from this copy. Confirm upstream.
def tex
  %(TEX)
end

# Returns HTML for a nicely styled LaTeX logo.
# NOTE(review): same caveat as for tex; the markup may be missing.
def latex
  %(LATEX)
end

# Handles \begin{quote} ... \end{quote}.
def quote(doc)
  doc.xpath('//p[@rend="quoted"]').each do |node|
    clean_node node, 'rend'
    node.name = 'blockquote'
    node['class'] = 'quote'
  end
end

# Handles \begin{verse} ... \end{verse}.
def verse(doc)
  doc.xpath('//p[@rend="verse"]').each do |node|
    clean_node node, %w{rend noindent}
    node.name = 'blockquote'
    node['class'] = 'verse'
  end
end

# Converts itemized lists to uls.
def itemize(doc)
  doc.xpath('//list[@type="simple"]').each do |node|
    clean_node node, 'type'
    node.name = 'ul'
  end
end

# Converts enumerated lists to ols.
def enumerate(doc)
  doc.xpath('//list[@type="ordered"]').each do |node|
    clean_node node, 'type'
    node.name = 'ol'
  end
end

# Returns the node for a list item (li).
def item(doc)
  # Unwrap noindent paragraphs that sit directly inside an item.
  doc.xpath('//item/p[@noindent="true"]').each do |node|
    node.replace(node.inner_html)
  end
  doc.xpath('//item').each do |node|
    clean_node node, %w{id-text id label}
    node.name = 'li'
  end
end

# Removes remaining errors.
def remove_errors(doc)
  doc.xpath('//error').map(&:remove)
end

# Set the Tralics ids.
# The original id of every element is preserved in 'data-tralics-id',
# and ids are then replaced by the author's labels where a data-label
# is present.
def set_ids(doc)
  doc.xpath('//*[@id]').each do |node|
    # TODO: make whitelist of non-tralics id's
    next if node['id'] =~ /footnote/
    node['data-tralics-id'] = node['id']
    convert_labels(node)
    clean_node node, %w{data-label}
  end
  # Replace 'unexpected' tags with their children.
  doc.xpath('//unexpected').each do |node|
    node.parent.children = node.children
    node.remove
  end
  # Figures and tables are labelled in exactly the same way.
  label_floats(doc, 'figure')
  label_floats(doc, 'table')
  doc.xpath('//equation').each do |node|
    if label = node.at_css('data-label')
      node.at_css('texmath')['id'] = pipeline_label(label)
      label.remove
    end
  end
end

# Sets the id of every element with the given tag name from the
# data-label element it contains (if any).
def label_floats(doc, tag)
  doc.xpath("//#{tag}").each do |node|
    label = node.at_css('data-label')
    if unexpected = node.at_css('unexpected')
      # Tralics puts in an 'unexpected' tag sometimes.
      # The label may be absent here, so only set the id when it exists.
      node['id'] = pipeline_label(label) if label
      unexpected.remove
      clean_node node, %w{data-label}
    elsif label
      node['id'] = pipeline_label(label)
      label.remove
      clean_node node, %w{data-label}
    end
  end
end

# Convert data-labels to valid CSS ids.
# Only the first data-label child is used; it is removed afterwards.
def convert_labels(node)
  node.children.each do |child|
    if child.name == 'data-label'
      node['id'] = pipeline_label(child)
      child.remove
      break
    end
  end
end

# Restores the label.
# Tralics does weird stuff with underscores, so they are subbed out
# so that they can be passed through the pipeline intact. This is where
# we restore them.
def pipeline_label(node)
  node.inner_html.gsub(underscore_digest, '_')
end

# Turns the first child of a section node into a heading element.
# The first child is renamed to the given tag name (h1, h2, ...), and its
# contents are wrapped in an anchor with class "heading" that links to
# the section's own id (when it has one).
def make_headings(doc, node, name)
  heading = node.children.first
  heading.name = name
  anchor = doc.create_element 'a'
  anchor['href'] = "##{node['id']}" unless node['id'].nil?
  anchor['class'] = 'heading'
  anchor << heading.children
  heading << anchor
end

# Converts div0 to chapters and sections depending on node type.
# Elements with rend="nonumber" get a "-star" suffix on the class.
def chapters_and_sections(doc)
  doc.xpath('//div0').each do |div|
    div.name = 'div'
    is_chapter = div['type'] == 'chapter'
    div['class'] = is_chapter ? 'chapter' : 'section'
    div['class'] += '-star' if div['rend'] == 'nonumber'
    clean_node div, %w{type rend}
    make_headings(doc, div, is_chapter ? 'h1' : 'h2')
  end
end

# Converts div1 to subsections.
# Elements with rend="nonumber" get a "-star" suffix on the class.
def subsection(doc)
  doc.xpath('//div1').each do |div|
    div.name = 'div'
    div['class'] = 'subsection'
    div['class'] += '-star' if div['rend'] == 'nonumber'
    clean_node div, %w{rend}
    make_headings(doc, div, 'h3')
  end
end

# Converts div2 to subsubsections.
# NOTE(review): unlike subsection, no "-star" suffix is added for
# rend="nonumber" here; confirm whether that is intentional.
def subsubsection(doc)
  doc.xpath('//div2').each do |div|
    div.name = 'div'
    div['class'] = 'subsubsection'
    clean_node div, %w{rend}
    make_headings(doc, div, 'h4')
  end
end

# Converts heading elements to the proper spans.
# Headings are used in theorem-like environments like asides.
def headings(doc)
  doc.xpath('//heading').each do |heading|
    heading.name = 'span'
    heading['class'] = 'description'
  end
end

# Converts strikeout text (\sout) to the proper tag.
def sout(doc)
  doc.xpath('//sout').each { |struck| struck.name = 'del' }
end

# Converts inline code (\kode) to the proper tag.
def kode(doc)
  doc.xpath('//kode').each { |inline| inline.name = 'code' }
end

# Converts filesystem path (\filepath) to the proper tag.
def filepath(doc)
  doc.xpath('//filepath').each do |path|
    path.name = 'span'
    path['class'] = 'filepath'
  end
end

# Builds the full heading for codelisting-like environments.
# The full heading, such as "Listing 1.1: Foo bars." needs to be
# extracted and manipulated to produce the right tags and classes.
# Returns the heading node (the former first paragraph of the node).
def build_heading(node, css_class)
  node.name = 'div'
  node['class'] = css_class

  # The first paragraph is the heading; its attributes move to the node.
  heading = node.at_css('p')
  heading.attributes.each do |key, value|
    node.set_attribute(key, value)
    heading.remove_attribute(key)
  end
  heading.name = 'div'
  heading['class'] = 'heading'

  number = heading.at_css('strong')
  number.name = 'span'
  number['class'] = 'number'
  if css_class == 'codelisting'
    # Listings only get a colon when a description follows the number.
    description = node.at_css('.description').content
    number.content += ':' unless description.empty?
  else
    number.content += '.'
  end
  heading
end

# Processes codelisting environments.
# The code div found inside the heading is moved to the end of the
# listing node.
def codelistings(doc)
  doc.xpath('//codelisting').each do |node|
    heading = build_heading(node, 'codelisting')
    code = heading.at_css('div.code')
    node.add_child(code)
  end
end

# Add in breaks from '\\'.
# We use a span instead of a 'br' tag because breaks can't be styled
# easily, and are also invalid in some contexts where we want a
# break (e.g., inside h1 tags).
def backslash_break(doc)
  doc.xpath('//backslashbreak').each do |node|
    node.name = 'span'
    node['class'] = 'break'
  end
end

# Handles normal, thin, and intersentence spaces.
def spaces(doc)
  doc.xpath('//thinspace').each do |node|
    node.name = 'span'
    node['class'] = 'thinspace'
    # U+2009 THIN SPACE, spelled as an escape so that the character stays
    # visible and cannot be silently normalized to a regular space.
    node.inner_html = "\u2009"
  end
  doc.xpath('//normalspace').each do |node|
    node.replace(' ')
  end
  doc.xpath('//intersentencespace').each do |node|
    node.name = 'span'
    node['class'] = 'intersentencespace'
  end
end

# Processes boxes/asides.
def asides(doc)
  doc.xpath('//aside').each do |node|
    build_heading(node, 'aside')
  end
end

# Processes centered elements.
def center(doc)
  doc.xpath('//center').each do |node|
    node.name = 'div'
    node['class'] = 'center'
  end
end

# Handles the title, author, date, etc., produced by \maketitle.
# Title and subtitle become h1 elements; author and date become h2.
# A field whose processed content has no paragraph is skipped, and a
# missing date is replaced by today's date.
def title(doc)
  doc.xpath('//maketitle').each do |node|
    node.name = 'div'
    node['id'] = 'title_page'
    %w{title subtitle author date}.each do |field|
      title_element = maketitle_elements[field]
      if title_element
        type = %w{title subtitle}.include?(field) ? 'h1' : 'h2'
        el = Nokogiri::XML::Node.new(type, doc)
        pipe = Polytexnic::Pipeline.new(title_element,
                                        literal_cache: literal_cache)
        raw_html = pipe.to_html
        content = Nokogiri::HTML.fragment(raw_html).at_css('p')
        # Nothing to show for this field (e.g., an empty \date{}); this
        # used to raise for every field except the date.
        next if content.nil?
        el.inner_html = content.inner_html.strip
        el['class'] = field
        node.add_child el
      elsif field == 'date'
        # Date is missing, so insert today's date.
        el = Nokogiri::XML::Node.new('h2', doc)
        el['class'] = field
        el.inner_html = Date.today.strftime("%A, %b %e")
        node.add_child el
      end
    end
  end
end

# Converts text to smart single quotes and apostrophes.
# This means `foo bar' and "don't" are converted to use nice curly
# "smart" quotes and apostrophes.
# We don't bother with double quotes because Tralics already handles
# those.
def smart_single_quotes(doc)
  # Works on the serialized XML and returns a freshly parsed document.
  curly = doc.to_xml.gsub('`', '‘').gsub("'", '’')
  Nokogiri::XML(curly)
end

# Restores literal environments (verbatim, code, math, etc.).
# These environments are hashed and passed through the pipeline
# so that Tralics doesn't process them.
def restore_literal(doc)
  doc.xpath('//literal').each do |literal|
    raw_content = literal_cache[literal.content]
    literal.parent.content = escape_backslashes(raw_content)
    literal.remove
  end

  # Restore equation references, then non-ASCII unicode. Both become
  # spans whose class is the original tag name.
  %w[eqref unicode].each do |tag|
    doc.xpath("//#{tag}").each do |node|
      node.content = literal_cache[node.content]
      node.name = 'span'
      node['class'] = tag
    end
  end
end

# Restores things inside \verb+...+
def restore_inline_verbatim(doc)
  doc.xpath('//inlineverbatim').each do |verb|
    verb.content = literal_cache[verb.content]
    verb.name = 'span'
    verb['class'] = 'inline_verbatim'
  end
end

# Creates linked cross-references.
# Every element carrying a Tralics id is numbered, headings receive a
# number span, and ref elements are replaced by the number of their
# target (or marked as undefined).
def make_cross_references(doc)
  # build numbering tree
  doc.xpath('//*[@data-tralics-id]').each do |node|
    node['data-number'] = formatted_number(node)
    clean_node node, 'id-text'

    # Add number span
    heading_link = node.css('h1 a, h2 a, h3 a').first
    next unless heading_link

    number = node['data-number']
    is_section = number.match(/\./)
    # Only chapter headings (no dot in the number) get the chapter name.
    prefix = (@cha.nil? || is_section) ? '' : "#{chaptername} "
    number_span = doc.create_element 'span'
    number_span.content = "#{prefix}#{number} "
    number_span['class'] = 'number'

    first_child = heading_link.children.first
    if first_child.nil?
      heading_link.add_child(number_span)
    else
      first_child.add_previous_sibling(number_span)
    end
  end

  # Index the targets by their Tralics id for fast lookup.
  target_cache = {}
  doc.xpath("//*[@data-tralics-id]").each do |target|
    target_cache[target['data-tralics-id']] = target
  end

  doc.xpath('//ref').each do |ref|
    ref.name = 'span'
    target = target_cache[ref['target']]
    if target.nil?
      ref['class'] = 'undefined_ref'
      ref.content = ref['target']
    else
      ref['class'] = 'ref'
      ref.content = target['data-number']
    end
    clean_node ref, 'target'
  end

  doc.xpath('//*[@target]').each do |link|
    link['href'] = "##{link['target'].gsub(':', '-')}"
    link['class'] = 'hyperref'
    clean_node link, 'target'
  end
end

# Returns the name to use for chapters.
# The default is 'Chapter', of course, but this can be overridden
# using '\renewcommand', especially in books other than English.
# When several overrides are present, the last one wins.
def chaptername
  name_regex = /\\renewcommand\{\\chaptername\}\{(.*?)\}/
  custom_commands.scan(name_regex).flatten.last || 'Chapter'
end

# Returns the formatted number appropriate for the node.
# E.g., "2.1" for a section.
# Note: sets @cha as a side-effect. Yes, this is gross.
def formatted_number(node)
  id_text = node['id-text']

  case node['class']
  when 'chapter'
    # Tralics numbers figures & equations
    # overall, not per chapter, so we need
    # counters.
    @equation = 0
    @figure = 0
    return @cha = id_text
  when 'section'
    @sec = id_text
    return label_number(@cha, @sec)
  when 'subsection'
    @subsec = id_text
    return label_number(@cha, @sec, @subsec)
  when 'subsubsection'
    @ssubsec = id_text
    return label_number(@cha, @sec, @subsec, @ssubsec)
  end

  if node['textype'] == 'equation'
    # Outside chapters the Tralics number is used as is.
    if @cha.nil?
      @equation = id_text
    else
      @equation += 1
    end
    label_number(@cha, @equation)
  elsif %w[codelisting aside].include?(node['class'])
    id_text
  elsif node.name == 'table' && id_text
    @table = id_text
    label_number(@cha, @table)
  elsif node.name == 'figure'
    if @cha.nil?
      @figure = id_text
    else
      @figure += 1
    end
    label_number(@cha, @figure)
  end
end

# Returns a label number for use in headings.
# For example, label_number("1", "2") returns "1.2".
def label_number(*args)
  # nil parts (e.g., no chapter in an article) are simply left out.
  args.compact.join('.')
end

# Converts xref elements to links.
# The URL is looked up in the literal cache using the 'url' attribute.
def hrefs(doc)
  doc.xpath('//xref').each do |node|
    node.name = 'a'
    node['href'] = unescape_underscores(literal_cache[node['url']])
    # Put a class on hrefs containing TeX to allow a style override.
    node.traverse do |descendant|
      if descendant['class'] == 'texhtml'
        node['class'] = 'tex'
        break
      end
    end
    clean_node node, 'url'
  end
end

# Unescapes underscores, which are escaped by kramdown.
def unescape_underscores(url)
  url.gsub(/\\_/, '_')
end

# Handles both \includegraphics and figure environments.
# The unified treatment comes from Tralics using the 'figure' tag
# in both cases.
def graphics_and_figures(doc)
  doc.xpath('//figure').each do |node|
    process_graphic(node, klass: 'figure')
  end
end

# Processes a graphic, including the description.
# The node becomes a div; when it names a file, an img element pointing
# at the web-friendly version of that file is inserted as its first
# child. Graphics with rend="inline" get neither the class nor a caption.
def process_graphic(node, options={})
  klass = options[:klass]
  node.name = 'div'
  raw_graphic = (node['rend'] == 'inline')
  unless raw_graphic
    if node['class']
      node['class'] += " #{klass}"
    else
      node['class'] = klass
    end
  end
  if internal_paragraph = node.at_css('p')
    clean_node internal_paragraph, 'rend'
  end
  if node['file'] && node['extension']
    filename = png_for_pdf(node['file'], node['extension'])
    alt = File.basename(node['file'])
    # The computed filename has to reach the markup; previously it was
    # never used and only the alt text was emitted.
    img = %(<img src="#{filename}" alt="#{alt}" />)
    # NOTE(review): the surrounding newlines suggest a block-level
    # wrapper element once enclosed the image; confirm upstream.
    graphic = "\n#{img}\n"
    graphic_node = Nokogiri::HTML.fragment(graphic)
    if description_node = node.children.first
      description_node.add_previous_sibling(graphic_node)
    else
      node.add_child(graphic_node)
    end
    clean_node node, %w[file extension rend]
  end
  add_caption(node, name: 'figure') unless raw_graphic
end

# Handles \image and \imagebox commands.
def images_and_imageboxes(doc)
  doc.xpath('//image').each do |node|
    handle_image(node, klass: 'image')
  end
  doc.xpath('//imagebox').each do |node|
    handle_image(node, klass: 'image box')
  end
end

# Processes custom image environment to use a div and the right class.
# The parent of the node becomes the container div, and the node itself
# becomes an img whose source is taken from its former text content.
def handle_image(node, options={})
  klass = options[:klass]
  container = node.parent
  container.name = 'div'
  container['class'] = 'graphics ' + klass
  node.name = 'img'
  node['src'] = png_for_pdf(node.content.gsub(underscore_digest, '_'))
  node['alt'] = node['src'].split('.').first
  node.content = ""
end

# Returns the name of an image file with PNG for PDF if necessary.
# This is to support PDF images in the raw source, which look good in
# PDF document, but need to be web-friendly in the HTML. We standardize
# on PNG for simplicity. This means that, to do something like
#   \image{images/foo.pdf}
# authors need to have both foo.pdf and foo.png in their images/
# directory. In this case, foo.pdf will be used in the PDF output, while
# foo.png will automatically be used in the HTML, EPUB, & MOBI versions.
#
# With one argument, name is a full filename; with two, name lacks the
# extension, which is passed separately.
def png_for_pdf(name, extension=nil)
  if extension.nil?
    # Only a trailing ".pdf" is an extension; ".pdf" elsewhere in the
    # path (e.g., in a directory name) must be left alone.
    name.sub(/\.pdf\z/, '.png')
  else
    ext = extension == 'pdf' ? 'png' : extension
    "#{name}.#{ext}"
  end
end

# Adds a caption to a node.
# This works for figures and tables (at the least).
def add_caption(node, options={})
  label = options[:name].to_s.capitalize
  number = node['data-number']

  caption = Nokogiri::XML::Node.new('div', node.document)
  caption['class'] = 'caption'

  # NOTE(review): the caption strings below contain no markup even though
  # one of them is parsed as an HTML fragment; wrapper tags may be missing
  # from this copy of the file. Confirm upstream.
  head = node.at_css('head')
  if head
    # The description is read before the head element is removed.
    description = "#{head.inner_html}"
    head.remove
    caption.inner_html = Nokogiri::HTML.fragment("#{label} #{number}: " + description)
  else
    caption.inner_html = "#{label} #{number}"
  end

  node.add_child(caption)
  clean_node node, ['id-text']
end

# Converts XML to HTML tables.
# Cells and rows are renamed first; then every table element is treated
# either as an inline tabular or as a floating table with a caption.
def tables(doc)
  doc.xpath('//table/row/cell').each do |cell|
    cell.name = 'td'
    cell['colspan'] = cell['cols'] if cell['cols']
  end

  # Row borders: attribute name => CSS class.
  row_borders = { 'top-border' => 'top_border',
                  'bottom-border' => 'bottom_border' }
  doc.xpath('//table/row').each do |row|
    row.name = 'tr'
    classes = []
    row_borders.each do |attribute, css_class|
      next unless row[attribute] == 'true'
      classes << css_class
      clean_node row, [attribute]
    end
    row['class'] = classes.join(' ') unless classes.empty?
  end

  # Index into the alignment cache; advances once per tabular handled.
  tabular_count = 0
  doc.xpath('//table').each do |table|
    if tabular?(table)
      table['class'] = 'tabular'
      clean_node table, %w[rend]
      add_cell_alignment(table, tabular_count)
      tabular_count += 1
    elsif table?(table)
      table.name = 'div'
      table['class'] = 'table'
      unless table.at_css('table')
        # The rows sit directly in the table environment, so wrap them
        # in a tabular table of their own.
        inner_table = Nokogiri::XML::Node.new('table', doc)
        inner_table['class'] = 'tabular'
        inner_table.children = table.children
        add_cell_alignment(inner_table, tabular_count)
        tabular_count += 1
        table.add_child(inner_table)
      end
      clean_node table, %w[rend]
      add_caption(table, name: 'table')
    end
  end
end

# Adds the alignment (left, center, right) plus the border (if any).
def add_cell_alignment(table, tabular_count) alignments = @tabular_alignment_cache[tabular_count] cell_alignments = alignments.scan(/(\|*(?:l|c|r)\|*)/).flatten table.css('tr').each do |row| row.css('td').zip(cell_alignments).each do |cell, alignment| if custom_alignment?(cell) cell['class'] = custom_class(cell) else cell['class'] = alignment_class(alignment) end clean_node cell, %w[halign right-border left-border cols] end end end # Returns true if the cell comes with custom alignment. # This is the case with a multicolumn row. def custom_alignment?(cell) cell['cols'] end # Returns the custom class for a cell. def custom_class(cell) [].tap do |klass| klass << 'left_border' if cell['left-border'] klass << "align_#{cell['halign']}" if cell['halign'] klass << 'right_border' if cell['right-border'] klass << 'top-border' if cell['top-border'] end.join(' ') end # Returns the CSS class corresponding to the given table alignment. def alignment_class(alignment) alignment.sub('l', 'align_left') .sub('r', 'align_right') .sub('c', 'align_center') .sub(/^\|/, 'left_border ') .sub(/\|$/, ' right_border') end # Returns true if a table node is from a 'tabular' environment. # Tralics converts both # \begin{table}... # and # \begin{tabular} # to tags, so we have to disambiguate them. def tabular?(table) table['rend'] == 'inline' end # Returns true if a table node is from a 'table' environment. # The make_cross_references method tags such tables with a # 'data-number' attribute, so we use that to detect 'table' envs. def table?(table) !table['data-number'].nil? end # Trims empty paragraphs. # Sometimes a

creeps in due to idiosyncrasies of the # Tralics conversion. def trim_empty_paragraphs(string) string.gsub!(/

\s*<\/p>/, '') end # Converts a document to HTML. # Because there's no way to know which elements are block-level # (and hence can't be nested inside a paragraph tag), we first extract # an HTML fragment by converting the document to HTML, and then use # Nokogiri's HTML.fragment method to read it in and emit valid markup. # This process transforms, e.g., the invalid #

Preformatted text:

text
foo

# to the valid #

Preformatted text:

text

foo

def convert_to_html(doc) highlight_source_code(doc) File.open(@highlight_cache_filename, 'wb') do |f| f.write(highlight_cache.to_msgpack) end body = doc.at_css('document').children.to_xhtml Nokogiri::HTML.fragment(body).to_xhtml.tap do |html| trim_empty_paragraphs(html) end end # Handles table of contents (if present). # This code could no doubt be made much shorter, but probably at the # cost of clarity. def table_of_contents(doc) toc = doc.at_css('tableofcontents') return if toc.nil? toc.add_previous_sibling('

Contents

') toc.name = 'div' toc['id'] = 'table_of_contents' toc.remove_attribute 'depth' html = [] current_depth = 0 doc.css('div').each do |node| case node['class'] when 'chapter' html << '
    ' if current_depth == 0 while current_depth > 1 close_list(html) current_depth -= 1 end current_depth = 1 insert_li(html, node) when 'section' open_list(html) if current_depth == 1 while current_depth > 2 close_list(html) current_depth -= 1 end current_depth = 2 insert_li(html, node) when 'subsection' open_list(html) if current_depth == 2 while current_depth > 3 close_list(html) current_depth -= 1 end current_depth = 3 insert_li(html, node) end end toc.add_child(Nokogiri::HTML::DocumentFragment.parse(html.join)) end def open_list(html, li=true) html << '
  • ' if li html << '
      ' end def close_list(html, li=true) html << '
    ' html << '
  • ' if li end def insert_li(html, node) open = %(
  • ) link = node.at_css('a.heading') link['class'] += ' hyperref' html << open << link.to_xhtml << '
  • ' end # Cleans a node by removing all the given attributes. def clean_node(node, attributes) [*attributes].each { |a| node.remove_attribute a } end end end end