# encoding=utf-8 require 'kramdown' require 'securerandom' $cache = {} $label_salt = SecureRandom.hex module Kramdown module Converter class Latex < Base # Converts `inline codespan`. # This overrides kramdown's default to use `\kode` instead of `\tt`. def convert_codespan(el, opts) "\\kode{#{latex_link_target(el)}#{escape(el.value)}}" end # Overrides default convert_a. # Unfortunately, kramdown is too aggressive in escaping characters # in hrefs, converting # [foo bar](http://example.com/foo%20bar) # into # \href{http://example.com/foo\%20bar}{foo bar} # The '\%20' in the href then won't work properly. def convert_a(el, opts) url = el.attr['href'] if url =~ /^#/ "\\hyperlink{#{escape(url[1..-1])}}{#{inner(el, opts)}}" else "\\href{#{url}}{#{inner(el, opts)}}" end end # Uses figures for images only when label is present. # This allows users to put raw (centered) images in their documents. # The default behavior of kramdown is to wrap such images in a figure # environment, which causes LaTeX to (a) treat them as floats and (b) # include a caption. This may not be what the user wants, and it's also # nonstandard Markdown. On the other hand, it is really nice to be # able to include captions using the default image syntax, so as a # compromise we use Markdown behavior by default and kramdown behavior # if the alt text contains a '\label' element. def convert_standalone_image(el, opts, img) alt_text = el.children.first.attr['alt'] if has_label?(alt_text) attrs = attribute_list(el) # Override the kramdown default by adding "here" placement. # Authors who want a different behavior can always use raw LaTeX. "\\begin{figure}[H]#{attrs}\n\\begin{center}\n#{img}\n\\end{center}\n\\caption{#{escape(el.children.first.attr['alt'])}}\n#{latex_link_target(el, true)}\n\\end{figure}#{attrs}\n" else img.gsub('\includegraphics', '\image') + "\n" end end # Detects if text has a label. def has_label?(text) text.include?($label_salt) end end end end module Polytexnic module Preprocessor module Polytex include Polytexnic::Literal # Converts Markdown to PolyTeX. # We adopt a unified approach: rather than convert "Markdown" (I use # the term loosely*) directly to HTML, we convert it to PolyTeX and # then run everything through the PolyTeX pipeline. Happily, kramdown # comes equipped with a `to_latex` method that does most of the heavy # lifting. The ouput isn't as clean as that produced by Pandoc (our # previous choice), but it comes with significant advantages: (1) It's # written in Ruby, available as a gem, so its use eliminates an external # dependency. (2) It's the foundation for the "Markdown" interpreter # used by Leanpub, so by using it ourselves we ensure greater # compatibility with Leanpub books. # # * The number of mutually incompatible markup languages going # by the name "Markdown" is truly mind-boggling. Most of them add things # to John Gruber's original Markdown language in an ever-expanding # attempt to bolt on the functionality needed to write longer documents. # At this point, I fear that "Markdown" has become little more than a # marketing term. def to_polytex math_cache = {} cleaned_markdown = cache_code_environments(@source) expand_input!(cleaned_markdown, Proc.new { |source| cache_code_environments(source) }) puts cleaned_markdown if debug? cleaned_markdown.tap do |markdown| convert_code_inclusion(markdown) cache_latex_literal(markdown) cache_raw_latex(markdown) cache_image_locations(markdown) puts markdown if debug? cache_math(markdown, math_cache) end puts cleaned_markdown if debug? # Override the header ordering, which starts with 'section' by default. lh = 'chapter,section,subsection,subsubsection,paragraph,subparagraph' kramdown = Kramdown::Document.new(cleaned_markdown, latex_headers: lh) puts kramdown.inspect if debug? puts kramdown.to_html if debug? puts kramdown.to_latex if debug? @source = kramdown.to_latex.tap do |polytex| remove_kramdown_comments(polytex) convert_includegraphics(polytex) restore_math(polytex, math_cache) restore_hashed_content(polytex) end end # Adds support for <<(path/to/code) inclusion. def convert_code_inclusion(text) text.gsub!(/^\s*(<<\(.*?\))/) do key = digest($1) $cache[key] = "%= #{$1}" # reduce to a previously solved case key end end # Caches literal LaTeX environments. def cache_latex_literal(markdown) # Add tabular and tabularx support. literal_types = Polytexnic::Literal.literal_types + %w[tabular tabularx longtable] literal_types.each do |literal| regex = /(\\begin\{#{Regexp.escape(literal)}\} .*? \\end\{#{Regexp.escape(literal)}\}) /xm markdown.gsub!(regex) do content = $1 key = digest(content) $cache[key] = content key end end end # Caches raw LaTeX commands to be passed through the pipeline. def cache_raw_latex(markdown) command_regex = /( ^[ \t]*\\\w+.*\}[ \t]*$ # Command on line with arg | ~\\ref\{.*?\} # reference with a tie | ~\\eqref\{.*?\} # eq reference with a tie | \\[^\s(]+\{.*?\} # command with one arg | \\\w+ # normal command | \\- # hyphenation | \\[ %&$\#@] # space or special character | \\\\ # double backslashes ) /x markdown.gsub!(command_regex) do content = $1 puts content.inspect if debug? key = digest(content) # Used to speed up has_label? in convert_standalone_image. key += $label_salt if content.include?('\label') $cache[key] = content if content =~ /\{table\}|\\caption\{/ # Pad tables & captions with newlines for kramdown compatibility. "\n#{key}\n" else key end end end # Caches the locations of images to be passed through the pipeline. # This works around a Kramdown bug, which fails to convert images # properly when their location includes a URL. def cache_image_locations(text) # Matches '![Image caption](/path/to/image)' text.gsub!(/^\s*(!\[.*?\])\((.*?)\)/) do key = digest($2) $cache[key] = $2 "\n#{$1}(#{key})" end end # Restores raw code from the cache. def restore_hashed_content(text) $cache.each do |key, value| # Because of the way backslashes get interpolated, we need to add # some extra ones to cover all the cases of hashed LaTeX. text.gsub!(key, value.gsub(/\\/, '\\\\\\')) end end # Caches Markdown code environments. # Included are indented environments, Leanpub-style indented environments, # and GitHub-style code fencing. def cache_code_environments(source) output = [] lines = source.split("\n") indentation = ' ' * 4 while (line = lines.shift) if line =~ /\{lang="(.*?)"\}/ language = $1 code = [] while (line = lines.shift) && line.match(/^#{indentation}(.*)$/) do code << $1 end code = code.join("\n") key = digest(code) code_cache[key] = [code, language] output << key output << line elsif line =~ /^```([\w+]*)(,\s*options:.*)?$/ # highlighted fences count = 1 language = $1.empty? ? 'text' : $1 options = $2 code = [] while (line = lines.shift) do count += 1 if line =~ /^```.+$/ count -= 1 if line.match(/^```\s*$/) break if count.zero? code << line end code = code.join("\n") data = [code, language, false, options] key = digest(data.join("--")) code_cache[key] = data output << key else output << line end end output.join("\n") end # Removes comments produced by kramdown. # These have the special form of always being at the beginning of the # line. def remove_kramdown_comments(text) text.gsub!(/^% (.*)$/, '') end # Converts \includegraphics to \image inside figures. # The reason is that raw \includegraphics is almost always too wide # in the PDF. Instead, we use the custom-defined \image command, which # is specifically designed to fix this issue. def convert_includegraphics(text) in_figure = false newtext = text.split("\n").map do |line| line.gsub!('\includegraphics', '\image') if in_figure if line =~ /^\s*\\begin\{figure\}/ in_figure = true elsif line =~ /^\s*\\end\{figure\}/ in_figure = false end line end.join("\n") text.replace(newtext) end # Caches math. # Leanpub uses the notation {$$}...{/$$} for both inline and block math, # with the only difference being the presences of newlines: # {$$} x^2 {/$$} % inline # and # {$$} # x^2 % block # {/$$} # I personally hate this notation and convention, so we also support # LaTeX-style \( x \) and \[ x^2 - 2 = 0 \] notation. def cache_math(text, cache) text.gsub!(/(?:\{\$\$\}\n(.*?)\n\{\/\$\$\}|\\\[(.*?)\\\])/m) do key = digest($1 || $2) cache[[:block, key]] = $1 || $2 key end text.gsub!(/(?:\{\$\$\}(.*?)\{\/\$\$\}|\\\((.*?)\\\))/m) do key = digest($1 || $2) cache[[:inline, key]] = $1 || $2 key end end # Restores the Markdown math. # This is easy because we're running everything through our LaTeX # pipeline. def restore_math(text, cache) cache.each do |(kind, key), value| case kind when :inline open = '\(' close = '\)' when :block open = '\[' close = '\]' end text.gsub!(key, open + value + close) end end end end end