require "securerandom"
require "sablon/html/ast_builder"
require "sablon/html/node_properties"

module Sablon
  # NOTE(review): several string literals in this file (the `inspect`
  # formats, the closing tag in Node#to_docx, the w:t / w:br wrappers and the
  # "<p></p>" seed in TableCell#new_paragraph) had been stripped to empty
  # strings. They were reconstructed here -- confirm the exact formats
  # against the spec suite before relying on them.
  class HTMLConverter
    # A top level abstract class to handle common logic for all AST nodes
    class Node
      PROPERTIES = [].freeze

      # The unqualified class name (e.g. "Paragraph"), memoized per class.
      def self.node_name
        @node_name ||= name.split('::').last
      end

      # Returns a hash defined on the configuration object by default. However,
      # this method can be overridden by subclasses to return a different
      # node's style conversion config (i.e. :run) or a hash unrelated to the
      # config itself. The config object is used for all built-in classes to
      # allow for end-user customization via the configuration object
      def self.style_conversion
        # converts camelcase to underscored
        key = node_name.gsub(/([a-z])([A-Z])/, '\1_\2').downcase.to_sym
        Sablon::Configuration.instance.defined_style_conversions.fetch(key, {})
      end

      # Maps each CSS style property to its OpenXML equivalent. Not all CSS
      # properties have an equivalent, nor share the same behavior when
      # defined on different node types (Paragraph, Table and Run).
      #
      # Symbol keys are treated as already converted and are passed through
      # untouched; pairs whose converted key is nil/false are dropped.
      def self.process_properties(properties)
        # process the styles as a hash and store values
        style_attrs = {}
        properties.each do |key, value|
          key = key.strip if key.respond_to? :strip
          value = value.strip if value.respond_to? :strip
          unless key.is_a? Symbol
            key, value = *convert_style_property(key, value)
          end
          style_attrs[key] = value if key
        end
        style_attrs
      end

      # Handles conversion of a single attribute allowing recursion through
      # super classes. If the key exists and conversion is successful a
      # symbol is returned to avoid conflicts with a CSS prop sharing the
      # same name. Keys without a conversion class are returned as is
      def self.convert_style_property(key, value)
        if style_conversion.key?(key)
          key, value = style_conversion[key].call(value)
          key = key.to_sym if key
          [key, value]
        elsif self == Node
          # top of the hierarchy reached without finding a converter
          [key, value]
        else
          superclass.convert_style_property(key, value)
        end
      end

      # Only seeds the instance variables; subclasses are responsible for
      # building @properties from the arguments they receive.
      def initialize(_env, _node, _properties)
        @properties ||= nil
        @attributes ||= {}
      end

      def accept(visitor)
        visitor.visit(self)
      end

      # Simplifies usage at call sites by only requiring them to supply
      # the tag name to use and any child AST nodes to render.
      # Returns the complete element: opening tag (with attributes), the
      # property block if any, the children and the matching closing tag.
      def to_docx(tag)
        prop_str = @properties.to_docx if @properties
        "<#{tag}#{attributes_to_docx}>#{prop_str}#{children_to_docx}</#{tag}>"
      end

      private

      # Simplifies usage at call sites
      def transferred_properties
        @properties.transferred_properties
      end

      # Gracefully handles conversion of an attributes hash into a
      # string. Returns '' when there are no attributes, otherwise a string
      # with a leading space so it can be appended directly to the tag name.
      def attributes_to_docx
        return '' if @attributes.nil? || @attributes.empty?

        ' ' + @attributes.map { |k, v| %(#{k}="#{v}") }.join(' ')
      end

      # Acts like an abstract method allowing subclasses full flexibility to
      # define any content inside the tags.
      def children_to_docx
        ''
      end
    end

    # A container for an array of AST nodes with convenience methods to
    # work with the internal array as if it were a regular node
    class Collection < Node
      attr_reader :nodes

      # Deliberately does not call super: a collection is built from an
      # array of nodes rather than from (env, node, properties).
      def initialize(nodes)
        @properties ||= nil
        @attributes ||= {}
        @nodes = nodes
      end

      # Visits the collection itself and then every contained node in order.
      def accept(visitor)
        super
        @nodes.each do |node|
          node.accept(visitor)
        end
      end

      # Concatenates the rendered output of every contained node.
      def to_docx
        nodes.map(&:to_docx).join
      end

      def inspect
        "[#{nodes.map(&:inspect).join(', ')}]"
      end

      def <<(node)
        @nodes << node
      end
    end

    # Stores all of the AST nodes from the current fragment of HTML being
    # parsed
    class Root < Collection
      def initialize(env, node)
        # strip text nodes from the root level element, these are typically
        # extra whitespace from indenting the markup if there are any
        # block level tags at the top level
        if ASTBuilder.any_block_tags?(node.children)
          node.search('./text()').remove
        end
        # convert children from HTML to AST nodes
        super(ASTBuilder.html_to_ast(env, node.children, {}))
      end

      # Walks the tree with a GrepVisitor and returns whatever it collected.
      def grep(pattern)
        visitor = GrepVisitor.new(pattern)
        accept(visitor)
        visitor.result
      end

      def inspect
        "<Root: #{super}>"
      end
    end

    # An AST node representing the top level content container for a word
    # document. These cannot be nested within other paragraph elements
    class Paragraph < Node
      attr_accessor :runs

      PROPERTIES = %w[framePr ind jc keepLines keepNext numPr outlineLvl pBdr
                      pStyle rPr sectPr shd spacing tabs textAlignment].freeze

      # Permitted child tags defined by the OpenXML spec
      CHILD_TAGS = %w[w:bdo w:bookmarkEnd w:bookmarkStart w:commentRangeEnd
                      w:commentRangeStart w:customXml w:customXmlDelRangeEnd
                      w:customXmlDelRangeStart w:customXmlInsRangeEnd
                      w:customXmlInsRangeStart w:customXmlMoveFromRangeEnd
                      w:customXmlMoveFromRangeStart w:customXmlMoveToRangeEnd
                      w:customXmlMoveToRangeStart w:del w:dir w:fldSimple
                      w:hyperlink w:ins w:moveFrom w:moveFromRangeEnd
                      w:moveFromRangeStart w:moveTo w:moveToRangeEnd
                      w:moveToRangeStart m:oMath m:oMathPara w:pPr w:proofErr
                      w:r w:sdt w:smartTag].freeze

      def initialize(env, node, properties)
        super
        properties = self.class.process_properties(properties)
        @properties = NodeProperties.paragraph(properties)
        # children are converted with only the transferable properties
        trans_props = transferred_properties
        @runs = ASTBuilder.html_to_ast(env, node.children, trans_props)
        @runs = Collection.new(@runs)
      end

      def to_docx
        super('w:p')
      end

      def accept(visitor)
        super
        runs.accept(visitor)
      end

      # NOTE(review): assumes NodeProperties responds to #[] -- confirm.
      def inspect
        "<Paragraph{#{@properties[:pStyle]}}: #{runs.inspect}>"
      end

      private

      def children_to_docx
        runs.to_docx
      end
    end

    # Manages the child nodes of a list type tag
    class List < Collection
      def initialize(env, node, properties)
        # initialize values
        @list_tag = node.name
        @definition = nil
        if node.ancestors(".//#{@list_tag}").length.zero?
          # Only register a definition upon the first list tag encountered
          @definition = env.document.add_list_definition(properties['pStyle'])
        end

        # update attributes of all child nodes
        transfer_node_attributes(node.children, node.attributes)

        # Move any list tags that are a child of a list item up one level
        process_child_nodes(node)

        # convert children from HTML to AST nodes
        super(ASTBuilder.html_to_ast(env, node.children, properties))
      end

      def inspect
        "<List: #{super}>"
      end

      private

      # handles passing all attributes on the parent down to children
      def transfer_node_attributes(nodes, attributes)
        nodes.each do |child|
          # update all attributes
          merge_attributes(child, attributes)

          # set attributes specific to list items
          if @definition
            child['pStyle'] = @definition.style
            child['numId'] = @definition.numid
          end
          # nesting depth, zero-based, counted from enclosing list tags
          child['ilvl'] = child.ancestors(".//#{@list_tag}").length - 1
        end
      end

      # merges parent and child attributes together, prepending the parent's
      # values to allow the child node to override it if the value is already
      # defined on the child node.
      def merge_attributes(child, parent_attributes)
        parent_attributes.each do |name, par_attr|
          child_attr = child[name] ? child[name].split(';') : []
          child[name] = par_attr.value.split(';').concat(child_attr).join('; ')
        end
      end

      # moves any list tags that are a child of a list item tag up one level
      # so they become a sibling instead of a child
      def process_child_nodes(node)
        node.xpath("./li/#{@list_tag}").each do |list|
          # transfer attributes from parent now because the list tag will
          # no longer be a child and won't inherit them as usual
          transfer_node_attributes(list.children, list.parent.attributes)
          list.parent.add_next_sibling(list)
        end
      end
    end

    # Sets list item specific attributes registered on the node to properly
    # generate a list paragraph
    class ListParagraph < Paragraph
      def initialize(env, node, properties)
        list_props = {
          pStyle: node['pStyle'],
          numPr: [{ ilvl: node['ilvl'] }, { numId: node['numId'] }]
        }
        # reassigned before the bare `super` so the merged hash is forwarded
        properties = properties.merge(list_props)
        super
      end
    end

    # Builds a table from html table tags
    class Table < Node
      PROPERTIES = %w[jc shd tblBorders tblCaption tblCellMar tblCellSpacing
                      tblInd tblLayout tblLook tblOverlap tblpPr tblStyle
                      tblStyleColBandSize tblStyleRowBandSize tblW].freeze

      def initialize(env, node, properties)
        super

        # Process properties
        properties = self.class.process_properties(properties)
        @properties = NodeProperties.table(properties)
        trans_props = transferred_properties

        # Pull out the caption node if it exists and convert it separately.
        # If multiple caption tags are defined, only the first one is kept.
        @caption = node.xpath('./caption').remove
        @caption = nil if @caption.empty?
        if @caption
          cap_side_pat = /caption-side: ?(top|bottom)/
          @cap_side = @caption.attr('style').to_s.match(cap_side_pat).to_a[1]
          node.add_previous_sibling @caption
          @caption = ASTBuilder.html_to_ast(env, @caption, trans_props)[0]
        end

        # convert remaining child nodes and pass on transferrable properties
        @children = ASTBuilder.html_to_ast(env, node.children, trans_props)
        @children = Collection.new(@children)
      end

      def to_docx
        if @caption && @cap_side == 'bottom'
          super('w:tbl') + @caption.to_docx
        elsif @caption
          # caption always goes above table unless explicitly set to "bottom"
          @caption.to_docx + super('w:tbl')
        else
          super('w:tbl')
        end
      end

      def accept(visitor)
        super
        @children.accept(visitor)
      end

      # Mirrors the ordering used by #to_docx: the caption is listed after
      # the rows only when caption-side is "bottom".
      def inspect
        if @caption && @cap_side == 'bottom'
          "<Table{#{@properties.inspect}}: #{@children.inspect}, #{@caption.inspect}>"
        elsif @caption
          "<Table{#{@properties.inspect}}: #{@caption.inspect}, #{@children.inspect}>"
        else
          "<Table{#{@properties.inspect}}: #{@children.inspect}>"
        end
      end

      private

      def children_to_docx
        @children.to_docx
      end
    end

    # Converts html table rows into wordML table rows
    class TableRow < Node
      PROPERTIES = %w[cantSplit hidden jc tblCellSpacing tblHeader
                      trHeight tblPrEx].freeze

      def initialize(env, node, properties)
        super
        properties = self.class.process_properties(properties)
        @properties = NodeProperties.table_row(properties)
        # children are converted with only the transferable properties
        trans_props = transferred_properties
        @children = ASTBuilder.html_to_ast(env, node.children, trans_props)
        @children = Collection.new(@children)
      end

      def to_docx
        super('w:tr')
      end

      def accept(visitor)
        super
        @children.accept(visitor)
      end

      def inspect
        "<TableRow{#{@properties.inspect}}: #{@children.inspect}>"
      end

      private

      def children_to_docx
        @children.to_docx
      end
    end

    # Converts html table cells into wordML table cells
    class TableCell < Node
      PROPERTIES = %w[gridSpan hideMark noWrap shd tcBorders tcFitText
                      tcMar tcW vAlign vMerge].freeze

      # Permitted child tags defined by the OpenXML spec
      CHILD_TAGS = %w[w:altChunk w:bookmarkEnd w:bookmarkStart w:commentRangeEnd
                      w:commentRangeStart w:customXml w:customXmlDelRangeEnd
                      w:customXmlDelRangeStart w:customXmlInsRangeEnd
                      w:customXmlInsRangeStart w:customXmlMoveFromRangeEnd
                      w:customXmlMoveFromRangeStart w:customXmlMoveToRangeEnd
                      w:customXmlMoveToRangeStart w:del w:ins w:moveFrom
                      w:moveFromRangeEnd w:moveFromRangeStart w:moveTo
                      w:moveToRangeEnd w:moveToRangeStart m:oMath m:oMathPara
                      w:p w:permEnd w:permStart w:proofErr w:sdt w:tbl
                      w:tcPr].freeze

      def initialize(env, node, properties)
        super
        properties = self.class.process_properties(properties)
        @properties = NodeProperties.table_cell(properties)

        # Nodes are processed first "as is" and then based on the XML
        # generated wrapped by paragraphs.
        trans_props = transferred_properties
        @children = ASTBuilder.html_to_ast(env, node.children, trans_props)
        @children = wrap_with_paragraphs(env, @children)
      end

      def to_docx
        super('w:tc')
      end

      def accept(visitor)
        super
        @children.accept(visitor)
      end

      def inspect
        "<TableCell{#{@properties.inspect}}: #{@children.inspect}>"
      end

      private

      # Wraps nodes in Paragraph AST nodes if needed to produce a valid
      # document
      def wrap_with_paragraphs(env, nodes)
        # convert all nodes to live xml, and use first node to determine
        # if that AST node should be wrapped in a paragraph
        nodes_xml = nodes.map { |n| Nokogiri::XML.fragment(n.to_docx) }

        para = nil
        new_nodes = []
        nodes_xml.each_with_index do |n, i|
          # nodes that render to nothing are dropped
          next unless n.children.first

          # add all nodes that need wrapped to a paragraph sequentially.
          # New paragraphs are created when something that doesn't need
          # wrapped is encountered to retain proper content ordering.
          first_node_name = n.children.first.node_name
          if wrapped_by_paragraph.include? first_node_name
            if para.nil?
              para = new_paragraph(env)
              new_nodes << para
            end
            para.runs << nodes[i]
          else
            new_nodes << nodes[i]
            para = nil
          end
        end

        # Ensure the table cell has an empty paragraph if nothing else
        new_nodes << new_paragraph(env) if new_nodes.empty?

        # filter nils and return
        Collection.new(new_nodes.compact)
      end

      # Returns a list of child tags that need to be wrapped in a paragraph,
      # i.e. tags allowed inside a paragraph but not directly inside a cell
      def wrapped_by_paragraph
        Paragraph::CHILD_TAGS - self.class::CHILD_TAGS
      end

      # Creates a new Paragraph AST node, with no children
      def new_paragraph(env)
        # an actual (empty) <p> element is required; a whitespace-only
        # fragment has no element child to convert
        para = Nokogiri::HTML.fragment('<p></p>').first_element_child
        ASTBuilder.html_to_ast(env, [para], transferred_properties).first
      end

      def children_to_docx
        @children.to_docx
      end
    end

    # Create a run of text in the document, runs cannot be nested within
    # each other
    class Run < Node
      PROPERTIES = %w[b i caps color dstrike emboss imprint highlight outline
                      rStyle shadow shd smallCaps strike sz u vanish vertAlign
                      rFonts].freeze

      def initialize(_env, node, properties)
        super
        properties = self.class.process_properties(properties)
        @properties = NodeProperties.run(properties)
        @string = node.to_s # using `text` doesn't reconvert HTML entities
      end

      def to_docx
        super('w:r')
      end

      def inspect
        "<Run{#{@properties.inspect}}: #{@string}>"
      end

      private

      # Run text must live inside a w:t element; xml:space="preserve" keeps
      # the spaces that replace non-breaking spaces from being collapsed.
      def children_to_docx
        content = @string.tr("\u00A0", ' ')
        "<w:t xml:space=\"preserve\">#{content}</w:t>"
      end
    end

    # Creates a blank line in the word document
    class Newline < Run
      # Accepts and ignores any arguments; a newline has no properties,
      # attributes or text of its own.
      def initialize(*)
        @properties = nil
        @attributes = {}
      end

      def inspect
        "<Newline>"
      end

      private

      def children_to_docx
        "<w:br/>"
      end
    end

    # Creates a clickable URL in the word document, this supports external
    # urls only
    class Hyperlink < Node
      def initialize(env, node, properties)
        super
        # properties are passed directly to runs because hyperlink nodes
        # don't have a corresponding property tag like runs or paragraphs.
        @runs = ASTBuilder.html_to_ast(env, node.children, properties)
        @runs = Collection.new(@runs)
        @target = node.attributes['href'].value

        # register the link target with the document and reference it by id
        rel_attr = {
          Type: 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink',
          Target: @target,
          TargetMode: 'External'
        }
        rid = env.document.add_relationship(rel_attr)
        @attributes = { 'r:id' => rid }
      end

      def to_docx
        super('w:hyperlink')
      end

      def inspect
        "<Hyperlink{target:#{@target}}: #{@runs.inspect}>"
      end

      def accept(visitor)
        super
        @runs.accept(visitor)
      end

      private

      def children_to_docx
        @runs.to_docx
      end
    end
  end
end