require "sablon/html/ast"
require "sablon/html/visitor"
module Sablon
class HTMLConverter
class ASTBuilder
Layer = Struct.new(:items, :ilvl)
def initialize(nodes)
@layers = [Layer.new(nodes, false)]
@root = Root.new([])
end
def to_ast
@root
end
def new_layer(ilvl: false)
@layers.push Layer.new([], ilvl)
end
def next
current_layer.items.shift
end
def push(node)
@layers.last.items.push node
end
def push_all(nodes)
nodes.each(&method(:push))
end
def done?
!current_layer.items.any?
end
def nested?
ilvl > 0
end
def ilvl
@layers.select { |layer| layer.ilvl }.size - 1
end
def emit(node)
@root.nodes << node
end
private
def current_layer
if @layers.any?
last_layer = @layers.last
if last_layer.items.any?
last_layer
else
@layers.pop
current_layer
end
else
Layer.new([], false)
end
end
end
def process(input)
processed_ast(input).to_docx
end
def processed_ast(input)
ast = build_ast(input)
ast.accept LastNewlineRemoverVisitor.new
ast
end
def build_ast(input)
doc = Nokogiri::HTML.fragment(input)
@builder = ASTBuilder.new(doc.children)
while !@builder.done?
ast_next_paragraph
end
@builder.to_ast
end
private
def ast_next_paragraph
node = @builder.next
if node.name == 'div'
@builder.new_layer
@builder.emit Paragraph.new('Normal', ast_text(node.children))
elsif node.name == 'p'
@builder.new_layer
@builder.emit Paragraph.new('Paragraph', ast_text(node.children))
elsif node.name =~ /h(\d+)/
@builder.new_layer
@builder.emit Paragraph.new("Heading#{$1}", ast_text(node.children))
elsif node.name == 'ul'
@builder.new_layer ilvl: true
unless @builder.nested?
@definition = Sablon::Numbering.instance.register('ListBullet')
end
@builder.push_all(node.children)
elsif node.name == 'ol'
@builder.new_layer ilvl: true
unless @builder.nested?
@definition = Sablon::Numbering.instance.register('ListNumber')
end
@builder.push_all(node.children)
elsif node.name == 'li'
@builder.new_layer
@builder.emit ListParagraph.new(@definition.style, ast_text(node.children), @definition.numid, @builder.ilvl)
elsif node.text?
# SKIP?
else
raise ArgumentError, "Don't know how to handle node: #{node.inspect}"
end
end
def ast_text(nodes, format: TextFormat.default)
runs = nodes.flat_map do |node|
if node.text?
Text.new(node.text, format)
elsif node.name == 'br'
Newline.new
elsif node.name == 'strong' || node.name == 'b'
ast_text(node.children, format: format.with_bold).nodes
elsif node.name == 'em' || node.name == 'i'
ast_text(node.children, format: format.with_italic).nodes
elsif node.name == 'u'
ast_text(node.children, format: format.with_underline).nodes
elsif ['ul', 'ol', 'p', 'div'].include?(node.name)
@builder.push(node)
nil
else
raise ArgumentError, "Don't know how to handle node: #{node.inspect}"
end
end
Collection.new(runs.compact)
end
end
end