require "nokogiri" class HtmlToTextile class Document < ::Nokogiri::XML::SAX::Document def valid_tags ['a','p','br','u','b','i','strong','em','ul','ol','li','sup','sub', 'ins', 'del', 'h1', 'h2', 'h3', 'table', 'tr', 'td', 'th'] end def tree @tree ||= [] end def stack @stack ||= [] end def start_element name, attrs = [] if valid_tags.include? name attrs_hash = {} while key = attrs.shift do value = attrs.shift attrs_hash[key] = value end stack << { 'name' => name, 'content' => [], 'attrs' => attrs_hash } end end def end_element name if valid_tags.include? name content = stack.pop if stack.empty? or stack.last['content'].nil? tree << content else stack.last['content'] << content end end end def characters string if stack.last.nil? or stack.last['content'].nil? stack << "#{string}" else stack.last['content'] << "#{string}" end end def to_textile ##ActiveRecord::Base.logger.debug "### tree START" ##ActiveRecord::Base.logger.debug @tree.inspect ##ActiveRecord::Base.logger.debug "### tree END" #ActiveRecord::Base.logger.debug "### HTML START" to_html #ActiveRecord::Base.logger.debug "### HTML END" textile = '' tree.each do |content| textile += _to_textile_tag content end #ActiveRecord::Base.logger.debug "### TEXTILE START" #ActiveRecord::Base.logger.debug textile #ActiveRecord::Base.logger.debug "### TEXTILE END" return textile end def to_html @html = '' tree.each do |content| _to_html_tag content end return @html end private def _to_html_tag content, indent = 0 if content.class == String #ActiveRecord::Base.logger.debug "#{(' ' * indent)}#{content.to_s.gsub("\n", "#{' ' * indent}")}" @html += "#{content}" elsif content.class == Hash if ['br','img'].include? content['name'] #ActiveRecord::Base.logger.debug "#{(' ' * indent)}<#{content['name']} />" @html += "<#{content['name']} />" else #ActiveRecord::Base.logger.debug "#{(' ' * indent)}<#{content['name']}>" @html += "<#{content['name']}>" _to_html_tag(content['content'], indent + 2) #ActiveRecord::Base.logger.debug "#{(' ' * indent)}" @html += "" end elsif content.class == Array content.each do |element| _to_html_tag element, indent end end end def _to_textile_tag content, indent = 0 textile = '' if content.class == String ##ActiveRecord::Base.logger.debug "Content: #{content}" textile += "#{content}" elsif content.class == Hash if ['u','b','strong','i','em','del','ins', 'sub', 'sup'].include? content['name'] literals = { 'u' => '+', 'b' => '*', 'strong' => '*', 'i' => '_', 'em' => '_', 'del' => '-', 'ins' => '+', 'sub' => '~', 'sup' => '^' } inner_parsed_textile = _spaces _to_textile_tag(content['content']) if inner_parsed_textile[:content].blank? #ActiveRecord::Base.logger.debug "#{inner_parsed_textile[:leading]}#{inner_parsed_textile[:tailing]}" textile += "#{inner_parsed_textile[:leading]}#{inner_parsed_textile[:tailing]}" else #ActiveRecord::Base.logger.debug "#{inner_parsed_textile[:leading]}[#{literals[content['name']]}#{inner_parsed_textile[:content]}#{literals[content['name']]}]#{inner_parsed_textile[:tailing]}" textile += "#{inner_parsed_textile[:leading]}[#{literals[content['name']]}#{inner_parsed_textile[:content]}#{literals[content['name']]}]#{inner_parsed_textile[:tailing]}" end elsif ['br'].include? content['name'] #ActiveRecord::Base.logger.debug "\n" textile += "\n" elsif ['p','h1','h2','h3'].include? content['name'] if content['attrs']['style'] and a = content['attrs']['style'][/text-align:[\ ]?([^;]*)/,1] and not a.nil? #ActiveRecord::Base.logger.debug 'strip1' alignment = case a.strip.downcase when 'left' then '<' when 'right' then '>' when 'center' then '=' else '' end else alignment = '' end #ActiveRecord::Base.logger.debug "\n#{"#{content['name']}#{alignment}. " unless indent > 0}#{_to_textile_tag(content['content'], indent).strip}#\n\n" textile += "\n#{content['name']}#{alignment}. " textile += _to_textile_tag(content['content'], indent).strip textile += "\n\n" elsif ['a'].include? content['name'] inner_parsed_textile = _spaces _to_textile_tag(content['content']) if inner_parsed_textile[:content].blank? #ActiveRecord::Base.logger.debug "#{inner_parsed_textile[:leading]}#{inner_parsed_textile[:tailing]}" textile += "#{inner_parsed_textile[:leading]}#{inner_parsed_textile[:tailing]}" else #ActiveRecord::Base.logger.debug "#{inner_parsed_textile[:leading]}[\"#{inner_parsed_textile[:content]}\":#{content['attrs']['href'].blank? ? '#' : content['attrs']['href']}]#{inner_parsed_textile[:tailing]}" textile += "#{inner_parsed_textile[:leading]}[\"#{inner_parsed_textile[:content]}\":#{content['attrs']['href'].blank? ? '#' : content['attrs']['href']}]#{inner_parsed_textile[:tailing]}" end elsif ['ul','ol'].include? content['name'] literals = { 'ul' => '*', 'ol' => '#' } content['content'].each do |element| if element.class == Hash and element['name'] == 'li' #ActiveRecord::Base.logger.debug "\n #{literals[content['name']]} #{_to_textile_tag element, indent +2}" textile += "\n #{literals[content['name']]} " textile += _to_textile_tag element, indent +2 end end textile += "\n" elsif ['table'].include? content['name'] textile += "\n" content['content'].each do |row| if row.class == Hash and row['name'] == 'tr' row['content'].each do |element| if element.class == Hash and ['td', 'th'].include? element['name'] #ActiveRecord::Base.logger.debug "|#{element['name'] == 'th' ? '_.' : ''} #{_to_textile_tag(element, indent +2).strip.gsub(/\n{2,}/, "\n")} " textile += "|#{element['name'] == 'th' ? '_.' : ''} #{_to_textile_tag(element, indent +2).strip.gsub(/\n{2,}/, "\n")} " end end textile += "|\n" end end else #ActiveRecord::Base.logger.debug _to_textile_tag content['content'], indent textile += _to_textile_tag content['content'], indent end elsif content.class == Array content.each do |element| #ActiveRecord::Base.logger.debug _to_textile_tag element, indent textile += _to_textile_tag element, indent end end return textile end def _spaces string parsed = {:content => string} string.gsub(/^(\s*)?(\S*)(\s*)?$/) {|s| parsed = {:leading => $1, :content => $2, :tailing => $3} } parsed end end def initialize(string) @tree = [] @document = Document.new parser = ::Nokogiri::HTML::SAX::Parser.new(@document) parser.parse(string) end def to_textile @document.to_textile end def to_html @document.to_html end end