Sha256: 39be3254082d7adbfcf8845cfa5cd2fb5fd269099f101a4aa3434e3d94b742ca
Contents?: true
Size: 1.21 KB
Versions: 6
Compression:
Stored size: 1.21 KB
Contents
require 'nokogiri' class HtmlToTextConverter def initialize(html_string) @html_string = html_string end def parse return '' unless @html_string html = pre_parse @fragment = Nokogiri::HTML.fragment(html) add_newline_after_block_elements parse_whitespace parse_lists @fragment.content end private def pre_parse html = @html_string.gsub(/>\s+</, '><') # remove whitespace between tags html.gsub(' ', ' ') # replace nbsp with regular space end def add_newline_after_block_elements @fragment.css('br, p, div, ul, ol').each { |node| node.after("\n") } end def parse_whitespace @fragment.xpath('.//text()').each do |node| if node.content =~ /\S/ # has non-whitespace characters node.content = node.content.squeeze(' ') # consolidate whitespace node.content = node.content.lstrip else # remove nodes that are only whitespace node.remove unless node.content.include?("\n") end end end def parse_lists # Make lists pretty: # * list item 1 # * list item 2 # * list item 3 @fragment.css('li').each do |node| node.content = " * #{node.content}\n" end end end
Version data entries
6 entries across 6 versions & 1 rubygems