Sha256: 39be3254082d7adbfcf8845cfa5cd2fb5fd269099f101a4aa3434e3d94b742ca

Contents?: true

Size: 1.21 KB

Versions: 6

Compression:

Stored size: 1.21 KB

Contents

require 'nokogiri'

# Converts an HTML string into plain text using Nokogiri.
#
# A newline is inserted after <br>, <p>, <div>, <ul> and <ol> elements, runs
# of spaces inside text are collapsed, and every <li> is rendered as a
# "  * item" line.
class HtmlToTextConverter
  # Every spelling of the non-breaking-space character reference: the named
  # entity plus its decimal (160) and hexadecimal (A0) forms, with optional
  # leading zeros. The pattern is ASCII-only so it can be applied to input of
  # any encoding.
  NBSP_ENTITY = /&nbsp;|&#0*160;|&#[xX]0*[aA]0;/

  # Elements that get a trailing newline in the text output.
  BLOCK_SELECTOR = 'br, p, div, ul, ol'

  # @param html_string [String, nil] the HTML markup to convert
  def initialize(html_string)
    @html_string = html_string
  end

  # Runs the conversion.
  #
  # @return [String] the text content of the processed fragment, or '' when
  #   the converter was built with nil/false
  def parse
    return '' unless @html_string

    @fragment = Nokogiri::HTML.fragment(pre_parse)

    add_newline_after_block_elements
    parse_whitespace
    parse_lists

    @fragment.content
  end

  private

  # Normalizes the raw markup before it is handed to Nokogiri.
  #
  # Order matters: whitespace between tags is dropped first, so a space that
  # came from a non-breaking-space reference between two tags is kept.
  #
  # @return [String] a new string; the original input is not modified
  def pre_parse
    @html_string
      .gsub(/>\s+</, '><')     # remove whitespace between tags
      .gsub(NBSP_ENTITY, ' ')  # replace nbsp references with a regular space
  end

  # Inserts a newline text node right after each block-level element / <br>.
  def add_newline_after_block_elements
    @fragment.css(BLOCK_SELECTOR).each { |node| node.after("\n") }
  end

  # Cleans up every text node in the fragment:
  # * nodes with visible characters get runs of spaces squeezed into one and
  #   their leading whitespace stripped;
  # * whitespace-only nodes are removed, except those containing a newline
  #   (which keeps the line breaks added after block elements).
  def parse_whitespace
    @fragment.xpath('.//text()').each do |node|
      text = node.content

      if text =~ /\S/ # has non-whitespace characters
        node.content = text.squeeze(' ').lstrip
      elsif !text.include?("\n")
        node.remove
      end
    end
  end

  # Make lists pretty:
  #   * list item 1
  #   * list item 2
  #   * list item 3
  #
  # Each <li>'s children are replaced by a single text node holding the
  # bulleted line.
  def parse_lists
    @fragment.css('li').each do |item|
      item.content = "  * #{item.content}\n"
    end
  end
end

Version data entries

6 entries across 6 versions & 1 rubygems

Version Path
rallycat-0.4.1 lib/rallycat/html_to_text_converter.rb
rallycat-0.4.0 lib/rallycat/html_to_text_converter.rb
rallycat-0.3.1 lib/rallycat/html_to_text_converter.rb
rallycat-0.3.0 lib/rallycat/html_to_text_converter.rb
rallycat-0.2.0 lib/rallycat/html_to_text_converter.rb
rallycat-0.1.0 lib/rallycat/html_to_text_converter.rb