Sha256: c0c10b318be18234f8c97cdcb1310c34c00230e68f70689769c888b38d3e0179

Contents?: true

Size: 1.98 KB

Versions: 13

Compression:

Stored size: 1.98 KB

Contents

module Wovnrb
  module Helpers
    # HTML parsing helpers built on Nokogiri's HTML5 (Nokogumbo) parser.
    #
    # Both methods are exposed through +module_function+, so they can be
    # called on the module itself or mixed in as private instance methods.
    module NokogumboHelper
      # Parses +html_string+ as a complete document when "<html" occurs
      # (case-insensitively) within the first 1000 characters of the stripped
      # input; otherwise delegates to #parse_fragment.
      #
      # @param html_string [String] markup to parse
      # @param encoding [String] encoding assigned to the parsed document
      # @return the parsed document, or whatever #parse_fragment returns
      def parse_html(html_string, encoding = 'UTF-8')
        looks_like_document = /<html/i.match?(html_string.strip[0..999])
        return parse_fragment(html_string, encoding) unless looks_like_document

        document = Nokogiri::HTML5(html_string)
        document.encoding = encoding
        document
      end

      # https://www.rubydoc.info/gems/nokogumbo/Nokogiri/HTML5#fragment-class_method
      #
      # Nokogumbo does not properly support parsing fragments, and the current
      # implementation of Nokogiri::HTML5.fragment does not handle encoding,
      # so the input is parsed as a document, the encoding is set on it, and
      # the contents of the generated <head>/<body> wrappers are moved into a
      # document fragment.
      #
      # @param html_string [String] markup to parse
      # @param encoding [String] encoding assigned to the parsed document
      # @return a fragment holding the unwrapped nodes, or the parsed
      #   document itself when its only child is not a single <html> element
      def parse_fragment(html_string, encoding = 'UTF-8')
        doc = Nokogiri::HTML5.parse(html_string)
        doc.encoding = encoding

        # no lone HTML root?  Return document as is
        return doc if doc.children.length != 1 || doc.children.first.name != 'html'

        fragment = Nokogiri::HTML::DocumentFragment.new(doc)

        # examine children of HTML element
        children = doc.children.first.children

        # head is always first.  If present, take children but otherwise
        # ignore the head element.  The name of the child itself is compared
        # (not assigned), so a leading non-head node is left in place.
        fragment << children.shift.children if children.first&.name == 'head'

        # body may be next, or last.  If found, take children but otherwise
        # ignore the body element.  Also take any remaining elements, taking
        # care to preserve order.
        if children.first&.name == 'body'
          fragment << children.shift.children
          fragment << children
        elsif children.last&.name == 'body'
          body = children.pop
          fragment << children
          fragment << body.children
        else
          fragment << children
        end

        fragment
      end

      module_function :parse_html, :parse_fragment
    end
  end
end

Version data entries

13 entries across 13 versions & 1 rubygems

Version Path
wovnrb-3.11.1 lib/wovnrb/helpers/nokogumbo_helper.rb
wovnrb-3.11.0 lib/wovnrb/helpers/nokogumbo_helper.rb
wovnrb-3.10.3 lib/wovnrb/helpers/nokogumbo_helper.rb
wovnrb-3.10.2 lib/wovnrb/helpers/nokogumbo_helper.rb
wovnrb-3.10.1 lib/wovnrb/helpers/nokogumbo_helper.rb
wovnrb-3.10.0 lib/wovnrb/helpers/nokogumbo_helper.rb
wovnrb-3.9.0 lib/wovnrb/helpers/nokogumbo_helper.rb
wovnrb-3.8.0 lib/wovnrb/helpers/nokogumbo_helper.rb
wovnrb-3.7.2 lib/wovnrb/helpers/nokogumbo_helper.rb
wovnrb-3.7.1 lib/wovnrb/helpers/nokogumbo_helper.rb
wovnrb-3.6.0 lib/wovnrb/helpers/nokogumbo_helper.rb
wovnrb-3.5.0 lib/wovnrb/helpers/nokogumbo_helper.rb
wovnrb-3.4.1 lib/wovnrb/helpers/nokogumbo_helper.rb