Sha256: ac967b724407cc1d6f85c26b57c1baf80a4445e10de665b04dff20d60d453b9c

Contents?: true

Size: 894 Bytes

Versions: 38

Compression:

Stored size: 894 Bytes

Contents

require "cgi"
require "nokogiri"
require 'yaml'

module Solrizer::HTML::Extractor

  #
  # Strips the HTML markup out of a datastream and returns the content to be
  # indexed in solr.
  #
  # The datastream content is HTML-unescaped first and then parsed with
  # Nokogiri. Two fields are written into +solr_doc+:
  #
  # * +:story_display+ -- the serialized children of a +<story>+ element. The
  #   key is only set when at least one such element is found; when there are
  #   several, each one overwrites the previous value, so the last one wins.
  # * +:story_t+ -- the content of every text node in the document, concatenated
  #   with no separator, in the order the XPath query returns them. This key is
  #   always set, even when the document has no text.
  #
  # @param [Datastream] ds object that responds to .content with HTML content
  # @param [Hash] solr_doc hash of values to be inserted into solr as a solr document
  # @return [Hash] the same +solr_doc+ object, updated in place
  def html_to_solr(ds, solr_doc = {})
    doc = Nokogiri::HTML(CGI.unescapeHTML(ds.content))

    # html to story_display
    doc.xpath('//story').each do |story|
      solr_doc[:story_display] = story.children.to_xml
    end

    # Strip out text and put it in story_t.
    # The node contents are joined rather than appended to a bare String.new:
    # String.new starts out as ASCII-8BIT, which left all-ASCII text tagged as
    # binary instead of carrying the encoding of the parsed content.
    solr_doc[:story_t] = doc.xpath('//text()').map(&:content).join

    solr_doc
  end

end

Version data entries

38 entries across 38 versions & 1 rubygems

Version Path
solrizer-3.4.1 lib/solrizer/html/extractor.rb
solrizer-3.4.0 lib/solrizer/html/extractor.rb
solrizer-3.3.0 lib/solrizer/html/extractor.rb
solrizer-3.2.0 lib/solrizer/html/extractor.rb
solrizer-2.2.0 lib/solrizer/html/extractor.rb
solrizer-3.1.1 lib/solrizer/html/extractor.rb
solrizer-3.1.0 lib/solrizer/html/extractor.rb
solrizer-3.0.0 lib/solrizer/html/extractor.rb
solrizer-3.0.0.rc2 lib/solrizer/html/extractor.rb
solrizer-3.0.0.rc1 lib/solrizer/html/extractor.rb
solrizer-3.0.0.pre8 lib/solrizer/html/extractor.rb
solrizer-3.0.0.pre7 lib/solrizer/html/extractor.rb
solrizer-3.0.0.pre6 lib/solrizer/html/extractor.rb
solrizer-3.0.0.pre5 lib/solrizer/html/extractor.rb
solrizer-3.0.0.pre4 lib/solrizer/html/extractor.rb
solrizer-3.0.0.pre3 lib/solrizer/html/extractor.rb
solrizer-3.0.0.pre2 lib/solrizer/html/extractor.rb
solrizer-3.0.0.pre1 lib/solrizer/html/extractor.rb
solrizer-2.1.0 lib/solrizer/html/extractor.rb
solrizer-2.1.0.rc1 lib/solrizer/html/extractor.rb