lib/solrizer/extractor.rb in solrizer-0.2.0 vs lib/solrizer/extractor.rb in solrizer-0.3.0
- old
+ new
@@ -2,88 +2,47 @@
require 'rexml/document'
require "nokogiri"
require 'yaml'
module Solrizer
-class Extractor
-
- def extract_tags(text)
- doc = REXML::Document.new( text )
- extract_tag(doc, 'archivist_tags').merge(extract_tag(doc, 'donor_tags'))
- end
-
- def extract_tag(doc, type)
- tags = doc.elements["/fields/#{type}"]
- return {} unless tags
- {type => tags.text.split(/,/).map {|t| t.strip}}
- end
+# Provides utilities for extracting solr fields from a variety of objects and/or creating solr documents from a given object
+# Note: These utilities are optional. You can implement .to_solr directly on your classes if you want to bypass using Extractors.
+#
+# Each of the Solrizer implementations provides its own Extractor module that extends the behaviors of Solrizer::Extractor
+# with methods specific to that implementation (e.g. extract_tag, extract_rels_ext, xml_to_solr, html_to_solr)
+#
+class Extractor
-
+ # Populates a solr doc with values from a hash.
+ # Accepts two forms of hashes:
+ # => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]}
+ # or
+ # => {:facets => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]} }
#
- # Extracts content-model and hydra-type from RELS-EXT datastream
- #
- def extract_rels_ext( text, solr_doc=Solr::Document.new )
- # TODO: only read in this file once
-
- if defined?(RAILS_ROOT)
- config_path = File.join(RAILS_ROOT, "config")
- else
- config_path = File.join(File.dirname(__FILE__), "..", "..", "config")
- end
- map = YAML.load(File.open(File.join(config_path, "hydra_types.yml")))
-
- doc = Nokogiri::XML(text)
- doc.xpath( '//foo:hasModel', 'foo' => 'info:fedora/fedora-system:def/model#' ).each do |element|
- cmodel = element.attributes['resource'].to_s
- solr_doc << Solr::Field.new( :cmodel_t => cmodel )
-
- if map.has_key?(cmodel)
- solr_doc << Solr::Field.new( :hydra_type_t => map[cmodel] )
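+ # Note that values for individual fields can be a single string or an array of strings (values of any other class are skipped); an optional :symbols hash is indexed the same way into *_s fields.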
+ def extract_hash( input_hash, solr_doc=Solr::Document.new ) # appends fields to solr_doc (a new Solr::Document by default) and returns it
+ facets = input_hash.has_key?(:facets) ? input_hash[:facets] : input_hash # use the nested :facets hash when present, otherwise treat the whole input as facets
+ facets.each_pair do |facet_name, value| # NOTE(review): without a :facets key this also visits the :symbols entry; it is skipped only when its value is neither String nor Array
+ case value.class.to_s # dispatch on the exact class name; there is no else branch, so other classes add nothing
+ when "String"
+ solr_doc << Solr::Field.new( :"#{facet_name}_facet" => "#{value}" ) # single value -> one <name>_facet field
+ when "Array"
+ value.each { |v| solr_doc << Solr::Field.new( :"#{facet_name}_facet" => "#{v}" ) } # one <name>_facet field per element, each interpolated into a string
end
end
-
- return solr_doc
- end
-
- #
- # This method extracts solr fields from simple xml
- #
- def xml_to_solr( text, solr_doc=Solr::Document.new )
- doc = REXML::Document.new( text )
- doc.root.elements.each do |element|
- solr_doc << Solr::Field.new( :"#{element.name}_t" => "#{element.text}" )
- end
-
- return solr_doc
- end
-
- #
- # This method strips html tags out and returns content to be indexed in solr
- #
- def html_content_to_solr( ds, solr_doc=Solr::Document.new )
- text = CGI.unescapeHTML(ds.content)
- doc = Nokogiri::HTML(text)
-
- # html to story_display
- stories = doc.xpath('//story')
-
- stories.each do |story|
- solr_doc << Solr::Field.new(:story_display => story.children.to_xml)
+ if input_hash.has_key?(:symbols) # optional second group, always read from the top-level hash
+ input_hash[:symbols].each do |symbol_name, value|
+ case value.class.to_s # same String/Array dispatch as the facet loop
+ when "String"
+ solr_doc << Solr::Field.new( :"#{symbol_name}_s" => "#{value}" ) # single value -> one <name>_s field
+ when "Array"
+ value.each { |v| solr_doc << Solr::Field.new( :"#{symbol_name}_s" => "#{v}" ) } # one <name>_s field per element
+ end
+ end
end
-
- #strip out text and put in story_t
- text_nodes = doc.xpath("//text()")
- text = String.new
-
- text_nodes.each do |text_node|
- text << text_node.content
- end
-
- solr_doc << Solr::Field.new(:story_t => text)
-
- return solr_doc
+ return solr_doc # the same document object that was passed in (or the default one), now populated
end
end
end