extractor.rb in solrizer-0.3.0

- old
+ new
@@ -2,88 +2,47 @@
 require 'rexml/document'
 require "nokogiri"
 require 'yaml'
 
 module Solrizer
-class Extractor
   
-  
-  def extract_tags(text)
-    doc = REXML::Document.new( text )
-    extract_tag(doc, 'archivist_tags').merge(extract_tag(doc, 'donor_tags'))
-  end
-  
-  def extract_tag(doc, type)
-    tags = doc.elements["/fields/#{type}"]
-    return {} unless tags
-    {type => tags.text.split(/,/).map {|t| t.strip}}
-  end
+# Provides utilities for extracting solr fields from a variety of objects and/or creating solr documents from a given object
+# Note: These utilities are optional.  You can implement .to_solr directly on your classes if you want to bypass using Extractors.
+#
+# Each of the Solrizer implementations provides its own Extractor module that extends the behaviors of Solrizer::Extractor
+# with methods specific to that implementation (e.g. extract_tag, extract_rels_ext, xml_to_solr, html_to_solr)
+#
+class Extractor
 
-  
+  # Populates a solr doc with "<name>_facet" fields built from a hash, plus "<name>_s" fields from an optional :symbols entry.
+  # Accepts two forms of hashes for the facet values:
+  # => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]}
+  # or
+  # => {:facets => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]} }
   #
-  # Extracts content-model and hydra-type from RELS-EXT datastream
-  #
-  def extract_rels_ext( text, solr_doc=Solr::Document.new )
-    # TODO: only read in this file once
-    
-    if defined?(RAILS_ROOT)
-      config_path = File.join(RAILS_ROOT, "config")
-    else
-      config_path = File.join(File.dirname(__FILE__), "..", "..", "config")
-    end    
-    map = YAML.load(File.open(File.join(config_path, "hydra_types.yml")))
-    
-    doc = Nokogiri::XML(text)
-    doc.xpath( '//foo:hasModel', 'foo' => 'info:fedora/fedora-system:def/model#' ).each do |element|
-      cmodel = element.attributes['resource'].to_s
-      solr_doc << Solr::Field.new( :cmodel_t => cmodel )
-      
-      if map.has_key?(cmodel)
-        solr_doc << Solr::Field.new( :hydra_type_t => map[cmodel] )
+  # Note that values for individual fields can be a single String or an Array (one field is added per element); values of any other class are skipped.
+  def extract_hash( input_hash, solr_doc=Solr::Document.new )    
+    facets = input_hash.has_key?(:facets) ? input_hash[:facets] : input_hash
+    facets.each_pair do |facet_name, value|
+      case value.class.to_s
+      when "String"
+        solr_doc << Solr::Field.new( :"#{facet_name}_facet" => "#{value}" )
+      when "Array"
+        value.each { |v| solr_doc << Solr::Field.new( :"#{facet_name}_facet" => "#{v}" ) } 
       end
     end
-
-    return solr_doc
-  end
-
-  #
-  # This method extracts solr fields from simple xml
-  #
-  def xml_to_solr( text, solr_doc=Solr::Document.new )
-    doc = REXML::Document.new( text )
-    doc.root.elements.each do |element|
-      solr_doc << Solr::Field.new( :"#{element.name}_t" => "#{element.text}" )
-    end
-
-    return solr_doc
-  end
-  
-  #
-  # This method strips html tags out and returns content to be indexed in solr
-  #
-  def html_content_to_solr( ds, solr_doc=Solr::Document.new )
     
-    text = CGI.unescapeHTML(ds.content)
-    doc = Nokogiri::HTML(text)
-    
-    # html to story_display
-    stories = doc.xpath('//story')
-        
-    stories.each do |story|
-      solr_doc << Solr::Field.new(:story_display => story.children.to_xml)
+    if input_hash.has_key?(:symbols) 
+      input_hash[:symbols].each do |symbol_name, value|
+        case value.class.to_s
+        when "String"
+          solr_doc << Solr::Field.new( :"#{symbol_name}_s" => "#{value}" )
+	      when "Array"
+          value.each { |v| solr_doc << Solr::Field.new( :"#{symbol_name}_s" => "#{v}" ) } 
+        end
+      end
     end
-    
-    #strip out text and put in story_t
-    text_nodes = doc.xpath("//text()")
-    text = String.new
-    
-     text_nodes.each do |text_node|
-       text << text_node.content
-     end
-    
-     solr_doc << Solr::Field.new(:story_t => text)
-     
-     return solr_doc
+    return solr_doc
   end
   
 end
 end