lib/solrizer/extractor.rb in solrizer-0.2.0 vs lib/solrizer/extractor.rb in solrizer-0.3.0

- old
+ new

@@ -2,88 +2,47 @@
 require 'rexml/document'
 require "nokogiri"
 require 'yaml'
 module Solrizer
-class Extractor
-
-  def extract_tags(text)
-    doc = REXML::Document.new( text )
-    extract_tag(doc, 'archivist_tags').merge(extract_tag(doc, 'donor_tags'))
-  end
-
-  def extract_tag(doc, type)
-    tags = doc.elements["/fields/#{type}"]
-    return {} unless tags
-    {type => tags.text.split(/,/).map {|t| t.strip}}
-  end
+# Provides utilities for extracting solr fields from a variety of objects and/or creating solr documents from a given object
+# Note: These utilities are optional. You can implement .to_solr directly on your classes if you want to bypass using Extractors.
+#
+# Each of the Solrizer implementations provides its own Extractor module that extends the behaviors of Solrizer::Extractor
+# with methods specific to that implementation (ie. extract_tag, extract_rels_ext, xml_to_solr, html_to_solr)
+#
+class Extractor
-
+  # Populates a solr doc with values from a hash.
+  # Accepts two forms of hashes:
+  # => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]}
+  # or
+  # => {:facets => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]} }
   #
-  # Extracts content-model and hydra-type from RELS-EXT datastream
-  #
-  def extract_rels_ext( text, solr_doc=Solr::Document.new )
-    # TODO: only read in this file once
-
-    if defined?(RAILS_ROOT)
-      config_path = File.join(RAILS_ROOT, "config")
-    else
-      config_path = File.join(File.dirname(__FILE__), "..", "..", "config")
-    end
-    map = YAML.load(File.open(File.join(config_path, "hydra_types.yml")))
-
-    doc = Nokogiri::XML(text)
-    doc.xpath( '//foo:hasModel', 'foo' => 'info:fedora/fedora-system:def/model#' ).each do |element|
-      cmodel = element.attributes['resource'].to_s
-      solr_doc << Solr::Field.new( :cmodel_t => cmodel )
-
-      if map.has_key?(cmodel)
-        solr_doc << Solr::Field.new( :hydra_type_t => map[cmodel] )
+  # Note that values for individual fields can be a single string or an array of strings.
+  def extract_hash( input_hash, solr_doc=Solr::Document.new )
+    facets = input_hash.has_key?(:facets) ? input_hash[:facets] : input_hash
+    facets.each_pair do |facet_name, value|
+      case value.class.to_s
+      when "String"
+        solr_doc << Solr::Field.new( :"#{facet_name}_facet" => "#{value}" )
+      when "Array"
+        value.each { |v| solr_doc << Solr::Field.new( :"#{facet_name}_facet" => "#{v}" ) }
       end
     end
-
-    return solr_doc
-  end
-
-  #
-  # This method extracts solr fields from simple xml
-  #
-  def xml_to_solr( text, solr_doc=Solr::Document.new )
-    doc = REXML::Document.new( text )
-    doc.root.elements.each do |element|
-      solr_doc << Solr::Field.new( :"#{element.name}_t" => "#{element.text}" )
-    end
-
-    return solr_doc
-  end
-
-  #
-  # This method strips html tags out and returns content to be indexed in solr
-  #
-  def html_content_to_solr( ds, solr_doc=Solr::Document.new )
-    text = CGI.unescapeHTML(ds.content)
-    doc = Nokogiri::HTML(text)
-
-    # html to story_display
-    stories = doc.xpath('//story')
-
-    stories.each do |story|
-      solr_doc << Solr::Field.new(:story_display => story.children.to_xml)
+    if input_hash.has_key?(:symbols)
+      input_hash[:symbols].each do |symbol_name, value|
+        case value.class.to_s
+        when "String"
+          solr_doc << Solr::Field.new( :"#{symbol_name}_s" => "#{value}" )
+        when "Array"
+          value.each { |v| solr_doc << Solr::Field.new( :"#{symbol_name}_s" => "#{v}" ) }
+        end
+      end
     end
-
-    #strip out text and put in story_t
-    text_nodes = doc.xpath("//text()")
-    text = String.new
-
-    text_nodes.each do |text_node|
-      text << text_node.content
-    end
-
-    solr_doc << Solr::Field.new(:story_t => text)
-
-    return solr_doc
+    return solr_doc
   end
 end
 end