require 'solr'
require 'rexml/document'
require "nokogiri"
require 'yaml'
require 'cgi' # provides CGI.unescapeHTML, used by html_content_to_solr

module Solrizer
  # Turns XML / HTML datastream content into tag hashes and Solr fields.
  class Extractor
    #
    # Parses +text+ as XML and collects the archivist and donor tags.
    #
    # Returns a Hash with an 'archivist_tags' and/or 'donor_tags' key (each
    # present only when the matching element exists), mapped to an Array of
    # stripped tag strings.
    #
    def extract_tags(text)
      doc = REXML::Document.new(text)
      extract_tag(doc, 'archivist_tags').merge(extract_tag(doc, 'donor_tags'))
    end

    #
    # Reads the comma-separated tag list stored at /fields/<type> in +doc+.
    #
    # Returns {} when the element is absent, otherwise {type => [tag, ...]}.
    # An element with no text yields {type => []}.
    #
    def extract_tag(doc, type)
      tags = doc.elements["/fields/#{type}"]
      return {} unless tags

      # REXML returns nil for the text of an empty element; coerce so that
      # an empty element gives an empty list instead of raising.
      { type => tags.text.to_s.split(/,/).map(&:strip) }
    end

    #
    # Extracts content-model and hydra-type from RELS-EXT datastream
    #
    # For every hasModel element (fedora-system model namespace) a :cmodel_t
    # field is appended to +solr_doc+; when the model appears in the
    # hydra_types.yml mapping a :hydra_type_t field is appended as well.
    #
    # Returns +solr_doc+.
    #
    def extract_rels_ext(text, solr_doc = Solr::Document.new)
      # Loaded up front so a missing config file fails even when the
      # datastream contains no hasModel elements.
      type_map = hydra_type_map

      doc = Nokogiri::XML(text)
      doc.xpath('//foo:hasModel', 'foo' => 'info:fedora/fedora-system:def/model#').each do |element|
        cmodel = element.attributes['resource'].to_s
        solr_doc << Solr::Field.new(:cmodel_t => cmodel)
        solr_doc << Solr::Field.new(:hydra_type_t => type_map[cmodel]) if type_map.key?(cmodel)
      end

      solr_doc
    end

    #
    # This method extracts solr fields from simple xml
    #
    # Each child element of the root becomes a "<element name>_t" field whose
    # value is the element's text ("" when the element has none).
    # Input without a root element leaves +solr_doc+ unchanged.
    #
    # Returns +solr_doc+.
    #
    def xml_to_solr(text, solr_doc = Solr::Document.new)
      root = REXML::Document.new(text).root
      return solr_doc unless root # empty / root-less input: nothing to index

      root.elements.each do |element|
        solr_doc << Solr::Field.new(:"#{element.name}_t" => element.text.to_s)
      end

      solr_doc
    end

    #
    # This method strips html tags out and returns content to be indexed in solr
    #
    # +ds+ must respond to #content. The content is HTML-unescaped and parsed
    # as HTML; every <story> element contributes a :story_display field
    # holding its serialized children, and the concatenation of all text
    # nodes in the document is stored in a single :story_t field.
    #
    # Returns +solr_doc+.
    #
    def html_content_to_solr(ds, solr_doc = Solr::Document.new)
      doc = Nokogiri::HTML(CGI.unescapeHTML(ds.content))

      # html to story_display
      doc.xpath('//story').each do |story|
        solr_doc << Solr::Field.new(:story_display => story.children.to_xml)
      end

      # strip out text and put in story_t
      plain_text = doc.xpath("//text()").map(&:content).join
      solr_doc << Solr::Field.new(:story_t => plain_text)

      solr_doc
    end

    private

    #
    # Mapping loaded from config/hydra_types.yml, read once per Extractor
    # instance. The config directory is RAILS_ROOT/config when RAILS_ROOT is
    # defined, otherwise ../../config relative to this file.
    #
    def hydra_type_map
      @hydra_type_map ||= begin
        config_path =
          if defined?(RAILS_ROOT)
            File.join(RAILS_ROOT, "config")
          else
            File.join(File.dirname(__FILE__), "..", "..", "config")
          end

        # Block form closes the file handle; an empty file parses to a
        # non-hash value, so fall back to an empty mapping.
        File.open(File.join(config_path, "hydra_types.yml")) { |file| YAML.load(file) } || {}
      end
    end
  end
end