require 'net/http'
require 'net/https'
require 'uri'
require 'cgi'
require 'rexml/document'

include REXML

# Class-level helpers for talking to the DOR, Fedora and workflow HTTP
# services.
#
# Every method is a singleton method on DorService. The methods read
# constants that are not defined in this file (DOR_URI, FEDORA_URI,
# FEDORA_USER, FEDORA_PASS, WORKFLOW_URI, CERT_FILE, KEY_FILE, KEY_PASS) and
# use LyberCore for logging and for POSTing form data.
class DorService

  # Builds a Net::HTTP object for the host/port of +url+. For https URLs the
  # client certificate and key are loaded from CERT_FILE / KEY_FILE.
  #
  # @param url [URI] parsed target URL
  # @return [Net::HTTP] an unopened connection
  def self.get_https_connection(url)
    https = Net::HTTP.new(url.host, url.port)
    if url.scheme == 'https'
      https.use_ssl = true
      https.cert = OpenSSL::X509::Certificate.new(File.read(CERT_FILE))
      https.key = OpenSSL::PKey::RSA.new(File.read(KEY_FILE), KEY_PASS)
      # NOTE(review): VERIFY_NONE disables verification of the server
      # certificate. Left unchanged because no CA configuration is visible
      # in this file - confirm whether this is still intended.
      https.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end
    https
  end

  # POSTs +form_data+ to DOR_URI/objects and returns whatever follows
  # "/objects/" in the response (nil when the response has no such segment).
  #
  # TODO: this should check whether an object with the given PID already
  # exists in the repository.
  #
  # @param form_data [Hash] see DorService.encodeParams
  # @return [String, nil] the new druid
  # @raise re-raises any error after logging it
  def self.create_object(form_data)
    url = DOR_URI + '/objects'
    body = DorService.encodeParams(form_data)
    content_type = 'application/x-www-form-urlencoded'
    res = LyberCore::Connection.post(url, body, :content_type => content_type)
    # String#[] with a capture index avoids reading a stale $1
    res.to_s[%r{/objects/(.*)}, 1]
  rescue StandardError => e
    LyberCore::Log.error("Unable to create object #{e.backtrace}")
    raise
  end

  # POSTs to objects/<parent_druid>/resources with parameters model and id,
  # creating an object of model dor:googleScannedPage.
  #
  # @param parent_druid [String]
  # @param child_id [String]
  # @return [String, nil] whatever follows "/resources/" in the response
  # @raise re-raises any error with the message "Unable to create object "
  def self.create_child_object(parent_druid, child_id)
    # See if page exists before creating new fedora object
    # raise "Object exists with id: " + child_id if(DorService.get_druid_by_id(child_id))
    form_data = { 'model' => 'dor:googleScannedPage', 'id' => child_id }
    url = DOR_URI + '/objects/' + parent_druid + '/resources'
    body = DorService.encodeParams(form_data)
    content_type = 'application/x-www-form-urlencoded'
    res = LyberCore::Connection.post(url, body, :content_type => content_type)
    druid = res.to_s[%r{/resources/(.*)}, 1]

    LyberCore::Log.info("Child googleScannedPage object created for parent #{parent_druid}")
    LyberCore::Log.debug("child_id = #{child_id}")
    LyberCore::Log.debug("new druid = #{druid}")
    druid
  rescue StandardError => e
    LyberCore::Log.error("Unable to create object")
    raise e, "Unable to create object "
  end

  # Builds an x-www-form-urlencoded string for POSTing form parameters.
  #
  # == Parameters
  # - form_data - a hash whose values are arrays of strings, ie.
  #   {'param1' => ['val1', 'val2'], 'param2' => ['val3']}. A bare (non-array)
  #   value is treated as a one-element array, which is what
  #   create_child_object passes.
  #
  # Values are CGI-escaped; parameter names are emitted as given.
  #
  # @return [String] "param1=val1&param1=val2&param2=val3" ("" for an empty hash)
  def self.encodeParams(form_data)
    pairs = []
    form_data.each_pair do |param, values|
      Array(values).each do |value|
        pairs << "#{param}=#{CGI.escape(value.to_s)}"
      end
    end
    pairs.join('&')
  end

  # Deprecated. Use Dor::WorkflowService#create_workflow in lyber_core gem
  #  def DorService.create_workflow(workflow, druid)
  #    begin
  #      url = URI.parse(DOR_URI + '/objects/' + druid + '/workflows/' + workflow.workflow_id)
  #      req = Net::HTTP::Put.new(url.path)
  #      #req.basic_auth 'fedoraUser', 'pass'
  #      req.body = workflow.workflow_process_xml
  #      req.content_type = 'application/xml'
  #      res = DorService.get_https_connection(url).start {|http| http.request(req) }
  #
  #      WorkflowService.create_workflow()
  #
  #      case res
  #        when Net::HTTPSuccess
  #          puts workflow.workflow_id + " created for " + druid
  #        else
  #          $stderr.print res.body
  #          raise res.error!
  #      end
  #    rescue Exception => e
  #      $stderr.print "Unable to create workflow " + e
  #      raise
  #    end
  #  end

  # See if an object exists with this dor_id (not druid, but sub-identifier).
  # Caller will have to handle any exception thrown.
  #
  # @param dor_id [String] interpolated into the query string as given
  # @return [String, nil] the value of the first druid="..." attribute in a
  #   successful response; nil on a 4xx response
  # @raise [RuntimeError] on a 5xx or any other non-success response
  def self.get_druid_by_id(dor_id)
    url_string = "#{DOR_URI}/query_by_id?id=#{dor_id}"
    LyberCore::Log.debug("Fetching druid for dor_id #{dor_id} at url #{url_string}")
    url = URI.parse(url_string)
    req = Net::HTTP::Get.new(url.request_uri)
    res = DorService.get_https_connection(url).start { |http| http.request(req) }

    case res
    when Net::HTTPSuccess
      res.body.to_s[/druid="([^"\r\n]*)"/, 1]
    when Net::HTTPClientError
      LyberCore::Log.debug("Barcode does not yet exist in DOR: #{dor_id}")
      nil
    when Net::HTTPServerError
      LyberCore::Log.error("Encountered HTTPServerError error when requesting #{url}: #{res.inspect}")
      raise "Encountered 500 error when requesting #{url}: #{res.inspect}"
    else
      LyberCore::Log.error("Encountered unknown error when requesting #{url}: #{res.inspect}")
      raise "Encountered unknown error when requesting #{url}: #{res.inspect}"
    end
  end

  ############################################# Start of Datastream methods
  # Until ActiveFedora supports client-side certificate configuration, we are
  # stuck with our own methods to access datastreams
  #
  # /objects/{pid}/datastreams/{dsID} ? [controlGroup] [dsLocation] [altIDs]
  #   [dsLabel] [versionable] [dsState] [formatURI] [checksumType] [checksum]
  #   [logMessage]

  # Adds a datastream; delegates to add_datastream_managed.
  def self.add_datastream(druid, ds_id, ds_label, xml, content_type = 'application/xml', versionable = false)
    DorService.add_datastream_managed(druid, ds_id, ds_label, xml, content_type, versionable)
  end

  # POSTs a datastream with controlGroup=E whose dsLocation is +ext_url+.
  # +ext_url+ is appended to the query string as given (not escaped).
  def self.add_datastream_external_url(druid, ds_id, ds_label, ext_url, content_type, versionable = false)
    parms = '?controlGroup=E'
    parms += '&dsLabel=' + CGI.escape(ds_label)
    parms += '&versionable=false' unless versionable
    parms += '&dsLocation=' + ext_url
    DorService.set_datastream(druid, ds_id, parms, :post, { :type => content_type })
  end

  # PUTs new content to a datastream with controlGroup=M.
  def self.update_datastream(druid, ds_id, xml, content_type = 'application/xml', versionable = false)
    parms = '?controlGroup=M'
    parms += '&versionable=false' unless versionable
    DorService.set_datastream(druid, ds_id, parms, :put, { :type => content_type, :xml => xml })
  end

  # POSTs a new datastream with controlGroup=M and the given label.
  def self.add_datastream_managed(druid, ds_id, ds_label, xml, content_type = 'application/xml', versionable = false)
    parms = '?controlGroup=M'
    parms += '&dsLabel=' + CGI.escape(ds_label)
    parms += '&versionable=false' unless versionable
    DorService.set_datastream(druid, ds_id, parms, :post, { :type => content_type, :xml => xml })
  end

  # Retrieve the content of a datastream of a DOR object
  # e.g. FEDORA_URI + /objects/ + druid + /datastreams/dor/content gets "dor"
  # datastream content
  #
  # @return [String, nil] the response body; nil on a 4xx response
  # @raise [RuntimeError] on a 5xx or any other non-success response
  def self.get_datastream(druid, ds_id)
    LyberCore::Log.debug("Connecting to #{FEDORA_URI}...")
    url_string = "#{FEDORA_URI}/objects/#{druid}/datastreams/#{ds_id}/content"
    url = URI.parse(url_string)
    LyberCore::Log.debug("Connecting to #{url_string}...")
    req = Net::HTTP::Get.new(url.request_uri)
    LyberCore::Log.debug("request object: #{req.inspect}")
    res = DorService.get_https_connection(url).start { |http| http.request(req) }

    case res
    when Net::HTTPSuccess
      res.body
    when Net::HTTPClientError
      LyberCore::Log.debug("Datastream not found at url #{url_string}")
      nil
    when Net::HTTPServerError
      LyberCore::Log.error("Attempted to reach #{url_string} but failed")
      raise "Encountered 500 error when requesting #{url_string}: #{res.inspect}"
    else
      LyberCore::Log.error("Encountered unknown error when requesting #{url}: #{res.inspect}")
      raise "Encountered unknown error when requesting #{url}: #{res.inspect}"
    end
  end

  # Deprecated - use Dor::WorkflowService#get_workflow_xml
  #
  # @raise [Exception] always
  def self.get_workflow_xml(druid, workflow)
    raise Exception, "This method is deprecated.  Please use Dor::WorkflowService#get_workflow_xml"
  end

  # Retrieve the metadata of a datastream of a DOR object
  # e.g. FEDORA_URI + /objects/ + druid + /datastreams/dor gets "dor"
  # datastream metadata
  #
  # The request uses basic auth with FEDORA_USER / FEDORA_PASS.
  #
  # @return [String] the response body
  # @raise any failure (including a non-success response) is raised with the
  #   message "Couldn't get datastream from <url>"
  def self.get_datastream_md(druid, ds_id)
    LyberCore::Log.debug("Connecting to #{FEDORA_URI}...")
    url = URI.parse(FEDORA_URI + '/objects/' + druid + '/datastreams/' + ds_id)
    LyberCore::Log.debug("Connecting to #{url}...")
    req = Net::HTTP::Get.new(url.request_uri)
    req.basic_auth FEDORA_USER, FEDORA_PASS
    LyberCore::Log.debug("request object: #{req.inspect}")
    res = DorService.get_https_connection(url).start { |http| http.request(req) }

    case res
    when Net::HTTPSuccess
      res.body
    else
      LyberCore::Log.error("Attempted to reach #{url} but failed")
      LyberCore::Log.error("Datastream #{ds_id} not found for #{druid}")
      # The previous code referenced an undefined local here, so a failed
      # lookup always surfaced as an exception; keep raising so callers see
      # the same outcome.
      raise "Datastream #{ds_id} not found for #{druid}"
    end
  rescue StandardError => e
    raise e, "Couldn't get datastream from #{url}"
  end

  # Add a new datastream, but only if it does not yet exist
  #
  # @raise [RuntimeError] when +xml+ is nil or false
  def self.add_datastream_unless_exists(druid, ds_id, ds_label, xml)
    # make sure xml is not empty
    raise "No data supplied for datastream " + ds_id + " of " + druid unless xml

    # check to make sure datastream does not yet exist
    unless DorService.get_datastream(druid, ds_id)
      DorService.add_datastream(druid, ds_id, ds_label, xml)
    end
  end

  ############################################# End of Datastream methods

  # Deprecated.  Use Dor::WorkflowService#update_workflow_status
  # PUT "objects/pid:123/workflows/GoogleScannedWF/convert"
  # NOTE(review): the example request body and the process_xml literal below
  # appear to have lost their XML markup in this copy of the file.
  # TODO increment attempts
  #  def DorService.updateWorkflowStatus(repository, druid, workflow, process, status, elapsed = 0, lifecycle = nil)
  #    begin
  #      url = URI.parse(WORKFLOW_URI + '/' + repository + '/objects/' + druid + '/workflows/' + workflow + '/' + process)
  #      req = Net::HTTP::Put.new(url.path)
  #      process_xml = ''
  #      req.body = process_xml
  #      req.content_type = 'application/xml'
  #      res = DorService.get_https_connection(url).start {|http| http.request(req) }
  #      case res
  #        when Net::HTTPSuccess
  #          puts "#{workflow} process updated for " + druid
  #        else
  #          $stderr.print res.body
  #          raise res.error!
  #      end
  #    rescue Exception => e
  #      $stderr.print "Unable to update workflow " + e
  #      raise
  #    end
  #
  #  end

  # Returns string containing object list XML from a workflow DOR query
  #
  # @param [String] repository name of the repository you are querying. Right now, dor and sdr are supported
  # @param [String] workflow name of the workflow being queried, eg googleScannedBookWF
  # @param [String, Array] completed if only querying for one completed step, pass in a String.
  #   If querying for two completed steps, pass in an Array of the two completed steps
  # @param [String] waiting the name of the waiting step
  # @raise [LyberCore::Exceptions::EmptyQueue] When the query is successful, but no objects are found in that queue
  # @raise [Exception] For other problems like connection failures
  # @return [String] XML containing all the objects that match the specific query.
  #   The code reads the count from //objects/@count of that document.
  #   NOTE(review): the sample XML that used to follow here was lost in this
  #   copy of the file.
  def self.get_objects_for_workstep(repository, workflow, completed, waiting)
    LyberCore::Log.debug("DorService.get_objects_for_workstep(#{repository}, #{workflow}, #{completed}, #{waiting})")

    # Missing arguments are only logged; the query is still attempted.
    if repository.nil? || workflow.nil? || completed.nil? || waiting.nil?
      LyberCore::Log.fatal("Can't execute DorService.get_objects_for_workstep: missing info")
    end

    unless defined?(WORKFLOW_URI) && !WORKFLOW_URI.nil?
      LyberCore::Log.fatal("WORKFLOW_URI is not set. ROBOT_ROOT = #{ROBOT_ROOT}")
      raise "WORKFLOW_URI is not set"
    end

    uri_string = "#{WORKFLOW_URI}/workflow_queue?repository=#{repository}&workflow=#{workflow}&waiting=#{waiting}"
    if completed.is_a?(Array)
      raise "The workflow service can only handle queries with no more than 2 completed steps" if completed.size > 2
      completed.each { |step| uri_string << "&completed=#{step}" }
    else
      uri_string << "&completed=#{completed}"
    end

    LyberCore::Log.info("Attempting to connect to #{uri_string}")
    url = URI.parse(uri_string)
    req = Net::HTTP::Get.new(url.request_uri)
    res = DorService.get_https_connection(url).start { |http| http.request(req) }

    case res
    when Net::HTTPSuccess
      begin
        doc = Nokogiri::XML(res.body)
        count = doc.root.at_xpath("//objects/@count").content.to_i
      rescue StandardError => e
        msg = "Could not parse response from Workflow Service"
        LyberCore::Log.error(msg + "\n#{res.body}")
        raise e, msg
      end

      raise LyberCore::Exceptions::EmptyQueue.new, "empty queue" if count == 0
      res.body
    else
      LyberCore::Log.fatal("Workflow queue not found for #{workflow} : #{waiting}")
      LyberCore::Log.debug("I am attempting to connect to WORKFLOW_URI #{WORKFLOW_URI}")
      LyberCore::Log.debug("repository: #{repository}")
      LyberCore::Log.debug("workflow: #{workflow}")
      LyberCore::Log.debug("completed: #{completed}")
      LyberCore::Log.debug("waiting: #{waiting}")
      LyberCore::Log.debug(res.inspect)
      raise "Could not connect to url #{uri_string}"
    end
  end

  # Placeholder with an empty body; it does nothing and returns nil.
  def self.log_and_raise_workflow_connection_problem(repository, workflow, completed, waiting, response)
  end

  # Transforms the XML from getObjectsForWorkStep into a list of druids
  # TODO figure out how to return a partial list
  # This method is here for backward compatibility, but it has
  # been superseded by DlssService.get_druids_from_object_list(objectListXml)
  def self.get_druids_from_object_list(objectListXml)
    DlssService.get_all_druids_from_object_list(objectListXml)
  end

  # Retrieves the identityMetadata datastream for a DOR object,
  # extracts the otherId values, and returns them in a hash
  #
  # @return [Hash] otherId "name" attribute => stripped element text
  #   (nil when the element has no text)
  # @raise any failure is raised with the message
  #   "Couldn't get object identifiers for <druid>"
  def self.get_object_identifiers(druid)
    identity_xml = get_datastream(druid, 'identityMetadata')
    raise "Unable to get identityMetadata datastream for #{druid}" if identity_xml.nil?

    identifiers = {}
    REXML::Document.new(identity_xml).elements.each("identityMetadata/otherId") do |element|
      text = element.text
      identifiers[element.attributes["name"]] = text.nil? ? nil : text.strip
    end
    identifiers
  rescue StandardError => e
    raise e, "Couldn't get object identifiers for #{druid}"
  end

  # Copies sourceDir + objectid to destinationDir with "rsync -a -e ssh".
  #
  # @return [Boolean] whether destinationDir/objectid exists afterwards
  def self.transfer_object(objectid, sourceDir, destinationDir)
    source = sourceDir + objectid
    LyberCore::Log.debug("rsync -a -e ssh '" + source + "' " + destinationDir + "\n")
    # argv form: no shell is involved, so odd characters in the paths cannot
    # be interpreted as shell syntax
    system('rsync', '-a', '-e', 'ssh', source, destinationDir)
    File.exist?(File.join(destinationDir, objectid))
  end

  # Runs "md5sum -c checksumFile" inside +directory+.
  #
  # @return [Boolean] true when every output line contains "OK" and md5sum
  #   itself exited successfully
  def self.verify_checksums(directory, checksumFile)
    # :chdir scopes the directory change to the child process, so the
    # caller's working directory is never touched
    report = IO.popen(['md5sum', '-c', checksumFile], :chdir => directory) { |io| io.read }
    badcount = report.lines.count { |line| !line.include?('OK') }
    # the exit status also catches an unreadable or malformed checksum file,
    # which produces no output lines at all
    badcount == 0 && $?.success?
  end

  # Given a process and an error message, construct an xml fragment that can be
  # posted to the workflow service to record the error generated for a given druid
  #
  # Whitespace runs are collapsed, the characters ` ' # < > are removed and
  # double quotes become single quotes before the values are embedded.
  #
  # NOTE(review): the XML literal of this method was lost in this copy of the
  # file (only "body = '" survived). The element and attribute names below
  # are a reconstruction - verify against version control before relying on
  # them.
  def self.construct_error_update_request(process, error_msg, error_txt)
    clean_error_msg = error_msg.gsub(/\s+/, " ").gsub(/[`'#<>]/, '').gsub(/"/, "'")
    clean_error_txt = error_txt.gsub(/\s+/, " ").gsub(/[`'#<>]/, '').gsub(/"/, "'") unless error_txt.nil?
    body = '<process name="' + process + '" status="error" errorMessage="' + clean_error_msg + '" '
    body += 'errorText="' + clean_error_txt + '" ' unless error_txt.nil?
    body += '/>'
    body
  end

  # NOTE(review): a method that followed construct_error_update_request was
  # damaged in this copy of the file. Its name, signature and most of its
  # body are missing, so it has NOT been reconstructed. It must be restored
  # from version control. The surviving tail is kept verbatim below:
  #
  #      e
  #      msg = "Unable to update workflow service at url #{url_string}"
  #      LyberCore::Log.error(msg)
  #      raise e, msg
  #    end
  #  end

  # This method sends a GET request to Symphony (the host hard-coded below)
  # and returns MARC XML
  #
  # @param flexkey [String]
  # @return [String] the response body
  # @raise any failure is raised with the message
  #   "Encountered an error from symphony"
  def self.query_symphony(flexkey)
    symphony_url = 'http://zaph.stanford.edu'
    path_info = '/cgi-bin/holding.pl?'
    # URI.escape no longer exists on Ruby >= 3.0; escape only the value
    parm_list = 'search=location&flexkey=' + CGI.escape(flexkey)
    url_string = symphony_url + path_info + parm_list

    url = URI.parse(url_string)
    LyberCore::Log.debug("Attempting to query symphony: #{url_string}")
    res = Net::HTTP.start(url.host, url.port) { |http| http.get(path_info + parm_list) }

    case res
    when Net::HTTPSuccess
      LyberCore::Log.debug("Successfully queried symphony for #{flexkey}")
      res.body
    else
      LyberCore::Log.error("Encountered an error from symphony: #{res.body}")
      res.error! # raises the exception that matches the response class
    end
  rescue StandardError => e
    raise e, "Encountered an error from symphony"
  end # query_symphony

  # A bare `private` used to stand here. It has no effect on methods defined
  # with an explicit receiver, and the methods below are invoked as
  # DorService.set_datastream(...), so they remain public.

  # Sends a POST or PUT to FEDORA_URI/objects/<druid>/datastreams/<ds_id><parms>
  # using basic auth.
  #
  # @param parms [String] query string, including the leading "?"
  # @param method [Symbol] :post or :put
  # @param content [Hash] :type => request content type, :xml => optional body
  # @return [true] on a success response
  # @raise [ArgumentError] for any other +method+
  # @raise [RuntimeError] on a non-success response
  def self.set_datastream(druid, ds_id, parms, method, content = {})
    url = URI.parse(FEDORA_URI + '/objects/' + druid + '/datastreams/' + ds_id + parms)
    req = case method
          when :post then Net::HTTP::Post.new(url.request_uri)
          when :put then Net::HTTP::Put.new(url.request_uri)
          else raise ArgumentError, "Unsupported HTTP method: #{method.inspect}"
          end
    req.basic_auth FEDORA_USER, FEDORA_PASS
    req.body = content[:xml] if content[:xml]
    req.content_type = content[:type]
    res = DorService.get_https_connection(url).start { |http| http.request(req) }

    case res
    when Net::HTTPSuccess
      true
    when Net::HTTPServerError
      LyberCore::Log.error("Attempted to set datastream #{url} but failed")
      raise "Encountered 500 error setting datastream #{url}: #{res.inspect}"
    else
      LyberCore::Log.error("Encountered unknown error when setting datastream #{url}: #{res.inspect}")
      raise "Encountered unknown error when setting datastream #{url}: #{res.inspect}"
    end
  end

  # Concatenates the identityMetadata, descMetadata, googlemets,
  # contentMetadata and adminMetadata datastreams of +druid+.
  #
  # get_datastream returns nil for a missing datastream, in which case the
  # concatenation below raises TypeError.
  #
  # NOTE(review): the leading and trailing "\n" literals may have lost a
  # wrapping XML element in this copy of the file - confirm.
  def self.get_object_metadata(druid)
    dor = DorService.get_datastream(druid, 'identityMetadata')
    mods = DorService.get_datastream(druid, 'descMetadata')
    googlemets = DorService.get_datastream(druid, 'googlemets')
    contentMetadata = DorService.get_datastream(druid, 'contentMetadata')
    adminMetadata = DorService.get_datastream(druid, 'adminMetadata')
    "\n" + dor + mods + googlemets + contentMetadata + adminMetadata + "\n"
  end

  # Given an array of strings, construct valid xml in which each
  # member of the array becomes a <tag> element
  #
  # Whitespace runs in each tag are collapsed and the characters < > ! are
  # removed.
  #
  # NOTE(review): the element names were lost in this copy of the file (the
  # literals were empty strings); <tags>/<tag> is a reconstruction based on
  # the method name and the comment above - confirm against version control.
  def self.construct_xml_for_tag_array(tag_array)
    xml = "<tags>"
    tag_array.each do |tag|
      clean_tag = tag.gsub(/\s+/, " ").gsub(/[<>!]/, '')
      xml << "<tag>#{clean_tag}</tag>"
    end
    xml << "</tags>"
  end

  # PUTs the tags, serialized by construct_xml_for_tag_array, to
  # DOR_URI/objects/<druid>/datastreams/identityMetadata/tags.
  #
  # @return [true] on a success response
  # @raise [RuntimeError] on a non-success response
  def self.add_identity_tags(druid, tags)
    url = URI.parse(DOR_URI + '/objects/' + druid + '/datastreams/identityMetadata/tags')
    req = Net::HTTP::Put.new(url.path)
    req.body = DorService.construct_xml_for_tag_array(tags)
    req.content_type = 'application/xml'
    res = DorService.get_https_connection(url).start { |http| http.request(req) }

    case res
    when Net::HTTPSuccess
      true
    when Net::HTTPServerError
      LyberCore::Log.error("Attempted to add identity tags #{url} but failed")
      raise "Encountered 500 error when adding identity tags #{url}: #{res.inspect}"
    else
      LyberCore::Log.error("Encountered unknown error when adding identity tags #{url}: #{res.inspect}")
      raise "Encountered unknown error when adding identity tags #{url}: #{res.inspect}"
    end
  end
end

#DorService.updateWorkflowStatus('dr:rf624mb644', 'GoogleScannedWF', 'descriptive-metadata', 'completed')
####Testing
#line = 'id="catkey:1990757"||id="barcode:36105045033136"||model="GoogleScannedBook"||label="The poacher"'
#form_data = {}
#DorService.parse_line_return_hashlist(line, form_data)
#form_data.each_pair{|k,v| puts "key: #{k} value: #{v}"}
#
#puts DorService.encodeParams(form_data)
#DorService.create_object('id="catkey:454545454545454"||id="barcode:434343434343434343434343434"||model="GoogleScannedBook"||label="Ruby multiple Id parms 3"')