# Copyright © 2012 The Pennsylvania State University
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

require 'datastreams/fits_datastream'
require 'datastreams/generic_file_rdf_datastream'
require 'datastreams/properties_datastream'
require 'datastreams/paranoid_rights_datastream'
require 'datastreams/file_content_datastream'

class GenericFile < ActiveFedora::Base
  include ActiveModel::Validations::HelperMethods
  include ActiveFedora::Validations
  include Hydra::ModelMixins::CommonMetadata
  include Hydra::ModelMixins::RightsMetadata
  include Sufia::ModelMethods
  include Sufia::Noid

  @@FIELD_LABEL_MAP = {
    "based_near"   => "Location",
    "description"  => "Abstract or Summary",
    "tag"          => "Keyword",
    "date_created" => "Date Created",
    "related_url"  => "Related URL"
  }

  has_metadata :name => "characterization", :type => FitsDatastream
  has_metadata :name => "descMetadata", :type => GenericFileRdfDatastream
  has_metadata :name => "properties", :type => PropertiesDatastream
  has_metadata :name => "rightsMetadata", :type => ParanoidRightsDatastream
  has_file_datastream :name => "content", :type => FileContentDatastream
  has_file_datastream :name => "thumbnail", :type => FileContentDatastream

  belongs_to :batch, :property => :is_part_of

  delegate_to :properties, [:relative_path, :depositor], :unique => true
  delegate_to :descMetadata, [:date_uploaded, :date_modified], :unique => true
  delegate_to :descMetadata, [:related_url, :based_near, :part_of, :creator,
                              :contributor, :title, :tag, :description, :rights,
                              :publisher, :date_created, :subject, :format,
                              :resource_type, :identifier, :language]
  delegate :mime_type, :to => :characterization, :unique => true
  delegate_to :characterization, [:format_label, :file_size, :last_modified,
                                  :filename, :original_checksum, :rights_basis,
                                  :copyright_basis, :copyright_note,
                                  :well_formed, :valid, :status_message,
                                  :file_title, :file_author, :page_count,
                                  :file_language, :word_count, :character_count,
                                  :paragraph_count, :line_count, :table_count,
                                  :graphics_count, :byte_order, :compression,
                                  :width, :height, :color_space, :profile_name,
                                  :profile_version, :orientation, :color_map,
                                  :image_producer, :capture_device,
                                  :scanning_software, :exif_version,
                                  :gps_timestamp, :latitude, :longitude,
                                  :character_set, :markup_basis,
                                  :markup_language, :duration, :bit_depth,
                                  :sample_rate, :channels, :data_format, :offset]

  around_save :characterize_if_changed, :retry_warming
  validate :paranoid_permissions

  NO_RUNS = 999

  # make sure the terms of service is present and set to 1 before saving
  # note GenericFile.create will no longer save a GenericFile as the terms_of_service will not be set
  terms_of_service = nil
  validates_acceptance_of :terms_of_service, :allow_nil => false

  # set the terms of service on create so an empty generic file can be created
  #before_validation(:on => :create) do
  #  logger.info "!!!! Before create !!!!"
  #  self.terms_of_service = '1'
  #end

  def self.get_label(key)
    label = @@FIELD_LABEL_MAP[key]
    puts "label = #{label}"
    label = key.gsub('_', ' ').titleize if label.blank?
    return label
  end

  def persistent_url
    "#{Sufia::Engine.config.persistent_hostpath}#{noid}"
  end

  def paranoid_permissions
    # let the rightsMetadata ds make this determination
    # - the object instance is passed in for easier access to the props ds
    rightsMetadata.validate(self)
  end

  ## Updates those permissions that are provided to it. Does not replace any permissions unless they are provided
  def permissions=(params)
    perm_hash = permission_hash
    params[:new_user_name].each { |name, access| perm_hash['person'][name] = access } if params[:new_user_name].present?
    params[:new_group_name].each { |name, access| perm_hash['group'][name] = access } if params[:new_group_name].present?
    params[:user].each { |name, access| perm_hash['person'][name] = access } if params[:user]
    params[:group].each { |name, access| perm_hash['group'][name] = access } if params[:group]
    rightsMetadata.update_permissions(perm_hash)
  end

  def retry_warming
    save_tries = 0
    conflict_tries = 0
    begin
      yield
    rescue RSolr::Error::Http => error
      save_tries += 1
      logger.warn "Retry Solr caught RSOLR error on #{self.pid}: #{error.inspect}"
      # fail for good if there have been more than 3 tries
      rescue_action_without_handler(error) if save_tries >= 3
      sleep 0.01
      retry
    rescue ActiveResource::ResourceConflict => error
      conflict_tries += 1
      logger.warn "Retry caught Active Resource Conflict #{self.pid}: #{error.inspect}"
      rescue_action_without_handler(error) if conflict_tries >= 10
      sleep 0.01
      retry
    rescue => error
      if error.to_s.downcase.include? "conflict"
        conflict_tries += 1
        logger.warn "Retry caught Active Resource Conflict #{self.pid}: #{error.inspect}"
        rescue_action_without_handler(error) if conflict_tries >= 10
        sleep 0.01
        retry
      else
        rescue_action_without_handler(error)
      end
    end
  end

  def characterize_if_changed
    content_changed = self.content.changed?
    yield
    #logger.debug "DOING CHARACTERIZE ON #{self.pid}"
    begin
      Resque.enqueue(CharacterizeJob, self.pid) if content_changed
    rescue Redis::CannotConnectError
      logger.error "Redis is down!"
    end
  end

  ## Extract the metadata from the content datastream and record it in the characterization datastream
  def characterize
    self.characterization.content = self.content.extract_metadata
    self.append_metadata
    self.filename = self.label
    self.terms_of_service = '1'
    save unless self.new_object?
  end

  def related_files
    relateds = begin
      self.batch.generic_files
    rescue NoMethodError
      batch_id = self.object_relations["isPartOf"].first || self.object_relations[:is_part_of].first
      return [] if batch_id.nil?
      self.class.find(:is_part_of_s => batch_id)
    end
    relateds.reject { |gf| gf.pid == self.pid }
  end

  # Creating a thumbnail requires that characterization has already been run
  # (so mime_type, width, and height are available) and that the object
  # already has a pid set.
  def create_thumbnail
    return if self.content.content.nil?
    if ["application/pdf"].include? self.mime_type
      create_pdf_thumbnail
    elsif ["image/png", "image/jpeg", "image/gif"].include? self.mime_type
      create_image_thumbnail
    # TODO: if we can figure out how to do video (ffmpeg?)
    #elsif ["video/mpeg", "video/mp4"].include? self.mime_type
    end
  end

  # redefine find so that it sets the terms of service
  def self.find(args, opts = {})
    gf = super
    # use the argument type to see if the return will be one item or multiple
    if args.is_a? String
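      # a single pid was requested, so super returned one object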
      gf.terms_of_service = '1'
    else
      gf.each { |f| f.terms_of_service = '1' }
    end
    return gf
  end

  def create_pdf_thumbnail
    retryCnt = 0
    stat = false
    for retryCnt in 1..3
      begin
        pdf = Magick::ImageList.new
        pdf.from_blob(content.content)
        first = pdf.to_a[0]
        first.format = "PNG"
        thumb = first.scale(338, 493)
        self.thumbnail.content = thumb.to_blob { self.format = "PNG" }
        #logger.debug "Has the content changed before saving? #{self.content.changed?}"
        self.terms_of_service = '1'
        stat = self.save
        break
      rescue => e
        logger.warn "Rescued an error #{e.inspect} retry count = #{retryCnt}"
        sleep 1
      end
    end
    return stat
  end

  def create_image_thumbnail
    img = Magick::ImageList.new
    img.from_blob(content.content)
    height = self.height.first.to_i
    width = self.width.first.to_i
    # scale proportionally, using floating-point math so the aspect ratio
    # is preserved and integer division cannot raise ZeroDivisionError
    # horizontal img
    if width > height
      if width > 150 and height > 105
        thumb = img.scale(150, (height * 150.0 / width).round)
      else
        thumb = img.scale(width, height)
      end
    # vertical img
    else
      if width > 150 and height > 200
        thumb = img.scale((width * 200.0 / height).round, 200)
      else
        thumb = img.scale(width, height)
      end
    end
    self.thumbnail.content = thumb.to_blob
    self.terms_of_service = '1'
    #logger.debug "Has the content before saving? #{self.content.changed?}"
    self.save
  end

  def append_metadata
    terms = self.characterization_terms
    Sufia::Engine.config.fits_to_desc_mapping.each_pair do |k, v|
      if terms.has_key?(k)
        # coerce to array to remove a conditional
        terms[k] = [terms[k]] unless terms[k].is_a? Array
        terms[k].each do |term_value|
          proxy_term = self.send(v)
          if proxy_term.kind_of?(Array)
            proxy_term << term_value unless proxy_term.include?(term_value)
          else
            # these are single-valued terms which cannot be appended to
            self.send("#{v}=", term_value)
          end
        end
      end
    end
  end

  def set_visibility(params)
    # only set explicit permissions
    if params[:visibility] == "open"
      self.datastreams["rightsMetadata"].permissions({:group => "public"}, "read")
    elsif params[:visibility] == "psu"
      self.datastreams["rightsMetadata"].permissions({:group => "registered"}, "read")
      self.datastreams["rightsMetadata"].permissions({:group => "public"}, "none")
    elsif params[:visibility] == "restricted"
      self.datastreams["rightsMetadata"].permissions({:group => "registered"}, "none")
      self.datastreams["rightsMetadata"].permissions({:group => "public"}, "none")
      #params[:generic_file][:permissions][:group][:public] = "none"
      #params[:generic_file][:permissions][:group][:registered] = "none"
    end
  end

  def to_solr(solr_doc = {}, opts = {})
    super(solr_doc, opts)
    solr_doc["label_t"] = self.label
    solr_doc["noid_s"] = noid
    solr_doc["file_format_t"] = file_format
    solr_doc["file_format_facet"] = solr_doc["file_format_t"]
    # remap dates as a valid xml date, not to_s
    solr_doc['generic_file__date_uploaded_dt'] = (Time.parse(date_uploaded).utc.to_s.sub(' ', 'T').sub(' UTC', 'Z') rescue Time.new(date_uploaded).utc.to_s.sub(' ', 'T').sub(' UTC', 'Z')) unless date_uploaded.blank?
    solr_doc['generic_file__date_modified_dt'] = (Time.parse(date_modified).utc.to_s.sub(' ', 'T').sub(' UTC', 'Z') rescue Time.new(date_modified).utc.to_s.sub(' ', 'T').sub(' UTC', 'Z')) unless date_modified.blank?
    return solr_doc
  end

  def file_format
    return nil if self.mime_type.blank? and self.format_label.blank?
    return self.mime_type.split('/')[1] + " (" + self.format_label.join(", ") + ")" unless self.mime_type.blank? or self.format_label.blank?
    return self.mime_type.split('/')[1] unless self.mime_type.blank?
    return self.format_label
  end

  # Redefine this for more intuitive keys in Redis
  def to_param
    noid
  end

  def label=(new_label)
    @inner_object.label = new_label
    if self.title.empty?
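      # default the title to the new label when no title has been supplied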
      self.title = new_label
    end
  end

  def to_jq_upload
    return {
      "name" => self.title,
      "size" => self.file_size,
      "url" => "/files/#{noid}",
      "thumbnail_url" => self.pid,
      "delete_url" => "deleteme", # generic_file_path(:id => id),
      "delete_type" => "DELETE"
    }
  end

  def get_terms
    terms = []
    self.descMetadata.class.config[:predicate_mapping].each do |uri, mappings|
      new_terms = mappings.keys.map(&:to_s).select do |term|
        term.start_with? "generic_file__" and !['type', 'behaviors'].include? term.split('__').last
      end
      terms.concat(new_terms)
    end
    terms
  end

  def get_values
    terms = get_terms
    values = {}
    terms.each do |t|
      next if t.empty?
      key = t.to_s.split("generic_file__").last
      next if ['part_of', 'date_modified', 'date_uploaded', 'format'].include?(key)
      values[key] = self.send(key) if self.respond_to?(key)
    end
    return values
  end

  def characterization_terms
    h = {}
    self.characterization.class.terminology.terms.each_pair do |k, v|
      next unless v.respond_to? :proxied_term
      term = v.proxied_term
      begin
        value = self.send(term.name)
        h[term.name] = value unless value.empty?
      rescue NoMethodError
        next
      end
    end
    h
  end

  # MIME type: 'application/x-endnote-refer'
  def export_as_endnote
    end_note_format = {
      '%T' => [:title, lambda { |x| x.first }],
      '%Q' => [:title, lambda { |x| x.drop(1) }],
      '%A' => [:creator],
      '%C' => [:publication_place],
      '%D' => [:date_created],
      '%8' => [:date_uploaded],
      '%E' => [:contributor],
      '%I' => [:publisher],
      '%J' => [:series_title],
      '%@' => [:isbn],
      '%U' => [:related_url],
      '%7' => [:edition_statement],
      '%R' => [:persistent_url],
      '%X' => [:description],
      '%G' => [:language],
      '%[' => [:date_modified],
      '%9' => [:resource_type],
      '%~' => Application.config.application_name,
      '%W' => 'Penn State University'
    }
    text = []
    text << "%0 GenericFile"
    end_note_format.each do |endnote_key, mapping|
      if mapping.is_a? String
        values = [mapping]
      else
        values = self.send(mapping[0]) if self.respond_to? mapping[0]
        values = mapping[1].call(values) if mapping.length == 2
        values = [values] unless values.is_a? Array
      end
      next if values.empty? or values.first.nil?
      spaced_values = values.join("; ")
      text << "#{endnote_key} #{spaced_values}"
    end
    return text.join("\n")
  end

  # MIME type: 'application/x-openurl-ctx-kev'
  def export_as_openurl_ctx_kev
    export_text = []
    export_text << "url_ver=Z39.88-2004&ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Adc&rfr_id=info%3Asid%2Fblacklight.rubyforge.org%3Agenerator"
    field_map = {
      :title => 'title',
      :creator => 'creator',
      :subject => 'subject',
      :description => 'description',
      :publisher => 'publisher',
      :contributor => 'contributor',
      :date_created => 'date',
      :resource_type => 'format',
      :identifier => 'identifier',
      :language => 'language',
      :tag => 'relation',
      :based_near => 'coverage',
      :rights => 'rights'
    }
    field_map.each do |element, kev|
      values = self.send(element)
      next if values.empty? or values.first.nil?
      values.each do |value|
        export_text << "rft.#{kev}=#{CGI::escape(value)}"
      end
    end
    export_text.join('&') unless export_text.blank?
  end

  def export_as_apa_citation
    text = ''
    authors_list = []
    authors_list_final = []

    # setup formatted author list
    authors = get_author_list
    authors.each do |author|
      next if author.blank?
      authors_list.push(abbreviate_name(author))
    end
    authors_list.each do |author|
      if author == authors_list.first # first
        authors_list_final.push(author.strip)
      elsif author == authors_list.last # last
        authors_list_final.push(", & " + author.strip)
      else # all others
        authors_list_final.push(", " + author.strip)
      end
    end
    text << authors_list_final.join
    unless text.blank?
      if text[-1, 1] != "."
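        # no trailing period yet, so close the author list with one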
        text << ". "
      else
        text << " "
      end
    end

    # Get Pub Date
    text << "(" + setup_pub_date + "). " unless setup_pub_date.nil?

    # setup title info
    title_info = setup_title_info
    text << "" + title_info + " " unless title_info.nil?

    # Publisher info
    text << setup_pub_info unless setup_pub_info.nil?
    unless text.blank?
      if text[-1, 1] != "."
        text += "."
      end
    end
    text.html_safe
  end

  def export_as_mla_citation
    text = ''
    authors_final = []

    # setup formatted author list
    authors = get_author_list
    if authors.length < 4
      authors.each do |author|
        if author == authors.first # first
          authors_final.push(author)
        elsif author == authors.last # last
          authors_final.push(", and " + name_reverse(author) + ".")
        else # all others
          authors_final.push(", " + name_reverse(author))
        end
      end
      text << authors_final.join
      unless text.blank?
        if text[-1, 1] != "."
          text << ". "
        else
          text << " "
        end
      end
    else
      text << authors.first + ", et al. "
    end

    # setup title
    title_info = setup_title_info
    text << "" + mla_citation_title(title_info) + " " unless title.blank?

    # Publication
    text << setup_pub_info + ", " unless setup_pub_info.nil?

    # Get Pub Date
    text << setup_pub_date unless setup_pub_date.nil?
    if text[-1, 1] != "."
      text << "." unless text.blank?
    end
    text.html_safe
  end

  def export_as_chicago_citation
    author_text = ""
    authors = get_all_authors
    unless authors.blank?
      if authors.length > 10
        authors.each_with_index do |author, index|
          if index < 7
            if index == 0
              author_text << "#{author}"
              if author.ends_with?(",")
                author_text << " "
              else
                author_text << ", "
              end
            else
              author_text << "#{name_reverse(author)}, "
            end
          end
        end
        author_text << " et al."
      elsif authors.length > 1
        authors.each_with_index do |author, index|
          if index == 0
            author_text << "#{author}"
            if author.ends_with?(",")
              author_text << " "
            else
              author_text << ", "
            end
          elsif index + 1 == authors.length
            author_text << "and #{name_reverse(author)}."
          else
            author_text << "#{name_reverse(author)}, "
          end
        end
      else
        author_text << authors.first
      end
    end

    title_info = ""
    title_info << citation_title(clean_end_punctuation(CGI::escapeHTML(title.first)).strip) unless title.blank?

    pub_info = ""
    place = self.based_near.first
    publisher = self.publisher.first
    unless place.blank?
      place = CGI::escapeHTML(place)
      pub_info << place
      pub_info << ": " unless publisher.blank?
    end
    unless publisher.blank?
      publisher = CGI::escapeHTML(publisher)
      pub_info << publisher
      pub_info << ", " unless setup_pub_date.nil?
    end
    unless setup_pub_date.nil?
      pub_info << setup_pub_date
    end

    citation = ""
    citation << "#{author_text} " unless author_text.blank?
    citation << "#{title_info}. " unless title_info.blank?
    citation << "#{pub_info}." unless pub_info.blank?
    citation.html_safe
  end

  def logs(dsid)
    ChecksumAuditLog.where(:dsid => dsid, :pid => self.pid).order('created_at desc, id desc')
  end

  def audit!
    audit(true)
  end

  def audit_stat!
    audit_stat(true)
  end

  def audit_stat(force = false)
    logs = audit(force)
    audit_results = logs.collect { |result| result["pass"] }

    # check how many non-runs we had
    non_runs = audit_results.reduce(0) { |sum, value| value == NO_RUNS ? sum + 1 : sum }
    if non_runs == 0
      result = audit_results.reduce(true) { |sum, value| sum && value }
      return result
    elsif non_runs < audit_results.length
      result = audit_results.reduce(true) { |sum, value| value == NO_RUNS ? sum : sum && value }
      return 'Some audits have not been run, but the ones run were ' + (result ? 'passing' : 'failing') + '.'
    else
      return 'Audits have not yet been run on this file.'
    end
  end

  def audit(force = false)
    logs = []
    self.per_version do |ver|
      logs << GenericFile.audit(ver, force)
    end
    logs
  end

  def per_version(&block)
    self.datastreams.each do |dsid, ds|
      ds.versions.each do |ver|
        block.call(ver)
      end
    end
  end

  def self.audit!(version)
    GenericFile.audit(version, true)
  end

  def self.audit(version, force = false)
    #logger.debug "***AUDIT*** log for #{version.inspect}"
    latest_audit = self.find(version.pid).logs(version.dsid).first
    unless force
      return latest_audit unless GenericFile.needs_audit?(version, latest_audit)
    end
    begin
      Resque.enqueue(AuditJob, version.pid, version.dsid, version.versionID)
    rescue Redis::CannotConnectError
      logger.error "Redis is down!"
    end
    # run the find just in case the job has finished already
    latest_audit = self.find(version.pid).logs(version.dsid).first
    latest_audit = ChecksumAuditLog.new(:pass => NO_RUNS, :pid => version.pid, :dsid => version.dsid, :version => version.versionID) unless latest_audit
    return latest_audit
  end

  def self.needs_audit?(version, latest_audit)
    if latest_audit and latest_audit.updated_at
      #logger.debug "***AUDIT*** last audit = #{latest_audit.updated_at.to_date}"
      days_since_last_audit = (DateTime.now - latest_audit.updated_at.to_date).to_i
      #logger.debug "***AUDIT*** days since last audit: #{days_since_last_audit}"
      if days_since_last_audit < Rails.application.config.max_days_between_audits
        #logger.debug "***AUDIT*** No audit needed for #{version.pid} #{version.versionID} (#{latest_audit.updated_at})"
        return false
      end
    else
      logger.warn "***AUDIT*** problem with audit log! Latest Audit is not nil, but updated_at is not set #{latest_audit}" unless latest_audit.nil?
    end
    #logger.info "***AUDIT*** Audit needed for #{version.pid} #{version.versionID}"
    return true
  end

  def self.audit_everything(force = false)
    GenericFile.find(:all, :rows => GenericFile.count).each do |gf|
      gf.per_version do |ver|
        GenericFile.audit(ver, force)
      end
    end
  end

  def self.audit_everything!
    GenericFile.audit_everything(true)
  end

  def self.run_audit(version)
    if version.dsChecksumValid
      #logger.info "***AUDIT*** Audit passed for #{version.pid} #{version.versionID}"
      passing = 1
      ChecksumAuditLog.prune_history(version)
    else
      logger.warn "***AUDIT*** Audit failed for #{version.pid} #{version.versionID}"
      passing = 0
    end
    check = ChecksumAuditLog.create!(:pass => passing, :pid => version.pid, :dsid => version.dsid, :version => version.versionID)
    return check
  end

  # Is this file in the middle of being processed by a batch?
  def processing?
    return false if self.batch.blank?
    return false if !self.batch.methods.include? :status
    return (!self.batch.status.empty?) && (self.batch.status.count == 1) && (self.batch.status[0] == "processing")
  end

  private

  def permission_hash
    old_perms = self.permissions
    user_perms = {}
    old_perms.select { |r| r[:type] == 'user' }.each do |r|
      user_perms[r[:name]] = r[:access]
    end
    group_perms = {}
    old_perms.select { |r| r[:type] == 'group' }.each do |r|
      group_perms[r[:name]] = r[:access]
    end
    {'person' => user_perms, 'group' => group_perms}
  end

  def setup_pub_date
    first_date = self.date_created.first
    unless first_date.blank?
      first_date = CGI::escapeHTML(first_date)
      date_value = first_date.gsub(/[^0-9|n\.d\.]/, "")[0, 4]
      return nil if date_value.nil?
    end
    clean_end_punctuation(date_value) if date_value
  end

  def setup_pub_info
    text = ''
    place = self.based_near.first
    publisher = self.publisher.first
    unless place.blank?
      place = CGI::escapeHTML(place)
      text << place
      text << ": " unless publisher.blank?
    end
    unless publisher.blank?
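      # HTML-escape the publisher before appending it to the citation text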
      publisher = CGI::escapeHTML(publisher)
      text << publisher
    end
    return nil if text.strip.blank?
    clean_end_punctuation(text.strip)
  end

  def mla_citation_title(text)
    no_upcase = ["a", "an", "and", "but", "by", "for", "it", "of", "the", "to", "with"]
    new_text = []
    word_parts = text.split(" ")
    word_parts.each do |w|
      if !no_upcase.include? w
        new_text.push(w.capitalize)
      else
        new_text.push(w)
      end
    end
    new_text.join(" ")
  end

  def citation_title(title_text)
    prepositions = ["a", "about", "across", "an", "and", "before", "but", "by", "for", "it", "of", "the", "to", "with", "without"]
    new_text = []
    title_text.split(" ").each_with_index do |word, index|
      if (index == 0 and word != word.upcase) or (word.length > 1 and word != word.upcase and !prepositions.include?(word))
        # the split("-") will handle the capitalization of hyphenated words
        new_text << word.split("-").map! { |w| w.capitalize }.join("-")
      else
        new_text << word
      end
    end
    new_text.join(" ")
  end

  def setup_title_info
    text = ''
    title = self.title.first
    unless title.blank?
      title = CGI::escapeHTML(title)
      title_info = clean_end_punctuation(title.strip)
      text << title_info
    end
    return nil if text.strip.blank?
    clean_end_punctuation(text.strip) + "."
  end

  def clean_end_punctuation(text)
    if [".", ",", ":", ";", "/"].include? text[-1, 1]
      return text[0, text.length - 1]
    end
    text
  end

  def get_author_list
    self.creator.map { |author| clean_end_punctuation(CGI::escapeHTML(author)) }.uniq
  end

  def get_all_authors
    authors = self.creator
    return authors.empty? ? nil : authors.map { |author| CGI::escapeHTML(author) }
  end

  def abbreviate_name(name)
    abbreviated_name = ''
    name = name.join('') if name.is_a? Array
    # make sure we handle "Cher" correctly
    return name if !name.include?(' ') and !name.include?(',')
    surnames_first = name.include?(',')
    delimiter = surnames_first ? ', ' : ' '
    name_segments = name.split(delimiter)
    given_names = surnames_first ? name_segments.last.split(' ') : name_segments.first.split(' ')
    surnames = surnames_first ? name_segments.first.split(' ') : name_segments.last.split(' ')
    abbreviated_name << surnames.join(' ')
    abbreviated_name << ', '
    abbreviated_name << given_names.map { |n| "#{n[0]}." }.join if given_names.is_a? Array
    abbreviated_name << "#{given_names[0]}." if given_names.is_a? String
    abbreviated_name
  end

  def name_reverse(name)
    name = clean_end_punctuation(name)
    return name unless name =~ /,/
    temp_name = name.split(", ")
    return temp_name.last + " " + temp_name.first
  end
end
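
# Illustrative sketch (not part of the model): one way the helpers above might
# be exercised from a Rails console. The pid and user name are hypothetical,
# and terms_of_service must be re-accepted before every save (see the
# validates_acceptance_of call at the top of the class).
#
#   file = GenericFile.find("scholarsphere:abc123")               # hypothetical pid
#   file.permissions = { :new_user_name => { "jdoe" => "edit" } } # grant edit to a user
#   file.set_visibility(:visibility => "open")                    # public read access
#   file.terms_of_service = '1'
#   file.save
#   file.export_as_apa_citation                                   # formatted citation string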