# https://github.com/AustinBlues/RSS-Speed-Reader/blob/master/lib/rss_speed_reader.rb
# https://github.com/tenderlove/nokogiri/pull/524
require 'mini_exiftool'

# Parses TeamSite metadata "tuple" files and syncs the extracted records into
# the DigitalAsset store: assets are created, updated, or deleted depending on
# their publication/expiration metadata and whether their files exist on disk.
#
# NOTE(review): module state lives in class variables (@@…). They are shared
# across the inheritance tree; kept as-is to avoid changing module internals
# other code may rely on.
module Daengine::TeamsiteMetadataParser

  # Maps TeamSite tuple-field names to DigitalAsset attribute names
  # (assigned via `send("#{attr}=")` below).
  @@translation = {
    'TeamSite/Metadata/rttTitle' => 'title',
    "TeamSite/Metadata/enterprise_last_updated_date" => 'changed_at',
    "TeamSite/Metadata/enterprise_audience_id" => 'audiences',
    "TeamSite/Metadata/enterprise_sami_desc" => 'sami_code',
    "TeamSite/Metadata/enterprise_last_publication_date" => 'published_at',
    "TeamSite/Metadata/enterprise_unpublish_date" => 'unpublished_at',
    "TeamSite/Metadata/enterprise_expiration_date" => 'expires_at',
    "TeamSite/Metadata/enterprise_guid" => 'guid',
    "TeamSite/Metadata/shortSynopsis" => 'summary',
    "TeamSite/Metadata/business_owner" => 'business_owner',
    "TeamSite/Metadata/enterprise_product_id" => 'product_ids',
    "TeamSite/Metadata/enterprise_content_organization_id" => 'content_organization_ids',
    "TeamSite/Metadata/enterprise_program_id" => 'program_ids',
    "TeamSite/Metadata/omnitureSiteSection_codes" => 'omniture_codes'
  }

  # Tuple fields that describe the per-tuple document (not the asset);
  # collected into a separate `docpath` hash for documents_attributes.
  @@path_tuples = {
    "path" => 'path',
    "TeamSite/Metadata/enterprise_last_content_update_date" => 'doc_changed_at',
    "TeamSite/Metadata/enterprise_content_type_id" => 'content_type'
  }

  # Per-field validation predicates; a tuple whose field value fails its
  # predicate is discarded wholesale.
  @@validations = {
    "TeamSite/Metadata/display_on_website" => lambda { |val| /Y|1|true/ =~ val },
    "TeamSite/Metadata/enterprise_expiration_date" => lambda { |val| !val.blank? },
    # "TeamSite/Metadata/enterprise_unpublish_date" => lambda {|val| val.blank? },
    #"path" => lambda {|val| !(/\/manifest\// =~ val) }
  }

  @@logger = nil

  # Inject a logger used by .log; returns self for chaining.
  def self.logger=(some_logger)
    @@logger = some_logger
    self
  end

  # Logs at error level; silently no-ops when no logger is configured.
  def self.log(args)
    @@logger.error(args) unless @@logger.blank?
  end

  # Reads a TeamSite tuple file line by line, accumulating DigitalAsset
  # attribute hashes keyed by guid, then applies them to the database:
  # unpublished or expired assets are deleted, the rest created/updated.
  #
  # file      - IO-like object responding to #gets
  # last_read - Time of the previous run; forwarded to trim_package so
  #             unchanged files can reuse cached exif metadata (may be nil)
  def self.parse_tuple_file(file, last_read = nil)
    time do
      asset = nil
      assets = {}
      docpath = {}
      valid = true
      while (line = file.gets)
        case line
        when /<\/?data-tuple>/
          if (asset.blank?)
            # opening tag: begin a fresh asset
            asset = DigitalAsset.new
          elsif (valid)
            # closing tag of a valid tuple: first tuple's metadata wins,
            # every tuple contributes its document path
            assets[asset.guid] ||= asset.attributes # first tuple metadata wins
            assets[asset.guid]['documents_attributes'] ||= []
            assets[asset.guid]['documents_attributes'] << docpath
            # assets[asset.guid]['_id'] = asset.guid
            asset = nil; docpath = {}; valid = true;
          else
            # tuple failed a validation predicate: discard it
            asset = nil; docpath = {}; valid = true;
          end
        # NOTE(review): the name-attribute capture was reconstructed — the
        # body below uses both $1 (field name, matched against the maps
        # above) and $2 (value); confirm against a sample tuple file.
        when /<tuple-field name="([^"]+)">([^<]+)<\/tuple-field>/
          if (valid)
            if @@validations[$1]
              valid = @@validations[$1].call($2)
            end
            if @@path_tuples[$1]
              docpath[@@path_tuples[$1]] = $2
            # if this is one of our keys, 'send' to the method
            elsif (@@translation[$1])
              # array-valued attributes (default []) respond to :[];
              # their values arrive comma-separated
              val = asset.send("#{@@translation[$1]}").respond_to?(:[]) ? $2.split(',') : $2
              method_name = "#{@@translation[$1]}="
              val = val.gsub(/(\w{3} \d{2} \d{4} \d{2}:\d{2}:\d{2}):\d{3}(AM|PM)/, '\1 \2') if val.respond_to?(:gsub) # deal with strange teamsite date formats
              asset.send(method_name, val)
            end
          end
        end
      end

      # loop thru each doc in the collection, either replace or delete it
      error_files = []
      update_count = 0; delete_count = 0; added_count = 0;
      assets.keys.each do |key|
        da = nil
        begin
          if (!assets[key]['unpublished_at'].nil?)
            DigitalAsset.where(guid: key).try :delete_all
            delete_count += 1
          elsif (assets[key]['expires_at'].nil? || (assets[key]['expires_at'] < 1.minute.from_now))
            # missing or imminent expiration both remove the asset
            DigitalAsset.where(guid: key).try :delete_all
            delete_count += 1
          else
            da = DigitalAsset.find_or_initialize_by(guid: key)
            asset_docs = trim_package(assets[key]['documents_attributes'], last_read, da)
            if (!asset_docs.empty?)
              assets[key]['documents_attributes'] = asset_docs
              assets[key]['orderable'] = da.orderable unless da.new?
              creating = da.new?
              da.documents = []
              begin
                da.update_attributes!(assets[key])
              # FIX: rescue StandardError, not Exception — rescuing Exception
              # also swallows SignalException/SystemExit/NoMemoryError
              rescue StandardError => e
                error_files << "#{e} ---- #{da.try(:guid)}, #{da.try(:errors).try(:full_messages)}"
              end
              if creating
                added_count += 1
              else
                update_count += 1
              end
            else
              # no documents survived trimming: drop the asset entirely
              DigitalAsset.where(guid: key).try :delete_all
              delete_count += 1
            end
          end
        rescue StandardError => e
          Daengine.log(e.message, "error")
          # FIX: "\n" (real newline) — '\n' joined with a literal backslash-n
          Daengine.log(e.backtrace.join("\n"), "error")
        end
      end

      log_txt = "TeamsiteMetadataParser: Failed to save/update following DigitalAssets in database:\n"
      error_files.each do |asset_file|
        log_txt << "> #{asset_file}\n"
      end
      Daengine.log(log_txt, "warn") unless error_files.empty?
      Daengine.log("checking purge condition, stale count is #{DigitalAsset.stale.count}, bulk_processed is #{DigitalAsset.bulk_processed?}", 'warn')
      DigitalAsset.purge! # if the purge criteria is met, purge anything not updated
      Daengine.log("TeamsiteMetadataParser: #{added_count} records added, #{update_count} updated, #{delete_count} removed", "info")
    end
  end

  # Filters an asset's document hashes against the files actually present on
  # disk. Docs whose file exists are returned, enriched with exif metadata
  # (pages/size/mime_type/keywords/subject/author) when the file changed
  # since last_read, or with the cached metadata copied from existing_da.
  # Manifest paths and files missing from disk are dropped.
  #
  # asset_docs  - Array of document attribute hashes (each with a 'path' key)
  # last_read   - Time of the previous run, or nil to force a fresh exif read
  # existing_da - previously persisted DigitalAsset, source of cached metadata
  def self.trim_package(asset_docs, last_read = nil, existing_da = nil)
    docs = []
    path = Daengine.config[:digital_assets_file_directory]
    if Dir.exist?(path) # dont throw away assets if we can't locate the dir
      asset_docs.each do |doc|
        # exclude manifest_file
        unless doc['path'].match('\/manifest\/')
          file = File.join(path, doc['path'])
          if (!File.exist?(file)) # try with just the filename
            #Daengine.log "Cant find file #{file}, trying with just the path and filename", "info"
            file = File.join(path, doc['path'].split('/').last)
          end
          # exclude digital_asset_files that are not in *Teamsite Staging*
          if File.exist?(file)
            docs << doc
            # FIX: guard nil last_read — Time#> against nil raised
            # ArgumentError; nil now forces a fresh exif read
            if (last_read.nil? || File.mtime(file) > last_read)
              begin
                exifdata = ::MiniExiftool.new(file) # spaces in filename
                pages = exifdata.pagecount
                pages = exifdata.pages if pages.blank?
                pages = exifdata.slides if pages.blank?
                if (pages.blank?)
                  Daengine.log "Unable to get pages metadata for #{file}, exifdata.pagecount = #{exifdata.pagecount}, exifdata.pages = #{exifdata.pages}", "info"
                end
                doc['pages'] = pages
                doc['size'] = exifdata.filesize
                doc['mime_type'] = exifdata.mimetype
                #doc['keywords'] = exifdata.keywords.gsub(';', ',').gsub(':', ',').split(",") unless exifdata.keywords.nil?
                if exifdata.keywords.is_a? Enumerable
                  doc['keywords'] = exifdata.keywords
                else
                  # scalar keywords: normalize ; and : separators to commas
                  doc['keywords'] = exifdata.keywords.gsub(';', ',').gsub(':', ',').split(",") unless exifdata.keywords.nil?
                end
                if exifdata.description.is_a? Enumerable
                  doc['subject'] = exifdata.description.join(" ") unless exifdata.description.nil?
                else
                  doc['subject'] = exifdata.description.gsub(':', '') unless exifdata.description.nil?
                end
                doc['author'] = exifdata.author
                Daengine.log "Exif data for #{file} was pages:#{doc['pages']}, size:#{doc['size']}", "info"
              rescue StandardError => e # FIX: was `rescue Exception`
                Daengine.log "Error reading metadata from #{file} #{e.message}", "error"
              end
            elsif (existing_da && !existing_da.new?)
              # file unchanged since last run: copy over the existing
              # asset's cached document metadata
              existing_doc = existing_da.documents.where(path: doc['path'])[0]
              if (existing_doc)
                doc['pages'] = existing_doc.pages
                doc['size'] = existing_doc.size
                doc['mime_type'] = existing_doc.mime_type
                doc['keywords'] = existing_doc.keywords
                doc['subject'] = existing_doc.subject
                doc['author'] = existing_doc.author
                # FIX: moved inside the presence check — previously raised
                # NoMethodError on nil when the document was not found
                Daengine.log "Saving existing pages #{existing_doc.pages} and size #{existing_doc.size} on #{file}", "info"
              else
                Daengine.log "Unable to find document with path #{file} on existing asset #{existing_da.guid} title: #{existing_da.title}", "info"
              end
            end
          else # the file was missing on disk, show a warning!
            Daengine.log("TeamsiteMetadataParser: Unable to locate file #{file} on disk! Removing from metadata", "warn")
          end
        end
      end
    else
      Daengine.log("******************", "error")
      Daengine.log("ERROR!!! Unable to locate physical teamsite files at #{path}", "error")
      Daengine.log("******************", "error")
    end
    docs
  end

  # Runs the given block and logs its wall-clock elapsed time.
  def self.time
    start = Time.now
    yield
    self.log "elapsed time was #{Time.now - start}"
  end
end