require 'time' # Time.strptime (used by Document#convert_to_datetime) lives in the stdlib 'time' extension
require 'nokogiri'
require 'mini_exiftool'
require 'daengine/http_client'

module Daengine
  # Processes a TeamSite metadata tuple (XML) file: parses it into digital
  # assets, keeps one asset per id, enriches each asset with file attributes
  # and fund codes, and finally posts/deletes each asset via the configured
  # digital asset service.
  module TeamsiteMetadataProcessor
    # Entry point. Runs the whole pipeline for +file+.
    #
    # file      - the XML input handed to the Nokogiri SAX parser.
    # last_read - optional Time of the previous run; nil means "process everything".
    #
    # Any StandardError raised along the way is logged rather than propagated.
    def self.process_tuple_file(file, last_read = nil)
      verify_pre_conditions
      time do
        assets = parse_file(file)
        assets = select_1_asset_per_id(assets, last_read)
        assets = add_file_attributes(assets, last_read) unless Daengine.config[:disable_file_check]
        assets = add_fund_codes(assets)
        summary = call_service(assets)
        Daengine.log("***** Summary = #{summary.inspect}", 'info')
      end
    rescue => ex
      Daengine.log("Error processing XML file - #{ex.inspect}", 'error')
      # One frame per line; Array() guards against an exception without a backtrace.
      Daengine.log(Array(ex.backtrace).join("\n"), 'error')
    end

    # Raises unless the configuration needed by the pipeline is present:
    # the digital assets directory (skipped when :disable_file_check is set)
    # and the digital asset service URL.
    def self.verify_pre_conditions
      unless Daengine.config[:disable_file_check]
        path = file_directory
        # Check for nil first: File.directory?(nil) would raise a TypeError
        # and hide the intended message.
        raise "Unable to locate digital assets at #{path}" unless path && File.directory?(path)
      end
      raise 'No digital asset service URL set' if service_uri.blank?
    end

    # Parses +file+ with the SAX Document handler and returns the handler's
    # assets hash (digital_asset_id => array of assets).
    def self.parse_file(file)
      puts "----- Parsing the file -----"
      document = Document.new
      Nokogiri::XML::SAX::Parser.new(document).parse(file)
      Daengine.log("Nokogiri Parser complete...", "info")
      document.assets
    end

    # Reduces each id's list of assets to a single asset.
    #
    # Only effective assets are considered; the winner is chosen by
    # most_recent_non_expired and kept only if it changed since the comparison
    # date (last_read minus 5 minutes, or 10 years ago when last_read is nil).
    # The kept asset also receives the finra path found in the same list.
    #
    # Returns a hash of key => asset containing only the kept assets.
    def self.select_1_asset_per_id(assets, last_read)
      comparison_date = last_read.nil? ? 10.years.ago : last_read - 5.minutes
      results = {}
      assets.each do |key, values|
        puts "----- select_1 #{key} -----"
        list = values.select(&:effective?)
        asset = most_recent_non_expired(list)
        next if asset.nil? || !asset.changed_since(comparison_date)

        asset.finra_path = finra_path(list)
        results[key] = asset
      end
      results
    end

    # Marks assets whose file is missing for deletion and refreshes file
    # attributes on the others. An asset whose processing raises is logged and
    # left out of the returned hash.
    def self.add_file_attributes(assets, last_read)
      results = {}
      assets.each do |key, asset|
        begin
          puts "----- add_file #{key} -----"
          file_name = asset_file_name(asset)
          asset.mark_for_deletion unless File.file?(file_name)
          set_asset_file_attributes(file_name, asset, last_read) unless asset.delete?
          results[key] = asset
        rescue => ex
          Daengine.log("***** Error processing asset with file name = #{asset.path} - #{ex.inspect}", 'error')
        end
      end
      results
    end

    # Looks up a fund code for every product id of every asset and stores the
    # non-blank codes (stripped, left-padded with '0' to 5 characters) on the
    # asset. Assets without any fund code are left untouched.
    def self.add_fund_codes(assets)
      results = {}
      assets.each do |key, asset|
        puts "----- add_fund_codes #{key} -----"
        fund_codes = []
        asset.product_ids.each do |product_id|
          fund_code = DigitalAssetLookupService.fund_code_from_id(product_id)
          fund_codes << fund_code.strip.rjust(5, '0') unless fund_code.blank?
        end
        asset.fund_codes = fund_codes unless fund_codes.empty?
        results[key] = asset
      end
      results
    end

    # Sends every asset to the digital asset service: DELETE for assets marked
    # for deletion, POST otherwise.
    #
    # Returns a summary hash with :errors, :updated and :deleted counts.
    def self.call_service(assets)
      results = {:errors => 0, :updated => 0, :deleted => 0}
      assets.each_value do |asset|
        begin
          puts "----- call_service #{asset.digital_asset_id} -----"
          if asset.delete?
            path = "#{service_uri}/#{asset.digital_asset_id}"
            options = {:method => :delete, :headers => header}
            operation = :deleted
          else
            path = "#{service_uri}"
            options = {:method => :post, :query => asset.as_hash, :headers => header}
            operation = :updated
          end
          response = Daengine::HTTP::Client.call(path, options)
          if response.success?
            results[operation] += 1
          else
            results[:errors] += 1
          end
        rescue => ex
          Daengine.log("***** Error calling service for #{asset.inspect} - #{ex.inspect}", 'error')
          results[:errors] += 1
        end
      end
      results
    end

    # Yields to the block and logs the elapsed wall-clock time afterwards.
    def self.time
      start = Time.now
      yield
      Daengine.log("***** Elapsed time was #{Time.now - start}", 'info')
    end

    # Folds +list+ down to one asset: a later element replaces the current
    # candidate when it is not expired, not a manifest file, not finra, and its
    # published_at is at least as recent as the candidate's.
    #
    # Returns nil for an empty list.
    #
    # NOTE(review): inject is called without a seed, so the first element
    # becomes the initial candidate without being checked against the
    # expired/manifest/finra conditions - confirm this is intended.
    def self.most_recent_non_expired(list)
      list.inject do |previous, current|
        prev_published_at = previous.default_blank_time(:published_at)
        current_published_at = current.default_blank_time(:published_at)
        eligible = !current.expired? && !current.manifest_file? && !current.finra?
        eligible && current_published_at >= prev_published_at ? current : previous
      end
    end

    # Returns the path of the first finra asset in +list+, or nil if none.
    def self.finra_path(list)
      finra = list.find(&:finra?)
      finra.try(:path)
    end

    # Resolves the asset's file inside the configured assets directory, trying
    # asset.path first and falling back to asset.file_name.
    def self.asset_file_name(asset)
      name = File.join(file_directory, asset.path)
      # File.exist? instead of File.exists?: the latter was removed in Ruby 3.2.
      name = File.join(file_directory, asset.file_name) unless File.exist?(name)
      name
    end

    # Refreshes the asset's file attributes when the file was modified after
    # +last_read+. A nil last_read (no previous run) always refreshes; comparing
    # a Time with nil would raise and cause the caller to drop the asset.
    def self.set_asset_file_attributes(file_name, asset, last_read)
      return unless last_read.nil? || File.mtime(file_name) > last_read

      update_asset_file_attributes(file_name, asset)
    end

    # Copies exif metadata read by MiniExiftool (pages, size, mime type, author,
    # keywords, description) onto the asset.
    def self.update_asset_file_attributes(file_name, asset)
      exifdata = ::MiniExiftool.new(file_name)

      # Page count falls back from pagecount to pages to slides.
      pages = exifdata.pagecount
      pages = exifdata.pages if pages.blank?
      pages = exifdata.slides if pages.blank?
      asset.pages = pages
      asset.size = exifdata.filesize
      asset.mime_type = exifdata.mimetype
      asset.author = exifdata.author

      keywords = exifdata.keywords
      if keywords.is_a?(Enumerable)
        asset.keywords = keywords
      elsif !keywords.nil?
        # ';' and ':' are treated as additional separators besides ','.
        # to_s guards against a non-String scalar value.
        asset.keywords = keywords.to_s.split(/[;:,]/)
      end

      description = exifdata.description
      if description.is_a?(Enumerable)
        asset.subject = description.join(' ')
      elsif !description.nil?
        asset.subject = description.to_s.gsub(':', '')
      end

      Daengine.log("--- exif data for #{file_name} pages=#{asset.pages}, size = #{asset.size}", "info")
    end

    # HTTP headers sent with every service call.
    def self.header
      {
        'Accept' => 'application/json',
        'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3',
        'User-Agent' => 'Ruby'
      }
    end

    # Configured digital asset service URL.
    def self.service_uri
      Daengine.config[:digital_asset_service_url]
    end

    # Configured directory holding the digital asset files.
    def self.file_directory
      Daengine.config[:digital_assets_file_directory]
    end
  end

  # SAX handler that turns <data-tuple>/<tuple-field> elements into
  # ServiceDigitalAsset objects, grouped by digital_asset_id.
  class Document < Nokogiri::XML::SAX::Document
    DATA_TUPLE_ELEMENT = 'data-tuple'.freeze
    TUPLE_FIELD_ELEMENT = 'tuple-field'.freeze
    NAME_ATTRIBUTE = 'name'.freeze
    # Teamsite Date example: Dec 29 2008 12:00:00:000AM
    TIME_FORMAT = '%b %d %Y %k:%M:%S:%L%p'.freeze
    # Maps tuple-field names from the file to ServiceDigitalAsset attribute names.
    TRANSLATION = {
      'TeamSite/Metadata/rttTitle' => 'title',
      'TeamSite/Metadata/enterprise_last_updated_date' => 'changed_at',
      'TeamSite/Metadata/enterprise_audience_id' => 'audiences',
      'TeamSite/Metadata/enterprise_sami_desc' => 'sami_code',
      'TeamSite/Metadata/enterprise_last_publication_date' => 'published_at',
      'TeamSite/Metadata/enterprise_unpublish_date' => 'unpublished_at',
      'TeamSite/Metadata/enterprise_expiration_date' => 'expires_at',
      #'TeamSite/Metadata/enterprise_guid' => 'guid',
      'TeamSite/Metadata/enterprise_guid' => 'digital_asset_id',
      'TeamSite/Metadata/shortSynopsis' => 'summary',
      'TeamSite/Metadata/business_owner' => 'business_owner',
      'TeamSite/Metadata/enterprise_product_id' => 'product_ids',
      'TeamSite/Metadata/enterprise_content_organization_id' => 'content_organization_ids',
      'TeamSite/Metadata/enterprise_program_id' => 'program_ids',
      'TeamSite/Metadata/omnitureSiteSection_codes' => 'omniture_codes',
      'TeamSite/Metadata/display_on_website' => 'display_on_website',
      'path' => 'path',
      'TeamSite/Metadata/enterprise_last_content_update_date' => 'doc_changed_at',
      'TeamSite/Metadata/enterprise_content_type_id' => 'content_type'
    }.freeze

    # Hash of digital_asset_id => array of assets parsed so far.
    attr_reader :assets

    def initialize
      @asset = nil
      @field_name = nil
      # ''.dup yields a mutable buffer; #characters appends to it in place.
      @text = ''.dup
      @assets = {}
    end

    # SAX callback: a data-tuple starts a new asset, a tuple-field selects the
    # attribute the following text belongs to. The text buffer is reset for
    # every element.
    def start_element(name, attrs = [])
      case name
      when DATA_TUPLE_ELEMENT
        @asset = ::ServiceDigitalAsset.new
        @field_name = nil
      when TUPLE_FIELD_ELEMENT
        @field_name = translate_field_name(name_attr_value(attrs))
      end
      @text = ''.dup
    end

    # SAX callback: accumulates character data for the current element.
    def characters(string)
      @text << string
    end

    # SAX callback: closing a data-tuple files the asset under its id; closing
    # a tuple-field assigns the converted text to the asset if it has a matching
    # setter. Field name and text buffer are reset for every element.
    def end_element(name)
      case name
      when DATA_TUPLE_ELEMENT
        (@assets[@asset.digital_asset_id] ||= []) << @asset
        @asset = nil
      when TUPLE_FIELD_ELEMENT
        unless @field_name.nil?
          value = convert_value(@field_name, @text)
          setter_name = "#{@field_name}=".to_sym
          @asset.send(setter_name, value) if @asset.respond_to?(setter_name)
        end
      end
      @field_name = nil
      @text = ''.dup
    end

    # SAX callback: logs parser errors.
    def error(string)
      Daengine.log("***** Parse error - #{string} *****", 'error')
    end

    # SAX callback: logs parser warnings.
    def warning(string)
      Daengine.log("***** Parse warning - #{string} *****", 'warn')
    end

    # Converts +string+ according to the field's type on ServiceDigitalAsset
    # (boolean, time or array). If the conversion raises, the error is logged
    # and the raw string is returned; fields of no special type also return the
    # raw string.
    def convert_value(field_name, string)
      begin
        return convert_to_boolean(string) if boolean_field?(field_name)
        return convert_to_datetime(string) if datetime_field?(field_name)
        return convert_to_array(string) if array_field?(field_name)
      rescue => ex
        Daengine.log("***** convert-value('#{field_name}', '#{string}') - #{ex.inspect}", 'error')
      end
      string
    end

    def boolean_field?(field_name)
      ::ServiceDigitalAsset.boolean_field?(field_name)
    end

    def datetime_field?(field_name)
      ::ServiceDigitalAsset.time_field?(field_name)
    end

    def array_field?(field_name)
      ::ServiceDigitalAsset.array_field?(field_name)
    end

    # True when the string contains 'Y', '1' or 'true' anywhere.
    def convert_to_boolean(string)
      !!(string =~ /Y|1|true/)
    end

    # Parses the string with TIME_FORMAT; returns nil for blank input.
    def convert_to_datetime(string)
      Time.strptime(string, TIME_FORMAT) unless string.blank?
    end

    # Splits a comma-separated string; returns nil for nil input.
    def convert_to_array(string)
      string.try(:split, ',')
    end

    # Returns the value of the 'name' attribute from the SAX attribute list, or
    # nil when attrs is nil. Flattening first accepts both a flat
    # [name, value, ...] list and a list of [name, value] pairs.
    def name_attr_value(attrs)
      unless attrs.nil?
        hash = Hash[*attrs.flatten]
        hash[NAME_ATTRIBUTE]
      end
    end

    # Maps a tuple-field name to the asset attribute name; nil when unmapped.
    def translate_field_name(name)
      TRANSLATION[name]
    end
  end
end