require "date"
require "json"
require "uri"

module Maltese
  # Queries a search API for works updated within a date range, adds one
  # "/works/<doi>" link per result to a sitemap, and uploads the sitemap
  # through an S3 adapter.
  #
  # Relies on `presence` / `present?`, which are not core Ruby; they are
  # expected to be provided by the host application (ActiveSupport).
  #
  # NOTE(review): the source this was restored from was collapsed onto two
  # lines and had several expressions truncated. Every reconstructed spot is
  # marked with NOTE(review) below and should be confirmed against the
  # upstream file.
  class Sitemap
    attr_reader :sitemap_bucket, :sitemap_url, :from_date, :until_date

    # @param attributes [Hash] optional :sitemap_bucket, :sitemap_url,
    #   :from_date and :until_date (dates are interpolated into the query as
    #   "YYYY-MM-DD" strings, see #get_query_url)
    def initialize(attributes={})
      # Read the clock once so both defaults refer to the same day.
      today = Date.today

      @sitemap_bucket = attributes[:sitemap_bucket] || ""
      @sitemap_url = attributes[:sitemap_url] || ""
      # NOTE(review): both default expressions were truncated in the source
      # (only "( - " survived). Defaulting to yesterday..today in ISO 8601
      # is an assumption -- confirm.
      @from_date = attributes[:from_date].presence || (today - 1).iso8601
      @until_date = attributes[:until_date].presence || today.iso8601
    end

    # Load ENV variables from the container environment if the JSON file
    # exists. Runs once, when the class body is evaluated. A file of two
    # bytes or fewer (e.g. "{}") is skipped.
    env_json_file = "/etc/container_environment.json"
    if File.size?(env_json_file).to_i > 2
      # NOTE(review): the argument of JSON.parse was truncated in the source;
      # reading env_json_file is the evident intent -- confirm.
      env_vars = JSON.parse(File.read(env_json_file))
      env_vars.each { |k, v| ENV[k] = v }
    end

    # @return [String] base URL of the search API, ending in "?"
    def search_path
      "#{sitemap_url}/api?"
    end

    # @return [String] host under which the sitemap files are served
    def sitemaps_host
      "http://#{sitemap_bucket}"
    end

    # @return [String] path prefix for the sitemap files
    def sitemaps_path
      'sitemaps/'
    end

    # @return [Integer] timeout passed along with each page request
    def timeout
      120
    end

    # @return [Integer] number of rows requested per page
    def job_batch_size
      50_000
    end

    # Memoized sitemap builder; `finalize: false` because #push_data
    # finalizes explicitly.
    #
    # NOTE(review): the constructor name was truncated in the source.
    # SitemapGenerator::LinkSet is inferred from the option names and from
    # the calls made on the result (add, finalize!, sitemap, sitemap_index)
    # -- confirm.
    def sitemap
      @sitemap ||= SitemapGenerator::LinkSet.new(
        default_host: sitemap_url,
        adapter: s3_adapter,
        sitemaps_host: sitemaps_host,
        sitemaps_path: sitemaps_path,
        finalize: false
      )
    end

    # Adapter used by #sitemap to write files to the bucket.
    #
    # NOTE(review): the constructor and the key of the 'AWS' argument were
    # truncated in the source; SitemapGenerator::S3Adapter with
    # `fog_provider:` is inferred from the remaining fog_* options -- confirm.
    def s3_adapter
      SitemapGenerator::S3Adapter.new(
        fog_provider: 'AWS',
        aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
        aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'],
        fog_directory: sitemap_bucket,
        fog_region: ENV['AWS_REGION']
      )
    end

    # Storage connection; in this file it is only used for `sync_clock`.
    #
    # NOTE(review): the constructor and the key of the 'AWS' argument were
    # truncated in the source; Fog::Storage with `provider:` is inferred
    # from the method name and the `sync_clock` call -- confirm.
    def fog_storage
      Fog::Storage.new(
        provider: 'AWS',
        aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
        aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY']
      )
    end

    # Counts the matching works and, if there are any, processes them and
    # prints the value returned by #process_data; otherwise prints a notice.
    #
    # @param options [Hash] forwarded to #get_total and #process_data
    # @return [Integer] number of works found
    def queue_jobs(options={})
      total = get_total(options)

      if total > 0
        puts process_data(options.merge(total: total))
      else
        puts "No works found for date range #{from_date} - #{until_date}."
      end

      # return number of works queued
      total
    end

    # Asks the API for zero rows, just to read the hit count.
    #
    # @return [Integer] "numFound" from the response, 0 if absent
    def get_total(options={})
      query_url = get_query_url(options.merge(rows: 0))
      result = Maremma.get(query_url, options)
      result.body.fetch("data", {}).fetch("response", {}).fetch("numFound", 0)
    end

    # Builds the search URL for one page of results. Does not modify the
    # hash it is given.
    #
    # @param options [Hash] :offset (converted with to_i, so nil becomes 0)
    #   and :rows (defaults to #job_batch_size when blank)
    # @return [String] #search_path followed by the encoded query string
    def get_query_url(options={})
      offset = options[:offset].to_i
      rows = options[:rows].presence || job_batch_size

      updated = "updated:[#{from_date}T00:00:00Z TO #{until_date}T23:59:59Z]"
      fq = "#{updated} AND has_metadata:true AND is_active:true"

      params = { q: "*:*",
                 fq: fq,
                 start: offset,
                 rows: rows,
                 fl: "doi,updated",
                 sort: "updated asc",
                 wt: "json" }
      search_path + URI.encode_www_form(params)
    end

    # Fetches every page of results, adds the links to the sitemap and
    # uploads it. Does not modify the hash it is given.
    #
    # @param options [Hash] must contain :total (number of works to page over)
    # @return [Integer] value returned by #push_data
    def process_data(options = {})
      # NOTE(review): the right-hand side was truncated in the source;
      # Time.now is inferred from the time_taken subtraction in #push_data.
      start_time = Time.now

      # walk through paginated results
      total_pages = (options[:total].to_f / job_batch_size).ceil

      (0...total_pages).each do |page|
        page_options = options.merge(offset: page * job_batch_size, timeout: timeout)
        parse_data(get_data(page_options))
      end

      push_data(options.merge(start_time: start_time))
    end

    # Fetches one page of results.
    def get_data(options={})
      query_url = get_query_url(options)
      Maremma.get(query_url, options)
    end

    # Adds one sitemap link per document in the response.
    #
    # @param result [#body] response whose body is a Hash
    # @return [Object] the "errors" value when the body carries a non-blank
    #   one (nothing is added in that case), otherwise the link count of the
    #   current sitemap file
    def parse_data(result, options={})
      return result.body.fetch("errors") if result.body.fetch("errors", nil).present?

      items = result.body.fetch("data", {}).fetch('response', {}).fetch('docs', nil)
      Array(items).each do |item|
        loc = "/works/" + item.fetch("doi")
        sitemap.add loc, changefreq: "monthly", lastmod: item.fetch("updated")
      end
      sitemap.sitemap.link_count
    end

    # Finalizes the sitemap, which writes it out through the adapter.
    # Does not modify the hash it is given.
    #
    # @param options [Hash] optional :start_time used for the time_taken stat
    # @return [Integer] link count of the current sitemap file
    def push_data(options={})
      # sync time with AWS S3 before uploading
      fog_storage.sync_clock

      sitemap.finalize!
      # NOTE(review): both clock expressions were truncated in the source
      # ("||= " and "=> - options[:start_time]"); Time.now is assumed.
      start_time = options[:start_time] || Time.now
      sitemap.sitemap_index.stats_summary(:time_taken => Time.now - start_time)
      sitemap.sitemap.link_count
    end
  end
end