require 'nokogiri'
module AnswersEngine
  module Scraper
    # @abstract
    class Executor
      # Max allowed page size when query outputs (see #find_outputs).
      MAX_FIND_OUTPUTS_PER_PAGE = 500

      attr_accessor :filename, :gid, :job_id

      include AnswersEngine::Plugin::ContextExposer

      def exec_parser(save=false)
        raise "should be implemented in subclass"
      end

      def init_page()
        if job_id
          puts "getting Job Page"
          init_job_page
        else
          puts "getting Global Page"
          init_global_page()
        end

      end

      def init_job_page()
        client = Client::JobPage.new()
        job_page = client.find(job_id, gid)
        unless job_page.code == 200
          raise "Job #{job_id} or GID #{gid} not found. Aborting execution!"
        else
          job_page
        end

      end

      def parsing_update(options={})
        client = Client::JobPage.new()
        job_id = options.fetch(:job_id)
        gid = options.fetch(:gid)

        client.parsing_update(job_id, gid, options)
      end

      def seeding_update(options={})
        client = Client::Job.new()
        job_id = options.fetch(:job_id)

        client.seeding_update(job_id, options)
      end

      def init_global_page()
        client = Client::GlobalPage.new()
        client.find(gid)
      end

      def get_content(gid)
        client = Client::GlobalPage.new()
        content_json = client.find_content(gid)

        if content_json['available']
          signed_url = content_json['signed_url']
          Client::BackblazeContent.new.get_gunzipped_content(signed_url)
        else
          nil
        end
      end

      def get_failed_content(gid)
        client = Client::GlobalPage.new()
        content_json = client.find_failed_content(gid)

        if content_json['available']
          signed_url = content_json['signed_url']
          Client::BackblazeContent.new.get_gunzipped_content(signed_url)
        else
          nil
        end
      end

      # Get current job id from scraper or default when scraper_name is null.
      #
      # @param [String|nil] scraper_name Scraper name.
      # @param [Integer|nil] default (nil) Default job id when no scraper name.
      #
      # @raise [Exception] When scraper name is not null, and scraper doesn't
      #   exists or it has no current job.
      def get_job_id scraper_name, default = nil
        return default if scraper_name.nil?
        job = Client::ScraperJob.new().find(scraper_name)
        raise JSON.pretty_generate(job) if job['id'].nil?
        job['id']
      end

      # Find outputs by collection and query with pagination.
      #
      # @param [String] collection ('default') Collection name.
      # @param [Hash] query ({}) Filters to query.
      # @param [Integer] page (1) Page number.
      # @param [Integer] per_page (100) Page size.
      # @param [Hash] opts ({}) Configuration options.
      # @option opts [String|nil] :scraper_name (nil) Scraper name to query
      #   from.
      # @option opts [Integer|nil] :job_id (nil) Job's id to query from.
      #
      # @raise [ArgumentError] +collection+ is not String.
      # @raise [ArgumentError] +query+ is not a Hash.
      # @raise [ArgumentError] +page+ is not an Integer greater than 0.
      # @raise [ArgumentError] +per_page+ is not an Integer between 1 and 500.
      #
      # @return [Array]
      #
      # @example
      #   find_outputs
      # @example
      #   find_outputs 'my_collection'
      # @example
      #   find_outputs 'my_collection', {}
      # @example
      #   find_outputs 'my_collection', {}, 1
      # @example
      #   find_outputs 'my_collection', {}, 1, 100
      # @example Find from another scraper by name
      #   find_outputs 'my_collection', {}, 1, 100, scraper_name: 'my_scraper'
      # @example Find from another scraper by job_id
      #   find_outputs 'my_collection', {}, 1, 100, job_id: 123
      #
      # @note *opts `:job_id` option is prioritize over `:scraper_name` when
      #   both exists. If none add provided or nil values, then current job
      #   will be used to query instead, this is the defaul behavior.
      def find_outputs(collection='default', query={}, page=1, per_page=100, opts = {})
        # Validate parameters out from nil for easier user usage.
        raise ArgumentError.new("collection needs to be a String") unless collection.is_a?(String)
        raise ArgumentError.new("query needs to be a Hash, instead of: #{query}") unless query.is_a?(Hash)
        unless page.is_a?(Integer) && page > 0
          raise ArgumentError.new("page needs to be an Integer greater than 0")
        end
        unless per_page.is_a?(Integer) && per_page > 0 && per_page <= MAX_FIND_OUTPUTS_PER_PAGE
          raise ArgumentError.new("per_page needs to be an Integer between 1 and #{MAX_FIND_OUTPUTS_PER_PAGE}")
        end

        options = {
          query: query,
          page: page,
          per_page: per_page}

        # Get job_id
        query_job_id = opts[:job_id] || get_job_id(opts[:scraper_name], self.job_id)

        client = Client::JobOutput.new(options)
        response = client.all(query_job_id, collection)

        if response.code != 200
          raise "response_code: #{response.code}|#{response.parsed_response}"
        end
        (response.body != 'null') ? response.parsed_response : []
      end

      # Find one output by collection and query with pagination.
      #
      # @param [String] collection ('default') Collection name.
      # @param [Hash] query ({}) Filters to query.
      # @param [Hash] opts ({}) Configuration options.
      # @option opts [String|nil] :scraper_name (nil) Scraper name to query
      #   from.
      # @option opts [Integer|nil] :job_id (nil) Job's id to query from.
      #
      # @raise [ArgumentError] +collection+ is not String.
      # @raise [ArgumentError] +query+ is not a Hash.
      #
      # @return [Hash|nil] `Hash` when found, and `nil` when no output is found.
      #
      # @example
      #   find_output
      # @example
      #   find_output 'my_collection'
      # @example
      #   find_output 'my_collection', {}
      # @example Find from another scraper by name
      #   find_output 'my_collection', {}, scraper_name: 'my_scraper'
      # @example Find from another scraper by job_id
      #   find_output 'my_collection', {}, job_id: 123
      #
      # @note *opts `:job_id` option is prioritize over `:scraper_name` when
      #   both exists. If none add provided or nil values, then current job
      #   will be used to query instead, this is the defaul behavior.
      def find_output(collection='default', query={}, opts = {})
        result = find_outputs(collection, query, 1, 1, opts)
        result.respond_to?(:first) ? result.first : nil
      end

      # Remove dups by prioritizing the latest dup.
      #
      # @param [Array] list List of hashes to dedup.
      # @param [Hash] key_defaults Key and default value pair hash to use on
      #   uniq validation.
      #
      # @return [Integer] Removed duplicated items count.
      def remove_old_dups!(list, key_defaults)
        raw_count = list.count
        keys = key_defaults.keys
        force_uniq = 0
        list.reverse!.uniq! do |item|
          # Extract stringify keys as hash
          key_hash = Hash[item.map{|k,v|keys.include?(k.to_s) ? [k.to_s,v] : nil}.select{|i|!i.nil?}]

          # Apply defaults for uniq validation
          key_defaults.each{|k,v| key_hash[k] = v if key_hash[k].nil?}

          # Don't dedup nil key defaults
          skip_dedup = !keys.find{|k| key_hash[k].nil?}.nil?
          skip_dedup ? (force_uniq += 1) : key_hash
        end
        list.reverse!
        dup_count = raw_count - list.count
        dup_count
      end

      # Remove page dups by prioritizing the latest dup.
      #
      # @param [Array] list List of pages to dedup.
      #
      # @return [Integer] Removed duplicated items count.
      #
      # @note It will not dedup for now as it is hard to build gid.
      #   TODO: Build gid so we can dedup
      def remove_old_page_dups!(list)
        key_defaults = {
          'gid' => nil
        }
        remove_old_dups! list, key_defaults
      end

      # Remove dups by prioritizing the latest dup.
      #
      # @param [Array] list List of outputs to dedup.
      #
      # @return [Integer] Removed duplicated items count.
      def remove_old_output_dups!(list)
        key_defaults = {
          '_id' => nil,
          '_collection' => 'default'
        }
        remove_old_dups! list, key_defaults
      end

      def save_pages_and_outputs(pages = [], outputs = [], status)
        total_pages = pages.count
        total_outputs = outputs.count
        records_per_slice = 100
        until pages.empty? && outputs.empty?
          pages_slice = pages.shift(records_per_slice)
          pages_dup_count = remove_old_page_dups! pages_slice
          outputs_slice = outputs.shift(records_per_slice)
          outputs_dup_count = remove_old_output_dups! outputs_slice

          log_msgs = []
          unless pages_slice.empty?
            page_dups_ignored = pages_dup_count > 0 ? " (#{pages_dup_count} dups ignored)" : ''
            log_msgs << "#{pages_slice.count} out of #{total_pages} Pages#{page_dups_ignored}"
            unless save
              puts '----------------------------------------'
              puts "Would have saved #{log_msgs.last}#{page_dups_ignored}"
              puts JSON.pretty_generate pages_slice
            end
          end

          unless outputs_slice.empty?
            output_dups_ignored = outputs_dup_count > 0 ? " (#{outputs_dup_count} dups ignored)" : ''
            log_msgs << "#{outputs_slice.count} out of #{total_outputs} Outputs#{output_dups_ignored}"
            unless save
              puts '----------------------------------------'
              puts "Would have saved #{log_msgs.last}#{output_dups_ignored}"
              puts JSON.pretty_generate outputs_slice
            end
          end

          next unless save
          log_msg = "Saving #{log_msgs.join(' and ')}."
          puts "#{log_msg}"

          # saving to server
          response = update_to_server(
            job_id: job_id,
            gid: gid,
            pages: pages_slice,
            outputs: outputs_slice,
            status: status)

          if response.code == 200
            log_msg = "Saved."
            puts "#{log_msg}"
          else
            puts "Error: Unable to save Pages and/or Outputs to server: #{response.body}"
            raise "Unable to save Pages and/or Outputs to server: #{response.body}"
          end
        end
      end

      def update_to_server(opts = {})
        raise "Implemented in Subclass"
      end

      def clean_backtrace(backtrace)
        i = backtrace.index{|x| x =~ /gems\/answersengine/i}
        if i.to_i < 1
          return []
        else
          return backtrace[0..(i-1)]
        end
      end

      def save_type
        raise NotImplementedError.new('Need to implement "save_type" method.')
      end

      # Saves pages from an array and clear it.
      #
      # @param [Array] pages ([]) Page array to save. Warning: all elements will
      #   be removed from the array.
      #
      # @note IMPORTANT: +pages+ array's elements will be removed.
      def save_pages(pages=[])
        save_pages_and_outputs(pages, [], save_type)
      end

      # Saves outputs from an array and clear it.
      #
      # @param [Array] outputs ([]) Output array to save. Warning: all elements
      #   will be removed from the array.
      #
      # @note IMPORTANT: +outputs+ array's elements will be removed.
      def save_outputs(outputs=[])
        save_pages_and_outputs([], outputs, save_type)
      end

      # Eval a filename with a custom binding
      #
      # @param [String] file_path File path to read.
      # @param [Binding] context Context binding to evaluate with.
      #
      # @note Using this method will allow scripts to contain `return` to
      #   exit the script sooner along some improved security.
      def eval_with_context file_path, context
        eval(File.read(file_path), context, file_path)
      end
    end
  end
end