lib/answersengine/scraper/executor.rb in answersengine-0.2.33 vs lib/answersengine/scraper/executor.rb in answersengine-0.3.0

- old
+ new

@@ -189,33 +189,92 @@
       def find_output(collection='default', query={}, opts = {})
         result = find_outputs(collection, query, 1, 1, opts)
         result.respond_to?(:first) ? result.first : nil
       end

+      # Remove dups by prioritizing the latest dup.
+      #
+      # @param [Array] list List of hashes to dedup.
+      # @param [Hash] key_defaults Key and default value pair hash to use on
+      #   uniq validation.
+      #
+      # @return [Integer] Removed duplicated items count.
+      def remove_old_dups!(list, key_defaults)
+        raw_count = list.count
+        keys = key_defaults.keys
+        force_uniq = 0
+        list.reverse!.uniq! do |item|
+          # Extract stringify keys as hash
+          key_hash = Hash[item.map{|k,v|keys.include?(k.to_s) ? [k.to_s,v] : nil}.select{|i|!i.nil?}]
+
+          # Apply defaults for uniq validation
+          key_defaults.each{|k,v| key_hash[k] = v if key_hash[k].nil?}
+
+          # Don't dedup nil key defaults
+          skip_dedup = !keys.find{|k| key_hash[k].nil?}.nil?
+          skip_dedup ? (force_uniq += 1) : key_hash
+        end
+        list.reverse!
+        dup_count = raw_count - list.count
+        dup_count
+      end
+
+      # Remove page dups by prioritizing the latest dup.
+      #
+      # @param [Array] list List of pages to dedup.
+      #
+      # @return [Integer] Removed duplicated items count.
+      #
+      # @note It will not dedup for now as it is hard to build gid.
+      #   TODO: Build gid so we can dedup
+      def remove_old_page_dups!(list)
+        key_defaults = {
+          'gid' => nil
+        }
+        remove_old_dups! list, key_defaults
+      end
+
+      # Remove dups by prioritizing the latest dup.
+      #
+      # @param [Array] list List of outputs to dedup.
+      #
+      # @return [Integer] Removed duplicated items count.
+      def remove_old_output_dups!(list)
+        key_defaults = {
+          '_id' => nil,
+          '_collection' => 'default'
+        }
+        remove_old_dups! list, key_defaults
+      end
+
       def save_pages_and_outputs(pages = [], outputs = [], status)
         total_pages = pages.count
         total_outputs = outputs.count
         records_per_slice = 100
         until pages.empty? && outputs.empty?
           pages_slice = pages.shift(records_per_slice)
+          pages_dup_count = remove_old_page_dups! pages_slice
           outputs_slice = outputs.shift(records_per_slice)
+          outputs_dup_count = remove_old_output_dups! outputs_slice

           log_msgs = []
           unless pages_slice.empty?
-            log_msgs << "#{pages_slice.count} out of #{total_pages} Pages"
+            page_dups_ignored = pages_dup_count > 0 ? " (#{pages_dup_count} dups ignored)" : ''
+            log_msgs << "#{pages_slice.count} out of #{total_pages} Pages#{page_dups_ignored}"
             unless save
               puts '----------------------------------------'
-              puts "Would have saved #{log_msgs.last}"
+              puts "Would have saved #{log_msgs.last}#{page_dups_ignored}"
               puts JSON.pretty_generate pages_slice
             end
           end

           unless outputs_slice.empty?
-            log_msgs << "#{outputs_slice.count} out of #{total_outputs} Outputs"
+            output_dups_ignored = outputs_dup_count > 0 ? " (#{outputs_dup_count} dups ignored)" : ''
+            log_msgs << "#{outputs_slice.count} out of #{total_outputs} Outputs#{output_dups_ignored}"
             unless save
               puts '----------------------------------------'
-              puts "Would have saved #{log_msgs.last}"
+              puts "Would have saved #{log_msgs.last}#{output_dups_ignored}"
               puts JSON.pretty_generate outputs_slice
             end
           end

           next unless save
@@ -277,10 +336,10 @@
         save_pages_and_outputs([], outputs, save_type)
       end
       # Eval a filename with a custom binding
       #
-      # @param [String] filename File path to read.
+      # @param [String] file_path File path to read.
       # @param [Binding] context Context binding to evaluate with.
       #
       # @note Using this method will allow scripts to contain `return` to
       #   exit the script sooner along some improved security.
       def eval_with_context file_path, context
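
The core of this change is the new `remove_old_dups!` helper: it reverses the list, runs `uniq!` keyed on a small hash of selected fields (with defaults filled in), then reverses back, so the latest occurrence of each duplicate survives. Any record still missing a key whose default is nil is given a unique placeholder and is never treated as a duplicate, which is why `remove_old_page_dups!` (keyed on 'gid', which pages don't carry yet per the @note) is effectively a no-op for now. Below is a minimal, self-contained sketch of that behaviour: `remove_old_dups!` is copied from the diff above, while the sample records and the call using the '_id'/'_collection' defaults (mirroring `remove_old_output_dups!`) are hypothetical.

# Sketch only: remove_old_dups! is copied from the 0.3.0 diff above;
# the sample records and the call below are invented for illustration.
def remove_old_dups!(list, key_defaults)
  raw_count = list.count
  keys = key_defaults.keys
  force_uniq = 0
  list.reverse!.uniq! do |item|
    # Extract stringify keys as hash
    key_hash = Hash[item.map{|k,v|keys.include?(k.to_s) ? [k.to_s,v] : nil}.select{|i|!i.nil?}]

    # Apply defaults for uniq validation
    key_defaults.each{|k,v| key_hash[k] = v if key_hash[k].nil?}

    # Don't dedup nil key defaults
    skip_dedup = !keys.find{|k| key_hash[k].nil?}.nil?
    skip_dedup ? (force_uniq += 1) : key_hash
  end
  list.reverse!
  dup_count = raw_count - list.count
  dup_count
end

# Same key defaults that remove_old_output_dups! passes in the diff.
outputs = [
  {'_id' => 'a1', '_collection' => 'products', 'price' => 10}, # older dup, dropped
  {'_id' => 'b2', 'title' => 'b'},                             # '_collection' compared as 'default'
  {'_id' => 'a1', '_collection' => 'products', 'price' => 12}, # latest dup, kept
  {'title' => 'no _id, so never treated as a dup'}
]

removed = remove_old_dups!(outputs, '_id' => nil, '_collection' => 'default')
puts removed        # => 1
puts outputs.count  # => 3; the price-12 record wins because the list is
                    # reversed before uniq!, so the last occurrence is kept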