#!/usr/bin/env ruby
# frozen_string_literal: true

# RMagick registers itself as 'rmagick' in current releases; older releases
# only answer to the capitalised 'RMagick'. A missing gem is tolerated at load
# time so the pure-Ruby helpers (serialize / deserialize) stay usable without
# ImageMagick; read_image raises a descriptive LoadError as soon as an image
# actually has to be decoded.
%w[rmagick RMagick].any? do |feature|
  begin
    require feature
    true
  rescue LoadError
    false
  end
end

# progressbar is only needed by do_make_progress_bar, so it is optional too.
begin
  require 'progressbar'
rescue LoadError
  nil
end

require 'base64'

# Perceptual image comparison: every image is normalised, cropped and shrunk,
# then reduced to a per-channel histogram ("cube"); two images are scored by
# the Euclidean distance between their histograms.
class DupeMagick
  # Geometry used when the caller does not supply :geometry.
  DEFAULT_GEOMETRY = '8x8!'
  # Sigma handed to gaussian_blur when :blur is set without :blur_radius.
  DEFAULT_BLUR_SIGMA = 1.0
  # Channel keys of a cube, in the order samples appear in an exported vector.
  CHANNELS = [:r, :g, :b].freeze
  # Number of histogram bins per channel (bins are numbered 1..BIN_COUNT).
  BIN_COUNT = 8
  # Share of each image dimension trimmed from every edge before resizing.
  BORDER_PERCENT = 10
  # Earlier versions summed every squared bin difference BIN_COUNT**2 times;
  # the factor is kept so scores and the plain-language thresholds that were
  # tuned against them stay the same.
  LEGACY_DISTANCE_SCALE = BIN_COUNT * BIN_COUNT

  attr_accessor :source_vector, :target_vector, :source_cube, :target_cube, :distance

  # Compares two image files and prints a plain-language verdict.
  #
  # @param source [String] path of the reference image
  # @param target [String] path of the image to compare against it
  # @param params [Hash] optional :geometry (default DEFAULT_GEOMETRY),
  #   :quantize (truthy to reduce to 32 colours), :blur (truthy to blur) and
  #   :blur_radius (sigma used for the blur)
  # @return [Integer] the distance score truncated to an integer
  def compare_images(source, target, params = {})
    source_image = process_image(source, params)
    target_image = process_image(target, params)
    @source_vector, @source_cube = fingerprint(source_image)
    @target_vector, @target_cube = fingerprint(target_image)
    report_distance
  end

  # Scores every image found under target_path against source_file and prints
  # one verdict line per candidate.
  #
  # NOTE(review): the printed line does not name the candidate file; the text
  # is left untouched in case something parses it.
  #
  # @param source_file [String] path of the reference image
  # @param target_path [String] a directory or a glob pattern
  # @return [Array<String>] the candidate files that were scored
  def find_duplicates(source_file, target_path)
    params = { :geometry => DEFAULT_GEOMETRY }
    @source_vector, @source_cube = fingerprint(process_image(source_file, params))
    files_in_directory(target_path).each do |target|
      @target_vector, @target_cube = fingerprint(process_image(target, params))
      report_distance
    end
  end

  # Encodes an object as Base64 text, e.g. for storing image data as a blob.
  #
  # @param obj [Object] any Marshal-dumpable object
  # @return [String] Base64 encoded Marshal dump
  def serialize(obj)
    Base64.encode64(Marshal.dump(obj))
  end

  # Restores an object produced by #serialize.
  #
  # SECURITY: Marshal.load can instantiate arbitrary objects; only feed it
  # blobs this application wrote itself, never user-supplied data.
  #
  # @param obj [String] Base64 text returned by #serialize
  # @return [Object] the original object
  def deserialize(obj)
    # Undo the steps of #serialize in reverse order: Base64 first, Marshal second.
    Marshal.load(Base64.decode64(obj))
  end

  private

  # Loads the first frame of an image file.
  #
  # @raise [LoadError] when RMagick could not be loaded
  def read_image(file)
    raise LoadError, "the 'rmagick' gem is required to read images" unless defined?(Magick)

    Magick::Image.read(file).first
  end

  # Lists candidate files. A directory yields the regular, non-hidden files
  # directly inside it; anything else is treated as a glob pattern.
  #
  # @return [Array<String>] sorted file paths
  def files_in_directory(path)
    candidates =
      if File.directory?(path)
        visible = Dir.entries(path).reject { |name| name.start_with?('.') }
        visible.map { |name| File.join(path, name) }
      else
        Dir[path]
      end
    # Sub-directories cannot be decoded as images, so only plain files are kept.
    candidates.select { |entry| File.file?(entry) }.sort
  end

  # Builds a progress bar; needs the optional 'progressbar' gem.
  #
  # @raise [LoadError] when the gem could not be loaded
  def do_make_progress_bar(title, total)
    raise LoadError, "the 'progressbar' gem is required for progress bars" unless defined?(ProgressBar)

    ProgressBar.new(title, total)
  end

  # Reads an image file and prepares it for fingerprinting.
  #
  # @param image [String] path of the image file
  # @param params [Hash] see #compare_images
  # @return [Magick::Image] the normalised, cropped and resized image
  def process_image(image, params)
    picture = read_image(image).normalize

    # Trim the border from all four edges. excerpt takes (x, y, width, height),
    # so the kept area is the image size minus two margins per dimension.
    margin_x = picture.columns * BORDER_PERCENT / 100
    margin_y = picture.rows * BORDER_PERCENT / 100
    picture = picture.excerpt(margin_x, margin_y,
                              picture.columns - 2 * margin_x,
                              picture.rows - 2 * margin_y)

    # Optionally reduce the image to 32 colours.
    picture = picture.quantize(32, Magick::RGBColorspace) if params[:quantize]
    # Optionally blur the image.
    picture = picture.gaussian_blur(0.0, params.fetch(:blur_radius, DEFAULT_BLUR_SIGMA)) if params[:blur]

    geometry = params.fetch(:geometry, DEFAULT_GEOMETRY)
    picture.change_geometry(geometry) { |cols, rows, img| img.resize!(cols, rows) }
  end

  # @return [Array(Array, Hash)] the pixel vector of an image and its cube
  def fingerprint(image)
    vector = create_vector_from_image(image)
    [vector, create_cube_from_vector(vector)]
  end

  # Flattens an image into interleaved R, G, B samples.
  def create_vector_from_image(image)
    image.export_pixels(0, 0, image.columns, image.rows, "RGB")
  end

  # Builds one histogram per channel from an interleaved RGB vector.
  #
  # @param vector [Array<Integer>] samples in R, G, B, R, G, B, ... order
  # @return [Hash] { :r => {bin => count}, :g => {...}, :b => {...} }; every
  #   histogram answers 0 for bins that were never hit
  def create_cube_from_vector(vector)
    cube = {}
    CHANNELS.each_with_index do |channel, offset|
      histogram = Hash.new 0
      array_mod(vector, CHANNELS.length, offset).each { |sample| histogram[which_bin(sample)] += 1 }
      cube[channel] = histogram
    end
    cube
  end

  # Maps a sample to its bin (1..8), or nil when it lies outside 0..65535.
  # Presumably samples are 16-bit quantum values; on an 8-bit ImageMagick
  # build everything would land in bin 1 - TODO confirm.
  def which_bin(channel)
    case channel
    when 0..8191 then 1
    when 8192..16383 then 2
    when 16384..24575 then 3
    when 24576..32767 then 4
    when 32768..40959 then 5
    when 40960..49151 then 6
    when 49152..57343 then 7
    when 57344..65535 then 8
    end
  end

  # Picks every mod-th element of arr, starting at index offset.
  # The input array is left untouched.
  #
  # @return [Array] the selected elements
  def array_mod(arr, mod, offset = 0)
    arr.drop(offset).each_slice(mod).map(&:first)
  end

  # Scaled Euclidean distance between two cubes.
  #
  # @return [Float] 0.0 for identical histograms, growing with dissimilarity
  def calculate_euclidian_distance(source_cube, target_cube)
    squared = 0
    CHANNELS.each do |channel|
      1.upto(BIN_COUNT) do |bin|
        squared += (target_cube[channel][bin] - source_cube[channel][bin])**2
      end
    end
    Math.sqrt(LEGACY_DISTANCE_SCALE * squared)
  end

  # Translates a distance into words. Half-open ranges are used so fractional
  # scores fall into the same class as the truncated score that gets printed.
  def euclidian_plain_language(distance)
    case distance
    when 0 then "identical"
    when 0...51 then "similar"
    when 51...151 then "possibly similar"
    else "different"
    end
  end

  # Scores the current source/target cubes, prints the verdict and returns
  # the truncated score.
  def report_distance
    @distance = calculate_euclidian_distance(@source_cube, @target_cube)
    puts "Images are #{euclidian_plain_language(@distance)}, score: #{@distance.to_i}"
    @distance.to_i
  end
end