require 'phash'
module Phash
# FFI mirror of the native TxtHashPoint struct: one element of the array
# returned by ph_texthash.
class TxtHashPoint < FFI::Struct
  layout(
    :hash,  :uint64, # hash value of this point
    :index, :off_t   # respective index into the hashed file
  )
end
# FFI mirror of the native TxtMatch struct: one element of the array
# returned by ph_compare_text_hashes.
class TxtMatch < FFI::Struct
  layout(
    :index_a, :off_t,  # where the match starts on the first hash's side
    :index_b, :off_t,  # where the match starts on the second hash's side
    :length,  :uint32  # number of consecutive positions covered by the match
  )
end
# Binding for ph_texthash: textual hash for a file.
#
# param filename - char* name of file
# param nbpoints - int* out-parameter receiving the length of the returned array
# return TxtHashPoint* array of hash points with respective index into file.
#
# C prototype:
#   TxtHashPoint* ph_texthash(const char *filename, int *nbpoints);
#
# NOTE(review): attached with :blocking => true, FFI's option for long-running
# native calls; see the ruby-ffi documentation for its effect on threading.
attach_function :ph_texthash, [:string, :pointer], :pointer, :blocking => true
# Binding for ph_compare_text_hashes: compare two text hashes.
#
# param hash1 - TxtHashPoint* first hash
# param N1 - int length of hash1
# param hash2 - TxtHashPoint* second hash
# param N2 - int length of hash2
# param nbmatches - int* out-parameter receiving the number of matches found
# return TxtMatch* - list of all matches
#
# C prototype:
#   TxtMatch* ph_compare_text_hashes(TxtHashPoint *hash1, int N1, TxtHashPoint *hash2, int N2, int *nbmatches);
#
# NOTE(review): attached with :blocking => true, FFI's option for long-running
# native calls; see the ruby-ffi documentation for its effect on threading.
attach_function :ph_compare_text_hashes, [:pointer, :int, :pointer, :int, :pointer], :pointer, :blocking => true
# Binding for a native free(pointer) returning void. text_similarity calls it
# to release the TxtMatch array returned by ph_compare_text_hashes.
# NOTE(review): which library this symbol resolves from depends on the ffi_lib
# declaration, which is not in this file section — confirm it is the allocator
# pHash used.
attach_function :free, [:pointer], :void
class << self
# Get text file hash using ph_texthash.
#
# @param path [#to_s] path of the text file to hash
# @return [TextHash, nil] wrapper around the native TxtHashPoint array and the
#   number of points in it, or nil when ph_texthash returns a NULL pointer
def text_hash(path)
  hash_data_length_p = FFI::MemoryPointer.new :int
  begin
    hash_data = ph_texthash(path.to_s, hash_data_length_p)
    # FFI returns a Pointer object even for NULL, so plain truthiness would
    # never detect a failed call; NULL has to be checked explicitly.
    return if hash_data.nil? || hash_data.null?

    TextHash.new(hash_data, hash_data_length_p.get_int(0))
  ensure
    # Release the out-parameter buffer on every path, not only on success.
    hash_data_length_p.free
  end
end
# Get similarity between two text hashes using ph_compare_text_hashes.
#
# Every match reported by the native call marks `length` consecutive positions
# as covered on both sides, starting at index_a / index_b. The result is the
# mean of the covered fractions of the two hashes.
#
# @param hash_a [TextHash] first hash
# @param hash_b [TextHash] second hash
# @return [Float, nil] mean coverage of both hashes (an empty hash counts as
#   0.0 coverage instead of producing NaN), or nil when
#   ph_compare_text_hashes returns a NULL pointer
# @raise [ArgumentError] if either argument is not a TextHash
def text_similarity(hash_a, hash_b)
  raise ArgumentError, 'hash_a is not a TextHash' unless hash_a.is_a?(TextHash)
  raise ArgumentError, 'hash_b is not a TextHash' unless hash_b.is_a?(TextHash)

  matches_length_p = FFI::MemoryPointer.new :int
  data = nil
  begin
    data = ph_compare_text_hashes(hash_a.data, hash_a.length, hash_b.data, hash_b.length, matches_length_p)
    # FFI returns a Pointer object even for NULL, so plain truthiness would
    # never detect a failed call; NULL has to be checked explicitly.
    return if data.nil? || data.null?

    matched_a = Array.new(hash_a.length)
    matched_b = Array.new(hash_b.length)
    matches_length_p.get_int(0).times do |n|
      # The native result is a contiguous array of TxtMatch structs.
      match = TxtMatch.new(data + n * TxtMatch.size)
      index_a = match[:index_a]
      index_b = match[:index_b]
      match[:length].times do |i|
        matched_a[index_a + i] = true
        matched_b[index_b + i] = true
      end
    end

    # Guard the division so an empty hash yields 0.0 rather than NaN.
    coverage = lambda do |flags, total|
      total.zero? ? 0.0 : flags.count(true) / total.to_f
    end
    (coverage.call(matched_a, hash_a.length) + coverage.call(matched_b, hash_b.length)) * 0.5
  ensure
    # Release the native match array even if processing raised part-way.
    free(data) unless data.nil? || data.null?
    matches_length_p.free
  end
end
end
# Stores a text hash so it can be compared to another one. text_hash builds
# instances from the pointer returned by ph_texthash and the number of points.
# Adds nothing of its own; all behaviour comes from HashData.
class TextHash < HashData; end
# Stores the hash of a text file so it can be compared to another one.
# Adds nothing of its own; all behaviour comes from FileHash.
class Text < FileHash; end
end