lib/flay.rb in flay-1.0.0 vs lib/flay.rb in flay-1.1.0
- old
+ new
@@ -1,42 +1,91 @@
#!/usr/bin/env ruby -w
-$: << "../../sexp_processor/dev/lib" # TODO: remove
$: << "../../ruby_parser/dev/lib"
require 'rubygems'
require 'sexp_processor'
require 'ruby_parser'
require 'pp' # TODO: remove
+# Global switches. `||=` only assigns when the variable is still nil/false,
+# so values set before this file is loaded are kept.
+# $m: defaults to 16 -- presumably the mass threshold (it matches the default
+#     of Flay#initialize); it is not read anywhere in the code shown here.
+# $v: when true, ruby2ruby and tempfile are loaded below, and Flay#report
+#     labels each location and prints an n-way diff of the matching code.
+# $f: when true, Flay#process also runs process_fuzzy_similarities.
+$m ||= 16
+$v ||= false
+$f ||= false
+
+if $v then
+ $: << "../../ruby2ruby/dev/lib"
+ require 'ruby2ruby'
+ require 'tempfile'
+end
+
class Flay
- VERSION = '1.0.0'
+ VERSION = '1.1.0'
+ attr_accessor :mass_threshold
attr_reader :hashes
+ # Sets up an empty hash table whose missing keys each start out as their
+ # own empty array, and stores +mass+ as the mass threshold.
def initialize(mass = 16)
@hashes = Hash.new { |h,k| h[k] = [] }
@mass_threshold = mass
end
+ # Reads and parses each of +files+ with RubyParser and hands the resulting
+ # tree to process_sexp; a file whose parse result is nil is skipped.
+ # Once every file has been handled, process_fuzzy_similarities is run
+ # when the global $f is set.
def process(*files)
files.each do |file|
- warn "Processing #{file}..."
+ warn "Processing #{file}"
- t = Time.now
pt = RubyParser.new.process(File.read(file), file)
-
next unless pt # empty files... hahaha, suck.
- t = Time.now
- pt.deep_each do |node|
- next unless node.any? { |sub| Sexp === sub }
- next if node.mass < @mass_threshold
+ process_sexp pt
+ end
- self.hashes[node.fuzzy_hash] << node
+ process_fuzzy_similarities if $f
+ end
+
+ # Walks the parse tree +pt+ and files each qualifying node in the hash
+ # table under its fuzzy hash. A node qualifies when it contains at least
+ # one nested Sexp and its mass is not below mass_threshold.
+ def process_sexp pt
+ pt.deep_each do |sexp|
+ has_children = sexp.any? { |child| Sexp === child }
+ next unless has_children && sexp.mass >= mass_threshold
+
+ hashes[sexp.fuzzy_hash] << sexp
+ end
+ end
+
+ # Second pass, run by #process when $f is set: groups nodes that are
+ # similar to each other and files every group found in the hash table
+ # under the plain #hash of its first member.
+ #
+ # Candidates are the already-bucketed nodes whose mass is at most four
+ # times mass_threshold. Each candidate is described by the list of hashes
+ # of its nested Sexps. Two candidates are grouped when the intersection
+ # of their lists is at least 60% of the longer list and their
+ # similarity is above 0.60.
+ def process_fuzzy_similarities
+ all_hashes, detected = {}, {}
+
+ self.hashes.values.each do |nodes|
+ nodes.each do |node|
+ next if node.mass > 4 * self.mass_threshold
+ # TODO: try out with fuzzy_hash
+ # all_hashes[node] = node.grep(Sexp).map { |s| [s.hash] * s.mass }.flatten
+ all_hashes[node] = node.grep(Sexp).map { |s| [s.hash] }.flatten
end
end
+
+ # warn "looking for copy/paste/edit code across #{all_hashes.size} nodes"
+
+ all_hashes = all_hashes.to_a
+ all_hashes.each_with_index do |(s1, h1), i|
+ similar = [s1]
+ all_hashes[i+1..-1].each do |(s2, h2)|
+ # skip nodes whose child-hash list already joined an earlier group
+ next if detected[h2]
+ intersection = h1.intersection h2
+ max = [h1.size, h2.size].max
+ if intersection.size >= max * 0.60 then
+ similarity = s1.similarity(s2)
+ if similarity > 0.60 then
+ similar << s2
+ detected[h2] = true
+ elsif $DEBUG then
+ # Near miss: enough shared children, but not similar enough.
+ # Diagnostic only -- goes to stderr so it never mixes with the report.
+ warn [similarity, s1, s2].inspect
+ end
+ end
+ end
+
+ self.hashes[similar.first.hash].push(*similar) if similar.size > 1
+ end
end
def prune
# prune trees that aren't duped at all, or are too small
self.hashes.delete_if { |_,nodes| nodes.size == 1 }
@@ -53,34 +102,92 @@
# nuke subtrees so we show the biggest matching tree possible
self.hashes.delete_if { |h,_| all_hashes[h] }
end
+ # Builds a line-by-line comparison of the given strings and returns it
+ # as a single newline-joined String ("" when no strings are given).
+ #
+ # Every input is split into lines and the shorter ones are padded with
+ # empty lines. For each line position:
+ # * if all inputs agree, the line is emitted once, prefixed with a space;
+ # * otherwise each non-empty variant is emitted as "X: line", where X is
+ #   A for the first argument, B for the second, and so on.
+ #
+ # The arguments themselves are not modified.
+ def n_way_diff *data
+ return "" if data.empty?
+
+ columns = data.map { |s| s.scan(/^.*/) }
+ max = columns.map { |lines| lines.size }.max
+ columns.each { |lines| lines.concat([""] * (max - lines.size)) } # pad
+
+ rows = columns.first.zip(*columns[1..-1])
+ rows.map! { |lines|
+ if lines.uniq.size == 1 then
+ " #{lines.first}"
+ else
+ # TODO: make r2r have a canonical mode (doesn't make 1-liners)
+ labelled = []
+ lines.each_with_index do |line, i|
+ next if line.empty? # padding and blank lines are not reported
+ # ?A.ord is 65 whether ?A is an Integer (1.8.7) or a String (1.9+)
+ labelled << "#{(?A.ord + i).chr}: #{line}"
+ end
+ labelled
+ end
+ }
+ rows.flatten.join("\n")
+ end
+
+ # Prunes the hash table, then prints one numbered entry per remaining
+ # bucket, ordered by descending mass and then by the file of the
+ # bucket's first node.
+ #
+ # A bucket's mass is (mass of its first node) * (number of nodes); when
+ # every node equals the first one the bucket is reported as IDENTICAL and
+ # its mass is multiplied by the node count once more, otherwise it is
+ # reported as Similar.
+ #
+ # prune:: when given, only buckets whose first node begins with this
+ #         value are printed.
+ #
+ # With $v set, each location is labelled with a letter and an n-way diff
+ # of the Ruby2Ruby output of the nodes follows the locations.
def report prune = nil
self.prune
- self.hashes.sort_by { |_,nodes|
- -(nodes.first.mass * nodes.size)
- }.each do |_,nodes|
+ identical = {}
+ masses = {}
+
+ self.hashes.each do |hash,nodes|
+ identical[hash] = nodes[1..-1].all? { |n| n == nodes.first }
+ masses[hash] = nodes.first.mass * nodes.size
+ masses[hash] *= (nodes.size) if identical[hash]
+ end
+
+ count = 0
+ masses.sort_by { |h,m| [-m, hashes[h].first.file] }.each do |hash,mass|
+ nodes = hashes[hash]
next unless nodes.first.first == prune if prune
puts
+ same = identical[hash]
node = nodes.first
- puts "Matches found in %p (mass = %d)" %
- [node.first, nodes.size * node.mass]
+ n = nodes.size
+ match, bonus = if same then
+ ["IDENTICAL", "*#{n}"]
+ else
+ ["Similar", ""]
+ end
- nodes.each do |node|
- puts " #{node.file}:#{node.line}"
+ count += 1
+ puts "%d) %s code found in %p (mass%s = %d)" %
+ [count, match, node.first, bonus, mass]
+
+ nodes.each_with_index do |node, i|
+ if $v then
+ # NOTE(review): `?A + i` assumes ?A is an Integer (Ruby 1.8 character
+ # literal); on Ruby 1.9+ ?A is a String -- confirm the target Ruby.
+ c = (?A + i).chr
+ puts " #{c}: #{node.file}:#{node.line}"
+ else
+ puts " #{node.file}:#{node.line}"
+ end
end
+
+ if $v then
+ puts
+ r2r = Ruby2Ruby.new
+ puts n_way_diff(*nodes.map { |s| r2r.process(s.deep_clone) })
+ end
end
end
end
-class Symbol
- def hash
- @hash ||= self.to_s.hash
- end
+# Reopens String to give every instance a read/write +group+ attribute
+# (nil until something assigns it).
+class String
+ attr_accessor :group
end
class Sexp
def mass
@mass ||= self.structure.flatten.size
@@ -106,10 +213,11 @@
s += (l_lits & r_lits).size
r += (r_lits - l_lits).size
# TODO: I think this is wrong, since it isn't positional. What to do?
l_sexp.zip(r_sexp).each do |l_sub, r_sub|
+ next unless l_sub && r_sub # HACK
l2, s2, r2 = l_sub.compare_to r_sub
l += l2
s += s2
r += r2
end
@@ -141,18 +249,35 @@
next unless Sexp === sexp
yield sexp
end
end
+end
- alias :old_inspect :inspect
- def inspect
- old_inspect.sub(/\)\Z/, ":h_#{self.fuzzy_hash})")
+class Array
+ # Returns the elements of self that can be matched, in order, against
+ # +other+: each element is searched for only past the position of the
+ # previous match, so relative order matters and no position of +other+
+ # is matched twice.
+ def intersection other
+ common = []
+ cursor = 0
+ limit = other.length
+ each do |item|
+ hit = (cursor...limit).find { |idx| item == other.at(idx) }
+ next if hit.nil?
+ common << item
+ cursor = hit + 1
+ end
+ common
end
- alias :shut_up! :pretty_print
- def pretty_print(q) # shows the hash TODO: remove
- q.group(1, 'S(', ')') do
- q.seplist(self + [":h_#{self.fuzzy_hash}"]) {|v| q.pp v }
+ # Yields each pair of elements (earlier, later) exactly once, i.e. every
+ # combination of two positions i < j, in row-major order.
+ def triangle # TODO: use?
+ count = length
+ count.times do |row|
+ left = at(row)
+ (row + 1).upto(count - 1) do |col|
+ yield left, at(col)
+ end
end
end
end