lib/flay.rb in flay-1.0.0 vs lib/flay.rb in flay-1.1.0

- old
+ new

@@ -1,42 +1,91 @@ #!/usr/bin/env ruby -w -$: << "../../sexp_processor/dev/lib" # TODO: remove $: << "../../ruby_parser/dev/lib" require 'rubygems' require 'sexp_processor' require 'ruby_parser' require 'pp' # TODO: remove +$m ||= 16 +$v ||= false +$f ||= false + +if $v then + $: << "../../ruby2ruby/dev/lib" + require 'ruby2ruby' + require 'tempfile' +end + class Flay - VERSION = '1.0.0' + VERSION = '1.1.0' + attr_accessor :mass_threshold attr_reader :hashes def initialize(mass = 16) @hashes = Hash.new { |h,k| h[k] = [] } @mass_threshold = mass end def process(*files) files.each do |file| - warn "Processing #{file}..." + warn "Processing #{file}" - t = Time.now pt = RubyParser.new.process(File.read(file), file) - next unless pt # empty files... hahaha, suck. - t = Time.now - pt.deep_each do |node| - next unless node.any? { |sub| Sexp === sub } - next if node.mass < @mass_threshold + process_sexp pt + end - self.hashes[node.fuzzy_hash] << node + process_fuzzy_similarities if $f + end + + def process_sexp pt + pt.deep_each do |node| + next unless node.any? { |sub| Sexp === sub } + next if node.mass < self.mass_threshold + + self.hashes[node.fuzzy_hash] << node + end + end + + def process_fuzzy_similarities + all_hashes, detected = {}, {} + + self.hashes.values.each do |nodes| + nodes.each do |node| + next if node.mass > 4 * self.mass_threshold + # TODO: try out with fuzzy_hash + # all_hashes[node] = node.grep(Sexp).map { |s| [s.hash] * s.mass }.flatten + all_hashes[node] = node.grep(Sexp).map { |s| [s.hash] }.flatten end end + + # warn "looking for copy/paste/edit code across #{all_hashes.size} nodes" + + all_hashes = all_hashes.to_a + all_hashes.each_with_index do |(s1, h1), i| + similar = [s1] + all_hashes[i+1..-1].each do |(s2, h2)| + next if detected[h2] + intersection = h1.intersection h2 + max = [h1.size, h2.size].max + if intersection.size >= max * 0.60 then + similarity = s1.similarity(s2) + if similarity > 0.60 then + similar << s2 + detected[h2] = true + else + p [similarity, s1, s2] + end + end + end + + self.hashes[similar.first.hash].push(*similar) if similar.size > 1 + end end def prune # prune trees that aren't duped at all, or are too small self.hashes.delete_if { |_,nodes| nodes.size == 1 } @@ -53,34 +102,92 @@ # nuke subtrees so we show the biggest matching tree possible self.hashes.delete_if { |h,_| all_hashes[h] } end + def n_way_diff *data + data.each_with_index do |s, i| + c = (?A + i).chr + s.group = c + end + + max = data.map { |s| s.scan(/^.*/).size }.max + + data.map! { |s| # FIX: this is tarded, but I'm out of brain + c = s.group + s = s.scan(/^.*/) + s.push(*([""] * (max - s.size))) # pad + s.each do |o| + o.group = c + end + s + } + + groups = data[0].zip(*data[1..-1]) + groups.map! { |lines| + collapsed = lines.uniq + if collapsed.size == 1 then + " #{lines.first}" + else + # TODO: make r2r have a canonical mode (doesn't make 1-liners) + lines.reject { |l| l.empty? }.map { |l| "#{l.group}: #{l}" } + end + } + groups.flatten.join("\n") + end + def report prune = nil self.prune - self.hashes.sort_by { |_,nodes| - -(nodes.first.mass * nodes.size) - }.each do |_,nodes| + identical = {} + masses = {} + + self.hashes.each do |hash,nodes| + identical[hash] = nodes[1..-1].all? { |n| n == nodes.first } + masses[hash] = nodes.first.mass * nodes.size + masses[hash] *= (nodes.size) if identical[hash] + end + + count = 0 + masses.sort_by { |h,m| [-m, hashes[h].first.file] }.each do |hash,mass| + nodes = hashes[hash] next unless nodes.first.first == prune if prune puts + same = identical[hash] node = nodes.first - puts "Matches found in %p (mass = %d)" % - [node.first, nodes.size * node.mass] + n = nodes.size + match, bonus = if same then + ["IDENTICAL", "*#{n}"] + else + ["Similar", ""] + end - nodes.each do |node| - puts " #{node.file}:#{node.line}" + count += 1 + puts "%d) %s code found in %p (mass%s = %d)" % + [count, match, node.first, bonus, mass] + + nodes.each_with_index do |node, i| + if $v then + c = (?A + i).chr + puts " #{c}: #{node.file}:#{node.line}" + else + puts " #{node.file}:#{node.line}" + end end + + if $v then + puts + r2r = Ruby2Ruby.new + puts n_way_diff(*nodes.map { |s| r2r.process(s.deep_clone) }) + end end end end -class Symbol - def hash - @hash ||= self.to_s.hash - end +class String + attr_accessor :group end class Sexp def mass @mass ||= self.structure.flatten.size @@ -106,10 +213,11 @@ s += (l_lits & r_lits).size r += (r_lits - l_lits).size # TODO: I think this is wrong, since it isn't positional. What to do? l_sexp.zip(r_sexp).each do |l_sub, r_sub| + next unless l_sub && r_sub # HACK l2, s2, r2 = l_sub.compare_to r_sub l += l2 s += s2 r += r2 end @@ -141,18 +249,35 @@ next unless Sexp === sexp yield sexp end end +end - alias :old_inspect :inspect - def inspect - old_inspect.sub(/\)\Z/, ":h_#{self.fuzzy_hash})") +class Array + def intersection other + intersection, start = [], 0 + other_size = other.length + self.each_with_index do |m, i| + (start...other_size).each do |j| + n = other.at j + if m == n then + intersection << m + start = j + 1 + break + end + end + end + intersection end - alias :shut_up! :pretty_print - def pretty_print(q) # shows the hash TODO: remove - q.group(1, 'S(', ')') do - q.seplist(self + [":h_#{self.fuzzy_hash}"]) {|v| q.pp v } + def triangle # TODO: use? + max = self.size + (0...max).each do |i| + o1 = at(i) + (i+1...max).each do |j| + o2 = at(j) + yield o1, o2 + end end end end