Sha256: 388dbca9315a522c65b7302bd071037533035c38dde7738e4c121418e3ec7720

Contents?: true

Size: 1.22 KB

Versions: 7

Compression:

Stored size: 1.22 KB

Contents

# Ranks the word tokens that a diff-like text adds and removes.
#
# A line counts as an addition when it starts with "+" and as a removal when
# it starts with "-", provided the second character is not the same marker
# (so "+++ ..." and "--- ..." lines are ignored). Tokens that occur on both
# sides are discarded from both rankings, leaving only what is unique to the
# additions or to the removals.
class DiffTokenizer
  # A token is a run of two or more word characters (letters, digits, "_").
  TOKEN_PATTERN = /\w{2,}/.freeze

  # @param text [String, nil] the diff text; nil is treated as an empty diff
  def initialize(text)
    lines = text.to_s.split("\n")

    added_tokens = extract_tokens(select_lines_starting_with(lines, '+'))
    subtracted_tokens = extract_tokens(select_lines_starting_with(lines, '-'))

    # Array#- removes every occurrence of each element found in the other
    # array, so a token present on both sides disappears from both lists.
    @top = {
      adds: count_tokens(added_tokens - subtracted_tokens),
      subs: count_tokens(subtracted_tokens - added_tokens)
    }
  end

  # Returns up to +count+ tokens of the given ranking, most frequent first.
  #
  # @param count [Integer] maximum number of tokens to return
  # @param type [Symbol] :adds or :subs
  # @return [Array<String>]
  def top(count, type)
    @top[type].first(count).map(&:first)
  end

  # @param count [Integer] maximum number of tokens to return
  # @return [Array<String>] most frequent tokens found only in added lines
  def top_adds(count = 5)
    top(count, :adds)
  end

  # @param count [Integer] maximum number of tokens to return
  # @return [Array<String>] most frequent tokens found only in removed lines
  def top_subs(count = 5)
    top(count, :subs)
  end

  private

  # Keeps the lines that begin with +char+ followed by any other character.
  # The pattern is built once instead of once per line.
  def select_lines_starting_with(lines, char)
    escaped_char = Regexp.escape(char)
    lines.grep(Regexp.new("^#{escaped_char}[^#{escaped_char}]"))
  end

  # Collects every token of every line into one flat array, in reading order.
  def extract_tokens(lines)
    lines.flat_map { |line| line.scan(TOKEN_PATTERN) }
  end

  # Builds [token, count] pairs ordered by descending count. Tokens with equal
  # counts keep the order of their first appearance; sort_by alone is not
  # stable, so the insertion index is used as an explicit tie-breaker.
  def count_tokens(tokens)
    counts = Hash.new(0)
    tokens.each { |token| counts[token] += 1 }
    counts.each_with_index
          .sort_by { |(_token, occurrences), position| [-occurrences, position] }
          .map(&:first)
  end
end

Version data entries

7 entries across 7 versions & 1 rubygems

Version Path
hangover-0.0.7 lib/hangover/diff_tokenizer.rb
hangover-0.0.6 lib/hangover/diff_tokenizer.rb
hangover-0.0.5 lib/hangover/diff_tokenizer.rb
hangover-0.0.4 lib/hangover/diff_tokenizer.rb
hangover-0.0.3 lib/hangover/diff_tokenizer.rb
hangover-0.0.2 lib/hangover/diff_tokenizer.rb
hangover-0.0.1 lib/hangover/diff_tokenizer.rb