Sha256: ae0de00e9289e7c64f1f2e06816df25e21ee55d6c98839d1b47618fde940ce42

Contents?: true

Size: 1.96 KB

Versions: 4

Compression:

Stored size: 1.96 KB

Contents

class Licensee
  module Matchers
    # Matches a file against the known licenses by word-set overlap: twice
    # the number of shared words divided by the combined size of both word
    # sets, expressed as a percentage (see #similarity).
    class Dice
      # file - object responding to #wordset; the matcher calls #size on that
      #        wordset and intersects it (&) with each license's wordset
      def initialize(file)
        @file = file
      end

      # Return the potential license with the highest similarity among those
      # scoring at or above Licensee.confidence_threshold, or nil when none
      # qualifies. The result (including nil) is computed once and reused.
      def match
        return @match if defined? @match

        qualifying = potential_licenses.each_with_object([]) do |license, found|
          score = similarity(license)
          found << [license, score] if score >= Licensee.confidence_threshold
        end

        # max_by yields nil for an empty list, which becomes the "no match" result
        best = qualifying.max_by { |_license, score| score }
        @match = best ? best.first : nil
      end

      # Licenses worth scoring, sorted in ascending order of their difference
      # in length to the file (closest in length first).
      # Licenses without a wordset are skipped, as are those whose length
      # difference exceeds max_delta.
      def potential_licenses
        return @potential_licenses if @potential_licenses

        within_range = Licensee.licenses(:hidden => true).select do |license|
          license.wordset && length_delta(license) <= max_delta
        end
        @potential_licenses = within_range.sort_by { |license| length_delta(license) }
      end

      # Absolute difference between the number of words in the file and the
      # number of words in the given license
      def length_delta(license)
        file_size = @file.wordset.size
        license_size = license.wordset.size
        (file_size - license_size).abs
      end

      # Largest length difference a license may have and still be considered
      # a potential license: the file's word count multiplied by
      # Licensee.confidence_threshold / 100.0 (so the result is a Float)
      def max_delta
        @max_delta ||= begin
          file_size = @file.wordset.size
          file_size * (Licensee.confidence_threshold / 100.0)
        end
      end

      # Similarity score of the matched license, or 0 when nothing matched
      def confidence
        @confidence ||= begin
          matched = match
          if matched
            similarity(matched)
          else
            0
          end
        end
      end

      private

      # Percent similarity between the file and a license:
      #   100 * (2 * shared words) / (file words + license words)
      # Evaluates to NaN when both wordsets are empty (0.0 / 0); NaN fails
      # the >= comparison in #match, so such a license is never selected.
      def similarity(license)
        file_words = @file.wordset
        license_words = license.wordset
        shared = (file_words & license_words).size
        combined = file_words.size + license_words.size
        100.0 * (shared * 2.0 / combined)
      end
    end
  end
end

Version data entries

4 entries across 4 versions & 1 rubygems

Version Path
licensee-6.1.1 lib/licensee/matchers/dice_matcher.rb
licensee-6.1.0 lib/licensee/matchers/dice_matcher.rb
licensee-6.0.0 lib/licensee/matchers/dice_matcher.rb
licensee-6.0.0b1 lib/licensee/matchers/dice_matcher.rb