Sha256: 6f38a17a97df153f5953d8790a00f2c2b40c48246d7e4b44439c6e82f698ffbd

Contents?: true

Size: 1.98 KB

Versions: 7

Compression:

Stored size: 1.98 KB

Contents

module Licensee
  module Matchers
    # Matches a file against the known licenses by comparing word sets with
    # the Sorensen-Dice coefficient (see #similarity).
    #
    # The file and every license must respond to `wordset`, returning a
    # set-like collection of words, or nil.
    class Dice
      # file - the object to identify; only its `wordset` is read.
      def initialize(file)
        @file = file
      end

      # Returns the potential license with the highest similarity to the
      # file, provided that similarity reaches Licensee.confidence_threshold.
      # Returns nil when no candidate qualifies (including when the file has
      # no wordset). The result, nil included, is memoized.
      def match
        return @match if defined? @match
        threshold = Licensee.confidence_threshold
        scored = potential_licenses.map do |license|
          [license, similarity(license)]
        end
        qualifying = scored.select { |_license, score| score >= threshold }
        # On equal scores max_by keeps the earliest entry, i.e. the license
        # whose length is closest to the file's.
        best = qualifying.max_by { |_license, score| score }
        @match = best && best.first
      end

      # Returns the licenses worth scoring, sorted in ascending order by
      # difference in length to the file (closest first).
      #
      # A license is kept only when it has a wordset and its length delta
      # does not exceed max_delta. A file without a wordset has nothing to
      # compare, so it yields no candidates instead of raising.
      def potential_licenses
        @potential_licenses ||= if @file.wordset
          candidates = Licensee.licenses(hidden: true).select do |license|
            license.wordset && length_delta(license) <= max_delta
          end
          candidates.sort_by { |license| length_delta(license) }
        else
          []
        end
      end

      # Returns the absolute difference between the number of words in the
      # file's wordset and in the given license's wordset.
      # Both wordsets must be present; nil raises NoMethodError.
      def length_delta(license)
        (@file.wordset.size - license.wordset.size).abs
      end

      # Returns the largest length delta a license may have and still be
      # considered: the file's word count * the confidence threshold / 100.
      # The file's wordset must be present; nil raises NoMethodError.
      #
      # NOTE(review): this allowance grows with the threshold, so a stricter
      # threshold admits more candidates - confirm that this is intended.
      def max_delta
        @max_delta ||= (
          @file.wordset.size * (Licensee.confidence_threshold / 100.0)
        )
      end

      # Returns the similarity (0-100) between the file and the matched
      # license, or 0 when there is no match.
      def confidence
        @confidence ||= match ? similarity(match) : 0
      end

      private

      # Returns the percent similarity between the file and a license as a
      # Float: 100 * (2 * shared words) / (file words + license words).
      #
      # When both wordsets are empty the division yields NaN, which never
      # satisfies the threshold comparison in #match.
      def similarity(license)
        overlap = (@file.wordset & license.wordset).size
        total = @file.wordset.size + license.wordset.size
        100.0 * (overlap * 2.0 / total)
      end
    end
  end
end

Version data entries

7 entries across 7 versions & 1 rubygems

Version Path
licensee-8.3.1 lib/licensee/matchers/dice_matcher.rb
licensee-8.3.0 lib/licensee/matchers/dice_matcher.rb
licensee-8.2.0 lib/licensee/matchers/dice_matcher.rb
licensee-8.1.0 lib/licensee/matchers/dice_matcher.rb
licensee-8.0.0 lib/licensee/matchers/dice_matcher.rb
licensee-7.0.1 lib/licensee/matchers/dice_matcher.rb
licensee-7.0.0 lib/licensee/matchers/dice_matcher.rb