RubygemsResearch

Sha256: d7e5a710c94e9157ccaec28a35d7f73283f6d5802d0cde3d5e889a6a09b0884b

Contents?: true

Size: 1.94 KB

Versions: 25

Compression:

Stored size: 1.94 KB

class Licensee
  # Matches a file against the known licenses by Levenshtein edit distance
  # between the file's normalized content and each license's normalized body.
  class LevenshteinMatcher < Matcher

    # Return the first potential license whose similarity to the file meets or
    # exceeds the confidence threshold, or nil when none does.
    #
    # The search runs at most once per matcher: a separate flag records that it
    # has completed, so a "no match" (nil) result is remembered instead of
    # re-running every Levenshtein comparison on each call.
    def match
      unless @match_searched
        @match ||= potential_licenses.find do |license|
          similarity(license) >= Licensee::CONFIDENCE_THRESHOLD
        end
        @match_searched = true
      end
      @match
    end

    # Licenses whose length is close enough to the file's length to be worth
    # comparing, sorted in ascending order of length difference (closest first).
    # Difference in lengths cannot exceed the file's length * the confidence threshold / 100
    def potential_licenses
      @potential_licenses ||= begin
        candidates = Licensee.licenses.select { |license| length_delta(license) <= max_delta }
        candidates.sort_by { |license| length_delta(license) }
      end
    end

    # Calculate the absolute difference between the file length and a given
    # license's length
    def length_delta(license)
      (file_length - license.body_normalized.length).abs
    end

    # Maximum possible difference between file length and license length
    # for a license to be a potential license to be matched
    #
    # NOTE(review): an edit distance is never smaller than the length
    # difference, so a tighter bound of file_length * (100 - threshold) / 100
    # looks possible; kept as-is because this is a public value - confirm
    # before changing.
    def max_delta
      @max_delta ||= file_length * (Licensee::CONFIDENCE_THRESHOLD.to_f / 100.0)
    end

    # Confidence that the matched license is a match: the similarity of the
    # matched license, or 0 when nothing matched
    def confidence
      @confidence ||= match ? similarity(match) : 0
    end

    private

    # Length of the file's normalized content, as a Float so that the
    # similarity calculation uses floating-point division
    def file_length
      @file_length ||= file.content_normalized.length.to_f
    end

    # Calculate the percent similarity between file and potential license:
    # 100 * (file_length - distance) / file_length.
    #
    # Results are cached per license object (by identity) so that a license
    # already scored during #match is not run through Levenshtein again when
    # #confidence asks for the same value.
    def similarity(license)
      @similarities ||= {}.compare_by_identity
      @similarities[license] ||= 100 * (file_length - distance(license)) / file_length
    end

    # Calculate the levenshtein distance between file and license
    # Note: We used content/body normalized because white space and capitalization
    # isn't legally significant in this context. Fewer characters lets levenshtein
    # work faster. As long as they both undergo the same transformation, should match.
    def distance(license)
      Levenshtein.distance(license.body_normalized, file.content_normalized).to_f
    end
  end
end

Version data entries

25 entries across 25 versions & 1 rubygems

Version	Path
licensee-4.7.4	lib/licensee/matchers/levenshtein_matcher.rb
licensee-4.7.3	lib/licensee/matchers/levenshtein_matcher.rb
licensee-4.7.2	lib/licensee/matchers/levenshtein_matcher.rb
licensee-4.7.1	lib/licensee/matchers/levenshtein_matcher.rb
licensee-4.7.0	lib/licensee/matchers/levenshtein_matcher.rb
licensee-4.6.0	lib/licensee/matchers/levenshtein_matcher.rb
licensee-4.5.0	lib/licensee/matchers/levenshtein_matcher.rb
licensee-4.4.2	lib/licensee/matchers/levenshtein_matcher.rb
licensee-4.4.1	lib/licensee/matchers/levenshtein_matcher.rb
licensee-4.4.0	lib/licensee/matchers/levenshtein_matcher.rb
licensee-4.3.3	lib/licensee/matchers/levenshtein_matcher.rb
licensee-4.3.2	lib/licensee/matchers/levenshtein_matcher.rb
licensee-4.3.1	lib/licensee/matchers/levenshtein_matcher.rb
licensee-4.3.0	lib/licensee/matchers/levenshtein_matcher.rb
licensee-4.2.4	lib/licensee/matchers/levenshtein_matcher.rb
licensee-4.2.3	lib/licensee/matchers/levenshtein_matcher.rb
licensee-4.2.2	lib/licensee/matchers/levenshtein_matcher.rb
licensee-4.2.1	lib/licensee/matchers/levenshtein_matcher.rb
licensee-4.2.0	lib/licensee/matchers/levenshtein_matcher.rb
licensee-4.1.2	lib/licensee/matchers/levenshtein_matcher.rb