lib/licensee/matchers/levenshtein_matcher.rb in licensee-5.0.0b4 vs lib/licensee/matchers/levenshtein_matcher.rb in licensee-5.0.0b5
- old
+ new
@@ -2,19 +2,30 @@
class LevenshteinMatcher < Matcher
# Return the first potential license that is more similar than the confidence threshold
# Walks potential_licenses in order and returns the first license whose
# similarity meets Licensee.confidence_threshold, or nil when none does.
# A license is skipped outright when its body is known to contain its own
# name (or nickname) but the file under test does not.
# The result is memoized in @match; because `||=` treats nil as unset, a
# search that finds nothing is not cached and runs again on the next call.
def match
@match ||= potential_licenses.find do |license|
+
+ # If we know the license text contains the license name or nickname,
+ # bail early unless the file we're checking contains it.
+ # Guards against OSL & AFL confusion. See https://github.com/benbalter/licensee/issues/50
+ next if license.body_includes_name? && !includes_license_name?(license)
+ next if license.body_includes_nickname? && !includes_license_nickname?(license)
+
similarity(license) >= Licensee.confidence_threshold
end
end
# Sort all licenses, in ascending order, by difference in length to the file (closest first)
# Difference in lengths cannot exceed the file's length * the confidence threshold / 100
# Candidate licenses for comparison, memoized in @potential_licenses.
# Starts from Licensee.licenses(:hidden => true), keeps those whose
# length_delta is at most max_delta, and orders them by length_delta
# (sort_by, so the smallest delta comes first).
# The added (+) lines also drop any license whose body_normalized is
# nil/false before the length check; the removed (-) line did not.
def potential_licenses
@potential_licenses ||= begin
- Licensee.licenses(:hidden => true).select { |license| length_delta(license) <= max_delta }.sort_by { |l| length_delta(l) }
+ licenses = Licensee.licenses(:hidden => true)
+ licenses = licenses.select do |license|
+ license.body_normalized && length_delta(license) <= max_delta
+ end
+ licenses.sort_by { |l| length_delta(l) }
end
end
# Calculate the difference between the file length and a given license's length
def length_delta(license)
@@ -48,8 +59,16 @@
# Note: We used content/body normalized because white space and capitalization
# aren't legally significant in this context. Fewer characters lets Levenshtein
# work faster. As long as they both undergo the same transformation, they should match.
# Returns, as a Float (via to_f), the Levenshtein distance between the
# license's normalized body and the file's normalized content.
def distance(license)
Levenshtein.distance(license.body_normalized, file.content_normalized).to_f
+ end
+
# True when the file's normalized content contains the license's
# version-less name, compared after downcasing the name.
# NOTE(review): presumably content_normalized is already lowercased, which
# is why only the name is downcased here -- confirm against the file class.
+ def includes_license_name?(license)
+ file.content_normalized.include?(license.name_without_version.downcase)
+ end
+
# Truthy when the license has a nickname and the file's normalized content
# contains that nickname (downcased). When license.nickname is nil the `&&`
# short-circuits and the method returns nil rather than false.
+ def includes_license_nickname?(license)
+ license.nickname && file.content_normalized.include?(license.nickname.downcase)
end
end
end