Sha256: fbe8a2a4923e80eb1c266be00df6a74cc01c22ba709ce66c63d9b31d7df3c8eb

Contents?: true

Size: 1.56 KB

Versions: 4

Compression:

Stored size: 1.56 KB

Contents

require 'set'
require 'digest'

module Licensee
  # Helpers for normalizing a body of license text and comparing it with
  # another body of text.
  #
  # The including object must respond to `content`, returning the raw text
  # as a String, or nil when there is none.
  module ContentHelper
    DIGEST = Digest::SHA1

    # A set of each word in the normalized content, without duplicates.
    # Returns nil when there is no content.
    def wordset
      @wordset ||= begin
        normalized = content_normalized
        normalized.scan(/[\w']+/).to_set if normalized
      end
    end

    # Number of characters in the normalized content (0 when there is none)
    def length
      content_normalized.to_s.length
    end

    # Number of characters that could be added/removed to still be
    # considered a potential match. Scales `length` by
    # Licensee.inverse_confidence_threshold and truncates to an Integer.
    def max_delta
      (length * Licensee.inverse_confidence_threshold).to_i
    end

    # Given another license or project file, calculates the difference in
    # length. `other` only needs to respond to `length`.
    def length_delta(other)
      (length - other.length).abs
    end

    # Given another license or project file, calculates the similarity
    # as a percentage of words in common (0.0..100.0).
    #
    # Returns 0.0 when either side has no content or when neither side
    # contains any words; without these guards the computation would raise
    # on a nil wordset or divide by zero and yield NaN.
    def similarity(other)
      mine = wordset
      theirs = other.wordset
      return 0.0 unless mine && theirs

      total = mine.size + theirs.size
      return 0.0 if total.zero?

      overlap = (mine & theirs).size
      100.0 * (overlap * 2.0 / total)
    end

    # SHA1 hex digest of the normalized content, or nil when there is no
    # content to digest.
    #
    # NOTE(review): this shadows Object#hash (which is expected to return an
    # Integer); the name is kept so existing callers keep working.
    def hash
      normalized = content_normalized
      return unless normalized
      @hash ||= DIGEST.hexdigest(normalized)
    end

    # Content with copyright header and linebreaks removed.
    # Returns nil when there is no content; the result is memoized.
    def content_normalized
      return unless content
      @content_normalized ||= begin
        text = content.downcase.strip
        # gsub! returns nil when nothing matched, so the calls are not chained
        text.gsub!(/^#{Matchers::Copyright::REGEX}$/i, '')
        text.gsub!(/[=-]{4,}/, '') # Strip HRs from MPL
        # Collapse line breaks and runs of spaces into single spaces
        text.tr("\n", ' ').squeeze(' ').strip
      end
    end
  end
end

Version data entries

4 entries across 4 versions & 1 rubygems

Version Path
licensee-8.7.0 lib/licensee/content_helper.rb
licensee-8.6.1 lib/licensee/content_helper.rb
licensee-8.6.0 lib/licensee/content_helper.rb
licensee-8.5.0 lib/licensee/content_helper.rb