Sha256: 3795c9f8b379bdd916f410682840b1fcc9ba192e84cdb376c2462d3e8f366c2b

Contents?: true

Size: 1.22 KB

Versions: 4

Compression:

Stored size: 1.22 KB

Contents

# frozen_string_literal: true

module Spandx
  # Wraps a piece of raw text and measures how similar it is to another
  # Content instance, either by comparing their sets of word tokens (Dice
  # coefficient) or by edit distance between the raw strings (Levenshtein).
  class Content
    attr_reader :raw, :threshold

    # @param raw [String, nil] the text to compare
    # @param threshold [Numeric] cutoff used by {#similar?}; for
    #   :dice_coefficient it is a minimum percentage, for :levenshtein it is a
    #   maximum edit distance
    def initialize(raw, threshold: 89.0)
      @threshold = threshold
      @raw = raw
    end

    # The set of distinct, lower-cased alphabetic words found in +raw+.
    # Memoized; a nil +raw+ yields an empty set.
    #
    # @return [Set<String>]
    def tokens
      @tokens ||= tokenize(canonicalize(raw)).to_set
    end

    # Decides whether +other+ is close enough to this content.
    #
    # @param other [Content] the content to compare against
    # @param algorithm [Symbol] :dice_coefficient (higher score is closer) or
    #   :levenshtein (lower score is closer)
    # @return [Boolean, nil] nil when +algorithm+ is not recognised
    def similar?(other, algorithm: :dice_coefficient)
      case algorithm
      when :dice_coefficient
        # The score is a percentage overlap, so it must exceed the threshold.
        similarity_score(other, algorithm: algorithm) > threshold
      when :levenshtein
        # The score is an edit distance, so it must stay below the threshold.
        similarity_score(other, algorithm: algorithm) < threshold
      end
    end

    # Computes the raw score for the chosen algorithm.
    #
    # @param other [Content] the content to compare against
    # @param algorithm [Symbol] :dice_coefficient or :levenshtein
    # @return [Float, Integer, nil] a percentage in 0.0..100.0 for
    #   :dice_coefficient, the value returned by Text::Levenshtein.distance for
    #   :levenshtein, or nil when +algorithm+ is not recognised
    def similarity_score(other, algorithm: :dice_coefficient)
      case algorithm
      when :dice_coefficient
        dice_coefficient(other)
      when :levenshtein
        # to_s keeps a nil raw from crashing, matching how #tokens treats nil.
        # The third argument (100) is handed to the text gem as-is; presumably
        # it is the maximum-distance cap -- confirm against the gem's docs.
        Text::Levenshtein.distance(raw.to_s, other.raw.to_s, 100)
      end
    end

    private

    # Lower-cases the content; passes nil through untouched.
    def canonicalize(content)
      content&.downcase
    end

    # Splits the content into runs of ASCII letters; nil becomes an empty list.
    def tokenize(content)
      content.to_s.scan(/[a-zA-Z]+/)
    end

    # True when the content is nil or contains nothing but whitespace.
    def blank?(content)
      content.nil? || content.chomp.strip.empty?
    end

    # Dice coefficient over the two token sets, scaled to a percentage.
    # Returns 0.0 when neither side has any tokens, so the division below
    # never produces NaN.
    #
    # https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Dice%27s_coefficient#Ruby
    def dice_coefficient(other)
      total = tokens.size + other.tokens.size
      return 0.0 if total.zero?

      overlap = (tokens & other.tokens).size
      100.0 * (overlap * 2.0 / total)
    end
  end
end

Version data entries

4 entries across 4 versions & 1 rubygems

Version Path
spandx-0.4.1 lib/spandx/content.rb
spandx-0.4.0 lib/spandx/content.rb
spandx-0.3.0 lib/spandx/content.rb
spandx-0.2.0 lib/spandx/content.rb