Sha256: c4f04e5324be20da943eab983c550bcecefbd46f6ca258f16ecc8b9ac9ad02f8
Contents?: true
Size: 1.41 KB
Versions: 1
Compression:
Stored size: 1.41 KB
Contents
# frozen_string_literal: true

# `tokens` relies on Enumerable#to_set, which needs the set library on
# Ruby versions that do not load it automatically.
require 'set'

module Spandx
  # Wraps a raw piece of text and compares it with other Content instances
  # using one of several similarity algorithms.
  class Content
    attr_reader :raw

    # @param raw [String, nil] the text to wrap
    def initialize(raw)
      @raw = raw
    end

    # The unique alphabetic words found in the lower-cased raw text.
    # The result is memoized.
    #
    # @return [Set<String>]
    def tokens
      @tokens ||= tokenize(canonicalize(raw)).to_set
    end

    # Decides whether `other` is close enough to this content.
    #
    # For :dice_coefficient and :jaro_winkler the score must exceed 89.0;
    # for :levenshtein the score must be below 3.
    #
    # @param other [Content] the content to compare against
    # @param algorithm [Symbol] :dice_coefficient, :levenshtein or :jaro_winkler
    # @return [Boolean, nil] nil when the algorithm is not recognised
    def similar?(other, algorithm: :dice_coefficient)
      case algorithm
      when :dice_coefficient
        similarity_score(other, algorithm: algorithm) > 89.0
      when :levenshtein
        similarity_score(other, algorithm: algorithm) < 3
      when :jaro_winkler
        similarity_score(other, algorithm: algorithm) > 89.0
      end
    end

    # Computes the score of `other` against this content.
    #
    # :dice_coefficient works on the token sets and yields a percentage in
    # 0.0..100.0. The other two algorithms work on the raw strings and
    # delegate to third-party gems that are loaded lazily on first use.
    #
    # @param other [Content] the content to compare against
    # @param algorithm [Symbol] :dice_coefficient, :levenshtein or :jaro_winkler
    # @return [Numeric, nil] nil when the algorithm is not recognised
    def similarity_score(other, algorithm: :dice_coefficient)
      case algorithm
      when :dice_coefficient
        dice_coefficient(other)
      when :levenshtein
        require 'text'

        # NOTE(review): the third argument is presumably the gem's
        # max-distance cap -- confirm against the text gem documentation.
        Text::Levenshtein.distance(raw, other.raw, 100)
      when :jaro_winkler
        require 'jaro_winkler'

        # NOTE(review): assumes the gem returns a 0..1 value, scaled here to
        # match the percentage range of the dice score -- confirm.
        JaroWinkler.distance(raw, other.raw) * 100.0
      end
    end

    private

    # Lower-cases the content; a nil content stays nil.
    def canonicalize(content)
      content&.downcase
    end

    # Splits the content into runs of ASCII letters; nil yields no tokens.
    def tokenize(content)
      content.to_s.scan(/[a-zA-Z]+/)
    end

    # True when the content is nil or contains only whitespace.
    def blank?(content)
      content.nil? || content.strip.empty?
    end

    # https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Dice%27s_coefficient#Ruby
    #
    # @param other [Content] the content to compare against
    # @return [Float] percentage overlap of the two token sets; 0.0 when
    #   neither side has any tokens
    def dice_coefficient(other)
      total = tokens.size + other.tokens.size
      # Without this guard the division below is 0.0 / 0, which is NaN.
      return 0.0 if total.zero?

      overlap = (tokens & other.tokens).size
      100.0 * (overlap * 2.0 / total)
    end
  end
end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
spandx-0.5.0 | lib/spandx/content.rb |