lib/spandx/content.rb in spandx-0.1.7 vs lib/spandx/content.rb in spandx-0.2.0
- old
+ new
@@ -1,27 +1,38 @@
# frozen_string_literal: true
module Spandx
class Content
- attr_reader :tokens, :threshold
+ attr_reader :raw, :threshold
- def initialize(content, threshold: 89.0)
+ def initialize(raw, threshold: 89.0)
@threshold = threshold
- @tokens = tokenize(canonicalize(content)).to_set
+ @raw = raw
end
- def similar?(other)
- similarity_score(other) > threshold
+ def tokens
+ @tokens ||= tokenize(canonicalize(raw)).to_set
end
- # https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Dice%27s_coefficient#Ruby
- def similarity_score(other)
- overlap = (tokens & other.tokens).size
- total = tokens.size + other.tokens.size
- 100.0 * (overlap * 2.0 / total)
+ def similar?(other, algorithm: :dice_coefficient)
+ case algorithm
+ when :dice_coefficient
+ similarity_score(other) > threshold
+ when :levenshtein
+ similarity_score(other) < threshold
+ end
end
+ def similarity_score(other, algorithm: :dice_coefficient)
+ case algorithm
+ when :dice_coefficient
+ dice_coefficient(other)
+ when :levenshtein
+ Text::Levenshtein.distance(raw, other.raw, 100)
+ end
+ end
+
private
def canonicalize(content)
content&.downcase
end
@@ -30,8 +41,15 @@
content.to_s.scan(/[a-zA-Z]+/)
end
def blank?(content)
content.nil? || content.chomp.strip.empty?
+ end
+
+ # https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Dice%27s_coefficient#Ruby
+ def dice_coefficient(other)
+ overlap = (tokens & other.tokens).size
+ total = tokens.size + other.tokens.size
+ 100.0 * (overlap * 2.0 / total)
end
end
end