Sha256: f0b2913878114f1979b84136b27b046c8f3d92bcdb635488671ba52eeb55af2f

Contents?: true

Size: 1.54 KB

Versions: 8

Compression:

Stored size: 1.54 KB

Contents

# encoding: utf-8
# Original author: Wilker LĂșcio <wilkerlucio@gmail.com>

require "set"

module Text

  # Ruby implementation of the string similarity described by Simon White
  # at: http://www.catalysoft.com/articles/StrikeAMatch.html
  #
  #                        2 * |pairs(s1) INTERSECT pairs(s2)|
  #   similarity(s1, s2) = -----------------------------------
  #                            |pairs(s1)| + |pairs(s2)|
  #
  # e.g.
  #                                             2 * |{FR, NC}|
  #   similarity(FRANCE, FRENCH) = ---------------------------------------
  #                                |{FR,RA,AN,NC,CE}| + |{FR,RE,EN,NC,CH}|
  #
  #                              = (2 * 2) / (5 + 5)
  #
  #                              = 0.4
  #
  #   WhiteSimilarity.new.similarity("FRANCE", "FRENCH")
  #
  class WhiteSimilarity

    def self.similarity(str1, str2)
      new.similarity(str1, str2)
    end

    def initialize
      @word_letter_pairs = {}
    end

    def similarity(str1, str2)
      pairs1 = word_letter_pairs(str1)
      pairs2 = word_letter_pairs(str2).dup

      union = pairs1.length + pairs2.length

      intersection = 0
      pairs1.each do |pair1|
        if index = pairs2.index(pair1)
          intersection += 1
          pairs2.delete_at(index)
        end
      end

      (2.0 * intersection) / union
    end

  private
    def word_letter_pairs(str)
      @word_letter_pairs[str] ||=
        str.upcase.split(/\s+/).map{ |word|
          (0 ... (word.length - 1)).map { |i| word[i, 2] }
        }.flatten.freeze
    end
  end
end

Version data entries

8 entries across 8 versions & 1 rubygems

Version Path
text-1.3.1 lib/text/white_similarity.rb
text-1.3.0 lib/text/white_similarity.rb
text-1.2.3 lib/text/white_similarity.rb
text-1.2.2 lib/text/white_similarity.rb
text-1.0.5 lib/text/white_similarity.rb
text-1.2.1 lib/text/white_similarity.rb
text-1.2.0 lib/text/white_similarity.rb
text-1.0.4 lib/text/white_similarity.rb