Sha256: edbeb6073d35d4f43c287b7ed6044725c9297c7464fde2a6a31cefb151295e56

Contents?: true

Size: 1.19 KB

Versions: 1

Compression:

Stored size: 1.19 KB

Contents

module Company
  module Mapping

    # CompanyMapper, given a corpus of documents (that contain company names),
    # can map a new document to an existing one, if a sufficiently similar one
    # exists.
    class CompanyMapper

      # Builds the TF-IDF model over the given corpus.
      #
      # @param corpus [#each] the documents to match against; every document
      #   must respond to +id+ (it is read while searching in #map)
      def initialize(corpus)
        @corpus = corpus
        @tfidf = TFIDF.new(@corpus)
        @tfidf.calculate
      end

      # Maps a given company to a company that exists in the corpus. If the
      # maximum name similarity found exceeds the given threshold, the id of
      # the most similar corpus document is returned as a match.
      #
      # @param company_doc [String, #id] either a raw company name (wrapped in
      #   a new TextDocument with the id "new_comp") or an already built
      #   document, which is used as-is
      # @param threshold [Numeric] the similarity the best match must strictly
      #   exceed
      # @return [Integer, nil] the matched document's id with everything from
      #   the first underscore onwards removed, converted with +to_i+; nil
      #   when no corpus document is similar enough
      #   (presumably corpus ids look like "<numeric id>_<suffix>" - confirm
      #   against the code that builds the corpus)
      def map(company_doc, threshold)
        company = to_document(company_doc)
        @tfidf.calculate_tfidf_weights_of_new_document(company)

        max_sim = 0.0
        mapped_company = nil
        @corpus.each do |doc|
          similarity = @tfidf.similarity(doc.id, company.id)
          next unless max_sim < similarity

          max_sim = similarity
          mapped_company = doc.id
          # A similarity of exactly 1 is treated as a perfect match, so the
          # remaining documents are not scanned.
          break if max_sim == 1
        end

        # Without a candidate there is no id to report, whatever the threshold.
        return unless mapped_company && max_sim > threshold

        mapped_company.to_s.sub(/_.*/, "").to_i
      end

      private

      # Wraps a raw company name in a TextDocument; any other object is
      # assumed to already be a document and is returned unchanged.
      def to_document(company_doc)
        return company_doc unless company_doc.is_a?(String)

        document = TextDocument.new
        document.contents = company_doc
        document.id = "new_comp"
        document
      end
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
company-mapping-0.2.2 lib/company/mapping/company_mapper.rb