module Eco
  module API
    module Organization
      # Class to find out duplicates in the People Manager.
      #
      # It extends the `People` collection with fuzzy-matching helpers
      # (mixed in from `Eco::Data::FuzzyMatch`) and keeps three pieces of
      # configuration: the `attribute` to compare, the `order` of the scoring
      # algorithms and the `threshold` results must comply with.
      #
      # @attr_writer attribute [String, Symbol, Proc, nil] the target attribute to be read.
      # @attr_writer order [Array<Symbol>] the algorithms the results should be ordered by.
      # @attr_writer threshold [Float] the threshold the algorithms should comply with.
      class PeopleSimilarity < Eco::API::Organization::People
        include Eco::Data::FuzzyMatch

        # @!group Config

        # Sets the target attribute to be read.
        # A `Proc` is called with the person; anything else is used as a method name.
        attr_writer :attribute

        # Defines the order of relevance of the per-person matches.
        # * Possible values: `:dice`, `:levenshtein`, `:jaro_winkler`, `:ngrams`,
        #   `:words_ngrams`, `:chars_position`
        attr_writer :order

        # Defines the threshold that all of the algorithms should comply with.
        attr_writer :threshold

        # @return [String, Symbol, Proc] the target attribute to be read (defaults to `:name`).
        def attribute
          @attribute ||= :name
        end

        # @return [Array<Symbol>] the algorithms' results to order by
        #   (defaults to `[:words_ngrams, :dice]`).
        def order
          @order ||= [:words_ngrams, :dice]
        end

        # @return [Float] the threshold the algorithms should comply with (defaults to `0.15`).
        def threshold
          @threshold ||= 0.15
        end

        # Returns the target value to analyse.
        # @param person [Ecoportal::API::V1::Person]
        # @return [Object, nil] the result of calling the `attribute` Proc with `person`,
        #   or the value of the `attribute` method on `person`; `nil` when `person`
        #   does not respond to it.
        def item_value(person)
          return attribute.call(person) if attribute.is_a?(Proc)

          attr = attribute.to_sym
          person.public_send(attr) if person.respond_to?(attr)
        end

        # Generates a new object with same config but different base `data`.
        # @param data the base data handed over to the parent's `newFrom`.
        # @return [Eco::API::Organization::PeopleSimilarity]
        def newFrom(data)
          super(data).tap do |simil|
            simil.threshold = threshold
            simil.order     = order
            simil.attribute = attribute
          end
        end

        # @!endgroup

        # @!group Searchers

        # It gathers those that have the same `email`.
        # @return [Hash] where `keys` are `email`s and `values` an `Array`
        def repeated_emails
          init_caches
          @by_email.select { |_email, people| people.count > 1 }
        end

        # It returns all people with no name (stripped `name` shorter than 2 characters).
        # @return [Eco::API::Organization::PeopleSimilarity]
        def unnamed
          newFrom(select { |person| person.name.to_s.strip.length < 2 })
        end

        # It returns all people that do have a name (stripped `name` of 2 or more characters).
        # @return [Eco::API::Organization::PeopleSimilarity]
        def named
          newFrom(reject { |person| person.name.to_s.strip.length < 2 })
        end

        # It returns all the entries with `attribute` empty
        # (stripped value shorter than 2 characters).
        # @return [Eco::API::Organization::PeopleSimilarity]
        def blank_attribute
          newFrom(select { |person| item_value(person).to_s.strip.length < 2 })
        end

        # It returns all the entries with `attribute` **not** empty
        # (stripped value of 2 or more characters).
        # @return [Eco::API::Organization::PeopleSimilarity]
        def attribute_present
          newFrom(reject { |person| item_value(person).to_s.strip.length < 2 })
        end

        # @!endgroup

        # @!group Analysis starters

        # Analyses People based on `options`.
        # @param needle_read [Proc, Symbol] when the value to read from `needle` object is
        #   different to the `:read` (`attribute`). This allows to, for example, facet
        #   `needle.name` (needle_read) against `haystack_item.details[alt_id]` (read).
        # @param keep_empty [Boolean] to indicate if it should keep people with no results
        #   (based on threshold); by default they are removed.
        # @param options [Hash] forwarded to `find_all_with_score`; `:read` defaults to `attribute`.
        # @return [Hash] where the _keys_ are the people `id`s and the _values_ the
        #   `Eco::Data::FuzzyMatch::Results`
        def analyse(needle_read: nil, keep_empty: false, **options)
          options = { read: attribute }.merge(options)
          total   = count
          i       = 1

          analysed = each_with_object({}) do |person, results|
            needle_str = needle_read ? item_string(person, needle_read) : nil
            results[person.id] = find_all_with_score(person, needle_str: needle_str, **options)
            print_progress("Analysed", total, i)
            i += 1
          end

          keep_empty ? analysed : clean_empty(analysed)
        end

        # @!endgroup

        # @!group Results Treatment

        # Gets a new instance object of this class, with only people in results.
        # @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the
        #   `Eco::Data::FuzzyMatch::Results`
        # @return [Eco::API::Organization::PeopleSimilarity]
        def newSimilarity(analysed)
          newFrom(people_in_results(analysed))
        end

        # Collects, without repetitions, every person referenced by `analysed`:
        # the person behind each `id` followed by the `match` of each of its results.
        # @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the
        #   `Eco::Data::FuzzyMatch::Results`
        # @return [Array] the people found, in order of first appearance.
        def people_in_results(analysed)
          analysed.each_with_object([]) do |(id, results), people|
            related = results.each_with_object([self[id]]) do |result, group|
              group << result.match
            end
            related.each { |person| people << person unless people.include?(person) }
          end
        end

        # Removes from results those that do not have similar entries.
        # @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the
        #   `Eco::Data::FuzzyMatch::Results`
        # @return [Hash] only the entries whose results are not `empty?`
        def clean_empty(analysed)
          analysed.reject { |_id, results| results.empty? }
        end

        # Helper to do some treatment of the results.
        # @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the
        #   `Eco::Data::FuzzyMatch::Results`
        # @param keep_empty [Boolean] whether entries with empty results should be kept.
        # @yield [person, results] the treatment to apply to each entry
        # @yieldparam person the person with the given `id` (`self[id]`)
        # @yieldparam results the results associated to that `id`
        # @yieldreturn the new results to be stored for that `id`
        # @return [Hash] where the _keys_ are the people `id`s and _values_ the
        #   `Eco::Data::FuzzyMatch::Results`
        def with_analysed(analysed, keep_empty: false)
          reanalysed = analysed.each_with_object({}) do |(id, results), out|
            out[id] = yield(self[id], results)
          end

          keep_empty ? reanalysed : clean_empty(reanalysed)
        end

        # Launches a reanalysis on `analysed` based on `options`.
        # @param analysed [Hash] where the _keys_ are the people `id`s and the _values_ the
        #   `Eco::Data::FuzzyMatch::Results`
        # @param options [Hash] forwarded to `relevant_results` of each results object.
        # @return [Hash] the rearranged results (entries left empty are removed).
        def rearrange(analysed, **options)
          with_analysed(analysed) do |_person, results|
            results.relevant_results(**options)
          end
        end

        # Reanalyses by using a block to treat the needle and item values.
        # @param analysed [Hash] where the _keys_ are the people `id`s and the _values_ the
        #   `Eco::Data::FuzzyMatch::Results`
        # @param msg [String] the label used when printing the progress.
        # @param options [Hash] accepted for interface compatibility; it is not forwarded
        #   to `recalculate_results`.
        # @return [Hash] the recalculated results (entries left empty are removed).
        def reanalyse(analysed, msg: "Reanalysing", **options, &block)
          total = analysed.count
          i     = 1

          with_analysed(analysed) do |_person, results|
            print_progress(msg, total, i)
            i += 1
            recalculate_results(results, &block)
          end
        end

        # Reanalyses by ignoring matching words between the `needle` and those found in `results`.
        # @param analysed [Hash] where the _keys_ are the people `id`s and the _values_ the
        #   `Eco::Data::FuzzyMatch::Results`
        # @return [Hash] the recalculated results (entries left empty are removed).
        def ignore_matching_words(analysed, **options)
          prompt = "Reanalysing by ignoring matching words"
          reanalyse(analysed, msg: prompt, **options) do |needle_str, item_str, _needle, _item|
            self.class.remove_matching_words(needle_str, item_str)
          end
        end

        # Reanalyses by ignoring matching words between the `needle` and those found in `results`,
        # by means of `ignore_same_words_score`.
        # NOTE(review): the name suggests this is superseded by `ignore_matching_words` - confirm.
        # @param analysed [Hash] where the _keys_ are the people `id`s and the _values_ the
        #   `Eco::Data::FuzzyMatch::Results`
        # @param options [Hash] forwarded to `ignore_same_words_score`; `:read` defaults to `attribute`.
        # @return [Hash] the recalculated results (entries left empty are removed).
        def ignore_matching_words_old(analysed, **options)
          options = { read: attribute }.merge(options)
          total   = analysed.count
          i       = 1

          with_analysed(analysed) do |_person, results|
            print_progress("Reanalysing by ignoring matching words", total, i)
            i += 1
            ignore_same_words_score(results, **options)
          end
        end

        # @!endgroup

        # @!group Reporting Helpers

        # Builds a textual report of the `analysed` results.
        # @param analysed [Hash] where the _keys_ are the people `id`s and the _values_ the
        #   `Eco::Data::FuzzyMatch::Results`
        # @param format [Symbol] only `:txt` is supported.
        # @return [String, nil] well structured text; `nil` for an unsupported `format`.
        def report(analysed, format: :txt)
          case format
          when :txt
            analysed.map do |id, results|
              msg = results.results.map(&:print).join("\n ")
              "#{self[id].identify}:\n #{msg}\n"
            end.join
          end
        end

        # Prints the report of an analysis.
        # @note
        #   1. Unless `:analysed` is provided, it launches an analysis cutting with Jaro Winkler min 0.5
        #   2. It then re-sorts and cuts based on `options`
        # @param options [Hash] `:analysed` may carry already computed results; otherwise the
        #   options are forwarded to `analyse`.
        # @return [Hash] where the _keys_ are the people `id`s and the _values_ the
        #   `Eco::Data::FuzzyMatch::Results`
        def print_analysis(**options)
          # NOTE(review): `results_with_false_positives` is not defined in this class;
          # presumably it comes from the parent class or the FuzzyMatch mixin - confirm.
          analysed = options[:analysed] || results_with_false_positives.analyse(**options)
          # Print the whole report once (not once per analysed entry).
          puts report(analysed) unless analysed.empty?
          analysed
        end

        # @!endgroup

        protected

        # Drops the `@fuzzy_match` instance variable, when set, before delegating to the parent.
        # NOTE(review): presumably `@fuzzy_match` is a matcher cached by the FuzzyMatch mixin
        # that becomes stale when the collection changes - confirm.
        def on_change
          # `remove_instance_variable` expects the variable *name* and raises when it is not defined.
          remove_instance_variable(:@fuzzy_match) if instance_variable_defined?(:@fuzzy_match)
          super
        end

        private

        # Prints a one-line progress indicator that overwrites itself (ends in `\r`).
        # Nothing is printed when `total` is 10 or less.
        # @param msg [String] the label of the progress line.
        # @param total [Integer] the total number of items to process.
        # @param num [Integer] the 1-based position of the current item.
        def print_progress(msg, total, num)
          return unless total > 10

          puts "" unless num > 1
          @print_msg_len ||= 0
          percent = (100 * num.to_f / total).round(1)
          msg     = " #{msg}: #{percent}% (#{num} of #{total})\r"
          # Remember the longest line printed so it can be fully blanked at the end.
          @print_msg_len = msg.length unless @print_msg_len > msg.length
          print msg
          $stdout.flush

          return unless percent > 99.9

          # Finished: leave the final figure visible for a moment, then clear the line.
          sleep(0.2)
          print "#{" " * @print_msg_len}\r"
          $stdout.flush
        end
      end
    end
  end
end