lib/eco/api/organization/people_similarity.rb in eco-helpers-2.0.19 vs lib/eco/api/organization/people_similarity.rb in eco-helpers-2.0.21
- old
+ new
@@ -11,13 +11,25 @@
attr_accessor :attribute
# @!group Config
# @return [String, Proc, nil] the target attribute to be read.
def attribute=(attr)
  # Store exactly what the caller passed. The superseded implementation
  # (`@attribute ||= "name"`) ignored `attr`, so the target attribute could
  # never be changed. The default value is supplied by the reader instead.
  @attribute = attr
end
def attribute
  # Falls back to (and memoizes) `:name` when nothing truthy has been configured.
  return @attribute if @attribute
  @attribute = :name
end
+
+ # Returns the target value to analyse
+ # @param person [Ecoportal::API::V1::Person]
# @return [Object, nil] the value read from `person`: the result of calling
#   `attribute` when it is a `Proc`, otherwise the result of the method named
#   by `attribute`; `nil` when `person` does not respond to that method.
def item_value(person)
  reader = attribute
  # A Proc receives the person itself and decides what to read.
  return reader.call(person) if reader.is_a?(Proc)

  reader = reader.to_sym
  # `respond_to?` only reports public methods, so dispatch publicly as well.
  person.public_send(reader) if person.respond_to?(reader)
end
+
# Defines the order of relevance of the per-user matches
# @param values [Array<Symbol>] the algorithms whose results the matches should be ordered by
# * Possible values: `:dice`, `:levenshtein`, `:jaro_winkler`, `:ngrams`, `:words_ngrams`, `:chars_position`
def order=(values)
@order = values
@@ -35,10 +47,20 @@
def threshold
  # Memoizes the default of 0.15 when no truthy threshold has been set.
  return @threshold if @threshold
  @threshold = 0.15
end
+ # Generates a new object with same config but different base `data`.
+ # @return [Eco::API::Organization::PeopleSimilarity]
def newFrom(data)
  # Let the parent build the new object, then copy this instance's
  # configuration (threshold, order and attribute) onto it.
  copy = super(data)
  copy.threshold = threshold
  copy.order     = order
  copy.attribute = attribute
  copy
end
+
# @!endgroup
# @!group Searchers
# It gathers those that have the same `email`
@@ -48,40 +70,162 @@
@by_email.select do |email, people|
people.count > 1
end
end
+ # It returns all people with no name
+ # @return [Eco::API::Organization::PeopleSimilarity]
def unnamed
  # A name counts as missing when, once stripped, it has fewer than 2 characters.
  nameless = select { |person| person.name.to_s.strip.length < 2 }
  newFrom(nameless)
end
+
+ # It returns all people that do have a name
+ # @return [Eco::API::Organization::PeopleSimilarity]
def named
  # Discards people whose stripped name has fewer than 2 characters,
  # keeping only those that do have a name.
  with_name = reject { |person| person.name.to_s.strip.length < 2 }
  newFrom(with_name)
end
+
+ # It returns all the entries with `attribute` empty
+ # @return [Eco::API::Organization::PeopleSimilarity]
def blank_attribute
  # The target value counts as blank when, once stripped, it has fewer than 2 characters.
  blanks = select { |person| item_value(person).to_s.strip.length < 2 }
  newFrom(blanks)
end
+
+ # It returns all the entries with `attribute` **not** empty
+ # @return [Eco::API::Organization::PeopleSimilarity]
def attribute_present
  # Discards entries whose stripped target value has fewer than 2 characters.
  filled = reject { |person| item_value(person).to_s.strip.length < 2 }
  newFrom(filled)
end
+
# @!endgroup
- # @!group Analysers
+ # @!group Analysis starters
# Analyses People based on `options`
+ # @param needle_read [Proc, Symbol] when the value to read from `needle` object is different to the `:read` (`attribute`).
+ # This allows you, for example, to facet `needle.name` (needle_read) against `haystack_item.details[alt_id]` (read).
+ # @param keep_empty [Boolean] to indicate if it should get rid of people with no results (based on threshold)
# @return [Hash] where the _keys_ are the people `id`s and the _values_ the `Eco::Data::FuzzyMatch::Results`
def analyse(needle_read: nil, keep_empty: false, **options)
  # `read` defaults to the configured attribute unless the caller overrides it.
  options = { read: attribute }.merge(options)
  total   = count
  step    = 0

  analysed = each_with_object({}) do |person, results|
    step += 1
    # Only compute a separate needle string when a specific reader was given.
    needle_str = item_string(person, needle_read) if needle_read
    results[person.id] = find_all_with_score(person, needle_str: needle_str, **options)
    print_progress("Analysed", total, step)
  end

  # Unless asked otherwise, drop the people that ended up with no results.
  keep_empty ? analysed : clean_empty(analysed)
end
+ # @!endgroup
+
+ # @!group Results Treatment
+
+ # Gets a new instance object of this class, with only people in results
+ # @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results`
+ # @return [Eco::API::Organization::PeopleSimilarity]
def newSimilarity(analysed)
  # Collect everyone mentioned in the results, then wrap them in a new
  # object that shares this instance's configuration.
  involved = people_in_results(analysed)
  newFrom(involved)
end
+
# Lists, without repetitions, every person involved in `analysed`: the person
# behind each key, followed by the `match` of each of its results.
# @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the results
# @return [Array] the people found, in order of first appearance
def people_in_results(analysed)
  analysed.each_with_object([]) do |(id, results), people|
    # The analysed person goes first, followed by whatever each result matched.
    group = results.each_with_object([self[id]]) do |result, members|
      members << result.match
    end
    group.each do |person|
      people << person unless people.include?(person)
    end
  end
end
+
+ # Removes from results those that do not have similar entries
# @param analysed [Hash] where the _keys_ are the people `id`s and _values_ their results
# @return [Hash] a new hash that only keeps the entries whose results are not empty
def clean_empty(analysed)
  analysed.reject { |_id, results| results.empty? }
end
+
+ # Helper to do some treatment of the results
+ # @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results`
+ # @return [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results`
# @param keep_empty [Boolean] when `false`, entries whose new results are empty are dropped
# @yield [person, results] called once per entry of `analysed`
# @yieldparam person the element found under the entry's `id` (`self[id]`)
# @yieldparam results the results currently stored for that `id`
# @yieldreturn the new results to store for that `id`
def with_analysed(analysed, keep_empty: false)
  reanalysed = analysed.each_with_object({}) do |(id, results), out|
    out[id] = yield(self[id], results)
  end
  # The former trailing `tap` only built a string and discarded it, so it was dropped.
  keep_empty ? reanalysed : clean_empty(reanalysed)
end
+
# Launches a reanalysis on `analysed` based on `options`
# @param analysed [Hash] where the _keys_ are the people `id`s and the _values_ the `Eco::Data::FuzzyMatch::Results`
# @return the outcome of `with_analysed`, where each entry holds what
#   `relevant_results(**options)` returned for its results
def rearrange(analysed, **options)
  with_analysed(analysed) { |_person, results| results.relevant_results(**options) }
end
- # @!group Helpers
+ # Reanalyses by using a block to treat the needle and item values
# @param analysed [Hash] where the _keys_ are the people `id`s and _values_ their results
# @param msg [String] the text shown by the progress indicator
# @param options [Hash] accepted for interface compatibility; nothing in this method reads it
# @yield forwarded as-is to `recalculate_results`
# @return the outcome of `with_analysed` over the recalculated results
def reanalyse(analysed, msg: "Reanalysing", **options, &block)
  # The former `options = { read: attribute }.merge(options)` was never read
  # afterwards, so that dead assignment has been removed.
  total = analysed.count
  step  = 0
  with_analysed(analysed) do |_person, results|
    step += 1
    print_progress(msg, total, step)
    recalculate_results(results, &block)
  end
end
+ # Reanalyses by ignoring matching words between the `needle` and those found in `results`
# @return the outcome of `reanalyse`
def ignore_matching_words(analysed, **options)
  # The needle and item objects are received but unused: only their strings are compared.
  reanalyse(analysed, msg: "Reanalysing by ignoring matching words", **options) do |needle_str, item_str, _needle, _item|
    self.class.remove_matching_words(needle_str, item_str)
  end
end
+
+ # Reanalyses by ignoring matching words between the `needle` and those found in `results`
# @return the outcome of `with_analysed`
def ignore_matching_words_old(analysed, **options)
  # `read` defaults to the configured attribute unless the caller overrides it.
  options = { read: attribute }.merge(options)
  total   = analysed.count
  step    = 0
  with_analysed(analysed) do |_person, results|
    step += 1
    print_progress("Reanalysing by ignoring matching words", total, step)
    ignore_same_words_score(results, **options)
  end
end
+
+ # @!endgroup
+
+ # @!group Reporting Helpers
+
# @return [String] well structured text
# @param analysed [Hash] where the _keys_ are the people `id`s and _values_ their results
# @param format [Symbol] only `:txt` is handled
# @return [String, nil] one section per entry, or `nil` for any other `format`
def report(analysed, format: :txt)
  return unless format == :txt

  analysed.map do |id, results|
    lines = results.results.map { |entry| entry.print }.join("\n ")
    "#{self[id].identify}:\n " + lines + "\n"
  end.join
end
# @note
@@ -89,11 +233,11 @@
# 2. It then re-sorts and cuts based on `options`
# @return [Hash] where the _keys_ are the people `id`s and the _values_ the `Eco::Data::FuzzyMatch::Results`
# @param options [Hash] `:analysed` supplies ready-made results; otherwise
#   everything is forwarded to `analyse` on `results_with_false_positives`
# @return [Hash] the analysed results that were printed
def print_analysis(**options)
  analysed = options[:analysed] || results_with_false_positives.analyse(**options)
  # Print the report once. It used to be printed once per entry, and the
  # method returned an always-empty hash instead of the results.
  puts report(analysed) unless analysed.empty?
  analysed
end
# @!endgroup
protected
@@ -102,9 +246,25 @@
remove_instance_variable(@fuzzy_match)
super
end
private
+
# Writes an in-place progress line to standard output.
# Nothing is shown unless `total` is greater than 10.
# @param msg [String] the label of the progress line
# @param total [Integer] the number of steps
# @param num [Integer] the current step
def print_progress(msg, total, num)
  return if total <= 10

  # Start on a fresh line the first time around.
  puts "" if num <= 1
  @print_msg_len ||= 0
  percent = (100 * num.to_f / total).round(1)
  line = " #{msg}: #{percent}% (#{num} of #{total})\r"
  # Remember the longest line written so that it can be fully blanked at the end.
  @print_msg_len = [@print_msg_len, line.length].max
  print line
  $stdout.flush
  return unless percent > 99.9

  # Done: pause briefly, then overwrite the progress line with spaces.
  sleep(0.2)
  print "#{" " * @print_msg_len}\r"
  $stdout.flush
end
end
end
end