lib/eco/api/organization/people_similarity.rb in eco-helpers-2.0.19 vs lib/eco/api/organization/people_similarity.rb in eco-helpers-2.0.21

- old
+ new

@@ -11,13 +11,25 @@ attr_accessor :attribute # @!group Config # @return [String, Proc, nil] the target attribute to be read. def attribute=(attr) - @attribute ||= "name" + @attribute = attr end + def attribute + @attribute ||= :name + end + + # Returns the target value to analyse + # @param person [Ecoportal::API::V1::Person] + def item_value(person) + return attr.call(item) if attribute.is_a?(Proc) + attr = attribute.to_sym + return item.send(attr) if item.respond_to?(attr) + end + # Define the order or relevant of per user matches # @param values[Array<Symbol>] the algorithms' results it should be ordered by # * Possible values: `:dice`, `:levenshtein`, `:jaro_winkler`, `:ngrams`, `:words_ngrams`, `:chars_position` def order=(values) @order = values @@ -35,10 +47,20 @@ def threshold @threshold ||= 0.15 end + # Generates a new object with same config but different base `data`. + # @return [Eco::API::Organization::PeopleSimilarity] + def newFrom(data) + super(data).tap do |simil| + simil.threshold = threshold + simil.order = order + simil.attribute = attribute + end + end + # @!endgroup # @!group Searchers # It gathers those that have the same `email` @@ -48,40 +70,162 @@ @by_email.select do |email, people| people.count > 1 end end + # It returns all people with no name + # @return [Eco::API::Organization::PeopleSimilarity] + def unnamed + select do |person| + person.name.to_s.strip.length < 2 + end.yield_self do |results| + newFrom(results) + end + end + + # It returns all people with no name + # @return [Eco::API::Organization::PeopleSimilarity] + def named + reject do |person| + person.name.to_s.strip.length < 2 + end.yield_self do |results| + newFrom(results) + end + end + + # It returns all the entries with `attribute` empty + # @return [Eco::API::Organization::PeopleSimilarity] + def blank_attribute + select do |person| + item_value(person).to_s.strip.length < 2 + end.yield_self do |results| + newFrom(results) + end + end + + # It returns all the entries with 
`attribute` **not** empty + # @return [Eco::API::Organization::PeopleSimilarity] + def attribute_present + reject do |person| + item_value(person).to_s.strip.length < 2 + end.yield_self do |results| + newFrom(results) + end + end + # @!endgroup - # @!group Analysers + # @!group Analisys starters # Analyses People bases on `options` + # @param needle_read [Proc, Symbol] when the value to read from `needle` object is different to the `:read` (`attribute`). + # This allows to for example, facet `needle.name` (needle_read) against `haystack_item.details[alt_id]` (read). + # @param keep_empty [Boolean] to indicate if it should get rid of people with no results (based on threshold) # @return [Hash] where the _keys_ are the people `id`s and the _values_ the `Eco::Data::FuzzyMatch::Results` - def analyse(**options) + def analyse(needle_read: nil, keep_empty: false, **options) options = { read: self.attribute }.merge(options) + total = count; i = 1 each_with_object({}) do |person, results| - results[person.id] = find_all_with_score(person, **options) + needle_str = needle_read ? item_string(person, needle_read) : nil + results[person.id] = find_all_with_score(person, needle_str: needle_str, **options) + print_progress("Analysed", total, i) + i += 1 + end.yield_self do |analysed| + analysed = clean_empty(analysed) unless keep_empty + #puts "... 
#{analysed.count} results after cleaning empty" + analysed end end + # @!endgroup + + # @!group Results Treatment + + # Gets a new instance object of this class, with only people in results + # @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results` + # @return [Eco::API::Organization::PeopleSimilarity] + def newSimilarity(analysed) + newFrom(people_in_results(analysed)) + end + + def people_in_results(analysed) + analysed.each_with_object([]) do |(id, results), people| + related = results.each_with_object([self[id]]) do |result, related| + related << result.match + end + related.each {|person| people << person unless people.include?(person)} + end + end + + # Removes from results those that do not have similar entries + def clean_empty(analysed) + analysed.select do |id, results| + !results.empty? + end + end + + # Helper to do some treatment fo the results + # @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results` + # @return [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results` + def with_analysed(analysed, keep_empty: false) + analysed.each_with_object({}) do |(id, results), reanalysed| + reanalysed[id] = yield(self[id], results) + end.yield_self do |reanalysed| + reanalysed = clean_empty(reanalysed) unless keep_empty + reanalysed + end.tap {|out| "with_analysed... 
returns #{out.count} records"} + end + # Launches a reanalyis on `analysed` based on `options` # @param analysed [Hash] where the _keys_ are the people `id`s and the _values_ the `Eco::Data::FuzzyMatch::Results` - def re_analyse(analysed, **options) - analysed.each_with_object({}) do |(id, results), out| - out[id] = results.relevant_results(**options) + def rearrange(analysed, **options) + with_analysed(analysed) do |person, results| + results.relevant_results(**options) end end - # @!group Helpers + # Reanalyses by using a block to treat the needle and item values + def reanalyse(analysed, msg: "Reanalysing", **options, &block) + options = { read: self.attribute }.merge(options) + total = analysed.count; i = 1 + with_analysed(analysed) do |person, results| + print_progress(msg, total, i) + i += 1 + recalculate_results(results, &block) + end + end + # Renalyses by ignoring matching words between the `needle` and those found in `results` + def ignore_matching_words(analysed, **options) + prompt = "Reanalysing by ignoring matching words" + reanalyse(analysed, msg: prompt, **options) do |needle_str, item_str, needle, item| + self.class.remove_matching_words(needle_str, item_str) + end + end + + # Renalyses by ignoring matching words between the `needle` and those found in `results` + def ignore_matching_words_old(analysed, **options) + options = { read: self.attribute }.merge(options) + total = analysed.count; i = 1 + with_analysed(analysed) do |person, results| + print_progress("Reanalysing by ignoring matching words", total, i) + i += 1 + ignore_same_words_score(results, **options) + end + end + + # @!endgroup + + # @!group Reporting Helpers + # @return [String] well structured text - def analysis(analysed, format: :txt) + def report(analysed, format: :txt) case when format == :txt analysed.each_with_object("") do |(id, results), out| msg = results.results.map {|r| r.print}.join("\n ") - "'#{self[id].identify}':\n " + msg + out << "#{self[id].identify}:\n " + msg + 
"\n" end end end # @note @@ -89,11 +233,11 @@ # 2. It then re-sorts and cuts based on `options` # @return [Hash] where the _keys_ are the people `id`s and the _values_ the `Eco::Data::FuzzyMatch::Results` def print_analysis(**options) analysed = options[:analysed] || results_with_false_positives.analyse(**options) analysed.each_with_object({}) do |(id, results), out| - puts analysis(analysed) + puts report(analysed) end end # @!endgroup protected @@ -102,9 +246,25 @@ remove_instance_variable(@fuzzy_match) super end private + + def print_progress(msg, total, num) + return unless total > 10 + puts "" unless num > 1 + @print_msg_len ||= 0 + percent = (100 * num.to_f / total).round(1) + msg = " #{msg}: #{percent}% (#{num} of #{total})\r" + @print_msg_len = msg.length unless @print_msg_len > msg.length + print msg + $stdout.flush + if percent > 99.9 + sleep(0.2) + print "#{" " * @print_msg_len}\r" + $stdout.flush + end + end end end end