lib/disco/recommender.rb in disco-0.2.3 vs lib/disco/recommender.rb in disco-0.2.4

- old
+ new

@@ -1,34 +1,34 @@ module Disco class Recommender - attr_reader :global_mean, :item_factors, :user_factors + attr_reader :global_mean def initialize(factors: 8, epochs: 20, verbose: nil) @factors = factors @epochs = epochs @verbose = verbose + @user_map = {} + @item_map = {} end def fit(train_set, validation_set: nil) train_set = to_dataset(train_set) validation_set = to_dataset(validation_set) if validation_set - @implicit = !train_set.any? { |v| v[:rating] } + check_training_set(train_set) + @implicit = !train_set.any? { |v| v[:rating] } unless @implicit - ratings = train_set.map { |o| o[:rating] } - check_ratings(ratings) - @min_rating = ratings.min - @max_rating = ratings.max + check_ratings(train_set) + @min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] } if validation_set - check_ratings(validation_set.map { |o| o[:rating] }) + check_ratings(validation_set) end end - check_training_set(train_set) - create_maps(train_set) + update_maps(train_set) @rated = Hash.new { |hash, key| hash[key] = {} } input = [] value_key = @implicit ? :value : :rating train_set.each do |v| @@ -141,26 +141,55 @@ @user_index = create_index(@user_factors) end def similar_items(item_id, count: 5) check_fit - similar(item_id, @item_map, @item_factors, item_norms, count, @item_index) + similar(item_id, @item_map, @item_factors, @item_index ? nil : item_norms, count, @item_index) end alias_method :item_recs, :similar_items def similar_users(user_id, count: 5) check_fit - similar(user_id, @user_map, @user_factors, user_norms, count, @user_index) + similar(user_id, @user_map, @user_factors, @user_index ? nil : user_norms, count, @user_index) end + def user_ids + @user_map.keys + end + + def item_ids + @item_map.keys + end + + def user_factors(user_id = nil) + if user_id + u = @user_map[user_id] + @user_factors[u, true] if u + else + @user_factors + end + end + + def item_factors(item_id = nil) + if item_id + i = @item_map[item_id] + @item_factors[i, true] if i + else + @item_factors + end + end + private def create_index(factors) require "ngt" + # could speed up search with normalized cosine + # https://github.com/yahoojapan/NGT/issues/36 index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine") - index.batch_insert(factors) + ids = index.batch_insert(factors) + raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0] index end def user_norms @user_norms ||= norms(@user_factors) @@ -189,45 +218,46 @@ # convert cosine distance to cosine similarity score: 1 - v[:distance] } end else - predictions = factors.dot(factors[i, true]) / norms + # cosine similarity without norms[i] + # otherwise, denominator would be (norms[i] * norms) + predictions = factors.inner(factors[i, true]) / norms predictions = map.keys.zip(predictions).map do |item_id, pred| {item_id: item_id, score: pred} end - max_score = predictions.delete_at(i)[:score] + predictions.delete_at(i) predictions.sort_by! { |pred| -pred[:score] } # already sorted by id predictions = predictions.first(count) if count - # divide by max score to get cosine similarity + # divide by norms[i] to get cosine similarity # only need to do for returned records - predictions.each { |pred| pred[:score] /= max_score } + predictions.each { |pred| pred[:score] /= norms[i] } predictions end else [] end end - def create_maps(train_set) - user_ids = train_set.map { |v| v[:user_id] }.uniq.sort - item_ids = train_set.map { |v| v[:item_id] }.uniq.sort + def update_maps(train_set) + raise ArgumentError, "Missing user_id" if train_set.any? { |v| v[:user_id].nil? } + raise ArgumentError, "Missing item_id" if train_set.any? { |v| v[:item_id].nil? } - raise ArgumentError, "Missing user_id" if user_ids.any?(&:nil?) - raise ArgumentError, "Missing item_id" if item_ids.any?(&:nil?) - - @user_map = user_ids.zip(user_ids.size.times).to_h - @item_map = item_ids.zip(item_ids.size.times).to_h + train_set.each do |v| + @user_map[v[:user_id]] ||= @user_map.size + @item_map[v[:item_id]] ||= @item_map.size + end end def check_ratings(ratings) - unless ratings.all? { |r| !r.nil? } + unless ratings.all? { |r| !r[:rating].nil? } raise ArgumentError, "Missing ratings" end - unless ratings.all? { |r| r.is_a?(Numeric) } + unless ratings.all? { |r| r[:rating].is_a?(Numeric) } raise ArgumentError, "Ratings must be numeric" end end def check_training_set(train_set)