lib/disco/recommender.rb in disco-0.2.3 vs lib/disco/recommender.rb in disco-0.2.4
- old
+ new
@@ -1,34 +1,34 @@
module Disco
class Recommender
- attr_reader :global_mean, :item_factors, :user_factors
+ attr_reader :global_mean
# Stores the factors, epochs and verbose settings on the instance.
# The added lines start @user_map and @item_map as empty hashes; update_maps
# only adds missing keys to them (`||=`).
# NOTE(review): because the maps are created here rather than in fit, ids
# from an earlier fit call would still be present on a second fit unless
# something outside this view clears them - confirm that is intended.
def initialize(factors: 8, epochs: 20, verbose: nil)
@factors = factors
@epochs = epochs
@verbose = verbose
+ @user_map = {}
+ @item_map = {}
end
def fit(train_set, validation_set: nil)
train_set = to_dataset(train_set)
validation_set = to_dataset(validation_set) if validation_set
- @implicit = !train_set.any? { |v| v[:rating] }
+ check_training_set(train_set)
+ @implicit = !train_set.any? { |v| v[:rating] }
unless @implicit
- ratings = train_set.map { |o| o[:rating] }
- check_ratings(ratings)
- @min_rating = ratings.min
- @max_rating = ratings.max
+ check_ratings(train_set)
+ @min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] }
if validation_set
- check_ratings(validation_set.map { |o| o[:rating] })
+ check_ratings(validation_set)
end
end
- check_training_set(train_set)
- create_maps(train_set)
+ update_maps(train_set)
@rated = Hash.new { |hash, key| hash[key] = {} }
input = []
value_key = @implicit ? :value : :rating
train_set.each do |v|
@@ -141,26 +141,55 @@
@user_index = create_index(@user_factors)
end
# Calls check_fit, then delegates to `similar` with the item map, the item
# factors and @item_index. `count` (default 5) is forwarded unchanged.
# Also exposed as item_recs through the alias_method line below.
# The new version passes nil in place of item_norms when @item_index is set,
# so item_norms is only evaluated when there is no index.
# NOTE(review): presumably the index branch of `similar` never reads norms -
# confirm against the full method.
def similar_items(item_id, count: 5)
check_fit
- similar(item_id, @item_map, @item_factors, item_norms, count, @item_index)
+ similar(item_id, @item_map, @item_factors, @item_index ? nil : item_norms, count, @item_index)
end
alias_method :item_recs, :similar_items
# Calls check_fit, then delegates to `similar` with the user map, the user
# factors and @user_index. `count` (default 5) is forwarded unchanged.
# The new version passes nil in place of user_norms when @user_index is set,
# so user_norms is only evaluated when there is no index.
# NOTE(review): presumably the index branch of `similar` never reads norms -
# confirm against the full method.
def similar_users(user_id, count: 5)
check_fit
- similar(user_id, @user_map, @user_factors, user_norms, count, @user_index)
+ similar(user_id, @user_map, @user_factors, @user_index ? nil : user_norms, count, @user_index)
end
# Added in the new version. Returns the keys of @user_map, i.e. the user ids
# recorded by update_maps, in the order they were first inserted.
+ def user_ids
+ @user_map.keys
+ end
+
# Added in the new version. Returns the keys of @item_map, i.e. the item ids
# recorded by update_maps, in the order they were first inserted.
+ def item_ids
+ @item_map.keys
+ end
+
# Added in the new version; takes over from the user_factors attr_reader that
# the same diff removes, so calling it with no argument still returns
# @user_factors as a whole.
# With a user_id, looks up its index in @user_map and returns
# @user_factors[u, true] for that index, or nil when the id is not in the map.
# `if u` only filters nil: index 0 is truthy in Ruby, so the first user works.
# NOTE(review): @user_factors is presumably a matrix with one row per user,
# making [u, true] that user's row - confirm against where it is assigned.
+ def user_factors(user_id = nil)
+ if user_id
+ u = @user_map[user_id]
+ @user_factors[u, true] if u
+ else
+ @user_factors
+ end
+ end
+
# Added in the new version; takes over from the item_factors attr_reader that
# the same diff removes, so calling it with no argument still returns
# @item_factors as a whole.
# With an item_id, looks up its index in @item_map and returns
# @item_factors[i, true] for that index, or nil when the id is not in the map.
# `if i` only filters nil: index 0 is truthy in Ruby, so the first item works.
# NOTE(review): @item_factors is presumably a matrix with one row per item,
# making [i, true] that item's row - confirm against where it is assigned.
+ def item_factors(item_id = nil)
+ if item_id
+ i = @item_map[item_id]
+ @item_factors[i, true] if i
+ else
+ @item_factors
+ end
+ end
+
private
# Builds and returns an Ngt::Index using "Cosine" distance, with dimension
# factors.shape[1], and inserts `factors` into it in one batch.
# "ngt" is required inside the method, so it is only loaded when an index is
# actually created.
# The new version keeps the ids returned by batch_insert and raises (a plain
# `raise` with a String, hence RuntimeError) unless the first id is 1 and the
# last equals factors.shape[0].
# NOTE(review): this presumably guards the assumption that NGT ids map 1..n
# onto the rows of `factors` in order - confirm against how `similar` turns
# search results back into ids.
def create_index(factors)
require "ngt"
+ # could speed up search with normalized cosine
+ # https://github.com/yahoojapan/NGT/issues/36
index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
- index.batch_insert(factors)
+ ids = index.batch_insert(factors)
+ raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0]
index
end
def user_norms
@user_norms ||= norms(@user_factors)
@@ -189,45 +218,46 @@
# convert cosine distance to cosine similarity
score: 1 - v[:distance]
}
end
else
- predictions = factors.dot(factors[i, true]) / norms
+ # cosine similarity without norms[i]
+ # otherwise, denominator would be (norms[i] * norms)
+ predictions = factors.inner(factors[i, true]) / norms
predictions =
map.keys.zip(predictions).map do |item_id, pred|
{item_id: item_id, score: pred}
end
- max_score = predictions.delete_at(i)[:score]
+ predictions.delete_at(i)
predictions.sort_by! { |pred| -pred[:score] } # already sorted by id
predictions = predictions.first(count) if count
- # divide by max score to get cosine similarity
+ # divide by norms[i] to get cosine similarity
# only need to do for returned records
- predictions.each { |pred| pred[:score] /= max_score }
+ predictions.each { |pred| pred[:score] /= norms[i] }
predictions
end
else
[]
end
end
# Maps each user_id / item_id in train_set to an integer index, stored in
# @user_map and @item_map.
# Old version (create_maps): collected the unique ids, sorted them, and
# replaced both maps wholesale, so indexes followed sorted id order.
# New version (update_maps): first raises ArgumentError ("Missing user_id" /
# "Missing item_id") if any row has a nil id, then walks the rows and gives
# each id not yet in the map the next index (the current map size). Existing
# entries are kept, so indexes follow first-seen order rather than sorted
# order. `||=` never overwrites index 0, because 0 is truthy in Ruby.
- def create_maps(train_set)
- user_ids = train_set.map { |v| v[:user_id] }.uniq.sort
- item_ids = train_set.map { |v| v[:item_id] }.uniq.sort
+ def update_maps(train_set)
+ raise ArgumentError, "Missing user_id" if train_set.any? { |v| v[:user_id].nil? }
+ raise ArgumentError, "Missing item_id" if train_set.any? { |v| v[:item_id].nil? }
- raise ArgumentError, "Missing user_id" if user_ids.any?(&:nil?)
- raise ArgumentError, "Missing item_id" if item_ids.any?(&:nil?)
-
- @user_map = user_ids.zip(user_ids.size.times).to_h
- @item_map = item_ids.zip(item_ids.size.times).to_h
+ train_set.each do |v|
+ @user_map[v[:user_id]] ||= @user_map.size
+ @item_map[v[:item_id]] ||= @item_map.size
+ end
end
# Validates ratings and raises ArgumentError on the first failed check:
# "Missing ratings" if any rating is nil, then "Ratings must be numeric" if
# any rating is not a Numeric.
# Old version: `ratings` was a collection of rating values.
# New version: `ratings` is a collection of rows and the value is read from
# r[:rating]; the parameter name was kept even though it now holds rows.
def check_ratings(ratings)
- unless ratings.all? { |r| !r.nil? }
+ unless ratings.all? { |r| !r[:rating].nil? }
raise ArgumentError, "Missing ratings"
end
- unless ratings.all? { |r| r.is_a?(Numeric) }
+ unless ratings.all? { |r| r[:rating].is_a?(Numeric) }
raise ArgumentError, "Ratings must be numeric"
end
end
def check_training_set(train_set)