lib/disco/recommender.rb in disco-0.2.5 vs lib/disco/recommender.rb in disco-0.2.6
- old
+ new
@@ -15,35 +15,48 @@
train_set = to_dataset(train_set)
validation_set = to_dataset(validation_set) if validation_set
check_training_set(train_set)
+ # TODO add an option to the initializer to avoid this pass
+ # could also just check the first few values,
+ # but that may be confusing if they are all missing and later ones aren't
@implicit = !train_set.any? { |v| v[:rating] }
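The check above classifies the entire training set: a rating on any record makes the recommender explicit. With hypothetical data:

explicit = [{user_id: 1, item_id: 10, rating: 5}, {user_id: 2, item_id: 10}]
implicit = [{user_id: 1, item_id: 10}, {user_id: 2, item_id: 20}]
!explicit.any? { |v| v[:rating] } # => false (explicit feedback)
!implicit.any? { |v| v[:rating] } # => true  (implicit feedback)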
+
+ # TODO improve performance
+ # (catch an exception instead of checking ahead of time)
unless @implicit
check_ratings(train_set)
- @min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] }
if validation_set
check_ratings(validation_set)
end
end
- update_maps(train_set)
-
@rated = Hash.new { |hash, key| hash[key] = {} }
input = []
value_key = @implicit ? :value : :rating
train_set.each do |v|
- u = @user_map[v[:user_id]]
- i = @item_map[v[:item_id]]
+ # update maps and build the matrix in a single pass
+ u = (@user_map[v[:user_id]] ||= @user_map.size)
+ i = (@item_map[v[:item_id]] ||= @item_map.size)
@rated[u][i] = true
# explicit feedback always has a rating due to check_ratings
input << [u, i, v[value_key] || 1]
end
@rated.default = nil
+ # much more efficient than checking every value in another pass
+ raise ArgumentError, "Missing user_id" if @user_map.key?(nil)
+ raise ArgumentError, "Missing item_id" if @item_map.key?(nil)
+
+ # TODO improve performance
+ unless @implicit
+ @min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] }
+ end
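minmax_by walks the ratings once and returns the minimum and maximum records, which are then mapped down to the bounds used elsewhere (e.g. to clamp predictions). For example:

train_set = [{rating: 3}, {rating: 1}, {rating: 5}]
train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] } # => [1, 5]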
+
if @top_items
@item_count = [0] * @item_map.size
@item_sum = [0.0] * @item_map.size
train_set.each do |v|
i = @item_map[v[:item_id]]
@@ -76,10 +89,13 @@
@global_mean = model.bias
@user_factors = model.p_factors(format: :numo)
@item_factors = model.q_factors(format: :numo)
+ @normalized_user_factors = nil
+ @normalized_item_factors = nil
+
@user_recs_index = nil
@similar_users_index = nil
@similar_items_index = nil
end
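Because the normalized factors are now memoized (see below), fit must reset them along with the index caches, or similarity queries after a refit would serve stale data. The memoize-and-invalidate pattern in miniature, as a hypothetical class rather than disco's API:

class LazyNorms
  def fit(factors)
    @factors = factors
    @normalized = nil # invalidate the cache on every refit
  end

  def normalized
    @normalized ||= @factors.map { |f| f.abs } # stand-in for the real normalization
  end
end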
@@ -147,17 +163,17 @@
end
end
def similar_items(item_id, count: 5)
check_fit
- similar(item_id, @item_map, item_norms, count, @similar_items_index)
+ similar(item_id, @item_map, normalized_item_factors, count, @similar_items_index)
end
alias_method :item_recs, :similar_items
def similar_users(user_id, count: 5)
check_fit
- similar(user_id, @user_map, user_norms, count, @similar_users_index)
+ similar(user_id, @user_map, normalized_user_factors, count, @similar_users_index)
end
def top_items(count: 5)
check_fit
raise "top_items not computed" unless @top_items
@@ -210,19 +226,23 @@
@user_recs_index = create_index(item_factors, library: "faiss")
end
def optimize_similar_items(library: nil)
check_fit
- @similar_items_index = create_index(item_norms, library: library)
+ @similar_items_index = create_index(normalized_item_factors, library: library)
end
alias_method :optimize_item_recs, :optimize_similar_items
def optimize_similar_users(library: nil)
check_fit
- @similar_users_index = create_index(user_norms, library: library)
+ @similar_users_index = create_index(normalized_user_factors, library: library)
end
+ def inspect
+ to_s # for now
+ end
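Delegating to to_s keeps console output short: Object#to_s prints only the class name and object id, while the default inspect would dump every instance variable, including the full factor matrices. Roughly (the exact output is an assumption):

recommender = Disco::Recommender.new
recommender.inspect # => "#<Disco::Recommender:0x0000...>" rather than pages of factors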
+
private
# factors should already be normalized for similar users/items
def create_index(factors, library:)
# TODO make Faiss the default in 0.3.0
@@ -249,30 +269,30 @@
# could speed up search with normalized cosine
# https://github.com/yahoojapan/NGT/issues/36
index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
- # NGT normalizes so could call create_index with factors instead of norms
+ # NGT normalizes internally, so create_index could be called without normalized factors
# but keep code simple for now
ids = index.batch_insert(factors)
raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0]
index
else
raise ArgumentError, "Invalid library: #{library}"
end
end
- def user_norms
- @user_norms ||= norms(@user_factors)
+ def normalized_user_factors
+ @normalized_user_factors ||= normalize(@user_factors)
end
- def item_norms
- @item_norms ||= norms(@item_factors)
+ def normalized_item_factors
+ @normalized_item_factors ||= normalize(@item_factors)
end
- def norms(factors)
+ def normalize(factors)
norms = Numo::SFloat::Math.sqrt((factors * factors).sum(axis: 1))
norms[norms.eq(0)] = 1e-10 # no zeros
factors / norms.expand_dims(1)
end
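A quick check of the math above, assuming numo-narray: squaring and summing along axis 1 gives each row's squared L2 norm, and dividing by the column-expanded norms leaves unit-length rows, so a plain dot product between rows is cosine similarity.

require "numo/narray"

factors = Numo::SFloat[[3, 4], [1, 0]]
norms = Numo::SFloat::Math.sqrt((factors * factors).sum(axis: 1)) # => [5, 1]
normalized = factors / norms.expand_dims(1)                       # => [[0.6, 0.8], [1, 0]]
(normalized[0, true] * normalized[1, true]).sum                   # => 0.6, the cosine of the rows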
@@ -301,34 +321,30 @@
keys = map.keys
# TODO use user_id for similar_users in 0.3.0
key = :item_id
- (1...ids.size).map do |i|
- {key => keys[ids[i]], score: predictions[i]}
+ result = []
+ # items can have the same score,
+ # so the original item may not be at index 0
+ ids.each_with_index do |id, j|
+ next if id == i
+
+ result << {key => keys[id], score: predictions[j]}
end
+ result
else
[]
end
end
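The old code sliced off index 0 with (1...ids.size), assuming the query item always came back first in its own result list; an item with identical factors ties at the same score and may be returned ahead of it, so the loop now drops the query item by id wherever it appears. A self-contained sketch of that edge case with hypothetical ids:

ids = [4, 2, 8]               # search result; the query item's mapped index is 2
predictions = [1.0, 1.0, 0.9] # item 4 ties with the query item itself
i = 2
result = []
ids.each_with_index do |id, j|
  next if id == i # skip the query item regardless of its position
  result << {item_id: id, score: predictions[j]}
end
result # => [{item_id: 4, score: 1.0}, {item_id: 8, score: 0.9}]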
- def update_maps(train_set)
- raise ArgumentError, "Missing user_id" if train_set.any? { |v| v[:user_id].nil? }
- raise ArgumentError, "Missing item_id" if train_set.any? { |v| v[:item_id].nil? }
-
- train_set.each do |v|
- @user_map[v[:user_id]] ||= @user_map.size
- @item_map[v[:item_id]] ||= @item_map.size
- end
- end
-
def check_ratings(ratings)
unless ratings.all? { |r| !r[:rating].nil? }
- raise ArgumentError, "Missing ratings"
+ raise ArgumentError, "Missing rating"
end
unless ratings.all? { |r| r[:rating].is_a?(Numeric) }
- raise ArgumentError, "Ratings must be numeric"
+ raise ArgumentError, "Rating must be numeric"
end
end
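The messages are singular now, presumably because a single bad record is enough to trigger them. Called directly (check_ratings is private, so this is only for illustration):

check_ratings([{rating: 5}, {rating: nil}]) # raises ArgumentError, "Missing rating"
check_ratings([{rating: 5}, {rating: "4"}]) # raises ArgumentError, "Rating must be numeric"
check_ratings([{rating: 5}, {rating: 4.5}]) # passes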
def check_training_set(train_set)
raise ArgumentError, "No training data" if train_set.empty?
@@ -363,11 +379,14 @@
user_map: @user_map,
item_map: @item_map,
rated: @rated,
global_mean: @global_mean,
user_factors: @user_factors,
- item_factors: @item_factors
+ item_factors: @item_factors,
+ factors: @factors,
+ epochs: @epochs,
+ verbose: @verbose
}
unless @implicit
obj[:min_rating] = @min_rating
obj[:max_rating] = @max_rating
@@ -387,9 +406,12 @@
@item_map = obj[:item_map]
@rated = obj[:rated]
@global_mean = obj[:global_mean]
@user_factors = obj[:user_factors]
@item_factors = obj[:item_factors]
+ @factors = obj[:factors]
+ @epochs = obj[:epochs]
+ @verbose = obj[:verbose]
unless @implicit
@min_rating = obj[:min_rating]
@max_rating = obj[:max_rating]
end
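Persisting factors, epochs, and verbose alongside the learned state means a marshaled model keeps its hyperparameters, so refitting after a reload behaves like the original. Round-trip as in the disco README (the tiny dataset is just for illustration):

recommender = Disco::Recommender.new(factors: 8)
recommender.fit([{user_id: 1, item_id: 10}, {user_id: 2, item_id: 20}])
File.binwrite("recommender.bin", Marshal.dump(recommender))

restored = Marshal.load(File.binread("recommender.bin"))
restored.fit([{user_id: 1, item_id: 10}, {user_id: 2, item_id: 20}]) # still uses factors: 8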