lib/disco/recommender.rb: disco-0.2.4 vs disco-0.2.5
- removed in 0.2.5
+ added in 0.2.5
@@ -1,15 +1,16 @@
module Disco
class Recommender
attr_reader :global_mean
- def initialize(factors: 8, epochs: 20, verbose: nil)
+ def initialize(factors: 8, epochs: 20, verbose: nil, top_items: false)
@factors = factors
@epochs = epochs
@verbose = verbose
@user_map = {}
@item_map = {}
+ @top_items = top_items
end
def fit(train_set, validation_set: nil)
train_set = to_dataset(train_set)
validation_set = to_dataset(validation_set) if validation_set
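
The new top_items option opts in to tracking per-item rating counts and sums during fit (added in the next hunk), which powers the top_items method later in this diff. A minimal usage sketch with illustrative data; for implicit feedback, items are ranked by interaction count:

    require "disco"

    recommender = Disco::Recommender.new(top_items: true)
    recommender.fit([
      {user_id: 1, item_id: "a"},
      {user_id: 1, item_id: "b"},
      {user_id: 2, item_id: "a"}
    ])

    recommender.top_items(count: 2)
    # => [{item_id: "a", score: 2}, {item_id: "b", score: 1}]
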
@@ -39,10 +40,20 @@
# explicit will always have a value due to check_ratings
input << [u, i, v[value_key] || 1]
end
@rated.default = nil
+ if @top_items
+ @item_count = [0] * @item_map.size
+ @item_sum = [0.0] * @item_map.size
+ train_set.each do |v|
+ i = @item_map[v[:item_id]]
+ @item_count[i] += 1
+ @item_sum[i] += (v[value_key] || 1)
+ end
+ end
+
eval_set = nil
if validation_set
eval_set = []
validation_set.each do |v|
u = @user_map[v[:user_id]]
@@ -65,12 +76,13 @@
@global_mean = model.bias
@user_factors = model.p_factors(format: :numo)
@item_factors = model.q_factors(format: :numo)
- @user_index = nil
- @item_index = nil
+ @user_recs_index = nil
+ @similar_users_index = nil
+ @similar_items_index = nil
end
# generates a prediction even if a user has already rated the item
def predict(data)
data = to_dataset(data)
@@ -93,65 +105,80 @@
def user_recs(user_id, count: 5, item_ids: nil)
check_fit
u = @user_map[user_id]
if u
- predictions = @item_factors.inner(@user_factors[u, true])
+ rated = item_ids ? {} : @rated[u]
- predictions =
- @item_map.keys.zip(predictions).map do |item_id, pred|
- {item_id: item_id, score: pred}
- end
-
if item_ids
- idx = item_ids.map { |i| @item_map[i] }.compact
- predictions = predictions.values_at(*idx)
+ ids = Numo::NArray.cast(item_ids.map { |i| @item_map[i] }.compact)
+ return [] if ids.size == 0
+
+ predictions = @item_factors[ids, true].inner(@user_factors[u, true])
+ indexes = predictions.sort_index.reverse
+ indexes = indexes[0...[count + rated.size, indexes.size].min] if count
+ predictions = predictions[indexes]
+ ids = ids[indexes]
+ elsif @user_recs_index && count
+ predictions, ids = @user_recs_index.search(@user_factors[u, true].expand_dims(0), count + rated.size).map { |v| v[0, true] }
else
- @rated[u].keys.sort_by { |v| -v }.each do |i|
- predictions.delete_at(i)
- end
+ predictions = @item_factors.inner(@user_factors[u, true])
+ # TODO make sure reverse isn't hurting performance
+ indexes = predictions.sort_index.reverse
+ indexes = indexes[0...[count + rated.size, indexes.size].min] if count
+ predictions = predictions[indexes]
+ ids = indexes
end
- predictions.sort_by! { |pred| -pred[:score] } # already sorted by id
- predictions = predictions.first(count) if count && !item_ids
+ predictions.inplace.clip(@min_rating, @max_rating) if @min_rating
- # clamp *after* sorting
- # also, only needed for returned predictions
- if @min_rating
- predictions.each do |pred|
- pred[:score] = pred[:score].clamp(@min_rating, @max_rating)
- end
- end
+ keys = @item_map.keys
+ result = []
+ ids.each_with_index do |item_id, i|
+ next if rated[item_id]
- predictions
+ result << {item_id: keys[item_id], score: predictions[i]}
+ break if result.size == count
+ end
+ result
+ elsif @top_items
+ top_items(count: count)
else
- # no items if user is unknown
- # TODO maybe most popular items
[]
end
end
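
The rewritten user_recs over-fetches count + rated.size candidates so that dropping already-rated items still leaves count results, and it now falls back to top_items for unknown users. A usage sketch (ids and scores illustrative):

    # items the user has already rated are filtered out
    recommender.user_recs(1, count: 3)
    # => [{item_id: "c", score: 1.92}, ...]

    # scoring restricted to a candidate set; note rated items are kept here
    recommender.user_recs(1, count: 2, item_ids: ["a", "b", "c"])

    # unknown users return top items when fit with top_items: true, else []
    recommender.user_recs("missing", count: 3)
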
- def optimize_similar_items
+ def similar_items(item_id, count: 5)
check_fit
- @item_index = create_index(@item_factors)
+ similar(item_id, @item_map, item_norms, count, @similar_items_index)
end
- alias_method :optimize_item_recs, :optimize_similar_items
+ alias_method :item_recs, :similar_items
- def optimize_similar_users
+ def similar_users(user_id, count: 5)
check_fit
- @user_index = create_index(@user_factors)
+ similar(user_id, @user_map, user_norms, count, @similar_users_index)
end
- def similar_items(item_id, count: 5)
+ def top_items(count: 5)
check_fit
- similar(item_id, @item_map, @item_factors, @item_index ? nil : item_norms, count, @item_index)
- end
- alias_method :item_recs, :similar_items
+ raise "top_items not computed" unless @top_items
- def similar_users(user_id, count: 5)
- check_fit
- similar(user_id, @user_map, @user_factors, @user_index ? nil : user_norms, count, @user_index)
+ if @implicit
+ scores = @item_count
+ else
+ require "wilson_score"
+
+ range = @min_rating..@max_rating
+ scores = @item_sum.zip(@item_count).map { |s, c| WilsonScore.rating_lower_bound(s / c, c, range) }
+ end
+
+ scores = scores.map.with_index.sort_by { |s, _| -s }
+ scores = scores.first(count) if count
+ item_ids = item_ids()
+ scores.map do |s, i|
+ {item_id: item_ids[i], score: s}
+ end
end
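
For explicit feedback, ranking by the Wilson score lower bound keeps items with few ratings from dominating on a high average alone. A sketch using the same wilson_score call as above (exact values omitted):

    require "wilson_score"

    # a perfect average from 2 ratings gets a much lower bound than
    # a slightly worse average from 200 ratings
    WilsonScore.rating_lower_bound(5.0, 2, 1..5)   # pulled well below 5.0
    WilsonScore.rating_lower_bound(4.5, 200, 1..5) # close to 4.5
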
def user_ids
@user_map.keys
end
@@ -176,21 +203,65 @@
else
@item_factors
end
end
+ def optimize_user_recs
+ check_fit
+ @user_recs_index = create_index(item_factors, library: "faiss")
+ end
+
+ def optimize_similar_items(library: nil)
+ check_fit
+ @similar_items_index = create_index(item_norms, library: library)
+ end
+ alias_method :optimize_item_recs, :optimize_similar_items
+
+ def optimize_similar_users(library: nil)
+ check_fit
+ @similar_users_index = create_index(user_norms, library: library)
+ end
+
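
Usage sketch for the reworked optimize methods: optimize_user_recs is Faiss-only, while the similarity indexes accept either library (or pick one automatically when none is given):

    # optional: trade index build time for faster queries
    recommender.optimize_user_recs                      # requires the faiss gem
    recommender.optimize_similar_items(library: "ngt")  # or "faiss"
    recommender.optimize_similar_users

    recommender.similar_items("a", count: 5)
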
private
- def create_index(factors)
- require "ngt"
+ # factors should already be normalized for similar users/items
+ def create_index(factors, library:)
+ # TODO make Faiss the default in 0.3.0
+ library ||= defined?(Faiss) && !defined?(Ngt) ? "faiss" : "ngt"
- # could speed up search with normalized cosine
- # https://github.com/yahoojapan/NGT/issues/36
- index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
- ids = index.batch_insert(factors)
- raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0]
- index
+ case library
+ when "faiss"
+ require "faiss"
+
+ # inner product is cosine similarity with normalized vectors
+ # https://github.com/facebookresearch/faiss/issues/95
+ #
+ # TODO use non-exact index
+ # https://github.com/facebookresearch/faiss/wiki/Faiss-indexes
+ index = Faiss::IndexFlatIP.new(factors.shape[1])
+
+ # ids are from 0...total
+ # https://github.com/facebookresearch/faiss/blob/96b740abedffc8f67389f29c2a180913941534c6/faiss/Index.h#L89
+ index.add(factors)
+
+ index
+ when "ngt"
+ require "ngt"
+
+ # could speed up search with normalized cosine
+ # https://github.com/yahoojapan/NGT/issues/36
+ index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
+
+ # NGT normalizes so could call create_index with factors instead of norms
+ # but keep code simple for now
+ ids = index.batch_insert(factors)
+ raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0]
+
+ index
+ else
+ raise ArgumentError, "Invalid library: #{library}"
+ end
end
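
The two libraries return search results in different shapes, which is why the lookup code below has to branch. A standalone sketch (random vectors, both gems assumed installed):

    require "faiss"
    require "ngt"
    require "numo/narray"

    vectors = Numo::SFloat.new(10, 3).rand
    query = vectors[0, true]

    # Faiss: batch API; returns a [distances, ids] pair of 2-D arrays,
    # one row per query vector, ids starting at 0
    faiss_index = Faiss::IndexFlatIP.new(3)
    faiss_index.add(vectors)
    distances, ids = faiss_index.search(query.expand_dims(0), 5)

    # NGT: single-vector API; returns hashes with 1-based ids and cosine
    # distances (similarity = 1 - distance)
    ngt_index = Ngt::Index.new(3, distance_type: "Cosine")
    ngt_index.batch_insert(vectors)
    ngt_index.search(query, size: 5)
    # e.g. [{id: 1, distance: 0.0}, {id: 7, distance: 0.13}, ...]
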
def user_norms
@user_norms ||= norms(@user_factors)
end
@@ -200,44 +271,42 @@
end
def norms(factors)
norms = Numo::SFloat::Math.sqrt((factors * factors).sum(axis: 1))
norms[norms.eq(0)] = 1e-10 # no zeros
- norms
+ factors / norms.expand_dims(1)
end
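
Behavior change worth noting: despite the name, norms now returns the factor matrix with each row scaled to unit length, rather than the norms themselves, so downstream inner products are cosine similarities. The same computation in isolation:

    require "numo/narray"

    factors = Numo::SFloat[[3, 4], [0, 0]]
    norms = Numo::SFloat::Math.sqrt((factors * factors).sum(axis: 1))
    norms[norms.eq(0)] = 1e-10 # avoid dividing a zero row by zero
    factors / norms.expand_dims(1)
    # => [[0.6, 0.8], [0.0, 0.0]]
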
- def similar(id, map, factors, norms, count, index)
+ def similar(id, map, norm_factors, count, index)
i = map[id]
- if i
+
+ if i && norm_factors.shape[0] > 1
if index && count
- keys = map.keys
- result = index.search(factors[i, true], size: count + 1)[1..-1]
- result.map do |v|
- {
- # ids from batch_insert start at 1 instead of 0
- item_id: keys[v[:id] - 1],
- # convert cosine distance to cosine similarity
- score: 1 - v[:distance]
- }
+ if defined?(Faiss) && index.is_a?(Faiss::Index)
+ predictions, ids = index.search(norm_factors[i, true].expand_dims(0), count + 1).map { |v| v.to_a[0] }
+ else
+ result = index.search(norm_factors[i, true], size: count + 1)
+ # ids from batch_insert start at 1 instead of 0
+ ids = result.map { |v| v[:id] - 1 }
+ # convert cosine distance to cosine similarity
+ predictions = result.map { |v| 1 - v[:distance] }
end
else
- # cosine similarity without norms[i]
- # otherwise, denominator would be (norms[i] * norms)
- predictions = factors.inner(factors[i, true]) / norms
+ predictions = norm_factors.inner(norm_factors[i, true])
+ indexes = predictions.sort_index.reverse
+ indexes = indexes[0...[count + 1, indexes.size].min] if count
+ predictions = predictions[indexes]
+ ids = indexes
+ end
- predictions =
- map.keys.zip(predictions).map do |item_id, pred|
- {item_id: item_id, score: pred}
- end
+ keys = map.keys
- predictions.delete_at(i)
- predictions.sort_by! { |pred| -pred[:score] } # already sorted by id
- predictions = predictions.first(count) if count
- # divide by norms[i] to get cosine similarity
- # only need to do for returned records
- predictions.each { |pred| pred[:score] /= norms[i] }
- predictions
+ # TODO use user_id for similar_users in 0.3.0
+ key = :item_id
+
+ (1...ids.size).map do |i|
+ {key => keys[ids[i]], score: predictions[i]}
end
else
[]
end
end
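
Result sketch for the public wrappers built on similar (ids and scores illustrative; per the TODO above, similar_users also returns the :item_id key until 0.3.0):

    recommender.similar_items("a", count: 3)
    # => [{item_id: "b", score: 0.97}, ...]

    recommender.similar_users(1, count: 3)
    # => [{item_id: 2, score: 0.99}, ...]  # key stays :item_id for now
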
@@ -302,10 +371,15 @@
unless @implicit
obj[:min_rating] = @min_rating
obj[:max_rating] = @max_rating
end
+ if @top_items
+ obj[:item_count] = @item_count
+ obj[:item_sum] = @item_sum
+ end
+
obj
end
def marshal_load(obj)
@implicit = obj[:implicit]
@@ -317,9 +391,15 @@
@item_factors = obj[:item_factors]
unless @implicit
@min_rating = obj[:min_rating]
@max_rating = obj[:max_rating]
+ end
+
+ @top_items = obj.key?(:item_count)
+ if @top_items
+ @item_count = obj[:item_count]
+ @item_sum = obj[:item_sum]
end
end
end
end
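
Because the new counters are included in marshal_dump and restored in marshal_load, a saved model keeps top_items support across a round trip (path illustrative):

    File.binwrite("recommender.bin", Marshal.dump(recommender))

    loaded = Marshal.load(File.binread("recommender.bin"))
    loaded.top_items(count: 10) # available if fit with top_items: true
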