lib/disco/recommender.rb in disco-0.2.4 vs lib/disco/recommender.rb in disco-0.2.5

- old
+ new

@@ -1,15 +1,16 @@
 module Disco
   class Recommender
     attr_reader :global_mean

-    def initialize(factors: 8, epochs: 20, verbose: nil)
+    def initialize(factors: 8, epochs: 20, verbose: nil, top_items: false)
       @factors = factors
       @epochs = epochs
       @verbose = verbose
       @user_map = {}
       @item_map = {}
+      @top_items = top_items
     end

     def fit(train_set, validation_set: nil)
       train_set = to_dataset(train_set)
       validation_set = to_dataset(validation_set) if validation_set
@@ -39,10 +40,20 @@
         # explicit will always have a value due to check_ratings
         input << [u, i, v[value_key] || 1]
       end
       @rated.default = nil

+      if @top_items
+        @item_count = [0] * @item_map.size
+        @item_sum = [0.0] * @item_map.size
+        train_set.each do |v|
+          i = @item_map[v[:item_id]]
+          @item_count[i] += 1
+          @item_sum[i] += (v[value_key] || 1)
+        end
+      end
+
       eval_set = nil
       if validation_set
         eval_set = []
         validation_set.each do |v|
           u = @user_map[v[:user_id]]
@@ -65,12 +76,13 @@
       @global_mean = model.bias

       @user_factors = model.p_factors(format: :numo)
       @item_factors = model.q_factors(format: :numo)

-      @user_index = nil
-      @item_index = nil
+      @user_recs_index = nil
+      @similar_users_index = nil
+      @similar_items_index = nil
     end

     # generates a prediction even if a user has already rated the item
     def predict(data)
       data = to_dataset(data)
@@ -93,65 +105,80 @@
     def user_recs(user_id, count: 5, item_ids: nil)
       check_fit
       u = @user_map[user_id]

       if u
-        predictions = @item_factors.inner(@user_factors[u, true])
+        rated = item_ids ? {} : @rated[u]

-        predictions =
-          @item_map.keys.zip(predictions).map do |item_id, pred|
-            {item_id: item_id, score: pred}
-          end
-
         if item_ids
-          idx = item_ids.map { |i| @item_map[i] }.compact
-          predictions = predictions.values_at(*idx)
+          ids = Numo::NArray.cast(item_ids.map { |i| @item_map[i] }.compact)
+          return [] if ids.size == 0
+
+          predictions = @item_factors[ids, true].inner(@user_factors[u, true])
+          indexes = predictions.sort_index.reverse
+          indexes = indexes[0...[count + rated.size, indexes.size].min] if count
+          predictions = predictions[indexes]
+          ids = ids[indexes]
+        elsif @user_recs_index && count
+          predictions, ids = @user_recs_index.search(@user_factors[u, true].expand_dims(0), count + rated.size).map { |v| v[0, true] }
         else
-          @rated[u].keys.sort_by { |v| -v }.each do |i|
-            predictions.delete_at(i)
-          end
+          predictions = @item_factors.inner(@user_factors[u, true])
+          # TODO make sure reverse isn't hurting performance
+          indexes = predictions.sort_index.reverse
+          indexes = indexes[0...[count + rated.size, indexes.size].min] if count
+          predictions = predictions[indexes]
+          ids = indexes
         end

-        predictions.sort_by! { |pred| -pred[:score] } # already sorted by id
-        predictions = predictions.first(count) if count && !item_ids
+        predictions.inplace.clip(@min_rating, @max_rating) if @min_rating

-        # clamp *after* sorting
-        # also, only needed for returned predictions
-        if @min_rating
-          predictions.each do |pred|
-            pred[:score] = pred[:score].clamp(@min_rating, @max_rating)
-          end
-        end
+        keys = @item_map.keys
+        result = []
+        ids.each_with_index do |item_id, i|
+          next if rated[item_id]

-        predictions
+          result << {item_id: keys[item_id], score: predictions[i]}
+          break if result.size == count
+        end
+        result
+      elsif @top_items
+        top_items(count: count)
       else
-        # no items if user is unknown
-        # TODO maybe most popular items
         []
       end
     end

-    def optimize_similar_items
+    def similar_items(item_id, count: 5)
       check_fit
-      @item_index = create_index(@item_factors)
+      similar(item_id, @item_map, item_norms, count, @similar_items_index)
     end
-    alias_method :optimize_item_recs, :optimize_similar_items
+    alias_method :item_recs, :similar_items

-    def optimize_similar_users
+    def similar_users(user_id, count: 5)
       check_fit
-      @user_index = create_index(@user_factors)
+      similar(user_id, @user_map, user_norms, count, @similar_users_index)
     end

-    def similar_items(item_id, count: 5)
+    def top_items(count: 5)
       check_fit
-      similar(item_id, @item_map, @item_factors, @item_index ? nil : item_norms, count, @item_index)
-    end
-    alias_method :item_recs, :similar_items
+      raise "top_items not computed" unless @top_items

-    def similar_users(user_id, count: 5)
-      check_fit
-      similar(user_id, @user_map, @user_factors, @user_index ? nil : user_norms, count, @user_index)
+      if @implicit
+        scores = @item_count
+      else
+        require "wilson_score"
+
+        range = @min_rating..@max_rating
+        scores = @item_sum.zip(@item_count).map { |s, c| WilsonScore.rating_lower_bound(s / c, c, range) }
+      end
+
+      scores = scores.map.with_index.sort_by { |s, _| -s }
+      scores = scores.first(count) if count
+      item_ids = item_ids()
+      scores.map do |s, i|
+        {item_id: item_ids[i], score: s}
+      end
     end

     def user_ids
       @user_map.keys
     end
@@ -176,21 +203,65 @@
       else
         @item_factors
       end
     end

+    def optimize_user_recs
+      check_fit
+      @user_recs_index = create_index(item_factors, library: "faiss")
+    end
+
+    def optimize_similar_items(library: nil)
+      check_fit
+      @similar_items_index = create_index(item_norms, library: library)
+    end
+    alias_method :optimize_item_recs, :optimize_similar_items
+
+    def optimize_similar_users(library: nil)
+      check_fit
+      @similar_users_index = create_index(user_norms, library: library)
+    end
+
     private

-    def create_index(factors)
-      require "ngt"
+    # factors should already be normalized for similar users/items
+    def create_index(factors, library:)
+      # TODO make Faiss the default in 0.3.0
+      library ||= defined?(Faiss) && !defined?(Ngt) ? "faiss" : "ngt"

-      # could speed up search with normalized cosine
-      # https://github.com/yahoojapan/NGT/issues/36
-      index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
-      ids = index.batch_insert(factors)
-      raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0]
-      index
+      case library
+      when "faiss"
+        require "faiss"
+
+        # inner product is cosine similarity with normalized vectors
+        # https://github.com/facebookresearch/faiss/issues/95
+        #
+        # TODO use non-exact index
+        # https://github.com/facebookresearch/faiss/wiki/Faiss-indexes
+        index = Faiss::IndexFlatIP.new(factors.shape[1])
+
+        # ids are from 0...total
+        # https://github.com/facebookresearch/faiss/blob/96b740abedffc8f67389f29c2a180913941534c6/faiss/Index.h#L89
+        index.add(factors)
+
+        index
+      when "ngt"
+        require "ngt"
+
+        # could speed up search with normalized cosine
+        # https://github.com/yahoojapan/NGT/issues/36
+        index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
+
+        # NGT normalizes so could call create_index with factors instead of norms
+        # but keep code simple for now
+        ids = index.batch_insert(factors)
+        raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0]
+
+        index
+      else
+        raise ArgumentError, "Invalid library: #{library}"
+      end
     end

     def user_norms
       @user_norms ||= norms(@user_factors)
     end
@@ -200,44 +271,42 @@
     end

     def norms(factors)
       norms = Numo::SFloat::Math.sqrt((factors * factors).sum(axis: 1))
       norms[norms.eq(0)] = 1e-10 # no zeros
-      norms
+      factors / norms.expand_dims(1)
     end

-    def similar(id, map, factors, norms, count, index)
+    def similar(id, map, norm_factors, count, index)
       i = map[id]
-      if i
+
+      if i && norm_factors.shape[0] > 1
         if index && count
-          keys = map.keys
-          result = index.search(factors[i, true], size: count + 1)[1..-1]
-          result.map do |v|
-            {
-              # ids from batch_insert start at 1 instead of 0
-              item_id: keys[v[:id] - 1],
-              # convert cosine distance to cosine similarity
-              score: 1 - v[:distance]
-            }
+          if defined?(Faiss) && index.is_a?(Faiss::Index)
+            predictions, ids = index.search(norm_factors[i, true].expand_dims(0), count + 1).map { |v| v.to_a[0] }
+          else
+            result = index.search(norm_factors[i, true], size: count + 1)
+            # ids from batch_insert start at 1 instead of 0
+            ids = result.map { |v| v[:id] - 1 }
+            # convert cosine distance to cosine similarity
+            predictions = result.map { |v| 1 - v[:distance] }
           end
         else
-          # cosine similarity without norms[i]
-          # otherwise, denominator would be (norms[i] * norms)
-          predictions = factors.inner(factors[i, true]) / norms
+          predictions = norm_factors.inner(norm_factors[i, true])
+          indexes = predictions.sort_index.reverse
+          indexes = indexes[0...[count + 1, indexes.size].min] if count
+          predictions = predictions[indexes]
+          ids = indexes
+        end

-          predictions =
-            map.keys.zip(predictions).map do |item_id, pred|
-              {item_id: item_id, score: pred}
-            end
+        keys = map.keys

-          predictions.delete_at(i)
-          predictions.sort_by! { |pred| -pred[:score] } # already sorted by id
-          predictions = predictions.first(count) if count
-
-          # divide by norms[i] to get cosine similarity
-          # only need to do for returned records
-          predictions.each { |pred| pred[:score] /= norms[i] }
-          predictions
+        # TODO use user_id for similar_users in 0.3.0
+        key = :item_id
+
+        (1...ids.size).map do |i|
+          {key => keys[ids[i]], score: predictions[i]}
         end
       else
         []
       end
     end
@@ -302,10 +371,15 @@
       unless @implicit
         obj[:min_rating] = @min_rating
         obj[:max_rating] = @max_rating
       end

+      if @top_items
+        obj[:item_count] = @item_count
+        obj[:item_sum] = @item_sum
+      end
+
       obj
     end

     def marshal_load(obj)
       @implicit = obj[:implicit]
@@ -317,9 +391,15 @@
       @item_factors = obj[:item_factors]

       unless @implicit
         @min_rating = obj[:min_rating]
         @max_rating = obj[:max_rating]
+      end
+
+      @top_items = obj.key?(:item_count)
+      if @top_items
+        @item_count = obj[:item_count]
+        @item_sum = obj[:item_sum]
       end
     end
   end
 end
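
A few usage notes follow. The headline change is the new top_items: option on the constructor: when it is true, fit additionally tallies a per-item rating count (@item_count) and rating sum (@item_sum), which back the new public top_items method and a fallback in user_recs for unknown users (replacing the old "TODO maybe most popular items"). A minimal sketch of the new API; the dataset and results are illustrative, and the explicit-feedback path also needs the wilson_score gem:

    require "disco"

    recommender = Disco::Recommender.new(factors: 8, top_items: true)
    recommender.fit([
      {user_id: 1, item_id: "a", rating: 5},
      {user_id: 1, item_id: "b", rating: 3},
      {user_id: 2, item_id: "a", rating: 4}
    ])

    # most popular items; raises "top_items not computed" if the
    # recommender was built without top_items: true
    recommender.top_items(count: 2)

    # unknown users used to get []; they now fall back to top_items
    recommender.user_recs(999)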
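
user_recs itself was rewritten around Numo arrays. Instead of materializing an {item_id:, score:} hash for every item and calling delete_at on rated rows, it now ranks scores with sort_index, over-fetches count + rated.size candidates (rated items may still occupy the top slots), and skips rated items while assembling at most count results; scores are clipped to the rating range in place afterwards. Two behaviors worth noting, shown with the illustrative ids from the sketch above:

    # candidate scoring: ids unknown to the model are dropped; because
    # rated is {} on this path, already-rated items are NOT filtered out
    recommender.user_recs(1, count: 2, item_ids: ["a", "b", "unknown"])

    # default path: rated items are filtered and, for explicit feedback,
    # scores are clamped via predictions.inplace.clip(@min_rating, @max_rating)
    recommender.user_recs(1, count: 2)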
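
Index handling also changed shape: the single @user_index/@item_index pair becomes three independent indexes, and Faiss joins NGT as a supported backend. optimize_user_recs always builds an exact Faiss inner-product index (Faiss::IndexFlatIP) over the raw item factors, and user_recs consults it only when a count is given. The two similarity optimizers index the normalized factors and take a library: option; when it is nil, create_index picks "faiss" only if Faiss is already loaded and NGT is not, otherwise "ngt" (the 0.2.4 behavior). A sketch, assuming the faiss and ngt gems are installed:

    require "faiss" # load first if you want the nil default to resolve to Faiss

    recommender.optimize_user_recs                        # Faiss only
    recommender.optimize_similar_items(library: "faiss")
    recommender.optimize_similar_users(library: "ngt")

    recommender.similar_items("a", count: 5) # served from @similar_items_index
    recommender.user_recs(1, count: 5)       # served from @user_recs_index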
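
A subtle internal change makes this work: norms no longer returns the vector of L2 norms but the factor matrix divided by it, so user_norms/item_norms now cache unit-length rows. Cosine similarity between unit vectors is a plain inner product, which is what both the rewritten similar method and the Faiss IndexFlatIP similarity indexes rely on. A quick check of the identity with Numo (values are just an example):

    require "numo/narray"

    factors = Numo::SFloat.cast([[3.0, 4.0], [1.0, 0.0]])

    norms = Numo::SFloat::Math.sqrt((factors * factors).sum(axis: 1))
    norms[norms.eq(0)] = 1e-10 # avoid dividing by zero
    normalized = factors / norms.expand_dims(1)

    # cosine similarity of row 0 against every row, as an inner product
    normalized.inner(normalized[0, true]) # => Numo::SFloat[1, 0.6]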
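
For explicit feedback, top_items deliberately avoids ranking by raw average rating: it ranks by WilsonScore.rating_lower_bound(average, count, range), a confidence lower bound that shrinks toward the bottom of the rating range when an item has few ratings, so a single 5-star rating does not outrank thousands of 4.5-star ratings. Illustrative calls (exact return values omitted):

    require "wilson_score"

    range = 1..5
    WilsonScore.rating_lower_bound(5.0, 1, range)    # far below 5.0
    WilsonScore.rating_lower_bound(4.5, 1000, range) # close to 4.5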
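
Finally, serialization keeps pace with the new state: marshal_dump includes :item_count and :item_sum when top_items is enabled, and marshal_load infers @top_items from the presence of :item_count, so dumps from 0.2.4 still load cleanly and newer ones keep serving top_items after a round-trip:

    restored = Marshal.load(Marshal.dump(recommender))
    restored.top_items(count: 3) # still works; popularity stats survived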