lib/disco/recommender.rb in disco-0.2.5 vs lib/disco/recommender.rb in disco-0.2.6

- old
+ new

@@ -15,35 +15,48 @@
       train_set = to_dataset(train_set)
       validation_set = to_dataset(validation_set) if validation_set
 
       check_training_set(train_set)
 
+      # TODO option to set in initializer to avoid pass
+      # could also just check first few values
+      # but may be confusing if they are all missing and later ones aren't
       @implicit = !train_set.any? { |v| v[:rating] }
 
+      # TODO improve performance
+      # (catch exception instead of checking ahead of time)
       unless @implicit
         check_ratings(train_set)
-        @min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] }
 
         if validation_set
           check_ratings(validation_set)
         end
       end
 
-      update_maps(train_set)
-
       @rated = Hash.new { |hash, key| hash[key] = {} }
       input = []
       value_key = @implicit ? :value : :rating
       train_set.each do |v|
-        u = @user_map[v[:user_id]]
-        i = @item_map[v[:item_id]]
+        # update maps and build matrix in single pass
+        u = (@user_map[v[:user_id]] ||= @user_map.size)
+        i = (@item_map[v[:item_id]] ||= @item_map.size)
         @rated[u][i] = true
         # explicit will always have a value due to check_ratings
         input << [u, i, v[value_key] || 1]
       end
       @rated.default = nil
 
+      # much more efficient than checking every value in another pass
+      raise ArgumentError, "Missing user_id" if @user_map.key?(nil)
+      raise ArgumentError, "Missing item_id" if @item_map.key?(nil)
+
+      # TODO improve performance
+      unless @implicit
+        @min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] }
+      end
+
       if @top_items
         @item_count = [0] * @item_map.size
         @item_sum = [0.0] * @item_map.size
         train_set.each do |v|
           i = @item_map[v[:item_id]]
@@ -76,10 +89,13 @@
       @global_mean = model.bias
 
       @user_factors = model.p_factors(format: :numo)
       @item_factors = model.q_factors(format: :numo)
 
+      @normalized_user_factors = nil
+      @normalized_item_factors = nil
+
       @user_recs_index = nil
       @similar_users_index = nil
       @similar_items_index = nil
     end
 
@@ -147,17 +163,17 @@
       end
     end
 
     def similar_items(item_id, count: 5)
       check_fit
-      similar(item_id, @item_map, item_norms, count, @similar_items_index)
+      similar(item_id, @item_map, normalized_item_factors, count, @similar_items_index)
     end
     alias_method :item_recs, :similar_items
 
     def similar_users(user_id, count: 5)
       check_fit
-      similar(user_id, @user_map, user_norms, count, @similar_users_index)
+      similar(user_id, @user_map, normalized_user_factors, count, @similar_users_index)
     end
 
     def top_items(count: 5)
       check_fit
       raise "top_items not computed" unless @top_items
@@ -210,19 +226,23 @@
       @user_recs_index = create_index(item_factors, library: "faiss")
     end
 
     def optimize_similar_items(library: nil)
       check_fit
-      @similar_items_index = create_index(item_norms, library: library)
+      @similar_items_index = create_index(normalized_item_factors, library: library)
     end
     alias_method :optimize_item_recs, :optimize_similar_items
 
     def optimize_similar_users(library: nil)
       check_fit
-      @similar_users_index = create_index(user_norms, library: library)
+      @similar_users_index = create_index(normalized_user_factors, library: library)
     end
 
+    def inspect
+      to_s # for now
+    end
+
     private
 
     # factors should already be normalized for similar users/items
     def create_index(factors, library:)
       # TODO make Faiss the default in 0.3.0
@@ -249,30 +269,30 @@
         # could speed up search with normalized cosine
         # https://github.com/yahoojapan/NGT/issues/36
         index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
 
-        # NGT normalizes so could call create_index with factors instead of norms
+        # NGT normalizes so could call create_index without normalized factors
         # but keep code simple for now
         ids = index.batch_insert(factors)
         raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0]
         index
       else
         raise ArgumentError, "Invalid library: #{library}"
       end
     end
 
-    def user_norms
-      @user_norms ||= norms(@user_factors)
+    def normalized_user_factors
+      @normalized_user_factors ||= normalize(@user_factors)
     end
 
-    def item_norms
-      @item_norms ||= norms(@item_factors)
+    def normalized_item_factors
+      @normalized_item_factors ||= normalize(@item_factors)
     end
 
-    def norms(factors)
+    def normalize(factors)
       norms = Numo::SFloat::Math.sqrt((factors * factors).sum(axis: 1))
       norms[norms.eq(0)] = 1e-10 # no zeros
       factors / norms.expand_dims(1)
     end
 
@@ -301,34 +321,30 @@
         keys = map.keys
 
         # TODO use user_id for similar_users in 0.3.0
         key = :item_id
 
-        (1...ids.size).map do |i|
-          {key => keys[ids[i]], score: predictions[i]}
+        result = []
+        # items can have the same score
+        # so original item may not be at index 0
+        ids.each_with_index do |id, j|
+          next if id == i
+
+          result << {key => keys[id], score: predictions[j]}
         end
+        result
       else
         []
       end
     end
 
-    def update_maps(train_set)
-      raise ArgumentError, "Missing user_id" if train_set.any? { |v| v[:user_id].nil? }
-      raise ArgumentError, "Missing item_id" if train_set.any? { |v| v[:item_id].nil? }
-
-      train_set.each do |v|
-        @user_map[v[:user_id]] ||= @user_map.size
-        @item_map[v[:item_id]] ||= @item_map.size
-      end
-    end
-
     def check_ratings(ratings)
       unless ratings.all? { |r| !r[:rating].nil? }
-        raise ArgumentError, "Missing ratings"
+        raise ArgumentError, "Missing rating"
       end
       unless ratings.all? { |r| r[:rating].is_a?(Numeric) }
-        raise ArgumentError, "Ratings must be numeric"
+        raise ArgumentError, "Rating must be numeric"
       end
     end
 
     def check_training_set(train_set)
       raise ArgumentError, "No training data" if train_set.empty?
@@ -363,11 +379,14 @@
         user_map: @user_map,
         item_map: @item_map,
         rated: @rated,
         global_mean: @global_mean,
         user_factors: @user_factors,
-        item_factors: @item_factors
+        item_factors: @item_factors,
+        factors: @factors,
+        epochs: @epochs,
+        verbose: @verbose
       }
 
       unless @implicit
         obj[:min_rating] = @min_rating
         obj[:max_rating] = @max_rating
@@ -387,9 +406,12 @@
       @item_map = obj[:item_map]
       @rated = obj[:rated]
       @global_mean = obj[:global_mean]
       @user_factors = obj[:user_factors]
       @item_factors = obj[:item_factors]
+      @factors = obj[:factors]
+      @epochs = obj[:epochs]
+      @verbose = obj[:verbose]
 
       unless @implicit
         @min_rating = obj[:min_rating]
         @max_rating = obj[:max_rating]
       end
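
A few sketches of the behavior behind these changes follow; all sample data is made up. First, fit now assigns dense user and item indexes while building the training matrix, replacing the separate update_maps pass; the ||= idiom hands out the next index only the first time a key is seen:

  user_map = {}
  item_map = {}
  input = []

  train_set = [
    {user_id: "a", item_id: "x", rating: 5},
    {user_id: "b", item_id: "x", rating: 3},
    {user_id: "a", item_id: "y", rating: 4}
  ]

  train_set.each do |v|
    # ||= assigns the next dense index on first sight of a key
    u = (user_map[v[:user_id]] ||= user_map.size)
    i = (item_map[v[:item_id]] ||= item_map.size)
    input << [u, i, v[:rating]]
  end

  user_map # => {"a"=>0, "b"=>1}
  input    # => [[0, 0, 5], [1, 0, 3], [0, 1, 4]]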
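The new "Missing user_id" check works because a Ruby Hash stores nil as a key like any other: a row with a missing id flows through ||= into the map, so one key?(nil) lookup after the pass replaces two full scans of the training set. Roughly:

  map = {}
  rows = [{user_id: "a"}, {user_id: nil}]
  rows.each { |v| map[v[:user_id]] ||= map.size }
  map.key?(nil) # => true; one lookup instead of rows.any? { |v| v[:user_id].nil? }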
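The norms-to-normalize rename matches what the method actually returns: factor rows scaled to unit length, so inner products between rows are cosine similarities. A self-contained sketch with Numo; the 1e-10 guard mirrors the diff's zero-vector handling:

  require "numo/narray"

  def normalize(factors)
    norms = Numo::SFloat::Math.sqrt((factors * factors).sum(axis: 1))
    norms[norms.eq(0)] = 1e-10 # avoid dividing a zero-length row by zero
    factors / norms.expand_dims(1)
  end

  factors = Numo::SFloat[[3, 4], [1, 0]]
  n = normalize(factors)
  n[0, true].dot(n[1, true]) # => 0.6, the cosine of the two rows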
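The rewritten loop in similar exists because neighbors can tie on score, so the query item is not guaranteed to sit at position 0 of the nearest-neighbor results; filtering by index instead of dropping the first entry keeps ties intact. An illustration with hypothetical data:

  keys = ["x", "y", "z"]
  i = 0                        # query item index ("x")
  ids = [1, 0, 2]              # a tie pushed the query to position 1
  predictions = [1.0, 1.0, 0.5]

  result = []
  ids.each_with_index do |id, j|
    next if id == i # skip the query wherever it appears
    result << {item_id: keys[id], score: predictions[j]}
  end
  result # => [{item_id: "y", score: 1.0}, {item_id: "z", score: 0.5}]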
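The "Unexpected ids" guard in create_index relies on NGT's batch_insert returning 1-based, sequential ids, and the updated comment notes that the Cosine distance type would let NGT accept unnormalized factors; the code keeps passing normalized ones for simplicity. A usage sketch built from the calls shown in the diff (assumes the ngt gem; the search call is an assumption beyond what the diff shows):

  require "ngt"

  factors = [[0.6, 0.8], [1.0, 0.0]]
  index = Ngt::Index.new(2, distance_type: "Cosine")
  ids = index.batch_insert(factors)
  ids.first # => 1 (NGT ids start at 1, hence the guard)
  index.search(factors[0], size: 2)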
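Finally, marshal_dump and marshal_load now carry @factors, @epochs, and @verbose, so a recommender restored from disk keeps the hyperparameters it was trained with. A round-trip sketch using Disco's documented Marshal support (tiny dataset for illustration only):

  require "disco"

  recommender = Disco::Recommender.new(factors: 8)
  recommender.fit([
    {user_id: 1, item_id: "a", rating: 5},
    {user_id: 1, item_id: "b", rating: 3},
    {user_id: 2, item_id: "a", rating: 4}
  ])

  restored = Marshal.load(Marshal.dump(recommender))
  restored.similar_items("a") # works; in 0.2.6 factors/epochs/verbose survive too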