lib/disco/recommender.rb in disco-0.2.4 vs lib/disco/recommender.rb in disco-0.2.5

- old
+ new

@@ -1,15 +1,16 @@
 module Disco
   class Recommender
     attr_reader :global_mean

-    def initialize(factors: 8, epochs: 20, verbose: nil)
+    def initialize(factors: 8, epochs: 20, verbose: nil, top_items: false)
       @factors = factors
       @epochs = epochs
       @verbose = verbose
       @user_map = {}
       @item_map = {}
+      @top_items = top_items
     end

     def fit(train_set, validation_set: nil)
       train_set = to_dataset(train_set)
       validation_set = to_dataset(validation_set) if validation_set
@@ -39,10 +40,20 @@
         # explicit will always have a value due to check_ratings
         input << [u, i, v[value_key] || 1]
       end
       @rated.default = nil

+      if @top_items
+        @item_count = [0] * @item_map.size
+        @item_sum = [0.0] * @item_map.size
+        train_set.each do |v|
+          i = @item_map[v[:item_id]]
+          @item_count[i] += 1
+          @item_sum[i] += (v[value_key] || 1)
+        end
+      end
+
       eval_set = nil
       if validation_set
         eval_set = []
         validation_set.each do |v|
           u = @user_map[v[:user_id]]
@@ -65,12 +76,13 @@
       @global_mean = model.bias

       @user_factors = model.p_factors(format: :numo)
       @item_factors = model.q_factors(format: :numo)

-      @user_index = nil
-      @item_index = nil
+      @user_recs_index = nil
+      @similar_users_index = nil
+      @similar_items_index = nil
     end

     # generates a prediction even if a user has already rated the item
     def predict(data)
       data = to_dataset(data)
@@ -93,65 +105,80 @@
     def user_recs(user_id, count: 5, item_ids: nil)
       check_fit
       u = @user_map[user_id]

       if u
-        predictions = @item_factors.inner(@user_factors[u, true])
+        rated = item_ids ? {} : @rated[u]

-        predictions =
-          @item_map.keys.zip(predictions).map do |item_id, pred|
-            {item_id: item_id, score: pred}
-          end
-
         if item_ids
-          idx = item_ids.map { |i| @item_map[i] }.compact
-          predictions = predictions.values_at(*idx)
+          ids = Numo::NArray.cast(item_ids.map { |i| @item_map[i] }.compact)
+          return [] if ids.size == 0
+
+          predictions = @item_factors[ids, true].inner(@user_factors[u, true])
+          indexes = predictions.sort_index.reverse
+          indexes = indexes[0...[count + rated.size, indexes.size].min] if count
+          predictions = predictions[indexes]
+          ids = ids[indexes]
+        elsif @user_recs_index && count
+          predictions, ids = @user_recs_index.search(@user_factors[u, true].expand_dims(0), count + rated.size).map { |v| v[0, true] }
         else
-          @rated[u].keys.sort_by { |v| -v }.each do |i|
-            predictions.delete_at(i)
-          end
+          predictions = @item_factors.inner(@user_factors[u, true])
+          # TODO make sure reverse isn't hurting performance
+          indexes = predictions.sort_index.reverse
+          indexes = indexes[0...[count + rated.size, indexes.size].min] if count
+          predictions = predictions[indexes]
+          ids = indexes
         end

-        predictions.sort_by! { |pred| -pred[:score] } # already sorted by id
-        predictions = predictions.first(count) if count && !item_ids
+        predictions.inplace.clip(@min_rating, @max_rating) if @min_rating

-        # clamp *after* sorting
-        # also, only needed for returned predictions
-        if @min_rating
-          predictions.each do |pred|
-            pred[:score] = pred[:score].clamp(@min_rating, @max_rating)
-          end
-        end
+        keys = @item_map.keys
+        result = []
+        ids.each_with_index do |item_id, i|
+          next if rated[item_id]

-        predictions
+          result << {item_id: keys[item_id], score: predictions[i]}
+          break if result.size == count
+        end
+        result
+      elsif @top_items
+        top_items(count: count)
       else
-        # no items if user is unknown
-        # TODO maybe most popular items
         []
       end
     end

-    def optimize_similar_items
+    def similar_items(item_id, count: 5)
       check_fit
-      @item_index = create_index(@item_factors)
+      similar(item_id, @item_map, item_norms, count, @similar_items_index)
     end
-    alias_method :optimize_item_recs, :optimize_similar_items
+    alias_method :item_recs, :similar_items

-    def optimize_similar_users
+    def similar_users(user_id, count: 5)
       check_fit
-      @user_index = create_index(@user_factors)
+      similar(user_id, @user_map, user_norms, count, @similar_users_index)
     end

-    def similar_items(item_id, count: 5)
+    def top_items(count: 5)
       check_fit
-      similar(item_id, @item_map, @item_factors, @item_index ? nil : item_norms, count, @item_index)
-    end
-    alias_method :item_recs, :similar_items
+      raise "top_items not computed" unless @top_items

-    def similar_users(user_id, count: 5)
-      check_fit
-      similar(user_id, @user_map, @user_factors, @user_index ? nil : user_norms, count, @user_index)
+      if @implicit
+        scores = @item_count
+      else
+        require "wilson_score"
+
+        range = @min_rating..@max_rating
+        scores = @item_sum.zip(@item_count).map { |s, c| WilsonScore.rating_lower_bound(s / c, c, range) }
+      end
+
+      scores = scores.map.with_index.sort_by { |s, _| -s }
+      scores = scores.first(count) if count
+      item_ids = item_ids()
+      scores.map do |s, i|
+        {item_id: item_ids[i], score: s}
+      end
     end

     def user_ids
       @user_map.keys
     end
@@ -176,21 +203,65 @@
       else
         @item_factors
       end
     end

+    def optimize_user_recs
+      check_fit
+      @user_recs_index = create_index(item_factors, library: "faiss")
+    end
+
+    def optimize_similar_items(library: nil)
+      check_fit
+      @similar_items_index = create_index(item_norms, library: library)
+    end
+    alias_method :optimize_item_recs, :optimize_similar_items
+
+    def optimize_similar_users(library: nil)
+      check_fit
+      @similar_users_index = create_index(user_norms, library: library)
+    end
+
     private

-    def create_index(factors)
-      require "ngt"
+    # factors should already be normalized for similar users/items
+    def create_index(factors, library:)
+      # TODO make Faiss the default in 0.3.0
+      library ||= defined?(Faiss) && !defined?(Ngt) ? "faiss" : "ngt"

-      # could speed up search with normalized cosine
-      # https://github.com/yahoojapan/NGT/issues/36
-      index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
-      ids = index.batch_insert(factors)
-      raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0]
-      index
+      case library
+      when "faiss"
+        require "faiss"
+
+        # inner product is cosine similarity with normalized vectors
+        # https://github.com/facebookresearch/faiss/issues/95
+        #
+        # TODO use non-exact index
+        # https://github.com/facebookresearch/faiss/wiki/Faiss-indexes
+        index = Faiss::IndexFlatIP.new(factors.shape[1])
+
+        # ids are from 0...total
+        # https://github.com/facebookresearch/faiss/blob/96b740abedffc8f67389f29c2a180913941534c6/faiss/Index.h#L89
+        index.add(factors)
+
+        index
+      when "ngt"
+        require "ngt"
+
+        # could speed up search with normalized cosine
+        # https://github.com/yahoojapan/NGT/issues/36
+        index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
+
+        # NGT normalizes so could call create_index with factors instead of norms
+        # but keep code simple for now
+        ids = index.batch_insert(factors)
+        raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0]
+
+        index
+      else
+        raise ArgumentError, "Invalid library: #{library}"
+      end
     end

     def user_norms
       @user_norms ||= norms(@user_factors)
     end
@@ -200,44 +271,42 @@
     end

     def norms(factors)
       norms = Numo::SFloat::Math.sqrt((factors * factors).sum(axis: 1))
       norms[norms.eq(0)] = 1e-10 # no zeros
-      norms
+      factors / norms.expand_dims(1)
     end

-    def similar(id, map, factors, norms, count, index)
+    def similar(id, map, norm_factors, count, index)
       i = map[id]
-      if i
+
+      if i && norm_factors.shape[0] > 1
         if index && count
-          keys = map.keys
-          result = index.search(factors[i, true], size: count + 1)[1..-1]
-          result.map do |v|
-            {
-              # ids from batch_insert start at 1 instead of 0
-              item_id: keys[v[:id] - 1],
-              # convert cosine distance to cosine similarity
-              score: 1 - v[:distance]
-            }
+          if defined?(Faiss) && index.is_a?(Faiss::Index)
+            predictions, ids = index.search(norm_factors[i, true].expand_dims(0), count + 1).map { |v| v.to_a[0] }
+          else
+            result = index.search(norm_factors[i, true], size: count + 1)
+            # ids from batch_insert start at 1 instead of 0
+            ids = result.map { |v| v[:id] - 1 }
+            # convert cosine distance to cosine similarity
+            predictions = result.map { |v| 1 - v[:distance] }
           end
         else
-          # cosine similarity without norms[i]
-          # otherwise, denominator would be (norms[i] * norms)
-          predictions = factors.inner(factors[i, true]) / norms
+          predictions = norm_factors.inner(norm_factors[i, true])
+          indexes = predictions.sort_index.reverse
+          indexes = indexes[0...[count + 1, indexes.size].min] if count
+          predictions = predictions[indexes]
+          ids = indexes
+        end

-          predictions =
-            map.keys.zip(predictions).map do |item_id, pred|
-              {item_id: item_id, score: pred}
-            end
+        keys = map.keys

-          predictions.delete_at(i)
-          predictions.sort_by! { |pred| -pred[:score] } # already sorted by id
-          predictions = predictions.first(count) if count
-
-          # divide by norms[i] to get cosine similarity
-          # only need to do for returned records
-          predictions.each { |pred| pred[:score] /= norms[i] }
-          predictions
+        # TODO use user_id for similar_users in 0.3.0
+        key = :item_id
+
+        (1...ids.size).map do |i|
+          {key => keys[ids[i]], score: predictions[i]}
         end
       else
         []
       end
     end
@@ -302,10 +371,15 @@
       unless @implicit
         obj[:min_rating] = @min_rating
         obj[:max_rating] = @max_rating
       end

+      if @top_items
+        obj[:item_count] = @item_count
+        obj[:item_sum] = @item_sum
+      end
+
       obj
     end

     def marshal_load(obj)
       @implicit = obj[:implicit]
@@ -317,9 +391,15 @@
       @item_factors = obj[:item_factors]

       unless @implicit
         @min_rating = obj[:min_rating]
         @max_rating = obj[:max_rating]
+      end
+
+      @top_items = obj.key?(:item_count)
+      if @top_items
+        @item_count = obj[:item_count]
+        @item_sum = obj[:item_sum]
       end
     end
   end
 end
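
A few usage notes follow. The headline change is the new top_items: option on the constructor: when it is true, fit additionally tallies a per-item rating count (@item_count) and rating sum (@item_sum), which back the new public top_items method and a fallback in user_recs for unknown users (replacing the old "TODO maybe most popular items"). A minimal sketch of the new API; the dataset and results are illustrative, and the explicit-feedback path also needs the wilson_score gem:

    require "disco"

    recommender = Disco::Recommender.new(factors: 8, top_items: true)
    recommender.fit([
      {user_id: 1, item_id: "a", rating: 5},
      {user_id: 1, item_id: "b", rating: 3},
      {user_id: 2, item_id: "a", rating: 4}
    ])

    # most popular items; raises "top_items not computed" if the
    # recommender was built without top_items: true
    recommender.top_items(count: 2)

    # unknown users used to get []; they now fall back to top_items
    recommender.user_recs(999)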
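
user_recs itself was rewritten around Numo arrays. Instead of materializing an {item_id:, score:} hash for every item and calling delete_at on rated rows, it now ranks scores with sort_index, over-fetches count + rated.size candidates (rated items may still occupy the top slots), and skips rated items while assembling at most count results; scores are clipped to the rating range in place afterwards. Two behaviors worth noting, shown with the illustrative ids from the sketch above:

    # candidate scoring: ids unknown to the model are dropped; because
    # rated is {} on this path, already-rated items are NOT filtered out
    recommender.user_recs(1, count: 2, item_ids: ["a", "b", "unknown"])

    # default path: rated items are filtered and, for explicit feedback,
    # scores are clamped via predictions.inplace.clip(@min_rating, @max_rating)
    recommender.user_recs(1, count: 2)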
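
Index handling also changed shape: the single @user_index/@item_index pair becomes three independent indexes, and Faiss joins NGT as a supported backend. optimize_user_recs always builds an exact Faiss inner-product index (Faiss::IndexFlatIP) over the raw item factors, and user_recs consults it only when a count is given. The two similarity optimizers index the normalized factors and take a library: option; when it is nil, create_index picks "faiss" only if Faiss is already loaded and NGT is not, otherwise "ngt" (the 0.2.4 behavior). A sketch, assuming the faiss and ngt gems are installed:

    require "faiss" # load first if you want the nil default to resolve to Faiss

    recommender.optimize_user_recs                        # Faiss only
    recommender.optimize_similar_items(library: "faiss")
    recommender.optimize_similar_users(library: "ngt")

    recommender.similar_items("a", count: 5) # served from @similar_items_index
    recommender.user_recs(1, count: 5)       # served from @user_recs_index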
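
A subtle internal change makes this work: norms no longer returns the vector of L2 norms but the factor matrix divided by it, so user_norms/item_norms now cache unit-length rows. Cosine similarity between unit vectors is a plain inner product, which is what both the rewritten similar method and the Faiss IndexFlatIP similarity indexes rely on. A quick check of the identity with Numo (values are just an example):

    require "numo/narray"

    factors = Numo::SFloat.cast([[3.0, 4.0], [1.0, 0.0]])

    norms = Numo::SFloat::Math.sqrt((factors * factors).sum(axis: 1))
    norms[norms.eq(0)] = 1e-10 # avoid dividing by zero
    normalized = factors / norms.expand_dims(1)

    # cosine similarity of row 0 against every row, as an inner product
    normalized.inner(normalized[0, true]) # => Numo::SFloat[1, 0.6]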
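
For explicit feedback, top_items deliberately avoids ranking by raw average rating: it ranks by WilsonScore.rating_lower_bound(average, count, range), a confidence lower bound that shrinks toward the bottom of the rating range when an item has few ratings, so a single 5-star rating does not outrank thousands of 4.5-star ratings. Illustrative calls (exact return values omitted):

    require "wilson_score"

    range = 1..5
    WilsonScore.rating_lower_bound(5.0, 1, range)    # far below 5.0
    WilsonScore.rating_lower_bound(4.5, 1000, range) # close to 4.5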
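
Finally, serialization keeps pace with the new state: marshal_dump includes :item_count and :item_sum when top_items is enabled, and marshal_load infers @top_items from the presence of :item_count, so dumps from 0.2.4 still load cleanly and newer ones keep serving top_items after a round-trip:

    restored = Marshal.load(Marshal.dump(recommender))
    restored.top_items(count: 3) # still works; popularity stats survived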