lib/disco/recommender.rb in disco-0.1.1 vs lib/disco/recommender.rb in disco-0.1.2
- old
+ new
@@ -68,13 +68,15 @@
      model = Libmf::Model.new(loss: loss, factors: @factors, iterations: @epochs, quiet: !verbose)
      model.fit(input, eval_set: eval_set)
      @global_mean = model.bias
-      # TODO read from LIBMF directly to Numo for performance
-      @user_factors = Numo::DFloat.cast(model.p_factors)
-      @item_factors = Numo::DFloat.cast(model.q_factors)
+      @user_factors = model.p_factors(format: :numo)
+      @item_factors = model.q_factors(format: :numo)
+
+      @user_index = nil
+      @item_index = nil
    end
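
Two things change in fit here: the latent factors are now read from LIBMF
straight into Numo (resolving the old TODO), and the ANN indexes are reset,
since factors from a fresh fit would invalidate any index built earlier.
A minimal sketch of the factor read, assuming the [row, col, value] input
format from the libmf README:

  require "libmf"
  require "numo/narray"

  model = Libmf::Model.new(factors: 8, quiet: true)
  model.fit([[0, 0, 5.0], [0, 1, 3.0], [1, 0, 4.0]])

  # 0.1.1: C floats -> Ruby arrays -> Numo copy
  Numo::DFloat.cast(model.p_factors)
  # 0.1.2: Numo array built directly
  model.p_factors(format: :numo)
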
    def user_recs(user_id, count: 5, item_ids: nil)
      u = @user_map[user_id]
@@ -104,21 +106,38 @@
        # TODO maybe most popular items
        []
      end
    end

+    def optimize_similar_items
+      @item_index = create_index(@item_factors)
+    end
+    alias_method :optimize_item_recs, :optimize_similar_items
+
+    def optimize_similar_users
+      @user_index = create_index(@user_factors)
+    end
+
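
These opt-in methods are the headline change: each builds an approximate
nearest neighbor index over the factor matrix, which similar_items and
similar_users then consult instead of scanning every row. A sketch of the
intended call order, with data shapes as in the Disco README:

  recommender = Disco::Recommender.new(factors: 20)
  recommender.fit([
    {user_id: 1, item_id: "product_a", rating: 5},
    {user_id: 2, item_id: "product_a", rating: 3}
  ])

  recommender.optimize_similar_items      # build the index once
  recommender.similar_items("product_a")  # served from the index

Because fit sets @user_index and @item_index back to nil (first hunk), a
refit quietly falls back to exact search until optimize_* is called again.
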
    def similar_items(item_id, count: 5)
-      similar(item_id, @item_map, @item_factors, item_norms, count)
+      similar(item_id, @item_map, @item_factors, item_norms, count, @item_index)
    end
    alias_method :item_recs, :similar_items

    def similar_users(user_id, count: 5)
-      similar(user_id, @user_map, @user_factors, user_norms, count)
+      similar(user_id, @user_map, @user_factors, user_norms, count, @user_index)
    end

    private

+    def create_index(factors)
+      require "ngt"
+
+      index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
+      index.batch_insert(factors)
+      index
+    end
+
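
create_index builds an NGT (Neighborhood Graph and Tree) index via the ngt
gem; the require is deferred so ngt stays an optional dependency, and the
"Cosine" distance matches the normalized dot-product scoring below. A
standalone sketch of the parts of the ngt API used here:

  require "ngt"
  require "numo/narray"

  factors = Numo::DFloat[[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]]
  index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
  index.batch_insert(factors)
  index.search(factors[0, true], size: 2)
  # => hashes with :id (1-based, in insertion order) and :distance

Searching for a vector that is itself in the index returns that vector as the
first, zero-distance hit, which is why similar below requests count + 1
results and drops the first with [1..-1].
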
    def user_norms
      @user_norms ||= norms(@user_factors)
    end

    def item_norms
@@ -129,23 +148,40 @@
      norms = Numo::DFloat::Math.sqrt((factors * factors).sum(axis: 1))
      norms[norms.eq(0)] = 1e-10 # no zeros
      norms
    end
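
For context, norms computes the L2 norm of each factor row, substituting a
tiny epsilon for all-zero rows so the division in similar can never be 0/0.
A quick worked example:

  require "numo/narray"

  factors = Numo::DFloat[[3, 4], [0, 0]]
  norms = Numo::DFloat::Math.sqrt((factors * factors).sum(axis: 1))
  # => Numo::DFloat[5, 0]
  norms[norms.eq(0)] = 1e-10
  norms  # => Numo::DFloat[5, 1e-10]
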
-    def similar(id, map, factors, norms, count)
+    def similar(id, map, factors, norms, count, index)
      i = map[id]
      if i
-        predictions = factors.dot(factors[i, true]) / norms
-
-        predictions =
-          map.keys.zip(predictions).map do |item_id, pred|
-            {item_id: item_id, score: pred}
+        if index && count
+          keys = map.keys
+          result = index.search(factors[i, true], size: count + 1)[1..-1]
+          result.map do |v|
+            {
+              # ids from batch_insert start at 1 instead of 0
+              item_id: keys[v[:id] - 1],
+              # convert cosine distance to cosine similarity
+              score: 1 - v[:distance]
+            }
          end
+        else
+          predictions = factors.dot(factors[i, true]) / norms
-        predictions.delete_at(i)
-        predictions.sort_by! { |pred| -pred[:score] } # already sorted by id
-        predictions = predictions.first(count) if count
-        predictions
+          predictions =
+            map.keys.zip(predictions).map do |item_id, pred|
+              {item_id: item_id, score: pred}
+            end
+
+          max_score = predictions.delete_at(i)[:score]
+          predictions.sort_by! { |pred| -pred[:score] } # already sorted by id
+          predictions = predictions.first(count) if count
+          # divide by max score to get cosine similarity
+          # only need to do for returned records
+          # could alternatively return cosine distance = 1 - cosine similarity
+          predictions.each { |pred| pred[:score] /= max_score }
+          predictions
+        end
      else
        []
      end
    end
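
Why the two branches report the same scale: in the exact branch,
predictions[j] = dot(f_i, f_j) / ||f_j||, so the self-entry removed by
delete_at(i) equals dot(f_i, f_i) / ||f_i|| = ||f_i||, the largest value in
the list by Cauchy-Schwarz. Dividing the returned records by that max_score
yields dot(f_i, f_j) / (||f_i|| * ||f_j||), i.e. cosine similarity, the same
quantity the NGT branch derives from 1 - distance. A numeric check with toy
factors:

  require "numo/narray"

  factors = Numo::DFloat[[2.0, 0.0], [0.6, 0.8], [0.0, 1.0]]
  norms = Numo::DFloat::Math.sqrt((factors * factors).sum(axis: 1))
  predictions = (factors.dot(factors[0, true]) / norms).to_a
  # => [2.0, 1.2, 0.0]
  max_score = predictions.delete_at(0)  # self-score, equals ||f_0|| = 2.0
  predictions.map { |s| s / max_score }
  # => [0.6, 0.0], the cosine similarities against item 0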