lib/disco/recommender.rb in disco-0.1.1 vs lib/disco/recommender.rb in disco-0.1.2

- old
+ new

@@ -68,13 +68,15 @@
       model = Libmf::Model.new(loss: loss, factors: @factors, iterations: @epochs, quiet: !verbose)
       model.fit(input, eval_set: eval_set)
 
       @global_mean = model.bias
 
-      # TODO read from LIBMF directly to Numo for performance
-      @user_factors = Numo::DFloat.cast(model.p_factors)
-      @item_factors = Numo::DFloat.cast(model.q_factors)
+      @user_factors = model.p_factors(format: :numo)
+      @item_factors = model.q_factors(format: :numo)
+
+      @user_index = nil
+      @item_index = nil
     end
 
     def user_recs(user_id, count: 5, item_ids: nil)
       u = @user_map[user_id]
 
@@ -104,21 +106,38 @@
         # TODO maybe most popular items
         []
       end
     end
 
+    def optimize_similar_items
+      @item_index = create_index(@item_factors)
+    end
+    alias_method :optimize_item_recs, :optimize_similar_items
+
+    def optimize_similar_users
+      @user_index = create_index(@user_factors)
+    end
+
     def similar_items(item_id, count: 5)
-      similar(item_id, @item_map, @item_factors, item_norms, count)
+      similar(item_id, @item_map, @item_factors, item_norms, count, @item_index)
     end
     alias_method :item_recs, :similar_items
 
     def similar_users(user_id, count: 5)
-      similar(user_id, @user_map, @user_factors, user_norms, count)
+      similar(user_id, @user_map, @user_factors, user_norms, count, @user_index)
     end
 
     private
 
+    def create_index(factors)
+      require "ngt"
+
+      index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine")
+      index.batch_insert(factors)
+      index
+    end
+
     def user_norms
       @user_norms ||= norms(@user_factors)
     end
 
     def item_norms
@@ -129,23 +148,40 @@
       norms = Numo::DFloat::Math.sqrt((factors * factors).sum(axis: 1))
       norms[norms.eq(0)] = 1e-10 # no zeros
       norms
     end
 
-    def similar(id, map, factors, norms, count)
+    def similar(id, map, factors, norms, count, index)
       i = map[id]
       if i
-        predictions = factors.dot(factors[i, true]) / norms
-
-        predictions =
-          map.keys.zip(predictions).map do |item_id, pred|
-            {item_id: item_id, score: pred}
+        if index && count
+          keys = map.keys
+          result = index.search(factors[i, true], size: count + 1)[1..-1]
+          result.map do |v|
+            {
+              # ids from batch_insert start at 1 instead of 0
+              item_id: keys[v[:id] - 1],
+              # convert cosine distance to cosine similarity
+              score: 1 - v[:distance]
+            }
           end
+        else
+          predictions = factors.dot(factors[i, true]) / norms
 
-        predictions.delete_at(i)
-        predictions.sort_by! { |pred| -pred[:score] } # already sorted by id
-        predictions = predictions.first(count) if count
-        predictions
+          predictions =
+            map.keys.zip(predictions).map do |item_id, pred|
+              {item_id: item_id, score: pred}
+            end
+
+          max_score = predictions.delete_at(i)[:score]
+          predictions.sort_by! { |pred| -pred[:score] } # already sorted by id
+          predictions = predictions.first(count) if count
+          # divide by max score to get cosine similarity
+          # only need to do for returned records
+          # could alternatively do cosine distance = 1 - cosine similarity
+          # predictions.each { |pred| pred[:score] /= max_score }
+          predictions
+        end
       else
         []
      end
    end
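
The net effect of this release is an opt-in approximate-nearest-neighbor path: behavior is unchanged until optimize_similar_items or optimize_similar_users is called after fitting, at which point similar_items and similar_users query the NGT index instead of computing a dot product against every row of the factor matrix. A minimal usage sketch (the ratings below are illustrative, not from the gem's docs):

    require "disco"

    recommender = Disco::Recommender.new(factors: 8)
    recommender.fit([
      {user_id: 1, item_id: "apple", rating: 5},
      {user_id: 1, item_id: "orange", rating: 4},
      {user_id: 2, item_id: "apple", rating: 3},
      {user_id: 2, item_id: "banana", rating: 2}
    ])

    # builds an NGT index over the item factors; later calls use it
    recommender.optimize_similar_items
    recommender.similar_items("apple", count: 2)

Note that fit resets @item_index and @user_index to nil, so the indexes must be rebuilt after refitting.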
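
Two NGT conventions explain the bookkeeping in the index branch of similar: batch_insert assigns ids starting at 1, and with distance_type "Cosine" the search results report cosine distance rather than similarity. Hence keys[v[:id] - 1] to map back into the id map, and 1 - v[:distance] to recover a similarity score. A standalone sketch against the ngt gem, with toy vectors:

    require "ngt"

    index = Ngt::Index.new(2, distance_type: "Cosine")
    index.batch_insert([[1.0, 0.0], [0.6, 0.8], [0.0, 1.0]])

    # each result is {id:, distance:}; ids are 1-based
    index.search([1.0, 0.0], size: 3).each do |v|
      puts "row #{v[:id] - 1}: similarity #{(1 - v[:distance]).round(3)}"
    end

The size: count + 1 plus [1..-1] in the diff exist because the query vector is itself in the index, so the first result is the queried item, which gets dropped.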
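
The commented-out normalization in the brute-force branch is worth unpacking. factors.dot(factors[i, true]) / norms divides each dot product by the norm of the candidate vector only, so the scores are cosine similarities scaled by the norm of the query vector. The self-score deleted at index i equals exactly that norm, which is why dividing by max_score would yield true cosine similarity, on the same scale as the 1 - distance scores from the NGT path. A small Numo check (toy factor values):

    require "numo/narray"

    # 3 items x 2 latent dimensions
    factors = Numo::DFloat[[1.0, 0.0], [0.6, 0.8], [0.0, 1.0]]
    norms = Numo::DFloat::Math.sqrt((factors * factors).sum(axis: 1))

    i = 0
    scores = factors.dot(factors[i, true]) / norms  # (f_j . f_i) / ||f_j||
    max_score = scores[i]                           # self-score = ||f_i||

    # (f_j . f_i) / (||f_j|| * ||f_i||), i.e. cosine similarity; 1.0 at j = i
    p scores / max_score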