/**
 * Copyright (c) 2016-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "loss.h"
#include "utils.h"

#include <algorithm>
#include <cassert>
#include <cmath>

namespace fasttext {

constexpr int64_t SIGMOID_TABLE_SIZE = 512;
constexpr int64_t MAX_SIGMOID = 8;
constexpr int64_t LOG_TABLE_SIZE = 512;

bool comparePairs(
    const std::pair<real, int32_t>& l,
    const std::pair<real, int32_t>& r) {
  return l.first > r.first;
}

real std_log(real x) {
  return std::log(x + 1e-5);
}

Loss::Loss(std::shared_ptr<Matrix>& wo) : wo_(wo) {
  // Precompute lookup tables so training avoids repeated std::exp/std::log.
  // t_sigmoid_ covers [-MAX_SIGMOID, MAX_SIGMOID] in uniform steps.
  t_sigmoid_.reserve(SIGMOID_TABLE_SIZE + 1);
  for (int i = 0; i < SIGMOID_TABLE_SIZE + 1; i++) {
    real x = real(i * 2 * MAX_SIGMOID) / SIGMOID_TABLE_SIZE - MAX_SIGMOID;
    t_sigmoid_.push_back(1.0 / (1.0 + std::exp(-x)));
  }

  // t_log_ covers (0, 1]; the 1e-5 offset keeps std::log away from -inf.
  t_log_.reserve(LOG_TABLE_SIZE + 1);
  for (int i = 0; i < LOG_TABLE_SIZE + 1; i++) {
    real x = (real(i) + 1e-5) / LOG_TABLE_SIZE;
    t_log_.push_back(std::log(x));
  }
}

real Loss::log(real x) const {
  if (x > 1.0) {
    return 0.0;
  }
  int64_t i = int64_t(x * LOG_TABLE_SIZE);
  return t_log_[i];
}

real Loss::sigmoid(real x) const {
  if (x < -MAX_SIGMOID) {
    return 0.0;
  } else if (x > MAX_SIGMOID) {
    return 1.0;
  } else {
    int64_t i =
        int64_t((x + MAX_SIGMOID) * SIGMOID_TABLE_SIZE / MAX_SIGMOID / 2);
    return t_sigmoid_[i];
  }
}

void Loss::predict(
    int32_t k,
    real threshold,
    Predictions& heap,
    Model::State& state) const {
  computeOutput(state);
  findKBest(k, threshold, heap, state.output);
  std::sort_heap(heap.begin(), heap.end(), comparePairs);
}

void Loss::findKBest(
    int32_t k,
    real threshold,
    Predictions& heap,
    const Vector& output) const {
  // Maintain a min-heap of the k best (log-probability, label) pairs.
  for (int32_t i = 0; i < output.size(); i++) {
    if (output[i] < threshold) {
      continue;
    }
    if (heap.size() == k && std_log(output[i]) < heap.front().first) {
      continue;
    }
    heap.push_back(std::make_pair(std_log(output[i]), i));
    std::push_heap(heap.begin(), heap.end(), comparePairs);
    if (heap.size() > k) {
      std::pop_heap(heap.begin(), heap.end(), comparePairs);
      heap.pop_back();
    }
  }
}

BinaryLogisticLoss::BinaryLogisticLoss(std::shared_ptr<Matrix>& wo)
    : Loss(wo) {}

real BinaryLogisticLoss::binaryLogistic(
    int32_t target,
    Model::State& state,
    bool labelIsPositive,
    real lr,
    bool backprop) const {
  real score = sigmoid(wo_->dotRow(state.hidden, target));
  if (backprop) {
    real alpha = lr * (real(labelIsPositive) - score);
    state.grad.addRow(*wo_, target, alpha);
    wo_->addVectorToRow(state.hidden, target, alpha);
  }
  if (labelIsPositive) {
    return -log(score);
  } else {
    return -log(1.0 - score);
  }
}

void BinaryLogisticLoss::computeOutput(Model::State& state) const {
  Vector& output = state.output;
  output.mul(*wo_, state.hidden);
  int32_t osz = output.size();
  for (int32_t i = 0; i < osz; i++) {
    output[i] = sigmoid(output[i]);
  }
}

OneVsAllLoss::OneVsAllLoss(std::shared_ptr<Matrix>& wo)
    : BinaryLogisticLoss(wo) {}

real OneVsAllLoss::forward(
    const std::vector<int32_t>& targets,
    int32_t /* we take all targets here */,
    Model::State& state,
    real lr,
    bool backprop) {
  real loss = 0.0;
  int32_t osz = state.output.size();
  for (int32_t i = 0; i < osz; i++) {
    bool isMatch = utils::contains(targets, i);
    loss += binaryLogistic(i, state, isMatch, lr, backprop);
  }
  return loss;
}
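// Worked example for the sampling table built just below (added for
// exposition; the numbers are hypothetical, not from the fastText sources).
// Each label i is pushed into negatives_ about count[i]^0.5 *
// NEGATIVE_TABLE_SIZE / z times, so uniform draws from the table follow a
// square-root-smoothed unigram distribution. With counts {9, 1} and a table
// size of 10:
//
//   z = 9^0.5 + 1^0.5 = 3 + 1 = 4
//   label 0: j < 3 * 10 / 4 = 7.5  ->  8 entries
//   label 1: j < 1 * 10 / 4 = 2.5  ->  3 entries
//
// Label 0 is then drawn with probability 8/11 ~= 0.73 rather than its raw
// frequency 0.9; the smoothing deliberately over-samples rare labels as
// negatives.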
NegativeSamplingLoss::NegativeSamplingLoss(
    std::shared_ptr<Matrix>& wo,
    int neg,
    const std::vector<int64_t>& targetCounts)
    : BinaryLogisticLoss(wo), neg_(neg), negatives_(), uniform_() {
  real z = 0.0;
  for (size_t i = 0; i < targetCounts.size(); i++) {
    z += std::pow(targetCounts[i], 0.5);
  }
  for (size_t i = 0; i < targetCounts.size(); i++) {
    real c = std::pow(targetCounts[i], 0.5);
    for (size_t j = 0; j < c * NegativeSamplingLoss::NEGATIVE_TABLE_SIZE / z;
         j++) {
      negatives_.push_back(i);
    }
  }
  uniform_ = std::uniform_int_distribution<size_t>(0, negatives_.size() - 1);
}

real NegativeSamplingLoss::forward(
    const std::vector<int32_t>& targets,
    int32_t targetIndex,
    Model::State& state,
    real lr,
    bool backprop) {
  assert(targetIndex >= 0);
  assert(targetIndex < targets.size());
  int32_t target = targets[targetIndex];
  // One positive update for the true target, then neg_ updates against
  // sampled negatives.
  real loss = binaryLogistic(target, state, true, lr, backprop);

  for (int32_t n = 0; n < neg_; n++) {
    auto negativeTarget = getNegative(target, state.rng);
    loss += binaryLogistic(negativeTarget, state, false, lr, backprop);
  }
  return loss;
}

int32_t NegativeSamplingLoss::getNegative(
    int32_t target,
    std::minstd_rand& rng) {
  int32_t negative;
  do {
    negative = negatives_[uniform_(rng)];
  } while (target == negative); // resample until the draw differs from target
  return negative;
}

HierarchicalSoftmaxLoss::HierarchicalSoftmaxLoss(
    std::shared_ptr<Matrix>& wo,
    const std::vector<int64_t>& targetCounts)
    : BinaryLogisticLoss(wo),
      paths_(),
      codes_(),
      tree_(),
      osz_(targetCounts.size()) {
  buildTree(targetCounts);
}

void HierarchicalSoftmaxLoss::buildTree(const std::vector<int64_t>& counts) {
  // Huffman construction. Leaves occupy [0, osz_) and hold the label counts
  // (which arrive sorted in decreasing order); internal nodes fill
  // [osz_, 2 * osz_ - 1). Each iteration merges the two smallest available
  // nodes, scanning leaves right-to-left and internal nodes left-to-right.
  tree_.resize(2 * osz_ - 1);
  for (int32_t i = 0; i < 2 * osz_ - 1; i++) {
    tree_[i].parent = -1;
    tree_[i].left = -1;
    tree_[i].right = -1;
    tree_[i].count = 1e15;
    tree_[i].binary = false;
  }
  for (int32_t i = 0; i < osz_; i++) {
    tree_[i].count = counts[i];
  }
  int32_t leaf = osz_ - 1;
  int32_t node = osz_;
  for (int32_t i = osz_; i < 2 * osz_ - 1; i++) {
    int32_t mini[2] = {0};
    for (int32_t j = 0; j < 2; j++) {
      if (leaf >= 0 && tree_[leaf].count < tree_[node].count) {
        mini[j] = leaf--;
      } else {
        mini[j] = node++;
      }
    }
    tree_[i].left = mini[0];
    tree_[i].right = mini[1];
    tree_[i].count = tree_[mini[0]].count + tree_[mini[1]].count;
    tree_[mini[0]].parent = i;
    tree_[mini[1]].parent = i;
    tree_[mini[1]].binary = true;
  }
  // Record, for every leaf, its path of internal nodes up to the root and the
  // corresponding left/right binary code.
  for (int32_t i = 0; i < osz_; i++) {
    std::vector<int32_t> path;
    std::vector<bool> code;
    int32_t j = i;
    while (tree_[j].parent != -1) {
      path.push_back(tree_[j].parent - osz_);
      code.push_back(tree_[j].binary);
      j = tree_[j].parent;
    }
    paths_.push_back(path);
    codes_.push_back(code);
  }
}

real HierarchicalSoftmaxLoss::forward(
    const std::vector<int32_t>& targets,
    int32_t targetIndex,
    Model::State& state,
    real lr,
    bool backprop) {
  real loss = 0.0;
  int32_t target = targets[targetIndex];
  const std::vector<bool>& binaryCode = codes_[target];
  const std::vector<int32_t>& pathToRoot = paths_[target];
  for (int32_t i = 0; i < pathToRoot.size(); i++) {
    loss += binaryLogistic(pathToRoot[i], state, binaryCode[i], lr, backprop);
  }
  return loss;
}

void HierarchicalSoftmaxLoss::predict(
    int32_t k,
    real threshold,
    Predictions& heap,
    Model::State& state) const {
  dfs(k, threshold, 2 * osz_ - 2, 0.0, heap, state.hidden);
  std::sort_heap(heap.begin(), heap.end(), comparePairs);
}

void HierarchicalSoftmaxLoss::dfs(
    int32_t k,
    real threshold,
    int32_t node,
    real score,
    Predictions& heap,
    const Vector& hidden) const {
  // Prune: the accumulated log-probability can only decrease further down.
  if (score < std_log(threshold)) {
    return;
  }
  if (heap.size() == k && score < heap.front().first) {
    return;
  }

  if (tree_[node].left == -1 && tree_[node].right == -1) {
    heap.push_back(std::make_pair(score, node));
    std::push_heap(heap.begin(), heap.end(), comparePairs);
    if (heap.size() > k) {
      std::pop_heap(heap.begin(), heap.end(), comparePairs);
      heap.pop_back();
    }
    return;
  }

  real f = wo_->dotRow(hidden, node - osz_);
  f = 1. / (1 + std::exp(-f));

  dfs(k, threshold, tree_[node].left, score + std_log(1.0 - f), heap, hidden);
  dfs(k, threshold, tree_[node].right, score + std_log(f), heap, hidden);
}
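// Sketch of how dfs() above scores a leaf (added for exposition; the node
// probabilities are hypothetical). The score threaded through the recursion
// is a running sum of log-probabilities, one term per internal node on the
// path from the root:
//
//   root:  f = 0.8  ->  left child continues with std_log(0.2),
//                       right child with std_log(0.8)
//   next:  f = 0.4  ->  a leaf reached right-then-left gets
//                       score = std_log(0.8) + std_log(1.0 - 0.4) ~= -0.73
//
// Every term is the log of a probability, hence non-positive (up to the 1e-5
// guard in std_log), so a partial score already below std_log(threshold) or
// below the k-th best leaf found so far can never recover; that is what makes
// the two early returns at the top of dfs() safe prunes.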
SoftmaxLoss::SoftmaxLoss(std::shared_ptr<Matrix>& wo) : Loss(wo) {}

void SoftmaxLoss::computeOutput(Model::State& state) const {
  Vector& output = state.output;
  output.mul(*wo_, state.hidden);
  // Numerically stable softmax: shift by the max logit before exponentiating.
  real max = output[0], z = 0.0;
  int32_t osz = output.size();
  for (int32_t i = 0; i < osz; i++) {
    max = std::max(output[i], max);
  }
  for (int32_t i = 0; i < osz; i++) {
    output[i] = std::exp(output[i] - max);
    z += output[i];
  }
  for (int32_t i = 0; i < osz; i++) {
    output[i] /= z;
  }
}

real SoftmaxLoss::forward(
    const std::vector<int32_t>& targets,
    int32_t targetIndex,
    Model::State& state,
    real lr,
    bool backprop) {
  computeOutput(state);
  assert(targetIndex >= 0);
  assert(targetIndex < targets.size());
  int32_t target = targets[targetIndex];

  if (backprop) {
    int32_t osz = wo_->size(0);
    for (int32_t i = 0; i < osz; i++) {
      real label = (i == target) ? 1.0 : 0.0;
      real alpha = lr * (label - state.output[i]);
      state.grad.addRow(*wo_, i, alpha);
      wo_->addVectorToRow(state.hidden, i, alpha);
    }
  }
  return -log(state.output[target]);
}

} // namespace fasttext
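// Worked example for the stable softmax in SoftmaxLoss::computeOutput above
// (added for exposition; the logits are hypothetical). For logits {2, 1, 0},
// subtracting max = 2 before exponentiating gives
//
//   exp({0, -1, -2}) = {1.000, 0.368, 0.135},  z = 1.503
//   output = {0.665, 0.245, 0.090}
//
// SoftmaxLoss::forward() then applies alpha = lr * (label - output[i]), the
// usual cross-entropy gradient: the target's output row moves toward the
// hidden vector with weight lr * (1 - p_target), while every other row is
// pushed away by lr * p_i.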