// Copyright (C) 2011  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_STRUCTURAL_SVM_PRObLEM_THREADED_Hh_
#define DLIB_STRUCTURAL_SVM_PRObLEM_THREADED_Hh_

#include "structural_svm_problem_threaded_abstract.h"
#include "../algs.h"
#include <vector>
#include "structural_svm_problem.h"
#include "../matrix.h"
#include "sparse_vector.h"
#include <iostream>
#include "../threads.h"
#include "../misc_api.h"
#include "../statistics.h"

namespace dlib
{

// ----------------------------------------------------------------------------------------

    template <
        typename matrix_type_,
        typename feature_vector_type_ = matrix_type_
        >
    class structural_svm_problem_threaded : public structural_svm_problem<matrix_type_,feature_vector_type_> 
    {
        // A structural_svm_problem whose pass over all the training samples is spread
        // across a thread_pool.  Each task evaluates separation_oracle_cached() (an
        // inherited member, not defined in this file) on a block of sample indices and
        // folds the results into a shared loss and a shared subgradient, with
        // accum_mutex guarding every update of those shared outputs.
    public:

        typedef matrix_type_ matrix_type;
        typedef typename matrix_type::type scalar_type;
        typedef feature_vector_type_ feature_vector_type;

        // num_threads is forwarded to the thread_pool that runs the oracle calls.
        explicit structural_svm_problem_threaded (
            unsigned long num_threads
        ) :
            tp(num_threads),
            num_iterations_executed(0)
        {}

        // Reports the number of threads in the internal thread pool.
        unsigned long get_num_threads (
        ) const { return tp.num_threads_in_pool(); }

    private:

        // Function object handed to parallel_for_blocked().  It carries references to
        // everything a worker needs: the problem itself, the current w, and the two
        // shared outputs (subgradient and total_loss).
        struct binder
        {
            binder (
                const structural_svm_problem_threaded& self_,
                const matrix_type& w_,
                matrix_type& subgradient_,
                scalar_type& total_loss_,
                bool buffer_subgradients_locally_
            ) : self(self_), w(w_), subgradient(subgradient_), total_loss(total_loss_),
                buffer_subgradients_locally(buffer_subgradients_locally_){}

            // Handles the samples with indices in [begin, end).
            //
            // Two accumulation strategies exist.  The buffered one sums the block's
            // results privately and merges them into the shared outputs once; the
            // direct one merges after every single oracle call.  Buffering is only
            // used when it was requested and the block holds more than one sample,
            // since summing very high dimensional subgradients can itself be costly
            // and the caller decides, from measured run times, which scheme to use.
            void call_oracle (
                long begin,
                long end
            ) 
            {
                const bool accumulate_locally = buffer_subgradients_locally && (end-begin > 1);

                if (accumulate_locally)
                    accumulate_with_local_buffer(begin, end);
                else
                    accumulate_directly(begin, end);
            }

            // Direct scheme: each oracle result goes straight into the shared outputs.
            // The mutex is held only while the shared outputs are being updated, not
            // while the oracle runs.
            void accumulate_directly (
                long begin,
                long end
            )
            {
                scalar_type sample_loss;
                feature_vector_type psi;

                for (long idx = begin; idx < end; ++idx)
                {
                    self.separation_oracle_cached(idx, w, sample_loss, psi);

                    // The lock lives until the end of this iteration.
                    auto_mutex lock(self.accum_mutex);
                    total_loss += sample_loss;
                    add_to(subgradient, psi);
                }
            }

            // Buffered scheme: sum the whole block into task-local storage and take
            // the mutex a single time at the end to publish the block totals.
            void accumulate_with_local_buffer (
                long begin,
                long end
            )
            {
                scalar_type block_loss = 0;

                // Column vector with as many elements as the shared subgradient,
                // starting out as all zeros.
                matrix_type block_sum(subgradient.size(),1);
                block_sum = 0;

                feature_vector_type psi;

                for (long idx = begin; idx < end; ++idx)
                {
                    scalar_type sample_loss;
                    self.separation_oracle_cached(idx, w, sample_loss, psi);
                    block_loss += sample_loss;
                    add_to(block_sum, psi);
                }

                auto_mutex lock(self.accum_mutex);
                total_loss += block_loss;
                add_to(subgradient, block_sum);
            }

            const structural_svm_problem_threaded& self;
            const matrix_type& w;
            matrix_type& subgradient;
            scalar_type& total_loss;
            bool buffer_subgradients_locally;
        };


        // Runs the separation oracle over samples 0 .. get_num_samples()-1 on the
        // thread pool, adding the results into subgradient and total_loss.
        //
        // The time each call takes is recorded separately for the buffered and the
        // direct accumulation scheme, and the scheme with the smaller mean time is the
        // one normally selected.
        virtual void call_separation_oracle_on_all_samples (
            const matrix_type& w,
            matrix_type& subgradient,
            scalar_type& total_loss
        ) const
        {
            ++num_iterations_executed;

            const uint64 start_time = ts.get_timestamp();

            const bool buffering_looks_faster = with_buffer_time.mean() < without_buffer_time.mean();

            // On every 50th iteration deliberately run the opposite scheme so its
            // timing statistics keep getting refreshed and we notice if it has become
            // the better choice.
            const bool try_other_scheme = (num_iterations_executed%50) == 0;
            const bool buffer_subgradients_locally = try_other_scheme ? !buffering_looks_faster
                                                                      : buffering_looks_faster;

            binder b(*this, w, subgradient, total_loss, buffer_subgradients_locally);
            parallel_for_blocked(tp, 0, this->get_num_samples(), b, &binder::call_oracle);

            const uint64 elapsed = ts.get_timestamp() - start_time;

            // File the measurement under whichever scheme was actually used.
            running_stats<double>& timings = buffer_subgradients_locally ? with_buffer_time
                                                                         : without_buffer_time;
            timings.add(elapsed);
        }

        // All state is mutable because the oracle pass above is a const member.
        mutable thread_pool tp;
        mutable mutex accum_mutex;                          // guards subgradient/total_loss updates
        mutable timestamper ts;
        mutable running_stats<double> with_buffer_time;     // call durations, buffered scheme
        mutable running_stats<double> without_buffer_time;  // call durations, direct scheme
        mutable unsigned long num_iterations_executed;
    };

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_STRUCTURAL_SVM_PRObLEM_THREADED_Hh_