require "svm" #File.dirname(__FILE__) + "/../../../libsvm-ruby-swig/lib/svm" require "bloomfilter" # igrigorik-bloomfilter (github) module Basset # =Overview # A class for SVM document classification. Follows the same basic interface # as NaiveBayes; add labeled training documents to the classifier, then # use it to classify unlabeled documents. Do test your accuracy before # using the classifier in production, there are a lot of knobs to tweak. # When testing, it is usually best to use a separate set of documents, i.e., # not the training set. # =Learning Resources # SVM can be tricky to understand at first, try the following references: # http://en.wikipedia.org/wiki/Support_vector_machine # http://www.igvita.com/2008/01/07/support-vector-machines-svm-in-ruby/ # http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf # =Implementation # This class wraps libsvm-ruby-swig, which is itself a swig based wrapper for # libsvm. # libsvm-ruby-swig: http://github.com/tomz/libsvm-ruby-swig # libsvm: http://www.csie.ntu.edu.tw/~cjlin/libsvm # verbose version: # Chih-Chung Chang and Chih-Jen Lin, LIBSVM : a library for support vector machines, 2001. Software available at http://www.csie.ntu.edu.tw/~cjlin/libsvm # # There is also the libsvm-ruby implementation. It was originally available from # http://debian.cilibrar.com/debian/pool/main/libs/libsvm-ruby/libsvm-ruby_2.8.4.orig.tar.gz # but was not available from there when I last checked. The Ubuntu package # was still available as of this writing. class Svm #include YamlSerialization attr_reader :class_labels, :feature_dictionary def initialize @total_classes = 0 @feature_dictionary = [] @class_labels = {} @documents_for_class = Hash.new {|docs_hash,key| docs_hash[key] = []} @svm_parameter = default_svm_parameter end # Adds a new document to the training set. def add_document(classification, feature_vectors) update_class_labels_with_new(classification) if new_class?(classification) @feature_dictionary += feature_vectors.map { |fv| fv.name } @feature_dictionary.uniq! @documents_for_class[classification] << feature_vectors.map { |fv| fv.name } reset_memoized_vars! end # Gives the vector representation of the training documents of class # _classification_ def vectorized_docs(classification) # hardwired to binary representation @documents_for_class[classification].map do |features| vectorize_doc(features) #@feature_dictionary.map { |dict_feature| features.include?(dict_feature) ? 1 : 0} end end # Returns the vectorized representation of the training data, suitable for # use in the constructor for the libsvm Problem class. def labels_and_document_vectors # {labels => [features1-label, features2-label, ...], :features => [features1, features2, ...]} labels_features = {:labels => [], :features => []} @class_labels.each do |classification, label| vectorized_docs(classification).each do |document_vector| labels_features[:labels] << label labels_features[:features] << document_vector end end labels_features end def classify(feature_vectors) class_of_label(model.predict(vectorize_doc(feature_vectors.map { |fv| fv.name }))) end def classes @class_labels.keys end # Exposes the libsvm-ruby-swig Parameter object. If given # a block, the parameter object is yielded, otherwise, # it's returned. 
    #
    # For example, to set parameters to their default values:
    #
    #   basset_svm_obj.parameters do |param|
    #     param.C = 100
    #     param.svm_type = NU_SVC
    #     param.degree = 1
    #     param.coef0 = 0
    #     param.eps = 0.001
    #     param.kernel_type = RBF
    #   end
    #
    # To access a single value:
    #
    #   basset_svm_obj.parameters.svm_type  # => NU_SVC
    def parameters
      if block_given?
        yield @svm_parameter
      else
        @svm_parameter
      end
    end

    private

    # Converts a list of feature names into a binary vector over the feature
    # dictionary: 1 if the feature occurs in the document, 0 otherwise.
    def vectorize_doc(features)
      vectorized_doc = Array.new(@feature_dictionary.size, 0)
      features.each do |feature|
        if index = feature_dictionary_hash[feature]
          vectorized_doc[index] = 1
        end
      end
      vectorized_doc
    end

    # Memoized lookup from feature name to its index in the feature dictionary,
    # backed by a bloom filter to keep memory usage down.
    def feature_dictionary_hash
      unless @memoized_feature_dictionary_hash
        m = 15 * @feature_dictionary.count # bloom filter size (bytes)
        @memoized_feature_dictionary_hash = BloomFilter.new(m, 3, 23)
        @feature_dictionary.each_index do |i|
          @memoized_feature_dictionary_hash[@feature_dictionary[i]] = i
        end
      end
      @memoized_feature_dictionary_hash
    end

    def reset_memoized_vars!
      @memoized_model, @memoized_problem, @memoized_feature_dictionary_hash = nil, nil, nil
      @memoized_inverted_class_labels = nil
    end

    # The trained libsvm Model, rebuilt lazily after the training data changes.
    def model
      @memoized_model ||= Model.new(problem, @svm_parameter)
    end

    def problem
      unless @memoized_problem
        labels_features = labels_and_document_vectors
        @memoized_problem = Problem.new(labels_features[:labels], labels_features[:features])
      end
      @memoized_problem
    end

    def new_class?(classification)
      !@class_labels.keys.include?(classification)
    end

    def default_svm_parameter
      param = ::Parameter.new
      param.C = 100
      param.svm_type = NU_SVC
      param.degree = 1
      param.coef0 = 0
      param.eps = 0.001
      param.nu = 0.5 #?! this blows up on my dataset...
      param.kernel_type = RBF
      param
    end

    # Assigns the next integer label to a previously unseen class.
    def update_class_labels_with_new(classification)
      #@class_labels.each_value { |vector| vector << 0 }
      @class_labels[classification] = @total_classes #Array.new(@total_classes, 0) << 1
      @total_classes += 1
    end

    # Maps a numeric libsvm label back to the original class name.
    def class_of_label(label)
      unless @memoized_inverted_class_labels
        @memoized_inverted_class_labels = @class_labels.invert
      end
      @memoized_inverted_class_labels[label.to_i]
    end
  end
end
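
# A rough usage sketch, not taken from the original source. It assumes the
# libsvm-ruby-swig and bloomfilter gems are installed, and that each document
# has already been reduced to a collection of feature objects that respond to
# +name+ (the names +spam_docs+, +ham_docs+ and +unseen_doc_features+ below are
# hypothetical placeholders):
#
#   svm = Basset::Svm.new
#   spam_docs.each { |features| svm.add_document(:spam, features) }
#   ham_docs.each  { |features| svm.add_document(:ham,  features) }
#
#   svm.parameters { |param| param.C = 10 }  # optionally tweak the libsvm knobs
#   svm.classify(unseen_doc_features)        # => :spam or :ham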