lib/svm_helper/selectors/simple.rb in svm_helper-0.1.0 vs lib/svm_helper/selectors/simple.rb in svm_helper-0.1.1

- old
+ new

@@ -24,11 +24,12 @@ attr_accessor :global_dictionary - def initialize args={} + def initialize classification, args={} + @classification = classification @global_dictionary = args.fetch(:global_dictionary) {[]} @language = args.fetch(:language){'en'} @parallel = args.fetch(:parallel){false} end @@ -41,30 +42,30 @@ # @param data_set [Array<PreprocessedData>] list of preprocessed data # @param classification [Symbol] in `:industry`, `:function`, `:career_level` # @param dictionary_size [Integer] Size of a dictionary to create if non exists # # @return [Array<FeatureVector>] list of feature vectors and labels - def generate_vectors data_set, classification=:function, dictionary_size=DEFAULT_DICTIONARY_SIZE + def generate_vectors data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE words_per_data = extract_words data_set generate_global_dictionary words_per_data, dictionary_size make_vectors(words_per_data) do |words,index| word_set = words.uniq - make_vector word_set, data_set[index], classification + make_vector word_set, data_set[index] end end # # generates a feature vector with its label # @param data [PreprocessedData] # @param classification [Symbol] in `:industry`, `:function`, `:career_level` # @param dictionary [Array] dictionary to use for this selection # # @return [FeatureVector] - def generate_vector data, classification=:function, dictionary=global_dictionary + def generate_vector data, dictionary=global_dictionary word_set = Set.new extract_words_from_data(data) - make_vector word_set, data, classification, dictionary + make_vector word_set, data, dictionary end # # loads a txt file with stop words # @param location String folder with stopword lists @@ -107,39 +108,33 @@ # @return [Array<String>] list of words def extract_words_from_data data (data.data.flat_map(&:split) - stopwords).delete_if { |e| e.size <= 3 } end - def reset + def reset classification @global_dictionary = [] + @classification = classification end private # # creates a feature vector for the given words, classification and dictionary # also adds the label # @param words [Array<String>] list of words # @param data [PreprocessedData] - # @param classification [Symbol] in `:industry`, `:function`, `:career_level` # @param dictionary # # @return [FeatureVector] - def make_vector words, data, classification, dictionary=global_dictionary + def make_vector words, data, dictionary=global_dictionary FeatureVector.new( word_data: dictionary.map{|dic_word| - words.include?(dic_word) ? 1 : 0 - }, - classification_arrays: { - function: classification_array(data.ids, :function), - industry: classification_array(data.ids, :industry), - career_level: classification_array(data.ids, :career_level) }, - labels: { - function: data.labels[:function] ? 1 : 0, - industry: data.labels[:industry] ? 1 : 0, - career_level: data.labels[:career_level] ? 1 : 0 } - ).tap{|e| e.send("#{classification}!")} + words.include?(dic_word) ? 1 : 0 + }, + classification: classification_array(data.id), + label: data.label ? 1 : 0 + ) end def make_vectors data, &block if @parallel && RUBY_PLATFORM == 'java' Parallel.map_with_index(data, in_threads: THREAD_COUNT ){|e,i| yield e,i } @@ -153,11 +148,10 @@ # # creates the classification specific part of the feature vector # @param ids [Hash] hash with classification ids # # @return [Array<Integer>] list of size=count(classifcation_ids) with only one not zero item - def classification_array(ids, classification) - id = ids[classification] - Array.new(CLASSIFICATIONS_SIZE[classification]){|n| n==(id-1) ? 1 : 0} + def classification_array(id) + Array.new(CLASSIFICATIONS_SIZE[@classification]){|n| n==(id-1) ? 1 : 0} end end end \ No newline at end of file