lib/svm_helper/selectors/simple.rb in svm_helper-0.1.1 vs lib/svm_helper/selectors/simple.rb in svm_helper-0.2.1

- old
+ new

@@ -3,35 +3,24 @@
   # Selector which uses a simple dictionary to generate feature vectors
   #
   # @author Andreas Eger
   #
   class Simple
-    THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
-    # stopword file
-    #TODO use File.expand_path
-    STOPWORD_LOCATION = File.join(File.dirname(__FILE__),'..','stopwords')
+    include ::ParallelHelper

     # default dictionary size
     DEFAULT_DICTIONARY_SIZE = 800

-    CLASSIFICATIONS_SIZE= if defined?(Pjpp) == 'constant'
-      { function: Pjpp::Function.count,
-        industry: Pjpp::Industry.count,
-        career_level: Pjpp::CareerLevel.count }
-    else
-      { function: 19,     # 1..19
-        industry: 632,    # 1..14370 but not all ids used
-        career_level: 8 } # 1..8
-    end
-
-
-    attr_accessor :global_dictionary
-
+    attr_reader :classification_encoding,
+                :gram_size,
+                :word_selection
     def initialize classification, args={}
       @classification = classification
       @global_dictionary = args.fetch(:global_dictionary) {[]}
-      @language = args.fetch(:language){'en'}
+      @classification_encoding = args.fetch(:classification_encoding){:bitmap}
+      @word_selection = args.fetch(:word_selection){ :single }
+      @gram_size = args.fetch(:gram_size) { 1 }
      @parallel = args.fetch(:parallel){false}
    end

    def label
      "simple"
@@ -46,11 +35,11 @@
    # @return [Array<FeatureVector>] list of feature vectors and labels
    def generate_vectors data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
      words_per_data = extract_words data_set
      generate_global_dictionary words_per_data, dictionary_size

-      make_vectors(words_per_data) do |words,index|
+      p_map_with_index(words_per_data) do |words,index|
        word_set = words.uniq
        make_vector word_set, data_set[index]
      end
    end
@@ -65,19 +54,10 @@
      word_set = Set.new extract_words_from_data(data)
      make_vector word_set, data, dictionary
    end

    #
-    # loads a txt file with stop words
-    # @param location String folder with stopword lists
-    #
-    # @return [Array<String>] Array of stopwords
-    def stopwords(location=STOPWORD_LOCATION)
-      @stopwords ||= IO.read(File.join(location,@language)).split
-    end
-
-    #
    # generates a list of words used as dictionary
    # @param all_words (see #extract_words)
    # @param size dictionary size
    #
    # @return [Array<String>] list of words
@@ -88,10 +68,14 @@
        .sort_by{|e| e.size}
        .map{|e| [e[0],e.size]}
      @global_dictionary = words.last(size).map(&:first).reverse
    end

+    def build_dictionary data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
+      words_per_data = extract_words data_set
+      generate_global_dictionary words_per_data, dictionary_size
+    end
    #
    # extracts the words of all provided data entries
    # @param data_set [Array<PreprocessedData>] list of preprocessed data
    #
    # @return [Array<Array<String>>] list of words per data entry
@@ -105,13 +89,52 @@
    # fetches all words from one data entry, removes stopwords and very short words
    # @param data [PreprocessedData] preprocessed data entry
    #
    # @return [Array<String>] list of words
    def extract_words_from_data data
-      (data.data.flat_map(&:split) - stopwords).delete_if { |e| e.size <= 3 }
+      words = (data.data.flat_map(&:split) - stopwords)
+                .delete_if { |e| e.size <= 2 }
+      if gram_size > 1
+        words = words.each_cons(@gram_size).map{|e| e.join " " }
+      end
+      words
    end
+    #
+    # fetches all words and two word phrases from one data entry, removes stopwords and very short words
+    # @param data [PreprocessedData] preprocessed data entry
+    # @param keep_label
+    #
+    # @return [OpenStruct<Array<String>,Boolean>] list of words
+    def extract_words_from_data data, keep_label=false
+      # assume the first token is the title an preserve it
+      title, *words = data.data.flatten
+      features = case word_selection
+                 when :grams
+                   words.each_cons(@gram_size).map{|e| e.join " " }
+                 when :grams1_2
+                   words + words.each_cons(2).map{|e| e.join " " }
+                 when :grams1_2_3
+                   words +
+                     words.each_cons(2).map{|e| e.join " " } +
+                     words.each_cons(3).map{|e| e.join " " }
+                 when :grams1_2_3_4
+                   words +
+                     words.each_cons(2).map{|e| e.join " " } +
+                     words.each_cons(3).map{|e| e.join " " } +
+                     words.each_cons(4).map{|e| e.join " " }
+                 else
+                   words
+                 end
+      features.unshift(title)
+      return features unless keep_label
+      OpenStruct.new(
+        features: features,
+        label: data.label
+      )
+    end
+
    def reset classification
      @global_dictionary = []
      @classification = classification
    end
@@ -133,25 +156,42 @@
        classification: classification_array(data.id),
        label: data.label ? 1 : 0
      )
    end

-    def make_vectors data, &block
-      if @parallel && RUBY_PLATFORM == 'java'
-        Parallel.map_with_index(data, in_threads: THREAD_COUNT ){|e,i| yield e,i }
-      elsif @parallel
-        Parallel.map_with_index(data, in_processes: THREAD_COUNT ){|e,i| yield e,i }
-      else
-        data.map.with_index {|e,i| yield e,i }
-      end
-    end
+    BITMAP_ARRAY_SIZES= if defined?(Pjpp) == 'constant'
+      { function: Pjpp::Function.count,
+        industry: Pjpp::Industry.count,
+        career_level: Pjpp::CareerLevel.count }
+    else
+      { function: 19,     # 1..19
+        industry: 632,    # 1..14370 but not all ids used
+        career_level: 8 } # 1..8
+    end
+    BINARY_ARRAY_SIZES = {
+      function: 8,      # max id 255, currently 19
+      industry: 16,     # max id 65535, currently 14370
+      career_level: 4 } # max id 15, currently 8

    #
    # creates the classification specific part of the feature vector
    # @param ids [Hash] hash with classification ids
    #
    # @return [Array<Integer>] list of size=count(classifcation_ids) with only one not zero item
    def classification_array(id)
-      Array.new(CLASSIFICATIONS_SIZE[@classification]){|n| n==(id-1) ? 1 : 0}
+      case @classification_encoding
+      when :binary
+        number_to_binary_array(id, BINARY_ARRAY_SIZES[@classification])
+      else # :bitmap
+        Array.new(BITMAP_ARRAY_SIZES[@classification]){|n| n==(id-1) ? 1 : 0}
+      end
    end
+
+    def number_to_binary_array(number, size=8)
+      a=[]
+      (size-1).downto(0) do |i|
+        a<<number[i]
+      end
+      a
+    end
  end
-end
\ No newline at end of file
+end