lib/svm_helper/selectors/simple.rb in svm_helper-0.1.1 vs lib/svm_helper/selectors/simple.rb in svm_helper-0.2.1
- old
+ new
@@ -3,35 +3,24 @@
# Selector which uses a simple dictionary to generate feature vectors
#
# @author Andreas Eger
#
class Simple
- THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
- # stopword file
- #TODO use File.expand_path
- STOPWORD_LOCATION = File.join(File.dirname(__FILE__),'..','stopwords')
+ include ::ParallelHelper
# default dictionary size
DEFAULT_DICTIONARY_SIZE = 800
- CLASSIFICATIONS_SIZE= if defined?(Pjpp) == 'constant'
- { function: Pjpp::Function.count,
- industry: Pjpp::Industry.count,
- career_level: Pjpp::CareerLevel.count }
- else
- { function: 19, # 1..19
- industry: 632, # 1..14370 but not all ids used
- career_level: 8 } # 1..8
- end
-
-
-
attr_accessor :global_dictionary
-
+ attr_reader :classification_encoding,
+ :gram_size,
+ :word_selection
def initialize classification, args={}
@classification = classification
@global_dictionary = args.fetch(:global_dictionary) {[]}
- @language = args.fetch(:language){'en'}
+ @classification_encoding = args.fetch(:classification_encoding){:bitmap}
+ @word_selection = args.fetch(:word_selection){ :single }
+ @gram_size = args.fetch(:gram_size) { 1 }
@parallel = args.fetch(:parallel){false}
end
def label
"simple"
@@ -46,11 +35,11 @@
# @return [Array<FeatureVector>] list of feature vectors and labels
def generate_vectors data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
words_per_data = extract_words data_set
generate_global_dictionary words_per_data, dictionary_size
- make_vectors(words_per_data) do |words,index|
+ p_map_with_index(words_per_data) do |words,index|
word_set = words.uniq
make_vector word_set, data_set[index]
end
end
@@ -65,19 +54,10 @@
word_set = Set.new extract_words_from_data(data)
make_vector word_set, data, dictionary
end
#
- # loads a txt file with stop words
- # @param location String folder with stopword lists
- #
- # @return [Array<String>] Array of stopwords
- def stopwords(location=STOPWORD_LOCATION)
- @stopwords ||= IO.read(File.join(location,@language)).split
- end
-
- #
# generates a list of words used as dictionary
# @param all_words (see #extract_words)
# @param size dictionary size
#
# @return [Array<String>] list of words
@@ -88,10 +68,14 @@
.sort_by{|e| e.size}
.map{|e| [e[0],e.size]}
@global_dictionary = words.last(size).map(&:first).reverse
end
+ def build_dictionary data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
+ words_per_data = extract_words data_set
+ generate_global_dictionary words_per_data, dictionary_size
+ end
#
# extracts the words of all provided data entries
# @param data_set [Array<PreprocessedData>] list of preprocessed data
#
# @return [Array<Array<String>>] list of words per data entry
@@ -105,13 +89,52 @@
# fetches all words from one data entry, removes stopwords and very short words
# @param data [PreprocessedData] preprocessed data entry
#
# @return [Array<String>] list of words
def extract_words_from_data data
- (data.data.flat_map(&:split) - stopwords).delete_if { |e| e.size <= 3 }
+ words = (data.data.flat_map(&:split) - stopwords)
+ .delete_if { |e| e.size <= 2 }
+ if gram_size > 1
+ words = words.each_cons(@gram_size).map{|e| e.join " " }
+ end
+ words
end
+ #
+ # fetches all words and two-word phrases from one data entry, removes stopwords and very short words
+ # @param data [PreprocessedData] preprocessed data entry
+ # @param keep_label
+ #
+ # @return [OpenStruct<Array<String>,Boolean>] list of words
+ def extract_words_from_data data, keep_label=false
+ # assume the first token is the title and preserve it
+ title, *words = data.data.flatten
+ features = case word_selection
+ when :grams
+ words.each_cons(@gram_size).map{|e| e.join " " }
+ when :grams1_2
+ words + words.each_cons(2).map{|e| e.join " " }
+ when :grams1_2_3
+ words +
+ words.each_cons(2).map{|e| e.join " " } +
+ words.each_cons(3).map{|e| e.join " " }
+ when :grams1_2_3_4
+ words +
+ words.each_cons(2).map{|e| e.join " " } +
+ words.each_cons(3).map{|e| e.join " " } +
+ words.each_cons(4).map{|e| e.join " " }
+ else
+ words
+ end
+ features.unshift(title)
+ return features unless keep_label
+ OpenStruct.new(
+ features: features,
+ label: data.label
+ )
+ end
+
def reset classification
@global_dictionary = []
@classification = classification
end
@@ -133,25 +156,42 @@
classification: classification_array(data.id),
label: data.label ? 1 : 0
)
end
- def make_vectors data, &block
- if @parallel && RUBY_PLATFORM == 'java'
- Parallel.map_with_index(data, in_threads: THREAD_COUNT ){|e,i| yield e,i }
- elsif @parallel
- Parallel.map_with_index(data, in_processes: THREAD_COUNT ){|e,i| yield e,i }
- else
- data.map.with_index {|e,i| yield e,i }
- end
- end
+ BITMAP_ARRAY_SIZES= if defined?(Pjpp) == 'constant'
+ { function: Pjpp::Function.count,
+ industry: Pjpp::Industry.count,
+ career_level: Pjpp::CareerLevel.count }
+ else
+ { function: 19, # 1..19
+ industry: 632, # 1..14370 but not all ids used
+ career_level: 8 } # 1..8
+ end
+ BINARY_ARRAY_SIZES = {
+ function: 8, # max id 255, currently 19
+ industry: 16, # max id 65535, currently 14370
+ career_level: 4 } # max id 15, currently 8
#
# creates the classification specific part of the feature vector
# @param ids [Hash] hash with classification ids
#
# @return [Array<Integer>] list of size=count(classification_ids) with only one non-zero item
def classification_array(id)
- Array.new(CLASSIFICATIONS_SIZE[@classification]){|n| n==(id-1) ? 1 : 0}
+ case @classification_encoding
+ when :binary
+ number_to_binary_array(id, BINARY_ARRAY_SIZES[@classification])
+ else # :bitmap
+ Array.new(BITMAP_ARRAY_SIZES[@classification]){|n| n==(id-1) ? 1 : 0}
+ end
end
+
+ def number_to_binary_array(number, size=8)
+ a=[]
+ (size-1).downto(0) do |i|
+ a<<number[i]
+ end
+ a
+ end
end
-end
\ No newline at end of file
+end