lib/svm_helper/preprocessors/simple.rb in svm_helper-0.1.1 vs lib/svm_helper/preprocessors/simple.rb in svm_helper-0.2.1
- old
+ new
@@ -4,11 +4,11 @@
# Preprocessor which just cleans the text
#
# @author Andreas Eger
#
class Simple
- THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
+ include ::ParallelHelper
# filters most gender stuff
GENDER_FILTER = %r{(\(*(m|w)(\/|\|)(w|m)\)*)|(/-*in)|\(in\)}
# filters most weird symbols
SYMBOL_FILTER = %r{/|-|–|:|\+|!|,|\.|\*|\?|/|·|\"|„|•||\||(\S*(&|;)\S*)}
# urls and email filter
@@ -23,12 +23,20 @@
# filters all kinds of XML/HTML tags
XML_TAG_FILTER = /<(.*?)>/
# filter for used job tokens
CODE_TOKEN_FILTER = /\[[^\]]*\]|\([^\)]*\)|\{[^\}]*\}|\S*\d+\w+/
+ # directory containing one stopword file per language
+ # TODO: use File.expand_path
+ STOPWORD_LOCATION = File.join(File.dirname(__FILE__),'..','stopwords')
+ attr_accessor :language
+
+
def initialize args={}
+ @language = args.fetch(:language){'en'}
@parallel = args.fetch(:parallel){false}
+ @stopwords ||= IO.read(File.join(STOPWORD_LOCATION,@language)).split
end
def label
"simple"
end
@@ -46,16 +54,24 @@
# @param classification [Symbol] in `:industry`, `:function`, `:career_level`
#
# @return [Array<PreprocessedData>] list of processed job data - or single job data
def process jobs
if jobs.is_a? Array
- process_jobs jobs
+ p_map(jobs) {|job| process_job job }
else
process_job jobs
end
end
+ #
+ # removes stopwords and tokens of two characters or fewer from a text
+ # @param text [String] whitespace-separated text
+ #
+ # @return [Array<String>] remaining tokens in their original order
+ def strip_stopwords(text)
+ (text.split - @stopwords).delete_if { |e| e.size <= 2 }
+ end
#
# converts string into a cleaner version
# @param title [String] job title
#
@@ -73,32 +89,25 @@
# converts string into a cleaner version
# @param desc [String] job description
#
# @return [String] clean and lowercase version of input
def clean_description desc
- desc.gsub(XML_TAG_FILTER,' ')
- .gsub(EMAIL_FILTER,'')
- .gsub(URL_FILTER,'')
- .gsub(GENDER_FILTER,'')
- .gsub(NEW_LINES,'')
- .gsub(SYMBOL_FILTER,' ')
- .gsub(WHITESPACE,' ')
- .gsub(WORDS_IN_BRACKETS, '\1')
- .gsub(CODE_TOKEN_FILTER,'')
- .downcase
- .strip
+ strip_stopwords(
+ desc.gsub(XML_TAG_FILTER,' ')
+ .gsub(EMAIL_FILTER,'')
+ .gsub(URL_FILTER,'')
+ .gsub(GENDER_FILTER,'')
+ .gsub(NEW_LINES,'')
+ .gsub(SYMBOL_FILTER,' ')
+ .gsub(WHITESPACE,' ')
+ .gsub(WORDS_IN_BRACKETS, '\1')
+ .gsub(CODE_TOKEN_FILTER,'')
+ .downcase
+ .strip
+ )
end
private
- def process_jobs jobs
- if @parallel && RUBY_PLATFORM == 'java'
- Parallel.map(jobs, in_threads: THREAD_COUNT ) {|job| process_job job }
- elsif @parallel
- Parallel.map(jobs, in_processes: THREAD_COUNT ) {|job| process_job job }
- else
- jobs.map {|job| process_job job }
- end
- end
def process_job job
PreprocessedData.new(
data: [clean_title(job[:title]), clean_description(job[:description])],
id: job[:id],