lib/svm_helper/preprocessors/simple.rb in svm_helper-0.1.1 vs lib/svm_helper/preprocessors/simple.rb in svm_helper-0.2.1

- old
+ new

@@ -4,11 +4,11 @@ # Preprocessor which just cleans to text # # @author Andreas Eger # class Simple - THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i + include ::ParallelHelper # filters most gender stuff GENDER_FILTER = %r{(\(*(m|w)(\/|\|)(w|m)\)*)|(/-*in)|\(in\)} # filters most wierd symbols SYMBOL_FILTER = %r{/|-|–|:|\+|!|,|\.|\*|\?|/|·|\"|„|•||\||(\S*(&|;)\S*)} # urls and email filter @@ -23,12 +23,20 @@ # filters all kind of XMl/HTML tags XML_TAG_FILTER = /<(.*?)>/ # filter for used job tokens CODE_TOKEN_FILTER = /\[[^\]]*\]|\([^\)]*\)|\{[^\}]*\}|\S*\d+\w+/ + # stopword file + #TODO use File.expand_path + STOPWORD_LOCATION = File.join(File.dirname(__FILE__),'..','stopwords') + attr_accessor :language + + def initialize args={} + @language = args.fetch(:language){'en'} @parallel = args.fetch(:parallel){false} + @stopwords ||= IO.read(File.join(STOPWORD_LOCATION,@language)).split end def label "simple" end @@ -46,16 +54,24 @@ # @param classification [Symbol] in `:industry`, `:function`, `:career_level` # # @return [Array<PreprocessedData>] list of processed job data - or singe job data def process jobs if jobs.is_a? 
Array - process_jobs jobs + p_map(jobs) {|job| process_job job } else process_job jobs end end + # + # removes stopwords and words of two or fewer characters from a text + # @param text [String] whitespace-separated text + # + # @return [Array<String>] remaining words + def strip_stopwords(text) + (text.split - @stopwords).delete_if { |e| e.size <= 2 } + end # # converts string into a cleaner version # @param title [String] job title # @@ -73,32 +89,25 @@ # converts string into a cleaner version # @param desc [String] job description # # @return [String] clean and lowercase version of input def clean_description desc - desc.gsub(XML_TAG_FILTER,' ') - .gsub(EMAIL_FILTER,'') - .gsub(URL_FILTER,'') - .gsub(GENDER_FILTER,'') - .gsub(NEW_LINES,'') - .gsub(SYMBOL_FILTER,' ') - .gsub(WHITESPACE,' ') - .gsub(WORDS_IN_BRACKETS, '\1') - .gsub(CODE_TOKEN_FILTER,'') - .downcase - .strip + strip_stopwords( + desc.gsub(XML_TAG_FILTER,' ') + .gsub(EMAIL_FILTER,'') + .gsub(URL_FILTER,'') + .gsub(GENDER_FILTER,'') + .gsub(NEW_LINES,'') + .gsub(SYMBOL_FILTER,' ') + .gsub(WHITESPACE,' ') + .gsub(WORDS_IN_BRACKETS, '\1') + .gsub(CODE_TOKEN_FILTER,'') + .downcase + .strip + ) end private - def process_jobs jobs - if @parallel && RUBY_PLATFORM == 'java' - Parallel.map(jobs, in_threads: THREAD_COUNT ) {|job| process_job job } - elsif @parallel - Parallel.map(jobs, in_processes: THREAD_COUNT ) {|job| process_job job } - else - jobs.map {|job| process_job job } - end - end def process_job job PreprocessedData.new( data: [clean_title(job[:title]), clean_description(job[:description])], id: job[:id],