simple.rb in svm_helper-0.2.1

- old
+ new

@@ -4,11 +4,11 @@
   # Preprocessor which just cleans the text
   #
   # @author Andreas Eger
   #
   class Simple
-    THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
+    include ::ParallelHelper
     # filters most gender stuff
     GENDER_FILTER = %r{(\(*(m|w)(\/|\|)(w|m)\)*)|(/-*in)|\(in\)}
     # filters most weird symbols
     SYMBOL_FILTER = %r{/|-|–|:|\+|!|,|\.|\*|\?|/|·|\"|„|•||\||(\S*(&|;)\S*)}
     # urls and email filter
@@ -23,12 +23,20 @@
     # filters all kinds of XML/HTML tags
     XML_TAG_FILTER = /<(.*?)>/
     # filter for used job tokens
     CODE_TOKEN_FILTER = /\[[^\]]*\]|\([^\)]*\)|\{[^\}]*\}|\S*\d+\w+/
 
+    # stopword file
+    # TODO: use File.expand_path
+    STOPWORD_LOCATION = File.join(File.dirname(__FILE__),'..','stopwords')
+    attr_accessor :language
+
+
     def initialize args={}
+      @language = args.fetch(:language){'en'}
       @parallel = args.fetch(:parallel){false}
+      @stopwords ||= IO.read(File.join(STOPWORD_LOCATION,@language)).split
     end
 
     # Label of this preprocessor.
     #
     # @return [String] the fixed label "simple"
     def label
       'simple'
     end
@@ -46,16 +54,24 @@
     #   @param  classification [Symbol] in `:industry`, `:function`, `:career_level`
     #
     # @return [Array<PreprocessedData>] list of processed job data - or single job data
     def process jobs
       if jobs.is_a? Array
-        process_jobs jobs
+        p_map(jobs) {|job| process_job job }
       else
         process_job jobs
       end
     end
 
+    #
+    # removes stopwords and words of two characters or fewer from a text
+    # @param  text [String] whitespace-separated text to filter
+    #
+    # @return [Array<String>] remaining words, in their original order
+    def strip_stopwords(text)
+      (text.split - @stopwords).delete_if { |e| e.size <= 2 }
+    end
 
     #
     # converts string into a cleaner version
     # @param  title [String] job title
     #
@@ -73,32 +89,25 @@
     # converts string into a cleaner version
     # @param  desc [String] job description
     #
     # @return [Array<String>] cleaned, lowercased words of the input with stopwords removed
     def clean_description desc
-      desc.gsub(XML_TAG_FILTER,' ')
-          .gsub(EMAIL_FILTER,'')
-          .gsub(URL_FILTER,'')
-          .gsub(GENDER_FILTER,'')
-          .gsub(NEW_LINES,'')
-          .gsub(SYMBOL_FILTER,' ')
-          .gsub(WHITESPACE,' ')
-          .gsub(WORDS_IN_BRACKETS, '\1')
-          .gsub(CODE_TOKEN_FILTER,'')
-          .downcase
-          .strip
+      strip_stopwords(
+        desc.gsub(XML_TAG_FILTER,' ')
+            .gsub(EMAIL_FILTER,'')
+            .gsub(URL_FILTER,'')
+            .gsub(GENDER_FILTER,'')
+            .gsub(NEW_LINES,'')
+            .gsub(SYMBOL_FILTER,' ')
+            .gsub(WHITESPACE,' ')
+            .gsub(WORDS_IN_BRACKETS, '\1')
+            .gsub(CODE_TOKEN_FILTER,'')
+            .downcase
+            .strip
+        )
     end
 
     private
-    def process_jobs jobs
-      if @parallel && RUBY_PLATFORM == 'java'
-        Parallel.map(jobs, in_threads: THREAD_COUNT ) {|job| process_job job }
-      elsif @parallel
-        Parallel.map(jobs, in_processes: THREAD_COUNT ) {|job| process_job job }
-      else
-        jobs.map {|job| process_job job }
-      end
-    end
 
     def process_job job
       PreprocessedData.new(
         data: [clean_title(job[:title]), clean_description(job[:description])],
         id: job[:id],