processor.rb in pragmatic_tokenizer-0.3.0

- old
+ new

@@ -13,13 +13,12 @@
       shift_ellipse(text)
       shift_special_quotes(text)
       shift_colon(text)
       shift_bracket(text)
       shift_semicolon(text)
-      shift_underscore(text)
-      shift_asterisk(text)
-      shift_at_symbol(text)
+      shift_caret(text)
+      shift_vertical_bar(text)
       convert_dbl_quotes(text)
       convert_sgl_quotes(text)
       shift_beginning_hyphen(text)
       shift_ending_hyphen(text)
       tokens = separate_full_stop(text.squeeze(' ')
@@ -33,12 +32,14 @@
     private
 
     # Rewrites double-quote characters in text in place. Every replacement is
     # the corresponding PUNCTUATION_MAP entry padded with one space on each
     # side. Returns text.
     def convert_dbl_quotes(text)
       # Straight double quotes that have a word character somewhere after them
       # on the same line (treated here as "left" quotes)
       text.gsub!(/"(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
+      text.gsub!(/“(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['“'] + ' ') || text # same lookahead rule for the curly opening quote
       # Every straight double quote still in the string; uses the same '"' map entry as above
       text.gsub!(/"/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
+      text.gsub!(/”/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['”'] + ' ') || text # every curly closing quote, no lookahead
     end
 
     def convert_sgl_quotes(text)
       if defined? @language::SingleQuotes
         @language::SingleQuotes.new.handle_single_quotes(text)
@@ -49,10 +50,14 @@
 
     # Collapses each run of two or more hyphens into a single hyphen with a
     # space on either side. The string is edited in place and the very same
     # object is handed back, whether or not anything matched.
     def shift_multiple_dash(text)
       text.gsub!(/--+/o, ' - ')
       text
     end
 
+    def shift_vertical_bar(text) # Pads every "|" in text with a space on each side, editing text in place
+      text.gsub!(/\|/, ' | ') || text # gsub! gives nil when nothing matched, so fall back to text itself
+    end
+
     # Puts a space on both sides of each comma, except a comma that is
     # immediately followed by a digit (so "1,000" is left intact).
     # Edits text in place and returns that same string.
     def shift_comma(text)
       text.gsub!(/,(?!\d)/o, ' , ')
       text
     end
 
@@ -81,38 +86,30 @@
 
     # Surrounds every round, square or curly bracket (opening or closing)
     # with a space on each side. Edits text in place and returns it.
     def shift_bracket(text)
       text.gsub!(/([\(\[\{\}\]\)])/o) do |bracket|
         # The whole match is the single captured bracket character.
         ' ' + bracket + ' '
       end
       text
     end
 
-    def shift_underscore(text) # Removed in the new version. Targets runs of "_" next to whitespace or at the start/end of text
-      text.gsub!(/(?<=\s)\_+/, ' \1') || text # NOTE(review): no capture group in the pattern, so \1 presumably expands to "" and the run is replaced by a lone space — confirm
-      text.gsub!(/\_+(?=\s)/, ' \1') || text # run directly before whitespace; same \1 caveat
-      text.gsub!(/(?<=\A)\_+/, '\1 ') || text # run at the very start of text; same \1 caveat
-      text.gsub!(/\_+(?=\z)/, ' \1') || text # run at the very end of text; same \1 caveat
-    end
-
-    def shift_asterisk(text) # Removed in the new version. Targets every run of one or more "*"
-      text.gsub!(/\*+/, ' \1 ') || text # NOTE(review): no capture group in the pattern, so \1 presumably expands to "" and the asterisks are dropped — confirm
-    end
-
-    def shift_at_symbol(text) # Removed in the new version. Targets "@" at the start of text or right after whitespace
-      text.gsub!(/(\A|\s)\@/, '\1 ') || text # keeps the captured leading whitespace (if any) and puts a space where the "@" was
-    end
-
     # Spaces out colons in text, editing it in place and returning it.
     # Per the added lines, nothing is changed unless text contains ":" and
     # neither the character right after the first ":" nor the character right
     # before it is a digit.
     def shift_colon(text)
+      puts "Text: #{text}" # NOTE(review): writes to stdout on every call; looks like leftover debug output — confirm and remove
       return text unless text.include?(':') &&
-        !(/\A\d+/ == text.partition(':').last[0]) &&
-        !(/\A\d+/ == text.partition(':').first[-1])
+        text.partition(':').last[0] !~ /\A\d+/ &&
+        text.partition(':').first[-1] !~ /\A\d+/
+      puts "YOYOYO" # NOTE(review): looks like leftover debug output — confirm and remove
       # Ignore web addresses: a ":" directly before "//" is swapped for its
       # PUNCTUATION_MAP entry, presumably so the next substitution leaves it alone.
       # NOTE(review): [http|https] is a character class (any one of h, t, p, |, s),
       # not an alternation of the two scheme names — confirm intent.
       text.gsub!(/(?<=[http|https]):(?=\/\/)/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[":"]) || text
       # Every ":" still present gets a space inserted in front of it
       text.gsub!(/:/o, ' :') || text
+      text.gsub!(/(?<=\s):(?=\#)/, ': ') || text # a ":" sitting between whitespace and "#" also gains a trailing space
     end
 
     # Surrounds every semicolon with a space on each side.
     # Edits text in place and returns that same string.
     def shift_semicolon(text)
       text.gsub!(/([;])/o) do |semicolon|
         # The whole match is the single captured ";".
         ' ' + semicolon + ' '
       end
       text
     end
 
+    def shift_caret(text) # Pads every "^" in text with a space on each side, editing text in place
+      text.gsub!(/\^/, ' ^ ') || text # gsub! gives nil when nothing matched, so fall back to text itself
+    end
+
     # Pads ellipses with a space on each side, editing text in place and
     # returning it. Three passes run in order: runs of three or more dots,
     # runs of two or more dots, then runs of the single-character ellipsis.
     # The second pass matches again whatever the first pass already padded,
     # so runs of three or more dots end up with two spaces on each side.
     def shift_ellipse(text)
       [/(\.\.\.+)/o, /(\.\.+)/o, /(…+)/o].each do |dots|
         text.gsub!(dots) { ' ' + Regexp.last_match(1) + ' ' }
       end
       text
     end
@@ -165,10 +162,10 @@
       end
       cleaned_tokens
     end
 
     def convert_sym_to_punct(token)
-      symbol = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚]/.match(token)
+      symbol = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘]/.match(token)
       if symbol.nil?
         return token
       else
         return token.gsub!(symbol[0], PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.key(symbol[0]))
       end
\ No newline at end of file