lib/pragmatic_tokenizer/processor.rb in pragmatic_tokenizer-0.2.4 vs lib/pragmatic_tokenizer/processor.rb in pragmatic_tokenizer-0.3.0

- old
+ new

@@ -13,13 +13,12 @@ shift_ellipse(text) shift_special_quotes(text) shift_colon(text) shift_bracket(text) shift_semicolon(text) - shift_underscore(text) - shift_asterisk(text) - shift_at_symbol(text) + shift_caret(text) + shift_vertical_bar(text) convert_dbl_quotes(text) convert_sgl_quotes(text) shift_beginning_hyphen(text) shift_ending_hyphen(text) tokens = separate_full_stop(text.squeeze(' ') @@ -33,12 +32,14 @@ private def convert_dbl_quotes(text) # Convert left double quotes to special character text.gsub!(/"(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text + text.gsub!(/“(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['“'] + ' ') || text # Convert remaining quotes to special character text.gsub!(/"/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text + text.gsub!(/”/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['”'] + ' ') || text end def convert_sgl_quotes(text) if defined? @language::SingleQuotes @language::SingleQuotes.new.handle_single_quotes(text) @@ -49,10 +50,14 @@ def shift_multiple_dash(text) text.gsub!(/--+/o, ' - ') || text end + def shift_vertical_bar(text) + text.gsub!(/\|/, ' | ') || text + end + def shift_comma(text) # Shift commas off everything but numbers text.gsub!(/,(?!\d)/o, ' , ') || text end @@ -81,38 +86,30 @@ def shift_bracket(text) text.gsub!(/([\(\[\{\}\]\)])/o) { ' ' + $1 + ' ' } || text end - def shift_underscore(text) - text.gsub!(/(?<=\s)\_+/, ' \1') || text - text.gsub!(/\_+(?=\s)/, ' \1') || text - text.gsub!(/(?<=\A)\_+/, '\1 ') || text - text.gsub!(/\_+(?=\z)/, ' \1') || text - end - - def shift_asterisk(text) - text.gsub!(/\*+/, ' \1 ') || text - end - - def shift_at_symbol(text) - text.gsub!(/(\A|\s)\@/, '\1 ') || text - end - def shift_colon(text) + puts "Text: #{text}" return text unless text.include?(':') && - !(/\A\d+/ == text.partition(':').last[0]) && - !(/\A\d+/ == text.partition(':').first[-1]) + text.partition(':').last[0] !~ /\A\d+/ && + text.partition(':').first[-1] !~ /\A\d+/ + puts "YOYOYO" # Ignore web addresses text.gsub!(/(?<=[http|https]):(?=\/\/)/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[":"]) || text text.gsub!(/:/o, ' :') || text + text.gsub!(/(?<=\s):(?=\#)/, ': ') || text end def shift_semicolon(text) text.gsub!(/([;])/o) { ' ' + $1 + ' ' } || text end + def shift_caret(text) + text.gsub!(/\^/, ' ^ ') || text + end + def shift_ellipse(text) text.gsub!(/(\.\.\.+)/o) { ' ' + $1 + ' ' } || text text.gsub!(/(\.\.+)/o) { ' ' + $1 + ' ' } || text text.gsub!(/(…+)/o) { ' ' + $1 + ' ' } || text end @@ -165,10 +162,10 @@ end cleaned_tokens end def convert_sym_to_punct(token) - symbol = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚]/.match(token) + symbol = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘]/.match(token) if symbol.nil? return token else return token.gsub!(symbol[0], PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.key(symbol[0])) end \ No newline at end of file