lib/pragmatic_tokenizer/processor.rb in pragmatic_tokenizer-0.2.4 vs lib/pragmatic_tokenizer/processor.rb in pragmatic_tokenizer-0.3.0
- old
+ new
@@ -13,13 +13,12 @@
shift_ellipse(text)
shift_special_quotes(text)
shift_colon(text)
shift_bracket(text)
shift_semicolon(text)
- shift_underscore(text)
- shift_asterisk(text)
- shift_at_symbol(text)
+ shift_caret(text)
+ shift_vertical_bar(text)
convert_dbl_quotes(text)
convert_sgl_quotes(text)
shift_beginning_hyphen(text)
shift_ending_hyphen(text)
tokens = separate_full_stop(text.squeeze(' ')
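Note on the hunk above: 0.3.0 drops the dedicated underscore, asterisk and @-sign shifts from the processing pipeline and adds shifts for the caret and the vertical bar (both defined further down in this diff). At the string level the two new helpers just pad those characters with spaces so the later whitespace split isolates them; a minimal sketch of the effect, reusing the same gsub! patterns:

    # Illustrative only -- mirrors the new shift_caret / shift_vertical_bar calls:
    "2^3|4".gsub(/\^/, ' ^ ').gsub(/\|/, ' | ')
    # => "2 ^ 3 | 4"   (later split on whitespace into ["2", "^", "3", "|", "4"])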
@@ -33,12 +32,14 @@
private
def convert_dbl_quotes(text)
# Convert left double quotes to special character
text.gsub!(/"(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
+ text.gsub!(/“(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['“'] + ' ') || text
# Convert remaining quotes to special character
text.gsub!(/"/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
+ text.gsub!(/”/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['”'] + ' ') || text
end
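The added lines give typographic (curly) double quotes the same treatment straight quotes already get: the character is swapped for a placeholder glyph from PUNCTUATION_MAP so later splitting cannot mangle it, and convert_sym_to_punct (last hunk below) restores the original character afterwards. A rough sketch of the forward step, assuming the map carries entries for '“' and '”' as the new code requires:

    map  = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP
    text = 'She said “hi”'
    text.gsub(/“(?=.*\w)/, ' ' + map['“'] + ' ')
    # left curly quote replaced by its placeholder; map.key(placeholder) reverses it later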
def convert_sgl_quotes(text)
if defined? @language::SingleQuotes
@language::SingleQuotes.new.handle_single_quotes(text)
@@ -49,10 +50,14 @@
def shift_multiple_dash(text)
text.gsub!(/--+/o, ' - ') || text
end
+ def shift_vertical_bar(text)
+ text.gsub!(/\|/, ' | ') || text
+ end
+
def shift_comma(text)
# Shift commas off everything but numbers
text.gsub!(/,(?!\d)/o, ' , ') || text
end
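A small point that applies to shift_vertical_bar and every other helper in this file: String#gsub! returns nil when no substitution was made, so the trailing || text keeps each method returning the (possibly untouched) string instead of nil. For example:

    s = "no pipes here"
    s.gsub!(/\|/, ' | ')          # => nil, s is unchanged
    s.gsub!(/\|/, ' | ') || s     # => "no pipes here"  -- the idiom used above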
@@ -81,38 +86,30 @@
def shift_bracket(text)
text.gsub!(/([\(\[\{\}\]\)])/o) { ' ' + $1 + ' ' } || text
end
- def shift_underscore(text)
- text.gsub!(/(?<=\s)\_+/, ' \1') || text
- text.gsub!(/\_+(?=\s)/, ' \1') || text
- text.gsub!(/(?<=\A)\_+/, '\1 ') || text
- text.gsub!(/\_+(?=\z)/, ' \1') || text
- end
-
- def shift_asterisk(text)
- text.gsub!(/\*+/, ' \1 ') || text
- end
-
- def shift_at_symbol(text)
- text.gsub!(/(\A|\s)\@/, '\1 ') || text
- end
-
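The three removed helpers were arguably not doing what their names suggest: shift_underscore and shift_asterisk use \1 in the replacement although their patterns define no capture group, and Ruby expands an unknown group reference to an empty string, so the characters were deleted rather than padded; shift_at_symbol likewise swallowed the @ itself. For example:

    'a*b'.gsub(/\*+/, ' \1 ')           # => "a  b"   -- the asterisk is lost
    '@user'.gsub(/(\A|\s)\@/, '\1 ')    # => " user"  -- the @ is dropped, not separated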
def shift_colon(text)
+ puts "Text: #{text}"
return text unless text.include?(':') &&
- !(/\A\d+/ == text.partition(':').last[0]) &&
- !(/\A\d+/ == text.partition(':').first[-1])
+ text.partition(':').last[0] !~ /\A\d+/ &&
+ text.partition(':').first[-1] !~ /\A\d+/
+ puts "YOYOYO"
# Ignore web addresses
text.gsub!(/(?<=[http|https]):(?=\/\/)/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[":"]) || text
text.gsub!(/:/o, ' :') || text
+ text.gsub!(/(?<=\s):(?=\#)/, ': ') || text
end
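The rewritten guard is the substantive fix in this hunk. Regexp#== compares two regexps for equality and never matches against a string, so the 0.2.4 condition /\A\d+/ == text.partition(':').last[0] was always false and the digit checks on either side of the colon were no-ops; String#!~ performs an actual (negated) match. Compare:

    /\A\d+/ == "3"     # => false  -- Regexp compared to String, not a match test
    "3"  !~ /\A\d+/    # => false  -- "3" matches the pattern
    "ab" !~ /\A\d+/    # => true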
def shift_semicolon(text)
text.gsub!(/([;])/o) { ' ' + $1 + ' ' } || text
end
+ def shift_caret(text)
+ text.gsub!(/\^/, ' ^ ') || text
+ end
+
def shift_ellipse(text)
text.gsub!(/(\.\.\.+)/o) { ' ' + $1 + ' ' } || text
text.gsub!(/(\.\.+)/o) { ' ' + $1 + ' ' } || text
text.gsub!(/(…+)/o) { ' ' + $1 + ' ' } || text
end
@@ -165,10 +162,10 @@
end
cleaned_tokens
end
def convert_sym_to_punct(token)
- symbol = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚]/.match(token)
+ symbol = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘]/.match(token)
if symbol.nil?
return token
else
return token.gsub!(symbol[0], PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.key(symbol[0]))
end
\ No newline at end of file
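The only change in the last hunk is the extra ⚘ in the symbol class, letting convert_sym_to_punct recognise one more placeholder glyph and map it back through PUNCTUATION_MAP.key. Presumably it backs one of the new curly-quote placeholders introduced above, but the actual key lives in Languages::Common and is not shown in this diff:

    # Assumed: PUNCTUATION_MAP has some entry whose value is '⚘'
    PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.key('⚘')
    # => the punctuation character that '⚘' stands in for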