lib/lita/handlers/markov/engine.rb in lita-markov-1.0.2 vs lib/lita/handlers/markov/engine.rb in lita-markov-1.1.0
- old
+ new
@@ -144,38 +144,45 @@
chain << sentence.last
chain
end
# Delimiter used by separate_string: either a single sentence terminator
# (. ! ?), which is captured, or a run of whitespace, which is not.
+ STRING_SEPARATOR = /([.!?])|\s+/
+
# Splits +string+ into an array of non-empty tokens.
#
# Words are separated on whitespace; each '.', '!' or '?' becomes a token of
# its own. Every token is stripped of surrounding whitespace and empty
# tokens are dropped.
#
# The new version strips each token in place with strip!, which returns nil
# when nothing was removed, so the block returns +w+ explicitly.
def separate_string string
# The punctuation alternative is wrapped in a capture group so that split
# keeps the matched punctuation in its results instead of discarding it.
string
- .split(/([.!?])|\s+/)
- .map { |w| w.strip }
+ .split(STRING_SEPARATOR)
+ .map { |w| w.strip!; w }
.select { |w| !w.empty? }
end
# Characters that sanitize_string accepts as a valid ending; a string that
# does not end with one of these gets a '.' appended. The new version also
# accepts a trailing ','.
- PUNCTUATION = ['.', '!', '?']
+ PUNCTUATION = [',', '.', '!', '?']
# Don't allow anything besides word characters (letters, digits, underscore),
# whitespace, the punctuation : ; , . ! ? and the symbols # and @
ILLEGAL_CHARACTERS = /[^\w\d\s:;,.!?#@]/
# The literal "http" followed by one or more non-whitespace characters.
+ HYPERLINKS = /http[^\s]+/
# A span delimited by single backticks with no backtick inside it.
SIMPLE_CODE_BLOCK = /`[^`]+`/
# A span delimited by triple backticks. The /m flag lets '.' match newlines,
# and '.+' is greedy, so the match runs from the first ``` to the last ```.
EXTENDED_CODE_BLOCK = /```.+```/m
# Two or more consecutive sentence terminators; only the first is captured.
+ REPEATED_PUNCTUATION = /([.!?])[.!?]+/
# A single ; , . ! or ? captured so it can be re-emitted in a replacement.
+ BASIC_PUNCTUATION = /([;,.!?])/
+
+
# Cleans up +string+ and returns the result, which always ends with one of
# the PUNCTUATION characters.
#
# Steps, in order (new version):
#   1. remove hyperlinks;
#   2. remove single-backtick spans, then triple-backtick spans;
#   3. remove every character matched by ILLEGAL_CHARACTERS;
#   4. collapse a run of . ! ? down to its first character;
#   5. insert a space after each ; , . ! ?;
#   6. strip leading and trailing whitespace;
#   7. append '.' unless the string already ends with a PUNCTUATION character.
#
# An input that is empty after cleaning therefore comes back as ".".
#
# Differences visible in this diff: the old version stripped before the
# substitutions and again afterwards, had no repeated-punctuation step, and
# also inserted a space after ':'.
#
# NOTE(review): SIMPLE_CODE_BLOCK is applied before EXTENDED_CODE_BLOCK, so a
# triple-backtick block that contains inline backticks appears to be partly
# consumed by the simple pattern first, leaving some of its text behind.
# Confirm whether the extended pattern should run first.
def sanitize_string string
string = string
+ .gsub(HYPERLINKS, ''.freeze) # Remove any hyperlinks
+ .gsub(SIMPLE_CODE_BLOCK, ''.freeze) # Remove code blocks and illegal characters
+ .gsub(EXTENDED_CODE_BLOCK, ''.freeze)
+ .gsub(ILLEGAL_CHARACTERS, ''.freeze)
+ .gsub(REPEATED_PUNCTUATION, '\1'.freeze) # Trim down repeated punctuation
+ .gsub(BASIC_PUNCTUATION, '\1 '.freeze) # Put whitespace after punctuation for proper separation
.strip()
- .gsub(/http[^\s]+/, '') # Remove any hyperlinks
- .gsub(SIMPLE_CODE_BLOCK, '') # Remove code blocks and illegal characters
- .gsub(EXTENDED_CODE_BLOCK, '')
- .gsub(ILLEGAL_CHARACTERS, '')
- .gsub(/([:;,.!?])/, '\1 ') # Put whitespace after punctuation for proper separation
- .strip()
ends_with_punctuation = PUNCTUATION.any? { |p| string.end_with? p }
- string = string+'.' unless ends_with_punctuation
+ string = string+'.'.freeze unless ends_with_punctuation
string
end
end
end