lib/lita/handlers/markov/engine.rb in lita-markov-1.1.0 vs lib/lita/handlers/markov/engine.rb in lita-markov-1.1.1
- old
+ new
@@ -34,12 +34,12 @@
string = sanitize_string string
words = separate_string string
return if words.length == 0
- # Capitalize the first word
- words = [words[0].capitalize] + words.slice(1..-1)
+ # Capitalize the first word and add a period at the end
+ words = [words[0].capitalize] + words.slice(1..-1) + ['.']
# Iterate over it one step at a time in sets of `@depth + 1`
words.each_cons(@depth + 1) do |words|
current_state = words[0]+' '+words[1]
next_state = words[2]
@@ -97,14 +97,10 @@
state = states.sample
state.split(' ').last
end
- def is_punctuation?(string)
- PUNCTUATION.any? { |p| string == p }
- end
-
def get_next_state(user, current_state)
states = @db[:dictionary]
.where(user: user, current_state: current_state)
.select(:next_state, :frequency)
.all
@@ -131,11 +127,11 @@
# Stop if we failed to find a next state
break if next_state.nil?
sentence << next_state
- if is_punctuation? next_state
+ if next_state == '.'
ended_with_punctuation = true
break
end
end
@@ -144,46 +140,35 @@
chain << sentence.last
chain
end
- STRING_SEPARATOR = /([.!?])|\s+/
+ STRING_SEPARATOR = /\s+/
def separate_string(string)
  # Break the string apart on STRING_SEPARATOR (String#split also returns
  # the text of any capture group the separator defines), trim every piece,
  # and keep only the pieces that still contain text
  pieces = string.split(STRING_SEPARATOR)
  pieces.map(&:strip).reject(&:empty?)
end
- PUNCTUATION = [',', '.', '!', '?']
-
# Don't allow anything besides letters, digits, whitespace, and punctuation
- ILLEGAL_CHARACTERS = /[^\w\d\s:;,.!?#@]/
+ NON_WORD_CHARACTERS = /[^\w\d'"“”’:+-]/
HYPERLINKS = /http[^\s]+/
SIMPLE_CODE_BLOCK = /`[^`]+`/
EXTENDED_CODE_BLOCK = /```.+```/m
+ REPEATED_WHITESPACE = /\s+/
- REPEATED_PUNCTUATION = /([.!?])[.!?]+/
- BASIC_PUNCTUATION = /([;,.!?])/
-
-
# Cleans up a raw string before it is split into words. Both versions first
# remove hyperlinks and backtick-delimited code blocks and finish with `strip`.
# 1.1.0 (the `-` lines) then removed ILLEGAL_CHARACTERS, collapsed repeated
# punctuation, put a space after each punctuation mark, and appended a '.'
# when the result did not already end with one of PUNCTUATION.
# 1.1.1 (the `+` lines) instead turns every NON_WORD_CHARACTERS match into a
# space and collapses runs of whitespace into a single space; the method's
# value is the result of the assignment itself and no period is appended here.
def sanitize_string string
string = string
.gsub(HYPERLINKS, ''.freeze) # Remove any hyperlinks
.gsub(SIMPLE_CODE_BLOCK, ''.freeze) # Remove code blocks and illegal characters
.gsub(EXTENDED_CODE_BLOCK, ''.freeze)
- .gsub(ILLEGAL_CHARACTERS, ''.freeze)
- .gsub(REPEATED_PUNCTUATION, '\1'.freeze) # Trim down repeated punctuation
- .gsub(BASIC_PUNCTUATION, '\1 '.freeze) # Put whitespace after punctuation for proper separation
+ .gsub(NON_WORD_CHARACTERS, ' '.freeze) # Convert non-word characters into whitespace
+ .gsub(REPEATED_WHITESPACE, ' '.freeze) # Convert repeated whitespace into just single spaces
.strip()
-
- ends_with_punctuation = PUNCTUATION.any? { |p| string.end_with? p }
- string = string+'.'.freeze unless ends_with_punctuation
-
- string
end
end
end