lib/twss/trainer.rb in twss-0.0.3 vs lib/twss/trainer.rb in twss-0.0.4
- old
+ new
@@ -2,67 +2,110 @@
module TWSS
class Trainer
- attr_reader :engine
+ attr_reader :engine, :training_percentage
# Builds a trainer around the given engine and calls engine.clear_state! on it.
#
# engine  - the classifier engine; it is stored and exposed via the engine reader.
# options - Hash of settings. In the new version :training_percentage is kept
#           in @training_percentage and falls back to 0.9 when the option is
#           missing, nil or false. run_tests multiplies each test file's
#           document count by this value to obtain its sample size.
def initialize(engine, options = {})
@engine = engine
engine.clear_state!
- @training_set_size = options[:training_set_size] || 100
+ @training_percentage = options[:training_percentage] || 0.9
end
# Runs the whole pipeline: trains the engine from the data directory, asks the
# engine to dump its classifier to file, then evaluates it with run_tests.
def train
# The data directory is resolved relative to this source file (two levels up).
path = File.join(File.dirname(__FILE__), '../../data/')
+ run_training(path)
+
+ puts "Writing to file..."
+ engine.dump_classifier_to_file
+
+ run_tests(path)
+ end
+
+ # Counts the documents in +file+, one document per non-blank line.
+ #
+ # Whitespace-only lines are skipped so the count agrees with test_each,
+ # which also ignores them. The file is streamed line by line instead of
+ # being read into memory in one piece.
+ #
+ # file - path of the text file to count.
+ #
+ # Returns the Integer number of non-blank lines.
+ # Raises Errno::ENOENT when the file does not exist.
+ def total_documents(file)
+ File.foreach(file).count { |line| !line.strip.empty? }
+ end
+
# Trains the engine from the two corpus files found under path:
# non_twss.txt supplies the TWSS::Engine::FALSE examples and twss.txt the
# TWSS::Engine::TRUE examples. The engine state is cleared first, and a '.'
# is printed (with stdout flushed) for every line so progress is visible.
#
# NOTE(review): in the new version each line is handed to engine.train as
# read, including its trailing newline, whereas test_each strips lines before
# classifying them. Presumably the engine normalises its input - confirm.
+ def run_training(path)
+ positive_file = File.join(path, 'twss.txt')
+ negative_file = File.join(path, 'non_twss.txt')
+
puts "Clearing state..."
engine.clear_state!
puts "Training NON-TWSS strings..."
- File.read(File.join(path, 'non_twss.txt')).each_line do |l|
- engine.train(TWSS::Engine::FALSE, strip_tweet(l))
+ File.read(negative_file).each_line do |l|
+ print '.'
+ $stdout.flush
+ engine.train(TWSS::Engine::FALSE, l)
end
+ puts
puts "Training TWSS strings..."
- File.read(File.join(path, 'twss.txt')).each_line do |l|
- engine.train(TWSS::Engine::TRUE, strip_tweet(l))
+ File.read(positive_file).each_line do |l|
+ print '.'
+ $stdout.flush
+ engine.train(TWSS::Engine::TRUE, l)
end
+ puts
+ end
+
+ # Evaluates the classifier against the test files under +path+
+ # (test_twss.txt and test_non_twss.txt) and prints a summary to stdout.
+ #
+ # For each file, (document count * training_percentage).to_i is passed to
+ # test_each as the sample size. The false positive / false negative rates
+ # are computed over the documents actually tested in each class rather than
+ # over the size of the whole file, so sampling does not understate them.
+ #
+ # path - directory containing the two test files.
+ #
+ # Returns nil (the value of the final puts).
+ def run_tests(path)
+ positive_test_file = File.join(path, 'test_twss.txt')
+ negative_test_file = File.join(path, 'test_non_twss.txt')
+
+ total_positive = total_documents(positive_test_file)
+ total_negative = total_documents(negative_test_file)
+
+ false_negatives = 0
+ false_positives = 0
+ tested_positive = 0
+ tested_negative = 0
+ correct = 0
+ test_each(positive_test_file, (total_positive * training_percentage).to_i) do |_line, result|
+ if result
+ correct += 1
+ else
+ false_negatives += 1
+ end
+ tested_positive += 1
+ end
- puts "Writing to file..."
- engine.dump_classifier_to_file
-
- puts "Done."
+ test_each(negative_test_file, (total_negative * training_percentage).to_i) do |_line, result|
+ if result
+ false_positives += 1
+ else
+ correct += 1
+ end
+ tested_negative += 1
+ end
+
+ # Denominators below are the numbers of documents really classified.
+ total = tested_positive + tested_negative
+
puts
-
- run_examples
+ puts "Test set size: #{total}"
+ puts "Overall accuracy: #{100 * correct / total.to_f}%"
+ puts "False positives: #{false_positives} (#{100 * false_positives / tested_negative.to_f}%)"
+ puts "False negatives: #{false_negatives} (#{100 * false_negatives / tested_positive.to_f}%)"
+ puts
end
-
- # A little cleanup of the text before we train on it.
- def strip_tweet(text)
- t = text.gsub(/[\@\#]\w+\b/i, '') # strip mentions and hashtags
- t.gsub!(/(RT|OH)\W/i, '') # strip RT's and OH's
- t.gsub!(/twss/i, '') # strip out twss itself
- t.gsub!(/http:\/\/[A-Za-z0-9\.\/]+/, '') # URLs
- t.gsub!(/[\W\d]/, ' ') # now all non word chars and numbers
- t.strip!
- t
+
+ # Classifies up to +sample_size+ non-blank lines of +file+ and hands each
+ # one, together with its result, to the given block.
+ #
+ # Every line is stripped first; blank lines are skipped and do not count
+ # towards the sample. Each tested line is also echoed to stdout in the form
+ # "line => result".
+ #
+ # file        - path of the text file holding one document per line.
+ # sample_size - maximum number of documents to classify.
+ # blk         - block called with (stripped_line, result) for each document.
+ #
+ # Returns nil.
+ def test_each(file, sample_size, &blk)
+ tested = 0
+ File.foreach(file) do |line|
+ # `>=` (not `>`) so that exactly sample_size documents are tested and a
+ # sample size of 0 tests nothing.
+ break if tested >= sample_size
+
+ text = line.strip
+ next if text.empty?
+
+ result = TWSS(text)
+ puts text + ' => ' + result.to_s
+ blk.call(text, result)
+ tested += 1
+ end
end
-
- def run_examples
- ["how big is that thing going to get?",
- "umm... that's the not the right hole",
- "did you resolve the ticket?",
- "did you fix the bug?",
- "you're going to need to go faster",
- "I'm almost there, keep going",
- "Ok, send me a pull request",
- "The president issued a decree",
- "I don't get it, this isn't working correctly",
- "finished specialties in the warehouse"].each do |s|
- puts '"' + s + '" => ' + TWSS(s).to_s
- end
- end
-
+
end
end
\ No newline at end of file