Sha256: 930811453c5700d33845cdb188f9de9c5ee6458dce8e79bc5667a84d15cc6c7f
Contents?: true
Size: 1.08 KB
Versions: 5
Compression:
Stored size: 1.08 KB
Contents
require 'despamilator/filter'

module DespamilatorFilter
  # Filter that registers one match on the subject for every two- or
  # three-letter combination in its text that is listed in
  # conf/unusual_characters.txt.
  class UnusualCharacters < Despamilator::Filter
    # Score registered for each listed combination found in the text.
    MATCH_SCORE = 0.05

    # Location of the combination list, resolved relative to this file.
    COMBOS_PATH = File.join(
      File.dirname(__FILE__), '..', '..', '..', 'conf', 'unusual_characters.txt'
    )

    # Human-readable filter name.
    def name
      'Unusual Characters'
    end

    # One-line description of what the filter scores.
    def description
      'Detects and scores each occurrence of an unusual 2 or 3 character combination'
    end

    # Registers a match of MATCH_SCORE on +subject+ for every token of its
    # text that appears in the combination list.
    #
    # subject - object responding to +text+ and +register_match!+; the text
    #           must respond to +without_uris+ (presumably strips URLs so
    #           they are not scored -- confirm against the String extension).
    def parse(subject)
      combos = initialize_combos
      tokenize(subject.text.without_uris).each do |token|
        subject.register_match!({:score => MATCH_SCORE, :filter => self}) if combos[token]
      end
    end

    private

    # Splits +text+ into lowercase runs of a-z and returns, as symbols,
    # every 3-letter and 2-letter substring of each run (for each start
    # position: the trigram first, then the bigram).
    def tokenize(text)
      tokens = []
      text.downcase.split(/[^a-z]/).each do |word|
        word.length.times do |i|
          # String#[] takes (start, length): a fixed length of 3 yields a
          # trigram at every position, clipped only at the end of the word.
          chunk = word[i, 3]
          tokens << chunk.to_sym if chunk.length == 3
          tokens << chunk[0, 2].to_sym if chunk.length > 1
        end
      end
      tokens
    end

    # Loads the combination list into the class-level cache on first use
    # and returns the cache (a Hash of Symbol => true) on every call.
    def initialize_combos
      @@combos ||= {}
      return @@combos unless @@combos.empty?

      # File.foreach closes the file once iteration finishes.
      File.foreach(COMBOS_PATH) do |line|
        combo = line.strip
        @@combos[combo.to_sym] = true unless combo.empty?
      end
      @@combos
    end
  end
end
Version data entries
5 entries across 5 versions & 1 rubygems