Sha256: 930811453c5700d33845cdb188f9de9c5ee6458dce8e79bc5667a84d15cc6c7f

Contents?: true

Size: 1.08 KB

Versions: 5

Compression:

Stored size: 1.08 KB

Contents

require 'despamilator/filter'

module DespamilatorFilter

  # Filter that scores text for every 2- or 3-letter sequence found in the
  # list of "unusual" combinations shipped in conf/unusual_characters.txt.
  class UnusualCharacters < Despamilator::Filter

    # Human-readable name of this filter.
    def name
      'Unusual Characters'
    end

    # One-line summary of what this filter detects.
    def description
      'Detects and scores each occurrence of an unusual 2 or 3 character combination'
    end

    # Registers a 0.05 match on +subject+ for every token of its text that
    # appears in the combo list.
    #
    # subject - object that responds to #text and #register_match!; the text
    #           is passed through #without_uris before being tokenized
    #           (presumably to strip URIs -- that helper is defined elsewhere).
    def parse(subject)
      initialize_combos
      tokenize(subject.text.without_uris).each do |token|
        # tokenize already yields symbols, matching the keys of @@combos.
        subject.register_match!({:score => 0.05, :filter => self}) if @@combos[token]
      end
    end

    private

    # Splits +text+ into lowercase a-z words and returns, as symbols, every
    # 3-letter and 2-letter sequence starting at each position of each word.
    # For a given position the 3-letter token (if any) precedes the 2-letter one.
    def tokenize(text)
      tokens = []
      text.downcase.split(/[^a-z]/).each do |word|
        (0...word.length).each do |i|
          # String#[start, length]: the second argument is a length, so it
          # must be the constant 3 -- using i + 3 skipped every trigram that
          # was neither at the start nor at the very end of the word.
          substr = word[i, 3]
          tokens << substr.to_sym if substr.length == 3
          tokens << substr[0, 2].to_sym if substr.length > 1
        end
      end
      tokens
    end

    # Loads the combo list into the class-level @@combos hash the first time
    # it is needed; later calls return the already populated hash.
    def initialize_combos
      @@combos ||= {}
      return @@combos unless @@combos.empty?

      path = File.join(File.dirname(__FILE__), %w{.. .. .. conf unusual_characters.txt})
      # File.foreach closes the file once the block finishes, unlike a bare
      # File.open(...).each which leaves the handle open.
      File.foreach(path) do |line|
        combo = line.strip
        # Blank lines can never match a token, so do not store them.
        @@combos[combo.to_sym] = true unless combo.empty?
      end
      @@combos
    end

  end

end

Version data entries

5 entries across 5 versions & 1 rubygems

Version Path
despamilator-2.1.4 lib/despamilator/filter/unusual_characters.rb
despamilator-2.1.3 lib/despamilator/filter/unusual_characters.rb
despamilator-2.1.2 lib/despamilator/filter/unusual_characters.rb
despamilator-2.1.1 lib/despamilator/filter/unusual_characters.rb
despamilator-2.1 lib/despamilator/filter/unusual_characters.rb