Sha256: eb0e0cbf2b5947a4e925c91a5bd9a292ec717bd005c812b45ee29a67739e78dc

Contents?: true

Size: 1.57 KB

Versions: 1

Compression:

Stored size: 1.57 KB

Contents

module Wordlist
  #
  # Mixin that adds text-parsing options (as accessors on the including
  # class) and a #parse method that splits text into tokens.
  #
  # The defaults are assigned in #initialize below; an including class that
  # defines its own +initialize+ needs to call +super+ for them to be set.
  #
  module Parsers
    #
    # Hook run when the module is included: defines the option accessors
    # on the including class or module _base_.
    #
    def self.included(base)
      base.module_eval do
        # Ignore case of parsed text
        attr_accessor :ignore_case

        # Ignore the punctuation of parsed text
        attr_accessor :ignore_punctuation

        # Ignore URLs
        attr_accessor :ignore_urls

        # Ignore Phone numbers
        attr_accessor :ignore_phone_numbers

        # Ignore References
        attr_accessor :ignore_references
      end
    end

    #
    # Sets the default parsing options: punctuation and URLs are ignored,
    # while case, phone numbers and references are kept.
    #
    def initialize
      @ignore_case = false
      @ignore_punctuation = true
      @ignore_urls = true
      @ignore_phone_numbers = false
      @ignore_references = false
    end

    #
    # Parses the specified _text_ and returns an Array of tokens.
    #
    # _text_ is converted with +to_s+ and is never modified: all filtering
    # is performed on a private copy, so frozen strings and +nil+ are
    # accepted as well.
    #
    def parse(text)
      # String#to_s returns the receiver itself, so duplicate it before
      # applying the destructive gsub!/downcase! calls below. Without the
      # copy the caller's string would be altered, and frozen input would
      # raise FrozenError.
      text = text.to_s.dup

      if @ignore_punctuation
        # eat trailing punctuation ('$' matches at the end of every line)
        text.gsub!(/[\.\?!]*$/,'')
      end

      if @ignore_case
        # downcase the sentence
        text.downcase!
      end

      if @ignore_urls
        # replace scheme://... URLs and the whitespace around them with a
        # single space
        text.gsub!(/\s*\w+:\/\/[\w\/\+_\-,:%\d\.\-\?&=]*\s*/,' ')
      end

      if @ignore_phone_numbers
        # remove phone numbers such as 555-1234, 555-555-1234 or
        # 1-555-555-1234
        text.gsub!(/\s*(\d-)?(\d{3}-)?\d{3}-\d{4}\s*/,' ')
      end

      if @ignore_references
        # remove RFC style references, e.g. [1], (2) or {3}
        text.gsub!(/\s*[\(\{\[]\d+[\)\}\]]\s*/,' ')
      end

      if @ignore_punctuation
        # split and ignore punctuation characters; a single - _ . : or '
        # between word characters is kept inside the token
        text.scan(/\w+[\-_\.:']\w+|\w+/)
      else
        # split and accept punctuation characters
        text.scan(/[\w\-_,:;\.\?\!'"\\\/]+/)
      end
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
wordlist-0.1.0 lib/wordlist/parsers.rb