Sha256: 915796f5db8bd8482f39ed7f90b897358d5281d83e40d7edb50b5c6556bd73d7

Contents?: true

Size: 1.62 KB

Versions: 1

Compression:

Stored size: 1.62 KB

Contents

module Wordlist
  #
  # Mixin providing text-tokenizing behavior. Including classes gain the
  # `ignore_*` settings below and the {#parse} method. The defaults are
  # assigned in {#initialize}, so an including class that defines its own
  # `initialize` must call `super` for them to take effect.
  #
  module Parsers
    # Ignore case of parsed text
    #
    # @return [Boolean]
    attr_accessor :ignore_case

    # Ignore the punctuation of parsed text
    #
    # @return [Boolean]
    attr_accessor :ignore_punctuation

    # Ignore URLs
    #
    # @return [Boolean]
    attr_accessor :ignore_urls

    # Ignore Phone numbers
    #
    # @return [Boolean]
    attr_accessor :ignore_phone_numbers

    # Ignore References
    #
    # @return [Boolean]
    attr_accessor :ignore_references

    #
    # Initializes the parsers settings.
    #
    # By default punctuation and URLs are ignored, while case, phone
    # numbers and references are preserved.
    #
    def initialize
      @ignore_case          = false
      @ignore_punctuation   = true
      @ignore_urls          = true
      @ignore_phone_numbers = false
      @ignore_references    = false
    end

    #
    # Parses the given text.
    #
    # The argument is never modified: all filtering is performed on a
    # private copy, so frozen Strings and non-String objects (converted
    # with `to_s`) are accepted.
    #
    # @param [String, #to_s] text
    #   The text to parse.
    #
    # @return [Array<String>]
    #   The Array of parsed tokens.
    #
    def parse(text)
      # `to_s` returns the receiver itself for Strings (and may return a
      # frozen String, e.g. for nil), so duplicate before using the
      # in-place `gsub!` / `downcase!` calls below.
      text = text.to_s.dup

      if @ignore_punctuation
        # eat trailing punctuation (`$` anchors at the end of every line)
        text.gsub!(/[\.\?!]*$/, '')
      end

      # downcase the sentence
      text.downcase! if @ignore_case

      if @ignore_urls
        # remove anything shaped like scheme://path, with surrounding spaces
        text.gsub!(/\s*\w+:\/\/[\w\/\+_\-,:%\d\.\-\?&=]*\s*/, ' ')
      end

      if @ignore_phone_numbers
        # remove phone numbers
        text.gsub!(/\s*(\d-)?(\d{3}-)?\d{3}-\d{4}\s*/, ' ')
      end

      if @ignore_references
        # remove RFC style references
        text.gsub!(/\s*[\(\{\[]\d+[\)\}\]]\s*/, ' ')
      end

      if @ignore_punctuation
        # split and ignore punctuation characters; a single joining
        # character between two word runs (e.g. "don't") is kept
        text.scan(/\w+[\-_\.:']\w+|\w+/)
      else
        # split and accept punctuation characters
        text.scan(/[\w\-_,:;\.\?\!'"\\\/]+/)
      end
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
wordlist-0.1.1 lib/wordlist/parsers.rb