#
# Ronin Web - A Ruby library for Ronin that provides support for web
# scraping and spidering functionality.
#
# Copyright (c) 2006-2011 Hal Brodigan (postmodern.mod3 at gmail.com)
#
# This file is part of Ronin Web.
#
# Ronin is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Ronin is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Ronin. If not, see <http://www.gnu.org/licenses/>.
#
require 'ronin/web/config'
require 'set'
require 'yaml'
module Ronin
module Web
#
# Represents the set of `User-Agent` strings loaded from all
# `data/ronin/web/user_agents.yml` files.
#
# ## ronin/web/user_agents.yml
#
# The `user_agents.yml` files are essentially YAML files listing
# `User-Agent` strings grouped by category:
#
# ---
# :googlebot:
# - "Googlebot/2.1 (+http://www.googlebot.com/bot.html)"
# - "Googlebot-Image/1.0 (+http://www.googlebot.com/bot.html)"
# - "Mediapartners-Google/2.1"
# - "Google-Sitemaps/1.0"
#
# These files can be added to Ronin Repositories or to Ronin libraries,
# and will be loaded by the {UserAgents} objects.
#
# @since 0.3.0
#
class UserAgents

  include Enumerable

  # Relative path to the User-Agents file.
  FILE = File.join('ronin','web','user_agents.yml')

  #
  # Creates a new, empty User-Agent set.
  #
  # No data files are read here; they are loaded lazily the first time
  # the set is queried.
  #
  # @api semipublic
  #
  def initialize
    # Paths of the data files that have already been loaded.
    @files = Set[]

    # Category name (Symbol) => Set of User-Agent strings. The default
    # block creates a fresh Set per category on first access.
    @user_agents = Hash.new { |hash,key| hash[key] = Set[] }
  end

  #
  # The categories of `User-Agent` strings.
  #
  # @return [Array<Symbol>]
  #   The names of the categories.
  #
  # @api public
  #
  def categories
    reload!

    @user_agents.keys
  end

  #
  # Iterates over each User-Agent in the set.
  #
  # Any data files that have not been loaded yet are loaded first, so
  # that enumeration sees the same data as {#[]} and {#categories}.
  #
  # @yield [ua]
  #   The given block will be passed each User-Agent.
  #
  # @yieldparam [String] ua
  #   A User-Agent string within the set.
  #
  # @return [Enumerator]
  #   If no block is given, an Enumerator will be returned.
  #
  # @api public
  #
  def each(&block)
    return enum_for(:each) unless block_given?

    reload!

    @user_agents.each_value { |strings| strings.each(&block) }
  end

  #
  # Selects a `User-Agent` string from the set.
  #
  # A Symbol selects a random string from the category of that name.
  # A String or Regexp selects the first string, across all categories,
  # that contains the sub-string or matches the pattern.
  #
  # @param [Symbol, String, Regexp] key
  #   The User-Agents group name, sub-string or Regexp to search for.
  #
  # @return [String, nil]
  #   The matching `User-Agent` string, or `nil` if nothing matched.
  #
  # @raise [TypeError]
  #   The key was not a Symbol, String or Regexp.
  #
  # @api public
  #
  def [](key)
    reload!

    case key
    when Symbol
      # key? is checked first so that looking up an unknown category does
      # not trigger the Hash default block and create an empty category.
      @user_agents[key].to_a.sample if @user_agents.key?(key)
    when String
      find_user_agent { |string| string.include?(key) }
    when Regexp
      find_user_agent { |string| string =~ key }
    else
      raise(TypeError,"key must be a Symbol, String or Regexp")
    end
  end

  #
  # Fetches a `User-Agent` string from the set.
  #
  # @param [Symbol, String, Regexp] key
  #   The User-Agents group name, sub-string or Regexp to search for.
  #
  # @param [String] default
  #   The `User-Agent` string to default to if no match is found.
  #
  # @return [String]
  #   The matching `User-Agent` string.
  #
  # @raise [ArgumentError]
  #   No matching `User-Agent` string was found, and no default value
  #   was given.
  #
  # @api public
  #
  def fetch(key,default=nil)
    string = self[key] || default

    unless string
      raise(ArgumentError,"no User-Agent strings match #{key.inspect}")
    end

    string
  end

  protected

  #
  # Finds the first loaded User-Agent string accepted by the block.
  #
  # Does not reload the data files; callers are expected to have done so.
  #
  # @yield [string]
  #   The block is passed each loaded User-Agent string.
  #
  # @return [String, nil]
  #   The first string the block returned a true value for, or `nil`.
  #
  # @api private
  #
  def find_user_agent
    @user_agents.each_value do |strings|
      strings.each { |string| return string if yield(string) }
    end

    nil
  end

  #
  # Loads any User-Agents data files that have not been loaded yet.
  #
  # Each successfully parsed file is remembered in `@files`, so repeated
  # calls do not parse it again. A file that fails to parse raises and
  # is not remembered.
  #
  # @api private
  #
  def reload!
    Config.each_data_file(FILE) do |path|
      next if @files.include?(path)

      # NOTE(review): depending on the installed Psych version,
      # YAML.load_file may instantiate arbitrary objects. If data files
      # can come from untrusted sources, consider a safe loader that
      # only permits Symbols.
      data = YAML.load_file(path)

      # Remember the file as soon as it has been parsed, so neither its
      # contents nor the warning below are processed more than once.
      @files << path

      unless data.kind_of?(Hash)
        warn "#{path.dump} did not contain a Hash"
        next
      end

      data.each do |name,strings|
        # Array() tolerates an empty (nil) category or a single bare
        # string, both of which Set#merge would otherwise reject.
        @user_agents[name.to_sym].merge(Array(strings))
      end
    end
  end

end
end
end