require 'pathname'
require 'json'
require 'addressable/uri'

module SnapSearch
  
  # This is used to detect if an incoming request to an HTTP server is coming from a robot.
  class Detector
    
    attr_reader :matched_routes, :ignored_routes
    attr_reader :check_file_extensions
    attr_reader :robots_json, :robots
    attr_reader :extensions_json, :extensions
    
    # Create a new Detector instance.
    # 
    # @param [Hash, #to_h] options The options to create the detector with.
    # @option options [Array] :matched_routes The whitelisted routes.
    # @option options [Array] :ignored_routes The blacklisted routes.
    # @option options [String, #to_s] :robots_json The path of the JSON file containing the user agent whitelist & blacklist.
    # @option options [true, false] :check_file_extensions Set to `true` to ignore direct requests to files.
    # @option options [Rack::Request] :request The Rack request that is to be checked/detected.
    def initialize(options={})
      raise TypeError, 'options must be a Hash or respond to #to_h or #to_hash' unless options.is_a?(Hash) || options.respond_to?(:to_h) || options.respond_to?(:to_hash)
      options = options.to_h rescue options.to_hash
      
      @options = {
        matched_routes: [],
        ignored_routes: [],
        robots_json: SnapSearch.root.join('resources', 'robots.json'),
        extensions_json: SnapSearch.root.join('resources', 'extensions.json'),
        check_file_extensions: false
      }.merge(options) # Reverse merge: the Hash that `merge` is called on supplies the defaults, and the options argument is merged over it.
      
      @matched_routes, @ignored_routes, @check_file_extensions = @options.values_at(:matched_routes, :ignored_routes, :check_file_extensions) # Read from @options so the defaults above apply.
      
      self.robots_json = @options[:robots_json] # The setter stores the path in @robots_json, then sets @robots to the parsed JSON of the file's contents.
      self.extensions_json = @options[:extensions_json] # The setter stores the path in @extensions_json, then sets @extensions to the parsed JSON of the file's contents.
    end
    
    # Parses the `robots.json` file by decoding the JSON, raising an exception if the decoding fails.
    # 
    # @param [String] value Absolute path to the JSON file containing a single Hash with the keys `ignore` and `match`. These keys contain Arrays of Strings (user agents).
    def robots_json=(value)
      @robots_json = value.to_s
      @robots = JSON.parse( File.read(@robots_json) ) # Ruby raises its own generic I/O read errors & JSON parse errors.
      
      @robots_json
    end
    
    # Parses the `extensions.json` file by decoding the JSON, raising an exception if the decoding fails.
    # 
    # @param [String] value Absolute path to the JSON file containing a single Hash with the keys `generic` and `ruby`. These keys contain Arrays of Strings (file extensions).
    def extensions_json=(value)
      @extensions_json = value.to_s
      @extensions = JSON.parse( File.read(@extensions_json) ) # Ruby raises its own generic I/O read errors & JSON parse errors.
      @extensions = {} unless @extensions.is_a?(Hash)
      
      @extensions_json
    end
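    
    # A minimal construction sketch (the routes shown are hypothetical; both options
    # take Arrays of Regexps that are matched against the decoded request path):
    # 
    #   detector = SnapSearch::Detector.new(
    #     matched_routes: [/^\/articles/], # only intercept routes under /articles
    #     ignored_routes: [/^\/admin/]     # never intercept the admin area
    #   )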
    # Sets the list of robot user agents to match and ignore during detection.
    # Note that `robots` must be set to a Hash with the 'match' and 'ignore' keys, both containing Arrays.
    # 
    # @param [Hash{String => Array<String>}] value The Hash containing the lists of robot user agents to match and ignore.
    def robots=(value)
      raise TypeError, 'value must be a Hash' unless value.is_a?(Hash)
      
      @robots = {
        'match' => [],
        'ignore' => []
      }.merge(value) # Merge the given Hash over the defaults so both keys are always present.
      
      @robots.each do |key, list|
        raise TypeError, "The '#{key}' value must be an Array or respond to #to_a" unless list.is_a?(Array) || list.respond_to?(:to_a) # Validate that all values are Arrays or can be converted to one.
        @robots[key] = list.to_a.collect(&:to_s) # Convert all entries in the Arrays to Strings.
      end
      
      @robots
    end
    
    # Detects if the request came from a search engine robot. It will intercept in cascading order:
    # 
    # 1. on a GET request
    # 2. on an HTTP or HTTPS protocol
    # 3. not on any ignored robot user agents
    # 4. not on any route not matching the whitelist
    # 5. not on any route matching the blacklist
    # 6. not on any invalid file extensions if there is a file extension
    # 7. on requests with the _escaped_fragment_ query parameter
    # 8. on any matched robot user agents
    # 
    # @return [true, false] If the request came from a robot
    def detect(options={})
      options = {
        matched_routes: @matched_routes,
        ignored_routes: @ignored_routes,
        robots_json: @robots_json,
        check_file_extensions: @check_file_extensions
      }.merge(options)
      
      raise ArgumentError, 'options[:request] must be an instance of Rack::Request' unless options[:request].is_a?(Rack::Request)
      
      self.robots_json = options[:robots_json] if options[:robots_json] != @robots_json # If a new robots_json path is given, the setter re-parses that JSON file into @robots.
      
      uri = Addressable::URI.parse( options[:request].url )
      params = options[:request].params
      real_path = get_decoded_path(params, uri)
      document_root = options[:request].env['DOCUMENT_ROOT'] # The server's document root, read from the Rack env (not used by the checks below).
      
      # Only intercept on GET requests; the SnapSearch robot cannot submit a POST, PUT or DELETE request.
      return false unless options[:request].get?
      
      # Only intercept on the HTTP or HTTPS protocols.
      return false unless %w[http https].include?(uri.scheme)
      
      # Detect ignored user agents; if one matches, the request is not intercepted.
      return false if options[:request].user_agent =~ /#{ @robots['ignore'].collect { |user_agent| Regexp.escape(user_agent) }.join(?|) }/i
      
      # If the requested route doesn't match any of the whitelisted routes, the request is ignored.
      # Of course, this only runs if there are any routes on the whitelist.
      return false if !options[:matched_routes].nil? && !options[:matched_routes].empty? && options[:matched_routes].none? { |route| real_path =~ route }
      
      # Detect ignored (blacklisted) routes.
      return false if !options[:ignored_routes].nil? && options[:ignored_routes].any? { |route| real_path =~ route }
      
      # Detect file extensions in order to prevent direct requests to static files.
      if options[:check_file_extensions]
        extensions['generic'] = [] unless extensions['generic'].is_a?(Array)
        extensions['ruby'] = [] unless extensions['ruby'].is_a?(Array)
        
        valid_extensions = extensions['generic'] + extensions['ruby']
        valid_extensions.collect! { |value| value.to_s.downcase.strip } # Coerce all extensions to Strings if they aren't already, then downcase and strip whitespace/newlines from both ends.
        
        real_path_uri = Addressable::URI.parse(real_path)
        extension = real_path_uri.extname
        extension = extension[1..-1].downcase unless extension.empty? # Drop the leading dot.
        
        return false if !extension.empty? && !valid_extensions.include?(extension)
      end
      
      # Detect the escaped fragment. (Since ignored user agents have already been filtered out above, SnapSearch won't continue the interception loop.)
      return true if !uri.query_values.nil? && uri.query_values.has_key?('_escaped_fragment_')
      
      # Detect matched robots; if one matches, intercept.
      return true if options[:request].user_agent =~ /#{ @robots['match'].collect { |user_agent| Regexp.escape(user_agent) }.join(?|) }/i
      
      # If there was no match at all, don't intercept.
      false
    end
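    
    # A minimal detection sketch, assuming the `rack` gem is available. The URL is
    # hypothetical; an _escaped_fragment_ parameter triggers interception regardless
    # of the user agent (step 7 of the cascade above):
    # 
    #   env = Rack::MockRequest.env_for('http://example.com/path1?key1=value1&_escaped_fragment_=%2Fpath2%3Fkey2=value2')
    #   detector.detect(request: Rack::Request.new(env)) # => true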
    # Gets the encoded URL that is passed to SnapSearch so that SnapSearch can scrape the encoded URL.
    # If the _escaped_fragment_ query parameter is used, it is converted back to a hash fragment URL.
    # 
    # @param [Hash] params The parameters of the HTTP request.
    # @param [Addressable::URI] uri The Addressable::URI of the Rack::Request.
    # @return [String] URL intended for SnapSearch
    def get_encoded_url(params, uri)
      raise TypeError, 'params must be a Hash or respond to #to_h' unless params.is_a?(Hash) || params.respond_to?(:to_h)
      raise TypeError, 'uri must be an instance of Addressable::URI' unless uri.is_a?(Addressable::URI)
      
      # NOTE: We have to pass the Rack::Request instance and use its `params` method to retrieve the parameters because:
      #   uri.to_s         # => "http://localhost/snapsearch/path1?key1=value1&_escaped_fragment_=%2Fpath2%3Fkey2=value2"
      #   uri.query_values # => {"key1"=>"value1", "_escaped_fragment_"=>"/path2?key2"}
      #   request.params   # => {"key1"=>"value1", "_escaped_fragment_"=>"/path2?key2=value2"}
      # It seems Addressable splits the params into a Hash incorrectly, but Rack does not.
      if !uri.query_values.nil? && uri.query_values.has_key?('_escaped_fragment_')
        qs_and_hash = get_real_qs_and_hash_fragment(params, true)
        
        url = "#{uri.scheme}://#{uri.authority}#{uri.path}" # Remove the query and fragment (SCHEME + AUTHORITY + PATH). Addressable::URI encodes the URI.
        
        url + qs_and_hash['qs'] + qs_and_hash['hash']
      else
        uri.to_s
      end
    end
    
    # Gets the decoded URL path relevant for detecting matched or ignored routes during detection.
    # It is also used for static file detection.
    # 
    # @param [Hash] params The parameters of the HTTP request.
    # @param [Addressable::URI] uri The Addressable::URI of the Rack::Request.
    # @return [String] The decoded URL.
    def get_decoded_path(params, uri)
      raise TypeError, 'params must be a Hash or respond to #to_h or #to_hash' unless params.is_a?(Hash) || params.respond_to?(:to_h) || params.respond_to?(:to_hash)
      params = params.to_h rescue params.to_hash
      raise TypeError, 'uri must be an instance of Addressable::URI' unless uri.is_a?(Addressable::URI)
      
      # NOTE: See the NOTE in #get_encoded_url on why the Rack params are used instead of uri.query_values.
      if !uri.query_values.nil? && uri.query_values.has_key?('_escaped_fragment_')
        qs_and_hash = get_real_qs_and_hash_fragment(params, false)
        
        Addressable::URI.unescape(uri.path) + qs_and_hash['qs'] + qs_and_hash['hash']
      else
        path_and_query = uri.query.nil? ? uri.path : "#{ uri.path }?#{ uri.query }" # Avoid a dangling '?' when there is no query string.
        Addressable::URI.unescape(path_and_query)
      end
    end
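    
    # A sketch of both helpers applied to a hypothetical escaped-fragment request,
    # with `uri` parsed from "http://example.com/path1?key1=value1&_escaped_fragment_=%2Fpath2%3Fkey2=value2"
    # and `params` as Rack would supply them:
    # 
    #   get_encoded_url(params, uri)  # => "http://example.com/path1?key1=value1#!/path2?key2=value2"
    #   get_decoded_path(params, uri) # => "/path1?key1=value1#!/path2?key2=value2"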
    # Gets the real query string and hash fragment by reversing Google's _escaped_fragment_ protocol back to the hash bang mode.
    # This is used both for getting the encoded URL for scraping and for getting the decoded path for detection, and it is only called when the URI has a QUERY section.
    # 
    # Google will convert URLs like so:
    # 
    #   Original URL:            http://example.com/path1?key1=value1#!/path2?key2=value2
    #   Original Structure:      DOMAIN - PATH - QS - HASH BANG - HASH PATH - HASH QS
    #   Search Engine URL:       http://example.com/path1?key1=value1&_escaped_fragment_=%2Fpath2%3Fkey2=value2
    #   Search Engine Structure: DOMAIN - PATH - QS - ESCAPED FRAGMENT
    # 
    # Everything after the hash bang is stored as the _escaped_fragment_, even if it contains its own query string.
    # Therefore we have to reverse this process to get the original URL, which will be used for snapshotting purposes.
    # This means the original URL can have two query string components.
    # The QS before the HASH BANG will be received by both the server and the client, although not all client-side frameworks will process this QS.
    # The HASH QS will only be received by the client, as anything after the hash does not get sent to the server. Most client-side frameworks will process this HASH QS.
    # See this for more information: https://developers.google.com/webmasters/ajax-crawling/docs/specification
    # 
    # @param [Hash] params The params from the URI of the request.
    # @param [true, false] escape Whether to Addressable::URI.escape the query string or not
    # @return [Hash] Hash of query string and hash fragment
    def get_real_qs_and_hash_fragment(params, escape)
      raise TypeError, 'params must be a Hash or respond to #to_h or #to_hash' unless params.is_a?(Hash) || params.respond_to?(:to_h) || params.respond_to?(:to_hash)
      params = params.to_h rescue params.to_hash
      
      query_params = params.dup
      query_params.delete('_escaped_fragment_')
      query_params = query_params.to_a
      
      query_string = ''
      unless query_params.empty?
        query_params.collect! { |key, value| [ Addressable::URI.escape(key), Addressable::URI.escape(value) ] } if escape
        query_params.collect! { |key, value| "#{key}=#{value}" }
        
        query_string = "?#{ query_params.join(?&) }"
      end
      
      hash_fragment = params['_escaped_fragment_']
      hash_fragment_string = hash_fragment.nil? ? '' : "#!#{hash_fragment}"
      
      {
        'qs' => query_string,
        'hash' => hash_fragment_string
      }
    end
    
  end
  
end
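
# A worked sketch of the fragment reversal performed by Detector#get_real_qs_and_hash_fragment,
# using params that mirror the hypothetical search-engine URL documented above:
# 
#   params = { 'key1' => 'value1', '_escaped_fragment_' => '/path2?key2=value2' }
#   detector.get_real_qs_and_hash_fragment(params, false)
#   # => { 'qs' => '?key1=value1', 'hash' => '#!/path2?key2=value2' }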