# RETS4R Client
#
# Copyright (c) 2006 Scott Patterson
#
# This program is copyrighted free software by Scott Patterson.  You can
# redistribute it and/or modify it under the same terms of Ruby's license;
# either the dual license version in 2003 (see the file RUBYS), or any later
# version.
#
# TODO: 1.0 Support (Adding this support should be fairly easy)
# TODO: 2.0 Support (Adding this support will be very difficult since it is a completely different methodology)
# TODO: Case-insensitive header

require 'digest/md5'
require 'net/http'
require 'uri'
require 'cgi'
require 'auth'
require 'client/dataobject'
require 'client/parsers/response_parser'
require 'thread'
require 'logger'

module RETS4R
  # HTTP client for RETS servers. It keeps the session headers and the
  # capability URLs learned at login, and exposes one public method per RETS
  # transaction (login, logout, get_metadata, get_object, search, count).
  class Client
    COMPACT_FORMAT = 'COMPACT'

    METHOD_GET  = 'GET'
    METHOD_POST = 'POST'
    METHOD_HEAD = 'HEAD'

    DEFAULT_METHOD     = METHOD_GET
    DEFAULT_RETRY      = 2
    #DEFAULT_USER_AGENT = 'RETS4R/0.8.2'
    # FIXME
    DEFAULT_USER_AGENT = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9b5) ' +
                         'Gecko/2008050509 Firefox/3.0b5'
    DEFAULT_RETS_VERSION    = '1.7'
    SUPPORTED_RETS_VERSIONS = ['1.5', '1.7']
    CAPABILITY_LIST = [
      'Action',
      'ChangePassword',
      'GetObject',
      'Login',
      'LoginComplete',
      'Logout',
      'Search',
      'GetMetadata',
      'Update'
    ]

    # These are the response messages as defined in the RETS 1.5e2 and 1.7d6 specifications.
    # Provided for convenience and are used by the HTTPError class to provide more useful
    # messages.
    RETS_HTTP_MESSAGES = {
      '200' => 'Operation successful.',
      '400' => 'The request could not be understood by the server due to malformed syntax.',
      '401' => 'Either the header did not contain an acceptable Authorization or the ' +
               'username/password was invalid. The server response MUST include a ' +
               'WWW-Authenticate header field.',
      '402' => 'The requested transaction requires a payment which could not be authorized.',
      '403' => 'The server understood the request, but is refusing to fulfill it.',
      '404' => 'The server has not found anything matching the Request-URI.',
      '405' => 'The method specified in the Request-Line is not allowed for the resource ' +
               'identified by the Request-URI.',
      '406' => 'The resource identified by the request is only capable of generating response ' +
               'entities which have content characteristics not acceptable according to the accept ' +
               'headers sent in the request.',
      '408' => 'The client did not produce a request within the time that the server was prepared to wait.',
      '411' => 'The server refuses to accept the request without a defined Content-Length.',
      '412' => 'Transaction not permitted at this point in the session.',
      '413' => 'The server is refusing to process a request because the request entity is larger than ' +
               'the server is willing or able to process.',
      '414' => 'The server is refusing to service the request because the Request-URI is longer than ' +
               'the server is willing to interpret. This error usually only occurs for a GET method.',
      '500' => 'The server encountered an unexpected condition which prevented it from fulfilling ' +
               'the request.',
      '501' => 'The server does not support the functionality required to fulfill the request.',
      '503' => 'The server is currently unable to handle the request due to a temporary overloading ' +
               'or maintenance of the server.',
      '505' => 'The server does not support, or refuses to support, the HTTP protocol version that ' +
               'was used in the request message.',
    }

    attr_accessor :mimemap, :logger
    attr_reader :format

    # Constructor
    #
    # Requires the URL to the RETS server and takes an optional output format. The output format
    # determines the type of data returned by the various RETS transaction methods.
    #
    # If a block is given, the new client is yielded to it.
    def initialize(url, format = COMPACT_FORMAT)
      @format  = format
      @urls    = { 'Login' => URI.parse(url) }
      @nc      = 0
      @headers = {
        'User-Agent'      => DEFAULT_USER_AGENT,
        'Accept'          => '*/*',
        'RETS-Version'    => "RETS/#{DEFAULT_RETS_VERSION}",
        'RETS-Session-ID' => '0'
      }
      @request_method    = DEFAULT_METHOD
      @semaphore         = Mutex.new
      @pre_request_block = nil

      @response_parser = RETS4R::Client::ResponseParser.new

      self.mimemap = {
        'image/jpeg' => 'jpg',
        'image/gif'  => 'gif'
      }

      yield self if block_given?
    end

    # Assigns a block that will be called just before the request is sent.
    # This block must accept three parameters:
    # * self
    # * Net::HTTP instance
    # * Hash of headers that will be sent with this request. The hash is a
    #   per-request copy, so changes made to it apply to this request only.
    #
    # The block's return value will be ignored.  If you want to prevent the request
    # to go through, raise an exception.
    #
    # == Example
    #
    #  client = RETS4R::Client.new(...)
    #  # Make a new pre_request_block that calculates the RETS-UA-Authorization header.
    #  client.set_pre_request_block do |rets, http, headers|
    #    a1 = Digest::MD5.hexdigest([headers["User-Agent"], @password].join(":"))
    #    if headers.has_key?("Cookie") then
    #      cookie = headers["Cookie"].split(";").map(&:strip).find {|c| c =~ /rets-session-id/i}
    #      cookie = cookie ? cookie.split("=").last : ""
    #    else
    #      cookie = ""
    #    end
    #
    #    parts = [a1, "", cookie, headers["RETS-Version"]]
    #    headers["RETS-UA-Authorization"] = "Digest " + Digest::MD5.hexdigest(parts.join(":"))
    #  end
    def set_pre_request_block(&block)
      @pre_request_block = block
    end

    # We only allow external read access to URLs because they are internally set based on the
    # results of various queries.
    def urls
      @urls
    end

    # Sets a header that is sent with every request. Passing nil as the value
    # removes the header.
    def set_header(name, value)
      if value.nil?
        @headers.delete(name)
      else
        @headers[name] = value
      end

      logger.debug("Set header '#{name}' to '#{value}'") if logger
    end

    # Returns the current value of the named persistent header, or nil.
    def get_header(name)
      @headers[name]
    end

    def set_user_agent(name)
      set_header('User-Agent', name)
    end

    def get_user_agent
      get_header('User-Agent')
    end

    # Selects the RETS version announced to the server.
    # Raises Unsupported unless the version is listed in SUPPORTED_RETS_VERSIONS.
    def set_rets_version(version)
      unless SUPPORTED_RETS_VERSIONS.include?(version)
        raise Unsupported.new("The client does not support RETS version '#{version}'.")
      end

      set_header('RETS-Version', "RETS/#{version}")
    end

    # Returns the RETS version without its "RETS/" prefix ("" when unset).
    def get_rets_version
      (get_header('RETS-Version') || "").gsub("RETS/", "")
    end

    def set_request_method(method)
      @request_method = method
    end

    def get_request_method
      @request_method
    end

    # Provide more Ruby-like attribute accessors instead of get/set methods
    alias_method :user_agent=, :set_user_agent
    alias_method :user_agent, :get_user_agent
    alias_method :request_method=, :set_request_method
    alias_method :request_method, :get_request_method
    alias_method :rets_version=, :set_rets_version
    alias_method :rets_version, :get_rets_version

    #### RETS Transaction Methods ####
    #
    # Most of these transaction methods mirror the RETS specification methods, so if you are
    # unsure what they mean, you should check the RETS specification. The latest version can be
    # found at http://www.rets.org

    # Attempts to log into the server using the provided username and password.
    #
    # If called with a block, the results of the login action are yielded,
    # and logout is called when the block returns.  In that case, #login
    # returns the block's value.  If called without a block, returns the
    # result.
    #
    # As specified in the RETS specification, the Action URL is called and
    # the results made available in the #secondary_results accessor of the
    # results object.
    #
    # Raises LoginError when the parsed login response is not successful.
    def login(username, password) #:yields: login_results
      @username = username
      @password = password

      # We are required to set the Accept header to this by the RETS 1.5 specification.
      set_header('Accept', '*/*')

      response = request(@urls['Login'])

      # Parse response to get other URLS
      results = @response_parser.parse_key_value(response.body)

      unless results.success?
        raise LoginError.new(response.message + "(#{results.reply_code}: #{results.reply_text})")
      end

      CAPABILITY_LIST.each do |capability|
        location = results.response[capability]
        next unless location

        uri = URI.parse(location)

        if uri.absolute?
          @urls[capability] = uri
        else
          # Relative capability URLs reuse the login URL with the path replaced.
          base = @urls['Login'].clone
          base.path = location
          @urls[capability] = base
        end
      end

      logger.debug("Capability URL List: #{@urls.inspect}") if logger

      # Perform the mandatory get request on the action URL.
      results.secondary_response = perform_action_url

      # Without a block the caller owns the session and must log out itself.
      return results unless block_given?

      begin
        yield results
      ensure
        self.logout
      end
    end

    # Logs out of the RETS server.
    def logout()
      # If no logout URL is provided, then we assume that logout is not necessary (not to
      # mention impossible without a URL). We don't throw an exception, though, but we might
      # want to if this becomes an issue in the future.

      request(@urls['Logout']) if @urls['Logout']
    end

    # Requests Metadata from the server. An optional type and id can be specified to request
    # subsets of the Metadata. Please see the RETS specification for more details on this.
    # The format variable tells the server which format to return the Metadata in. Unless you
    # need the raw metadata in a specified format, you really shouldn't specify the format.
    #
    # If called with a block, yields the results and returns the value of the block, or
    # returns the metadata directly.
    def get_metadata(type = 'METADATA-SYSTEM', id = '*')
      xml = download_metadata(type, id)

      result = @response_parser.parse_metadata(xml, @format)

      if block_given?
        yield result
      else
        result
      end
    end

    # Fetches the raw metadata response body for the given type and id.
    def download_metadata(type, id)
      header = {
        'Accept' => 'text/xml,text/plain;q=0.5'
      }

      data = {
        'Type'   => type,
        'ID'     => id,
        'Format' => @format
      }

      request(@urls['GetMetadata'], data, header).body
    end

    # Performs a GetObject transaction on the server. For details on the arguments, please see
    # the RETS specification on GetObject requests.
    #
    # This method either returns an Array of DataObject instances, or yields each DataObject
    # as it is created. If a block is given, the number of objects yielded is returned.
    #
    # When the server answers with text/xml, the result of the response parser's
    # parse_object_response is returned instead.
    def get_object(resource, type, id, location = 0) #:yields: data_object
      header = {
        'Accept' => mimemap.keys.join(',')
      }

      data = {
        'Resource' => resource,
        'Type'     => type,
        'ID'       => id,
        'Location' => location.to_s
      }

      response = request(@urls['GetObject'], data, header)
      results  = block_given? ? 0 : []

      # Either yields the object (counting it) or collects it, depending on
      # whether the caller supplied a block.
      deliver = lambda do |data_object|
        if block_given?
          yield data_object
          results += 1
        else
          results << data_object
        end
      end

      # A missing Content-Type header is treated as an empty string so the
      # checks below do not fail on nil.
      response_type = response['content-type'].to_s

      if response_type.include?('text/xml')
        # This probably means that there was an error.
        # Response parser will likely raise an exception.
        return @response_parser.parse_object_response(response.body)
      elsif response_type.include?('multipart/parallel')
        content_type = process_content_type(response_type)

        # TODO: log this
        boundary = content_type['boundary']
        if boundary =~ /\s*'([^']*)\s*/
          boundary = $1
        end
        parts = response.body.split("\r\n--#{boundary}")

        parts.shift # Get rid of the initial boundary

        # TODO: log this
        parts.each do |part|
          # Only the first blank line separates headers from data; the payload
          # itself may legitimately contain "\r\n\r\n".
          (raw_header, raw_data) = part.split("\r\n\r\n", 2)

          next unless raw_data

          deliver.call(DataObject.new(process_header(raw_header), raw_data))
        end
      else
        info = {
          'content-type' => response['content-type'], # Compatibility shim.  Deprecated.
          'Content-Type' => response['content-type'],
          'Object-ID'    => response['Object-ID'],
          'Content-ID'   => response['Content-ID']
        }

        if response['Transfer-Encoding'].to_s.downcase == "chunked" || response['Content-Length'].to_i > 100
          deliver.call(DataObject.new(info, response.body))
        end
      end

      results
    end

    # Performs a RETS search transaction. Again, please see the RETS specification for details
    # on what these parameters mean. The options parameter takes a hash of options that will be
    # added to the search statement.
    #
    # If called with a block, yields the parsed results and returns the block's value;
    # otherwise returns the parsed results.
    def search(search_type, klass, query, options = false)
      header = {}

      # Required Data
      data = {
        'SearchType' => search_type,
        'Class'      => klass,
        'Query'      => query,
        'QueryType'  => 'DMQL2',
        'Format'     => format,
        'Count'      => '0'
      }

      # Options
      #--
      # We might want to switch this to merge!, but I've kept it like this for now because it
      # explicitly casts each value as a string prior to performing the search, so we find out now
      # if can't force a value into the string context. I suppose it doesn't really matter when
      # that happens, though...
      #++
      options.each { |k, v| data[k] = v.to_s } if options

      response = request(@urls['Search'], data, header)

      results = @response_parser.parse_results(response.body, @format)

      if block_given?
        yield results
      else
        results
      end
    end

    # Runs a search with 'Count' => '2' and returns what the response parser's
    # parse_count extracts from the response body.
    def count(search_type, klass, query)
      header = {}
      data = {
        'SearchType' => search_type,
        'Class'      => klass,
        'Query'      => query,
        'QueryType'  => 'DMQL2',
        'Format'     => format,
        'Count'      => '2'
      }
      response = request(@urls['Search'], data, header)
      @response_parser.parse_count(response.body)
    end

    private

    # Copied from http.rb
    def basic_encode(account, password)
      'Basic ' + ["#{account}:#{password}"].pack('m').delete("\r\n")
    end

    # Splits a Content-Type header value into a Hash. The media type is stored
    # under 'content-type' and every "name=value" parameter under its name.
    #
    # XXX: Quotes are simply removed, so a quoted value containing ';' is still
    # not handled properly.
    def process_content_type(text)
      media_type, *parameters = text.to_s.split(';')
      content = { 'content-type' => media_type.to_s.strip }

      parameters.each do |parameter|
        # Limit of 2 keeps '=' characters that are part of the value.
        (name, value) = parameter.delete('"').split('=', 2)
        next if name.nil? || name.strip.empty?

        content[name.strip] = value ? value.strip : value
      end

      content
    end

    # Processes the HTTP header
    #
    # Takes the raw header text of one multipart part ("Name: value" per line)
    # and returns a Hash of stripped names to stripped values. Lines without a
    # colon are ignored.
    #--
    # Could we switch over to using CGI for this?
    #++
    def process_header(raw)
      header = {}

      raw.to_s.each_line do |line|
        # Split on the first colon only so values such as URLs stay intact.
        (name, value) = line.split(':', 2)

        header[name.strip] = value.strip if name && value
      end

      header
    end

    # Given a hash, it returns a URL encoded query string.
    # Keys and values are converted with to_s before escaping.
    def create_query_string(hash)
      hash.map { |key, value| "#{CGI.escape(key.to_s)}=#{CGI.escape(value.to_s)}" }.join('&')
    end

    # This is the primary transaction method, which the other public methods make use of.
    # Given a url for the transaction (endpoint) it makes a request to the RETS server.
    #
    # * url        - URI of the endpoint; its host, port and path are used.
    # * data       - request parameters, sent as the query string for GET or as
    #                the body for POST.
    # * header     - extra headers for this request only; they override the
    #                persistent headers of the same name.
    # * method     - METHOD_POST posts; anything else issues a GET.
    # * retry_auth - how many times a 401 response is answered with a new
    #                Authorization header before giving up.
    #
    # Returns the Net::HTTP response. Raises LoginError when the server keeps
    # answering 401 and HTTPError for any other status code of 300 or above.
    #
    # The mutex is only held while the shared header/session state is read or
    # written, never during network I/O, and is always released on error.
    def request(url, data = {}, header = {}, method = @request_method, retry_auth = DEFAULT_RETRY)
      response = ''

      http = Net::HTTP.new(url.host, url.port)
      http.read_timeout = 600

      if logger && logger.debug?
        http.set_debug_output HTTPDebugLogger.new(logger)
      end

      http.start do |connection|
        uri = url.path
        if !data.empty? && method == METHOD_GET
          uri += "?#{create_query_string(data)}"
        end
        post_data = method == METHOD_POST ? create_query_string(data) : nil

        loop do
          headers = @semaphore.synchronize do
            # Per-request copy: persistent headers overridden by the caller's.
            merged = @headers.merge(header)
            @pre_request_block.call(self, connection, merged) if @pre_request_block
            logger.debug(merged.inspect) if logger
            merged
          end

          response = if method == METHOD_POST
                       connection.post(uri, post_data, headers)
                     else
                       connection.get(uri, headers)
                     end

          break unless response.code == '401'

          # Authentication is required: answer the challenge and try again
          # until the retry budget is used up.
          @semaphore.synchronize do
            @nc += 1
            raise LoginError.new(response.message) unless retry_auth > 0

            retry_auth -= 1
            auth = Auth.authenticate(response, @username, @password, url.path, method,
                                     @headers['RETS-Request-ID'], get_user_agent, @nc)
            set_header('Authorization', auth)
          end
        end

        # We have a non-successful response that we cannot handle
        raise HTTPError.new(response) if response.code.to_i >= 300

        @semaphore.synchronize { remember_session(response) }

        logger.debug(response.body) if logger
      end

      response
    end

    # Copies the cookies and the RETS-Session-ID of a successful response into
    # the persistent headers so that later requests carry them.
    def remember_session(response)
      cookies = (response.get_fields('set-cookie') || []).map { |cookie| cookie.split(";").first }

      set_header('Cookie', cookies.join("; ")) unless cookies.empty?
      set_header('RETS-Session-ID', response['RETS-Session-ID']) if response['RETS-Session-ID']
    end

    # If an action URL is present in the URL capability list, it calls that action URL and returns the
    # raw result. Throws a generic RETSException if it is unable to follow the URL.
    # Returns nil when no action URL is known.
    def perform_action_url
      return nil unless @urls.has_key?('Action')

      request(@urls['Action'], {}, {}, METHOD_GET)
    rescue => e
      raise RETSException.new("Unable to follow action URL: '#{e}'.")
    end

    # Provides a proxy class to allow for net/http to log its debug to the logger.
    class HTTPDebugLogger
      def initialize(logger)
        @logger = logger
      end

      def <<(data)
        @logger.debug(data)
      end
    end

    #### Exceptions ####

    # This exception should be thrown when a generic client error is encountered.
    #
    # It derives from StandardError so that a plain `rescue` catches it.
    class ClientException < StandardError
    end

    # This exception should be thrown when there is an error with the parser, which is
    # considered a subcomponent of the RETS client. It also includes the XML data that
    # that was being processed at the time of the exception.
    class ParserException < ClientException
      attr_accessor :file
    end

    # The client does not currently support a specified action.
    class Unsupported < ClientException
    end

    # The HTTP response returned by the server indicates that there was an error processing
    # the request and the client cannot continue on its own without intervention.
    class HTTPError < ClientException
      attr_accessor :http_response

      # Takes a HTTPResponse object
      def initialize(http_response)
        self.http_response = http_response
      end

      # Shorthand for calling HTTPResponse#code
      def code
        http_response.code
      end

      # Shorthand for calling HTTPResponse#message
      def message
        http_response.message
      end

      # Returns the RETS specification message for the HTTP response code
      def rets_message
        Client::RETS_HTTP_MESSAGES[code]
      end

      def to_s
        "#{code} #{message}: #{rets_message}"
      end
    end

    # A general RETS level exception was encountered. This would include HTTP and RETS
    # specification level errors as well as informative mishaps such as authentication being
    # required for access.
    class RETSException < RuntimeError
    end

    # There was a problem with logging into the RETS server.
    class LoginError < RETSException
    end

    # For internal client use only, it is thrown when the a RETS request is made but a password
    # is prompted for.
    class AuthRequired < RETSException
    end

    # A RETS transaction failed
    class RETSTransactionException < RETSException; end

    # Search Transaction Exceptions
    class UnknownQueryFieldException < RETSTransactionException; end
    class NoRecordsFoundException < RETSTransactionException; end
    class InvalidSelectException < RETSTransactionException; end
    class MiscellaneousSearchErrorException < RETSTransactionException; end
    class InvalidQuerySyntaxException < RETSTransactionException; end
    class UnauthorizedQueryException < RETSTransactionException; end
    class MaximumRecordsExceededException < RETSTransactionException; end
    # Shared by the Search (20209) and GetObject (20411) reply codes.
    class TimeoutException < RETSTransactionException; end
    class TooManyOutstandingQueriesException < RETSTransactionException; end
    class DTDVersionUnavailableException < RETSTransactionException; end

    # GetObject Exceptions
    class InvalidResourceException < RETSTransactionException; end
    class InvalidTypeException < RETSTransactionException; end
    class InvalidIdentifierException < RETSTransactionException; end
    class NoObjectFoundException < RETSTransactionException; end
    class UnsupportedMIMETypeException < RETSTransactionException; end
    class UnauthorizedRetrievalException < RETSTransactionException; end
    class ResourceUnavailableException < RETSTransactionException; end
    class ObjectUnavailableException < RETSTransactionException; end
    class RequestTooLargeException < RETSTransactionException; end
    class TooManyOutstandingRequestsException < RETSTransactionException; end
    class MiscellaneousErrorException < RETSTransactionException; end

    # Maps RETS reply codes to the exception class that represents them.
    EXCEPTION_TYPES = {
      # Search Transaction Reply Codes
      20200 => UnknownQueryFieldException,
      20201 => NoRecordsFoundException,
      20202 => InvalidSelectException,
      20203 => MiscellaneousSearchErrorException,
      20206 => InvalidQuerySyntaxException,
      20207 => UnauthorizedQueryException,
      20208 => MaximumRecordsExceededException,
      20209 => TimeoutException,
      20210 => TooManyOutstandingQueriesException,
      20514 => DTDVersionUnavailableException,

      # GetObject Reply Codes
      20400 => InvalidResourceException,
      20401 => InvalidTypeException,
      20402 => InvalidIdentifierException,
      20403 => NoObjectFoundException,
      20406 => UnsupportedMIMETypeException,
      20407 => UnauthorizedRetrievalException,
      20408 => ResourceUnavailableException,
      20409 => ObjectUnavailableException,
      20410 => RequestTooLargeException,
      20411 => TimeoutException,
      20412 => TooManyOutstandingRequestsException,
      20413 => MiscellaneousErrorException
    }
  end
end