require 'socket' require 'net/https' require 'net/http' require 'logger' require 'zlib' require 'stringio' module NewRelic module Agent # The Agent is a singleton that is instantiated when the plugin is # activated. It collects performance data from ruby applications # in realtime as the application runs, and periodically sends that # data to the NewRelic server. class Agent # Specifies the version of the agent's communication protocol with # the NewRelic hosted site. PROTOCOL_VERSION = 8 # 14105: v8 (tag 2.10.3) # (no v7) # 10379: v6 (not tagged) # 4078: v5 (tag 2.5.4) # 2292: v4 (tag 2.3.6) # 1754: v3 (tag 2.3.0) # 534: v2 (shows up in 2.1.0, our first tag) def initialize @launch_time = Time.now @metric_ids = {} @histogram = NewRelic::Histogram.new(NewRelic::Control.instance.apdex_t / 10) @stats_engine = NewRelic::Agent::StatsEngine.new @transaction_sampler = NewRelic::Agent::TransactionSampler.new @stats_engine.transaction_sampler = @transaction_sampler @error_collector = NewRelic::Agent::ErrorCollector.new @request_timeout = NewRelic::Control.instance.fetch('timeout', 2 * 60) @last_harvest_time = Time.now @obfuscator = method(:default_sql_obfuscator) end module ClassMethods # Should only be called by NewRelic::Control def instance @instance ||= self.new end end module InstanceMethods attr_reader :obfuscator attr_reader :stats_engine attr_reader :transaction_sampler attr_reader :error_collector attr_reader :record_sql attr_reader :histogram attr_reader :metric_ids attr_reader :url_rules def record_transaction(duration_seconds, options={}) is_error = options['is_error'] || options['error_message'] || options['exception'] metric = options['metric'] metric ||= options['uri'] # normalize this with url rules raise "metric or uri arguments required" unless metric metric_info = NewRelic::MetricParser::MetricParser.for_metric_named(metric) if metric_info.is_web_transaction? NewRelic::Agent::Instrumentation::MetricFrame.record_apdex(metric_info, duration_seconds, duration_seconds, is_error) histogram.process(duration_seconds) end metrics = metric_info.summary_metrics metrics << metric metrics.each do |name| stats = stats_engine.get_stats_no_scope(name) stats.record_data_point(duration_seconds) end if is_error if options['exception'] e = options['exception'] elsif options['error_message'] e = Exception.new options['error_message'] else e = Exception.new 'Unknown Error' end error_collector.notice_error e, :uri => options['uri'], :metric => metric end # busy time ? end # This method is deprecated. Use NewRelic::Agent.manual_start def manual_start(ignored=nil, also_ignored=nil) raise "This method no longer supported. Instead use the class method NewRelic::Agent.manual_start" end # This method should be called in a forked process after a fork. # It assumes the parent process initialized the agent, but does # not assume the agent started. # # The call is idempotent, but not re-entrant. # # * It clears any metrics carried over from the parent process # * Restarts the sampler thread if necessary # * Initiates a new agent run and worker loop unless that was done # in the parent process and +:force_reconnect+ is not true # # Options: # * :force_reconnect => true to force the spawned process to # establish a new connection, such as when forking a long running process. # The default is false--it will only connect to the server if the parent # had not connected. # * :keep_retrying => false if we try to initiate a new # connection, this tells me to only try it once so this method returns # quickly if there is some kind of latency with the server. def after_fork(options={}) # @connected gets false after we fail to connect or have an error # connecting. @connected has nil if we haven't finished trying to connect. # or we didn't attempt a connection because this is the master process # log.debug "Agent received after_fork notice in #$$: [#{control.agent_enabled?}; monitor=#{control.monitor_mode?}; connected: #{@connected.inspect}; thread=#{@worker_thread.inspect}]" return if !control.agent_enabled? or !control.monitor_mode? or @connected == false or @worker_thread && @worker_thread.alive? log.info "Starting the worker thread in #$$ after forking." # Clear out stats that are left over from parent process reset_stats # Don't ever check to see if this is a spawner. If we're in a forked process # I'm pretty sure we're not also forking new instances. start_worker_thread(options) @stats_engine.start_sampler_thread end # True if we have initialized and completed 'start' def started? @started end # Return nil if not yet connected, true if successfully started # and false if we failed to start. def connected? @connected end # Attempt a graceful shutdown of the agent. def shutdown return if not started? if @worker_loop @worker_loop.stop log.debug "Starting Agent shutdown" # if litespeed, then ignore all future SIGUSR1 - it's # litespeed trying to shut us down if control.dispatcher == :litespeed Signal.trap("SIGUSR1", "IGNORE") Signal.trap("SIGTERM", "IGNORE") end begin NewRelic::Agent.disable_all_tracing do graceful_disconnect end rescue => e log.error e log.error e.backtrace.join("\n") end end @started = nil end def start_transaction @stats_engine.start_transaction end def end_transaction @stats_engine.end_transaction end def set_record_sql(should_record) prev = Thread::current[:record_sql] Thread::current[:record_sql] = should_record prev.nil? || prev end def set_record_tt(should_record) prev = Thread::current[:record_tt] Thread::current[:record_tt] = should_record prev.nil? || prev end # Push flag indicating whether we should be tracing in this # thread. def push_trace_execution_flag(should_trace=false) (Thread.current[:newrelic_untraced] ||= []) << should_trace end # Pop the current trace execution status. Restore trace execution status # to what it was before we pushed the current flag. def pop_trace_execution_flag Thread.current[:newrelic_untraced].pop if Thread.current[:newrelic_untraced] end def set_sql_obfuscator(type, &block) if type == :before @obfuscator = NewRelic::ChainedCall.new(block, @obfuscator) elsif type == :after @obfuscator = NewRelic::ChainedCall.new(@obfuscator, block) elsif type == :replace @obfuscator = block else fail "unknown sql_obfuscator type #{type}" end end def log NewRelic::Agent.logger end # Start up the agent. This verifies that the agent_enabled? is # true and initializes the sampler based on the current # configuration settings. Then it will fire up the background # thread for sending data to the server if applicable. def start if started? control.log! "Agent Started Already!", :error return end return if !control.agent_enabled? @started = true @local_host = determine_host if control.dispatcher.nil? || control.dispatcher.to_s.empty? log.info "No dispatcher detected." else log.info "Dispatcher: #{control.dispatcher.to_s}" end log.info "Application: #{control.app_names.join(", ")}" unless control.app_names.empty? sampler_config = control.fetch('transaction_tracer', {}) # TODO: Should move this state into the transaction sampler instance @should_send_samples = @config_should_send_samples = sampler_config.fetch('enabled', true) @should_send_random_samples = sampler_config.fetch('random_sample', false) @explain_threshold = sampler_config.fetch('explain_threshold', 0.5).to_f @explain_enabled = sampler_config.fetch('explain_enabled', true) @record_sql = sampler_config.fetch('record_sql', :obfuscated).to_sym # use transaction_threshold: 4.0 to force the TT collection # threshold to 4 seconds # use transaction_threshold: apdex_f to use your apdex t value # multiplied by 4 # undefined transaction_threshold defaults to 2.0 apdex_f = 4 * NewRelic::Control.instance.apdex_t @slowest_transaction_threshold = sampler_config.fetch('transaction_threshold', 2.0) if @slowest_transaction_threshold =~ /apdex_f/i @slowest_transaction_threshold = apdex_f end @slowest_transaction_threshold = @slowest_transaction_threshold.to_f log.warn "Agent is configured to send raw SQL to RPM service" if @record_sql == :raw case when !control.monitor_mode? log.warn "Agent configured not to send data in this environment - edit newrelic.yml to change this" when !control.license_key log.error "No license key found. Please edit your newrelic.yml file and insert your license key." when control.license_key.length != 40 log.error "Invalid license key: #{control.license_key}" when [:passenger, :unicorn].include?(control.dispatcher) log.info "Connecting workers after forking." else # Do the connect in the foreground if we are in sync mode NewRelic::Agent.disable_all_tracing { connect(:keep_retrying => false) } if control.sync_startup # Start the event loop and initiate connection if necessary start_worker_thread # Our shutdown handler needs to run after other shutdown handlers # that may be doing things like running the app (hello sinatra). if control.send_data_on_exit if RUBY_VERSION =~ /rubinius/i list = at_exit { shutdown } # move the shutdown handler to the front of the list, to # execute last: list.unshift(list.pop) elsif !defined?(JRuby) or !defined?(Sinatra::Application) at_exit { at_exit { shutdown } } end end end log.info "New Relic RPM Agent #{NewRelic::VERSION::STRING} Initialized: pid = #$$" log.info "Agent Log found in #{NewRelic::Control.instance.log_file}" if NewRelic::Control.instance.log_file end # Clear out the metric data, errors, and transaction traces. Reset the histogram data. def reset_stats @stats_engine.reset_stats @unsent_errors = [] @traces = nil @unsent_timeslice_data = {} @last_harvest_time = Time.now @launch_time = Time.now @histogram = NewRelic::Histogram.new(NewRelic::Control.instance.apdex_t / 10) end private def collector @collector ||= control.server end # Try to launch the worker thread and connect to the server. # # See #connect for a description of connection_options. def start_worker_thread(connection_options = {}) log.debug "Creating RPM worker thread." @worker_thread = Thread.new do begin NewRelic::Agent.disable_all_tracing do # We try to connect. If this returns false that means # the server rejected us for a licensing reason and we should # just exit the thread. If it returns nil # that means it didn't try to connect because we're in the master. connect(connection_options) if @connected # disable transaction sampling if disabled by the server and we're not in dev mode if !control.developer_mode? && !@should_send_samples @transaction_sampler.disable else @transaction_sampler.enable # otherwise ensure TT's are enabled end log.info "Reporting performance data every #{@report_period} seconds." log.debug "Running worker loop" # Note if the agent attempts to report more frequently than allowed by the server # the server will start dropping data. @worker_loop = WorkerLoop.new @worker_loop.run(@report_period) do harvest_and_send_timeslice_data harvest_and_send_slowest_sample if @should_send_samples harvest_and_send_errors if error_collector.enabled end else log.debug "No connection. Worker thread finished." end end rescue NewRelic::Agent::ForceRestartException => e log.info e.message # disconnect and start over. # clear the stats engine reset_stats @metric_ids = {} @connected = nil # Wait a short time before trying to reconnect sleep 30 retry rescue NewRelic::Agent::ForceDisconnectException => e # when a disconnect is requested, stop the current thread, which # is the worker thread that gathers data and talks to the # server. log.error "RPM forced this agent to disconnect (#{e.message})" @connected = false rescue NewRelic::Agent::ServerConnectionException => e log.error "Unable to establish connection with the server. Run with log level set to debug for more information." log.debug("#{e.class.name}: #{e.message}\n#{e.backtrace.first}") @connected = false rescue Exception => e log.error "Terminating worker loop: #{e.class.name}: #{e}\n #{e.backtrace.join("\n ")}" @connected = false end # begin end # thread new @worker_thread['newrelic_label'] = 'Worker Loop' end def control NewRelic::Control.instance end # Connect to the server and validate the license. If successful, # @connected has true when finished. If not successful, you can # keep calling this. Return false if we could not establish a # connection with the server and we should not retry, such as if # there's a bad license key. # # Set keep_retrying=false to disable retrying and return asap, such as when # invoked in the foreground. Otherwise this runs until a successful # connection is made, or the server rejects us. # # * :keep_retrying => false to only try to connect once, and # return with the connection set to nil. This ensures we may try again # later (default true). # * force_reconnect => true if you want to establish a new connection # to the server before running the worker loop. This means you get a separate # agent run and RPM sees it as a separate instance (default is false). def connect(options) # Don't proceed if we already connected (@connected=true) or if we tried # to connect and were rejected with prejudice because of a license issue # (@connected=false). return if !@connected.nil? && !options[:force_reconnect] keep_retrying = options[:keep_retrying].nil? || options[:keep_retrying] # wait a few seconds for the web server to boot, necessary in development connect_retry_period = keep_retrying ? 10 : 0 connect_attempts = 0 @agent_id = nil begin sleep connect_retry_period.to_i log.debug "Connecting Process to RPM: #$0" host = invoke_remote(:get_redirect_host) @collector = control.server_from_host(host) if host environment = control['send_environment_info'] != false ? control.local_env.snapshot : [] log.debug "Connecting with validation seed/token: #{control.validate_seed}/#{control.validate_token}" if control.validate_seed connect_data = invoke_remote :connect, :pid => $$, :host => @local_host, :app_name => control.app_names, :language => 'ruby', :agent_version => NewRelic::VERSION::STRING, :environment => environment, :settings => control.settings, :validate => {:seed => control.validate_seed, :token => control.validate_token } @agent_id = connect_data['agent_run_id'] @report_period = connect_data['data_report_period'] @url_rules = connect_data['url_rules'] control.log! "Connected to NewRelic Service at #{@collector}" log.debug "Agent Run = #{@agent_id}." log.debug "Connection data = #{connect_data.inspect}" # Ask the server for permission to send transaction samples. # determined by subscription license. @should_send_samples = @config_should_send_samples && connect_data['collect_traces'] if @should_send_samples if @should_send_random_samples @transaction_sampler.random_sampling = true @transaction_sampler.sampling_rate = connect_data['sampling_rate'] log.info "Transaction sampling enabled, rate = #{@transaction_sampler.sampling_rate}" end log.debug "Transaction tracing threshold is #{@slowest_transaction_threshold} seconds." else log.debug "Transaction traces will not be sent to the RPM service." end # Ask for permission to collect error data error_collector.enabled = error_collector.config_enabled && connect_data['collect_errors'] log.debug "Errors will be sent to the RPM service." if error_collector.enabled @connected_pid = $$ @connected = true rescue NewRelic::Agent::LicenseException => e log.error e.message log.info "Visit NewRelic.com to obtain a valid license key, or to upgrade your account." @connected = false rescue Timeout::Error, StandardError => e if e.instance_of? NewRelic::Agent::ServerConnectionException log.info "Unable to establish connection with New Relic RPM Service at #{control.server}: #{e.message}" log.debug e.backtrace.join("\n") else log.error "Error establishing connection with New Relic RPM Service at #{control.server}: #{e.message}" log.debug e.backtrace.join("\n") end # retry logic if keep_retrying connect_attempts += 1 case connect_attempts when 1..2 connect_retry_period, period_msg = 60, "1 minute" when 3..5 connect_retry_period, period_msg = 60 * 2, "2 minutes" else connect_retry_period, period_msg = 5 * 60, "5 minutes" end log.info "Will re-attempt in #{period_msg}" retry else @connected = nil end end end def determine_host Socket.gethostname end def determine_home_directory control.root end def is_application_spawner? $0 =~ /ApplicationSpawner|^unicorn\S* master/ end def harvest_and_send_timeslice_data NewRelic::Agent::BusyCalculator.harvest_busy now = Time.now @unsent_timeslice_data ||= {} @unsent_timeslice_data = @stats_engine.harvest_timeslice_data(@unsent_timeslice_data, @metric_ids) begin # In this version of the protocol, we get back an assoc array of spec to id. metric_ids = invoke_remote(:metric_data, @agent_id, @last_harvest_time.to_f, now.to_f, @unsent_timeslice_data.values) rescue Timeout::Error # assume that the data was received. chances are that it was metric_ids = nil end metric_ids.each do | spec, id | @metric_ids[spec] = id end if metric_ids log.debug "#{now}: sent #{@unsent_timeslice_data.length} timeslices (#{@agent_id}) in #{Time.now - now} seconds" # if we successfully invoked this web service, then clear the unsent message cache. @unsent_timeslice_data = {} @last_harvest_time = now # handle_messages # note - exceptions are logged in invoke_remote. If an exception is encountered here, # then the metric data is downsampled for another timeslices end def harvest_and_send_slowest_sample @traces = @transaction_sampler.harvest(@traces, @slowest_transaction_threshold) unless @traces.empty? now = Time.now log.debug "Sending (#{@traces.length}) transaction traces" begin # take the traces and prepare them for sending across the # wire. This includes gathering SQL explanations, stripping # out stack traces, and normalizing SQL. note that we # explain only the sql statements whose segments' execution # times exceed our threshold (to avoid unnecessary overhead # of running explains on fast queries.) options = { :keep_backtraces => true } options[:record_sql] = @record_sql unless @record_sql == :off options[:explain_sql] = @explain_threshold if @explain_enabled traces = @traces.collect {|trace| trace.prepare_to_send(options)} invoke_remote :transaction_sample_data, @agent_id, traces rescue PostTooBigException # we tried to send too much data, drop the first trace and # try again retry if @traces.shift end log.debug "Sent slowest sample (#{@agent_id}) in #{Time.now - now} seconds" end # if we succeed sending this sample, then we don't need to keep # the slowest sample around - it has been sent already and we # can collect the next one @traces = nil # note - exceptions are logged in invoke_remote. If an # exception is encountered here, then the slowest sample of is # determined of the entire period since the last reported # sample. end def harvest_and_send_errors @unsent_errors = @error_collector.harvest_errors(@unsent_errors) if @unsent_errors && @unsent_errors.length > 0 log.debug "Sending #{@unsent_errors.length} errors" begin invoke_remote :error_data, @agent_id, @unsent_errors rescue PostTooBigException @unsent_errors.shift retry end # if the remote invocation fails, then we never clear # @unsent_errors, and therefore we can re-attempt to send on # the next heartbeat. Note the error collector maxes out at # 20 instances to prevent leakage @unsent_errors = [] end end def compress_data(object) dump = Marshal.dump(object) # this checks to make sure mongrel won't choke on big uploads check_post_size(dump) # we currently optimize for CPU here since we get roughly a 10x # reduction in message size with this, and CPU overhead is at a # premium. For extra-large posts, we use the higher compression # since otherwise it actually errors out. dump_size = dump.size # Compress if content is smaller than 64kb. There are problems # with bugs in Ruby in some versions that expose us to a risk of # segfaults if we compress aggressively. return [dump, 'identity'] if dump_size < (64*1024) # medium payloads get fast compression, to save CPU # big payloads get all the compression possible, to stay under # the 2,000,000 byte post threshold compression = dump_size < 2000000 ? Zlib::BEST_SPEED : Zlib::BEST_COMPRESSION [Zlib::Deflate.deflate(dump, compression), 'deflate'] end def check_post_size(post_string) # TODO: define this as a config option on the server side return if post_string.size < control.post_size_limit log.warn "Tried to send too much data: #{post_string.size} bytes" raise PostTooBigException end def send_request(opts) request = Net::HTTP::Post.new(opts[:uri], 'CONTENT-ENCODING' => opts[:encoding], 'HOST' => opts[:collector].name) request.content_type = "application/octet-stream" request.body = opts[:data] log.debug "Connect to #{opts[:collector]}#{opts[:uri]}" response = nil http = control.http_connection(collector) http.read_timeout = nil begin NewRelic::TimerLib.timeout(@request_timeout) do response = http.request(request) end rescue Timeout::Error log.warn "Timed out trying to post data to RPM (timeout = #{@request_timeout} seconds)" unless @request_timeout < 30 raise end if response.is_a? Net::HTTPServiceUnavailable raise NewRelic::Agent::ServerConnectionException, "Service unavailable (#{response.code}): #{response.message}" elsif response.is_a? Net::HTTPGatewayTimeOut log.debug("Timed out getting response: #{response.message}") raise Timeout::Error, response.message elsif response.is_a? Net::HTTPRequestEntityTooLarge raise PostTooBigException elsif !(response.is_a? Net::HTTPSuccess) raise NewRelic::Agent::ServerConnectionException, "Unexpected response from server (#{response.code}): #{response.message}" end response end def decompress_response(response) if response['content-encoding'] != 'gzip' log.debug "Uncompressed content returned" return response.body end log.debug "Decompressing return value" i = Zlib::GzipReader.new(StringIO.new(response.body)) i.read end def check_for_exception(response) dump = decompress_response(response) value = Marshal.load(dump) raise value if value.is_a? Exception value end def remote_method_uri(method) uri = "/agent_listener/#{PROTOCOL_VERSION}/#{control.license_key}/#{method}" uri << "?run_id=#{@agent_id}" if @agent_id uri end # send a message via post def invoke_remote(method, *args) #determines whether to zip the data or send plain post_data, encoding = compress_data(args) response = send_request({:uri => remote_method_uri(method), :encoding => encoding, :collector => collector, :data => post_data}) # raises the right exception if the remote server tells it to die return check_for_exception(response) rescue NewRelic::Agent::ForceRestartException => e log.info e.message raise rescue SystemCallError, SocketError => e # These include Errno connection errors raise NewRelic::Agent::ServerConnectionException, "Recoverable error connecting to the server: #{e}" end def graceful_disconnect if @connected begin @request_timeout = 10 log.debug "Flushing unsent metric data to server" @worker_loop.run_task if @connected_pid == $$ log.debug "Sending RPM service agent run shutdown message" invoke_remote :shutdown, @agent_id, Time.now.to_f else log.debug "This agent connected from parent process #{@connected_pid}--not sending shutdown" end log.debug "Graceful disconnect complete" rescue Timeout::Error, StandardError end else log.debug "Bypassing graceful disconnect - agent not connected" end end def default_sql_obfuscator(sql) sql = sql.dup # This is hardly readable. Use the unit tests. # remove single quoted strings: sql.gsub!(/'(.*?[^\\'])??'(?!')/, '?') # remove double quoted strings: sql.gsub!(/"(.*?[^\\"])??"(?!")/, '?') # replace all number literals sql.gsub!(/\d+/, "?") sql end end extend ClassMethods include InstanceMethods end end end