# encoding: utf-8
require "logstash-filter-useragent_jars"
require "logstash/filters/base"
require "logstash/namespace"
require 'logstash/plugin_mixins/ecs_compatibility_support'

# Parse user agent strings into structured data based on BrowserScope data
#
# UserAgent filter, adds information about user agent like family, operating
# system, version, and device
#
# Logstash releases ship with the regexes.yaml database made available from
# ua-parser with an Apache 2.0 license. For more details on ua-parser, see
# <https://github.com/ua-parser/uap-core/>.
class LogStash::Filters::UserAgent < LogStash::Filters::Base

  include LogStash::PluginMixins::ECSCompatibilitySupport(:disabled, :v1)

  config_name "useragent"

  # The field containing the user agent string. If this field is an
  # array, only the first value will be used.
  config :source, :validate => :string, :required => true

  # The name of the field to assign user agent data into.
  #
  # If not specified user agent data will be stored in the root of the event.
  config :target, :validate => :string # default [user_agent] in ECS mode

  # `regexes.yaml` file to use
  #
  # If not specified, this will default to the `regexes.yaml` that ships
  # with logstash.
  #
  # You can find the latest version of this here:
  # <https://github.com/ua-parser/uap-core/blob/master/regexes.yaml>
  config :regexes, :validate => :string

  # A string to prepend to all of the extracted keys
  config :prefix, :validate => :string, :default => '' # not supported in ECS mode

  # UA parsing is surprisingly expensive. This filter uses an LRU cache to take advantage of the fact that
  # user agents are often found adjacent to one another in log files and rarely have a random distribution.
  # The higher you set this the more likely an item is to be in the cache and the faster this filter will run.
  # However, if you set this too high you can use more memory than desired.
  #
  # Experiment with different values for this option to find the best performance for your dataset.
  #
  # This MUST be set to a value > 0. There is really no reason to not want this behavior, the overhead is minimal
  # and the speed gains are large.
  #
  # It is important to note that this config value is global. That is to say all instances of the user agent filter
  # share the same cache. The last declared cache size will 'win'. The reason for this is that there would be no benefit
  # to having multiple caches for different instances at different points in the pipeline, that would just increase the
  # number of cache misses and waste memory.
  config :lru_cache_size, :validate => :number, :default => 100_000

  # Pre-computes the event field references every extracted value is written to.
  #
  # In legacy (ECS disabled) mode all fields live under the optional target and
  # carry the configured prefix. In ECS v1 mode the prefix is ignored, the
  # target defaults to `[user_agent]`, and the individual version segments
  # (major/minor/patch) are written under `[@metadata][filter][user_agent]`.
  def initialize(*params)
    super

    # make target in the format [field name] if defined, i.e. surrounded by brackets
    target = @target || ecs_select[disabled: '', v1: '[user_agent]']
    target = "[#{target}]" if !target.empty? && target !~ /^\[[^\[\]]+\]$/

    @name_field = ecs_select[disabled: "[#{@prefix}name]", v1: '[name]']
    @name_field = "#{target}#{@name_field}"
    @device_name_field = ecs_select[disabled: "[#{@prefix}device]", v1: '[device][name]']
    @device_name_field = "#{target}#{@device_name_field}"
    @version_field = ecs_select[disabled: "[#{@prefix}version]", v1: '[version]']
    @version_field = "#{target}#{@version_field}"
    @major_field = ecs_select[disabled: "#{target}[#{@prefix}major]", v1: "[@metadata][filter][user_agent][version][major]"]
    @minor_field = ecs_select[disabled: "#{target}[#{@prefix}minor]", v1: "[@metadata][filter][user_agent][version][minor]"]
    @patch_field = ecs_select[disabled: "#{target}[#{@prefix}patch]", v1: "[@metadata][filter][user_agent][version][patch]"]

    @os_full_name_field = ecs_select[disabled: "[#{@prefix}os_full]", v1: '[os][full]'] # did not exist in legacy prior to ECS-ification
    @os_full_name_field = "#{target}#{@os_full_name_field}"
    @os_name_field = ecs_select[disabled: "[#{@prefix}os_name]", v1: '[os][name]']
    @os_name_field = "#{target}#{@os_name_field}"
    @legacy_os_field = ecs_select[disabled: "#{target}[#{@prefix}os]", v1: nil] # same as [os_name] in legacy mode
    @os_version_field = ecs_select[disabled: "[#{@prefix}os_version]", v1: '[os][version]']
    @os_version_field = "#{target}#{@os_version_field}"
    @os_major_field = ecs_select[disabled: "#{target}[#{@prefix}os_major]", v1: "[@metadata][filter][user_agent][os][version][major]"]
    @os_minor_field = ecs_select[disabled: "#{target}[#{@prefix}os_minor]", v1: "[@metadata][filter][user_agent][os][version][minor]"]
    @os_patch_field = ecs_select[disabled: "#{target}[#{@prefix}os_patch]", v1: "[@metadata][filter][user_agent][os][version][patch]"]

    # NOTE: unfortunately we can not reliably provide `user_agent.original` since the patterns do not
    # reliably give back the matched group and they support the UA string prefixed and/or suffixed
  end

  # Warns about an unsupported prefix in ECS mode and builds the caching
  # parser, using the custom regexes file when one is configured.
  def register
    if ecs_compatibility != :disabled && @prefix && !@prefix.empty?
      @logger.warn "Field prefix isn't supported in ECS compatibility mode, please remove `prefix => #{@prefix.inspect}`"
    end

    if @regexes.nil?
      @parser = org.logstash.uaparser.CachingParser.new(lru_cache_size)
    else
      @logger.debug("Using user agent regexes", :regexes => @regexes)
      @parser = org.logstash.uaparser.CachingParser.new(@regexes, lru_cache_size)
    end
  end

  # Parses the user agent found in the source field of the event and stores the
  # extracted data on the event. Events with a missing or empty source value
  # are left untouched; parse failures are logged and the event is skipped.
  def filter(event)
    useragent = event.get(@source)
    useragent = useragent.first if useragent.is_a?(Array)

    return if useragent.nil?
    # Only ask for emptiness when the value supports it: a value that does not
    # respond to `empty?` (e.g. a number) would otherwise raise NoMethodError
    # here, outside of the rescue below, instead of being reported via the log.
    return if useragent.respond_to?(:empty?) && useragent.empty?

    begin
      ua_data = lookup_useragent(useragent)
    rescue => e
      @logger.error("Unknown error while parsing user agent data",
                    :exception => e.class, :message => e.message, :backtrace => e.backtrace,
                    :field => @source, :event => event.to_hash)
      return
    end

    return unless ua_data

    # when target and source are the same field, drop the raw string first so
    # that the structured sub-fields can be written in its place
    event.remove(@source) if @target == @source
    set_fields(event, useragent, ua_data)

    filter_matched(event)
  end

  private

  # Delegates the lookup to the parser built in #register.
  def lookup_useragent(useragent)
    @parser.parse(useragent)
  end

  # Copies the parsed user agent, device and OS details onto the event.
  #
  # event     - the event being filtered
  # ua_source - the raw user agent string that was parsed
  # ua_data   - the parser result (exposes userAgent, device and os)
  def set_fields(event, ua_source, ua_data)
    # UserAgentParser strings are US-ASCII

    ua = ua_data.userAgent
    event.set(@name_field, duped_string(ua.family))
    event.set(@device_name_field, duped_string(ua_data.device)) if ua_data.device

    event.set(@major_field, duped_string(ua.major)) if ua.major
    event.set(@minor_field, duped_string(ua.minor)) if ua.minor
    event.set(@patch_field, duped_string(ua.patch)) if ua.patch
    set_version(event, ua_source, ua) # UA version string e.g. "89.0.4389.90"

    os = ua_data.os
    return unless os

    # os.major, os.minor, ... are all strings
    event.set(@os_major_field, duped_string(os.major)) if os.major # e.g. 'Vista' or '10'
    event.set(@os_minor_field, duped_string(os.minor)) if os.minor
    event.set(@os_patch_field, duped_string(os.patch)) if os.patch

    os_version = build_os_version(os)
    event.set(@os_version_field, os_version) if os_version

    os_name = os.family
    return unless os_name

    os_name = duped_string(os_name)
    event.set(@os_name_field, os_name)
    # every field gets its own copy so that a later mutation of one does not leak into another
    event.set(@legacy_os_field, os_name.dup) if @legacy_os_field
    os_full_name = os_name.dup
    os_full_name << ' ' << os_version if os_version
    event.set(@os_full_name_field, os_full_name)
  end

  # reconstruct and set the User-Agent version string
  #
  # Nothing is set when the parser did not provide a major version.
  def set_version(event, ua_source, ua)
    return unless @version_field && ua.major

    # only Chrome has all 4 segments, while Firefox only uses major.minor
    version = duped_string(ua.major)
    if ua.minor
      version << '.' << ua.minor
      if ua.patch
        version << '.' << ua.patch
        if ua.patchMinor
          version << '.' << ua.patchMinor
        else
          # the 4th segment was not matched, try to recover it from the raw string
          adjusted_version = check_and_adjust_version(ua_source, version)
          version = adjusted_version if adjusted_version
        end
      end
    end
    event.set(@version_field, version)
  end

  # Tries to complete a major.minor.patch version with a 4th (patch-minor)
  # segment taken directly from the raw user agent string.
  #
  # Returns nil when there is no version or the version is not literally
  # contained in the raw string, the extended version when a purely numeric
  # segment follows it (terminated by a space), and the unchanged version
  # otherwise.
  def check_and_adjust_version(ua_source, version)
    # only adjust the UA version if it's not 'interpreted' (contained in UA string)
    return nil if !version || (i = ua_source.index(version)).nil?
    i += version.size
    # complete version when patchMinor is not matched but still there
    if ua_source[i] == '.' # we built the version with dots
      if (space_index = ua_source.index(' ', i + 1))
        patch_minor = ua_source[i + 1...space_index]
        # accept the segment only when it is a plain integer
        version = "#{version}.#{patch_minor}" if patch_minor.eql?(patch_minor.to_i.to_s)
      end
    end
    version
  end

  # reconstructs the OS version string
  #
  # Returns nil when the OS has no major version. A numeric major is joined to
  # the minor with a dot, a non-numeric major (e.g. 'Vista') with a space.
  def build_os_version(os)
    # NOTE: UA regexes don't always give us the versions back
    # they do get "corrected" for various OSes such as:
    # - Windows (Windows NT 6.0 => 'Vista')
    # - Windows ('Windows NT 6.3' => '8','1')
    # - Windows ('Windows NT 10.0' => '10')
    # - iOS ('Darwin/15.5' => '9','3','2')

    return unless major = os.major

    sep = major.to_i.to_s == major ? '.' : ' '
    version = duped_string(major)
    if os.minor
      version << sep << os.minor
      if os.patch
        version << '.' << os.patch
        if os.patchMinor
          version << '.' << os.patchMinor
        end
      end
    end
    version
  end

  # Returns a UTF-8 tagged copy of the given string.
  def duped_string(str)
    # Calls in here use #dup because there's potential for later filters to modify these values
    # and corrupt the cache. See uap source here for details https://github.com/ua-parser/uap-ruby/tree/master/lib/user_agent_parser
    str.dup.force_encoding(Encoding::UTF_8)
  end

end