# encoding: utf-8
require "logstash/filters/base"
require "logstash/namespace"
require "lru_redux"
require "tempfile"
# Parse user agent strings into structured data based on BrowserScope data
#
# UserAgent filter, adds information about user agent like family, operating
# system, version, and device
#
# Logstash releases ship with the regexes.yaml database made available from
# ua-parser with an Apache 2.0 license. For more details on ua-parser, see
# <https://github.com/ua-parser/uap-core>.
class LogStash::Filters::UserAgent < LogStash::Filters::Base
# LRU cache of parse results held in a class constant, so it is shared by
# every instance of this filter. It is created with room for 1000 entries and
# `register` resizes it to the configured `lru_cache_size`.
LOOKUP_CACHE = LruRedux::ThreadSafeCache.new(1000)
config_name "useragent"
# The field containing the user agent string. If this field is an
# array, only the first value will be used.
config :source, :validate => :string, :required => true
# The name of the field to assign user agent data into.
#
# If not specified user agent data will be stored in the root of the event.
# If it names the same field as `source`, that field is replaced by the
# extracted data.
config :target, :validate => :string
# `regexes.yaml` file to use
#
# If not specified, this will default to the `regexes.yaml` that ships
# with logstash.
#
# You can find the latest version of this here:
# <https://github.com/ua-parser/uap-core/blob/master/regexes.yaml>
config :regexes, :validate => :string
# A string to prepend to all of the extracted keys
config :prefix, :validate => :string, :default => ''
# UA parsing is surprisingly expensive. This filter uses an LRU cache to take advantage of the fact that
# user agents are often found adjacent to one another in log files and rarely have a random distribution.
# The higher you set this the more likely an item is to be in the cache and the faster this filter will run.
# However, if you set this too high you can use more memory than desired.
#
# Experiment with different values for this option to find the best performance for your dataset.
#
# This MUST be set to a value > 0. There is really no reason to not want this behavior, the overhead is minimal
# and the speed gains are large.
#
# It is important to note that this config value is global. That is to say all instances of the user agent filter
# share the same cache. The last declared cache size will 'win'. The reason for this is that there would be no benefit
# to having multiple caches for different instances at different points in the pipeline, that would just increase the
# number of cache misses and waste memory.
config :lru_cache_size, :validate => :number, :default => 1000
public
# Builds the user agent parser and sizes the shared lookup cache.
#
# When no `regexes` file is configured, the parser is first created with its
# own default patterns; if that fails with a StandardError, the
# `regexes.yaml` under this plugin's `vendor` directory is tried instead.
# When `regexes` is configured, that file is loaded directly and any failure
# propagates to the caller unchanged.
#
# Raises RuntimeError when neither the default nor the vendored patterns can
# be loaded.
def register
  # Loaded when the filter is registered rather than when this file is read.
  require 'user_agent_parser'

  if @regexes.nil?
    begin
      @parser = UserAgentParser::Parser.new
    rescue StandardError
      # Only ordinary errors trigger the fallback. Interrupts, exit requests
      # and out-of-memory conditions are not StandardErrors and must keep
      # propagating instead of being mistaken for a missing patterns file.
      begin
        path = ::File.expand_path('../../../vendor/regexes.yaml', ::File.dirname(__FILE__))
        @parser = UserAgentParser::Parser.new(:patterns_path => path)
      rescue => ex
        # NOTE(review): the message mentions caching although this path only
        # loads the patterns file; the text is kept unchanged for
        # compatibility with existing log searches.
        raise "Failed to cache, due to: #{ex}\n"
      end
    end
  else
    @logger.info("Using user agent regexes", :regexes => @regexes)
    @parser = UserAgentParser::Parser.new(:patterns_path => @regexes)
  end

  # The cache is shared by all instances, so the most recently registered
  # instance decides its size.
  LOOKUP_CACHE.max_size = @lru_cache_size
end #def register
public
# Parses the user agent found in the event's `source` field and writes the
# extracted data into the event.
#
# When the source field holds an array, only its first element is parsed.
# Nothing is written (and the event is not marked as matched) when the field
# is missing or when parsing raises a StandardError; the latter is logged.
#
# Destination of the extracted keys:
# * no `target` configured  -> the event itself
# * `target` equals `source` -> the source field is replaced by a new hash
# * otherwise                -> the target field, created as a hash if absent
def filter(event)
  raw_agent = event[@source]
  raw_agent = raw_agent.first if raw_agent.is_a?(Array)

  parsed = begin
    lookup_useragent(raw_agent)
  rescue StandardError => e
    @logger.error("Uknown error while parsing user agent data", :exception => e, :field => @source, :event => event)
    return
  end
  return unless parsed

  destination =
    if @target.nil?
      event
    elsif @target == @source
      event[@source] = {}
    else
      event[@target] ||= {}
    end

  write_to_target(destination, parsed)
  filter_matched(event)
end # def filter
# Returns the parsed data for +useragent+, consulting the shared cache first.
#
# useragent - the raw user agent value taken from the event; nil yields nil.
#
# Returns whatever the parser's `parse` returns, either fresh or cached.
# Errors raised by the parser propagate to the caller.
def lookup_useragent(useragent)
  return unless useragent

  # The cache is shared by every filter instance, while each instance may be
  # configured with its own patterns file. Including that file in the key
  # keeps one instance from being served results parsed with another
  # instance's patterns.
  cached = LOOKUP_CACHE[[@regexes, useragent]]
  return cached if cached

  ua_data = @parser.parse(useragent)
  # Store under a private frozen copy of the string so that later in-place
  # edits of the event's field cannot alter a key already in the cache.
  stored_agent = useragent.is_a?(String) ? useragent.dup.freeze : useragent
  LOOKUP_CACHE[[@regexes, stored_agent]] = ua_data
  ua_data
end
# Copies the parsed user agent fields into +target+, prefixing each key with
# the configured `prefix`.
#
# target  - the object receiving the keys (written through `[]=`).
# ua_data - the parse result; read through `name`, `os`, `device`, `version`.
#
# Keys written: name, os, os_name, os_major, os_minor, device, major, minor,
# patch, build. Every key except name is written only when its source value
# is present.
def write_to_target(target, ua_data)
  # Every stored value is a UTF-8 tagged copy. The parser hands out US-ASCII
  # strings, and the copy matters because parse results live in a shared
  # cache: a later filter mutating an event value must not corrupt it.
  # See https://github.com/ua-parser/uap-ruby/tree/master/lib/user_agent_parser
  store = lambda do |key, raw|
    target[@prefix + key] = raw.dup.force_encoding(Encoding::UTF_8)
  end

  store.call("name", ua_data.name)

  # OS X, Android and maybe iOS parse to this level of detail; Windows user
  # agents do not.
  if (os = ua_data.os)
    # The OS is a rich object: its string form goes under "os", its parts
    # under the os_* keys.
    store.call("os", ua_data.os.to_s)
    store.call("os_name", os.name) if os.name
    if (os_version = os.version)
      store.call("os_major", os_version.major) if os_version.major
      store.call("os_minor", os_version.minor) if os_version.minor
    end
  end

  store.call("device", ua_data.device.to_s) if ua_data.device

  if (ua_version = ua_data.version)
    store.call("major", ua_version.major) if ua_version.major
    store.call("minor", ua_version.minor) if ua_version.minor
    store.call("patch", ua_version.patch) if ua_version.patch
    store.call("build", ua_version.patch_minor) if ua_version.patch_minor
  end
end
end # class LogStash::Filters::UserAgent