lib/honeycomb/integrations/redis.rb in honeycomb-beeline-2.4.1 vs lib/honeycomb/integrations/redis.rb in honeycomb-beeline-2.4.2

- old
+ new

@@ -185,76 +185,27 @@ def sanitize(args) args.map! { "[sanitized]" } end - def prettify(arg) - quotes = false - pretty = "".dup - arg.to_s.each_char do |c| - quotes ||= needs_quotes?(c) - pretty << escape(c) - end - quotes ? "\"#{pretty}\"" : pretty - end - # This aims to replicate the algorithms used by redis-cli. # # @see https://github.com/antirez/redis/blob/0f026af185e918a9773148f6ceaa1b084662be88/src/sds.c#L940-L1067 # The redis-cli parsing algorithm # # @see https://github.com/antirez/redis/blob/0f026af185e918a9773148f6ceaa1b084662be88/src/sds.c#L878-L907 # The redis-cli printing algorithm - def escape(char) - return escape_with_backslash(char) if escape_with_backslash?(char) - return escape_with_hex_codes(char) if escape_with_hex_codes?(char) - - char + def prettify(arg) + pretty = arg.to_s.dup + pretty.encode!("UTF-8", "binary", fallback: ->(c) { hex(c) }) + pretty.gsub!(NEEDS_BACKSLASH, BACKSLASH) + pretty.gsub!(NEEDS_HEX) { |c| hex(c) } + pretty =~ NEEDS_QUOTES ? "\"#{pretty}\"" : pretty end - # A lookup table for backslash-escaped characters. + # A regular expression matching characters that need to be hex-encoded. # - # This is used by {#escape_with_backslash?} and {#escape_with_backslash} - # to replicate the hard-coded `case` statements in redis-cli. As of this - # writing, Redis recognizes a handful of standard C escape sequences, - # like "\n" for newlines. - # - # Because {#prettify} will output double quoted strings if any escaping - # is needed, this table must additionally consider the double-quote to be - # a backslash-escaped character. For example, instead of generating - # - # '"hello"' - # - # we'll generate - # - # "\"hello\"" - # - # even though redis-cli would technically recognize the single-quoted - # version. - # - # @see https://github.com/antirez/redis/blob/0f026af185e918a9773148f6ceaa1b084662be88/src/sds.c#L888-L896 - # The redis-cli algorithm for outputting standard escape sequences - BACKSLASHES = { - "\\" => "\\\\", - '"' => '\\"', - "\n" => "\\n", - "\r" => "\\r", - "\t" => "\\t", - "\a" => "\\a", - "\b" => "\\b", - }.freeze - - def escape_with_backslash?(char) - BACKSLASHES.key?(char) - end - - def escape_with_backslash(char) - BACKSLASHES.fetch(char, char) - end - - # Do we need to hex-encode this character? - # # This replicates the C isprint() function that redis-cli uses to decide # whether to escape a character in hexadecimal notation, "\xhh". Any # non-printable character must be represented as a hex escape sequence. # # Normally, we could match this using a negated POSIX bracket expression: @@ -285,23 +236,100 @@ # That is, if the character is not printable (even in Unicode), we'll # escape it; if the character is printable but non-ASCII, we'll also # escape it. # # What's more, Ruby's Regexp#=~ method will blow up if the string does - # not have a valid encoding (e.g., in UTF-8). In this case, though, - # {#escape_with_hex_codes} can still convert the bytes that make up the - # invalid character into a hex code. So we preemptively check for - # invalidly-encoded characters before testing the above match. + # not have a valid encoding (e.g., in UTF-8). We handle this case + # separately, though, using String#encode! with a :fallback option to + # hex-encode invalid UTF-8 byte sequences with {#hex}. # # @see https://ruby-doc.org/core-2.6.5/Regexp.html # @see https://github.com/antirez/redis/blob/0f026af185e918a9773148f6ceaa1b084662be88/src/sds.c#L878-L880 # @see https://github.com/antirez/redis/blob/0f026af185e918a9773148f6ceaa1b084662be88/src/sds.c#L898-L901 # @see https://www.justinweiss.com/articles/3-steps-to-fix-encoding-problems-in-ruby/ - def escape_with_hex_codes?(char) - !char.valid_encoding? || char =~ /[^[:print:]&&[:ascii:]]/ - end + NEEDS_HEX = /[^[:print:]&&[:ascii:]]/.freeze + # A regular expression for characters that need to be backslash-escaped. + # + # Any match of this regexp will be substituted according to the + # {BACKSLASH} table. This includes standard C escape sequences (newlines, + # tabs, etc) as well as a couple special considerations: + # + # 1. Because {#prettify} will output double quoted strings if any + # escaping is needed, we must match double quotes (") so they'll be + # replaced by escaped quotes (\"). + # + # 2. Backslashes themselves get backslash-escaped, so \ becomes \\. + # However, strings with invalid UTF-8 encoding will blow up when we + # try to use String#gsub!, so {#prettify} must first use + # String#encode! to scrub out invalid characters. It does this by + # replacing invalid bytes with hex-encoded escape sequences using + # {#hex}. This will insert sequences like \xhh, which contains a + # backslash that we *don't* want to escape. + # + # Unfortunately, this regexp can't really distinguish between + # backslashes in the original input vs backslashes resulting from the + # UTF-8 fallback. We make an effort by using a negative lookahead. + # That way, only backslashes that *aren't* followed by x + hex digit + + # hex digit will be escaped. + NEEDS_BACKSLASH = /["\n\r\t\a\b]|\\(?!x\h\h)/.freeze + + # A lookup table for backslash-escaped characters. + # + # This is used by {#prettify} to replicate the hard-coded `case` + # statements in redis-cli. As of this writing, Redis recognizes a handful + # of standard C escape sequences, like "\n" for newlines. + # + # Because {#prettify} will output double quoted strings if any escaping + # is needed, this table must additionally consider the double-quote to be + # a backslash-escaped character. For example, instead of generating + # + # '"hello"' + # + # we'll generate + # + # "\"hello\"" + # + # even though redis-cli would technically recognize the single-quoted + # version. + # + # @see https://github.com/antirez/redis/blob/0f026af185e918a9773148f6ceaa1b084662be88/src/sds.c#L888-L896 + # The redis-cli algorithm for outputting standard escape sequences + BACKSLASH = { + "\\" => "\\\\", + '"' => '\\"', + "\n" => "\\n", + "\r" => "\\r", + "\t" => "\\t", + "\a" => "\\a", + "\b" => "\\b", + }.freeze + + # If the final escaped string needs quotes, it will match this regexp. + # + # The overall string returned by {#prettify} should only be quoted if at + # least one of the following holds: + # + # 1. The string contains an escape sequence, broadly demarcated by a + # backslash. This includes standard escape sequences like "\n" and + # "\t" as well as hex-encoded bytes using the "\x" escape sequence. + # Since {#prettify} uses double quotes on its output string, we must + # also force quotes if the string itself contains a literal + # double quote. This double quote behavior is handled tacitly by the + # {NEEDS_BACKSLASH} + {BACKSLASH} replacement. + # + # 2. The string contains a single quote. Since redis-cli recognizes + # single-quoted strings, we want to wrap the {#prettify} output in + # double quotes so that the literal single quote character isn't + # mistaken as the delimiter of a new string. + # + # 3. The string contains any whitespace characters. If the {#prettify} + # output weren't wrapped in quotes, whitespace would act as a + # separator between arguments to the Redis command. To group things + # together, we need to quote the string. + NEEDS_QUOTES = /[\\'\s]/.freeze + # Hex-encodes a (presumably non-printable or non-ASCII) character. # # Aside from standard backslash escape sequences, redis-cli also # recognizes "\xhh" notation, where `hh` is a hexadecimal number. # @@ -324,40 +352,11 @@ # `\x`. # # @see https://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap07.html # @see https://github.com/antirez/redis/blob/0f026af185e918a9773148f6ceaa1b084662be88/src/sds.c#L878-L880 # @see https://github.com/antirez/redis/blob/0f026af185e918a9773148f6ceaa1b084662be88/src/sds.c#L898-L901 - def escape_with_hex_codes(char) + def hex(char) char.bytes.map { |b| Kernel.format("\\x%02x", b) }.join - end - - def escape?(char) - escape_with_backslash?(char) || escape_with_hex_codes?(char) - end - - # Should this character cause {#prettify} to wrap its output in quotes? - # - # The overall string returned by {#prettify} should only be quoted if at - # least one of the following holds: - # - # 1. The string contains a character that needs to be escaped. This - # includes standard backslash escape sequences (like "\n" and "\t") as - # well as hex-encoded bytes using the "\x" escape sequence. Since - # {#prettify} uses double quotes on its output string, we must also - # force quotes if the string itself contains a literal double quote. - # This double quote behavior is handled tacitly by {BACKSLASHES}. - # - # 2. The string contains a single quote. Since redis-cli recognizes - # single-quoted strings, we want to wrap the {#prettify} output in - # double quotes so that the literal single quote character isn't - # mistaken as the delimiter of a new string. - # - # 3. The string contains any whitespace characters. If the {#prettify} - # output weren't wrapped in quotes, whitespace would act as a - # separator between arguments to the Redis command. To group things - # together, we need to quote the string. - def needs_quotes?(char) - escape?(char) || char == "'" || char =~ /\s/ end end end end