Skip to content

Commit

Permalink
Better handle different encodings (#316)
Browse files Browse the repository at this point in the history
* Handle non UTF-8 compatible values for `Error` object
* Add `Datadog::Utils.utf8_encode` method
* Use `Utils.utf8_encode` for sanitization and quantization
* Add test cases for `Utils.utf8_encode`
* Move constant to top of the file
* Use `debug` log level for encoding issues
  • Loading branch information
p-lambert authored and Emanuele Palazzetti committed Jan 17, 2018
1 parent 89ca8fc commit 79bfe4b
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 20 deletions.
7 changes: 5 additions & 2 deletions lib/ddtrace/contrib/dalli/quantize.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,13 @@ module Quantize
module_function

def format_command(operation, args)
placeholder = "#{operation} BLOB (OMITTED)"
command = [operation, *args].join(' ').strip
command = Utils.utf8_encode(command, binary: true, placeholder: placeholder)
Utils.truncate(command, MAX_CMD_LENGTH)
rescue ::Encoding::CompatibilityError
"#{operation} BLOB (OMITTED)"
rescue => e
Tracer.log.debug("Error sanitizing Dalli operation: #{e}")
placeholder
end
end
end
Expand Down
4 changes: 2 additions & 2 deletions lib/ddtrace/contrib/redis/quantize.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ module Quantize

def format_arg(arg)
str = arg.is_a?(Symbol) ? arg.to_s.upcase : arg.to_s
str = str.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
str = Utils.utf8_encode(str, binary: true, placeholder: PLACEHOLDER)
Utils.truncate(str, VALUE_MAX_LEN, TOO_LONG_MARK)
rescue StandardError => e
rescue => e
Datadog::Tracer.log.debug("non formattable Redis arg #{str}: #{e}")
PLACEHOLDER
end
Expand Down
16 changes: 3 additions & 13 deletions lib/ddtrace/error.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,9 @@ def self.build_from(value)

def initialize(type = nil, message = nil, backtrace = nil)
backtrace = Array(backtrace).join("\n")
@type = sanitize(type)
@message = sanitize(message)
@backtrace = sanitize(backtrace)
end

private

def sanitize(value)
value = value.to_s

return value if value.encoding == ::Encoding::UTF_8

value.encode(::Encoding::UTF_8)
@type = Utils.utf8_encode(type)
@message = Utils.utf8_encode(message)
@backtrace = Utils.utf8_encode(backtrace)
end

BlankError = Error.new
Expand Down
25 changes: 22 additions & 3 deletions lib/ddtrace/utils.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
module Datadog
# Utils contains low-level utilities, typically to provide pseudo-random trace IDs.
module Utils
STRING_PLACEHOLDER = ''.encode(::Encoding::UTF_8).freeze
# We use a custom random number generator because we want no interference
# with the default one. Using the default prng, we could break code that
# would rely on srand/rand sequences.
Expand All @@ -21,6 +22,10 @@ def self.was_forked?
Process.pid != @pid
end

private_class_method :reset!, :was_forked?

reset!

def self.truncate(value, size, omission = '...')
string = value.to_s

Expand All @@ -29,8 +34,22 @@ def self.truncate(value, size, omission = '...')
string.slice(0, size - omission.size) + omission
end

private_class_method :reset!, :was_forked?

reset!
def self.utf8_encode(str, options = {})
str = str.to_s

if options[:binary]
# This option is useful for "gracefully" displaying binary data that
# often contains text such as marshalled objects
str.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
elsif str.encoding == ::Encoding::UTF_8
str
else
str.encode(::Encoding::UTF_8)
end
rescue => e
Tracer.log.debug("Error encoding string in UTF-8: #{e}")

options.fetch(:placeholder, STRING_PLACEHOLDER)
end
end
end
9 changes: 9 additions & 0 deletions test/error_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -73,5 +73,14 @@ def test_nil_coercion
assert_empty(error.message)
assert_empty(error.backtrace)
end

def test_regression_non_utf8_compatible
exception = StandardError.new("\xC2".force_encoding(::Encoding::ASCII_8BIT))
error = Error.build_from(exception)

assert_equal('StandardError', @error.type)
assert_empty(error.message)
assert_empty(error.backtrace)
end
end
end
32 changes: 32 additions & 0 deletions test/utils_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,36 @@ def test_forked_process_id_collision
refute_equal(Datadog::Utils.next_id, r.read.chomp.to_i)
r.close
end

def test_utf8_encoding_happy_path
# we can't use utf-8 literals because our tests run against ruby 1.9.3
str = "pristine \U+FFE2".encode(Encoding::UTF_8)

assert_equal("pristine \U+FFE2", Datadog::Utils.utf8_encode(str))

assert_equal(::Encoding::UTF_8, Datadog::Utils.utf8_encode(str).encoding)

# we don't allocate new objects when a valid UTF-8 string is provided
assert_same(str, Datadog::Utils.utf8_encode(str))
end

def test_utf8_encoding_invalid_conversion
time_bomb = "\xC2".force_encoding(::Encoding::ASCII_8BIT)

# making sure this is indeed a problem
assert_raises(Encoding::UndefinedConversionError) do
time_bomb.encode(Encoding::UTF_8)
end

assert_equal(Datadog::Utils::STRING_PLACEHOLDER, Datadog::Utils.utf8_encode(time_bomb))

# we can also set a custom placeholder
assert_equal('?', Datadog::Utils.utf8_encode(time_bomb, placeholder: '?'))
end

def test_binary_data
byte_array = "keep what\xC2 is valid".force_encoding(::Encoding::ASCII_8BIT)

assert_equal('keep what is valid', Datadog::Utils.utf8_encode(byte_array, binary: true))
end
end

0 comments on commit 79bfe4b

Please sign in to comment.