Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(storage): respect MIME message boundary size limits #9965

Merged
merged 2 commits into from
Oct 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions google/cloud/storage/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -148,3 +148,15 @@ load(":storage_client_grpc_unit_tests.bzl", "storage_client_grpc_unit_tests")
"@com_google_googletest//:gtest_main",
],
) for test in storage_client_grpc_unit_tests]

load(":google_cloud_cpp_storage_benchmarks.bzl", "google_cloud_cpp_storage_benchmarks")

[cc_test(
name = benchmark.replace("/", "_").replace(".cc", ""),
srcs = [benchmark],
tags = ["benchmark"],
deps = [
":google_cloud_cpp_storage",
"@com_google_benchmark//:benchmark_main",
],
) for benchmark in google_cloud_cpp_storage_benchmarks]
1 change: 1 addition & 0 deletions google/cloud/storage/google_cloud_cpp_storage.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ google_cloud_cpp_storage_srcs = [
"internal/default_object_acl_requests.cc",
"internal/empty_response.cc",
"internal/error_credentials.cc",
"internal/generate_message_boundary.cc",
"internal/hash_function.cc",
"internal/hash_function_impl.cc",
"internal/hash_validator.cc",
Expand Down
23 changes: 22 additions & 1 deletion google/cloud/storage/google_cloud_cpp_storage.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ add_library(
internal/empty_response.h
internal/error_credentials.cc
internal/error_credentials.h
internal/generate_message_boundary.cc
internal/generate_message_boundary.h
internal/generic_object_request.h
internal/generic_request.h
Expand Down Expand Up @@ -529,11 +530,31 @@ if (BUILD_TESTING)
endif ()
add_test(NAME ${target} COMMAND ${target})
endforeach ()

# Export the list of unit tests so the Bazel BUILD file can pick it up.
export_list_to_bazel("storage_client_unit_tests.bzl"
"storage_client_unit_tests" YEAR "2018")

find_package(benchmark CONFIG REQUIRED)

set(google_cloud_cpp_storage_benchmarks
# cmake-format: sort
internal/generate_message_boundary_benchmark.cc)

# Export the list of benchmarks to a .bzl file so we do not need to maintain
# the list in two places.
export_list_to_bazel("google_cloud_cpp_storage_benchmarks.bzl"
"google_cloud_cpp_storage_benchmarks" YEAR "2022")

# Generate a target for each benchmark.
foreach (fname ${google_cloud_cpp_storage_benchmarks})
google_cloud_cpp_add_executable(target "storage" "${fname}")
add_test(NAME ${target} COMMAND ${target})
target_link_libraries(
${target} PRIVATE absl::memory google-cloud-cpp::storage
benchmark::benchmark_main)
google_cloud_cpp_add_common_options(${target})
endforeach ()

add_subdirectory(tests)
add_subdirectory(benchmarks)
endif ()
21 changes: 21 additions & 0 deletions google/cloud/storage/google_cloud_cpp_storage_benchmarks.bzl
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# DO NOT EDIT -- GENERATED BY CMake -- Change the CMakeLists.txt file if needed

"""Automatically generated unit tests list - DO NOT EDIT."""

google_cloud_cpp_storage_benchmarks = [
"internal/generate_message_boundary_benchmark.cc",
]
11 changes: 3 additions & 8 deletions google/cloud/storage/internal/curl_client.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1310,16 +1310,11 @@ std::string CurlClient::PickBoundary(std::string const& text_to_avoid) {
// the candidate. Eventually we will find something, though it might be
// larger than `text_to_avoid`. And we only make (approximately) one pass
// over `text_to_avoid`.
auto generate_candidate = [this](int n) {
static auto const* const kChars = new std::string(
"abcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ");
auto generate_candidate = [this]() {
std::unique_lock<std::mutex> lk(mu_);
return google::cloud::internal::Sample(generator_, n, *kChars);
return GenerateMessageBoundaryCandidate(generator_);
};
constexpr int kCandidateInitialSize = 16;
constexpr int kCandidateGrowthSize = 4;
return GenerateMessageBoundary(text_to_avoid, std::move(generate_candidate),
kCandidateInitialSize, kCandidateGrowthSize);
return GenerateMessageBoundary(text_to_avoid, generate_candidate);
}

StatusOr<ObjectMetadata> CurlClient::InsertObjectMediaSimple(
Expand Down
46 changes: 46 additions & 0 deletions google/cloud/storage/internal/generate_message_boundary.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "google/cloud/storage/internal/generate_message_boundary.h"
#include <algorithm>

namespace google {
namespace cloud {
namespace storage {
GOOGLE_CLOUD_CPP_INLINE_NAMESPACE_BEGIN
namespace internal {

std::string GenerateMessageBoundary(
std::string const& message,
absl::FunctionRef<std::string()> candidate_generator) {
auto candidate = candidate_generator();
while (message.find(candidate) != std::string::npos) {
candidate = candidate_generator();
}
return candidate;
}

std::string GenerateMessageBoundaryCandidate(
google::cloud::internal::DefaultPRNG& generator) {
auto candidate = std::string{
"abcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"};
std::shuffle(candidate.begin(), candidate.end(), generator);
return candidate;
}

} // namespace internal
GOOGLE_CLOUD_CPP_INLINE_NAMESPACE_END
} // namespace storage
} // namespace cloud
} // namespace google
46 changes: 28 additions & 18 deletions google/cloud/storage/internal/generate_message_boundary.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

#include "google/cloud/storage/version.h"
#include "google/cloud/internal/invoke_result.h"
#include "google/cloud/internal/random.h"
#include "absl/functional/function_ref.h"
#include <string>

namespace google {
Expand All @@ -34,37 +36,45 @@ namespace internal {
* characters to the message itself).
*
* The algorithm is to generate a short random string, and search for it in the
* message, if the message has that string, append some more random characters
* and keep searching.
*
* This function is a template because the string generator is typically a
* lambda that captures state variables (such as the random number generator),
* of the class that uses it.
* message, if the message has that string, generate a new string and retry.
* The strings are 64 (alphanumeric) characters long, the number of permutations
* is large enough that a suitable string will be found eventually.
*
* @param message a message body, typically the payload of a HTTP request that
* will need to be encoded as a MIME multipart message.
* @param random_string_generator a callable to generate random strings.
* @param candidate_generator a callable to generate random strings.
* Typically a lambda that captures the generator and any locks necessary
* to generate the string. This is also used in testing to verify things
* work when the initial string is found.
* @param initial_size the length for the initial string.
* @param growth_size how fast to grow the random string.
* @return a string not found in @p message.
*/
std::string GenerateMessageBoundary(
std::string const& message,
absl::FunctionRef<std::string()> candidate_generator);

/// A helper to generate message boundary candidates.
std::string GenerateMessageBoundaryCandidate(
google::cloud::internal::DefaultPRNG& generator);

/**
* @deprecated use another `GenerateMessageBoundary()` overload.
*/
template <typename RandomStringGenerator,
typename std::enable_if<google::cloud::internal::is_invocable<
RandomStringGenerator, int>::value,
int>::type = 0>
std::string GenerateMessageBoundary(
std::string const& message, RandomStringGenerator&& random_string_generator,
int initial_size, int growth_size) {
std::string candidate = random_string_generator(initial_size);
for (std::string::size_type i = message.find(candidate, 0);
i != std::string::npos; i = message.find(candidate, i)) {
candidate += random_string_generator(growth_size);
}
return candidate;
GOOGLE_CLOUD_CPP_DEPRECATED(
"deprecated, using the GenerateMessageBoundary(std::string const&, "
"absl::FunctionRef<std::string()>) overload")
std::string
GenerateMessageBoundary(std::string const& message,
RandomStringGenerator&& random_string_generator,
int /*initial_size*/, int /*growth_size*/) {
return GenerateMessageBoundary(message, [&random_string_generator]() {
return random_string_generator(64);
});
}

} // namespace internal
GOOGLE_CLOUD_CPP_INLINE_NAMESPACE_END
} // namespace storage
Expand Down
156 changes: 156 additions & 0 deletions google/cloud/storage/internal/generate_message_boundary_benchmark.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "google/cloud/storage/internal/generate_message_boundary.h"
#include "google/cloud/internal/random.h"
#include <benchmark/benchmark.h>
#include <mutex>
#include <random>
#include <string>

namespace google {
namespace cloud {
namespace storage {
GOOGLE_CLOUD_CPP_INLINE_NAMESPACE_BEGIN
namespace internal {
namespace {

// CXX=clang++ CC=clang bazel run -c opt --
// //google/cloud/storage:internal_generate_message_boundary_benchmark
// --benchmark_repetitions=100
//
// clang-format off
// Run on (96 X 2000 MHz CPU s)
// CPU Caches:
// L1 Data 32 KiB (x48)
// L1 Instruction 32 KiB (x48)
// L2 Unified 1024 KiB (x48)
// L3 Unified 39424 KiB (x2)
// ---------------------------------------------------------------------------------------------
// Benchmark Time CPU Iterations
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Opinion: I usually think that the minimum is the best value to record (instead of mean or median) for such micro-benchmarks.

// ---------------------------------------------------------------------------------------------
// ...
// GenerateBoundaryFixture/GenerateBoundary_mean 20478679 ns 20475235 ns 100
// GenerateBoundaryFixture/GenerateBoundary_median 20583724 ns 20580943 ns 100
// GenerateBoundaryFixture/GenerateBoundary_stddev 335315 ns 335459 ns 100
// GenerateBoundaryFixture/GenerateBoundary_cv 1.64 % 1.64 % 100
// ...
// GenerateBoundaryFixture/GenerateBoundaryOld_mean 20809894 ns 20806317 ns 100
// GenerateBoundaryFixture/GenerateBoundaryOld_median 20520133 ns 20517279 ns 100
// GenerateBoundaryFixture/GenerateBoundaryOld_stddev 1284277 ns 1284334 ns 100
// GenerateBoundaryFixture/GenerateBoundaryOld_cv 6.17 % 6.17 % 100
// ...
// GenerateBoundaryFixture/WorstCase_mean 100747489 ns 100727911 ns 100
// GenerateBoundaryFixture/WorstCase_median 101026913 ns 101006689 ns 100
// GenerateBoundaryFixture/WorstCase_stddev 1285934 ns 1285884 ns 100
// GenerateBoundaryFixture/WorstCase_cv 1.28 % 1.28 % 100
// ...
// GenerateBoundaryFixture/BestCase_mean 9584895 ns 9583080 ns 100
// GenerateBoundaryFixture/BestCase_median 9598452 ns 9597243 ns 100
// GenerateBoundaryFixture/BestCase_stddev 90679 ns 90643 ns 100
// GenerateBoundaryFixture/BestCase_cv 0.95 % 0.95 % 100
// clang-format on

auto constexpr kMessageSize = 128 * 1024 * 1024;

class GenerateBoundaryFixture : public ::benchmark::Fixture {
public:
std::string MakeRandomString(int n) {
std::lock_guard<std::mutex> lk(mu_);
return google::cloud::internal::Sample(generator_, n,
"abcdefghijklmnopqrstuvwxyz"
"0123456789"
"ABCDEFGHIJKLMNOPQRSTUVWXYZ");
}

std::string GenerateCandidate() {
std::lock_guard<std::mutex> lk(mu_);
return GenerateMessageBoundaryCandidate(generator_);
}

std::string const& message() const { return message_; }

private:
std::string MakeMessage() {
std::string all;
int min = (std::numeric_limits<char>::min)(); // NOLINT
int end = (std::numeric_limits<char>::max)() + 1;
for (int c = min; c != end; ++c) all.push_back(static_cast<char>(c));
return google::cloud::internal::Sample(generator_, kMessageSize, all);
}

std::mutex mu_;
google::cloud::internal::DefaultPRNG generator_ =
google::cloud::internal::DefaultPRNG(std::random_device{}());
std::string message_ = MakeMessage();
};

BENCHMARK_F(GenerateBoundaryFixture, GenerateBoundary)
(benchmark::State& state) {
auto make_string = [this]() { return GenerateCandidate(); };

for (auto _ : state) {
benchmark::DoNotOptimize(GenerateMessageBoundary(message(), make_string));
}
}

BENCHMARK_F(GenerateBoundaryFixture, GenerateBoundaryOld)
(benchmark::State& state) {
auto old = [this](std::string const& message) {
auto candidate = MakeRandomString(16);
for (std::string::size_type i = message.find(candidate, 0);
i != std::string::npos; i = message.find(candidate, i)) {
candidate += MakeRandomString(8);
}
return candidate;
};

for (auto _ : state) {
benchmark::DoNotOptimize(old(message()));
}
}

BENCHMARK_F(GenerateBoundaryFixture, WorstCase)
(benchmark::State& state) {
auto test = [](std::string const& message) {
auto count = std::count(message.begin(), message.end(), 'Z');
return std::string(count % 64, 'A');
};

for (auto _ : state) {
benchmark::DoNotOptimize(test(message()));
}
}

BENCHMARK_F(GenerateBoundaryFixture, BestCase)
(benchmark::State& state) {
auto test = [](std::string const& message) {
auto count = 0;
for (std::size_t i = 0; i < message.size(); i += 64) {
count += message[i] == 'Z' ? 1 : 0;
}
return std::string(count % 64, 'A');
};

for (auto _ : state) {
benchmark::DoNotOptimize(test(message()));
}
}

} // namespace
} // namespace internal
GOOGLE_CLOUD_CPP_INLINE_NAMESPACE_END
} // namespace storage
} // namespace cloud
} // namespace google
Loading