Skip to content

Commit

Permalink
src: use simdutf for converting externalized builtins to UTF-16
Browse files Browse the repository at this point in the history
Remove the dependency on ICU for this part, as well as the
hacky way of converting embedder main sources to UTF-8 via
V8 APIs. Allow `UnionBytes` to own the memory its pointing
to in order to simplify the code on the `BuiltinLoader` side.
  • Loading branch information
addaleax committed Jan 6, 2023
1 parent 22a2ec6 commit b14c6b4
Show file tree
Hide file tree
Showing 9 changed files with 41 additions and 51 deletions.
6 changes: 1 addition & 5 deletions configure.py
Original file line number Diff line number Diff line change
Expand Up @@ -2024,11 +2024,7 @@ def make_bin_override():
for builtin in shareable_builtins:
builtin_id = 'node_shared_builtin_' + builtin.replace('/', '_') + '_path'
if getattr(options, builtin_id):
if options.with_intl == 'none':
option_name = '--shared-builtin-' + builtin + '-path'
error(option_name + ' is incompatible with --with-intl=none' )
else:
output['defines'] += [builtin_id.upper() + '=' + getattr(options, builtin_id)]
output['defines'] += [builtin_id.upper() + '=' + getattr(options, builtin_id)]
else:
output['variables']['node_builtin_shareable_builtins'] += [shareable_builtins[builtin]]

Expand Down
1 change: 1 addition & 0 deletions node.gyp
Original file line number Diff line number Diff line change
Expand Up @@ -973,6 +973,7 @@
'deps/googletest/googletest.gyp:gtest_main',
'deps/histogram/histogram.gyp:histogram',
'deps/uvwasi/uvwasi.gyp:uvwasi',
'deps/simdutf/simdutf.gyp:simdutf',
],

'includes': [
Expand Down
15 changes: 2 additions & 13 deletions src/api/environment.cc
Original file line number Diff line number Diff line change
Expand Up @@ -467,21 +467,10 @@ MaybeLocal<Value> LoadEnvironment(
Environment* env,
const char* main_script_source_utf8) {
CHECK_NOT_NULL(main_script_source_utf8);
Isolate* isolate = env->isolate();
return LoadEnvironment(
env,
[&](const StartExecutionCallbackInfo& info) -> MaybeLocal<Value> {
// This is a slightly hacky way to convert UTF-8 to UTF-16.
Local<String> str =
String::NewFromUtf8(isolate,
main_script_source_utf8).ToLocalChecked();
auto main_utf16 = std::make_unique<String::Value>(isolate, str);

// TODO(addaleax): Avoid having a global table for all scripts.
env, [&](const StartExecutionCallbackInfo& info) -> MaybeLocal<Value> {
std::string name = "embedder_main_" + std::to_string(env->thread_id());
builtins::BuiltinLoader::Add(
name.c_str(), UnionBytes(**main_utf16, main_utf16->length()));
env->set_main_utf16(std::move(main_utf16));
builtins::BuiltinLoader::Add(name.c_str(), main_script_source_utf8);
Realm* realm = env->principal_realm();

return realm->ExecuteBootstrapper(name.c_str());
Expand Down
5 changes: 0 additions & 5 deletions src/env-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -805,11 +805,6 @@ void Environment::RemoveCleanupHook(CleanupQueue::Callback fn, void* arg) {
cleanup_queue_.Remove(fn, arg);
}

void Environment::set_main_utf16(std::unique_ptr<v8::String::Value> str) {
CHECK(!main_utf16_);
main_utf16_ = std::move(str);
}

void Environment::set_process_exit_handler(
std::function<void(Environment*, ExitCode)>&& handler) {
process_exit_handler_ = std::move(handler);
Expand Down
6 changes: 0 additions & 6 deletions src/env.h
Original file line number Diff line number Diff line change
Expand Up @@ -961,7 +961,6 @@ class Environment : public MemoryRetainer {

#endif // HAVE_INSPECTOR

inline void set_main_utf16(std::unique_ptr<v8::String::Value>);
inline void set_process_exit_handler(
std::function<void(Environment*, ExitCode)>&& handler);

Expand Down Expand Up @@ -1144,11 +1143,6 @@ class Environment : public MemoryRetainer {

std::unique_ptr<Realm> principal_realm_ = nullptr;

// Keeps the main script source alive is one was passed to LoadEnvironment().
// We should probably find a way to just use plain `v8::String`s created from
// the source passed to LoadEnvironment() directly instead.
std::unique_ptr<v8::String::Value> main_utf16_;

// Used by allocate_managed_buffer() and release_managed_buffer() to keep
// track of the BackingStore for a given pointer.
std::unordered_map<char*, std::unique_ptr<v8::BackingStore>>
Expand Down
25 changes: 13 additions & 12 deletions src/node_builtins.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "env-inl.h"
#include "node_external_reference.h"
#include "node_internals.h"
#include "simdutf.h"
#include "util-inl.h"

namespace node {
Expand Down Expand Up @@ -35,7 +36,6 @@ BuiltinLoader BuiltinLoader::instance_;

BuiltinLoader::BuiltinLoader() : config_(GetConfig()), has_code_cache_(false) {
LoadJavaScriptSource();
#if defined(NODE_HAVE_I18N_SUPPORT)
#ifdef NODE_SHARED_BUILTIN_CJS_MODULE_LEXER_LEXER_PATH
AddExternalizedBuiltin(
"internal/deps/cjs-module-lexer/lexer",
Expand All @@ -52,7 +52,6 @@ BuiltinLoader::BuiltinLoader() : config_(GetConfig()), has_code_cache_(false) {
AddExternalizedBuiltin("internal/deps/undici/undici",
STRINGIFY(NODE_SHARED_BUILTIN_UNDICI_UNDICI_PATH));
#endif // NODE_SHARED_BUILTIN_UNDICI_UNDICI_PATH
#endif // NODE_HAVE_I18N_SUPPORT
}

BuiltinLoader* BuiltinLoader::GetInstance() {
Expand Down Expand Up @@ -240,7 +239,6 @@ MaybeLocal<String> BuiltinLoader::LoadBuiltinSource(Isolate* isolate,
#endif // NODE_BUILTIN_MODULES_PATH
}

#if defined(NODE_HAVE_I18N_SUPPORT)
void BuiltinLoader::AddExternalizedBuiltin(const char* id,
const char* filename) {
std::string source;
Expand All @@ -252,16 +250,19 @@ void BuiltinLoader::AddExternalizedBuiltin(const char* id,
return;
}

icu::UnicodeString utf16 = icu::UnicodeString::fromUTF8(
icu::StringPiece(source.data(), source.length()));
auto source_utf16 = std::make_unique<icu::UnicodeString>(utf16);
Add(id,
UnionBytes(reinterpret_cast<const uint16_t*>((*source_utf16).getBuffer()),
utf16.length()));
// keep source bytes for builtin alive while BuiltinLoader exists
GetInstance()->externalized_source_bytes_.push_back(std::move(source_utf16));
Add(id, source);
}

bool BuiltinLoader::Add(const char* id, std::string_view utf8source) {
size_t expected_u16_length =
simdutf::utf16_length_from_utf8(utf8source.data(), utf8source.length());
std::shared_ptr<uint16_t[]> out{new uint16_t[expected_u16_length]};
size_t u16_length =
simdutf::convert_utf8_to_utf16le(utf8source.data(),
utf8source.length(),
reinterpret_cast<char16_t*>(out.get()));
return Add(id, UnionBytes(out, u16_length));
}
#endif // NODE_HAVE_I18N_SUPPORT

// Returns Local<Function> of the compiled module if return_code_cache
// is false (we are only compiling the function).
Expand Down
9 changes: 1 addition & 8 deletions src/node_builtins.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@
#include <memory>
#include <set>
#include <string>
#if defined(NODE_HAVE_I18N_SUPPORT)
#include <unicode/unistr.h>
#endif // NODE_HAVE_I18N_SUPPORT
#include <vector>
#include "node_mutex.h"
#include "node_union_bytes.h"
Expand Down Expand Up @@ -71,6 +68,7 @@ class NODE_EXTERN_PRIVATE BuiltinLoader {
static v8::Local<v8::String> GetConfigString(v8::Isolate* isolate);
static bool Exists(const char* id);
static bool Add(const char* id, const UnionBytes& source);
static bool Add(const char* id, std::string_view utf8source);

static bool CompileAllBuiltins(v8::Local<v8::Context> context);
static void RefreshCodeCache(const std::vector<CodeCacheInfo>& in);
Expand Down Expand Up @@ -136,18 +134,13 @@ class NODE_EXTERN_PRIVATE BuiltinLoader {
static void HasCachedBuiltins(
const v8::FunctionCallbackInfo<v8::Value>& args);

#if defined(NODE_HAVE_I18N_SUPPORT)
static void AddExternalizedBuiltin(const char* id, const char* filename);
#endif // NODE_HAVE_I18N_SUPPORT

static BuiltinLoader instance_;
BuiltinCategories builtin_categories_;
BuiltinSourceMap source_;
BuiltinCodeCacheMap code_cache_;
UnionBytes config_;
#if defined(NODE_HAVE_I18N_SUPPORT)
std::list<std::unique_ptr<icu::UnicodeString>> externalized_source_bytes_;
#endif // NODE_HAVE_I18N_SUPPORT

// Used to synchronize access to the code cache map
Mutex code_cache_mutex_;
Expand Down
8 changes: 7 additions & 1 deletion src/node_union_bytes.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,18 @@ class NonOwningExternalTwoByteResource

// Similar to a v8::String, but it's independent from Isolates
// and can be materialized in Isolates as external Strings
// via ToStringChecked. The data pointers are owned by the caller.
// via ToStringChecked.
class UnionBytes {
public:
UnionBytes(const uint16_t* data, size_t length)
: one_bytes_(nullptr), two_bytes_(data), length_(length) {}
UnionBytes(const uint8_t* data, size_t length)
: one_bytes_(data), two_bytes_(nullptr), length_(length) {}
template <typename T> // T = uint8_t or uint16_t
UnionBytes(std::shared_ptr<T[]> data, size_t length)
: UnionBytes(data.get(), length) {
owning_ptr_ = data;
}

UnionBytes(const UnionBytes&) = default;
UnionBytes& operator=(const UnionBytes&) = default;
Expand Down Expand Up @@ -94,6 +99,7 @@ class UnionBytes {
const uint8_t* one_bytes_;
const uint16_t* two_bytes_;
size_t length_;
std::shared_ptr<void> owning_ptr_;
};

} // namespace node
Expand Down
17 changes: 16 additions & 1 deletion test/cctest/test_util.cc
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#include "util-inl.h"
#include "debug_utils-inl.h"
#include "env-inl.h"
#include "gtest/gtest.h"
#include "simdutf.h"
#include "util-inl.h"

using node::Calloc;
using node::Malloc;
Expand Down Expand Up @@ -298,3 +299,17 @@ TEST(UtilTest, SPrintF) {
const std::string with_zero = std::string("a") + '\0' + 'b';
EXPECT_EQ(SPrintF("%s", with_zero), with_zero);
}

TEST(UtilTest, SimdutfEndiannessDoesNotMeanEndianness) {
// In simdutf, "LE" does *not* refer to Little Endian, it refers
// to 16-byte code units that are stored using *host* endianness.
// This is weird and confusing naming, and so we add this assertion
// here to verify that this is actually the case (so that CI tells
// us if it changed, because for most people Little Endian is
// host endianness, so locally everything would work fine).
const char utf8source[] = "\xe7\x8c\xab";
char16_t u16output;
size_t u16len = simdutf::convert_utf8_to_utf16le(utf8source, 3, &u16output);
EXPECT_EQ(u16len, 1u);
EXPECT_EQ(u16output, 0x732B);
}

0 comments on commit b14c6b4

Please sign in to comment.