From 99576191f1501c37578b6e1b8d6d9518c430edf5 Mon Sep 17 00:00:00 2001 From: Dominik Picheta Date: Wed, 8 Jan 2025 19:16:41 +0000 Subject: [PATCH] Fixes to external package loading: use external lock and load stdlib. --- src/pyodide/internal/loadPackage.ts | 13 +++-- src/pyodide/internal/metadata.ts | 6 +-- src/pyodide/internal/setupPackages.ts | 9 +++- src/pyodide/internal/snapshot.ts | 49 +++++++++++++++++++ src/pyodide/types/pyodide-lock.d.ts | 5 ++ .../types/runtime-generated/metadata.d.ts | 1 - src/workerd/api/pyodide/pyodide.c++ | 20 +++++++- src/workerd/api/pyodide/pyodide.h | 9 +--- src/workerd/io/compatibility-date.capnp | 2 +- 9 files changed, 94 insertions(+), 20 deletions(-) diff --git a/src/pyodide/internal/loadPackage.ts b/src/pyodide/internal/loadPackage.ts index 37e7d86752c..d0c599a7d67 100644 --- a/src/pyodide/internal/loadPackage.ts +++ b/src/pyodide/internal/loadPackage.ts @@ -18,6 +18,7 @@ import { } from 'pyodide-internal:metadata'; import { SITE_PACKAGES, + STDLIB_PACKAGES, getSitePackagesPath, } from 'pyodide-internal:setupPackages'; import { parseTarInfo } from 'pyodide-internal:tar'; @@ -113,7 +114,12 @@ async function loadPackagesImpl( let loadPromises: Promise<[string, Reader]>[] = []; let loading = []; for (const req of requirements) { - if (SITE_PACKAGES.loadedRequirements.has(req)) continue; + if (req === 'test') { + continue; // Skip the test package, it is only useful for internal Python regression testing. + } + if (SITE_PACKAGES.loadedRequirements.has(req)) { + continue; + } loadPromises.push(loadBundle(req).then((r) => [req, r])); loading.push(req); } @@ -135,9 +141,10 @@ async function loadPackagesImpl( } export async function loadPackages(Module: Module, requirements: Set) { + const pkgsToLoad = requirements.union(new Set(STDLIB_PACKAGES)); if (LOAD_WHEELS_FROM_R2) { - await loadPackagesImpl(Module, requirements, loadBundleFromR2); + await loadPackagesImpl(Module, pkgsToLoad, loadBundleFromR2); } else if (LOAD_WHEELS_FROM_ARTIFACT_BUNDLER) { - await loadPackagesImpl(Module, requirements, loadBundleFromArtifactBundler); + await loadPackagesImpl(Module, pkgsToLoad, loadBundleFromArtifactBundler); } } diff --git a/src/pyodide/internal/metadata.ts b/src/pyodide/internal/metadata.ts index 662748b5b7d..4535ae4bbed 100644 --- a/src/pyodide/internal/metadata.ts +++ b/src/pyodide/internal/metadata.ts @@ -1,5 +1,7 @@ import { default as MetadataReader } from 'pyodide-internal:runtime-generated/metadata'; import { default as PYODIDE_BUCKET } from 'pyodide-internal:generated/pyodide-bucket.json'; +// The pyodide-lock.json is read from the Python bundle (pyodide-capnp-bin). +import { default as PYODIDE_LOCK } from 'pyodide-internal:generated/pyodide-lock.json'; import { default as ArtifactBundler } from 'pyodide-internal:artifacts'; export const IS_WORKERD = MetadataReader.isWorkerd(); @@ -12,9 +14,7 @@ export const LOAD_WHEELS_FROM_R2: boolean = IS_WORKERD; export const LOAD_WHEELS_FROM_ARTIFACT_BUNDLER = MetadataReader.shouldUsePackagesInArtifactBundler(); export const PACKAGES_VERSION = MetadataReader.getPackagesVersion(); -export const LOCKFILE: PackageLock = JSON.parse( - MetadataReader.getPackagesLock() -); +export const LOCKFILE: PackageLock = PYODIDE_LOCK; export const REQUIREMENTS = MetadataReader.getRequirements(); export const MAIN_MODULE_NAME = MetadataReader.getMainModule(); export const MEMORY_SNAPSHOT_READER = MetadataReader.hasMemorySnapshot() diff --git a/src/pyodide/internal/setupPackages.ts b/src/pyodide/internal/setupPackages.ts index 0d20566fc9e..2ad7dbbc530 100644 --- a/src/pyodide/internal/setupPackages.ts +++ b/src/pyodide/internal/setupPackages.ts @@ -23,17 +23,21 @@ function canonicalizePackageName(name: string): string { } // The "name" field in the lockfile is not canonicalized -const STDLIB_PACKAGES: string[] = Object.values(LOCKFILE.packages) +export const STDLIB_PACKAGES: string[] = Object.values(LOCKFILE.packages) .filter(({ install_dir }) => install_dir === 'stdlib') .map(({ name }) => canonicalizePackageName(name)); +// Each item in the list is an element of the file path, for example +// `folder/file.txt` -> `["folder", "file.txt"] +export type FilePath = string[]; + /** * SitePackagesDir keeps track of the virtualized view of the site-packages * directory generated for each worker. */ class SitePackagesDir { public rootInfo: TarFSInfo; - public soFiles: string[][]; + public soFiles: FilePath[]; public loadedRequirements: Set; constructor() { this.rootInfo = { @@ -133,6 +137,7 @@ class SitePackagesDir { export function buildSitePackages(requirements: Set): SitePackagesDir { if (EmbeddedPackagesTarReader.read === undefined) { // Package retrieval is enabled, so the embedded tar reader isn't initialised. + // All packages, including STDLIB_PACKAGES, are loaded in `loadPackages`. return new SitePackagesDir(); } diff --git a/src/pyodide/internal/snapshot.ts b/src/pyodide/internal/snapshot.ts index b6cbd90b3af..e0c62e27195 100644 --- a/src/pyodide/internal/snapshot.ts +++ b/src/pyodide/internal/snapshot.ts @@ -2,6 +2,7 @@ import { default as ArtifactBundler } from 'pyodide-internal:artifacts'; import { default as UnsafeEval } from 'internal:unsafe-eval'; import { default as DiskCache } from 'pyodide-internal:disk_cache'; import { + FilePath, SITE_PACKAGES, getSitePackagesPath, } from 'pyodide-internal:setupPackages'; @@ -102,6 +103,49 @@ function loadDynlib( } } +/** + * This function is used to ensure the order in which we load SO_FILES stays the same. + * + * The sort always puts _lzma.so and _ssl.so + * first, because these SO_FILES are loaded in the baseline snapshot, and if we want to generate + * a package snapshot while a baseline snapshot is loaded we need them to be first. The rest of the + * files are sorted alphabetically. + * + * The `filePaths` list is of the form [["folder", "file.so"], ["file.so"]], so each element in it + * is effectively a file path. + */ +function sortSoFiles(filePaths: FilePath[]): FilePath[] { + let result = []; + let hasLzma = false; + let hasSsl = false; + const lzmaFile = '_lzma.so'; + const sslFile = '_ssl.so'; + for (const path of filePaths) { + if (path.length == 1 && path[0] == lzmaFile) { + hasLzma = true; + } else if (path.length == 1 && path[0] == sslFile) { + hasSsl = true; + } else { + result.push(path); + } + } + + // JS might handle sorting lists of lists fine, but I'd rather be explicit here and make it compare + // strings. + result = result + .map((x) => x.join('/')) + .sort() + .map((x) => x.split('/')); + if (hasSsl) { + result.unshift([sslFile]); + } + if (hasLzma) { + result.unshift([lzmaFile]); + } + + return result; +} + // used for checkLoadedSoFiles a snapshot sanity check const PRELOADED_SO_FILES: string[] = []; @@ -121,6 +165,11 @@ export function preloadDynamicLibs(Module: Module): void { if (IS_CREATING_BASELINE_SNAPSHOT || LOADED_BASELINE_SNAPSHOT) { SO_FILES_TO_LOAD = [['_lzma.so'], ['_ssl.so']]; } + // The order in which we load the SO_FILES matters. For example, if a snapshot was generated with + // SO_FILES loaded in a certain way, then if we load that snapshot and load the SO_FILES + // differently here then Python will crash. + SO_FILES_TO_LOAD = sortSoFiles(SO_FILES_TO_LOAD); + try { const sitePackages = getSitePackagesPath(Module); for (const soFile of SO_FILES_TO_LOAD) { diff --git a/src/pyodide/types/pyodide-lock.d.ts b/src/pyodide/types/pyodide-lock.d.ts index b58ab26e2f0..d769a734a41 100644 --- a/src/pyodide/types/pyodide-lock.d.ts +++ b/src/pyodide/types/pyodide-lock.d.ts @@ -16,3 +16,8 @@ interface PackageLock { [id: string]: PackageDeclaration; }; } + +declare module 'pyodide-internal:generated/pyodide-lock.json' { + const lock: PackageLock; + export default lock; +} diff --git a/src/pyodide/types/runtime-generated/metadata.d.ts b/src/pyodide/types/runtime-generated/metadata.d.ts index d7bb592780e..2886eff93cb 100644 --- a/src/pyodide/types/runtime-generated/metadata.d.ts +++ b/src/pyodide/types/runtime-generated/metadata.d.ts @@ -17,7 +17,6 @@ declare namespace MetadataReader { const disposeMemorySnapshot: () => void; const shouldUsePackagesInArtifactBundler: () => boolean; const getPackagesVersion: () => string; - const getPackagesLock: () => string; const read: (index: number, position: number, buffer: Uint8Array) => number; } diff --git a/src/workerd/api/pyodide/pyodide.c++ b/src/workerd/api/pyodide/pyodide.c++ index 02f443d4c66..475df58f7cf 100644 --- a/src/workerd/api/pyodide/pyodide.c++ +++ b/src/workerd/api/pyodide/pyodide.c++ @@ -4,6 +4,7 @@ #include "pyodide.h" #include +#include #include #include @@ -22,6 +23,22 @@ const kj::Maybe PyodideBundleManager::getPyodideBundle( [](const MessageBundlePair& t) { return t.bundle; }); } +kj::Maybe PyodideBundleManager::getPyodideLock( + PythonSnapshotRelease::Reader pythonSnapshotRelease) const { + auto bundleName = getPythonBundleName(pythonSnapshotRelease); + // We expect the Pyodide Bundle for the specified bundle name to already be downloaded here. + auto maybeBundle = getPyodideBundle(bundleName); + auto bundle = KJ_ASSERT_NONNULL(maybeBundle); + for (auto module: bundle.getModules()) { + if (module.which() == workerd::jsg::Module::JSON && + module.getName() == "pyodide-internal:generated/pyodide-lock.json") { + return kj::str(module.getJson()); + } + } + + return kj::none; +} + void PyodideBundleManager::setPyodideBundleData( kj::String version, kj::Array data) const { auto wordArray = kj::arrayPtr( @@ -440,8 +457,7 @@ jsg::Ref makePyodideMetadataReader( names.finish(), contents.finish(), requirements.finish(), - kj::str("20240829.4"), // TODO: hardcoded version & lock - kj::str(PYODIDE_LOCK.toString()), + kj::str("20240829.4"), // TODO: hardcoded version true /* isWorkerd */, false /* isTracing */, snapshotToDisk, diff --git a/src/workerd/api/pyodide/pyodide.h b/src/workerd/api/pyodide/pyodide.h index 05ddcc9ecab..78a1c81fb75 100644 --- a/src/workerd/api/pyodide/pyodide.h +++ b/src/workerd/api/pyodide/pyodide.h @@ -27,6 +27,7 @@ class PyodideBundleManager { public: void setPyodideBundleData(kj::String version, kj::Array data) const; const kj::Maybe getPyodideBundle(kj::StringPtr version) const; + kj::Maybe getPyodideLock(PythonSnapshotRelease::Reader pythonSnapshotRelease) const; private: struct MessageBundlePair { @@ -80,7 +81,6 @@ class PyodideMetadataReader: public jsg::Object { kj::Array> contents; kj::Array requirements; kj::String packagesVersion; - kj::String packagesLock; bool isWorkerdFlag; bool isTracingFlag; bool snapshotToDisk; @@ -94,7 +94,6 @@ class PyodideMetadataReader: public jsg::Object { kj::Array> contents, kj::Array requirements, kj::String packagesVersion, - kj::String packagesLock, bool isWorkerd, bool isTracing, bool snapshotToDisk, @@ -106,7 +105,6 @@ class PyodideMetadataReader: public jsg::Object { contents(kj::mv(contents)), requirements(kj::mv(requirements)), packagesVersion(kj::mv(packagesVersion)), - packagesLock(kj::mv(packagesLock)), isWorkerdFlag(isWorkerd), isTracingFlag(isTracing), snapshotToDisk(snapshotToDisk), @@ -169,10 +167,6 @@ class PyodideMetadataReader: public jsg::Object { return kj::str(packagesVersion); } - kj::String getPackagesLock() { - return kj::str(packagesLock); - } - JSG_RESOURCE_TYPE(PyodideMetadataReader) { JSG_METHOD(isWorkerd); JSG_METHOD(isTracing); @@ -189,7 +183,6 @@ class PyodideMetadataReader: public jsg::Object { JSG_METHOD(shouldSnapshotToDisk); JSG_METHOD(shouldUsePackagesInArtifactBundler); JSG_METHOD(getPackagesVersion); - JSG_METHOD(getPackagesLock); JSG_METHOD(isCreatingBaselineSnapshot); } diff --git a/src/workerd/io/compatibility-date.capnp b/src/workerd/io/compatibility-date.capnp index 9c7aac02604..ed76ba371ae 100644 --- a/src/workerd/io/compatibility-date.capnp +++ b/src/workerd/io/compatibility-date.capnp @@ -430,7 +430,7 @@ struct CompatibilityFlags @0x8f8c1b68151b6cef { pythonWorkers @43 :Bool $compatEnableFlag("python_workers") $pythonSnapshotRelease(pyodide = "0.26.0a2", pyodideRevision = "2024-03-01", - packages = "2024-03-01", backport = 12, + packages = "2024-03-01", backport = 13, baselineSnapshotHash = "d13ce2f4a0ade2e09047b469874dacf4d071ed3558fec4c26f8d0b99d95f77b5") $impliedByAfterDate(name = "pythonWorkersDevPyodide", date = "2000-01-01"); # Enables Python Workers. Access to this flag is not restricted, instead bundles containing