RESTful API: upload/delete #452
@@ -5,13 +5,14 @@
 */

const {parse: parseContentType} = require("content-type");
-const {Forbidden, InternalServerError, NotFound, Unauthorized} = require("http-errors");
+const {Forbidden, InternalServerError, NotFound, Unauthorized, UnsupportedMediaType} = require("http-errors");
const negotiateMediaType = require("negotiator/lib/mediaType");
const stream = require("stream");
const {promisify} = require("util");
const zlib = require("zlib");
+const readStream = require("raw-body");

-const {contentTypesProvided} = require("../negotiate");
+const {contentTypesProvided, contentTypesConsumed} = require("../negotiate");
const {fetch, Request} = require("../fetch");
const sources = require("../sources");
const utils = require("../utils");
@@ -135,6 +136,9 @@ const ifDatasetExists = async (req, res, next) => {
};


/* GET
 */

/* XXX TODO: Support automatically translating v1 (meta and tree) to v2
 * (main) if the latter is requested but only the former is available?
 * We could, but maybe not worth it for these new endpoints, unless/until we
@@ -170,6 +174,34 @@ const getDataset = contentTypesProvided([
]);


/* PUT
 */
const receiveDatasetSubresource = type =>
  receiveSubresource(req => req.context.dataset.subresource(type));


const putDatasetSubresource = type => contentTypesConsumed([
  [`application/vnd.nextstrain.dataset.${type}+json`, receiveDatasetSubresource(type)],
]);


const putDatasetMain = putDatasetSubresource("main");
const putDatasetRootSequence = putDatasetSubresource("root-sequence");
const putDatasetTipFrequencies = putDatasetSubresource("tip-frequencies");


const putDataset = contentTypesConsumed([
  ["application/vnd.nextstrain.dataset.main+json", putDatasetMain],
  ["application/vnd.nextstrain.dataset.root-sequence+json", putDatasetRootSequence],
  ["application/vnd.nextstrain.dataset.tip-frequencies+json", putDatasetTipFrequencies],

  /* XXX TODO: Support v1 (meta and tree) too? We could, but maybe ok to say
   * "just v2" for these new endpoints.
   *   -trs, 22 Nov 2021
   */
]);

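[Reviewer-style usage note, not part of the diff]
With the handlers above, a client uploads a dataset's main JSON by PUT-ing it with the matching vnd.nextstrain media type and expecting a 204 on success. A hypothetical client sketch follows; the URL path and file name are invented placeholders, not taken from this PR.

// Hypothetical client sketch — only the media type and the 204 success status
// come from the diff above; the URL and file name are invented for illustration.
const {readFile} = require("fs/promises");

async function uploadDatasetMain() {
  const body = await readFile("example_dataset.json");

  // Node ≥18 global fetch assumed.
  const res = await fetch("https://nextstrain.org/hypothetical/dataset/path", {
    method: "PUT",
    headers: {"Content-Type": "application/vnd.nextstrain.dataset.main+json"},
    body,
  });

  if (res.status !== 204) {
    throw new Error(`upload failed: ${res.status} ${res.statusText}`);
  }
}
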
/* Narratives
 */

@@ -198,6 +230,8 @@ const ifNarrativeExists = async (req, res, next) => {
};


/* GET
 */
const sendNarrativeSubresource = type =>
  sendSubresource(req => req.context.narrative.subresource(type));

@@ -215,6 +249,22 @@ const getNarrative = contentTypesProvided([
]);


/* PUT
 */
const receiveNarrativeSubresource = type =>
  receiveSubresource(req => req.context.narrative.subresource(type));


const putNarrativeMarkdown = contentTypesConsumed([
  ["text/vnd.nextstrain.narrative+markdown", receiveNarrativeSubresource("md")],
]);


const putNarrative = contentTypesConsumed([
  ["text/vnd.nextstrain.narrative+markdown", putNarrativeMarkdown],
]);

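[Reading aid, not part of the diff]
The real contentTypesConsumed() lives in ../negotiate and is not shown in this PR. Purely as a mental model, and with every detail below being an assumption rather than the actual implementation, a consumed-type dispatcher could look roughly like this:

// Illustrative sketch only — NOT the actual ../negotiate implementation.
// Given [mediaType, handler] pairs, return an Express handler that dispatches
// on the request's Content-Type and rejects anything unsupported with a 415.
const {parse: parseContentType} = require("content-type");
const {UnsupportedMediaType} = require("http-errors");

const contentTypesConsumedSketch = (typesAndHandlers) => async (req, res, next) => {
  const contentType = parseContentType(req).type; // throws if the header is missing/invalid

  for (const [mediaType, handler] of typesAndHandlers) {
    if (mediaType === contentType) return await handler(req, res, next);
  }

  throw new UnsupportedMediaType(`unsupported Content-Type: ${contentType}`);
};
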

/**
 * Split a dataset or narrative `path` into an array of parts.
 *
@@ -284,6 +334,108 @@ function sendSubresource(subresourceExtractor) {
}


/**
 * Generate an Express endpoint that receives a dataset or narrative
 * Subresource determined by the request.
 *
 * @param {subresourceExtractor} subresourceExtractor - Function to provide the Subresource instance from the request
 * @returns {expressEndpointAsync}
 */
function receiveSubresource(subresourceExtractor) {
  return async (req, res) => {
    const method = "PUT";
    const subresource = subresourceExtractor(req);

    /* Proxy the data through us:
     *
     *   client (browser, CLI, etc) ⟷ us (nextstrain.org) ⟷ upstream source
     */
    // eslint-disable-next-line prefer-const
    let headers = {
      "Content-Type": subresource.mediaType,
      ...copyHeaders(req, ["Content-Encoding", "Content-Length"]),

      /* XXX TODO: Consider setting Cache-Control rather than relying on
       * ambiguous defaults. Potentially impacts:
       *
       *   - Our own fetch() caching, including in sendSubresource() above
       *   - Our Charon endpoints, if upstream headers are sent to the browser?
       *   - CloudFront caching (not sure about its default behaviour)
       *   - Browsers, if fetched directly, such as by redirection
       *
       * I think a cautious initial value would be to set "private" or "public"
       * depending on the Source and then always set "must-revalidate,
       * proxy-revalidate, max-age=0". This would allow caches (ours,
       * browsers, CloudFront?) to store the data but always check upstream
       * with conditional requests.
       *   -trs, 7 Dec 2021
       */
    };

    // Body of the request as a Node stream
    let body = req;

    // Compress on the fly to gzip if it's not already gzip compressed.

[Review comment on this line]
General question about server load with the functionality being introduced here: is the idea to scale up Heroku instances if this affects performance? Do you expect any noticeable changes to performance, given that for the foreseeable future uploading data is much rarer than accessing data?

[Reply]
Right, I don't expect ~any or much noticeable change given that uploading is much rarer than viewing. Additionally,

    if (!headers["Content-Encoding"]) {
      delete headers["Content-Length"]; // won't be valid after compression
      headers["Content-Encoding"] = "gzip";
      body = body.pipe(zlib.createGzip());
    }

    if (headers["Content-Encoding"] !== "gzip") {
      throw new UnsupportedMediaType("unsupported Content-Encoding; only gzip is supported");
    }

    /* Our upstreams for PUTs are all S3, and S3 requires a Content-Length
     * header (i.e. doesn't accept streaming PUTs). If we don't have a
     * Content-Length from the request (i.e. the request is a streaming PUT or
     * we're doing on-the-fly compression), then we have to buffer the entire
     * body into memory so we can calculate length for S3.
     *
     * An alternative to buffering the whole body is to use S3's multipart
     * upload API, but the minimum part size is 5MB so some buffering would be
     * required anyway. Multipart uploads would add inherent complexity at
     * runtime and also design time, as we'd have to rework our data model.
     *
     * In a review of all the (compressed) core and group datasets (nearly
     * 11k), over 99% are less than 5MB and none are more than 15MB. Given
     * that we'd only be able to use multipart uploads for less than 1% of
     * datasets and even the largest datasets would fit comfortably in memory,
     * it doesn't seem worth implementing.
     *
     * Allow buffering up to 20MB of data after gzip compression (guaranteed by
     * Content-Encoding handling above). Requests that exceed this will get a
     * 413 error (thrown by readStream()), and if this becomes an issue we can
     * consider bumping the limit. Clients also have the option of
     * pre-compressing the data and including a Content-Length themselves so we
     * don't have to buffer it, in which case we don't limit request sizes.
     *   -trs, 21 Jan 2022
     */
    if (!headers["Content-Length"]) {
      body = await readStream(body, { limit: 20_000_000 /* 20MB */ });
    }
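
[Reviewer-style usage note, not part of the diff]
As the comment above points out, clients can avoid server-side buffering by pre-compressing and sending their own Content-Length. A hypothetical client sketch follows; the function name and its arguments are invented for illustration and nothing here is taken from the actual Nextstrain CLI.

// Hypothetical client sketch: pre-compress and include Content-Length so the
// server can pass the body straight through without buffering it in memory.
const https = require("https");
const zlib = require("zlib");
const {promisify} = require("util");
const gzip = promisify(zlib.gzip);

async function putPrecompressed(url, mediaType, json) {
  const body = await gzip(JSON.stringify(json));

  return new Promise((resolve, reject) => {
    const req = https.request(url, {
      method: "PUT",
      headers: {
        "Content-Type": mediaType,
        "Content-Encoding": "gzip",
        "Content-Length": body.byteLength,
      },
    }, (res) => {
      res.resume(); // drain the response body
      return res.statusCode === 204
        ? resolve()
        : reject(new Error(`upload failed: ${res.statusCode}`));
    });
    req.on("error", reject);
    req.end(body);
  });
}
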

    const subresourceUrl = await subresource.url(method, {
      "Content-Type": headers["Content-Type"],
      "Content-Encoding": headers["Content-Encoding"],
    });

    const upstreamRes = await fetch(subresourceUrl, {method, body, headers});

    switch (upstreamRes.status) {
      case 200:
      case 204:
        break;

      default:
        throw new InternalServerError(`upstream said: ${upstreamRes.status} ${upstreamRes.statusText}`);
    }

    return res.status(204).end();
  };
}

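[Reading aid, not part of the diff]
copyHeaders() used in receiveSubresource() is defined elsewhere in this file and is not shown in this hunk. As an assumption about its shape only, it presumably does something like the following:

// Assumed shape of the helper, for illustration only — not the actual code.
// Copy the named headers from the incoming request into a plain object,
// keeping the given capitalization and skipping headers that are absent.
function copyHeaders(req, headerNames) {
  return Object.fromEntries(
    headerNames
      .map(name => [name, req.header(name)]) // Express's req.header() is case-insensitive
      .filter(([, value]) => value !== undefined)
  );
}
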

/**
 * Fetch from an upstream server and stream the response body back through as
 * our own response body.
@@ -499,8 +651,10 @@ module.exports = {
  canonicalizeDataset,
  ifDatasetExists,
  getDataset,
+  putDataset,

  setNarrative,
  ifNarrativeExists,
  getNarrative,
+  putNarrative,
};

[Review comment]
I think covered in another PR, but I don't consider this a todo.
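
[Context note, not part of this PR]
A hypothetical sketch of how these exports might be wired into Express routes elsewhere in the codebase. The route paths, module path, and middleware ordering are all assumptions for illustration; the handlers also rely on earlier middleware (not shown) to populate req.context.

// Hypothetical wiring sketch — paths, module path, and middleware order are
// assumptions, not taken from this PR.
const express = require("express");
const {
  canonicalizeDataset, ifDatasetExists, getDataset, putDataset,
  ifNarrativeExists, getNarrative, putNarrative,
} = require("./endpoints/sources"); // assumed module path

const app = express();

// Earlier middleware (not shown here) is assumed to set req.context.dataset
// and req.context.narrative before these handlers run.
app.route("/hypothetical/dataset/path")
  .get(canonicalizeDataset, ifDatasetExists, getDataset)
  .put(putDataset);

app.route("/hypothetical/narrative/path")
  .get(ifNarrativeExists, getNarrative)
  .put(putNarrative);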