From 7ddfdc7ea6d93a57eaf61e434b57acee53fbb78c Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Wed, 7 Mar 2018 15:55:46 -0500 Subject: [PATCH 1/4] An experimental version of the Data Access API, that redirects to S3 instead of streaming. --- .../edu/harvard/iq/dataverse/api/Access.java | 28 +++------ .../dataverse/api/DownloadInstanceWriter.java | 55 ++++++++++++++++-- .../iq/dataverse/dataaccess/S3AccessIO.java | 57 +++++++++++++++++++ 3 files changed, 115 insertions(+), 25 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Access.java b/src/main/java/edu/harvard/iq/dataverse/api/Access.java index 01e2fbd9728..e7e76f8d985 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Access.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Access.java @@ -187,7 +187,7 @@ public BundleDownloadInstance datafileBundle(@PathParam("fileId") Long fileId, @ @Path("datafile/{fileId}") @GET @Produces({ "application/xml" }) - public DownloadInstance datafile(@PathParam("fileId") Long fileId, @QueryParam("gbrecs") Boolean gbrecs, @QueryParam("key") String apiToken, @Context UriInfo uriInfo, @Context HttpHeaders headers, @Context HttpServletResponse response) /*throws NotFoundException, ServiceUnavailableException, PermissionDeniedException, AuthorizationRequiredException*/ { + public DownloadInstance datafile(@PathParam("fileId") Long fileId, @QueryParam("gbrecs") Boolean gbrecs, @QueryParam("key") String apiToken, @Context UriInfo uriInfo, @Context HttpHeaders headers, @Context HttpServletResponse response) { DataFile df = dataFileService.find(fileId); GuestbookResponse gbr = null; @@ -197,6 +197,11 @@ public DownloadInstance datafile(@PathParam("fileId") Long fileId, @QueryParam(" throw new WebApplicationException(Response.Status.NOT_FOUND); } + if (df.isHarvested()) { + throw new WebApplicationException(Response.Status.NOT_FOUND); + // (nobody should ever be using this API on a harvested DataFile)! + } + if (apiToken == null || apiToken.equals("")) { apiToken = headers.getHeaderString(API_KEY_HEADER); } @@ -445,13 +450,8 @@ public DownloadInstance tabularDatafileMetadataPreprocessed(@PathParam("fileId") @Path("datafiles/{fileIds}") @GET @Produces({"application/zip"}) - public /*ZippedDownloadInstance*/ Response datafiles(@PathParam("fileIds") String fileIds, @QueryParam("gbrecs") Boolean gbrecs, @QueryParam("key") String apiTokenParam, @Context UriInfo uriInfo, @Context HttpHeaders headers, @Context HttpServletResponse response) throws WebApplicationException /*throws NotFoundException, ServiceUnavailableException, PermissionDeniedException, AuthorizationRequiredException*/ { - // create a Download Instance without, without a primary Download Info object: - //ZippedDownloadInstance downloadInstance = new ZippedDownloadInstance(); + public Response datafiles(@PathParam("fileIds") String fileIds, @QueryParam("gbrecs") Boolean gbrecs, @QueryParam("key") String apiTokenParam, @Context UriInfo uriInfo, @Context HttpHeaders headers, @Context HttpServletResponse response) throws WebApplicationException /*throws NotFoundException, ServiceUnavailableException, PermissionDeniedException, AuthorizationRequiredException*/ { - - - long setLimit = systemConfig.getZipDownloadLimit(); if (!(setLimit > 0L)) { setLimit = DataFileZipper.DEFAULT_ZIPFILE_LIMIT; @@ -563,20 +563,6 @@ public void write(OutputStream os) throws IOException, return Response.ok(stream).build(); } - - /* - * Geting rid of the tempPreview API - it's always been a big, fat hack. 
- * the edit files page is now using the Base64 image strings in the preview - * URLs, just like the search and dataset pages. - @Path("tempPreview/{fileSystemId}") - @GET - @Produces({"image/png"}) - public InputStream tempPreview(@PathParam("fileSystemId") String fileSystemId, @Context UriInfo uriInfo, @Context HttpHeaders headers, @Context HttpServletResponse response) { - - }*/ - - - @Path("fileCardImage/{fileId}") @GET @Produces({ "image/png" }) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java index df4cffae28c..fe78ca91b6e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java @@ -29,9 +29,12 @@ import edu.harvard.iq.dataverse.engine.command.impl.CreateGuestbookResponseCommand; import java.io.File; import java.io.FileInputStream; +import java.net.URI; +import java.net.URISyntaxException; import java.util.ArrayList; import java.util.List; import java.util.logging.Logger; +import javax.ws.rs.RedirectionException; /** * @@ -206,6 +209,45 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] if (storageIO == null) { throw new WebApplicationException(Response.Status.SERVICE_UNAVAILABLE); } + } else { + // [immediate]todo: make this mechanism configurable! + if (storageIO instanceof S3AccessIO && isRedirectToS3()) { + // [attempt to] redirect: + String redirect_url_str = ((S3AccessIO)storageIO).generateTemporaryS3Url(); + // better exception handling here? + logger.info("Data Access API: direct S3 url: "+redirect_url_str); + URI redirect_uri; + + try { + redirect_uri = new URI(redirect_url_str); + } catch (URISyntaxException ex) { + logger.info("Data Access API: failed to create S3 redirect url ("+redirect_url_str+")"); + redirect_uri = null; + } + if (redirect_uri != null) { + // definitely close the (still open) S3 input stream, + // since we are not going to use it. The S3 documentation + // emphasizes that it is very important not to leave these + // lying around un-closed, since they are going to fill + // up the S3 connection pool! + storageIO.getInputStream().close(); + + // increment the download count, if necessary: + if (di.getGbr() != null) { + try { + logger.fine("writing guestbook response, for an S3 download redirect."); + Command cmd = new CreateGuestbookResponseCommand(di.getDataverseRequestService().getDataverseRequest(), di.getGbr(), di.getGbr().getDataFile().getOwner()); + di.getCommand().submit(cmd); + } catch (CommandException e) { + } + } + + // finally, issue the redirect: + Response response = Response.seeOther(redirect_uri).build(); + logger.info("Issuing redirect to the file location on S3."); + throw new RedirectionException(response); + } + } } InputStream instream = storageIO.getInputStream(); @@ -284,13 +326,10 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] logger.fine("writing guestbook response."); Command cmd = new CreateGuestbookResponseCommand(di.getDataverseRequestService().getDataverseRequest(), di.getGbr(), di.getGbr().getDataFile().getOwner()); di.getCommand().submit(cmd); - } catch (CommandException e) { - //if an error occurs here then download won't happen no need for response recs... 
- } + } catch (CommandException e) {} } else { logger.fine("not writing guestbook response"); } - instream.close(); outstream.close(); @@ -376,5 +415,13 @@ private long getFileSize(DownloadInstance di, String extraHeader) { } return -1; } + + private boolean isRedirectToS3() { + String optionValue = System.getProperty("dataverse.files.s3-download-redirect"); + if ("true".equalsIgnoreCase(optionValue)) { + return true; + } + return false; + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java index b75516e829d..ac4af2e01dd 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java @@ -1,6 +1,7 @@ package edu.harvard.iq.dataverse.dataaccess; import com.amazonaws.AmazonClientException; +import com.amazonaws.HttpMethod; import com.amazonaws.SdkClientException; import com.amazonaws.auth.AWSCredentials; import com.amazonaws.auth.AWSCredentialsProvider; @@ -15,10 +16,12 @@ import com.amazonaws.services.s3.model.DeleteObjectRequest; import com.amazonaws.services.s3.model.DeleteObjectsRequest; import com.amazonaws.services.s3.model.DeleteObjectsRequest.KeyVersion; +import com.amazonaws.services.s3.model.GeneratePresignedUrlRequest; import com.amazonaws.services.s3.model.GetObjectRequest; import com.amazonaws.services.s3.model.ListObjectsRequest; import com.amazonaws.services.s3.model.MultiObjectDeleteException; import com.amazonaws.services.s3.model.ObjectListing; +import com.amazonaws.services.s3.model.ResponseHeaderOverrides; import com.amazonaws.services.s3.model.S3Object; import com.amazonaws.services.s3.model.S3ObjectSummary; import edu.harvard.iq.dataverse.DataFile; @@ -35,6 +38,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.net.URL; import java.nio.channels.Channel; import java.nio.channels.Channels; import java.nio.channels.WritableByteChannel; @@ -624,4 +628,57 @@ private String getMainFileKey() throws IOException { return key; } + + public String generateTemporaryS3Url() throws IOException { + //Questions: + // Q. Should this work for private and public? + // A. Yes! Since the URL has a limited, short life span. -- L.A. + // Q. how long should the download url work? + // A. 1 hour by default seems like an OK number. Making it configurable seems like a good idea too. -- L.A. + if (s3 == null) { + throw new IOException("ERROR: s3 not initialised. "); + } + if (dvObject instanceof DataFile) { + key = getMainFileKey(); + java.util.Date expiration = new java.util.Date(); + long msec = expiration.getTime(); + msec += 1000 * getUrlExpirationMinutes(); + expiration.setTime(msec); + + GeneratePresignedUrlRequest generatePresignedUrlRequest = + new GeneratePresignedUrlRequest(bucketName, key); + generatePresignedUrlRequest.setMethod(HttpMethod.GET); // Default. 
+ generatePresignedUrlRequest.setExpiration(expiration); + ResponseHeaderOverrides responseHeaders = new ResponseHeaderOverrides(); + responseHeaders.setContentDisposition("attachment; filename="+this.getDataFile().getDisplayName()); + responseHeaders.setContentType(this.getDataFile().getContentType()); + generatePresignedUrlRequest.setResponseHeaders(responseHeaders); + + URL s = s3.generatePresignedUrl(generatePresignedUrlRequest); + + return s.toString(); + } else if (dvObject instanceof Dataset) { + throw new IOException("Data Access: GenerateTemporaryS3Url: Invalid DvObject type : Dataset"); + } else if (dvObject instanceof Dataverse) { + throw new IOException("Data Access: Invalid DvObject type : Dataverse"); + } else { + throw new IOException("Data Access: Invalid DvObject type"); + } + } + + private int getUrlExpirationMinutes() { + String optionValue = System.getProperty("dataverse.files.s3-url-expiration-minutes"); + if (optionValue != null) { + Integer num; + try { + num = new Integer(optionValue); + } catch (NumberFormatException ex) { + num = null; + } + if (num != null) { + return num; + } + } + return 60; + } } From 5c6a42f64f32b5002b132cb51751fb0dea3ad239 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Wed, 7 Mar 2018 16:49:59 -0500 Subject: [PATCH 2/4] removed an unnecessary comment (#4486) --- .../edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java index fe78ca91b6e..d0f3880f6f5 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java @@ -210,7 +210,6 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[] throw new WebApplicationException(Response.Status.SERVICE_UNAVAILABLE); } } else { - // [immediate]todo: make this mechanism configurable! if (storageIO instanceof S3AccessIO && isRedirectToS3()) { // [attempt to] redirect: String redirect_url_str = ((S3AccessIO)storageIO).generateTemporaryS3Url(); From 02c974d1a3bab2f002d24c97a7d5cf37d221344f Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Thu, 8 Mar 2018 13:00:36 -0500 Subject: [PATCH 3/4] document new JVM options for S3 #4486 --- doc/sphinx-guides/source/installation/config.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 1e36e265d46..d4de3928d83 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -339,6 +339,14 @@ Then, we'll need to identify which S3 bucket we're using. Replace ``your_bucket_ ``./asadmin create-jvm-options "-Ddataverse.files.s3-bucket-name=your_bucket_name"`` +Optionally, you can have users download files from S3 directly rather than having files pass from S3 through Glassfish to your users. To accomplish this, set ``dataverse.files.s3-download-redirect`` to ``true`` like this: + +``./asadmin create-jvm-options "-Ddataverse.files.s3-download-redirect=true"`` + +If you enable ``dataverse.files.s3-download-redirect`` as described above, note that the S3 URLs expire after an hour by default but you can configure the expiration time using the ``dataverse.files.s3-url-expiration-minutes`` JVM option. 
Here's an example of setting the expiration time to 120 minutes:

``./asadmin create-jvm-options "-Ddataverse.files.s3-url-expiration-minutes=120"``

Lastly, go ahead and restart your glassfish server. With Dataverse deployed and the site online, you should be able to upload datasets and data files and see the corresponding files in your S3 bucket. Within a bucket, the folder structure emulates that found in local file storage.

.. _Branding Your Installation:

From c6c1896b3b7e16da3683027079bf13e6546ffc80 Mon Sep 17 00:00:00 2001
From: landreev 
Date: Thu, 8 Mar 2018 16:20:46 -0500
Subject: [PATCH 4/4] Exclude tabular file from redirecting to S3

---
 .../edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java
index d0f3880f6f5..4081d710389 100644
--- a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java
+++ b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java
@@ -210,7 +210,7 @@ public void writeTo(DownloadInstance di, Class clazz, Type type, Annotation[]
             throw new WebApplicationException(Response.Status.SERVICE_UNAVAILABLE);
         }
     } else {
-        if (storageIO instanceof S3AccessIO && isRedirectToS3()) {
+        if (storageIO instanceof S3AccessIO && !(dataFile.isTabularData()) && isRedirectToS3()) {
            // [attempt to] redirect:
             String redirect_url_str = ((S3AccessIO)storageIO).generateTemporaryS3Url();
             // better exception handling here?
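
The heart of the change is the AWS SDK's presigned-URL support. The standalone sketch below is not taken from the patches; the bucket name, object key, file name, and content type are made-up placeholders. It shows the same GeneratePresignedUrlRequest pattern that S3AccessIO.generateTemporaryS3Url() relies on, including the ResponseHeaderOverrides that make S3 return the original file name and MIME type instead of the opaque storage key:

    import com.amazonaws.HttpMethod;
    import com.amazonaws.services.s3.AmazonS3;
    import com.amazonaws.services.s3.AmazonS3ClientBuilder;
    import com.amazonaws.services.s3.model.GeneratePresignedUrlRequest;
    import com.amazonaws.services.s3.model.ResponseHeaderOverrides;

    import java.net.URL;
    import java.util.Date;

    public class PresignedUrlSketch {
        public static void main(String[] args) {
            // Client built from the default credential/region provider chain.
            AmazonS3 s3 = AmazonS3ClientBuilder.defaultClient();

            // Placeholders; in the patch these come from the DataFile's storage identifier.
            String bucketName = "my-dataverse-bucket";
            String key = "10.5072/FK2/EXAMPLE/169d6a2f929-e5c6b6b2c3a1";

            // Let the URL live for 60 minutes (the patch makes this configurable).
            Date expiration = new Date(System.currentTimeMillis() + 60L * 60L * 1000L);

            // Ask S3 to serve the object under a friendly name and the right MIME type.
            ResponseHeaderOverrides responseHeaders = new ResponseHeaderOverrides()
                    .withContentDisposition("attachment; filename=mydata.tab")
                    .withContentType("text/tab-separated-values");

            GeneratePresignedUrlRequest request = new GeneratePresignedUrlRequest(bucketName, key)
                    .withMethod(HttpMethod.GET)
                    .withExpiration(expiration)
                    .withResponseHeaders(responseHeaders);

            // Anyone holding this URL can fetch the object until it expires.
            URL presignedUrl = s3.generatePresignedUrl(request);
            System.out.println(presignedUrl);
        }
    }

Because the URL is signed and short-lived, it can be handed out for both public and restricted files, which is the reasoning recorded in the comments of generateTemporaryS3Url() above.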
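On the API side, DownloadInstanceWriter aborts its normal streaming path by throwing a RedirectionException, which JAX-RS converts into a "303 See Other" response pointing at the signed URL. Below is a minimal sketch of that control flow under assumed types: RemoteStore and getTemporaryUrl() are hypothetical stand-ins for S3AccessIO and generateTemporaryS3Url(), not the project's actual interfaces.

    import java.io.IOException;
    import java.io.InputStream;
    import java.net.URI;
    import java.net.URISyntaxException;

    import javax.ws.rs.RedirectionException;
    import javax.ws.rs.core.Response;

    public class RedirectSketch {

        // Hypothetical stand-in for the storage layer used in the patch.
        interface RemoteStore {
            String getTemporaryUrl() throws IOException;
        }

        static void writeOrRedirect(RemoteStore store, InputStream alreadyOpenStream) throws IOException {
            URI redirectUri;
            try {
                redirectUri = new URI(store.getTemporaryUrl());
            } catch (URISyntaxException ex) {
                redirectUri = null; // fall back to streaming through the application server
            }

            if (redirectUri != null) {
                // Close the stream we are no longer going to read, so the S3
                // connection pool is not left holding an open connection.
                alreadyOpenStream.close();

                // Throwing aborts the MessageBodyWriter and sends "303 See Other".
                throw new RedirectionException(Response.seeOther(redirectUri).build());
            }

            // ... otherwise copy alreadyOpenStream to the servlet response as before ...
        }
    }

Closing the already-open stream before redirecting matters because, as the patch comment notes, unclosed S3 object streams tie up connections in the SDK's connection pool.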
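Both behaviours hang off plain JVM system properties, matching the options documented in PATCH 3/4. The sketch below reads them defensively; the property names are the ones the patches introduce, while the Integer.parseInt-based parsing is an illustrative simplification rather than the project's exact code.

    public class S3RedirectOptions {

        /** True only when the JVM was started with -Ddataverse.files.s3-download-redirect=true. */
        static boolean isRedirectToS3() {
            return "true".equalsIgnoreCase(System.getProperty("dataverse.files.s3-download-redirect"));
        }

        /** Presigned-URL lifetime in minutes; defaults to 60 when unset or unparseable. */
        static int getUrlExpirationMinutes() {
            String value = System.getProperty("dataverse.files.s3-url-expiration-minutes");
            if (value != null) {
                try {
                    return Integer.parseInt(value.trim());
                } catch (NumberFormatException ex) {
                    // ignore and fall through to the default
                }
            }
            return 60;
        }

        public static void main(String[] args) {
            System.out.println("redirect enabled: " + isRedirectToS3());
            System.out.println("url lifetime (minutes): " + getUrlExpirationMinutes());
        }
    }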