Merge pull request #4489 from IQSS/4486-redirect-to-s3
redirect to S3 instead of streaming.
kcondon authored Mar 8, 2018
2 parents 8346361 + c6c1896 commit d37efb8
Showing 4 changed files with 122 additions and 25 deletions.
8 changes: 8 additions & 0 deletions doc/sphinx-guides/source/installation/config.rst
@@ -339,6 +339,14 @@ Then, we'll need to identify which S3 bucket we're using. Replace ``your_bucket_

``./asadmin create-jvm-options "-Ddataverse.files.s3-bucket-name=your_bucket_name"``

Optionally, you can have users download files from S3 directly rather than having files pass from S3 through Glassfish to your users. To accomplish this, set ``dataverse.files.s3-download-redirect`` to ``true`` like this:

``./asadmin create-jvm-options "-Ddataverse.files.s3-download-redirect=true"``

If you enable ``dataverse.files.s3-download-redirect`` as described above, note that the S3 URLs expire after an hour by default, but you can configure the expiration time with the ``dataverse.files.s3-url-expiration-minutes`` JVM option. Here's an example of setting the expiration time to 120 minutes:

``./asadmin create-jvm-options "-Ddataverse.files.s3-url-expiration-minutes=120"``
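
At runtime both settings are plain JVM system properties, and the Java changes later in this commit read them with ``System.getProperty``. A minimal, hypothetical sketch of that lookup (the class name is made up; the defaults mirror the documented behavior of redirect off and a 60-minute URL lifetime):

public class S3DownloadOptions {

    static boolean redirectToS3() {
        // only an explicit "true" turns the redirect on
        return "true".equalsIgnoreCase(System.getProperty("dataverse.files.s3-download-redirect"));
    }

    static int urlExpirationMinutes() {
        try {
            // fall back to the one-hour default when the option is absent or malformed
            return Integer.parseInt(System.getProperty("dataverse.files.s3-url-expiration-minutes", "60"));
        } catch (NumberFormatException ex) {
            return 60;
        }
    }
}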

Lastly, go ahead and restart your Glassfish server. With Dataverse deployed and the site online, you should be able to upload datasets and data files and see the corresponding files in your S3 bucket. Within a bucket, the folder structure emulates that found in local file storage.

.. _Branding Your Installation:
28 changes: 7 additions & 21 deletions src/main/java/edu/harvard/iq/dataverse/api/Access.java
@@ -187,7 +187,7 @@ public BundleDownloadInstance datafileBundle(@PathParam("fileId") Long fileId, @
@Path("datafile/{fileId}")
@GET
@Produces({ "application/xml" })
public DownloadInstance datafile(@PathParam("fileId") Long fileId, @QueryParam("gbrecs") Boolean gbrecs, @QueryParam("key") String apiToken, @Context UriInfo uriInfo, @Context HttpHeaders headers, @Context HttpServletResponse response) /*throws NotFoundException, ServiceUnavailableException, PermissionDeniedException, AuthorizationRequiredException*/ {
public DownloadInstance datafile(@PathParam("fileId") Long fileId, @QueryParam("gbrecs") Boolean gbrecs, @QueryParam("key") String apiToken, @Context UriInfo uriInfo, @Context HttpHeaders headers, @Context HttpServletResponse response) {
DataFile df = dataFileService.find(fileId);
GuestbookResponse gbr = null;

@@ -197,6 +197,11 @@ public DownloadInstance datafile(@PathParam("fileId") Long fileId, @QueryParam("
throw new WebApplicationException(Response.Status.NOT_FOUND);
}

if (df.isHarvested()) {
throw new WebApplicationException(Response.Status.NOT_FOUND);
// (nobody should ever be using this API on a harvested DataFile)!
}

if (apiToken == null || apiToken.equals("")) {
apiToken = headers.getHeaderString(API_KEY_HEADER);
}
@@ -445,13 +450,8 @@ public DownloadInstance tabularDatafileMetadataPreprocessed(@PathParam("fileId")
@Path("datafiles/{fileIds}")
@GET
@Produces({"application/zip"})
public /*ZippedDownloadInstance*/ Response datafiles(@PathParam("fileIds") String fileIds, @QueryParam("gbrecs") Boolean gbrecs, @QueryParam("key") String apiTokenParam, @Context UriInfo uriInfo, @Context HttpHeaders headers, @Context HttpServletResponse response) throws WebApplicationException /*throws NotFoundException, ServiceUnavailableException, PermissionDeniedException, AuthorizationRequiredException*/ {
// create a Download Instance without, without a primary Download Info object:
//ZippedDownloadInstance downloadInstance = new ZippedDownloadInstance();
public Response datafiles(@PathParam("fileIds") String fileIds, @QueryParam("gbrecs") Boolean gbrecs, @QueryParam("key") String apiTokenParam, @Context UriInfo uriInfo, @Context HttpHeaders headers, @Context HttpServletResponse response) throws WebApplicationException /*throws NotFoundException, ServiceUnavailableException, PermissionDeniedException, AuthorizationRequiredException*/ {




long setLimit = systemConfig.getZipDownloadLimit();
if (!(setLimit > 0L)) {
setLimit = DataFileZipper.DEFAULT_ZIPFILE_LIMIT;
@@ -563,20 +563,6 @@ public void write(OutputStream os) throws IOException,
return Response.ok(stream).build();
}


/*
* Geting rid of the tempPreview API - it's always been a big, fat hack.
* the edit files page is now using the Base64 image strings in the preview
* URLs, just like the search and dataset pages.
@Path("tempPreview/{fileSystemId}")
@GET
@Produces({"image/png"})
public InputStream tempPreview(@PathParam("fileSystemId") String fileSystemId, @Context UriInfo uriInfo, @Context HttpHeaders headers, @Context HttpServletResponse response) {
}*/



@Path("fileCardImage/{fileId}")
@GET
@Produces({ "image/png" })
54 changes: 50 additions & 4 deletions src/main/java/edu/harvard/iq/dataverse/api/DownloadInstanceWriter.java
@@ -29,9 +29,12 @@
import edu.harvard.iq.dataverse.engine.command.impl.CreateGuestbookResponseCommand;
import java.io.File;
import java.io.FileInputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Logger;
import javax.ws.rs.RedirectionException;

/**
*
@@ -206,6 +209,44 @@ public void writeTo(DownloadInstance di, Class<?> clazz, Type type, Annotation[]
if (storageIO == null) {
throw new WebApplicationException(Response.Status.SERVICE_UNAVAILABLE);
}
} else {
if (storageIO instanceof S3AccessIO && !(dataFile.isTabularData()) && isRedirectToS3()) {
// [attempt to] redirect:
String redirect_url_str = ((S3AccessIO)storageIO).generateTemporaryS3Url();
// better exception handling here?
logger.info("Data Access API: direct S3 url: "+redirect_url_str);
URI redirect_uri;

try {
redirect_uri = new URI(redirect_url_str);
} catch (URISyntaxException ex) {
logger.info("Data Access API: failed to create S3 redirect url ("+redirect_url_str+")");
redirect_uri = null;
}
if (redirect_uri != null) {
// definitely close the (still open) S3 input stream,
// since we are not going to use it. The S3 documentation
// emphasizes that it is very important not to leave these
// lying around un-closed, since they are going to fill
// up the S3 connection pool!
storageIO.getInputStream().close();

// increment the download count, if necessary:
if (di.getGbr() != null) {
try {
logger.fine("writing guestbook response, for an S3 download redirect.");
Command<?> cmd = new CreateGuestbookResponseCommand(di.getDataverseRequestService().getDataverseRequest(), di.getGbr(), di.getGbr().getDataFile().getOwner());
di.getCommand().submit(cmd);
} catch (CommandException e) {
// if the guestbook response cannot be recorded, the redirect download still proceeds
}
}

// finally, issue the redirect:
Response response = Response.seeOther(redirect_uri).build();
logger.info("Issuing redirect to the file location on S3.");
throw new RedirectionException(response);
}
}
}

InputStream instream = storageIO.getInputStream();
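// --- illustration only, not part of this commit -----------------------------
// The stream-closing comment above is about AWS SDK behavior: an open S3
// object stream holds on to an HTTP connection from the client's pool, so a
// stream that will not be fully read must be closed explicitly. In standalone
// code the usual idiom is try-with-resources ("s3", the bucket name, and the
// key below are placeholders):
//
//     try (S3Object object = s3.getObject(new GetObjectRequest("your_bucket_name", "some/key"));
//          InputStream in = object.getObjectContent()) {
//         // read what is needed; closing returns the connection to the pool
//     }
// -----------------------------------------------------------------------------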
Expand Down Expand Up @@ -284,13 +325,10 @@ public void writeTo(DownloadInstance di, Class<?> clazz, Type type, Annotation[]
logger.fine("writing guestbook response.");
Command<?> cmd = new CreateGuestbookResponseCommand(di.getDataverseRequestService().getDataverseRequest(), di.getGbr(), di.getGbr().getDataFile().getOwner());
di.getCommand().submit(cmd);
} catch (CommandException e) {
//if an error occurs here then download won't happen no need for response recs...
}
} catch (CommandException e) {}
} else {
logger.fine("not writing guestbook response");
}


instream.close();
outstream.close();
@@ -376,5 +414,13 @@ private long getFileSize(DownloadInstance di, String extraHeader) {
}
return -1;
}

private boolean isRedirectToS3() {
String optionValue = System.getProperty("dataverse.files.s3-download-redirect");
if ("true".equalsIgnoreCase(optionValue)) {
return true;
}
return false;
}

}
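
For API clients, the practical effect of the redirect branch added above is that a request to /api/access/datafile/{id} may now answer with a 303 See Other pointing at a short-lived S3 URL instead of streaming the bytes through Glassfish, so clients have to follow redirects. A rough, hypothetical client sketch using the Java 11+ java.net.http client (the host name, file id, and output file are placeholders):

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.file.Path;

public class RedirectDownloadExample {
    public static void main(String[] args) throws Exception {
        // follow the 303 redirect from the Access API to the temporary S3 URL
        HttpClient client = HttpClient.newBuilder()
                .followRedirects(HttpClient.Redirect.NORMAL)
                .build();
        HttpRequest request = HttpRequest.newBuilder(
                URI.create("https://dataverse.example.edu/api/access/datafile/42"))
                .GET()
                .build();
        // the body handler writes whatever the final (S3) response returns
        HttpResponse<Path> response = client.send(
                request, HttpResponse.BodyHandlers.ofFile(Path.of("datafile-42.bin")));
        System.out.println("HTTP " + response.statusCode() + " -> " + response.body());
    }
}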
57 changes: 57 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java
@@ -1,6 +1,7 @@
package edu.harvard.iq.dataverse.dataaccess;

import com.amazonaws.AmazonClientException;
import com.amazonaws.HttpMethod;
import com.amazonaws.SdkClientException;
import com.amazonaws.auth.AWSCredentials;
import com.amazonaws.auth.AWSCredentialsProvider;
@@ -15,10 +16,12 @@
import com.amazonaws.services.s3.model.DeleteObjectRequest;
import com.amazonaws.services.s3.model.DeleteObjectsRequest;
import com.amazonaws.services.s3.model.DeleteObjectsRequest.KeyVersion;
import com.amazonaws.services.s3.model.GeneratePresignedUrlRequest;
import com.amazonaws.services.s3.model.GetObjectRequest;
import com.amazonaws.services.s3.model.ListObjectsRequest;
import com.amazonaws.services.s3.model.MultiObjectDeleteException;
import com.amazonaws.services.s3.model.ObjectListing;
import com.amazonaws.services.s3.model.ResponseHeaderOverrides;
import com.amazonaws.services.s3.model.S3Object;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import edu.harvard.iq.dataverse.DataFile;
@@ -35,6 +38,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.nio.channels.Channel;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
@@ -624,4 +628,57 @@ private String getMainFileKey() throws IOException {

return key;
}

public String generateTemporaryS3Url() throws IOException {
//Questions:
// Q. Should this work for private and public?
// A. Yes! Since the URL has a limited, short life span. -- L.A.
// Q. how long should the download url work?
// A. 1 hour by default seems like an OK number. Making it configurable seems like a good idea too. -- L.A.
if (s3 == null) {
throw new IOException("ERROR: s3 not initialised. ");
}
if (dvObject instanceof DataFile) {
key = getMainFileKey();
java.util.Date expiration = new java.util.Date();
long msec = expiration.getTime();
msec += 60 * 1000 * getUrlExpirationMinutes(); // convert minutes to milliseconds
expiration.setTime(msec);

GeneratePresignedUrlRequest generatePresignedUrlRequest =
new GeneratePresignedUrlRequest(bucketName, key);
generatePresignedUrlRequest.setMethod(HttpMethod.GET); // Default.
generatePresignedUrlRequest.setExpiration(expiration);
ResponseHeaderOverrides responseHeaders = new ResponseHeaderOverrides();
responseHeaders.setContentDisposition("attachment; filename="+this.getDataFile().getDisplayName());
responseHeaders.setContentType(this.getDataFile().getContentType());
generatePresignedUrlRequest.setResponseHeaders(responseHeaders);

URL s = s3.generatePresignedUrl(generatePresignedUrlRequest);

return s.toString();
} else if (dvObject instanceof Dataset) {
throw new IOException("Data Access: GenerateTemporaryS3Url: Invalid DvObject type : Dataset");
} else if (dvObject instanceof Dataverse) {
throw new IOException("Data Access: Invalid DvObject type : Dataverse");
} else {
throw new IOException("Data Access: Invalid DvObject type");
}
}

private int getUrlExpirationMinutes() {
String optionValue = System.getProperty("dataverse.files.s3-url-expiration-minutes");
if (optionValue != null) {
Integer num;
try {
num = new Integer(optionValue);
} catch (NumberFormatException ex) {
num = null;
}
if (num != null) {
return num;
}
}
return 60;
}
}
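
Taken in isolation, the presigned-URL mechanism that generateTemporaryS3Url() relies on can be exercised with just the AWS SDK classes imported in this diff. A stand-alone, hypothetical sketch (bucket name, key, and filename are placeholders; credentials come from the SDK's default provider chain):

import com.amazonaws.HttpMethod;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
import com.amazonaws.services.s3.model.GeneratePresignedUrlRequest;
import com.amazonaws.services.s3.model.ResponseHeaderOverrides;
import java.net.URL;
import java.util.Date;

public class PresignedUrlExample {
    public static void main(String[] args) {
        AmazonS3 s3 = AmazonS3ClientBuilder.defaultClient();
        // expire the URL 60 minutes from now (minutes -> seconds -> milliseconds)
        Date expiration = new Date(System.currentTimeMillis() + 60L * 60L * 1000L);

        GeneratePresignedUrlRequest request =
                new GeneratePresignedUrlRequest("your_bucket_name", "path/to/object")
                        .withMethod(HttpMethod.GET)
                        .withExpiration(expiration);
        // ask S3 to serve the object as a download with a friendly filename
        ResponseHeaderOverrides headers = new ResponseHeaderOverrides()
                .withContentDisposition("attachment; filename=mydata.tab");
        request.setResponseHeaders(headers);

        URL url = s3.generatePresignedUrl(request);
        System.out.println(url);
    }
}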
