Dataset files cleanup #9132

Merged (15 commits) Jan 27, 2023

Changes from 12 commits
3 changes: 3 additions & 0 deletions doc/release-notes/9130-cleanup-storage.md
@@ -0,0 +1,3 @@
### Support for cleaning up files in datasets' storage

Experimental feature: all the files stored in the Dataset storage location that are not in the file list of that Dataset can be removed with the new native API call (/api/datasets/$id/cleanStorage).
28 changes: 28 additions & 0 deletions doc/sphinx-guides/source/api/native-api.rst
@@ -1513,6 +1513,34 @@ The fully expanded example above (without environment variables) looks like this

curl -H "X-Dataverse-key: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X POST "https://demo.dataverse.org/api/datasets/:persistentId/add?persistentId=doi:10.5072/FK2/J8SJZB" -F 'jsonData={"description":"A remote image.","storageIdentifier":"trsa://themes/custom/qdr/images/CoreTrustSeal-logo-transparent.png","checksumType":"MD5","md5Hash":"509ef88afa907eaf2c17c1c8d8fde77e","label":"testlogo.png","fileName":"testlogo.png","mimeType":"image/png"}'

.. _cleanup-storage-api:

Cleanup storage of a Dataset
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If your Dataverse installation has been configured to support direct uploads, or in some other situations,
you could end up with files in a dataset's storage location that are not linked to that dataset. Most commonly this
happens when an upload fails partway through a transfer: for example, if a user starts a direct upload in the UI and leaves the page
without hitting Cancel or Save, Dataverse never learns about the files and does not clean them up. Similarly, with the direct upload API, if the final /addFiles call is never made, the files are abandoned.

You might also want to remove cached export files or some temporary files, thumbnails, etc.

All the files stored in the Dataset storage location that are not in the file list of that Dataset can be removed, as shown in the example below.
@landreev (Contributor) commented on Jan 24, 2023:

Is it necessary to clarify here that only files that look like leftover datafiles will be deleted, and that Dataverse will skip any files it can't positively identify as such? So, if they happen to have anything else useful stored in the same directory - not that we ever recommend doing that - it will be spared?
I'm not sure whether it is in fact necessary to explain this, so I'm leaving this up to you.

Contributor commented:

But it would be great to add a reminder here that this is an experimental feature, like in the release note, and a suggestion to make sure their backups are up-to-date before attempting this on production servers. And also, could you please document the "dryrun" parameter, and tell them sternly to start with that. Thanks!

Author (Collaborator) replied:

Thanks. I will rework the documentation to reflect the current solution and add the "experimental feature" warning there. The dry run parameter is a very good idea for testing before using it, and should be explained (I did not do that yet after adding the feature...). I will give extra warning to the swift users, as it is not yet tested on swift...

Author (Collaborator) replied:

@landreev
Thank you for your review! I have made the remaining changes and updated the documentation. Could you have a look at the latest commits? Thanks.


.. code-block:: bash

export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
export SERVER_URL=https://demo.dataverse.org
export PERSISTENT_ID=doi:10.5072/FK2/J8SJZB

curl -H "X-Dataverse-key: $API_TOKEN" -X GET "$SERVER_URL/api/datasets/:persistentId/cleanStorage?persistentId=$PERSISTENT_ID"

The fully expanded example above (without environment variables) looks like this:

.. code-block:: bash

curl -H "X-Dataverse-key: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X GET "https://demo.dataverse.org/api/datasets/:persistentId/cleanStorage?persistentId=doi:10.5072/FK2/J8SJZB"
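
The implementation further down in this diff also accepts a "dryrun" query parameter, which the documentation in this commit does not yet cover (see the review comments above). A minimal sketch of previewing the cleanup without deleting anything, assuming the parameter is simply appended to the same call:

.. code-block:: bash

  curl -H "X-Dataverse-key: $API_TOKEN" -X GET "$SERVER_URL/api/datasets/:persistentId/cleanStorage?persistentId=$PERSISTENT_ID&dryrun=true"

The response lists the dataset's registered file names under "Found" and, under "Deleted", the files that were removed (or, with dryrun, would be removed).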

Adding Files To a Dataset via Other Tools
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

74 changes: 74 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/api/Datasets.java
@@ -67,6 +67,7 @@
import edu.harvard.iq.dataverse.dataaccess.DataAccess;
import edu.harvard.iq.dataverse.dataaccess.ImageThumbConverter;
import edu.harvard.iq.dataverse.dataaccess.S3AccessIO;
import edu.harvard.iq.dataverse.dataaccess.StorageIO;
import edu.harvard.iq.dataverse.engine.command.exception.CommandException;
import edu.harvard.iq.dataverse.engine.command.exception.UnforcedCommandException;
import edu.harvard.iq.dataverse.engine.command.impl.GetDatasetStorageSizeCommand;
@@ -114,11 +115,13 @@
import java.time.LocalDateTime;
import java.util.*;
import java.util.concurrent.*;
import java.util.function.Predicate;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.util.Map.Entry;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import javax.ejb.EJB;
@@ -155,6 +158,7 @@
public class Datasets extends AbstractApiBean {

private static final Logger logger = Logger.getLogger(Datasets.class.getCanonicalName());
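// Storage file names generated by Dataverse look like "<11 hex chars>-<12 hex chars>", optionally followed
// by a suffix (hypothetical examples: "18e7f0db72a-4c3f2a1b9d8e", "18e7f0db72a-4c3f2a1b9d8e.orig"); the
// dataFilePattern below is used to tell such files apart from anything else found in a dataset's storage location.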
private static final Pattern dataFilePattern = Pattern.compile("^[0-9a-f]{11}-[0-9a-f]{12}\\.?.*");

@Inject DataverseSession session;

@@ -2502,6 +2506,76 @@ public Response addFileToDataset(@PathParam("id") String idSupplied,
} // end: addFileToDataset


/**
* Clean storage of a Dataset
*
* @param idSupplied
* @return
*/
@GET
@Path("{id}/cleanStorage")
public Response cleanStorage(@PathParam("id") String idSupplied, @QueryParam("dryrun") Boolean dryrun) {
// get user and dataset
User authUser;
try {
authUser = findUserOrDie();
} catch (WrappedResponse ex) {
return error(Response.Status.FORBIDDEN,
BundleUtil.getStringFromBundle("file.addreplace.error.auth")
);
}

Dataset dataset;
try {
dataset = findDatasetOrDie(idSupplied);
} catch (WrappedResponse wr) {
return wr.getResponse();
}

// check permissions
if (!permissionSvc.permissionsFor(createDataverseRequest(authUser), dataset).contains(Permission.EditDataset)) {
return error(Response.Status.INTERNAL_SERVER_ERROR, "Access denied!");
}

boolean doDryRun = dryrun != null && dryrun.booleanValue();

// check if no legacy files are present
Set<String> datasetFilenames = getDatasetFilenames(dataset);
if (datasetFilenames.stream().anyMatch(x -> !dataFilePattern.matcher(x).matches())) {
return error(Response.Status.INTERNAL_SERVER_ERROR, "Dataset contains files not matching the nameing pattern!");
}

Predicate<String> filter = getToDeleteFilesFilter(datasetFilenames);
List<String> deleted;
try {
StorageIO<DvObject> datasetIO = DataAccess.getStorageIO(dataset);
deleted = datasetIO.cleanUp(filter, doDryRun);
} catch (IOException ex) {
logger.log(Level.SEVERE, null, ex);
return error(Response.Status.INTERNAL_SERVER_ERROR, "IOException! Serious Error! See administrator!");
}

return ok("Found: " + datasetFilenames.stream().collect(Collectors.joining(", ")) + "\n" + "Deleted: " + deleted.stream().collect(Collectors.joining(", ")));

}

private static Set<String> getDatasetFilenames(Dataset dataset) {
Set<String> files = new HashSet<>();
for (DataFile dataFile: dataset.getFiles()) {
String storageIdentifier = dataFile.getStorageIdentifier();
String location = storageIdentifier.substring(storageIdentifier.indexOf("://") + 3);
String[] locationParts = location.split(":", 3);//separate bucket, swift container, etc. from fileName
files.add(locationParts[locationParts.length-1]);
}
return files;
}
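// Example (hypothetical identifiers): "s3://demo-bucket:18e7f0db72a-4c3f2a1b9d8e" and
// "file://18e7f0db72a-4c3f2a1b9d8e" both reduce to "18e7f0db72a-4c3f2a1b9d8e" above.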

public static Predicate<String> getToDeleteFilesFilter(Set<String> datasetFilenames) {
return f -> {
return dataFilePattern.matcher(f).matches() && datasetFilenames.stream().noneMatch(x -> f.startsWith(x));
};
}
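// For instance (hypothetical names), with datasetFilenames = {"18e7f0db72a-4c3f2a1b9d8e"} the predicate above
// selects "18e7f0db72a-aaaaaaaaaaaa" (matches the pattern, not registered) for deletion, but spares
// "18e7f0db72a-4c3f2a1b9d8e.orig" (derived from a registered file) and any name that does not match
// dataFilePattern at all - the "spared" behavior discussed in the review comments above.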

private void msg(String m) {
//System.out.println(m);
logger.fine(m);
src/main/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIO.java
@@ -33,9 +33,11 @@
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.function.Predicate;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

// Dataverse imports:
import edu.harvard.iq.dataverse.DataFile;
@@ -683,4 +685,56 @@ protected static boolean isValidIdentifier(String driverId, String storageId) {
}
return true;
}

private List<String> listAllFiles() throws IOException {
Dataset dataset = this.getDataset();
if (dataset == null) {
throw new IOException("This FileAccessIO object hasn't been properly initialized.");
}

Path datasetDirectoryPath = Paths.get(dataset.getAuthorityForFileStorage(), dataset.getIdentifierForFileStorage());
if (datasetDirectoryPath == null) {
throw new IOException("Could not determine the filesystem directory of the dataset.");
}

DirectoryStream<Path> dirStream = Files.newDirectoryStream(Paths.get(this.getFilesRootDirectory(), datasetDirectoryPath.toString()));

List<String> res = new ArrayList<>();
if (dirStream != null) {
for (Path filePath : dirStream) {
res.add(filePath.getFileName().toString());
}
dirStream.close();
}

return res;
}

private void deleteFile(String fileName) throws IOException {
Dataset dataset = this.getDataset();
if (dataset == null) {
throw new IOException("This FileAccessIO object hasn't been properly initialized.");
}

Path datasetDirectoryPath = Paths.get(dataset.getAuthorityForFileStorage(), dataset.getIdentifierForFileStorage());
if (datasetDirectoryPath == null) {
throw new IOException("Could not determine the filesystem directory of the dataset.");
}

Path p = Paths.get(this.getFilesRootDirectory(), datasetDirectoryPath.toString(), fileName);
Files.delete(p);
}

@Override
public List<String> cleanUp(Predicate<String> filter, boolean dryRun) throws IOException {
List<String> toDelete = this.listAllFiles().stream().filter(filter).collect(Collectors.toList());
if (dryRun) {
return toDelete;
}
for (String f : toDelete) {
this.deleteFile(f);
}
return toDelete;
}
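// Note: with dryRun=true the method above only reports the matching files; otherwise each of them is
// removed via deleteFile(...), which resolves the name against the dataset's directory under the files root.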

}
src/main/java/edu/harvard/iq/dataverse/dataaccess/InputStreamIO.java
@@ -14,6 +14,7 @@
import java.nio.channels.WritableByteChannel;
import java.nio.file.Path;
import java.util.List;
import java.util.function.Predicate;
import java.util.logging.Logger;

/**
@@ -159,5 +160,9 @@ public void revertBackupAsAux(String auxItemTag) throws IOException {
throw new UnsupportedDataAccessOperationException("InputStreamIO: this method is not supported in this DataAccess driver.");
}


@Override
public List<String> cleanUp(Predicate<String> filter, boolean dryRun) throws IOException {
throw new UnsupportedDataAccessOperationException("InputStreamIO: tthis method is not supported in this DataAccess driver.");
}

}
src/main/java/edu/harvard/iq/dataverse/dataaccess/RemoteOverlayAccessIO.java
@@ -24,6 +24,7 @@
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.util.List;
import java.util.function.Predicate;
import java.util.logging.Logger;

import org.apache.http.Header;
@@ -630,5 +631,9 @@ protected static boolean isValidIdentifier(String driverId, String storageId) {
public static String getBaseStoreIdFor(String driverId) {
return System.getProperty("dataverse.files." + driverId + ".base-store");
}


@Override
public List<String> cleanUp(Predicate<String> filter, boolean dryRun) throws IOException {
return baseStore.cleanUp(filter, dryRun);
}
}
src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java
@@ -60,7 +60,10 @@
import java.util.HashMap;
import java.util.List;
import java.util.Random;
import java.util.function.Predicate;
import java.util.logging.Logger;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.eclipse.microprofile.config.Config;
import org.eclipse.microprofile.config.ConfigProvider;
@@ -1306,5 +1309,75 @@ protected static boolean isValidIdentifier(String driverId, String storageId) {
return true;
}

private List<String> listAllFiles() throws IOException {
if (!this.canWrite()) {
open();
}
Dataset dataset = this.getDataset();
if (dataset == null) {
throw new IOException("This S3AccessIO object hasn't been properly initialized.");
}
String prefix = dataset.getAuthorityForFileStorage() + "/" + dataset.getIdentifierForFileStorage() + "/";

List<String> ret = new ArrayList<>();
ListObjectsRequest req = new ListObjectsRequest().withBucketName(bucketName).withPrefix(prefix);
ObjectListing storedFilesList = null;
try {
storedFilesList = s3.listObjects(req);
} catch (SdkClientException sce) {
throw new IOException ("S3 listObjects: failed to get a listing for " + prefix);
}
if (storedFilesList == null) {
return ret;
}
List<S3ObjectSummary> storedFilesSummary = storedFilesList.getObjectSummaries();
try {
while (storedFilesList.isTruncated()) {
logger.fine("S3 listObjects: going to next page of list");
storedFilesList = s3.listNextBatchOfObjects(storedFilesList);
if (storedFilesList != null) {
storedFilesSummary.addAll(storedFilesList.getObjectSummaries());
}
}
} catch (AmazonClientException ase) {
//logger.warning("Caught an AmazonServiceException in S3AccessIO.listObjects(): " + ase.getMessage());
throw new IOException("S3AccessIO: Failed to get objects for listing.");
}

for (S3ObjectSummary item : storedFilesSummary) {
String fileName = item.getKey().substring(prefix.length());
ret.add(fileName);
}
return ret;
}

private void deleteFile(String fileName) throws IOException {
if (!this.canWrite()) {
open();
}
Dataset dataset = this.getDataset();
if (dataset == null) {
throw new IOException("This S3AccessIO object hasn't been properly initialized.");
}
String prefix = dataset.getAuthorityForFileStorage() + "/" + dataset.getIdentifierForFileStorage() + "/";

try {
DeleteObjectRequest dor = new DeleteObjectRequest(bucketName, prefix + fileName);
s3.deleteObject(dor);
} catch (AmazonClientException ase) {
logger.warning("S3AccessIO: Unable to delete object " + ase.getMessage());
}
}

@Override
public List<String> cleanUp(Predicate<String> filter, boolean dryRun) throws IOException {
List<String> toDelete = this.listAllFiles().stream().filter(filter).collect(Collectors.toList());
if (dryRun) {
return toDelete;
}
for (String f : toDelete) {
this.deleteFile(f);
}
return toDelete;
}
}
src/main/java/edu/harvard/iq/dataverse/dataaccess/StorageIO.java
@@ -39,6 +39,7 @@
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@@ -622,4 +623,6 @@ protected static boolean usesStandardNamePattern(String identifier) {
return m.find();
}

public abstract List<String> cleanUp(Predicate<String> filter, boolean dryRun) throws IOException;

}