diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst
index e5449d5f6c0..21e506f9922 100644
--- a/doc/sphinx-guides/source/installation/config.rst
+++ b/doc/sphinx-guides/source/installation/config.rst
@@ -1089,7 +1089,16 @@ This sets the base name (without dot and extension), if not set it defaults to '
 
 Dataverse calculates checksums for uploaded files so that users can determine if their file was corrupted via upload or download. This is sometimes called "file fixity": https://en.wikipedia.org/wiki/File_Fixity
 
-The default checksum algorithm used is MD5 and should be sufficient for establishing file fixity. "SHA-1" is an experimental alternate value for this setting.
+The default checksum algorithm used is MD5 and should be sufficient for establishing file fixity. "SHA-1", "SHA-256" and "SHA-512" are alternate values for this setting. For example:
+
+``curl -X PUT -d 'SHA-512' http://localhost:8080/api/admin/settings/:FileFixityChecksumAlgorithm``
+
+The fixity algorithm used on existing files can be changed by a superuser using the API. An optional query parameter (num) can be used to limit the number of updates attempted.
+The API call will only update the algorithm and checksum for a file if the existing checksum can be validated against the file.
+Statistics concerning the updates are returned in the response to the API call with details in the log.
+
+``curl http://localhost:8080/api/admin/updateHashValues/{alg}``
+``curl http://localhost:8080/api/admin/updateHashValues/{alg}?num=1``
 
 .. _:PVMinLength:
 
diff --git a/src/main/java/edu/harvard/iq/dataverse/DataFile.java b/src/main/java/edu/harvard/iq/dataverse/DataFile.java
index a6395c94f84..02061e4ecfe 100644
--- a/src/main/java/edu/harvard/iq/dataverse/DataFile.java
+++ b/src/main/java/edu/harvard/iq/dataverse/DataFile.java
@@ -101,7 +101,9 @@ public class DataFile extends DvObject implements Comparable {
     public enum ChecksumType {
 
         MD5("MD5"),
-        SHA1("SHA-1");
+        SHA1("SHA-1"),
+        SHA256("SHA-256"),
+        SHA512("SHA-512");
 
         private final String text;
 
diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Admin.java b/src/main/java/edu/harvard/iq/dataverse/api/Admin.java
index 037fc694556..3e47ca2b001 100644
--- a/src/main/java/edu/harvard/iq/dataverse/api/Admin.java
+++ b/src/main/java/edu/harvard/iq/dataverse/api/Admin.java
@@ -27,6 +27,8 @@
 import edu.harvard.iq.dataverse.confirmemail.ConfirmEmailData;
 import edu.harvard.iq.dataverse.confirmemail.ConfirmEmailException;
 import edu.harvard.iq.dataverse.confirmemail.ConfirmEmailInitResponse;
+import edu.harvard.iq.dataverse.dataaccess.DataAccessOption;
+import edu.harvard.iq.dataverse.dataaccess.StorageIO;
 import edu.harvard.iq.dataverse.engine.command.impl.PublishDataverseCommand;
 import edu.harvard.iq.dataverse.settings.Setting;
 import javax.json.Json;
@@ -41,6 +43,8 @@
 import javax.ws.rs.core.Response;
 import static edu.harvard.iq.dataverse.util.json.NullSafeJsonBuilder.jsonObjectBuilder;
 import static edu.harvard.iq.dataverse.util.json.JsonPrinter.*;
+
+import java.io.InputStream;
 import java.io.StringReader;
 import java.util.Map;
 import java.util.logging.Level;
@@ -53,6 +57,9 @@
 import javax.validation.ConstraintViolationException;
 import javax.ws.rs.Produces;
 import javax.ws.rs.core.Response.Status;
+
+import org.apache.commons.io.IOUtils;
+
 import java.util.List;
 import edu.harvard.iq.dataverse.authorization.AuthTestDataServiceBean;
 import edu.harvard.iq.dataverse.authorization.RoleAssignee;
@@ -65,6 +72,7 @@
 import edu.harvard.iq.dataverse.ingest.IngestServiceBean;
 import edu.harvard.iq.dataverse.userdata.UserListMaker;
 import edu.harvard.iq.dataverse.userdata.UserListResult;
+import edu.harvard.iq.dataverse.util.FileUtil;
 import java.util.Date;
 import java.util.ResourceBundle;
 import javax.inject.Inject;
@@ -1101,6 +1109,115 @@ public Response registerDataFileAll() {
             + " unregistered, published files registered successfully.");
     }
 
+    @GET
+    @Path("/updateHashValues/{alg}")
+    public Response updateHashValues(@PathParam("alg") String alg, @QueryParam("num") int num) {
+        Integer count = fileService.findAll().size();
+        Integer successes = 0;
+        Integer alreadyUpdated = 0;
+        Integer rehashed = 0;
+        Integer harvested = 0;
+
+        if (num <= 0)
+            num = Integer.MAX_VALUE;
+        DataFile.ChecksumType cType = null;
+        try {
+            cType = DataFile.ChecksumType.fromString(alg);
+        } catch (IllegalArgumentException iae) {
+            return error(Status.BAD_REQUEST, "Unknown algorithm");
+        }
+        logger.info("Starting to rehash: analyzing " + count + " files. " + new Date());
+        logger.info("Hashes not created with " + alg + " will be verified, and, if valid, replaced with a hash using "
+                + alg);
+        try {
+            User u = findAuthenticatedUserOrDie();
+            if (!u.isSuperuser())
+                return error(Status.UNAUTHORIZED, "must be superuser");
+        } catch (WrappedResponse e1) {
+            return error(Status.UNAUTHORIZED, "api key required");
+        }
+
+        for (DataFile df : fileService.findAll()) {
+            if (rehashed.intValue() >= num)
+                break;
+            InputStream in = null;
+            InputStream in2 = null;
+            try {
+                if (df.isHarvested()) {
+                    harvested++;
+                } else {
+                    if (!df.getChecksumType().equals(cType)) {
+
+                        rehashed++;
+                        logger.fine(rehashed + ": Datafile: " + df.getFileMetadata().getLabel() + ", "
+                                + df.getIdentifier());
+                        // verify hash and calc new one to replace it
+                        StorageIO<DataFile> storage = df.getStorageIO();
+                        storage.open(DataAccessOption.READ_ACCESS);
+                        if (!df.isTabularData()) {
+                            in = storage.getInputStream();
+                        } else {
+                            // if this is a tabular file, read the preserved original "auxiliary file"
+                            // instead:
+                            in = storage.getAuxFileAsInputStream(FileUtil.SAVED_ORIGINAL_FILENAME_EXTENSION);
+                        }
+                        if (in == null)
+                            logger.warning("Cannot retrieve file.");
+                        String currentChecksum = FileUtil.CalculateChecksum(in, df.getChecksumType());
+                        if (currentChecksum.equals(df.getChecksumValue())) {
+                            logger.fine("Current checksum for datafile: " + df.getFileMetadata().getLabel() + ", "
+                                    + df.getIdentifier() + " is valid");
+                            storage.open(DataAccessOption.READ_ACCESS);
+                            if (!df.isTabularData()) {
+                                in2 = storage.getInputStream();
+                            } else {
+                                // if this is a tabular file, read the preserved original "auxiliary file"
+                                // instead:
+                                in2 = storage.getAuxFileAsInputStream(FileUtil.SAVED_ORIGINAL_FILENAME_EXTENSION);
+                            }
+                            if (in2 == null)
+                                logger.warning("Cannot retrieve file to calculate new checksum.");
+                            String newChecksum = FileUtil.CalculateChecksum(in2, cType);
+
+                            df.setChecksumType(cType);
+                            df.setChecksumValue(newChecksum);
+                            successes++;
+                            if (successes % 100 == 0) {
+                                logger.info(
+                                        successes + " of " + count + " files rehashed successfully. " + new Date());
+                            }
+                        } else {
+                            logger.warning("Problem: Current checksum for datafile: " + df.getFileMetadata().getLabel()
+                                    + ", " + df.getIdentifier() + " is INVALID");
+                        }
+                    } else {
+                        alreadyUpdated++;
+                        if (alreadyUpdated % 100 == 0) {
+                            logger.info(alreadyUpdated + " of " + count
+                                    + " files already have hashes with the new algorithm. " + new Date());
" + new Date()); + } + } + } + } catch (Exception e) { + logger.warning("Unexpected Exception: " + e.getMessage()); + + } finally { + IOUtils.closeQuietly(in); + IOUtils.closeQuietly(in2); + } + } + logger.info("Final Results:"); + logger.info(harvested + " harvested files skipped."); + logger.info( + alreadyUpdated + " of " + count + " files already had hashes with the new algorithm. " + new Date()); + logger.info(rehashed + " of " + count + " files to rehash. " + new Date()); + logger.info( + successes + " of " + rehashed + " files successfully rehashed with the new algorithm. " + new Date()); + + return ok("Datafile rehashing complete." + successes + " of " + rehashed + " files successfully rehashed."); + } + + @DELETE @Path("/clearMetricsCache") public Response clearMetricsCache() {