From e27131546c4cd1f9535d795825f7f73cd8309bc9 Mon Sep 17 00:00:00 2001 From: Shane Harvey Date: Thu, 4 Nov 2021 17:25:11 -0700 Subject: [PATCH] PYTHON-2998 Remove md5 checksums from gridfs and remove disable_md5 (#776) Speed up gridfs tests (shaves off about 2 minutes on macOS). --- doc/changelog.rst | 7 ++++-- doc/migrate-to-pymongo4.rst | 26 ++++++++++++++++++++ gridfs/__init__.py | 49 +++++++++++++++++-------------------- gridfs/grid_file.py | 23 ++++++----------- test/__init__.py | 7 ++++++ test/gridfs/upload.json | 8 ------ test/test_custom_types.py | 2 +- test/test_grid_file.py | 14 ++++------- test/test_gridfs.py | 32 ++++++------------------ test/test_gridfs_bucket.py | 45 ++++++++-------------------------- test/test_gridfs_spec.py | 6 ++--- test/utils_spec_runner.py | 4 +-- 12 files changed, 94 insertions(+), 129 deletions(-) diff --git a/doc/changelog.rst b/doc/changelog.rst index 31e5d5d1de..3b3a700b3b 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -170,12 +170,15 @@ Breaking Changes in 4.0 are passed to the server as-is rather than the previous behavior which substituted in a projection of ``{"_id": 1}``. This means that an empty projection will now return the entire document, not just the ``"_id"`` field. -- :class:`~pymongo.mongo_client.MongoClient` now raises a :exc:`~pymongo.errors.ConfigurationError` - when more than one URI is passed into the ``hosts`` argument. +- :class:`~pymongo.mongo_client.MongoClient` now raises a + :exc:`~pymongo.errors.ConfigurationError` when more than one URI is passed + into the ``hosts`` argument. - :class:`~pymongo.mongo_client.MongoClient`` now raises an :exc:`~pymongo.errors.InvalidURI` exception when it encounters unescaped percent signs in username and password when parsing MongoDB URIs. +- Removed the `disable_md5` parameter for :class:`~gridfs.GridFSBucket` and + :class:`~gridfs.GridFS`. See :ref:`removed-gridfs-checksum` for details. Notable improvements .................... diff --git a/doc/migrate-to-pymongo4.rst b/doc/migrate-to-pymongo4.rst index c0288c104e..c8ac401a18 100644 --- a/doc/migrate-to-pymongo4.rst +++ b/doc/migrate-to-pymongo4.rst @@ -819,6 +819,32 @@ Changed the default JSON encoding representation from legacy to relaxed. The json_mode parameter for :const:`bson.json_util.dumps` now defaults to :const:`~bson.json_util.RELAXED_JSON_OPTIONS`. +GridFS changes +-------------- + +.. _removed-gridfs-checksum: + +disable_md5 parameter is removed +................................ + +Removed the `disable_md5` option for :class:`~gridfs.GridFSBucket` and +:class:`~gridfs.GridFS`. GridFS no longer generates checksums. +Applications that desire a file digest should implement it outside GridFS +and store it with other file metadata. For example:: + + import hashlib + my_db = MongoClient().test + fs = GridFSBucket(my_db) + grid_in = fs.open_upload_stream("test_file") + file_data = b'...' + sha356 = hashlib.sha256(file_data).hexdigest() + grid_in.write(file_data) + grid_in.sha356 = sha356 # Set the custom 'sha356' field + grid_in.close() + +Note that for large files, the checksum may need to be computed in chunks +to avoid the excessive memory needed to load the entire file at once. + Removed features with no migration path --------------------------------------- diff --git a/gridfs/__init__.py b/gridfs/__init__.py index 2a637398e3..c36d921e8c 100644 --- a/gridfs/__init__.py +++ b/gridfs/__init__.py @@ -39,7 +39,7 @@ class GridFS(object): """An instance of GridFS on top of a single Database. """ - def __init__(self, database, collection="fs", disable_md5=False): + def __init__(self, database, collection="fs"): """Create a new instance of :class:`GridFS`. Raises :class:`TypeError` if `database` is not an instance of @@ -48,14 +48,18 @@ def __init__(self, database, collection="fs", disable_md5=False): :Parameters: - `database`: database to use - `collection` (optional): root collection to use - - `disable_md5` (optional): When True, MD5 checksums will not be - computed for uploaded files. Useful in environments where MD5 - cannot be used for regulatory or other reasons. Defaults to False. + + .. versionchanged:: 4.0 + Removed the `disable_md5` parameter. See + :ref:`removed-gridfs-checksum` for details. .. versionchanged:: 3.11 Running a GridFS operation in a transaction now always raises an error. GridFS does not support multi-document transactions. + .. versionchanged:: 3.7 + Added the `disable_md5` parameter. + .. versionchanged:: 3.1 Indexes are only ensured on the first write to the DB. @@ -77,7 +81,6 @@ def __init__(self, database, collection="fs", disable_md5=False): self.__collection = database[collection] self.__files = self.__collection.files self.__chunks = self.__collection.chunks - self.__disable_md5 = disable_md5 def new_file(self, **kwargs): """Create a new file in GridFS. @@ -93,8 +96,7 @@ def new_file(self, **kwargs): :Parameters: - `**kwargs` (optional): keyword arguments for file creation """ - return GridIn( - self.__collection, disable_md5=self.__disable_md5, **kwargs) + return GridIn(self.__collection, **kwargs) def put(self, data, **kwargs): """Put data in GridFS as a new file. @@ -126,8 +128,7 @@ def put(self, data, **kwargs): .. versionchanged:: 3.0 w=0 writes to GridFS are now prohibited. """ - grid_file = GridIn( - self.__collection, disable_md5=self.__disable_md5, **kwargs) + grid_file = GridIn(self.__collection, **kwargs) try: grid_file.write(data) finally: @@ -423,7 +424,7 @@ class GridFSBucket(object): def __init__(self, db, bucket_name="fs", chunk_size_bytes=DEFAULT_CHUNK_SIZE, write_concern=None, - read_preference=None, disable_md5=False): + read_preference=None): """Create a new instance of :class:`GridFSBucket`. Raises :exc:`TypeError` if `database` is not an instance of @@ -442,13 +443,17 @@ def __init__(self, db, bucket_name="fs", (the default) db.write_concern is used. - `read_preference` (optional): The read preference to use. If ``None`` (the default) db.read_preference is used. - - `disable_md5` (optional): When True, MD5 checksums will not be - computed for uploaded files. Useful in environments where MD5 - cannot be used for regulatory or other reasons. Defaults to False. + + .. versionchanged:: 4.0 + Removed the `disable_md5` parameter. See + :ref:`removed-gridfs-checksum` for details. .. versionchanged:: 3.11 - Running a GridFS operation in a transaction now always raises an - error. GridFSBucket does not support multi-document transactions. + Running a GridFSBucket operation in a transaction now always raises + an error. GridFSBucket does not support multi-document transactions. + + .. versionchanged:: 3.7 + Added the `disable_md5` parameter. .. versionadded:: 3.1 @@ -465,8 +470,6 @@ def __init__(self, db, bucket_name="fs", self._bucket_name = bucket_name self._collection = db[bucket_name] - self._disable_md5 = disable_md5 - self._chunks = self._collection.chunks.with_options( write_concern=write_concern, read_preference=read_preference) @@ -522,11 +525,7 @@ def open_upload_stream(self, filename, chunk_size_bytes=None, if metadata is not None: opts["metadata"] = metadata - return GridIn( - self._collection, - session=session, - disable_md5=self._disable_md5, - **opts) + return GridIn(self._collection, session=session, **opts) def open_upload_stream_with_id( self, file_id, filename, chunk_size_bytes=None, metadata=None, @@ -579,11 +578,7 @@ def open_upload_stream_with_id( if metadata is not None: opts["metadata"] = metadata - return GridIn( - self._collection, - session=session, - disable_md5=self._disable_md5, - **opts) + return GridIn(self._collection, session=session, **opts) def upload_from_stream(self, filename, source, chunk_size_bytes=None, metadata=None, session=None): diff --git a/gridfs/grid_file.py b/gridfs/grid_file.py index 3e455dc932..fc01d88d24 100644 --- a/gridfs/grid_file.py +++ b/gridfs/grid_file.py @@ -14,7 +14,6 @@ """Tools for representing files stored in GridFS.""" import datetime -import hashlib import io import math import os @@ -115,8 +114,7 @@ def _disallow_transactions(session): class GridIn(object): """Class to write data to GridFS. """ - def __init__( - self, root_collection, session=None, disable_md5=False, **kwargs): + def __init__(self, root_collection, session=None, **kwargs): """Write a file to GridFS Application developers should generally not need to @@ -152,12 +150,15 @@ def __init__( - `session` (optional): a :class:`~pymongo.client_session.ClientSession` to use for all commands - - `disable_md5` (optional): When True, an MD5 checksum will not be - computed for the uploaded file. Useful in environments where - MD5 cannot be used for regulatory or other reasons. Defaults to - False. - `**kwargs` (optional): file level options (see above) + .. versionchanged:: 4.0 + Removed the `disable_md5` parameter. See + :ref:`removed-gridfs-checksum` for details. + + .. versionchanged:: 3.7 + Added the `disable_md5` parameter. + .. versionchanged:: 3.6 Added ``session`` parameter. @@ -183,8 +184,6 @@ def __init__( coll = _clear_entity_type_registry( root_collection, read_preference=ReadPreference.PRIMARY) - if not disable_md5: - kwargs["md5"] = hashlib.md5() # Defaults kwargs["_id"] = kwargs.get("_id", ObjectId()) kwargs["chunkSize"] = kwargs.get("chunkSize", DEFAULT_CHUNK_SIZE) @@ -271,9 +270,6 @@ def __flush_data(self, data): """Flush `data` to a chunk. """ self.__ensure_indexes() - if 'md5' in self._file: - self._file['md5'].update(data) - if not data: return assert(len(data) <= self.chunk_size) @@ -301,9 +297,6 @@ def __flush(self): """ try: self.__flush_buffer() - - if "md5" in self._file: - self._file["md5"] = self._file["md5"].hexdigest() # The GridFS spec says length SHOULD be an Int64. self._file["length"] = Int64(self._position) self._file["uploadDate"] = datetime.datetime.utcnow() diff --git a/test/__init__.py b/test/__init__.py index d9144e8623..ab53b7fdc5 100644 --- a/test/__init__.py +++ b/test/__init__.py @@ -949,6 +949,13 @@ def setUpClass(cls): else: cls.credentials = {} + def cleanup_colls(self, *collections): + """Cleanup collections faster than drop_collection.""" + for c in collections: + c = self.client[c.database.name][c.name] + c.delete_many({}) + c.drop_indexes() + def patch_system_certs(self, ca_certs): patcher = SystemCertsPatcher(ca_certs) self.addCleanup(patcher.disable) diff --git a/test/gridfs/upload.json b/test/gridfs/upload.json index 324ac49d23..7d4adec1d8 100644 --- a/test/gridfs/upload.json +++ b/test/gridfs/upload.json @@ -29,7 +29,6 @@ "length": 0, "chunkSize": 4, "uploadDate": "*actual", - "md5": "d41d8cd98f00b204e9800998ecf8427e", "filename": "filename" } ] @@ -62,7 +61,6 @@ "length": 1, "chunkSize": 4, "uploadDate": "*actual", - "md5": "47ed733b8d10be225eceba344d533586", "filename": "filename" } ] @@ -108,7 +106,6 @@ "length": 3, "chunkSize": 4, "uploadDate": "*actual", - "md5": "bafae3a174ab91fc70db7a6aa50f4f52", "filename": "filename" } ] @@ -154,7 +151,6 @@ "length": 4, "chunkSize": 4, "uploadDate": "*actual", - "md5": "7e7c77cff5705d1f7574a25ef6662117", "filename": "filename" } ] @@ -200,7 +196,6 @@ "length": 5, "chunkSize": 4, "uploadDate": "*actual", - "md5": "283d4fea5dded59cf837d3047328f5af", "filename": "filename" } ] @@ -254,7 +249,6 @@ "length": 8, "chunkSize": 4, "uploadDate": "*actual", - "md5": "dd254cdc958e53abaa67da9f797125f5", "filename": "filename" } ] @@ -309,7 +303,6 @@ "length": 1, "chunkSize": 4, "uploadDate": "*actual", - "md5": "47ed733b8d10be225eceba344d533586", "filename": "filename", "contentType": "image/jpeg" } @@ -359,7 +352,6 @@ "length": 1, "chunkSize": 4, "uploadDate": "*actual", - "md5": "47ed733b8d10be225eceba344d533586", "filename": "filename", "metadata": { "x": 1 diff --git a/test/test_custom_types.py b/test/test_custom_types.py index 83d8c8a2c5..5db208ab7e 100644 --- a/test/test_custom_types.py +++ b/test/test_custom_types.py @@ -672,7 +672,7 @@ def test_grid_out_custom_opts(self): self.assertEqual(["foo"], two.aliases) self.assertEqual({"foo": 'red', "bar": 'blue'}, two.metadata) self.assertEqual(3, two.bar) - self.assertEqual("5eb63bbbe01eeed093cb22bb8f5acdc3", two.md5) + self.assertEqual(None, two.md5) for attr in ["_id", "name", "content_type", "length", "chunk_size", "upload_date", "aliases", "metadata", "md5"]: diff --git a/test/test_grid_file.py b/test/test_grid_file.py index 37a1f905a7..a53e40c4c9 100644 --- a/test/test_grid_file.py +++ b/test/test_grid_file.py @@ -80,8 +80,7 @@ def test_grid_in_custom_opts(self): class TestGridFile(IntegrationTest): def setUp(self): - self.db.drop_collection('fs.files') - self.db.drop_collection('fs.chunks') + self.cleanup_colls(self.db.fs.files, self.db.fs.chunks) def test_basic(self): f = GridIn(self.db.fs, filename="test") @@ -112,7 +111,7 @@ def test_md5(self): f = GridIn(self.db.fs) f.write(b"hello world\n") f.close() - self.assertEqual("6f5902ac237024bdd0c176cb93063dc4", f.md5) + self.assertEqual(None, f.md5) def test_alternate_collection(self): self.db.alt.files.delete_many({}) @@ -128,9 +127,6 @@ def test_alternate_collection(self): g = GridOut(self.db.alt, f._id) self.assertEqual(b"hello world", g.read()) - # test that md5 still works... - self.assertEqual("5eb63bbbe01eeed093cb22bb8f5acdc3", g.md5) - def test_grid_in_default_opts(self): self.assertRaises(TypeError, GridIn, "foo") @@ -194,7 +190,7 @@ def test_grid_in_default_opts(self): self.assertEqual({"foo": 1}, a.metadata) - self.assertEqual("d41d8cd98f00b204e9800998ecf8427e", a.md5) + self.assertEqual(None, a.md5) self.assertRaises(AttributeError, setattr, a, "md5", 5) # Make sure custom attributes that were set both before and after @@ -225,7 +221,7 @@ def test_grid_out_default_opts(self): self.assertTrue(isinstance(b.upload_date, datetime.datetime)) self.assertEqual(None, b.aliases) self.assertEqual(None, b.metadata) - self.assertEqual("d41d8cd98f00b204e9800998ecf8427e", b.md5) + self.assertEqual(None, b.md5) for attr in ["_id", "name", "content_type", "length", "chunk_size", "upload_date", "aliases", "metadata", "md5"]: @@ -266,7 +262,7 @@ def test_grid_out_custom_opts(self): self.assertEqual(["foo"], two.aliases) self.assertEqual({"foo": 1, "bar": 2}, two.metadata) self.assertEqual(3, two.bar) - self.assertEqual("5eb63bbbe01eeed093cb22bb8f5acdc3", two.md5) + self.assertEqual(None, two.md5) for attr in ["_id", "name", "content_type", "length", "chunk_size", "upload_date", "aliases", "metadata", "md5"]: diff --git a/test/test_gridfs.py b/test/test_gridfs.py index e6b84f2977..d7d5a74e5f 100644 --- a/test/test_gridfs.py +++ b/test/test_gridfs.py @@ -97,10 +97,8 @@ def setUpClass(cls): cls.alt = gridfs.GridFS(cls.db, "alt") def setUp(self): - self.db.drop_collection("fs.files") - self.db.drop_collection("fs.chunks") - self.db.drop_collection("alt.files") - self.db.drop_collection("alt.chunks") + self.cleanup_colls(self.db.fs.files, self.db.fs.chunks, + self.db.alt.files, self.db.alt.chunks) def test_basic(self): oid = self.fs.put(b"hello world") @@ -158,7 +156,7 @@ def test_empty_file(self): self.assertEqual(oid, raw["_id"]) self.assertTrue(isinstance(raw["uploadDate"], datetime.datetime)) self.assertEqual(255 * 1024, raw["chunkSize"]) - self.assertTrue(isinstance(raw["md5"], str)) + self.assertNotIn("md5", raw) def test_corrupt_chunk(self): files_id = self.fs.put(b'foobar') @@ -174,12 +172,11 @@ def test_corrupt_chunk(self): self.fs.delete(files_id) def test_put_ensures_index(self): - # setUp has dropped collections. - names = self.db.list_collection_names() - self.assertFalse([name for name in names if name.startswith('fs')]) - chunks = self.db.fs.chunks files = self.db.fs.files + # Ensure the collections are removed. + chunks.drop() + files.drop() self.fs.put(b"junk") self.assertTrue(any( @@ -484,21 +481,6 @@ def test_unacknowledged(self): def test_md5(self): gin = self.fs.new_file() - gin.write(b"includes md5 sum") - gin.close() - self.assertIsNotNone(gin.md5) - md5sum = gin.md5 - - gout = self.fs.get(gin._id) - self.assertIsNotNone(gout.md5) - self.assertEqual(md5sum, gout.md5) - - _id = self.fs.put(b"also includes md5 sum") - gout = self.fs.get(_id) - self.assertIsNotNone(gout.md5) - - fs = gridfs.GridFS(self.db, disable_md5=True) - gin = fs.new_file() gin.write(b"no md5 sum") gin.close() self.assertIsNone(gin.md5) @@ -506,7 +488,7 @@ def test_md5(self): gout = self.fs.get(gin._id) self.assertIsNone(gout.md5) - _id = fs.put(b"still no md5 sum") + _id = self.fs.put(b"still no md5 sum") gout = self.fs.get(_id) self.assertIsNone(gout.md5) diff --git a/test/test_gridfs_bucket.py b/test/test_gridfs_bucket.py index 8b4c2cf346..499643f673 100644 --- a/test/test_gridfs_bucket.py +++ b/test/test_gridfs_bucket.py @@ -86,10 +86,8 @@ def setUpClass(cls): cls.db, bucket_name="alt") def setUp(self): - self.db.drop_collection("fs.files") - self.db.drop_collection("fs.chunks") - self.db.drop_collection("alt.files") - self.db.drop_collection("alt.chunks") + self.cleanup_colls(self.db.fs.files, self.db.fs.chunks, + self.db.alt.files, self.db.alt.chunks) def test_basic(self): oid = self.fs.upload_from_stream("test_filename", @@ -105,7 +103,6 @@ def test_basic(self): self.assertEqual(0, self.db.fs.chunks.count_documents({})) def test_multi_chunk_delete(self): - self.db.fs.drop() self.assertEqual(0, self.db.fs.files.count_documents({})) self.assertEqual(0, self.db.fs.chunks.count_documents({})) gfs = gridfs.GridFSBucket(self.db) @@ -130,7 +127,7 @@ def test_empty_file(self): self.assertEqual(oid, raw["_id"]) self.assertTrue(isinstance(raw["uploadDate"], datetime.datetime)) self.assertEqual(255 * 1024, raw["chunkSize"]) - self.assertTrue(isinstance(raw["md5"], str)) + self.assertNotIn("md5", raw) def test_corrupt_chunk(self): files_id = self.fs.upload_from_stream("test_filename", @@ -147,12 +144,11 @@ def test_corrupt_chunk(self): self.fs.delete(files_id) def test_upload_ensures_index(self): - # setUp has dropped collections. - names = self.db.list_collection_names() - self.assertFalse([name for name in names if name.startswith('fs')]) - chunks = self.db.fs.chunks files = self.db.fs.files + # Ensure the collections are removed. + chunks.drop() + files.drop() self.fs.upload_from_stream("filename", b"junk") self.assertTrue(any( @@ -464,41 +460,20 @@ def test_download_to_stream_by_name(self): self.assertEqual(file1.read(), file2.read()) def test_md5(self): - gin = self.fs.open_upload_stream("has md5") - gin.write(b"includes md5 sum") - gin.close() - self.assertIsNotNone(gin.md5) - md5sum = gin.md5 - - gout = self.fs.open_download_stream(gin._id) - self.assertIsNotNone(gout.md5) - self.assertEqual(md5sum, gout.md5) - - gin = self.fs.open_upload_stream_with_id(ObjectId(), "also has md5") - gin.write(b"also includes md5 sum") - gin.close() - self.assertIsNotNone(gin.md5) - md5sum = gin.md5 - - gout = self.fs.open_download_stream(gin._id) - self.assertIsNotNone(gout.md5) - self.assertEqual(md5sum, gout.md5) - - fs = gridfs.GridFSBucket(self.db, disable_md5=True) - gin = fs.open_upload_stream("no md5") + gin = self.fs.open_upload_stream("no md5") gin.write(b"no md5 sum") gin.close() self.assertIsNone(gin.md5) - gout = fs.open_download_stream(gin._id) + gout = self.fs.open_download_stream(gin._id) self.assertIsNone(gout.md5) - gin = fs.open_upload_stream_with_id(ObjectId(), "also no md5") + gin = self.fs.open_upload_stream_with_id(ObjectId(), "also no md5") gin.write(b"also no md5 sum") gin.close() self.assertIsNone(gin.md5) - gout = fs.open_download_stream(gin._id) + gout = self.fs.open_download_stream(gin._id) self.assertIsNone(gout.md5) diff --git a/test/test_gridfs_spec.py b/test/test_gridfs_spec.py index e9e097568c..86449db370 100644 --- a/test/test_gridfs_spec.py +++ b/test/test_gridfs_spec.py @@ -66,10 +66,8 @@ def setUpClass(cls): "download_by_name": cls.fs.open_download_stream_by_name} def init_db(self, data, test): - self.db.drop_collection("fs.files") - self.db.drop_collection("fs.chunks") - self.db.drop_collection("expected.files") - self.db.drop_collection("expected.chunks") + self.cleanup_colls(self.db.fs.files, self.db.fs.chunks, + self.db.expected.files, self.db.expected.chunks) # Read in data. if data['files']: diff --git a/test/utils_spec_runner.py b/test/utils_spec_runner.py index d552a18ca2..f3a9c4390a 100644 --- a/test/utils_spec_runner.py +++ b/test/utils_spec_runner.py @@ -270,9 +270,7 @@ def run_operation(self, sessions, collection, operation): if object_name == 'gridfsbucket': # Only create the GridFSBucket when we need it (for the gridfs # retryable reads tests). - obj = GridFSBucket( - database, bucket_name=collection.name, - disable_md5=True) + obj = GridFSBucket(database, bucket_name=collection.name) else: objects = { 'client': database.client,