Skip to content

Commit

Permalink
PYTHON-2998 Remove md5 checksums from gridfs and remove disable_md5 (#…
Browse files Browse the repository at this point in the history
…776)

Speed up gridfs tests (shaves off about 2 minutes on macOS).
  • Loading branch information
ShaneHarvey authored Nov 5, 2021
1 parent 89f41cf commit e271315
Show file tree
Hide file tree
Showing 12 changed files with 94 additions and 129 deletions.
7 changes: 5 additions & 2 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -170,12 +170,15 @@ Breaking Changes in 4.0
are passed to the server as-is rather than the previous behavior which
substituted in a projection of ``{"_id": 1}``. This means that an empty
projection will now return the entire document, not just the ``"_id"`` field.
- :class:`~pymongo.mongo_client.MongoClient` now raises a :exc:`~pymongo.errors.ConfigurationError`
when more than one URI is passed into the ``hosts`` argument.
- :class:`~pymongo.mongo_client.MongoClient` now raises a
:exc:`~pymongo.errors.ConfigurationError` when more than one URI is passed
into the ``hosts`` argument.
- :class:`~pymongo.mongo_client.MongoClient`` now raises an
:exc:`~pymongo.errors.InvalidURI` exception
when it encounters unescaped percent signs in username and password when
parsing MongoDB URIs.
- Removed the `disable_md5` parameter for :class:`~gridfs.GridFSBucket` and
:class:`~gridfs.GridFS`. See :ref:`removed-gridfs-checksum` for details.

Notable improvements
....................
Expand Down
26 changes: 26 additions & 0 deletions doc/migrate-to-pymongo4.rst
Original file line number Diff line number Diff line change
Expand Up @@ -819,6 +819,32 @@ Changed the default JSON encoding representation from legacy to relaxed.
The json_mode parameter for :const:`bson.json_util.dumps` now defaults to
:const:`~bson.json_util.RELAXED_JSON_OPTIONS`.

GridFS changes
--------------

.. _removed-gridfs-checksum:

disable_md5 parameter is removed
................................

Removed the `disable_md5` option for :class:`~gridfs.GridFSBucket` and
:class:`~gridfs.GridFS`. GridFS no longer generates checksums.
Applications that desire a file digest should implement it outside GridFS
and store it with other file metadata. For example::

import hashlib
my_db = MongoClient().test
fs = GridFSBucket(my_db)
grid_in = fs.open_upload_stream("test_file")
file_data = b'...'
sha356 = hashlib.sha256(file_data).hexdigest()
grid_in.write(file_data)
grid_in.sha356 = sha356 # Set the custom 'sha356' field
grid_in.close()

Note that for large files, the checksum may need to be computed in chunks
to avoid the excessive memory needed to load the entire file at once.

Removed features with no migration path
---------------------------------------

Expand Down
49 changes: 22 additions & 27 deletions gridfs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
class GridFS(object):
"""An instance of GridFS on top of a single Database.
"""
def __init__(self, database, collection="fs", disable_md5=False):
def __init__(self, database, collection="fs"):
"""Create a new instance of :class:`GridFS`.
Raises :class:`TypeError` if `database` is not an instance of
Expand All @@ -48,14 +48,18 @@ def __init__(self, database, collection="fs", disable_md5=False):
:Parameters:
- `database`: database to use
- `collection` (optional): root collection to use
- `disable_md5` (optional): When True, MD5 checksums will not be
computed for uploaded files. Useful in environments where MD5
cannot be used for regulatory or other reasons. Defaults to False.
.. versionchanged:: 4.0
Removed the `disable_md5` parameter. See
:ref:`removed-gridfs-checksum` for details.
.. versionchanged:: 3.11
Running a GridFS operation in a transaction now always raises an
error. GridFS does not support multi-document transactions.
.. versionchanged:: 3.7
Added the `disable_md5` parameter.
.. versionchanged:: 3.1
Indexes are only ensured on the first write to the DB.
Expand All @@ -77,7 +81,6 @@ def __init__(self, database, collection="fs", disable_md5=False):
self.__collection = database[collection]
self.__files = self.__collection.files
self.__chunks = self.__collection.chunks
self.__disable_md5 = disable_md5

def new_file(self, **kwargs):
"""Create a new file in GridFS.
Expand All @@ -93,8 +96,7 @@ def new_file(self, **kwargs):
:Parameters:
- `**kwargs` (optional): keyword arguments for file creation
"""
return GridIn(
self.__collection, disable_md5=self.__disable_md5, **kwargs)
return GridIn(self.__collection, **kwargs)

def put(self, data, **kwargs):
"""Put data in GridFS as a new file.
Expand Down Expand Up @@ -126,8 +128,7 @@ def put(self, data, **kwargs):
.. versionchanged:: 3.0
w=0 writes to GridFS are now prohibited.
"""
grid_file = GridIn(
self.__collection, disable_md5=self.__disable_md5, **kwargs)
grid_file = GridIn(self.__collection, **kwargs)
try:
grid_file.write(data)
finally:
Expand Down Expand Up @@ -423,7 +424,7 @@ class GridFSBucket(object):

def __init__(self, db, bucket_name="fs",
chunk_size_bytes=DEFAULT_CHUNK_SIZE, write_concern=None,
read_preference=None, disable_md5=False):
read_preference=None):
"""Create a new instance of :class:`GridFSBucket`.
Raises :exc:`TypeError` if `database` is not an instance of
Expand All @@ -442,13 +443,17 @@ def __init__(self, db, bucket_name="fs",
(the default) db.write_concern is used.
- `read_preference` (optional): The read preference to use. If
``None`` (the default) db.read_preference is used.
- `disable_md5` (optional): When True, MD5 checksums will not be
computed for uploaded files. Useful in environments where MD5
cannot be used for regulatory or other reasons. Defaults to False.
.. versionchanged:: 4.0
Removed the `disable_md5` parameter. See
:ref:`removed-gridfs-checksum` for details.
.. versionchanged:: 3.11
Running a GridFS operation in a transaction now always raises an
error. GridFSBucket does not support multi-document transactions.
Running a GridFSBucket operation in a transaction now always raises
an error. GridFSBucket does not support multi-document transactions.
.. versionchanged:: 3.7
Added the `disable_md5` parameter.
.. versionadded:: 3.1
Expand All @@ -465,8 +470,6 @@ def __init__(self, db, bucket_name="fs",

self._bucket_name = bucket_name
self._collection = db[bucket_name]
self._disable_md5 = disable_md5

self._chunks = self._collection.chunks.with_options(
write_concern=write_concern,
read_preference=read_preference)
Expand Down Expand Up @@ -522,11 +525,7 @@ def open_upload_stream(self, filename, chunk_size_bytes=None,
if metadata is not None:
opts["metadata"] = metadata

return GridIn(
self._collection,
session=session,
disable_md5=self._disable_md5,
**opts)
return GridIn(self._collection, session=session, **opts)

def open_upload_stream_with_id(
self, file_id, filename, chunk_size_bytes=None, metadata=None,
Expand Down Expand Up @@ -579,11 +578,7 @@ def open_upload_stream_with_id(
if metadata is not None:
opts["metadata"] = metadata

return GridIn(
self._collection,
session=session,
disable_md5=self._disable_md5,
**opts)
return GridIn(self._collection, session=session, **opts)

def upload_from_stream(self, filename, source, chunk_size_bytes=None,
metadata=None, session=None):
Expand Down
23 changes: 8 additions & 15 deletions gridfs/grid_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

"""Tools for representing files stored in GridFS."""
import datetime
import hashlib
import io
import math
import os
Expand Down Expand Up @@ -115,8 +114,7 @@ def _disallow_transactions(session):
class GridIn(object):
"""Class to write data to GridFS.
"""
def __init__(
self, root_collection, session=None, disable_md5=False, **kwargs):
def __init__(self, root_collection, session=None, **kwargs):
"""Write a file to GridFS
Application developers should generally not need to
Expand Down Expand Up @@ -152,12 +150,15 @@ def __init__(
- `session` (optional): a
:class:`~pymongo.client_session.ClientSession` to use for all
commands
- `disable_md5` (optional): When True, an MD5 checksum will not be
computed for the uploaded file. Useful in environments where
MD5 cannot be used for regulatory or other reasons. Defaults to
False.
- `**kwargs` (optional): file level options (see above)
.. versionchanged:: 4.0
Removed the `disable_md5` parameter. See
:ref:`removed-gridfs-checksum` for details.
.. versionchanged:: 3.7
Added the `disable_md5` parameter.
.. versionchanged:: 3.6
Added ``session`` parameter.
Expand All @@ -183,8 +184,6 @@ def __init__(
coll = _clear_entity_type_registry(
root_collection, read_preference=ReadPreference.PRIMARY)

if not disable_md5:
kwargs["md5"] = hashlib.md5()
# Defaults
kwargs["_id"] = kwargs.get("_id", ObjectId())
kwargs["chunkSize"] = kwargs.get("chunkSize", DEFAULT_CHUNK_SIZE)
Expand Down Expand Up @@ -271,9 +270,6 @@ def __flush_data(self, data):
"""Flush `data` to a chunk.
"""
self.__ensure_indexes()
if 'md5' in self._file:
self._file['md5'].update(data)

if not data:
return
assert(len(data) <= self.chunk_size)
Expand Down Expand Up @@ -301,9 +297,6 @@ def __flush(self):
"""
try:
self.__flush_buffer()

if "md5" in self._file:
self._file["md5"] = self._file["md5"].hexdigest()
# The GridFS spec says length SHOULD be an Int64.
self._file["length"] = Int64(self._position)
self._file["uploadDate"] = datetime.datetime.utcnow()
Expand Down
7 changes: 7 additions & 0 deletions test/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -949,6 +949,13 @@ def setUpClass(cls):
else:
cls.credentials = {}

def cleanup_colls(self, *collections):
"""Cleanup collections faster than drop_collection."""
for c in collections:
c = self.client[c.database.name][c.name]
c.delete_many({})
c.drop_indexes()

def patch_system_certs(self, ca_certs):
patcher = SystemCertsPatcher(ca_certs)
self.addCleanup(patcher.disable)
Expand Down
8 changes: 0 additions & 8 deletions test/gridfs/upload.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
"length": 0,
"chunkSize": 4,
"uploadDate": "*actual",
"md5": "d41d8cd98f00b204e9800998ecf8427e",
"filename": "filename"
}
]
Expand Down Expand Up @@ -62,7 +61,6 @@
"length": 1,
"chunkSize": 4,
"uploadDate": "*actual",
"md5": "47ed733b8d10be225eceba344d533586",
"filename": "filename"
}
]
Expand Down Expand Up @@ -108,7 +106,6 @@
"length": 3,
"chunkSize": 4,
"uploadDate": "*actual",
"md5": "bafae3a174ab91fc70db7a6aa50f4f52",
"filename": "filename"
}
]
Expand Down Expand Up @@ -154,7 +151,6 @@
"length": 4,
"chunkSize": 4,
"uploadDate": "*actual",
"md5": "7e7c77cff5705d1f7574a25ef6662117",
"filename": "filename"
}
]
Expand Down Expand Up @@ -200,7 +196,6 @@
"length": 5,
"chunkSize": 4,
"uploadDate": "*actual",
"md5": "283d4fea5dded59cf837d3047328f5af",
"filename": "filename"
}
]
Expand Down Expand Up @@ -254,7 +249,6 @@
"length": 8,
"chunkSize": 4,
"uploadDate": "*actual",
"md5": "dd254cdc958e53abaa67da9f797125f5",
"filename": "filename"
}
]
Expand Down Expand Up @@ -309,7 +303,6 @@
"length": 1,
"chunkSize": 4,
"uploadDate": "*actual",
"md5": "47ed733b8d10be225eceba344d533586",
"filename": "filename",
"contentType": "image/jpeg"
}
Expand Down Expand Up @@ -359,7 +352,6 @@
"length": 1,
"chunkSize": 4,
"uploadDate": "*actual",
"md5": "47ed733b8d10be225eceba344d533586",
"filename": "filename",
"metadata": {
"x": 1
Expand Down
2 changes: 1 addition & 1 deletion test/test_custom_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -672,7 +672,7 @@ def test_grid_out_custom_opts(self):
self.assertEqual(["foo"], two.aliases)
self.assertEqual({"foo": 'red', "bar": 'blue'}, two.metadata)
self.assertEqual(3, two.bar)
self.assertEqual("5eb63bbbe01eeed093cb22bb8f5acdc3", two.md5)
self.assertEqual(None, two.md5)

for attr in ["_id", "name", "content_type", "length", "chunk_size",
"upload_date", "aliases", "metadata", "md5"]:
Expand Down
14 changes: 5 additions & 9 deletions test/test_grid_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,7 @@ def test_grid_in_custom_opts(self):
class TestGridFile(IntegrationTest):

def setUp(self):
self.db.drop_collection('fs.files')
self.db.drop_collection('fs.chunks')
self.cleanup_colls(self.db.fs.files, self.db.fs.chunks)

def test_basic(self):
f = GridIn(self.db.fs, filename="test")
Expand Down Expand Up @@ -112,7 +111,7 @@ def test_md5(self):
f = GridIn(self.db.fs)
f.write(b"hello world\n")
f.close()
self.assertEqual("6f5902ac237024bdd0c176cb93063dc4", f.md5)
self.assertEqual(None, f.md5)

def test_alternate_collection(self):
self.db.alt.files.delete_many({})
Expand All @@ -128,9 +127,6 @@ def test_alternate_collection(self):
g = GridOut(self.db.alt, f._id)
self.assertEqual(b"hello world", g.read())

# test that md5 still works...
self.assertEqual("5eb63bbbe01eeed093cb22bb8f5acdc3", g.md5)

def test_grid_in_default_opts(self):
self.assertRaises(TypeError, GridIn, "foo")

Expand Down Expand Up @@ -194,7 +190,7 @@ def test_grid_in_default_opts(self):

self.assertEqual({"foo": 1}, a.metadata)

self.assertEqual("d41d8cd98f00b204e9800998ecf8427e", a.md5)
self.assertEqual(None, a.md5)
self.assertRaises(AttributeError, setattr, a, "md5", 5)

# Make sure custom attributes that were set both before and after
Expand Down Expand Up @@ -225,7 +221,7 @@ def test_grid_out_default_opts(self):
self.assertTrue(isinstance(b.upload_date, datetime.datetime))
self.assertEqual(None, b.aliases)
self.assertEqual(None, b.metadata)
self.assertEqual("d41d8cd98f00b204e9800998ecf8427e", b.md5)
self.assertEqual(None, b.md5)

for attr in ["_id", "name", "content_type", "length", "chunk_size",
"upload_date", "aliases", "metadata", "md5"]:
Expand Down Expand Up @@ -266,7 +262,7 @@ def test_grid_out_custom_opts(self):
self.assertEqual(["foo"], two.aliases)
self.assertEqual({"foo": 1, "bar": 2}, two.metadata)
self.assertEqual(3, two.bar)
self.assertEqual("5eb63bbbe01eeed093cb22bb8f5acdc3", two.md5)
self.assertEqual(None, two.md5)

for attr in ["_id", "name", "content_type", "length", "chunk_size",
"upload_date", "aliases", "metadata", "md5"]:
Expand Down
Loading

0 comments on commit e271315

Please sign in to comment.