From 999d8fd60e6402bc61433aa65660812279c0c0e5 Mon Sep 17 00:00:00 2001 From: 5ec1cff <56485584+5ec1cff@users.noreply.github.com> Date: Tue, 21 Jan 2025 02:04:43 +0800 Subject: [PATCH] GH-128131: Completely support random read access of uncompressed unencrypted files in ZipFile (GH-128143) (cherry picked from commit dda02eb7be62bf0af850a7521c77c90ea997df6c) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: 5ec1cff <56485584+5ec1cff@users.noreply.github.com> Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com> Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> --- Lib/test/test_zipfile/test_core.py | 83 +++++++++++++++++++ Lib/zipfile/__init__.py | 4 +- ...-12-21-03-20-12.gh-issue-128131.QpPmNt.rst | 2 + 3 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 4ff9f9c34237c6..4b56f6a380f219 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1,3 +1,4 @@ +import _pyio import array import contextlib import importlib.util @@ -3454,5 +3455,87 @@ def test_too_short(self): b"zzz", zipfile._Extra.strip(b"zzz", (self.ZIP64_EXTRA,))) +class StatIO(_pyio.BytesIO): + """Buffer which remembers the number of bytes that were read.""" + + def __init__(self): + super().__init__() + self.bytes_read = 0 + + def read(self, size=-1): + bs = super().read(size) + self.bytes_read += len(bs) + return bs + + +class StoredZipExtFileRandomReadTest(unittest.TestCase): + """Tests whether an uncompressed, unencrypted zip entry can be randomly + seek and read without reading redundant bytes.""" + def test_stored_seek_and_read(self): + + sio = StatIO() + # 20000 bytes + txt = b'0123456789' * 2000 + + # The seek length must be greater than ZipExtFile.MIN_READ_SIZE + # as `ZipExtFile._read2()` reads in blocks of this size and we + # need to seek out of the buffered data + read_buffer_size = zipfile.ZipExtFile.MIN_READ_SIZE + self.assertGreaterEqual(10002, read_buffer_size) # for forward seek test + self.assertGreaterEqual(5003, read_buffer_size) # for backward seek test + # The read length must be less than MIN_READ_SIZE, since we assume that + # only 1 block is read in the test. + read_length = 100 + self.assertGreaterEqual(read_buffer_size, read_length) # for read() calls + + with zipfile.ZipFile(sio, "w", compression=zipfile.ZIP_STORED) as zipf: + zipf.writestr("foo.txt", txt) + + # check random seek and read on a file + with zipfile.ZipFile(sio, "r") as zipf: + with zipf.open("foo.txt", "r") as fp: + # Test this optimized read hasn't rewound and read from the + # start of the file (as in the case of the unoptimized path) + + # forward seek + old_count = sio.bytes_read + forward_seek_len = 10002 + current_pos = 0 + fp.seek(forward_seek_len, os.SEEK_CUR) + current_pos += forward_seek_len + self.assertEqual(fp.tell(), current_pos) + self.assertEqual(fp._left, fp._compress_left) + arr = fp.read(read_length) + current_pos += read_length + self.assertEqual(fp.tell(), current_pos) + self.assertEqual(arr, txt[current_pos - read_length:current_pos]) + self.assertEqual(fp._left, fp._compress_left) + read_count = sio.bytes_read - old_count + self.assertLessEqual(read_count, read_buffer_size) + + # backward seek + old_count = sio.bytes_read + backward_seek_len = 5003 + fp.seek(-backward_seek_len, os.SEEK_CUR) + current_pos -= backward_seek_len + self.assertEqual(fp.tell(), current_pos) + self.assertEqual(fp._left, fp._compress_left) + arr = fp.read(read_length) + current_pos += read_length + self.assertEqual(fp.tell(), current_pos) + self.assertEqual(arr, txt[current_pos - read_length:current_pos]) + self.assertEqual(fp._left, fp._compress_left) + read_count = sio.bytes_read - old_count + self.assertLessEqual(read_count, read_buffer_size) + + # eof flags test + fp.seek(0, os.SEEK_END) + fp.seek(12345, os.SEEK_SET) + current_pos = 12345 + arr = fp.read(read_length) + current_pos += read_length + self.assertEqual(arr, txt[current_pos - read_length:current_pos]) + + if __name__ == "__main__": unittest.main() diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 8b636094fad3ca..82e307f78e8e3d 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1163,13 +1163,15 @@ def seek(self, offset, whence=os.SEEK_SET): self._offset = buff_offset read_offset = 0 # Fast seek uncompressed unencrypted file - elif self._compress_type == ZIP_STORED and self._decrypter is None and read_offset > 0: + elif self._compress_type == ZIP_STORED and self._decrypter is None and read_offset != 0: # disable CRC checking after first seeking - it would be invalid self._expected_crc = None # seek actual file taking already buffered data into account read_offset -= len(self._readbuffer) - self._offset self._fileobj.seek(read_offset, os.SEEK_CUR) self._left -= read_offset + self._compress_left -= read_offset + self._eof = self._left <= 0 read_offset = 0 # flush read buffer self._readbuffer = b'' diff --git a/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst b/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst new file mode 100644 index 00000000000000..f4c4ebce10729c --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst @@ -0,0 +1,2 @@ +Completely support random access of uncompressed unencrypted read-only +zip files obtained by :meth:`ZipFile.open `.