Skip to content

Commit

Permalink
pythonGH-128131: Completely support random read access of uncompresse…
Browse files Browse the repository at this point in the history
…d unencrypted files in ZipFile (pythonGH-128143)

(cherry picked from commit dda02eb)

Co-authored-by: 5ec1cff <56485584+5ec1cff@users.noreply.github.com>
Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
  • Loading branch information
3 people authored and miss-islington committed Jan 20, 2025
1 parent e6cb31a commit 69cffe9
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 1 deletion.
83 changes: 83 additions & 0 deletions Lib/test/test_zipfile/test_core.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import _pyio
import array
import contextlib
import importlib.util
Expand Down Expand Up @@ -3440,5 +3441,87 @@ def test_too_short(self):
b"zzz", zipfile._strip_extra(b"zzz", (self.ZIP64_EXTRA,)))


class StatIO(_pyio.BytesIO):
    """In-memory binary buffer that counts the bytes returned by read().

    The running total is kept in ``bytes_read`` so a test can snapshot it
    before an operation and compare afterwards to learn how much data a
    consumer actually pulled out of the buffer.
    """

    def __init__(self, initial_bytes=None):
        """Create the buffer, optionally pre-filled with *initial_bytes*.

        The argument is forwarded unchanged to ``_pyio.BytesIO``; omitting
        it yields an empty buffer.
        """
        super().__init__(initial_bytes)
        # Total number of bytes returned by read() so far.
        self.bytes_read = 0

    def read(self, size=-1):
        """Read up to *size* bytes and add the amount returned to the
        ``bytes_read`` counter."""
        bs = super().read(size)
        self.bytes_read += len(bs)
        return bs


class StoredZipExtFileRandomReadTest(unittest.TestCase):
    """Tests whether an uncompressed, unencrypted zip entry can be randomly
    seeked and read without reading redundant bytes."""

    def test_stored_seek_and_read(self):
        """Seek forward, backward and from EOF inside a stored entry and
        check that each following read returns the right data while
        pulling at most one buffer's worth of bytes from the archive."""
        sio = StatIO()
        # 20000 bytes
        txt = b'0123456789' * 2000

        # Named once so the sanity checks below and the seeks further down
        # cannot drift apart.
        forward_seek_len = 10002
        backward_seek_len = 5003
        read_length = 100

        # The seek length must be greater than ZipExtFile.MIN_READ_SIZE
        # as `ZipExtFile._read2()` reads in blocks of this size and we
        # need to seek out of the buffered data
        read_buffer_size = zipfile.ZipExtFile.MIN_READ_SIZE
        # for forward seek test
        self.assertGreaterEqual(forward_seek_len, read_buffer_size)
        # for backward seek test
        self.assertGreaterEqual(backward_seek_len, read_buffer_size)
        # The read length must be less than MIN_READ_SIZE, since we assume
        # that only 1 block is read in the test.
        # for read() calls
        self.assertGreaterEqual(read_buffer_size, read_length)

        with zipfile.ZipFile(sio, "w", compression=zipfile.ZIP_STORED) as zipf:
            zipf.writestr("foo.txt", txt)

        # check random seek and read on a file
        with zipfile.ZipFile(sio, "r") as zipf:
            with zipf.open("foo.txt", "r") as fp:
                # Test this optimized read hasn't rewound and read from the
                # start of the file (as in the case of the unoptimized path)

                # forward seek
                old_count = sio.bytes_read
                current_pos = 0
                fp.seek(forward_seek_len, os.SEEK_CUR)
                current_pos += forward_seek_len
                self.assertEqual(fp.tell(), current_pos)
                # The entry is stored, so the remaining uncompressed and
                # compressed byte counts are expected to stay equal.
                self.assertEqual(fp._left, fp._compress_left)
                arr = fp.read(read_length)
                current_pos += read_length
                self.assertEqual(fp.tell(), current_pos)
                self.assertEqual(
                    arr, txt[current_pos - read_length:current_pos])
                self.assertEqual(fp._left, fp._compress_left)
                read_count = sio.bytes_read - old_count
                self.assertLessEqual(read_count, read_buffer_size)

                # backward seek
                old_count = sio.bytes_read
                fp.seek(-backward_seek_len, os.SEEK_CUR)
                current_pos -= backward_seek_len
                self.assertEqual(fp.tell(), current_pos)
                self.assertEqual(fp._left, fp._compress_left)
                arr = fp.read(read_length)
                current_pos += read_length
                self.assertEqual(fp.tell(), current_pos)
                self.assertEqual(
                    arr, txt[current_pos - read_length:current_pos])
                self.assertEqual(fp._left, fp._compress_left)
                read_count = sio.bytes_read - old_count
                self.assertLessEqual(read_count, read_buffer_size)

                # eof flags test: after seeking to the end, seeking back
                # into the data must make it readable again
                fp.seek(0, os.SEEK_END)
                fp.seek(12345, os.SEEK_SET)
                current_pos = 12345
                arr = fp.read(read_length)
                current_pos += read_length
                self.assertEqual(fp.tell(), current_pos)
                self.assertEqual(
                    arr, txt[current_pos - read_length:current_pos])


# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
4 changes: 3 additions & 1 deletion Lib/zipfile/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1140,13 +1140,15 @@ def seek(self, offset, whence=os.SEEK_SET):
self._offset = buff_offset
read_offset = 0
# Fast seek uncompressed unencrypted file
elif self._compress_type == ZIP_STORED and self._decrypter is None and read_offset > 0:
elif self._compress_type == ZIP_STORED and self._decrypter is None and read_offset != 0:
# disable CRC checking after first seeking - it would be invalid
self._expected_crc = None
# seek actual file taking already buffered data into account
read_offset -= len(self._readbuffer) - self._offset
self._fileobj.seek(read_offset, os.SEEK_CUR)
self._left -= read_offset
self._compress_left -= read_offset
self._eof = self._left <= 0
read_offset = 0
# flush read buffer
self._readbuffer = b''
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Completely support random access of uncompressed unencrypted read-only
zip files obtained by :meth:`ZipFile.open <zipfile.ZipFile.open>`.

0 comments on commit 69cffe9

Please sign in to comment.