From f089c53239a4366feaf5b436bfdfac83ab1df12a Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Tue, 31 Jan 2023 14:40:52 +0100 Subject: [PATCH 01/12] =?UTF-8?q?Implement=20PEP=20706=20=E2=80=93=20Filte?= =?UTF-8?q?r=20for=20tarfile.extractall?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Doc/library/shutil.rst | 25 +- Doc/library/tarfile.rst | 457 ++++++++- Lib/shutil.py | 17 +- Lib/tarfile.py | 369 ++++++- Lib/test/test_shutil.py | 43 +- Lib/test/test_tarfile.py | 929 +++++++++++++++++- ...-03-23-15-24-38.gh-issue-102953.YR4KaK.rst | 4 + 7 files changed, 1749 insertions(+), 95 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2023-03-23-15-24-38.gh-issue-102953.YR4KaK.rst diff --git a/Doc/library/shutil.rst b/Doc/library/shutil.rst index 799505858f7f5a..772f555a8a46e4 100644 --- a/Doc/library/shutil.rst +++ b/Doc/library/shutil.rst @@ -539,7 +539,7 @@ provided. They rely on the :mod:`zipfile` and :mod:`tarfile` modules. Remove the archive format *name* from the list of supported formats. -.. function:: unpack_archive(filename[, extract_dir[, format]]) +.. function:: unpack_archive(filename[, extract_dir[, format[, filter]]]) Unpack an archive. *filename* is the full path of the archive. @@ -553,9 +553,19 @@ provided. They rely on the :mod:`zipfile` and :mod:`tarfile` modules. registered for that extension. In case none is found, a :exc:`ValueError` is raised. + The keyword-only *filter* argument is passed to the underlying unpacking + function. For zip files, *filter* is not accepted. + For tar files, it is recommended to set it to ``'data'``, + unless using features specific to tar and UNIX-like filesystems. + (See :ref:`tarfile-extraction-filter` for details.) + The ``'data'`` filter will become the default for tar files + in Python 3.14. + .. versionchanged:: 3.7 Accepts a :term:`path-like object` for *filename* and *extract_dir*. + .. versionchanged:: 3.8.17 + Added the *filter* argument. .. function:: register_unpack_format(name, extensions, function[, extra_args[, description]]) @@ -564,11 +574,14 @@ provided. They rely on the :mod:`zipfile` and :mod:`tarfile` modules. ``.zip`` for Zip files. *function* is the callable that will be used to unpack archives. The - callable will receive the path of the archive, followed by the directory - the archive must be extracted to. - - When provided, *extra_args* is a sequence of ``(name, value)`` tuples that - will be passed as keywords arguments to the callable. + callable will receive: + + - the path of the archive, as a positional argument; + - the directory the archive must be extracted to, as a positional argument; + - possibly a *filter* keyword argument, if it was given to + :func:`unpack_archive`; + - additional keyword arguments, specified by *extra_args* as a sequence + of ``(name, value)`` tuples. *description* can be provided to describe the format, and will be returned by the :func:`get_unpack_formats` function. diff --git a/Doc/library/tarfile.rst b/Doc/library/tarfile.rst index 7f57e00bbd5fd3..15efbccdd88fc8 100644 --- a/Doc/library/tarfile.rst +++ b/Doc/library/tarfile.rst @@ -36,6 +36,13 @@ Some facts and figures: .. versionchanged:: 3.3 Added support for :mod:`lzma` compression. +.. versionchanged:: 3.12 + Archives are extracted using a :ref:`filter `, + which makes it possible to either limit surprising/dangerous features, + or to acknowledge that they are expected and the archive is fully trusted. + By default, archives are fully trusted, but this default is deprecated + and slated to change in Python 3.14. + .. function:: open(name=None, mode='r', fileobj=None, bufsize=10240, \*\*kwargs) @@ -199,6 +206,38 @@ The :mod:`tarfile` module defines the following exceptions: Is raised by :meth:`TarInfo.frombuf` if the buffer it gets is invalid. +.. exception:: FilterError + + Base class for members :ref:`refused ` by + filters. + + .. attribute:: tarinfo + + Information about the member that the filter refused to extract, + as :ref:`TarInfo `. + +.. exception:: AbsolutePathError + + Raised to refuse extracting a member with an absolute path. + +.. exception:: OutsideDestinationError + + Raised to refuse extracting a member outside the destination directory. + +.. exception:: SpecialFileError + + Raised to refuse extracting a special file (e.g. a device or pipe). + +.. exception:: AbsoluteLinkError + + Raised to refuse extracting a symbolic link with an absolute path. + +.. exception:: LinkOutsideDestinationError + + Raised to refuse extracting a symbolic link pointing outside the destination + directory. + + The following constants are available at the module level: .. data:: ENCODING @@ -304,11 +343,8 @@ be finalized; only the internally used file object will be closed. See the *debug* can be set from ``0`` (no debug messages) up to ``3`` (all debug messages). The messages are written to ``sys.stderr``. - If *errorlevel* is ``0``, all errors are ignored when using :meth:`TarFile.extract`. - Nevertheless, they appear as error messages in the debug output, when debugging - is enabled. If ``1``, all *fatal* errors are raised as :exc:`OSError` - exceptions. If ``2``, all *non-fatal* errors are raised as :exc:`TarError` - exceptions as well. + *errorlevel* controls how extraction errors are handled, + see :attr:`the corresponding attribute <~TarFile.errorlevel>`. The *encoding* and *errors* arguments define the character encoding to be used for reading or writing the archive and how conversion errors are going @@ -375,7 +411,7 @@ be finalized; only the internally used file object will be closed. See the available. -.. method:: TarFile.extractall(path=".", members=None, *, numeric_owner=False) +.. method:: TarFile.extractall(path=".", members=None, *, numeric_owner=False, filter=None) Extract all members from the archive to the current working directory or directory *path*. If optional *members* is given, it must be a subset of the @@ -389,6 +425,12 @@ be finalized; only the internally used file object will be closed. See the are used to set the owner/group for the extracted files. Otherwise, the named values from the tarfile are used. + The *filter* argument specifies how ``members`` are modified or rejected + before extraction. + See :ref:`tarfile-extraction-filter` for details. + It is recommended to set this explicitly depending on which *tar* features + you need to support. + .. warning:: Never extract archives from untrusted sources without prior inspection. @@ -396,14 +438,20 @@ be finalized; only the internally used file object will be closed. See the that have absolute filenames starting with ``"/"`` or filenames with two dots ``".."``. + Set ``filter='data'`` to prevent the most dangerous security issues, + and read the :ref:`tarfile-extraction-filter` section for details. + .. versionchanged:: 3.5 Added the *numeric_owner* parameter. .. versionchanged:: 3.6 The *path* parameter accepts a :term:`path-like object`. + .. versionchanged:: 3.12 + Added the *filter* parameter. + -.. method:: TarFile.extract(member, path="", set_attrs=True, *, numeric_owner=False) +.. method:: TarFile.extract(member, path="", set_attrs=True, *, numeric_owner=False, filter=None) Extract a member from the archive to the current working directory, using its full name. Its file information is extracted as accurately as possible. *member* @@ -411,9 +459,8 @@ be finalized; only the internally used file object will be closed. See the directory using *path*. *path* may be a :term:`path-like object`. File attributes (owner, mtime, mode) are set unless *set_attrs* is false. - If *numeric_owner* is :const:`True`, the uid and gid numbers from the tarfile - are used to set the owner/group for the extracted files. Otherwise, the named - values from the tarfile are used. + The *numeric_owner* and *filter* arguments are the same as + for :meth:`extractall`. .. note:: @@ -424,6 +471,9 @@ be finalized; only the internally used file object will be closed. See the See the warning for :meth:`extractall`. + Set ``filter='data'`` to prevent the most dangerous security issues, + and read the :ref:`tarfile-extraction-filter` section for details. + .. versionchanged:: 3.2 Added the *set_attrs* parameter. @@ -433,6 +483,9 @@ be finalized; only the internally used file object will be closed. See the .. versionchanged:: 3.6 The *path* parameter accepts a :term:`path-like object`. + .. versionchanged:: 3.12 + Added the *filter* parameter. + .. method:: TarFile.extractfile(member) @@ -444,6 +497,55 @@ be finalized; only the internally used file object will be closed. See the .. versionchanged:: 3.3 Return an :class:`io.BufferedReader` object. +.. attribute:: TarFile.errorlevel + :type: int + + If *errorlevel* is ``0``, errors are ignored when using :meth:`TarFile.extract` + and :meth:`TarFile.extractall`. + Nevertheless, they appear as error messages in the debug output when + *debug* is greater than 0. + If ``1`` (the default), all *fatal* errors are raised as :exc:`OSError` or + :exc:`FilterError` exceptions. If ``2``, all *non-fatal* errors are raised + as :exc:`TarError` exceptions as well. + + Some exceptions, e.g. ones caused by wrong argument types or data + corruption, are always raised. + + Custom :ref:`extraction filters ` + should raise :exc:`FilterError` for *fatal* errors + and :exc:`ExtractError` for *non-fatal* ones. + + Note that when an exception is raised, the archive may be partially + extracted. It is the user’s responsibility to clean up. + +.. attribute:: TarFile.extraction_filter + + .. versionadded:: 3.12 + + The :ref:`extraction filter ` used + as a default for the *filter* argument of :meth:`~TarFile.extract` + and :meth:`~TarFile.extractall`. + + The attribute may be ``None`` or a callable. + String names are not allowed for this attribute, unlike the *filter* + argument to :meth:`~TarFile.extract`. + + If ``extraction_filter`` is ``None`` (the default), + calling an extraction method without a *filter* argument will raise a + ``DeprecationWarning``, + and fall back to the :func:`fully_trusted ` filter, + whose dangerous behavior matches previous versions of Python. + + In Python 3.14+, leaving ``extraction_filter=None`` will cause + extraction methods to use the :func:`data ` filter by default. + + The attribute may be set on instances or overridden in subclasses. + It also is possible to set it on the ``TarFile`` class itself to set a + global default, although, since it affects all uses of *tarfile*, + it is best practice to only do so in top-level applications or + :mod:`site configuration `. + To set a global default this way, a filter function needs to be wrapped in + :func:`staticmethod()` to prevent injection of a ``self`` argument. .. method:: TarFile.add(name, arcname=None, recursive=True, *, filter=None) @@ -519,8 +621,23 @@ permissions, owner etc.), it provides some useful methods to determine its type. It does *not* contain the file's data itself. :class:`TarInfo` objects are returned by :class:`TarFile`'s methods -:meth:`getmember`, :meth:`getmembers` and :meth:`gettarinfo`. +:meth:`~TarFile.getmember`, :meth:`~TarFile.getmembers` and +:meth:`~TarFile.gettarinfo`. + +Modifying the objects returned by :meth:`~!TarFile.getmember` or +:meth:`~!TarFile.getmembers` will affect all subsequent +operations on the archive. +For cases where this is unwanted, you can use :mod:`copy.copy() ` or +call the :meth:`~TarInfo.replace` method to create a modified copy in one step. +Several attributes can be set to ``None`` to indicate that a piece of metadata +is unused or unknown. +Different :class:`TarInfo` methods handle ``None`` differently: + +- The :meth:`~TarFile.extract` or :meth:`~TarFile.extractall` methods will + ignore the corresponding metadata, leaving it set to a default. +- :meth:`~TarFile.addfile` will fail. +- :meth:`~TarFile.list` will print a placeholder string. .. class:: TarInfo(name="") @@ -553,24 +670,39 @@ A ``TarInfo`` object has the following public data attributes: .. attribute:: TarInfo.name + :type: str Name of the archive member. .. attribute:: TarInfo.size + :type: int Size in bytes. .. attribute:: TarInfo.mtime + :type: int | float + + Time of last modification in seconds since the :ref:`epoch `, + as in :attr:`os.stat_result.st_mtime`. - Time of last modification. + .. versionchanged:: 3.12 + Can be set to ``None`` for :meth:`~TarFile.extract` and + :meth:`~TarFile.extractall`, causing extraction to skip applying this + attribute. .. attribute:: TarInfo.mode + :type: int - Permission bits. + Permission bits, as for :func:`os.chmod`. + .. versionchanged:: 3.12 + + Can be set to ``None`` for :meth:`~TarFile.extract` and + :meth:`~TarFile.extractall`, causing extraction to skip applying this + attribute. .. attribute:: TarInfo.type @@ -582,35 +714,76 @@ A ``TarInfo`` object has the following public data attributes: .. attribute:: TarInfo.linkname + :type: str Name of the target file name, which is only present in :class:`TarInfo` objects of type :const:`LNKTYPE` and :const:`SYMTYPE`. .. attribute:: TarInfo.uid + :type: int User ID of the user who originally stored this member. + .. versionchanged:: 3.12 + + Can be set to ``None`` for :meth:`~TarFile.extract` and + :meth:`~TarFile.extractall`, causing extraction to skip applying this + attribute. .. attribute:: TarInfo.gid + :type: int Group ID of the user who originally stored this member. + .. versionchanged:: 3.12 + + Can be set to ``None`` for :meth:`~TarFile.extract` and + :meth:`~TarFile.extractall`, causing extraction to skip applying this + attribute. .. attribute:: TarInfo.uname + :type: str User name. + .. versionchanged:: 3.12 + + Can be set to ``None`` for :meth:`~TarFile.extract` and + :meth:`~TarFile.extractall`, causing extraction to skip applying this + attribute. .. attribute:: TarInfo.gname + :type: str Group name. + .. versionchanged:: 3.12 + + Can be set to ``None`` for :meth:`~TarFile.extract` and + :meth:`~TarFile.extractall`, causing extraction to skip applying this + attribute. .. attribute:: TarInfo.pax_headers + :type: dict A dictionary containing key-value pairs of an associated pax extended header. +.. method:: TarInfo.replace(name=..., mtime=..., mode=..., linkname=..., + uid=..., gid=..., uname=..., gname=..., + deep=True) + + .. versionadded:: 3.12 + + Return a *new* copy of the :class:`!TarInfo` object with the given attributes + changed. For example, to return a ``TarInfo`` with the group name set to + ``'staff'``, use:: + + new_tarinfo = old_tarinfo.replace(gname='staff') + + By default, a deep copy is made. + If *deep* is false, the copy is shallow, i.e. ``pax_headers`` + and any custom attributes are shared with the original ``TarInfo`` object. A :class:`TarInfo` object also provides some convenient query methods: @@ -660,9 +833,258 @@ A :class:`TarInfo` object also provides some convenient query methods: Return :const:`True` if it is one of character device, block device or FIFO. +.. _tarfile-extraction-filter: + +Extraction filters +------------------ + +.. versionadded:: 3.12 + +The *tar* format is designed to capture all details of a UNIX-like filesystem, +which makes it very powerful. +Unfortunately, the features make it easy to create tar files that have +unintended -- and possibly malicious -- effects when extracted. +For example, extracting a tar file can overwrite arbitrary files in various +ways (e.g. by using absolute paths, ``..`` path components, or symlinks that +affect later members). + +In most cases, the full functionality is not needed. +Therefore, *tarfile* supports extraction filters: a mechanism to limit +functionality, and thus mitigate some of the security issues. + +.. seealso:: + + :pep:`706` + Contains further motivation and rationale behind the design. + +The *filter* argument to :meth:`TarFile.extract` or :meth:`~TarFile.extractall` +can be: + +* the string ``'fully_trusted'``: Honor all metadata as specified in the + archive. + Should be used if the user trusts the archive completely, or implements + their own complex verification. + +* the string ``'tar'``: Honor most *tar*-specific features (i.e. features of + UNIX-like filesystems), but block features that are very likely to be + surprising or malicious. See :func:`tar_filter` for details. + +* the string ``'data'``: Ignore or block most features specific to UNIX-like + filesystems. Intended for extracting cross-platform data archives. + See :func:`data_filter` for details. + +* ``None`` (default): Use :attr:`TarFile.extraction_filter`. + + If that is also ``None`` (the default), raise a ``DeprecationWarning``, + and fall back to the ``'fully_trusted'`` filter, whose dangerous behavior + matches previous versions of Python. + + In Python 3.14, the ``'data'`` filter will become the default instead. + It's possible to switch earlier; see :attr:`TarFile.extraction_filter`. + +* A callable which will be called for each extracted member with a + :ref:`TarInfo ` describing the member and the destination + path to where the archive is extracted (i.e. the same path is used for all + members):: + + filter(/, member: TarInfo, path: str) -> TarInfo | None + + The callable is called just before each member is extracted, so it can + take the current state of the disk into account. + It can: + + - return a :class:`TarInfo` object which will be used instead of the metadata + in the archive, or + - return ``None``, in which case the member will be skipped, or + - raise an exception to abort the operation or skip the member, + depending on :attr:`~TarFile.errorlevel`. + Note that when extraction is aborted, :meth:`~TarFile.extractall` may leave + the archive partially extracted. It does not attempt to clean up. + +Default named filters +~~~~~~~~~~~~~~~~~~~~~ + +The pre-defined, named filters are available as functions, so they can be +reused in custom filters: + +.. function:: fully_trusted_filter(/, member, path) + + Return *member* unchanged. + + This implements the ``'fully_trusted'`` filter. + +.. function:: tar_filter(/, member, path) + + Implements the ``'tar'`` filter. + + - Strip leading slashes (``/`` and :attr:`os.sep`) from filenames. + - :ref:`Refuse ` to extract files with absolute + paths (in case the name is absolute + even after stripping slashes, e.g. ``C:/foo`` on Windows). + This raises :class:`~tarfile.AbsolutePathError`. + - :ref:`Refuse ` to extract files whose absolute + path (after following symlinks) would end up outside the destination. + This raises :class:`~tarfile.OutsideDestinationError`. + - Clear high mode bits (setuid, setgid, sticky) and group/other write bits + (:attr:`~stat.S_IWGRP`|:attr:`~stat.S_IWOTH`). + + Return the modified ``TarInfo`` member. + +.. function:: data_filter(/, member, path) + + Implements the ``'data'`` filter. + In addition to what ``tar_filter`` does: + + - :ref:`Refuse ` to extract links (hard or soft) + that link to absolute paths, or ones that link outside the destination. + + This raises :class:`~tarfile.AbsoluteLinkError` or + :class:`~tarfile.LinkOutsideDestinationError`. + + Note that such files are refused even on platforms that do not support + symbolic links. + + - :ref:`Refuse ` to extract device files + (including pipes). + This raises :class:`~tarfile.SpecialFileError`. + + - For regular files, including hard links: + + - Set the owner read and write permissions + (:attr:`~stat.S_IRUSR`|:attr:`~stat.S_IWUSR`). + - Remove the group & other executable permission + (:attr:`~stat.S_IXGRP`|:attr:`~stat.S_IXOTH`) + if the owner doesn’t have it (:attr:`~stat.S_IXUSR`). + + - For other files (directories), set ``mode`` to ``None``, so + that extraction methods skip applying permission bits. + - Set user and group info (``uid``, ``gid``, ``uname``, ``gname``) + to ``None``, so that extraction methods skip setting it. + + Return the modified ``TarInfo`` member. + + +.. _tarfile-extraction-refuse: + +Filter errors +~~~~~~~~~~~~~ + +When a filter refuses to extract a file, it will raise an appropriate exception, +a subclass of :class:`~tarfile.FilterError`. +This will abort the extraction if :attr:`TarFile.errorlevel` is 1 or more. +With ``errorlevel=0`` the error will be logged and the member will be skipped, +but extraction will continue. + + +Hints for further verification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Even with ``filter='data'``, *tarfile* is not suited for extracting untrusted +files without prior inspection. +Among other issues, the pre-defined filters do not prevent denial-of-service +attacks. Users should do additional checks. + +Here is an incomplete list of things to consider: + +* Extract to a :func:`new temporary directory ` + to prevent e.g. exploiting pre-existing links, and to make it easier to + clean up after a failed extraction. +* When working with untrusted data, use external (e.g. OS-level) limits on + disk, memory and CPU usage. +* Check filenames against an allow-list of characters + (to filter out control characters, confusables, foreign path separators, + etc.). +* Check that filenames have expected extensions (discouraging files that + execute when you “click on them”, or extension-less files like Windows special device names). +* Limit the number of extracted files, total size of extracted data, + filename length (including symlink length), and size of individual files. +* Check for files that would be shadowed on case-insensitive filesystems. + +Also note that: + +* Tar files may contain multiple versions of the same file. + Later ones are expected to overwrite any earlier ones. + This feature is crucial to allow updating tape archives, but can be abused + maliciously. +* *tarfile* does not protect against issues with “live” data, + e.g. an attacker tinkering with the destination (or source) directory while + extraction (or archiving) is in progress. + + +Supporting older Python versions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Extraction filters were added to Python 3.12, but may be backported to older +versions as security updates. +To check whether the feature is available, use e.g. +``hasattr(tarfile, 'data_filter')`` rather than checking the Python version. + +The following examples show how to support Python versions with and without +the feature. +Note that setting ``extraction_filter`` will affect any subsequent operations. + +* Fully trusted archive:: + + my_tarfile.extraction_filter = (lambda member, path: member) + my_tarfile.extractall() + +* Use the ``'data'`` filter if available, but revert to Python 3.11 behavior + (``'fully_trusted'``) if this feature is not available:: + + my_tarfile.extraction_filter = getattr(tarfile, 'data_filter', + (lambda member, path: member)) + my_tarfile.extractall() + +* Use the ``'data'`` filter; *fail* if it is not available:: + + my_tarfile.extractall(filter=tarfile.data_filter) + + or:: + + my_tarfile.extraction_filter = tarfile.data_filter + my_tarfile.extractall() + +* Use the ``'data'`` filter; *warn* if it is not available:: + + if hasattr(tarfile, 'data_filter'): + my_tarfile.extractall(filter='data') + else: + # remove this when no longer needed + warn_the_user('Extracting may be unsafe; consider updating Python') + my_tarfile.extractall() + + +Stateful extraction filter example +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +While *tarfile*'s extraction methods take a simple *filter* callable, +custom filters may be more complex objects with an internal state. +It may be useful to write these as context managers, to be used like this:: + + with StatefulFilter() as filter_func: + tar.extractall(path, filter=filter_func) + +Such a filter can be written as, for example:: + + class StatefulFilter: + def __init__(self): + self.file_count = 0 + + def __enter__(self): + return self + + def __call__(self, member, path): + self.file_count += 1 + return member + + def __exit__(self, *exc_info): + print(f'{self.file_count} files extracted') + + .. _tarfile-commandline: .. program:: tarfile + Command-Line Interface ---------------------- @@ -732,6 +1154,13 @@ Command-line options Verbose output. +.. cmdoption:: --filter + + Specifies the *filter* for ``--extract``. + See :ref:`tarfile-extraction-filter` for details. + Only string names are accepted (that is, ``fully_trusted``, ``tar``, + and ``data``). + .. _tar-examples: Examples @@ -741,7 +1170,7 @@ How to extract an entire tar archive to the current working directory:: import tarfile tar = tarfile.open("sample.tar.gz") - tar.extractall() + tar.extractall(filter='data') tar.close() How to extract a subset of a tar archive with :meth:`TarFile.extractall` using diff --git a/Lib/shutil.py b/Lib/shutil.py index fc6fb4edd24c15..c27186df1bdeb9 100644 --- a/Lib/shutil.py +++ b/Lib/shutil.py @@ -924,7 +924,7 @@ def _unpack_zipfile(filename, extract_dir): finally: zip.close() -def _unpack_tarfile(filename, extract_dir): +def _unpack_tarfile(filename, extract_dir, *, filter=None): """Unpack tar/tar.gz/tar.bz2/tar.xz `filename` to `extract_dir` """ import tarfile # late import for breaking circular dependency @@ -934,7 +934,7 @@ def _unpack_tarfile(filename, extract_dir): raise ReadError( "%s is not a compressed or uncompressed tar file" % filename) try: - tarobj.extractall(extract_dir) + tarobj.extractall(extract_dir, filter=filter) finally: tarobj.close() @@ -962,7 +962,7 @@ def _find_unpack_format(filename): return name return None -def unpack_archive(filename, extract_dir=None, format=None): +def unpack_archive(filename, extract_dir=None, format=None, *, filter=None): """Unpack an archive. `filename` is the name of the archive. @@ -976,6 +976,9 @@ def unpack_archive(filename, extract_dir=None, format=None): was registered for that extension. In case none is found, a ValueError is raised. + + If `filter` is given, it is passed to the underlying + extraction function. """ if extract_dir is None: extract_dir = os.getcwd() @@ -983,6 +986,10 @@ def unpack_archive(filename, extract_dir=None, format=None): extract_dir = os.fspath(extract_dir) filename = os.fspath(filename) + if filter is None: + filter_kwargs = {} + else: + filter_kwargs = {'filter': filter} if format is not None: try: format_info = _UNPACK_FORMATS[format] @@ -990,7 +997,7 @@ def unpack_archive(filename, extract_dir=None, format=None): raise ValueError("Unknown unpack format '{0}'".format(format)) from None func = format_info[1] - func(filename, extract_dir, **dict(format_info[2])) + func(filename, extract_dir, **dict(format_info[2]), **filter_kwargs) else: # we need to look at the registered unpackers supported extensions format = _find_unpack_format(filename) @@ -998,7 +1005,7 @@ def unpack_archive(filename, extract_dir=None, format=None): raise ReadError("Unknown archive format '{0}'".format(filename)) func = _UNPACK_FORMATS[format][1] - kwargs = dict(_UNPACK_FORMATS[format][2]) + kwargs = dict(_UNPACK_FORMATS[format][2]) | filter_kwargs func(filename, extract_dir, **kwargs) diff --git a/Lib/tarfile.py b/Lib/tarfile.py index 3be5188c8b0a20..8c220bb20d780b 100755 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -46,6 +46,7 @@ import struct import copy import re +import warnings try: import pwd @@ -69,7 +70,11 @@ __all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError", "ReadError", "CompressionError", "StreamError", "ExtractError", "HeaderError", "ENCODING", "USTAR_FORMAT", "GNU_FORMAT", "PAX_FORMAT", - "DEFAULT_FORMAT", "open"] + "DEFAULT_FORMAT", "open","fully_trusted_filter", "data_filter", + "tar_filter", "FilterError", "AbsoluteLinkError", + "OutsideDestinationError", "SpecialFileError", "AbsolutePathError", + "LinkOutsideDestinationError"] + #--------------------------------------------------------- # tar constants @@ -158,6 +163,8 @@ def stn(s, length, encoding, errors): """Convert a string to a null-terminated bytes object. """ + if s is None: + raise ValueError("metadata cannot contain None") s = s.encode(encoding, errors) return s[:length] + (length - len(s)) * NUL @@ -719,9 +726,127 @@ def __init__(self, tarfile, tarinfo): super().__init__(fileobj) #class ExFileObject + +#----------------------------- +# extraction filters (PEP 706) +#----------------------------- + +class FilterError(TarError): + pass + +class AbsolutePathError(FilterError): + def __init__(self, tarinfo): + self.tarinfo = tarinfo + super().__init__(f'member {tarinfo.name!r} has an absolute path') + +class OutsideDestinationError(FilterError): + def __init__(self, tarinfo, path): + self.tarinfo = tarinfo + self._path = path + super().__init__(f'{tarinfo.name!r} would be extracted to {path!r}, ' + + 'which is outside the destination') + +class SpecialFileError(FilterError): + def __init__(self, tarinfo): + self.tarinfo = tarinfo + super().__init__(f'{tarinfo.name!r} is a special file') + +class AbsoluteLinkError(FilterError): + def __init__(self, tarinfo): + self.tarinfo = tarinfo + super().__init__(f'{tarinfo.name!r} is a symlink to an absolute path') + +class LinkOutsideDestinationError(FilterError): + def __init__(self, tarinfo, path): + self.tarinfo = tarinfo + self._path = path + super().__init__(f'{tarinfo.name!r} would link to {path!r}, ' + + 'which is outside the destination') + +def _get_filtered_attrs(member, dest_path, for_data=True): + new_attrs = {} + name = member.name + dest_path = os.path.realpath(dest_path) + # Strip leading / (tar's directory separator) from filenames. + # Include os.sep (target OS directory separator) as well. + if name.startswith(('/', os.sep)): + name = new_attrs['name'] = member.path.lstrip('/' + os.sep) + if os.path.isabs(name): + # Path is absolute even after stripping. + # For example, 'C:/foo' on Windows. + raise AbsolutePathError(member) + # Ensure we stay in the destination + target_path = os.path.realpath(os.path.join(dest_path, name)) + if os.path.commonpath([target_path, dest_path]) != dest_path: + raise OutsideDestinationError(member, target_path) + # Limit permissions (no high bits, and go-w) + mode = member.mode + if mode is not None: + # Strip high bits & group/other write bits + mode = mode & 0o755 + if for_data: + # For data, handle permissions & file types + if member.isreg() or member.islnk(): + if not mode & 0o100: + # Clear executable bits if not executable by user + mode &= ~0o111 + # Ensure owner can read & write + mode |= 0o600 + elif member.isdir() or member.issym(): + # Ignore mode for directories & symlinks + mode = None + else: + # Reject special files + raise SpecialFileError(member) + if mode != member.mode: + new_attrs['mode'] = mode + if for_data: + # Ignore ownership for 'data' + if member.uid is not None: + new_attrs['uid'] = None + if member.gid is not None: + new_attrs['gid'] = None + if member.uname is not None: + new_attrs['uname'] = None + if member.gname is not None: + new_attrs['gname'] = None + # Check link destination for 'data' + if member.islnk() or member.issym(): + if os.path.isabs(member.linkname): + raise AbsoluteLinkError(member) + target_path = os.path.realpath(os.path.join(dest_path, member.linkname)) + if os.path.commonpath([target_path, dest_path]) != dest_path: + raise LinkOutsideDestinationError(member, target_path) + return new_attrs + +def fully_trusted_filter(member, dest_path): + return member + +def tar_filter(member, dest_path): + new_attrs = _get_filtered_attrs(member, dest_path, False) + if new_attrs: + return member.replace(**new_attrs, deep=False) + return member + +def data_filter(member, dest_path): + new_attrs = _get_filtered_attrs(member, dest_path, True) + if new_attrs: + return member.replace(**new_attrs, deep=False) + return member + +_NAMED_FILTERS = { + "fully_trusted": fully_trusted_filter, + "tar": tar_filter, + "data": data_filter, +} + #------------------ # Exported Classes #------------------ + +# Sentinel for replace() defaults, meaning "don't change the attribute" +_KEEP = object() + class TarInfo(object): """Informational class which holds the details about an archive member given by a tar header block. @@ -781,12 +906,44 @@ def linkpath(self, linkname): def __repr__(self): return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self)) + def replace(self, *, + name=_KEEP, mtime=_KEEP, mode=_KEEP, linkname=_KEEP, + uid=_KEEP, gid=_KEEP, uname=_KEEP, gname=_KEEP, + deep=True, _KEEP=_KEEP): + """Return a deep copy of self with the given attributes replaced. + """ + if deep: + result = copy.deepcopy(self) + else: + result = copy.copy(self) + if name is not _KEEP: + result.name = name + if mtime is not _KEEP: + result.mtime = mtime + if mode is not _KEEP: + result.mode = mode + if linkname is not _KEEP: + result.linkname = linkname + if uid is not _KEEP: + result.uid = uid + if gid is not _KEEP: + result.gid = gid + if uname is not _KEEP: + result.uname = uname + if gname is not _KEEP: + result.gname = gname + return result + def get_info(self): """Return the TarInfo's attributes as a dictionary. """ + if self.mode is None: + mode = None + else: + mode = self.mode & 0o7777 info = { "name": self.name, - "mode": self.mode & 0o7777, + "mode": mode, "uid": self.uid, "gid": self.gid, "size": self.size, @@ -809,6 +966,9 @@ def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescap """Return a tar header as a string of 512 byte blocks. """ info = self.get_info() + for name, value in info.items(): + if value is None: + raise ValueError("%s may not be None" % name) if format == USTAR_FORMAT: return self.create_ustar_header(info, encoding, errors) @@ -922,6 +1082,20 @@ def _create_header(info, format, encoding, errors): """Return a header block. info is a dictionary with file information, format must be one of the *_FORMAT constants. """ + has_device_fields = info.get("type") in (CHRTYPE, BLKTYPE) + if has_device_fields: + devmajor = itn(info.get("devmajor", 0), 8, format) + devminor = itn(info.get("devminor", 0), 8, format) + else: + devmajor = stn("", 8, encoding, errors) + devminor = stn("", 8, encoding, errors) + + # None values in metadata should cause ValueError. + # itn()/stn() do this for all fields except type. + filetype = info.get("type", REGTYPE) + if filetype is None: + raise ValueError("TarInfo.type must not be None") + parts = [ stn(info.get("name", ""), 100, encoding, errors), itn(info.get("mode", 0) & 0o7777, 8, format), @@ -930,7 +1104,7 @@ def _create_header(info, format, encoding, errors): itn(info.get("size", 0), 12, format), itn(info.get("mtime", 0), 12, format), b" ", # checksum field - info.get("type", REGTYPE), + filetype, stn(info.get("linkname", ""), 100, encoding, errors), info.get("magic", POSIX_MAGIC), stn(info.get("uname", ""), 32, encoding, errors), @@ -1412,6 +1586,8 @@ class TarFile(object): fileobject = ExFileObject # The file-object for extractfile(). + extraction_filter = None # The default filter for extraction. + def __init__(self, name=None, mode="r", fileobj=None, format=None, tarinfo=None, dereference=None, ignore_zeros=None, encoding=None, errors="surrogateescape", pax_headers=None, debug=None, @@ -1882,7 +2058,10 @@ def list(self, verbose=True, *, members=None): members = self for tarinfo in members: if verbose: - _safe_print(stat.filemode(tarinfo.mode)) + if tarinfo.mode is None: + _safe_print("??????????") + else: + _safe_print(stat.filemode(tarinfo.mode)) _safe_print("%s/%s" % (tarinfo.uname or tarinfo.uid, tarinfo.gname or tarinfo.gid)) if tarinfo.ischr() or tarinfo.isblk(): @@ -1890,8 +2069,11 @@ def list(self, verbose=True, *, members=None): ("%d,%d" % (tarinfo.devmajor, tarinfo.devminor))) else: _safe_print("%10d" % tarinfo.size) - _safe_print("%d-%02d-%02d %02d:%02d:%02d" \ - % time.localtime(tarinfo.mtime)[:6]) + if tarinfo.mtime is None: + _safe_print("????-??-?? ??:??:??") + else: + _safe_print("%d-%02d-%02d %02d:%02d:%02d" \ + % time.localtime(tarinfo.mtime)[:6]) _safe_print(tarinfo.name + ("/" if tarinfo.isdir() else "")) @@ -1978,32 +2160,63 @@ def addfile(self, tarinfo, fileobj=None): self.members.append(tarinfo) - def extractall(self, path=".", members=None, *, numeric_owner=False): + def _get_filter_function(self, filter): + if filter is None: + filter = self.extraction_filter + if filter is None: + warnings.warn( + 'Python 3.14 will, by default, filter extracted tar ' + + 'archives and reject files or modify their metadata. ' + + 'Use the filter argument to control this behavior.', + DeprecationWarning) + return fully_trusted_filter + if isinstance(filter, str): + raise TypeError( + 'String names are not supported for ' + + 'TarFile.extraction_filter. Use a function such as ' + + 'tarfile.data_filter directly.') + return filter + if callable(filter): + return filter + try: + return _NAMED_FILTERS[filter] + except KeyError: + raise ValueError(f"filter {filter!r} not found") from None + + def extractall(self, path=".", members=None, *, numeric_owner=False, + filter=None): """Extract all members from the archive to the current working directory and set owner, modification time and permissions on directories afterwards. `path' specifies a different directory to extract to. `members' is optional and must be a subset of the list returned by getmembers(). If `numeric_owner` is True, only the numbers for user/group names are used and not the names. + + The `filter` function will be called on each member just + before extraction. + It can return a changed TarInfo or None to skip the member. + String names of common filters are accepted. """ directories = [] + filter_function = self._get_filter_function(filter) if members is None: members = self - for tarinfo in members: + for member in members: + tarinfo = self._get_extract_tarinfo(member, filter_function, path) + if tarinfo is None: + continue if tarinfo.isdir(): - # Extract directories with a safe mode. + # For directories, delay setting attributes until later, + # since permissions can interfere with extraction and + # extracting contents can reset mtime. directories.append(tarinfo) - tarinfo = copy.copy(tarinfo) - tarinfo.mode = 0o700 - # Do not set_attrs directories, as we will do that further down - self.extract(tarinfo, path, set_attrs=not tarinfo.isdir(), - numeric_owner=numeric_owner) + self._extract_one(tarinfo, path, set_attrs=not tarinfo.isdir(), + numeric_owner=numeric_owner) # Reverse sort directories. - directories.sort(key=lambda a: a.name) - directories.reverse() + directories.sort(key=lambda a: a.name, reverse=True) # Set correct owner, mtime and filemode on directories. for tarinfo in directories: @@ -2013,12 +2226,10 @@ def extractall(self, path=".", members=None, *, numeric_owner=False): self.utime(tarinfo, dirpath) self.chmod(tarinfo, dirpath) except ExtractError as e: - if self.errorlevel > 1: - raise - else: - self._dbg(1, "tarfile: %s" % e) + self._handle_nonfatal_error(e) - def extract(self, member, path="", set_attrs=True, *, numeric_owner=False): + def extract(self, member, path="", set_attrs=True, *, numeric_owner=False, + filter=None): """Extract a member from the archive to the current working directory, using its full name. Its file information is extracted as accurately as possible. `member' may be a filename or a TarInfo object. You can @@ -2026,35 +2237,70 @@ def extract(self, member, path="", set_attrs=True, *, numeric_owner=False): mtime, mode) are set unless `set_attrs' is False. If `numeric_owner` is True, only the numbers for user/group names are used and not the names. + + The `filter` function will be called before extraction. + It can return a changed TarInfo or None to skip the member. + String names of common filters are accepted. """ - self._check("r") + filter_function = self._get_filter_function(filter) + tarinfo = self._get_extract_tarinfo(member, filter_function, path) + if tarinfo is not None: + self._extract_one(tarinfo, path, set_attrs, numeric_owner) + def _get_extract_tarinfo(self, member, filter_function, path): + """Get filtered TarInfo (or None) from member, which might be a str""" if isinstance(member, str): tarinfo = self.getmember(member) else: tarinfo = member + unfiltered = tarinfo + try: + tarinfo = filter_function(tarinfo, path) + except (OSError, FilterError) as e: + self._handle_fatal_error(e) + except ExtractError as e: + self._handle_nonfatal_error(e) + if tarinfo is None: + self._dbg(2, "tarfile: Excluded %r" % unfiltered.name) + return None # Prepare the link target for makelink(). if tarinfo.islnk(): + tarinfo = copy.copy(tarinfo) tarinfo._link_target = os.path.join(path, tarinfo.linkname) + return tarinfo + + def _extract_one(self, tarinfo, path, set_attrs, numeric_owner): + """Extract from filtered tarinfo to disk""" + self._check("r") try: self._extract_member(tarinfo, os.path.join(path, tarinfo.name), set_attrs=set_attrs, numeric_owner=numeric_owner) except OSError as e: - if self.errorlevel > 0: - raise - else: - if e.filename is None: - self._dbg(1, "tarfile: %s" % e.strerror) - else: - self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename)) + self._handle_fatal_error(e) except ExtractError as e: - if self.errorlevel > 1: - raise + self._handle_nonfatal_error(e) + + def _handle_nonfatal_error(self, e): + """Handle non-fatal error (ExtractError) according to errorlevel""" + if self.errorlevel > 1: + raise + else: + self._dbg(1, "tarfile: %s" % e) + + def _handle_fatal_error(self, e): + """Handle "fatal" error according to self.errorlevel""" + if self.errorlevel > 0: + raise + elif isinstance(e, OSError): + if e.filename is None: + self._dbg(1, "tarfile: %s" % e.strerror) else: - self._dbg(1, "tarfile: %s" % e) + self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename)) + else: + self._dbg(1, "tarfile: %s %s" % (type(e).__name__, e)) def extractfile(self, member): """Extract a member from the archive as a file object. `member' may be @@ -2140,9 +2386,13 @@ def makedir(self, tarinfo, targetpath): """Make a directory called targetpath. """ try: - # Use a safe mode for the directory, the real mode is set - # later in _extract_member(). - os.mkdir(targetpath, 0o700) + if tarinfo.mode is None: + # Use the system's default mode + os.mkdir(targetpath) + else: + # Use a safe mode for the directory, the real mode is set + # later in _extract_member(). + os.mkdir(targetpath, 0o700) except FileExistsError: pass @@ -2185,6 +2435,9 @@ def makedev(self, tarinfo, targetpath): raise ExtractError("special devices not supported by system") mode = tarinfo.mode + if mode is None: + # Use mknod's default + mode = 0o600 if tarinfo.isblk(): mode |= stat.S_IFBLK else: @@ -2203,7 +2456,6 @@ def makelink(self, tarinfo, targetpath): if tarinfo.issym(): os.symlink(tarinfo.linkname, targetpath) else: - # See extract(). if os.path.exists(tarinfo._link_target): os.link(tarinfo._link_target, targetpath) else: @@ -2228,15 +2480,19 @@ def chown(self, tarinfo, targetpath, numeric_owner): u = tarinfo.uid if not numeric_owner: try: - if grp: + if grp and tarinfo.gname: g = grp.getgrnam(tarinfo.gname)[2] except KeyError: pass try: - if pwd: + if pwd and tarinfo.uname: u = pwd.getpwnam(tarinfo.uname)[2] except KeyError: pass + if g is None: + g = -1 + if u is None: + u = -1 try: if tarinfo.issym() and hasattr(os, "lchown"): os.lchown(targetpath, u, g) @@ -2248,6 +2504,8 @@ def chown(self, tarinfo, targetpath, numeric_owner): def chmod(self, tarinfo, targetpath): """Set file permissions of targetpath according to tarinfo. """ + if tarinfo.mode is None: + return if hasattr(os, 'chmod'): try: os.chmod(targetpath, tarinfo.mode) @@ -2257,10 +2515,13 @@ def chmod(self, tarinfo, targetpath): def utime(self, tarinfo, targetpath): """Set modification time of targetpath according to tarinfo. """ + mtime = tarinfo.mtime + if mtime is None: + return if not hasattr(os, 'utime'): return try: - os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime)) + os.utime(targetpath, (mtime, mtime)) except OSError: raise ExtractError("could not change modification time") @@ -2327,13 +2588,26 @@ def _getmember(self, name, tarinfo=None, normalize=False): members = self.getmembers() # Limit the member search list up to tarinfo. + skipping = False if tarinfo is not None: - members = members[:members.index(tarinfo)] + try: + index = members.index(tarinfo) + except ValueError: + # The given starting point might be a (modified) copy. + # We'll later skip members until we find an equivalent. + skipping = True + else: + # Happy fast path + members = members[:index] if normalize: name = os.path.normpath(name) for member in reversed(members): + if skipping: + if tarinfo.offset == member.offset: + skipping = False + continue if normalize: member_name = os.path.normpath(member.name) else: @@ -2342,6 +2616,10 @@ def _getmember(self, name, tarinfo=None, normalize=False): if name == member_name: return member + if skipping: + # Starting point was not found + raise ValueError(tarinfo) + def _load(self): """Read through the entire archive file and look for readable members. @@ -2434,6 +2712,7 @@ def __exit__(self, type, value, traceback): #-------------------- # exported functions #-------------------- + def is_tarfile(name): """Return True if name points to a tar archive that we are able to handle, else return False. @@ -2455,6 +2734,10 @@ def main(): parser = argparse.ArgumentParser(description=description) parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Verbose output') + parser.add_argument('--filter', metavar='', + choices=_NAMED_FILTERS, + help='Filter for extraction') + group = parser.add_mutually_exclusive_group(required=True) group.add_argument('-l', '--list', metavar='', help='Show listing of a tarfile') @@ -2466,8 +2749,12 @@ def main(): help='Create tarfile from sources') group.add_argument('-t', '--test', metavar='', help='Test if a tarfile is valid') + args = parser.parse_args() + if args.filter and args.extract is None: + parser.exit(1, '--filter is only valid for extraction\n') + if args.test is not None: src = args.test if is_tarfile(src): @@ -2498,7 +2785,7 @@ def main(): if is_tarfile(src): with TarFile.open(src, 'r:*') as tf: - tf.extractall(path=curdir) + tf.extractall(path=curdir, filter=args.filter) if args.verbose: if curdir == '.': msg = '{!r} file is extracted.'.format(src) diff --git a/Lib/test/test_shutil.py b/Lib/test/test_shutil.py index ef9323962cae59..2fcaa6ac4a0a7a 100644 --- a/Lib/test/test_shutil.py +++ b/Lib/test/test_shutil.py @@ -23,6 +23,7 @@ from test import support from test.support import TESTFN, FakePath +from test.support import warnings_helper TESTFN2 = TESTFN + "2" @@ -1239,12 +1240,16 @@ def test_register_archive_format(self): formats = [name for name, params in get_archive_formats()] self.assertNotIn('xxx', formats) - def check_unpack_archive(self, format): - self.check_unpack_archive_with_converter(format, lambda path: path) - self.check_unpack_archive_with_converter(format, pathlib.Path) - self.check_unpack_archive_with_converter(format, FakePath) + ### shutil.unpack_archive - def check_unpack_archive_with_converter(self, format, converter): + def check_unpack_archive(self, format, **kwargs): + self.check_unpack_archive_with_converter( + format, lambda path: path, **kwargs) + self.check_unpack_archive_with_converter( + format, pathlib.Path, **kwargs) + self.check_unpack_archive_with_converter(format, FakePath, **kwargs) + + def check_unpack_archive_with_converter(self, format, converter, **kwargs): root_dir, base_dir = self._create_files() expected = rlistdir(root_dir) expected.remove('outer') @@ -1254,35 +1259,47 @@ def check_unpack_archive_with_converter(self, format, converter): # let's try to unpack it now tmpdir2 = self.mkdtemp() - unpack_archive(converter(filename), converter(tmpdir2)) + unpack_archive(converter(filename), converter(tmpdir2), **kwargs) self.assertEqual(rlistdir(tmpdir2), expected) # and again, this time with the format specified tmpdir3 = self.mkdtemp() - unpack_archive(converter(filename), converter(tmpdir3), format=format) + unpack_archive(converter(filename), converter(tmpdir3), format=format, + **kwargs) self.assertEqual(rlistdir(tmpdir3), expected) - self.assertRaises(shutil.ReadError, unpack_archive, converter(TESTFN)) - self.assertRaises(ValueError, unpack_archive, converter(TESTFN), format='xxx') + with self.assertRaises(shutil.ReadError): + unpack_archive(converter(TESTFN), **kwargs) + with self.assertRaises(ValueError): + unpack_archive(converter(TESTFN), format='xxx', **kwargs) + + def check_unpack_tarball(self, format): + self.check_unpack_archive(format, filter='fully_trusted') + self.check_unpack_archive(format, filter='data') + with warnings_helper.check_warnings( + ('The default', RuntimeWarning)): + self.check_unpack_archive(format) def test_unpack_archive_tar(self): - self.check_unpack_archive('tar') + self.check_unpack_tarball('tar') @support.requires_zlib def test_unpack_archive_gztar(self): - self.check_unpack_archive('gztar') + self.check_unpack_tarball('gztar') @support.requires_bz2 def test_unpack_archive_bztar(self): - self.check_unpack_archive('bztar') + self.check_unpack_tarball('bztar') @support.requires_lzma def test_unpack_archive_xztar(self): - self.check_unpack_archive('xztar') + self.check_unpack_tarball('xztar') @support.requires_zlib def test_unpack_archive_zip(self): self.check_unpack_archive('zip') + with self.assertRaises(TypeError): + self.check_unpack_archive('zip', filter='data') def test_unpack_registry(self): diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index 9133d60e49be14..f4b3a4d9c70b56 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -2,9 +2,13 @@ import os import io from hashlib import md5 -from contextlib import contextmanager +from contextlib import contextmanager, ExitStack from random import Random import pathlib +import shutil +import re +import warnings +import stat import unittest import unittest.mock @@ -12,6 +16,7 @@ from test import support from test.support import script_helper +from test.support import warnings_helper # Check for our compression modules. try: @@ -2180,21 +2185,18 @@ def test_number_field_limits(self): tarfile.itn(0x10000000000, 6, tarfile.GNU_FORMAT) def test__all__(self): - blacklist = {'version', 'grp', 'pwd', 'symlink_exception', - 'NUL', 'BLOCKSIZE', 'RECORDSIZE', 'GNU_MAGIC', - 'POSIX_MAGIC', 'LENGTH_NAME', 'LENGTH_LINK', - 'LENGTH_PREFIX', 'REGTYPE', 'AREGTYPE', 'LNKTYPE', - 'SYMTYPE', 'CHRTYPE', 'BLKTYPE', 'DIRTYPE', 'FIFOTYPE', - 'CONTTYPE', 'GNUTYPE_LONGNAME', 'GNUTYPE_LONGLINK', - 'GNUTYPE_SPARSE', 'XHDTYPE', 'XGLTYPE', 'SOLARIS_XHDTYPE', - 'SUPPORTED_TYPES', 'REGULAR_TYPES', 'GNU_TYPES', - 'PAX_FIELDS', 'PAX_NAME_FIELDS', 'PAX_NUMBER_FIELDS', - 'stn', 'nts', 'nti', 'itn', 'calc_chksums', 'copyfileobj', - 'filemode', - 'EmptyHeaderError', 'TruncatedHeaderError', - 'EOFHeaderError', 'InvalidHeaderError', - 'SubsequentHeaderError', 'ExFileObject', - 'main'} + blacklist = { + 'version', 'grp', 'pwd', 'symlink_exception', 'NUL', 'BLOCKSIZE', + 'RECORDSIZE', 'GNU_MAGIC', 'POSIX_MAGIC', 'LENGTH_NAME', + 'LENGTH_LINK', 'LENGTH_PREFIX', 'REGTYPE', 'AREGTYPE', 'LNKTYPE', + 'SYMTYPE', 'CHRTYPE', 'BLKTYPE', 'DIRTYPE', 'FIFOTYPE', 'CONTTYPE', + 'GNUTYPE_LONGNAME', 'GNUTYPE_LONGLINK', 'GNUTYPE_SPARSE', + 'XHDTYPE', 'XGLTYPE', 'SOLARIS_XHDTYPE', 'SUPPORTED_TYPES', + 'REGULAR_TYPES', 'GNU_TYPES', 'PAX_FIELDS', 'PAX_NAME_FIELDS', + 'PAX_NUMBER_FIELDS', 'stn', 'nts', 'nti', 'itn', 'calc_chksums', + 'copyfileobj', 'filemode', 'EmptyHeaderError', + 'TruncatedHeaderError', 'EOFHeaderError', 'InvalidHeaderError', + 'SubsequentHeaderError', 'ExFileObject', 'main'} support.check__all__(self, tarfile, blacklist=blacklist) @@ -2217,6 +2219,15 @@ def make_simple_tarfile(self, tar_name): for tardata in files: tf.add(tardata, arcname=os.path.basename(tardata)) + def make_evil_tarfile(self, tar_name): + files = [support.findfile('tokenize_tests.txt')] + self.addCleanup(support.unlink, tar_name) + with tarfile.open(tar_name, 'w') as tf: + benign = tarfile.TarInfo('benign') + tf.addfile(benign, fileobj=io.BytesIO(b'')) + evil = tarfile.TarInfo('../evil') + tf.addfile(evil, fileobj=io.BytesIO(b'')) + def test_bad_use(self): rc, out, err = self.tarfilecmd_failure() self.assertEqual(out, b'') @@ -2370,6 +2381,25 @@ def test_extract_command_verbose(self): finally: support.rmtree(tarextdir) + def test_extract_command_filter(self): + self.make_evil_tarfile(tmpname) + # Make an inner directory, so the member named '../evil' + # is still extracted into `tarextdir` + destdir = os.path.join(tarextdir, 'dest') + os.mkdir(tarextdir) + try: + with support.temp_cwd(destdir): + self.tarfilecmd_failure('-e', tmpname, + '-v', + '--filter', 'data') + out = self.tarfilecmd('-e', tmpname, + '-v', + '--filter', 'fully_trusted', + PYTHONIOENCODING='utf-8') + self.assertIn(b' file is extracted.', out) + finally: + support.rmtree(tarextdir) + def test_extract_command_different_directory(self): self.make_simple_tarfile(tmpname) try: @@ -2643,6 +2673,873 @@ def test_keyword_only(self, mock_geteuid): tarfl.extract, filename_1, TEMPDIR, False, True) +class ReplaceTests(ReadTest, unittest.TestCase): + def test_replace_name(self): + member = self.tar.getmember('ustar/regtype') + replaced = member.replace(name='misc/other') + self.assertEqual(replaced.name, 'misc/other') + self.assertEqual(member.name, 'ustar/regtype') + self.assertEqual(self.tar.getmember('ustar/regtype').name, + 'ustar/regtype') + + def test_replace_deep(self): + member = self.tar.getmember('pax/regtype1') + replaced = member.replace() + replaced.pax_headers['gname'] = 'not-bar' + self.assertEqual(member.pax_headers['gname'], 'bar') + self.assertEqual( + self.tar.getmember('pax/regtype1').pax_headers['gname'], 'bar') + + def test_replace_shallow(self): + member = self.tar.getmember('pax/regtype1') + replaced = member.replace(deep=False) + replaced.pax_headers['gname'] = 'not-bar' + self.assertEqual(member.pax_headers['gname'], 'not-bar') + self.assertEqual( + self.tar.getmember('pax/regtype1').pax_headers['gname'], 'not-bar') + + def test_replace_all(self): + member = self.tar.getmember('ustar/regtype') + for attr_name in ('name', 'mtime', 'mode', 'linkname', + 'uid', 'gid', 'uname', 'gname'): + with self.subTest(attr_name=attr_name): + replaced = member.replace(**{attr_name: None}) + self.assertEqual(getattr(replaced, attr_name), None) + self.assertNotEqual(getattr(member, attr_name), None) + + def test_replace_internal(self): + member = self.tar.getmember('ustar/regtype') + with self.assertRaises(TypeError): + member.replace(offset=123456789) + + +class NoneInfoExtractTests(ReadTest): + # These mainly check that all kinds of members are extracted successfully + # if some metadata is None. + # Some of the methods do additional spot checks. + + # We also test that the default filters can deal with None. + + extraction_filter = None + + @classmethod + def setUpClass(cls): + tar = tarfile.open(tarname, mode='r', encoding="iso8859-1") + cls.control_dir = pathlib.Path(TEMPDIR) / "extractall_ctrl" + tar.errorlevel = 0 + with ExitStack() as cm: + if cls.extraction_filter is None: + cm.enter_context(warnings.catch_warnings( + action="ignore", category=DeprecationWarning)) + tar.extractall(cls.control_dir, filter=cls.extraction_filter) + tar.close() + cls.control_paths = set( + p.relative_to(cls.control_dir) + for p in pathlib.Path(cls.control_dir).glob('**/*')) + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.control_dir) + + def check_files_present(self, directory): + got_paths = set( + p.relative_to(directory) + for p in pathlib.Path(directory).glob('**/*')) + self.assertEqual(self.control_paths, got_paths) + + @contextmanager + def extract_with_none(self, *attr_names): + DIR = pathlib.Path(TEMPDIR) / "extractall_none" + self.tar.errorlevel = 0 + for member in self.tar.getmembers(): + for attr_name in attr_names: + setattr(member, attr_name, None) + with support.temp_dir(DIR): + self.tar.extractall(DIR, filter='fully_trusted') + self.check_files_present(DIR) + yield DIR + + def test_extractall_none_mtime(self): + # mtimes of extracted files should be later than 'now' -- the mtime + # of a previously created directory. + now = pathlib.Path(TEMPDIR).stat().st_mtime + with self.extract_with_none('mtime') as DIR: + for path in pathlib.Path(DIR).glob('**/*'): + with self.subTest(path=path): + try: + mtime = path.stat().st_mtime + except OSError: + # Some systems can't stat symlinks, ignore those + if not path.is_symlink(): + raise + else: + self.assertGreaterEqual(path.stat().st_mtime, now) + + def test_extractall_none_mode(self): + # modes of directories and regular files should match the mode + # of a "normally" created directory or regular file + dir_mode = pathlib.Path(TEMPDIR).stat().st_mode + regular_file = pathlib.Path(TEMPDIR) / 'regular_file' + regular_file.write_text('') + regular_file_mode = regular_file.stat().st_mode + with self.extract_with_none('mode') as DIR: + for path in pathlib.Path(DIR).glob('**/*'): + with self.subTest(path=path): + if path.is_dir(): + self.assertEqual(path.stat().st_mode, dir_mode) + elif path.is_file(): + self.assertEqual(path.stat().st_mode, + regular_file_mode) + + def test_extractall_none_uid(self): + with self.extract_with_none('uid'): + pass + + def test_extractall_none_gid(self): + with self.extract_with_none('gid'): + pass + + def test_extractall_none_uname(self): + with self.extract_with_none('uname'): + pass + + def test_extractall_none_gname(self): + with self.extract_with_none('gname'): + pass + + def test_extractall_none_ownership(self): + with self.extract_with_none('uid', 'gid', 'uname', 'gname'): + pass + +class NoneInfoExtractTests_Data(NoneInfoExtractTests, unittest.TestCase): + extraction_filter = 'data' + +class NoneInfoExtractTests_FullyTrusted(NoneInfoExtractTests, + unittest.TestCase): + extraction_filter = 'fully_trusted' + +class NoneInfoExtractTests_Tar(NoneInfoExtractTests, unittest.TestCase): + extraction_filter = 'tar' + +class NoneInfoExtractTests_Default(NoneInfoExtractTests, + unittest.TestCase): + extraction_filter = None + +class NoneInfoTests_Misc(unittest.TestCase): + def test_add(self): + # When addfile() encounters None metadata, it raises a ValueError + bio = io.BytesIO() + for tarformat in (tarfile.USTAR_FORMAT, tarfile.GNU_FORMAT, + tarfile.PAX_FORMAT): + with self.subTest(tarformat=tarformat): + tar = tarfile.open(fileobj=bio, mode='w', format=tarformat) + tarinfo = tar.gettarinfo(tarname) + try: + tar.addfile(tarinfo) + except Exception: + if tarformat == tarfile.USTAR_FORMAT: + # In the old, limited format, adding might fail for + # reasons like the UID being too large + pass + else: + raise + else: + for attr_name in ('mtime', 'mode', 'uid', 'gid', + 'uname', 'gname'): + with self.subTest(attr_name=attr_name): + replaced = tarinfo.replace(**{attr_name: None}) + with self.assertRaisesRegex(ValueError, + f"{attr_name}"): + tar.addfile(replaced) + + def test_list(self): + # Change some metadata to None, then compare list() output + # word-for-word. We want list() to not raise, and to only change + # printout for the affected piece of metadata. + # (n.b.: some contents of the test archive are hardcoded.) + for attr_names in ({'mtime'}, {'mode'}, {'uid'}, {'gid'}, + {'uname'}, {'gname'}, + {'uid', 'uname'}, {'gid', 'gname'}): + with (self.subTest(attr_names=attr_names), + tarfile.open(tarname, encoding="iso8859-1") as tar): + tio_prev = io.TextIOWrapper(io.BytesIO(), 'ascii', newline='\n') + with support.swap_attr(sys, 'stdout', tio_prev): + tar.list() + for member in tar.getmembers(): + for attr_name in attr_names: + setattr(member, attr_name, None) + tio_new = io.TextIOWrapper(io.BytesIO(), 'ascii', newline='\n') + with support.swap_attr(sys, 'stdout', tio_new): + tar.list() + for expected, got in zip(tio_prev.detach().getvalue().split(), + tio_new.detach().getvalue().split()): + if attr_names == {'mtime'} and re.match(rb'2003-01-\d\d', expected): + self.assertEqual(got, b'????-??-??') + elif attr_names == {'mtime'} and re.match(rb'\d\d:\d\d:\d\d', expected): + self.assertEqual(got, b'??:??:??') + elif attr_names == {'mode'} and re.match( + rb'.([r-][w-][x-]){3}', expected): + self.assertEqual(got, b'??????????') + elif attr_names == {'uname'} and expected.startswith( + (b'tarfile/', b'lars/', b'foo/')): + exp_user, exp_group = expected.split(b'/') + got_user, got_group = got.split(b'/') + self.assertEqual(got_group, exp_group) + self.assertRegex(got_user, b'[0-9]+') + elif attr_names == {'gname'} and expected.endswith( + (b'/tarfile', b'/users', b'/bar')): + exp_user, exp_group = expected.split(b'/') + got_user, got_group = got.split(b'/') + self.assertEqual(got_user, exp_user) + self.assertRegex(got_group, b'[0-9]+') + elif attr_names == {'uid'} and expected.startswith( + (b'1000/')): + exp_user, exp_group = expected.split(b'/') + got_user, got_group = got.split(b'/') + self.assertEqual(got_group, exp_group) + self.assertEqual(got_user, b'None') + elif attr_names == {'gid'} and expected.endswith((b'/100')): + exp_user, exp_group = expected.split(b'/') + got_user, got_group = got.split(b'/') + self.assertEqual(got_user, exp_user) + self.assertEqual(got_group, b'None') + elif attr_names == {'uid', 'uname'} and expected.startswith( + (b'tarfile/', b'lars/', b'foo/', b'1000/')): + exp_user, exp_group = expected.split(b'/') + got_user, got_group = got.split(b'/') + self.assertEqual(got_group, exp_group) + self.assertEqual(got_user, b'None') + elif attr_names == {'gname', 'gid'} and expected.endswith( + (b'/tarfile', b'/users', b'/bar', b'/100')): + exp_user, exp_group = expected.split(b'/') + got_user, got_group = got.split(b'/') + self.assertEqual(got_user, exp_user) + self.assertEqual(got_group, b'None') + else: + # In other cases the output should be the same + self.assertEqual(expected, got) + +def _filemode_to_int(mode): + """Inverse of `stat.filemode` (for permission bits) + + Using mode strings rather than numbers makes the later tests more readable. + """ + str_mode = mode[1:] + result = ( + {'r': stat.S_IRUSR, '-': 0}[str_mode[0]] + | {'w': stat.S_IWUSR, '-': 0}[str_mode[1]] + | {'x': stat.S_IXUSR, '-': 0, + 's': stat.S_IXUSR | stat.S_ISUID, + 'S': stat.S_ISUID}[str_mode[2]] + | {'r': stat.S_IRGRP, '-': 0}[str_mode[3]] + | {'w': stat.S_IWGRP, '-': 0}[str_mode[4]] + | {'x': stat.S_IXGRP, '-': 0, + 's': stat.S_IXGRP | stat.S_ISGID, + 'S': stat.S_ISGID}[str_mode[5]] + | {'r': stat.S_IROTH, '-': 0}[str_mode[6]] + | {'w': stat.S_IWOTH, '-': 0}[str_mode[7]] + | {'x': stat.S_IXOTH, '-': 0, + 't': stat.S_IXOTH | stat.S_ISVTX, + 'T': stat.S_ISVTX}[str_mode[8]] + ) + # check we did this right + assert stat.filemode(result)[1:] == mode[1:] + + return result + +class ArchiveMaker: + """Helper to create a tar file with specific contents + + Usage: + + with ArchiveMaker() as t: + t.add('filename', ...) + + with t.open() as tar: + ... # `tar` is now a TarFile with 'filename' in it! + """ + def __init__(self): + self.bio = io.BytesIO() + + def __enter__(self): + self.tar_w = tarfile.TarFile(mode='w', fileobj=self.bio) + return self + + def __exit__(self, *exc): + self.tar_w.close() + self.contents = self.bio.getvalue() + self.bio = None + + def add(self, name, *, type=None, symlink_to=None, hardlink_to=None, + mode=None, **kwargs): + """Add a member to the test archive. Call within `with`.""" + name = str(name) + tarinfo = tarfile.TarInfo(name).replace(**kwargs) + if mode: + tarinfo.mode = _filemode_to_int(mode) + if symlink_to is not None: + type = tarfile.SYMTYPE + tarinfo.linkname = str(symlink_to) + if hardlink_to is not None: + type = tarfile.LNKTYPE + tarinfo.linkname = str(hardlink_to) + if name.endswith('/') and type is None: + type = tarfile.DIRTYPE + if type is not None: + tarinfo.type = type + if tarinfo.isreg(): + fileobj = io.BytesIO(bytes(tarinfo.size)) + else: + fileobj = None + self.tar_w.addfile(tarinfo, fileobj) + + def open(self, **kwargs): + """Open the resulting archive as TarFile. Call after `with`.""" + bio = io.BytesIO(self.contents) + return tarfile.open(fileobj=bio, **kwargs) + + +class TestExtractionFilters(unittest.TestCase): + + # A temporary directory for the extraction results. + # All files that "escape" the destination path should still end + # up in this directory. + outerdir = pathlib.Path(TEMPDIR) / 'outerdir' + + # The destination for the extraction, within `outerdir` + destdir = outerdir / 'dest' + + @contextmanager + def check_context(self, tar, filter): + """Extracts `tar` to `self.destdir` and allows checking the result + + If an error occurs, it must be checked using `expect_exception` + + Otherwise, all resulting files must be checked using `expect_file`, + except the destination directory itself and parent directories of + other files. + When checking directories, do so before their contents. + """ + with support.temp_dir(self.outerdir): + try: + tar.extractall(self.destdir, filter=filter) + except Exception as exc: + self.raised_exception = exc + self.expected_paths = set() + else: + self.raised_exception = None + self.expected_paths = set(self.outerdir.glob('**/*')) + self.expected_paths.discard(self.destdir) + try: + yield + finally: + tar.close() + if self.raised_exception: + raise self.raised_exception + self.assertEqual(self.expected_paths, set()) + + def expect_file(self, name, type=None, symlink_to=None, mode=None): + """Check a single file. See check_context.""" + if self.raised_exception: + raise self.raised_exception + # use normpath() rather than resolve() so we don't follow symlinks + path = pathlib.Path(os.path.normpath(self.destdir / name)) + self.assertIn(path, self.expected_paths) + self.expected_paths.remove(path) + if mode is not None: + got = stat.filemode(stat.S_IMODE(path.stat().st_mode)) + self.assertEqual(got, mode) + if type is None and isinstance(name, str) and name.endswith('/'): + type = tarfile.DIRTYPE + if symlink_to is not None: + got = (self.destdir / name).readlink() + expected = pathlib.Path(symlink_to) + # The symlink might be the same (textually) as what we expect, + # but some systems change the link to an equivalent path, so + # we fall back to samefile(). + if expected != got: + self.assertTrue(got.samefile(expected)) + elif type == tarfile.REGTYPE or type is None: + self.assertTrue(path.is_file()) + elif type == tarfile.DIRTYPE: + self.assertTrue(path.is_dir()) + elif type == tarfile.FIFOTYPE: + self.assertTrue(path.is_fifo()) + else: + raise NotImplementedError(type) + for parent in path.parents: + self.expected_paths.discard(parent) + + def expect_exception(self, exc_type, message_re='.'): + with self.assertRaisesRegex(exc_type, message_re): + if self.raised_exception is not None: + raise self.raised_exception + self.raised_exception = None + + def test_benign_file(self): + with ArchiveMaker() as arc: + arc.add('benign.txt') + for filter in 'fully_trusted', 'tar', 'data': + with self.check_context(arc.open(), filter): + self.expect_file('benign.txt') + + def test_absolute(self): + # Test handling a member with an absolute path + # Inspired by 'absolute1' in https://github.com/jwilk/traversal-archives + with ArchiveMaker() as arc: + arc.add(self.outerdir / 'escaped.evil') + + with self.check_context(arc.open(), 'fully_trusted'): + self.expect_file('../escaped.evil') + + for filter in 'tar', 'data': + with self.check_context(arc.open(), filter): + if str(self.outerdir).startswith('/'): + # We strip leading slashes, as e.g. GNU tar does + # (without --absolute-filenames). + outerdir_stripped = str(self.outerdir).lstrip('/') + self.expect_file(f'{outerdir_stripped}/escaped.evil') + else: + # On this system, absolute paths don't have leading + # slashes. + # So, there's nothing to strip. We refuse to unpack + # to an absolute path, nonetheless. + self.expect_exception( + tarfile.AbsolutePathError, + """['"].*escaped.evil['"] has an absolute path""") + + def test_parent_symlink(self): + # Test interplaying symlinks + # Inspired by 'dirsymlink2a' in jwilk/traversal-archives + with ArchiveMaker() as arc: + arc.add('current', symlink_to='.') + arc.add('parent', symlink_to='current/..') + arc.add('parent/evil') + + if support.can_symlink(): + with self.check_context(arc.open(), 'fully_trusted'): + if self.raised_exception is not None: + # Windows will refuse to create a file that's a symlink to itself + # (and tarfile doesn't swallow that exception) + self.expect_exception(FileExistsError) + # The other cases will fail with this error too. + # Skip the rest of this test. + return + else: + self.expect_file('current', symlink_to='.') + self.expect_file('parent', symlink_to='current/..') + self.expect_file('../evil') + + with self.check_context(arc.open(), 'tar'): + self.expect_exception( + tarfile.OutsideDestinationError, + """'parent/evil' would be extracted to ['"].*evil['"], """ + + "which is outside the destination") + + with self.check_context(arc.open(), 'data'): + self.expect_exception( + tarfile.LinkOutsideDestinationError, + """'parent' would link to ['"].*outerdir['"], """ + + "which is outside the destination") + + else: + # No symlink support. The symlinks are ignored. + with self.check_context(arc.open(), 'fully_trusted'): + self.expect_file('parent/evil') + with self.check_context(arc.open(), 'tar'): + self.expect_file('parent/evil') + with self.check_context(arc.open(), 'data'): + self.expect_file('parent/evil') + + def test_parent_symlink2(self): + # Test interplaying symlinks + # Inspired by 'dirsymlink2b' in jwilk/traversal-archives + with ArchiveMaker() as arc: + arc.add('current', symlink_to='.') + arc.add('current/parent', symlink_to='..') + arc.add('parent/evil') + + with self.check_context(arc.open(), 'fully_trusted'): + if support.can_symlink(): + self.expect_file('current', symlink_to='.') + self.expect_file('parent', symlink_to='..') + self.expect_file('../evil') + else: + self.expect_file('current/') + self.expect_file('parent/evil') + + with self.check_context(arc.open(), 'tar'): + if support.can_symlink(): + self.expect_exception( + tarfile.OutsideDestinationError, + "'parent/evil' would be extracted to " + + """['"].*evil['"], which is outside """ + + "the destination") + else: + self.expect_file('current/') + self.expect_file('parent/evil') + + with self.check_context(arc.open(), 'data'): + self.expect_exception( + tarfile.LinkOutsideDestinationError, + """'current/parent' would link to ['"].*['"], """ + + "which is outside the destination") + + def test_absolute_symlink(self): + # Test symlink to an absolute path + # Inspired by 'dirsymlink' in jwilk/traversal-archives + with ArchiveMaker() as arc: + arc.add('parent', symlink_to=self.outerdir) + arc.add('parent/evil') + + with self.check_context(arc.open(), 'fully_trusted'): + if support.can_symlink(): + self.expect_file('parent', symlink_to=self.outerdir) + self.expect_file('../evil') + else: + self.expect_file('parent/evil') + + with self.check_context(arc.open(), 'tar'): + if support.can_symlink(): + self.expect_exception( + tarfile.OutsideDestinationError, + "'parent/evil' would be extracted to " + + """['"].*evil['"], which is outside """ + + "the destination") + else: + self.expect_file('parent/evil') + + with self.check_context(arc.open(), 'data'): + self.expect_exception( + tarfile.AbsoluteLinkError, + "'parent' is a symlink to an absolute path") + + def test_sly_relative0(self): + # Inspired by 'relative0' in jwilk/traversal-archives + with ArchiveMaker() as arc: + arc.add('../moo', symlink_to='..//tmp/moo') + + try: + with self.check_context(arc.open(), filter='fully_trusted'): + if support.can_symlink(): + if isinstance(self.raised_exception, FileExistsError): + # XXX TarFile happens to fail creating a parent + # directory. + # This might be a bug, but fixing it would hurt + # security. + # Note that e.g. GNU `tar` rejects '..' components, + # so you could argue this is an invalid archive and we + # just raise an bad type of exception. + self.expect_exception(FileExistsError) + else: + self.expect_file('../moo', symlink_to='..//tmp/moo') + else: + # The symlink can't be extracted and is ignored + pass + except FileExistsError: + pass + + for filter in 'tar', 'data': + with self.check_context(arc.open(), filter): + self.expect_exception( + tarfile.OutsideDestinationError, + "'../moo' would be extracted to " + + "'.*moo', which is outside " + + "the destination") + + def test_sly_relative2(self): + # Inspired by 'relative2' in jwilk/traversal-archives + with ArchiveMaker() as arc: + arc.add('tmp/') + arc.add('tmp/../../moo', symlink_to='tmp/../..//tmp/moo') + + with self.check_context(arc.open(), 'fully_trusted'): + self.expect_file('tmp', type=tarfile.DIRTYPE) + if support.can_symlink(): + self.expect_file('../moo', symlink_to='tmp/../../tmp/moo') + + for filter in 'tar', 'data': + with self.check_context(arc.open(), filter): + self.expect_exception( + tarfile.OutsideDestinationError, + "'tmp/../../moo' would be extracted to " + + """['"].*moo['"], which is outside the """ + + "destination") + + def test_modes(self): + # Test how file modes are extracted + # (Note that the modes are ignored on platforms without working chmod) + with ArchiveMaker() as arc: + arc.add('all_bits', mode='?rwsrwsrwt') + arc.add('perm_bits', mode='?rwxrwxrwx') + arc.add('exec_group_other', mode='?rw-rwxrwx') + arc.add('read_group_only', mode='?---r-----') + arc.add('no_bits', mode='?---------') + arc.add('dir/', mode='?---rwsrwt', type=tarfile.DIRTYPE) + + with self.check_context(arc.open(), 'fully_trusted'): + self.expect_file('all_bits', mode='?rwsrwsrwt') + self.expect_file('perm_bits', mode='?rwxrwxrwx') + self.expect_file('exec_group_other', mode='?rw-rwxrwx') + self.expect_file('read_group_only', mode='?---r-----') + self.expect_file('no_bits', mode='?---------') + self.expect_file('dir', type=tarfile.DIRTYPE, mode='?---rwsrwt') + + with self.check_context(arc.open(), 'tar'): + self.expect_file('all_bits', mode='?rwxr-xr-x') + self.expect_file('perm_bits', mode='?rwxr-xr-x') + self.expect_file('exec_group_other', mode='?rw-r-xr-x') + self.expect_file('read_group_only', mode='?---r-----') + self.expect_file('no_bits', mode='?---------') + self.expect_file('dir/', type=tarfile.DIRTYPE, mode='?---r-xr-x') + + with self.check_context(arc.open(), 'data'): + normal_dir_mode = stat.filemode(stat.S_IMODE( + self.outerdir.stat().st_mode)) + self.expect_file('all_bits', mode='?rwxr-xr-x') + self.expect_file('perm_bits', mode='?rwxr-xr-x') + self.expect_file('exec_group_other', mode='?rw-r--r--') + self.expect_file('read_group_only', mode='?rw-r-----') + self.expect_file('no_bits', mode='?rw-------') + self.expect_file('dir/', type=tarfile.DIRTYPE, mode=normal_dir_mode) + + def test_pipe(self): + # Test handling of a special file + with ArchiveMaker() as arc: + arc.add('foo', type=tarfile.FIFOTYPE) + + for filter in 'fully_trusted', 'tar': + with self.check_context(arc.open(), filter): + if hasattr(os, 'mkfifo'): + self.expect_file('foo', type=tarfile.FIFOTYPE) + else: + # The pipe can't be extracted and is skipped. + pass + + with self.check_context(arc.open(), 'data'): + self.expect_exception( + tarfile.SpecialFileError, + "'foo' is a special file") + + def test_special_files(self): + # Creating device files is tricky. Instead of attempting that let's + # only check the filter result. + for special_type in tarfile.FIFOTYPE, tarfile.CHRTYPE, tarfile.BLKTYPE: + tarinfo = tarfile.TarInfo('foo') + tarinfo.type = special_type + trusted = tarfile.fully_trusted_filter(tarinfo, '') + self.assertIs(trusted, tarinfo) + tar = tarfile.tar_filter(tarinfo, '') + self.assertEqual(tar.type, special_type) + with self.assertRaises(tarfile.SpecialFileError) as cm: + tarfile.data_filter(tarinfo, '') + self.assertIsInstance(cm.exception.tarinfo, tarfile.TarInfo) + self.assertEqual(cm.exception.tarinfo.name, 'foo') + + def test_fully_trusted_filter(self): + # The 'fully_trusted' filter returns the original TarInfo objects. + with tarfile.TarFile.open(tarname) as tar: + for tarinfo in tar.getmembers(): + filtered = tarfile.fully_trusted_filter(tarinfo, '') + self.assertIs(filtered, tarinfo) + + def test_tar_filter(self): + # The 'tar' filter returns TarInfo objects with the same name/type. + # (It can also fail for particularly "evil" input, but we don't have + # that in the test archive.) + with tarfile.TarFile.open(tarname) as tar: + for tarinfo in tar.getmembers(): + filtered = tarfile.tar_filter(tarinfo, '') + self.assertIs(filtered.name, tarinfo.name) + self.assertIs(filtered.type, tarinfo.type) + + def test_data_filter(self): + # The 'data' filter either raises, or returns TarInfo with the same + # name/type. + with tarfile.TarFile.open(tarname) as tar: + for tarinfo in tar.getmembers(): + try: + filtered = tarfile.data_filter(tarinfo, '') + except tarfile.FilterError: + continue + self.assertIs(filtered.name, tarinfo.name) + self.assertIs(filtered.type, tarinfo.type) + + def test_default_filter_warns(self): + """Ensure the default filter warns""" + with ArchiveMaker() as arc: + arc.add('foo') + with warnings_helper.check_warnings( + ('Python 3.14', DeprecationWarning)): + with self.check_context(arc.open(), None): + self.expect_file('foo') + + def test_change_default_filter_on_instance(self): + tar = tarfile.TarFile(tarname, 'r') + def strict_filter(tarinfo, path): + if tarinfo.name == 'ustar/regtype': + return tarinfo + else: + return None + tar.extraction_filter = strict_filter + with self.check_context(tar, None): + self.expect_file('ustar/regtype') + + def test_change_default_filter_on_class(self): + def strict_filter(tarinfo, path): + if tarinfo.name == 'ustar/regtype': + return tarinfo + else: + return None + tar = tarfile.TarFile(tarname, 'r') + with support.swap_attr(tarfile.TarFile, 'extraction_filter', + staticmethod(strict_filter)): + with self.check_context(tar, None): + self.expect_file('ustar/regtype') + + def test_change_default_filter_on_subclass(self): + class TarSubclass(tarfile.TarFile): + def extraction_filter(self, tarinfo, path): + if tarinfo.name == 'ustar/regtype': + return tarinfo + else: + return None + + tar = TarSubclass(tarname, 'r') + with self.check_context(tar, None): + self.expect_file('ustar/regtype') + + def test_change_default_filter_to_string(self): + tar = tarfile.TarFile(tarname, 'r') + tar.extraction_filter = 'data' + with self.check_context(tar, None): + self.expect_exception(TypeError) + + def test_custom_filter(self): + def custom_filter(tarinfo, path): + self.assertIs(path, self.destdir) + if tarinfo.name == 'move_this': + return tarinfo.replace(name='moved') + if tarinfo.name == 'ignore_this': + return None + return tarinfo + + with ArchiveMaker() as arc: + arc.add('move_this') + arc.add('ignore_this') + arc.add('keep') + with self.check_context(arc.open(), custom_filter): + self.expect_file('moved') + self.expect_file('keep') + + def test_bad_filter_name(self): + with ArchiveMaker() as arc: + arc.add('foo') + with self.check_context(arc.open(), 'bad filter name'): + self.expect_exception(ValueError) + + def test_stateful_filter(self): + # Stateful filters should be possible. + # (This doesn't really test tarfile. Rather, it demonstrates + # that third parties can implement a stateful filter.) + class StatefulFilter: + def __enter__(self): + self.num_files_processed = 0 + return self + + def __call__(self, tarinfo, path): + try: + tarinfo = tarfile.data_filter(tarinfo, path) + except tarfile.FilterError: + return None + self.num_files_processed += 1 + return tarinfo + + def __exit__(self, *exc_info): + self.done = True + + with ArchiveMaker() as arc: + arc.add('good') + arc.add('bad', symlink_to='/') + arc.add('good') + with StatefulFilter() as custom_filter: + with self.check_context(arc.open(), custom_filter): + self.expect_file('good') + self.assertEqual(custom_filter.num_files_processed, 2) + self.assertEqual(custom_filter.done, True) + + def test_errorlevel(self): + def extracterror_filter(tarinfo, path): + raise tarfile.ExtractError('failed with ExtractError') + def filtererror_filter(tarinfo, path): + raise tarfile.FilterError('failed with FilterError') + def oserror_filter(tarinfo, path): + raise OSError('failed with OSError') + def tarerror_filter(tarinfo, path): + raise tarfile.TarError('failed with base TarError') + def valueerror_filter(tarinfo, path): + raise ValueError('failed with ValueError') + + with ArchiveMaker() as arc: + arc.add('file') + + # If errorlevel is 0, errors affected by errorlevel are ignored + + with self.check_context(arc.open(errorlevel=0), extracterror_filter): + self.expect_file('file') + + with self.check_context(arc.open(errorlevel=0), filtererror_filter): + self.expect_file('file') + + with self.check_context(arc.open(errorlevel=0), oserror_filter): + self.expect_file('file') + + with self.check_context(arc.open(errorlevel=0), tarerror_filter): + self.expect_exception(tarfile.TarError) + + with self.check_context(arc.open(errorlevel=0), valueerror_filter): + self.expect_exception(ValueError) + + # If 1, all fatal errors are raised + + with self.check_context(arc.open(errorlevel=1), extracterror_filter): + self.expect_file('file') + + with self.check_context(arc.open(errorlevel=1), filtererror_filter): + self.expect_exception(tarfile.FilterError) + + with self.check_context(arc.open(errorlevel=1), oserror_filter): + self.expect_exception(OSError) + + with self.check_context(arc.open(errorlevel=1), tarerror_filter): + self.expect_exception(tarfile.TarError) + + with self.check_context(arc.open(errorlevel=1), valueerror_filter): + self.expect_exception(ValueError) + + # If 2, all non-fatal errors are raised as well. + + with self.check_context(arc.open(errorlevel=2), extracterror_filter): + self.expect_exception(tarfile.ExtractError) + + with self.check_context(arc.open(errorlevel=2), filtererror_filter): + self.expect_exception(tarfile.FilterError) + + with self.check_context(arc.open(errorlevel=2), oserror_filter): + self.expect_exception(OSError) + + with self.check_context(arc.open(errorlevel=2), tarerror_filter): + self.expect_exception(tarfile.TarError) + + with self.check_context(arc.open(errorlevel=2), valueerror_filter): + self.expect_exception(ValueError) + + # We only handle ExtractionError, FilterError & OSError specially. + + with self.check_context(arc.open(errorlevel='boo!'), filtererror_filter): + self.expect_exception(TypeError) # errorlevel is not int + + def setUpModule(): support.unlink(TEMPDIR) os.makedirs(TEMPDIR) diff --git a/Misc/NEWS.d/next/Library/2023-03-23-15-24-38.gh-issue-102953.YR4KaK.rst b/Misc/NEWS.d/next/Library/2023-03-23-15-24-38.gh-issue-102953.YR4KaK.rst new file mode 100644 index 00000000000000..48a105a4a17b29 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-03-23-15-24-38.gh-issue-102953.YR4KaK.rst @@ -0,0 +1,4 @@ +The extraction methods in :mod:`tarfile`, and :func:`shutil.unpack_archive`, +have a new a *filter* argument that allows limiting tar features than may be +surprising or dangerous, such as creating files outside the destination +directory. See :ref:`tarfile-extraction-filter` for details. From 9106981373c485e0105f33fe087ae691f32a24bf Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Tue, 16 May 2023 14:34:16 +0200 Subject: [PATCH 02/12] Downgrade to Python 3.6 --- Lib/shutil.py | 3 ++- Lib/test/test_shutil.py | 3 +-- Lib/test/test_tarfile.py | 12 +++++------- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/Lib/shutil.py b/Lib/shutil.py index c27186df1bdeb9..a2e35cc80aa8a7 100644 --- a/Lib/shutil.py +++ b/Lib/shutil.py @@ -1005,7 +1005,8 @@ def unpack_archive(filename, extract_dir=None, format=None, *, filter=None): raise ReadError("Unknown archive format '{0}'".format(filename)) func = _UNPACK_FORMATS[format][1] - kwargs = dict(_UNPACK_FORMATS[format][2]) | filter_kwargs + kwargs = dict(_UNPACK_FORMATS[format][2]) + kwargs.update(filter_kwargs) func(filename, extract_dir, **kwargs) diff --git a/Lib/test/test_shutil.py b/Lib/test/test_shutil.py index 2fcaa6ac4a0a7a..6370a4bffb2d9f 100644 --- a/Lib/test/test_shutil.py +++ b/Lib/test/test_shutil.py @@ -23,7 +23,6 @@ from test import support from test.support import TESTFN, FakePath -from test.support import warnings_helper TESTFN2 = TESTFN + "2" @@ -1276,7 +1275,7 @@ def check_unpack_archive_with_converter(self, format, converter, **kwargs): def check_unpack_tarball(self, format): self.check_unpack_archive(format, filter='fully_trusted') self.check_unpack_archive(format, filter='data') - with warnings_helper.check_warnings( + with support.check_warnings( ('The default', RuntimeWarning)): self.check_unpack_archive(format) diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index f4b3a4d9c70b56..3b7d514816c71e 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -16,7 +16,6 @@ from test import support from test.support import script_helper -from test.support import warnings_helper # Check for our compression modules. try: @@ -2729,8 +2728,7 @@ def setUpClass(cls): tar.errorlevel = 0 with ExitStack() as cm: if cls.extraction_filter is None: - cm.enter_context(warnings.catch_warnings( - action="ignore", category=DeprecationWarning)) + cm.enter_context(warnings.catch_warnings()) tar.extractall(cls.control_dir, filter=cls.extraction_filter) tar.close() cls.control_paths = set( @@ -2860,8 +2858,8 @@ def test_list(self): for attr_names in ({'mtime'}, {'mode'}, {'uid'}, {'gid'}, {'uname'}, {'gname'}, {'uid', 'uname'}, {'gid', 'gname'}): - with (self.subTest(attr_names=attr_names), - tarfile.open(tarname, encoding="iso8859-1") as tar): + with self.subTest(attr_names=attr_names), \ + tarfile.open(tarname, encoding="iso8859-1") as tar: tio_prev = io.TextIOWrapper(io.BytesIO(), 'ascii', newline='\n') with support.swap_attr(sys, 'stdout', tio_prev): tar.list() @@ -3052,7 +3050,7 @@ def expect_file(self, name, type=None, symlink_to=None, mode=None): if type is None and isinstance(name, str) and name.endswith('/'): type = tarfile.DIRTYPE if symlink_to is not None: - got = (self.destdir / name).readlink() + got = pathlib.Path(os.readlink(self.destdir / name)) expected = pathlib.Path(symlink_to) # The symlink might be the same (textually) as what we expect, # but some systems change the link to an equivalent path, so @@ -3369,7 +3367,7 @@ def test_default_filter_warns(self): """Ensure the default filter warns""" with ArchiveMaker() as arc: arc.add('foo') - with warnings_helper.check_warnings( + with support.check_warnings( ('Python 3.14', DeprecationWarning)): with self.check_context(arc.open(), None): self.expect_file('foo') From 3ab896be0b764dc7feddd08b92052ec544410d5a Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Tue, 25 Apr 2023 14:27:45 +0200 Subject: [PATCH 03/12] Remove the DeprecationWarning --- Lib/tarfile.py | 5 ----- Lib/test/test_shutil.py | 3 +-- Lib/test/test_tarfile.py | 14 +++++--------- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/Lib/tarfile.py b/Lib/tarfile.py index 8c220bb20d780b..80bd04f980eca8 100755 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -2164,11 +2164,6 @@ def _get_filter_function(self, filter): if filter is None: filter = self.extraction_filter if filter is None: - warnings.warn( - 'Python 3.14 will, by default, filter extracted tar ' - + 'archives and reject files or modify their metadata. ' - + 'Use the filter argument to control this behavior.', - DeprecationWarning) return fully_trusted_filter if isinstance(filter, str): raise TypeError( diff --git a/Lib/test/test_shutil.py b/Lib/test/test_shutil.py index 6370a4bffb2d9f..433c55fd9a8b77 100644 --- a/Lib/test/test_shutil.py +++ b/Lib/test/test_shutil.py @@ -1275,8 +1275,7 @@ def check_unpack_archive_with_converter(self, format, converter, **kwargs): def check_unpack_tarball(self, format): self.check_unpack_archive(format, filter='fully_trusted') self.check_unpack_archive(format, filter='data') - with support.check_warnings( - ('The default', RuntimeWarning)): + with support.check_no_warnings(self): self.check_unpack_archive(format) def test_unpack_archive_tar(self): diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index 3b7d514816c71e..f25ffeec96adb4 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -2,7 +2,7 @@ import os import io from hashlib import md5 -from contextlib import contextmanager, ExitStack +from contextlib import contextmanager from random import Random import pathlib import shutil @@ -2726,10 +2726,7 @@ def setUpClass(cls): tar = tarfile.open(tarname, mode='r', encoding="iso8859-1") cls.control_dir = pathlib.Path(TEMPDIR) / "extractall_ctrl" tar.errorlevel = 0 - with ExitStack() as cm: - if cls.extraction_filter is None: - cm.enter_context(warnings.catch_warnings()) - tar.extractall(cls.control_dir, filter=cls.extraction_filter) + tar.extractall(cls.control_dir, filter=cls.extraction_filter) tar.close() cls.control_paths = set( p.relative_to(cls.control_dir) @@ -3363,12 +3360,11 @@ def test_data_filter(self): self.assertIs(filtered.name, tarinfo.name) self.assertIs(filtered.type, tarinfo.type) - def test_default_filter_warns(self): - """Ensure the default filter warns""" + def test_default_filter_warns_not(self): + """Ensure the default filter does not warn (like in 3.12)""" with ArchiveMaker() as arc: arc.add('foo') - with support.check_warnings( - ('Python 3.14', DeprecationWarning)): + with support.check_no_warnings(self): with self.check_context(arc.open(), None): self.expect_file('foo') From 300a1a82e1539b5d777f6f28baefe401267e4a54 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Wed, 17 May 2023 15:28:20 +0200 Subject: [PATCH 04/12] Backport test.support.check_no_warnings --- Lib/test/support/__init__.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/Lib/test/support/__init__.py b/Lib/test/support/__init__.py index 8de486fa876cd5..646001ba27de8b 100644 --- a/Lib/test/support/__init__.py +++ b/Lib/test/support/__init__.py @@ -1246,6 +1246,30 @@ def check_warnings(*filters, **kwargs): return _filterwarnings(filters, quiet) +@contextlib.contextmanager +def check_no_warnings(testcase, message='', category=Warning, force_gc=False): + """Context manager to check that no warnings are emitted. + + This context manager enables a given warning within its scope + and checks that no warnings are emitted even with that warning + enabled. + + If force_gc is True, a garbage collection is attempted before checking + for warnings. This may help to catch warnings emitted when objects + are deleted, such as ResourceWarning. + + Other keyword arguments are passed to warnings.filterwarnings(). + """ + with warnings.catch_warnings(record=True) as warns: + warnings.filterwarnings('always', + message=message, + category=category) + yield + if force_gc: + gc_collect() + testcase.assertEqual(warns, []) + + @contextlib.contextmanager def check_no_resource_warning(testcase): """Context manager to check that no ResourceWarning is emitted. From 7ea805059d5eaf069732acf1814ae1854d511cbe Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Tue, 25 Apr 2023 14:28:21 +0200 Subject: [PATCH 05/12] Remove new __all__ entries --- Lib/tarfile.py | 5 +---- Lib/test/test_tarfile.py | 32 ++++++++++++++++++++------------ 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/Lib/tarfile.py b/Lib/tarfile.py index 80bd04f980eca8..cb9d25ee511c47 100755 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -70,10 +70,7 @@ __all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError", "ReadError", "CompressionError", "StreamError", "ExtractError", "HeaderError", "ENCODING", "USTAR_FORMAT", "GNU_FORMAT", "PAX_FORMAT", - "DEFAULT_FORMAT", "open","fully_trusted_filter", "data_filter", - "tar_filter", "FilterError", "AbsoluteLinkError", - "OutsideDestinationError", "SpecialFileError", "AbsolutePathError", - "LinkOutsideDestinationError"] + "DEFAULT_FORMAT", "open"] #--------------------------------------------------------- diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index f25ffeec96adb4..54f4e417b1c489 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -2184,18 +2184,26 @@ def test_number_field_limits(self): tarfile.itn(0x10000000000, 6, tarfile.GNU_FORMAT) def test__all__(self): - blacklist = { - 'version', 'grp', 'pwd', 'symlink_exception', 'NUL', 'BLOCKSIZE', - 'RECORDSIZE', 'GNU_MAGIC', 'POSIX_MAGIC', 'LENGTH_NAME', - 'LENGTH_LINK', 'LENGTH_PREFIX', 'REGTYPE', 'AREGTYPE', 'LNKTYPE', - 'SYMTYPE', 'CHRTYPE', 'BLKTYPE', 'DIRTYPE', 'FIFOTYPE', 'CONTTYPE', - 'GNUTYPE_LONGNAME', 'GNUTYPE_LONGLINK', 'GNUTYPE_SPARSE', - 'XHDTYPE', 'XGLTYPE', 'SOLARIS_XHDTYPE', 'SUPPORTED_TYPES', - 'REGULAR_TYPES', 'GNU_TYPES', 'PAX_FIELDS', 'PAX_NAME_FIELDS', - 'PAX_NUMBER_FIELDS', 'stn', 'nts', 'nti', 'itn', 'calc_chksums', - 'copyfileobj', 'filemode', 'EmptyHeaderError', - 'TruncatedHeaderError', 'EOFHeaderError', 'InvalidHeaderError', - 'SubsequentHeaderError', 'ExFileObject', 'main'} + blacklist = {'version', 'grp', 'pwd', 'symlink_exception', + 'NUL', 'BLOCKSIZE', 'RECORDSIZE', 'GNU_MAGIC', + 'POSIX_MAGIC', 'LENGTH_NAME', 'LENGTH_LINK', + 'LENGTH_PREFIX', 'REGTYPE', 'AREGTYPE', 'LNKTYPE', + 'SYMTYPE', 'CHRTYPE', 'BLKTYPE', 'DIRTYPE', 'FIFOTYPE', + 'CONTTYPE', 'GNUTYPE_LONGNAME', 'GNUTYPE_LONGLINK', + 'GNUTYPE_SPARSE', 'XHDTYPE', 'XGLTYPE', 'SOLARIS_XHDTYPE', + 'SUPPORTED_TYPES', 'REGULAR_TYPES', 'GNU_TYPES', + 'PAX_FIELDS', 'PAX_NAME_FIELDS', 'PAX_NUMBER_FIELDS', + 'stn', 'nts', 'nti', 'itn', 'calc_chksums', 'copyfileobj', + 'filemode', + 'EmptyHeaderError', 'TruncatedHeaderError', + 'EOFHeaderError', 'InvalidHeaderError', + 'SubsequentHeaderError', 'ExFileObject', + 'main', + 'fully_trusted_filter', 'data_filter', + 'tar_filter', 'FilterError', 'AbsoluteLinkError', + 'OutsideDestinationError', 'SpecialFileError', + 'AbsolutePathError', 'LinkOutsideDestinationError', + } support.check__all__(self, tarfile, blacklist=blacklist) From 921e5f1dfa230d1187ca706308e0effa7912b64f Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Tue, 25 Apr 2023 14:56:01 +0200 Subject: [PATCH 06/12] gh-102950: Adjust tarfile filter tests for systems that don't set the sticky bit (GH-103831) Also remove expilcit `type=tarfile.DIRTYPE`, the slash at the end is enough. Backport of c8c3956d905e019101038b018129a4c90c9c9b8f --- Lib/test/test_tarfile.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index 54f4e417b1c489..873fd4d1151b87 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -3278,15 +3278,35 @@ def test_modes(self): arc.add('exec_group_other', mode='?rw-rwxrwx') arc.add('read_group_only', mode='?---r-----') arc.add('no_bits', mode='?---------') - arc.add('dir/', mode='?---rwsrwt', type=tarfile.DIRTYPE) + arc.add('dir/', mode='?---rwsrwt') + + # On some systems, setting the sticky bit is a no-op. + # Check if that's the case. + tmp_filename = os.path.join(TEMPDIR, "tmp.file") + with open(tmp_filename, 'w'): + pass + os.chmod(tmp_filename, os.stat(tmp_filename).st_mode | stat.S_ISVTX) + have_sticky_files = (os.stat(tmp_filename).st_mode & stat.S_ISVTX) + os.unlink(tmp_filename) + + os.mkdir(tmp_filename) + os.chmod(tmp_filename, os.stat(tmp_filename).st_mode | stat.S_ISVTX) + have_sticky_dirs = (os.stat(tmp_filename).st_mode & stat.S_ISVTX) + os.rmdir(tmp_filename) with self.check_context(arc.open(), 'fully_trusted'): - self.expect_file('all_bits', mode='?rwsrwsrwt') + if have_sticky_files: + self.expect_file('all_bits', mode='?rwsrwsrwt') + else: + self.expect_file('all_bits', mode='?rwsrwsrwx') self.expect_file('perm_bits', mode='?rwxrwxrwx') self.expect_file('exec_group_other', mode='?rw-rwxrwx') self.expect_file('read_group_only', mode='?---r-----') self.expect_file('no_bits', mode='?---------') - self.expect_file('dir', type=tarfile.DIRTYPE, mode='?---rwsrwt') + if have_sticky_dirs: + self.expect_file('dir/', mode='?---rwsrwt') + else: + self.expect_file('dir/', mode='?---rwsrwx') with self.check_context(arc.open(), 'tar'): self.expect_file('all_bits', mode='?rwxr-xr-x') @@ -3294,7 +3314,7 @@ def test_modes(self): self.expect_file('exec_group_other', mode='?rw-r-xr-x') self.expect_file('read_group_only', mode='?---r-----') self.expect_file('no_bits', mode='?---------') - self.expect_file('dir/', type=tarfile.DIRTYPE, mode='?---r-xr-x') + self.expect_file('dir/', mode='?---r-xr-x') with self.check_context(arc.open(), 'data'): normal_dir_mode = stat.filemode(stat.S_IMODE( @@ -3304,7 +3324,7 @@ def test_modes(self): self.expect_file('exec_group_other', mode='?rw-r--r--') self.expect_file('read_group_only', mode='?rw-r-----') self.expect_file('no_bits', mode='?rw-------') - self.expect_file('dir/', type=tarfile.DIRTYPE, mode=normal_dir_mode) + self.expect_file('dir/', mode=normal_dir_mode) def test_pipe(self): # Test handling of a special file From fc9913db5bd7e0dfd676f8bafc69f837d31037b8 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Tue, 9 May 2023 16:34:08 +0200 Subject: [PATCH 07/12] Skip chmod checking on Windows --- Lib/test/test_tarfile.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index 873fd4d1151b87..c09a25d554fcdb 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -3049,9 +3049,14 @@ def expect_file(self, name, type=None, symlink_to=None, mode=None): path = pathlib.Path(os.path.normpath(self.destdir / name)) self.assertIn(path, self.expected_paths) self.expected_paths.remove(path) - if mode is not None: + + # When checking mode, ignore Windows (which can only set user read and + # user write bits). Newer versions of Python use `os_helper.can_chmod()` + # instead of hardcoding Windows. + if mode is not None and sys.platform != 'win32': got = stat.filemode(stat.S_IMODE(path.stat().st_mode)) self.assertEqual(got, mode) + if type is None and isinstance(name, str) and name.endswith('/'): type = tarfile.DIRTYPE if symlink_to is not None: From a11a1988cab99b3fa1acdadbbb854c1c2eaef536 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Tue, 25 Apr 2023 14:17:07 +0200 Subject: [PATCH 08/12] Adjust the docs for 3.7.17 --- Doc/library/shutil.rst | 7 ++-- Doc/library/tarfile.rst | 72 ++++++++++++++++++----------------------- Doc/whatsnew/3.7.rst | 16 +++++++++ 3 files changed, 52 insertions(+), 43 deletions(-) diff --git a/Doc/library/shutil.rst b/Doc/library/shutil.rst index 772f555a8a46e4..01ea9e35bd4e80 100644 --- a/Doc/library/shutil.rst +++ b/Doc/library/shutil.rst @@ -553,8 +553,9 @@ provided. They rely on the :mod:`zipfile` and :mod:`tarfile` modules. registered for that extension. In case none is found, a :exc:`ValueError` is raised. - The keyword-only *filter* argument is passed to the underlying unpacking - function. For zip files, *filter* is not accepted. + The keyword-only *filter* argument, which was added in Python 3.7.17, + is passed to the underlying unpacking function. + For zip files, *filter* is not accepted. For tar files, it is recommended to set it to ``'data'``, unless using features specific to tar and UNIX-like filesystems. (See :ref:`tarfile-extraction-filter` for details.) @@ -564,7 +565,7 @@ provided. They rely on the :mod:`zipfile` and :mod:`tarfile` modules. .. versionchanged:: 3.7 Accepts a :term:`path-like object` for *filename* and *extract_dir*. - .. versionchanged:: 3.8.17 + .. versionchanged:: 3.7.17 Added the *filter* argument. .. function:: register_unpack_format(name, extensions, function[, extra_args[, description]]) diff --git a/Doc/library/tarfile.rst b/Doc/library/tarfile.rst index 15efbccdd88fc8..2610fe3a683f01 100644 --- a/Doc/library/tarfile.rst +++ b/Doc/library/tarfile.rst @@ -36,13 +36,6 @@ Some facts and figures: .. versionchanged:: 3.3 Added support for :mod:`lzma` compression. -.. versionchanged:: 3.12 - Archives are extracted using a :ref:`filter `, - which makes it possible to either limit surprising/dangerous features, - or to acknowledge that they are expected and the archive is fully trusted. - By default, archives are fully trusted, but this default is deprecated - and slated to change in Python 3.14. - .. function:: open(name=None, mode='r', fileobj=None, bufsize=10240, \*\*kwargs) @@ -425,8 +418,8 @@ be finalized; only the internally used file object will be closed. See the are used to set the owner/group for the extracted files. Otherwise, the named values from the tarfile are used. - The *filter* argument specifies how ``members`` are modified or rejected - before extraction. + The *filter* argument, which was added in Python 3.7.17, specifies how + ``members`` are modified or rejected before extraction. See :ref:`tarfile-extraction-filter` for details. It is recommended to set this explicitly depending on which *tar* features you need to support. @@ -447,7 +440,7 @@ be finalized; only the internally used file object will be closed. See the .. versionchanged:: 3.6 The *path* parameter accepts a :term:`path-like object`. - .. versionchanged:: 3.12 + .. versionchanged:: 3.7.17 Added the *filter* parameter. @@ -483,7 +476,7 @@ be finalized; only the internally used file object will be closed. See the .. versionchanged:: 3.6 The *path* parameter accepts a :term:`path-like object`. - .. versionchanged:: 3.12 + .. versionchanged:: 3.7.17 Added the *filter* parameter. @@ -498,7 +491,6 @@ be finalized; only the internally used file object will be closed. See the Return an :class:`io.BufferedReader` object. .. attribute:: TarFile.errorlevel - :type: int If *errorlevel* is ``0``, errors are ignored when using :meth:`TarFile.extract` and :meth:`TarFile.extractall`. @@ -520,7 +512,7 @@ be finalized; only the internally used file object will be closed. See the .. attribute:: TarFile.extraction_filter - .. versionadded:: 3.12 + .. versionadded:: 3.7.17 The :ref:`extraction filter ` used as a default for the *filter* argument of :meth:`~TarFile.extract` @@ -531,10 +523,12 @@ be finalized; only the internally used file object will be closed. See the argument to :meth:`~TarFile.extract`. If ``extraction_filter`` is ``None`` (the default), - calling an extraction method without a *filter* argument will raise a - ``DeprecationWarning``, - and fall back to the :func:`fully_trusted ` filter, - whose dangerous behavior matches previous versions of Python. + calling an extraction method without a *filter* argument will + use the :func:`fully_trusted ` filter for + compatibility with previous Python versions. + + In Python 3.12+, leaving ``extraction_filter=None`` will emit a + ``DeprecationWarning``. In Python 3.14+, leaving ``extraction_filter=None`` will cause extraction methods to use the :func:`data ` filter by default. @@ -639,6 +633,11 @@ Different :class:`TarInfo` methods handle ``None`` differently: - :meth:`~TarFile.addfile` will fail. - :meth:`~TarFile.list` will print a placeholder string. + +.. versionchanged:: 3.7.17 + Added :meth:`~TarInfo.replace` and handling of ``None``. + + .. class:: TarInfo(name="") Create a :class:`TarInfo` object. @@ -670,35 +669,31 @@ A ``TarInfo`` object has the following public data attributes: .. attribute:: TarInfo.name - :type: str Name of the archive member. .. attribute:: TarInfo.size - :type: int Size in bytes. .. attribute:: TarInfo.mtime - :type: int | float Time of last modification in seconds since the :ref:`epoch `, as in :attr:`os.stat_result.st_mtime`. - .. versionchanged:: 3.12 + .. versionchanged:: 3.7.17 Can be set to ``None`` for :meth:`~TarFile.extract` and :meth:`~TarFile.extractall`, causing extraction to skip applying this attribute. .. attribute:: TarInfo.mode - :type: int Permission bits, as for :func:`os.chmod`. - .. versionchanged:: 3.12 + .. versionchanged:: 3.7.17 Can be set to ``None`` for :meth:`~TarFile.extract` and :meth:`~TarFile.extractall`, causing extraction to skip applying this @@ -714,58 +709,52 @@ A ``TarInfo`` object has the following public data attributes: .. attribute:: TarInfo.linkname - :type: str Name of the target file name, which is only present in :class:`TarInfo` objects of type :const:`LNKTYPE` and :const:`SYMTYPE`. .. attribute:: TarInfo.uid - :type: int User ID of the user who originally stored this member. - .. versionchanged:: 3.12 + .. versionchanged:: 3.7.17 Can be set to ``None`` for :meth:`~TarFile.extract` and :meth:`~TarFile.extractall`, causing extraction to skip applying this attribute. .. attribute:: TarInfo.gid - :type: int Group ID of the user who originally stored this member. - .. versionchanged:: 3.12 + .. versionchanged:: 3.7.17 Can be set to ``None`` for :meth:`~TarFile.extract` and :meth:`~TarFile.extractall`, causing extraction to skip applying this attribute. .. attribute:: TarInfo.uname - :type: str User name. - .. versionchanged:: 3.12 + .. versionchanged:: 3.7.17 Can be set to ``None`` for :meth:`~TarFile.extract` and :meth:`~TarFile.extractall`, causing extraction to skip applying this attribute. .. attribute:: TarInfo.gname - :type: str Group name. - .. versionchanged:: 3.12 + .. versionchanged:: 3.7.17 Can be set to ``None`` for :meth:`~TarFile.extract` and :meth:`~TarFile.extractall`, causing extraction to skip applying this attribute. .. attribute:: TarInfo.pax_headers - :type: dict A dictionary containing key-value pairs of an associated pax extended header. @@ -773,7 +762,7 @@ A ``TarInfo`` object has the following public data attributes: uid=..., gid=..., uname=..., gname=..., deep=True) - .. versionadded:: 3.12 + .. versionadded:: 3.7.17 Return a *new* copy of the :class:`!TarInfo` object with the given attributes changed. For example, to return a ``TarInfo`` with the group name set to @@ -838,7 +827,7 @@ A :class:`TarInfo` object also provides some convenient query methods: Extraction filters ------------------ -.. versionadded:: 3.12 +.. versionadded:: 3.7.17 The *tar* format is designed to capture all details of a UNIX-like filesystem, which makes it very powerful. @@ -875,9 +864,10 @@ can be: * ``None`` (default): Use :attr:`TarFile.extraction_filter`. - If that is also ``None`` (the default), raise a ``DeprecationWarning``, - and fall back to the ``'fully_trusted'`` filter, whose dangerous behavior - matches previous versions of Python. + If that is also ``None`` (the default), the ``'fully_trusted'`` + filter will be used (for compatibility with earlier versions of Python). + + In Python 3.12, the default will emit a ``DeprecationWarning``. In Python 3.14, the ``'data'`` filter will become the default instead. It's possible to switch earlier; see :attr:`TarFile.extraction_filter`. @@ -1014,7 +1004,7 @@ Also note that: Supporting older Python versions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Extraction filters were added to Python 3.12, but may be backported to older +Extraction filters were added to Python 3.12, and are backported to older versions as security updates. To check whether the feature is available, use e.g. ``hasattr(tarfile, 'data_filter')`` rather than checking the Python version. @@ -1161,6 +1151,8 @@ Command-line options Only string names are accepted (that is, ``fully_trusted``, ``tar``, and ``data``). + .. versionadded:: 3.7.17 + .. _tar-examples: Examples @@ -1170,7 +1162,7 @@ How to extract an entire tar archive to the current working directory:: import tarfile tar = tarfile.open("sample.tar.gz") - tar.extractall(filter='data') + tar.extractall() tar.close() How to extract a subset of a tar archive with :meth:`TarFile.extractall` using diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst index 21f96228cf1cd6..1ba291d846ec77 100644 --- a/Doc/whatsnew/3.7.rst +++ b/Doc/whatsnew/3.7.rst @@ -2615,3 +2615,19 @@ This limit can be configured or disabled by environment variable, command line flag, or :mod:`sys` APIs. See the :ref:`integer string conversion length limitation ` documentation. The default limit is 4300 digits in string form. + +Notable Changes in 3.7.17 +========================= + +tarfile +------- + +* The extraction methods in :mod:`tarfile`, and :func:`shutil.unpack_archive`, + have a new a *filter* argument that allows limiting tar features than may be + surprising or dangerous, such as creating files outside the destination + directory. + See :ref:`tarfile-extraction-filter` for details. + In Python 3.12, use without the *filter* argument will show a + :exc:`DeprecationWarning`. + In Python 3.14, the default will switch to ``'data'``. + (Contributed by Petr Viktorin in :pep:`706`.) From 8332ea3e6b2436256d87ae645dfec5c8bbe1c9ab Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Tue, 16 May 2023 14:41:38 +0200 Subject: [PATCH 09/12] Backport warning to shutil.unpack_archive docs --- Doc/library/shutil.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Doc/library/shutil.rst b/Doc/library/shutil.rst index 01ea9e35bd4e80..91ad1ad85cecc3 100644 --- a/Doc/library/shutil.rst +++ b/Doc/library/shutil.rst @@ -562,6 +562,13 @@ provided. They rely on the :mod:`zipfile` and :mod:`tarfile` modules. The ``'data'`` filter will become the default for tar files in Python 3.14. + .. warning:: + + Never extract archives from untrusted sources without prior inspection. + It is possible that files are created outside of the path specified in + the *extract_dir* argument, e.g. members that have absolute filenames + starting with "/" or filenames with two dots "..". + .. versionchanged:: 3.7 Accepts a :term:`path-like object` for *filename* and *extract_dir*. From b626549298621aa3d4aa26da7016d4c4dae96205 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Tue, 23 May 2023 15:20:06 +0200 Subject: [PATCH 10/12] Skip test when an obscure Windows pathlib error appears --- Lib/test/test_tarfile.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index c09a25d554fcdb..09b8fcb3b8e856 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -2736,9 +2736,19 @@ def setUpClass(cls): tar.errorlevel = 0 tar.extractall(cls.control_dir, filter=cls.extraction_filter) tar.close() + try: + paths = list(pathlib.Path(cls.control_dir).glob('**/*')) + except OSError as err: + if getattr(err, 'winerror', None) == 123: + # 123 is Windows error ERROR_INVALID_NAME + # glob() fails this way when symlink targets are too long + # Fixed in 3.8+: https://github.com/python/cpython/issues/79487 + raise unittest.SkipTest('Path.glob failed') + else: + raise cls.control_paths = set( p.relative_to(cls.control_dir) - for p in pathlib.Path(cls.control_dir).glob('**/*')) + for p in paths) @classmethod def tearDownClass(cls): @@ -3031,7 +3041,16 @@ def check_context(self, tar, filter): self.expected_paths = set() else: self.raised_exception = None - self.expected_paths = set(self.outerdir.glob('**/*')) + try: + self.expected_paths = set(self.outerdir.glob('**/*')) + except OSError as err + if getattr(err, 'winerror', None) == 123: + # 123 is Windows error ERROR_INVALID_NAME + # glob() fails this way when symlink targets are too long + # Fixed in 3.8+: https://github.com/python/cpython/issues/79487 + raise unittest.SkipTest('Path.glob failed') + else: + raise self.expected_paths.discard(self.destdir) try: yield From 7ae4a888c04a5d69448aeaa7ef2350a97fb73004 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Tue, 23 May 2023 15:23:50 +0200 Subject: [PATCH 11/12] Feed a string to os.readlink() --- Lib/test/test_tarfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index 09b8fcb3b8e856..eae01f9c9e05e9 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -3079,7 +3079,7 @@ def expect_file(self, name, type=None, symlink_to=None, mode=None): if type is None and isinstance(name, str) and name.endswith('/'): type = tarfile.DIRTYPE if symlink_to is not None: - got = pathlib.Path(os.readlink(self.destdir / name)) + got = pathlib.Path(os.readlink(str(self.destdir / name))) expected = pathlib.Path(symlink_to) # The symlink might be the same (textually) as what we expect, # but some systems change the link to an equivalent path, so From 64d1a92126b210540b1e793799161a98de7cd91f Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Thu, 25 May 2023 16:08:52 +0200 Subject: [PATCH 12/12] Fix syntax error I pushed the wrong commit O.o --- Lib/test/test_tarfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index eae01f9c9e05e9..5a4fd36bba4e5c 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -3043,7 +3043,7 @@ def check_context(self, tar, filter): self.raised_exception = None try: self.expected_paths = set(self.outerdir.glob('**/*')) - except OSError as err + except OSError as err: if getattr(err, 'winerror', None) == 123: # 123 is Windows error ERROR_INVALID_NAME # glob() fails this way when symlink targets are too long