Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement cache size for CachingFileSystem #1377

Merged
merged 4 commits into from
Sep 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions fsspec/implementations/cached.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,11 @@ def __init__(
self.expiry = expiry_time
self.compression = compression

# Size of cache in bytes. If None then the size is unknown and will be
# recalculated the next time cache_size() is called. On writes to the
# cache this is reset to None.
self._cache_size = None

if same_names is not None and cache_mapper is not None:
raise ValueError(
"Cannot specify both same_names and cache_mapper in "
Expand Down Expand Up @@ -165,6 +170,17 @@ def _remove_tempdir(tempdir):
def _mkcache(self):
os.makedirs(self.storage[-1], exist_ok=True)

def cache_size(self):
"""Return size of cache in bytes.

If more than one cache directory is in use, only the size of the last
one (the writable cache directory) is returned.
"""
if self._cache_size is None:
cache_dir = self.storage[-1]
self._cache_size = filesystem("file").du(cache_dir, withdirs=True)
return self._cache_size

def load_cache(self):
"""Read set of stored blocks from file"""
self._metadata.load()
Expand All @@ -176,6 +192,7 @@ def save_cache(self):
self._mkcache()
self._metadata.save()
self.last_cache = time.time()
self._cache_size = None

def _check_cache(self):
"""Reload caches if time elapsed or any disappeared"""
Expand All @@ -202,6 +219,7 @@ def clear_cache(self):
"""
rmtree(self.storage[-1])
self.load_cache()
self._cache_size = None
martindurant marked this conversation as resolved.
Show resolved Hide resolved

def clear_expired_cache(self, expiry_time=None):
"""Remove all expired files and metadata from the cache
Expand Down Expand Up @@ -231,6 +249,8 @@ def clear_expired_cache(self, expiry_time=None):
rmtree(self.storage[-1])
self.load_cache()

self._cache_size = None

def pop_from_cache(self, path):
"""Remove cached version of given file

Expand All @@ -242,6 +262,7 @@ def pop_from_cache(self, path):
fn = self._metadata.pop_file(path)
if fn is not None:
os.remove(fn)
self._cache_size = None

def _open(
self,
Expand Down Expand Up @@ -389,6 +410,7 @@ def __getattribute__(self, item):
"__hash__",
"__eq__",
"to_json",
"cache_size",
]:
# all the methods defined in this class. Note `open` here, since
# it calls `_open`, but is actually in superclass
Expand Down Expand Up @@ -535,6 +557,7 @@ def commit_many(self, open_files):
os.remove(f.name)
except FileNotFoundError:
pass
self._cache_size = None

def _make_local_details(self, path):
hash = self._mapper(path)
Expand Down Expand Up @@ -704,6 +727,7 @@ def _open(self, path, mode="rb", **kwargs):
kwargs["mode"] = mode

self._mkcache()
self._cache_size = None
if self.compression:
with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
if isinstance(f, AbstractBufferedFile):
Expand Down
39 changes: 39 additions & 0 deletions fsspec/implementations/tests/test_cached.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
)
from fsspec.implementations.cached import CachingFileSystem, LocalTempFile
from fsspec.implementations.local import make_path_posix
from fsspec.tests.conftest import win

from .test_ftp import FTPFileSystem

Expand Down Expand Up @@ -1211,3 +1212,41 @@ def test_cache_dir_auto_deleted(temp_cache, tmpdir):
assert not local.exists(cache_dir)
else:
assert local.exists(cache_dir)


@pytest.mark.parametrize("protocol", ["filecache", "blockcache", "simplecache"])
def test_cache_size(tmpdir, protocol):
if win and protocol == "blockcache":
pytest.skip("Windows file locking affects blockcache size tests")

source = os.path.join(tmpdir, "source")
afile = os.path.join(source, "afile")
os.mkdir(source)
open(afile, "w").write("test")

fs = fsspec.filesystem(protocol, target_protocol="file")
empty_cache_size = fs.cache_size()

# Create cache
with fs.open(afile, "rb") as f:
assert f.read(5) == b"test"
single_file_cache_size = fs.cache_size()
assert single_file_cache_size > empty_cache_size

# Remove cached file but leave cache metadata file
fs.pop_from_cache(afile)
if win and protocol == "filecache":
empty_cache_size < fs.cache_size()
elif protocol != "simplecache":
assert empty_cache_size < fs.cache_size() < single_file_cache_size
else:
# simplecache never stores metadata
assert fs.cache_size() == single_file_cache_size

# Completely remove cache
fs.clear_cache()
if protocol != "simplecache":
assert fs.cache_size() == empty_cache_size
else:
# Whole cache directory has been deleted
assert fs.cache_size() == 0