diff --git a/pydala/catalog.py b/pydala/catalog.py index bcc790d..8416edf 100644 --- a/pydala/catalog.py +++ b/pydala/catalog.py @@ -11,7 +11,8 @@ from .dataset import CsvDataset, JsonDataset, ParquetDataset, PyarrowDataset from .filesystem import FileSystem -from .helpers.misc import delattr_rec, get_nested_keys, getattr_rec, setattr_rec +from .helpers.misc import (delattr_rec, get_nested_keys, getattr_rec, + setattr_rec) from .helpers.sql import get_table_names from .table import PydalaTable diff --git a/pydala/dataset.py b/pydala/dataset.py index b1cb5ba..5cbf915 100644 --- a/pydala/dataset.py +++ b/pydala/dataset.py @@ -18,10 +18,8 @@ from .helpers.polars import pl as _pl from .io import Writer from .metadata import ParquetDatasetMetadata, PydalaDatasetMetadata -from .schema import ( - replace_schema, # from .optimize import Optimize - shrink_large_string, -) +from .schema import replace_schema # from .optimize import Optimize +from .schema import shrink_large_string from .table import PydalaTable @@ -62,21 +60,31 @@ def __init__( # enable object caching for e.g. parquet metadata self.ddb_con.execute( f"""PRAGMA enable_object_cache; - SET THREADS={psutil.cpu_count()*2};""" + SET THREADS={psutil.cpu_count() * 2};""" ) self._timestamp_column = timestamp_column - #self.load_files() + # self.load_files() - if self.has_files: - if partitioning == "ignore": - self._partitioning = None - elif partitioning is None and "=" in self._files[0]: - self._partitioning = "hive" - else: - self._partitioning = partitioning + # NOTE: Set partitioning manually, if not set, try to infer it + if partitioning is None: + # try to infer partitioning + if any(["=" in obj for obj in self.fs.ls(self._path)]): + partitioning = "hive" else: - self._partitioning = partitioning + if partitioning == "ignore": + partitioning = None + self._partitioning = partitioning + + # if self.has_files: + # if partitioning == "ignore": + # self._partitioning = None + # elif partitioning is None and "=" in self._files[0]: + # self._partitioning = "hive" + # else: + # self._partitioning = partitioning + # else: + # self._partitioning = partitioning try: self.load() @@ -99,9 +107,7 @@ def load_files(self) -> None: self._files = [ fn.replace(self._path, "").lstrip("/") for fn in sorted( - self._filesystem.glob( - os.path.join(self._path, f"**/*.{self._format}") - ) + self._filesystem.glob(os.path.join(self._path, f"**/*.{self._format}")) ) ] @@ -155,15 +161,11 @@ def load(self): format=self._format, partitioning=self._partitioning, ) - self.table = PydalaTable( - result=self._arrow_dataset, ddb_con=self.ddb_con - ) + self.table = PydalaTable(result=self._arrow_dataset, ddb_con=self.ddb_con) # self.ddb_con.register("arrow__dataset", self._arrow_parquet_dataset) if self._timestamp_column is None: - self._timestamp_columns = get_timestamp_column( - self.table.pl.head(10) - ) + self._timestamp_columns = get_timestamp_column(self.table.pl.head(10)) if len(self._timestamp_columns) > 0: self._timestamp_column = self._timestamp_columns[0] @@ -345,9 +347,7 @@ def partition_names(self) -> list: if not hasattr(self, "_partition_names") and hasattr( self._arrow_dataset, "partitioning" ): - self._partition_names = ( - self._arrow_dataset.partitioning.schema.names - ) + self._partition_names = self._arrow_dataset.partitioning.schema.names return self._partition_names @@ -472,12 +472,7 @@ def filter( the method will automatically use DuckDB for filtering. 
""" - if any( - [ - s in filter_expr - for s in ["%", "like", "similar to", "*", "(", ")"] - ] - ): + if any([s in filter_expr for s in ["%", "like", "similar to", "*", "(", ")"]]): use = "duckdb" if use == "auto": @@ -505,9 +500,7 @@ def registered_tables(self) -> list[str]: Returns: list[str]: A list of table names. """ - return ( - self.ddb_con.sql("SHOW TABLES").arrow().column("name").to_pylist() - ) + return self.ddb_con.sql("SHOW TABLES").arrow().column("name").to_pylist() def interrupt_duckdb(self): """ @@ -569,9 +562,7 @@ def _get_delta_other_df( # _pl.first(col).alias("max"), _pl.last(col).alias("min") # ) # else: - max_min = df.select( - _pl.max(col).alias("max"), _pl.min(col).alias("min") - ) + max_min = df.select(_pl.max(col).alias("max"), _pl.min(col).alias("min")) if collect: max_min = max_min.collect() @@ -732,7 +723,7 @@ def write_to_dataset( self.delete_files(del_files) self.clear_cache() - #self.load_files() + # self.load_files() class ParquetDataset(PydalaDatasetMetadata, BaseDataset): @@ -855,14 +846,10 @@ def load( filesystem=self._filesystem, ) - self.table = PydalaTable( - result=self._arrow_dataset, ddb_con=self.ddb_con - ) + self.table = PydalaTable(result=self._arrow_dataset, ddb_con=self.ddb_con) if self._timestamp_column is None: - self._timestamp_columns = get_timestamp_column( - self.table.pl.head(10) - ) + self._timestamp_columns = get_timestamp_column(self.table.pl.head(10)) if len(self._timestamp_columns) > 0: self._timestamp_column = self._timestamp_columns[0] if self._timestamp_column is not None: @@ -1187,17 +1174,13 @@ def load(self): .opt_dtype(strict=False) .to_arrow() ) - self.table = PydalaTable( - result=self._arrow_dataset, ddb_con=self.ddb_con - ) + self.table = PydalaTable(result=self._arrow_dataset, ddb_con=self.ddb_con) self.ddb_con.register(f"{self.name}", self._arrow_dataset) # self.ddb_con.register("arrow__dataset", self._arrow_parquet_dataset) if self._timestamp_column is None: - self._timestamp_columns = get_timestamp_column( - self.table.pl.head(10) - ) + self._timestamp_columns = get_timestamp_column(self.table.pl.head(10)) if len(self._timestamp_columns) > 1: self._timestamp_column = self._timestamp_columns[0] @@ -1255,9 +1238,7 @@ def _compact_partition( # else: # num_rows = 0 - batches = scan.to_batch_reader( - sort_by=sort_by, batch_size=max_rows_per_file - ) + batches = scan.to_batch_reader(sort_by=sort_by, batch_size=max_rows_per_file) for batch in batches: self.write_to_dataset( pa.table(batch), @@ -1331,7 +1312,7 @@ def _compact_by_timeperiod( if len(self.scan_files) == 1: date_diff = ( self.metadata_table.filter( - f"file_path='{self.scan_files[0].replace(self._path,'').lstrip('/')}'" + f"file_path='{self.scan_files[0].replace(self._path, '').lstrip('/')}'" ) .aggregate("max(AE_DATUM.max) - min(AE_DATUM.min)") .fetchone()[0] @@ -1403,9 +1384,7 @@ def compact_by_timeperiod( end_dates = dates[1:] files_to_delete = [] - for start_date, end_date in tqdm.tqdm( - list(zip(start_dates, end_dates)) - ): + for start_date, end_date in tqdm.tqdm(list(zip(start_dates, end_dates))): files_to_delete_ = self._compact_by_timeperiod( start_date=start_date, end_date=end_date, @@ -1515,16 +1494,13 @@ def _optimize_dtypes( [ field for field in scan.arrow_dataset.schema - if field.name - not in scan.arrow_dataset.partitioning.schema.names + if field.name not in scan.arrow_dataset.partitioning.schema.names ] ) if schema != optimized_schema: table = replace_schema( - scan.pl.opt_dtype( - strict=strict, exclude=exclude, include=include - ) + 
scan.pl.opt_dtype(strict=strict, exclude=exclude, include=include) .collect(streaming=True) .to_arrow(), schema=optimized_schema, diff --git a/pydala/filesystem.py b/pydala/filesystem.py index e15d8e1..593f724 100644 --- a/pydala/filesystem.py +++ b/pydala/filesystem.py @@ -1,7 +1,6 @@ import datetime as dt import inspect import os -import asyncio from datetime import datetime, timedelta from functools import wraps from pathlib import Path @@ -13,13 +12,12 @@ import psutil import pyarrow as pa import pyarrow.dataset as pds -import pyarrow.parquet as pq import pyarrow.fs as pfs +import pyarrow.parquet as pq import s3fs from fsspec import AbstractFileSystem, filesystem from fsspec.implementations.cache_mapper import AbstractCacheMapper from fsspec.implementations.cached import SimpleCacheFileSystem - # from fsspec.implementations import cached as cachedfs from fsspec.implementations.dirfs import DirFileSystem from loguru import logger @@ -746,10 +744,58 @@ def sync_folder( self.cp(new_src, dst) -def list_files_recursive(self, path:str, format:str=""): - bucket, prefix = path.split("/", maxsplit=1) - return [f["Key"] for f in asyncio.run(self.s3.list_objects_v2(Bucket=bucket, Prefix=prefix))["Contents"] if f["Key"].endswith(format)] - +# NOTE: This is not working properly due to some event loop issues + +# def list_files_recursive(self, path: str, format: str = ""): +# bucket, prefix = path.split("/", maxsplit=1) +# return [ +# f["Key"] +# for f in asyncio.run(self.s3.list_objects_v2(Bucket=bucket, Prefix=prefix))[ +# "Contents" +# ] +# if f["Key"].endswith(format) +# ] + + +# async def _list_files_recursive( +# self, path: str, format: str = "", max_items: int = 10000 +# ): +# bucket, prefix = path.split("/", maxsplit=1) +# continuation_token = None +# files = [] + +# while True: +# if continuation_token: +# response = await self.s3.list_objects_v2( +# Bucket=bucket, +# Prefix=prefix, +# ContinuationToken=continuation_token, +# MaxKeys=max_items, +# ) +# else: +# response = await self.s3.list_objects_v2( +# Bucket=bucket, Prefix=prefix, MaxKeys=max_items +# ) + +# if "Contents" in response: +# files.extend( +# [f["Key"] for f in response["Contents"] if f["Key"].endswith(format)] +# ) + +# if response.get("IsTruncated"): # Check if there are more objects to retrieve +# continuation_token = response.get("NextContinuationToken") +# else: +# break + +# return files + + +# def list_files_recursive(self, path: str, format: str = "", max_items: int = 10000): +# loop = asyncio.get_event_loop() +# if loop.is_closed(): +# loop = asyncio.new_event_loop() +# asyncio.set_event_loop(loop) +# return loop.run_until_complete(_list_files_recursive(self, path, format, max_items)) AbstractFileSystem.read_parquet = read_parquet @@ -782,7 +828,7 @@ def list_files_recursive(self, path:str, format:str=""): # AbstractFileSystem.parallel_mv = parallel_mv # AbstractFileSystem.parallel_rm = parallel_rm AbstractFileSystem.sync_folder = sync_folder -AbstractFileSystem.list_files_recursive = list_files_recursive +# AbstractFileSystem.list_files_recursive = list_files_recursive def FileSystem( diff --git a/pydala/helpers/datetime.py b/pydala/helpers/datetime.py index 16cacb6..f36447b 100644 --- a/pydala/helpers/datetime.py +++ b/pydala/helpers/datetime.py @@ -1,11 +1,11 @@ import datetime as dt import re +from functools import lru_cache import pendulum as pdl import polars as pl import polars.selectors as cs import pyarrow as pa -from functools import lru_cache def get_timestamp_column(df: pl.DataFrame | pl.LazyFrame | 
pa.Table) -> str | list[str]: diff --git a/pydala/helpers/misc.py b/pydala/helpers/misc.py index 6972224..786fefb 100644 --- a/pydala/helpers/misc.py +++ b/pydala/helpers/misc.py @@ -2,7 +2,6 @@ import re from typing import Any -import pendulum as pdl import pyarrow as pa import pyarrow.compute as pc import pyarrow.parquet as pq diff --git a/pydala/io.py b/pydala/io.py index 3fede0e..24a21ee 100644 --- a/pydala/io.py +++ b/pydala/io.py @@ -8,7 +8,6 @@ import polars.selectors as cs import pyarrow as pa import pyarrow.dataset as pds - # import pyarrow.dataset as pds import pyarrow.parquet as pq from fsspec import AbstractFileSystem diff --git a/pydala/metadata.py b/pydala/metadata.py index 91b15e2..d635d3b 100644 --- a/pydala/metadata.py +++ b/pydala/metadata.py @@ -1,3 +1,4 @@ +import concurrent.futures import copy import os import pickle @@ -11,7 +12,6 @@ from fsspec import AbstractFileSystem from .filesystem import FileSystem, clear_cache - # from .helpers.metadata import collect_parquet_metadata # , remove_from_metadata from .helpers.misc import get_partitions_from_path, run_parallel from .schema import repair_schema, unify_schemas @@ -107,40 +107,48 @@ def remove_from_metadata( return metadata -def get_file_paths(metadata: pq.FileMetaData,) -> list[str]: - return [metadata.row_group(i).column(0).file_path for i in range(metadata.num_row_groups)] - -class FileMetadata: - def __init__( - self, - path:str, - filesystem: AbstractFileSystem | pfs.FileSystem | None = None, - bucket: str | None = None, - cached: bool = False, - **caching_options, - **kwargs, - ): - self._path = path - self._bucket = bucket - self._cached = cached - self._base_filesystem = filesystem - self._filesystem = FileSystem( - bucket=bucket, fs=filesystem, cached=cached, **caching_options - ) - - self._caching_options = caching_options - - self.load_files() - - def load_files(self): - self._files = self._filesystem.list_files_recursive(self._path) - - +def get_file_paths( + metadata: pq.FileMetaData, +) -> list[str]: + return [ + metadata.row_group(i).column(0).file_path + for i in range(metadata.num_row_groups) + ] + + +# class FileMetadata: +# def __init__( +# self, +# path:str, +# filesystem: AbstractFileSystem | pfs.FileSystem | None = None, +# bucket: str | None = None, +# cached: bool = False, +# **caching_options, +# **kwargs, +# ): +# self._path = path +# self._bucket = bucket +# self._cached = cached +# self._base_filesystem = filesystem +# self._filesystem = FileSystem( +# bucket=bucket, fs=filesystem, cached=cached, **caching_options +# ) +# +# self._caching_options = caching_options +# +# self.load_files() +# +# def load_files(self): +# self._files = self._filesystem.list_files_recursive(self._path) +# +# +# +# +# @property +# def fs(self): +# return self._filesystem - @property - def fs(self): - return self._filesystem class ParquetDatasetMetadata: def __init__( @@ -174,14 +182,14 @@ def __init__( ) self._makedirs() - #self.load_files() + # self.load_files() self._caching_options = caching_options self._metadata_file = os.path.join(path, "_metadata") self._file_metadata_file = os.path.join(path, "_file_metadata") self._metadata = self._read_metadata() - self._file_metadata = self._read_file_metadata() + self._file_metadata = None # self._read_file_metadata() if update_metadata: self.update() @@ -204,8 +212,19 @@ def _makedirs(self): self._filesystem.touch(os.path.join(self._path, "tmp.delete")) self._filesystem.rm(os.path.join(self._path, "tmp.delete")) - def load_files(self)->None: - self._files = 
get_file_paths(self._metadata) + def load_files(self) -> None: + if self.has_metadata: + self._files = get_file_paths(self._metadata) + else: + self.clear_cache() + self._files = [ + fn.replace(self._path, "").lstrip("/") + for fn in sorted( + self._filesystem.glob( + os.path.join(self._path, f"**/*.{self._format}") + ) + ) + ] def _ls_files(self) -> None: """ @@ -248,9 +267,7 @@ def _collect_file_metadata(self, files: list[str] | None = None, **kwargs) -> No # if file_metadata: for f in file_metadata: - file_metadata[f].set_file_path( - f - ) + file_metadata[f].set_file_path(f) if self.has_file_metadata: self._file_metadata.update(file_metadata) @@ -297,7 +314,7 @@ def update_file_metadata(self, files: list[str] | None = None, **kwargs) -> None rm_files += sorted(set(self._file_metadata.keys()) - set(all_files)) else: - new_files += sorted(set(new_files + self._files)) + new_files += sorted(set(all_files + self.files)) if files is not None: new_files = sorted(set(files + new_files)) @@ -349,7 +366,7 @@ def _get_unified_schema( if not self.has_file_metadata: self.update_file_metadata() - new_files = sorted((set(self._files) - set(self.files_in_metadata))) + new_files = sorted((set(self.files) - set(self.files_in_metadata))) if len(new_files): schemas = [ @@ -460,7 +477,8 @@ def _update_metadata_file(self, **kwargs): Returns: None """ - + if not self.has_file_metadata: + self._read_file_metadata() self._metadata_temp = copy.copy(self._metadata) # update metadata if self.has_file_metadata: @@ -654,7 +672,7 @@ def has_files(self): """ Returns True if the dataset has files, False otherwise. """ - return len(self._files) > 0 + return len(self.files) > 0 @property def files_in_metadata(self) -> list: @@ -747,6 +765,8 @@ def reset_scan(self): def _gen_metadata_table( metadata: pq.FileMetaData | list[pq.FileMetaData], partitioning: None | str | list[str] = None, + backend: str = "threading", + verbose: bool = True, ): """ Generates a polars DataFrame with statistics for each row group in the dataset. 
@@ -756,59 +776,59 @@ def _gen_metadata_table( metadata = [metadata] metadata_table = defaultdict(list) - for metadata_ in metadata: - for rg_num in range(metadata_.num_row_groups): - row_group = metadata_.row_group(rg_num) - file_path = row_group.column(0).file_path - metadata_table["file_path"].append(file_path) - metadata_table["num_columns"].append(row_group.num_columns) - metadata_table["num_rows"].append(row_group.num_rows) - metadata_table["total_byte_size"].append(row_group.total_byte_size) - metadata_table["compression"].append(row_group.column(0).compression) - - if "=" in file_path: - partitioning = partitioning or "hive" - - if partitioning is not None: - partitions = dict( - get_partitions_from_path(file_path, partitioning=partitioning) + + def process_row_group(metadata_, rg_num): + row_group = metadata_.row_group(rg_num) + file_path = row_group.column(0).file_path + result = { + "file_path": file_path, + "num_columns": row_group.num_columns, + "num_rows": row_group.num_rows, + "total_byte_size": row_group.total_byte_size, + "compression": row_group.column(0).compression, + } + + if "=" in file_path: + partitioning_ = partitioning or "hive" + else: + partitioning_ = partitioning + + if partitioning_ is not None: + partitions = dict( + get_partitions_from_path(file_path, partitioning=partitioning_) + ) + result.update(partitions) + + return result + + with concurrent.futures.ThreadPoolExecutor() as executor: + futures = [] + for metadata_ in metadata: + for rg_num in range(metadata_.num_row_groups): + futures.append( + executor.submit(process_row_group, metadata_, rg_num) ) - for part in partitions: - metadata_table[part].append(partitions[part]) - - for col_num in range(row_group.num_columns): - rgc = row_group.column(col_num) - rgc = rgc.to_dict() - col_name = rgc.pop("path_in_schema") - rgc.pop("file_path") - rgc.pop("compression") - if "statistics" in rgc: - if rgc["statistics"] is not None: - rgc.update(rgc.pop("statistics")) - else: - rgc.pop("statistics") - rgc.update( - { - "has_min_max": False, - "min": None, - "max": None, - "null_count": None, - "distinct_count": None, - "num_values": None, - "physical_type": "UNKNOWN", - } - ) - metadata_table[col_name].append(rgc) + + for future in concurrent.futures.as_completed(futures): + result = future.result() + for key, value in result.items(): + metadata_table[key].append(value) + return metadata_table def update_metadata_table( self, # metadata: pq.FileMetaData | list[pq.FileMetaData], # partitioning: None | str | list[str] = None, + backend: str = "threading", + verbose: bool = True, ): if self.has_metadata: metadata_table = self._gen_metadata_table( - metadata=self.metadata, partitioning=self._partitioning + metadata=self.metadata, + partitioning=self._partitioning, + backend=backend, + verbose=verbose, ) # self._metadata_table = pa.Table.from_pydict(metadata_table) self._metadata_table = self.ddb_con.from_arrow( diff --git a/pyproject.toml b/pyproject.toml index 0b030b5..f86dc61 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,29 +1,29 @@ [project] -authors = [{ name = "Volker L.", email = "ligno.blades@gmail.com" }] +authors = [{name = "Volker L.", email = "ligno.blades@gmail.com"}] dependencies = [ - "pyarrow>=12.0.1", - "s3fs>=2023.6.0", - "duckdb>=0.10.0", - "tqdm>=4.65.0", - "joblib>=1.3.2", - "pandas>=2.0.3", - "pendulum>=2.1.2", - "msgspec>=0.18.4", - "munch>=4.0.0", - "sqlglot>=23.11.2", - "sqlparse>=0.5.0", - "psutil>=6.0.0", - "pyyaml>=6.0.2", - "loguru>=0.7.2", + "pyarrow>=12.0.1", + "s3fs>=2023.6.0", 
+ "duckdb>=0.10.0", + "tqdm>=4.65.0", + "joblib>=1.3.2", + "pandas>=2.0.3", + "pendulum>=2.1.2", + "msgspec>=0.18.4", + "munch>=4.0.0", + "sqlglot>=23.11.2", + "sqlparse>=0.5.0", + "psutil>=6.0.0", + "pyyaml>=6.0.2", + "loguru>=0.7.2", ] description = "poor manĀ“s data lake" homepage = "https://github.com/legout/pydala2" -license = { text = "MIT" } +license = {text = "MIT"} name = "pydala2" readme = "README.md" repository = "https://github.com/legout/pydala2" requires-python = ">= 3.10" -version = "0.8.8.3" +version = "0.9.0" [project.optional-dependencies] legacy = ["polars-lts-cpu>=0.20.4"] @@ -35,29 +35,30 @@ requires = ["hatchling"] [tool.rye] dev-dependencies = [ - "ipython>=8.26.0", - "isort>=5.13.2", - "polars>=1.5.0", - "loguru>=0.7.2", - "ipykernel>=6.29.5", - "msgpack>=1.1.0", - "orjson>=3.10.7", - "fastparquet>=2024.5.0", - "lxml>=5.3.0", - "dill>=0.3.9", - "blosc>=1.11.2", - "compress-pickle>=2.1.0", - "zstandard>=0.23.0", - "brotli>=1.1.0", - "lz4>=4.3.3", - "snappy>=3.1.1", - "python-snappy>=0.7.3", - "deltalake>=0.20.2", - "datafusion>=42.0.0", - "ibis>=3.3.0", - "ibis-framework[duckdb,polars]>=9.5.0", - "ruff>=0.7.1", - "adlfs>=2024.7.0", + "ipython>=8.26.0", + "isort>=5.13.2", + "polars>=1.5.0", + "loguru>=0.7.2", + "ipykernel>=6.29.5", + "msgpack>=1.1.0", + "orjson>=3.10.7", + "fastparquet>=2024.5.0", + "lxml>=5.3.0", + "dill>=0.3.9", + "blosc>=1.11.2", + "compress-pickle>=2.1.0", + "zstandard>=0.23.0", + "brotli>=1.1.0", + "lz4>=4.3.3", + "snappy>=3.1.1", + "python-snappy>=0.7.3", + "deltalake>=0.20.2", + "datafusion>=42.0.0", + "ibis>=3.3.0", + "ibis-framework[duckdb,polars]>=9.5.0", + "ruff>=0.7.1", + "adlfs>=2024.7.0", + "jupyterlab>=4.3.0", ] managed = true diff --git a/requirements-dev.lock b/requirements-dev.lock index 4f90907..d098f88 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -15,7 +15,7 @@ aiobotocore==2.15.2 # via s3fs aiohappyeyeballs==2.4.3 # via aiohttp -aiohttp==3.10.10 +aiohttp==3.11.7 # via adlfs # via aiobotocore # via s3fs @@ -23,15 +23,28 @@ aioitertools==0.12.0 # via aiobotocore aiosignal==1.3.1 # via aiohttp +anyio==4.6.2.post1 + # via httpx + # via jupyter-server appnope==0.1.4 # via ipykernel +argon2-cffi==23.1.0 + # via jupyter-server +argon2-cffi-bindings==21.2.0 + # via argon2-cffi +arrow==1.3.0 + # via isoduration asttokens==2.4.1 # via stack-data +async-lru==2.0.4 + # via jupyterlab atpublic==5.0 # via ibis-framework attrs==24.2.0 # via aiohttp -azure-core==1.31.0 + # via jsonschema + # via referencing +azure-core==1.32.0 # via adlfs # via azure-identity # via azure-storage-blob @@ -39,15 +52,24 @@ azure-datalake-store==0.0.53 # via adlfs azure-identity==1.19.0 # via adlfs -azure-storage-blob==12.23.1 +azure-storage-blob==12.24.0 # via adlfs +babel==2.16.0 + # via jupyterlab-server +beautifulsoup4==4.12.3 + # via nbconvert +bleach==6.2.0 + # via nbconvert blosc==1.11.2 botocore==1.35.36 # via aiobotocore brotli==1.1.0 certifi==2024.8.30 + # via httpcore + # via httpx # via requests cffi==1.17.1 + # via argon2-cffi-bindings # via azure-datalake-store # via cryptography charset-normalizer==3.4.0 @@ -66,20 +88,26 @@ cryptography==43.0.3 cypari==2.5.5 # via snappy datafusion==42.0.0 -debugpy==1.8.7 +debugpy==1.8.9 # via ipykernel decorator==5.1.1 # via ipython # via snappy # via spherogram -deltalake==0.20.2 +defusedxml==0.7.1 + # via nbconvert +deltalake==0.21.0 dill==0.3.9 -duckdb==1.1.2 +duckdb==1.1.3 # via ibis-framework # via pydala2 executing==2.1.0 # via stack-data -fastparquet==2024.5.0 
+fastjsonschema==2.20.0 + # via nbformat +fastparquet==2024.11.0 +fqdn==1.5.1 + # via jsonschema frozenlist==1.5.0 # via aiohttp # via aiosignal @@ -87,31 +115,81 @@ fsspec==2024.10.0 # via adlfs # via fastparquet # via s3fs -fxrays==1.3.5 +fxrays==1.3.6 # via snappy +h11==0.14.0 + # via httpcore +httpcore==1.0.7 + # via httpx +httpx==0.27.2 + # via jupyterlab ibis==3.3.0 ibis-framework==9.5.0 idna==3.10 + # via anyio + # via httpx + # via jsonschema # via requests # via yarl ipykernel==6.29.5 + # via jupyterlab ipython==8.29.0 # via ipykernel # via snappy isodate==0.7.2 # via azure-storage-blob +isoduration==20.11.0 + # via jsonschema isort==5.13.2 -jedi==0.19.1 +jedi==0.19.2 # via ipython +jinja2==3.1.4 + # via jupyter-server + # via jupyterlab + # via jupyterlab-server + # via nbconvert jmespath==1.0.1 # via botocore joblib==1.4.2 # via pydala2 +json5==0.9.28 + # via jupyterlab-server +jsonpointer==3.0.0 + # via jsonschema +jsonschema==4.23.0 + # via jupyter-events + # via jupyterlab-server + # via nbformat +jsonschema-specifications==2024.10.1 + # via jsonschema jupyter-client==8.6.3 # via ipykernel + # via jupyter-server + # via nbclient jupyter-core==5.7.2 # via ipykernel # via jupyter-client + # via jupyter-server + # via jupyterlab + # via nbclient + # via nbconvert + # via nbformat +jupyter-events==0.10.0 + # via jupyter-server +jupyter-lsp==2.2.5 + # via jupyterlab +jupyter-server==2.14.2 + # via jupyter-lsp + # via jupyterlab + # via jupyterlab-server + # via notebook-shim +jupyter-server-terminals==0.5.3 + # via jupyter-server +jupyterlab==4.3.1 +jupyterlab-pygments==0.3.0 + # via nbconvert +jupyterlab-server==2.27.3 + # via jupyterlab knot-floer-homology==1.2 # via spherogram loguru==0.7.2 @@ -122,12 +200,17 @@ lxml==5.3.0 lz4==4.3.3 markdown-it-py==3.0.0 # via rich +markupsafe==3.0.2 + # via jinja2 + # via nbconvert matplotlib-inline==0.1.7 # via ipykernel # via ipython mdurl==0.1.2 # via markdown-it-py -msal==1.31.0 +mistune==3.0.2 + # via nbconvert +msal==1.31.1 # via azure-datalake-store # via azure-identity # via msal-extensions @@ -141,25 +224,43 @@ multidict==6.1.0 # via yarl munch==4.0.0 # via pydala2 +nbclient==0.10.0 + # via nbconvert +nbconvert==7.16.4 + # via jupyter-server +nbformat==5.10.4 + # via jupyter-server + # via nbclient + # via nbconvert nest-asyncio==1.6.0 # via ipykernel networkx==3.4.2 # via spherogram -numpy==2.1.2 +notebook-shim==0.2.4 + # via jupyterlab +numpy==2.1.3 # via fastparquet # via ibis-framework # via pandas # via pyarrow -orjson==3.10.10 -packaging==24.1 +orjson==3.10.11 +overrides==7.7.0 + # via jupyter-server +packaging==24.2 # via fastparquet # via ibis-framework # via ipykernel + # via jupyter-server + # via jupyterlab + # via jupyterlab-server + # via nbconvert # via snappy pandas==2.2.3 # via fastparquet # via ibis-framework # via pydala2 +pandocfilters==1.5.1 + # via nbconvert parso==0.8.4 # via jedi parsy==2.1 @@ -172,19 +273,23 @@ platformdirs==4.3.6 # via jupyter-core plink==2.4.2 # via snappy -polars==1.12.0 +polars==1.14.0 # via ibis-framework portalocker==2.10.1 # via msal-extensions +prometheus-client==0.21.0 + # via jupyter-server prompt-toolkit==3.0.48 # via ipython propcache==0.2.0 + # via aiohttp # via yarl psutil==6.1.0 # via ipykernel # via pydala2 ptyprocess==0.7.0 # via pexpect + # via terminado pure-eval==0.2.3 # via stack-data pyarrow==17.0.0 @@ -198,61 +303,99 @@ pycparser==2.22 # via cffi pygments==2.18.0 # via ipython + # via nbconvert # via rich -pyjwt==2.9.0 +pyjwt==2.10.0 # via msal pypng==0.20220715.0 # via 
snappy python-dateutil==2.9.0.post0 + # via arrow # via botocore # via ibis-framework # via jupyter-client # via pandas # via pendulum # via time-machine +python-json-logger==2.0.7 + # via jupyter-events python-snappy==0.7.3 pytz==2024.2 # via ibis-framework # via pandas pyyaml==6.0.2 + # via jupyter-events # via pydala2 pyzmq==26.2.0 # via ipykernel # via jupyter-client + # via jupyter-server +referencing==0.35.1 + # via jsonschema + # via jsonschema-specifications + # via jupyter-events requests==2.32.3 # via azure-core # via azure-datalake-store + # via jupyterlab-server # via msal -rich==13.9.3 +rfc3339-validator==0.1.4 + # via jsonschema + # via jupyter-events +rfc3986-validator==0.1.1 + # via jsonschema + # via jupyter-events +rich==13.9.4 # via ibis-framework -ruff==0.7.1 +rpds-py==0.21.0 + # via jsonschema + # via referencing +ruff==0.8.0 s3fs==2024.10.0 # via pydala2 +send2trash==1.8.3 + # via jupyter-server +setuptools==75.6.0 + # via jupyterlab six==1.16.0 # via asttokens # via azure-core # via python-dateutil + # via rfc3339-validator snappy==3.1.1 snappy-manifolds==1.2 # via snappy # via spherogram +sniffio==1.3.1 + # via anyio + # via httpx +soupsieve==2.6 + # via beautifulsoup4 spherogram==2.2.1 # via snappy sqlglot==25.20.2 # via ibis-framework # via pydala2 -sqlparse==0.5.1 +sqlparse==0.5.2 # via pydala2 stack-data==0.6.3 # via ipython +terminado==0.18.1 + # via jupyter-server + # via jupyter-server-terminals time-machine==2.16.0 # via pendulum +tinycss2==1.4.0 + # via nbconvert toolz==0.12.1 # via ibis-framework -tornado==6.4.1 +tornado==6.4.2 # via ipykernel # via jupyter-client -tqdm==4.66.6 + # via jupyter-server + # via jupyterlab + # via terminado +tqdm==4.67.0 # via pydala2 traitlets==5.14.3 # via comm @@ -260,7 +403,15 @@ traitlets==5.14.3 # via ipython # via jupyter-client # via jupyter-core + # via jupyter-events + # via jupyter-server + # via jupyterlab # via matplotlib-inline + # via nbclient + # via nbconvert + # via nbformat +types-python-dateutil==2.9.0.20241003 + # via arrow typing-extensions==4.12.2 # via azure-core # via azure-identity @@ -270,13 +421,22 @@ typing-extensions==4.12.2 tzdata==2024.2 # via pandas # via pendulum +uri-template==1.3.0 + # via jsonschema urllib3==2.2.3 # via botocore # via requests wcwidth==0.2.13 # via prompt-toolkit -wrapt==1.16.0 +webcolors==24.11.1 + # via jsonschema +webencodings==0.5.1 + # via bleach + # via tinycss2 +websocket-client==1.8.0 + # via jupyter-server +wrapt==1.17.0 # via aiobotocore -yarl==1.17.0 +yarl==1.18.0 # via aiohttp zstandard==0.23.0 diff --git a/requirements.lock b/requirements.lock index c2ae102..9b1d603 100644 --- a/requirements.lock +++ b/requirements.lock @@ -14,7 +14,7 @@ aiobotocore==2.15.2 # via s3fs aiohappyeyeballs==2.4.3 # via aiohttp -aiohttp==3.10.10 +aiohttp==3.11.7 # via aiobotocore # via s3fs aioitertools==0.12.0 @@ -25,7 +25,7 @@ attrs==24.2.0 # via aiohttp botocore==1.35.36 # via aiobotocore -duckdb==1.1.2 +duckdb==1.1.3 # via pydala2 frozenlist==1.5.0 # via aiohttp @@ -47,13 +47,14 @@ multidict==6.1.0 # via yarl munch==4.0.0 # via pydala2 -numpy==2.1.2 +numpy==2.1.3 # via pandas pandas==2.2.3 # via pydala2 pendulum==3.0.0 # via pydala2 propcache==0.2.0 + # via aiohttp # via yarl psutil==6.1.0 # via pydala2 @@ -72,20 +73,20 @@ s3fs==2024.10.0 # via pydala2 six==1.16.0 # via python-dateutil -sqlglot==25.28.0 +sqlglot==25.31.4 # via pydala2 -sqlparse==0.5.1 +sqlparse==0.5.2 # via pydala2 time-machine==2.16.0 # via pendulum -tqdm==4.66.6 +tqdm==4.67.0 # via pydala2 tzdata==2024.2 # via 
pandas # via pendulum urllib3==2.2.3 # via botocore -wrapt==1.16.0 +wrapt==1.17.0 # via aiobotocore -yarl==1.17.0 +yarl==1.18.0 # via aiohttp
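
Note on the partitioning change in pydala/dataset.py above: when no partitioning is passed, the dataset now infers "hive" from the directory listing instead of from the first file, and the explicit "ignore" sentinel is mapped to None. An illustrative sketch of that rule follows; infer_partitioning and the example paths are hypothetical and not part of pydala's API.

def infer_partitioning(entries: list[str], partitioning: str | None) -> str | None:
    # Mirrors the logic added to BaseDataset.__init__: infer only when nothing
    # was passed, and translate the "ignore" sentinel to None.
    if partitioning is None:
        return "hive" if any("=" in entry for entry in entries) else None
    if partitioning == "ignore":
        return None
    return partitioning

# Example listings (hypothetical paths):
print(infer_partitioning(["data/year=2024", "data/year=2023"], None))  # -> "hive"
print(infer_partitioning(["data/part-0.parquet"], None))               # -> None
print(infer_partitioning(["data/year=2024"], "ignore"))                # -> None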
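
Note on the pydala/metadata.py hunk above: _gen_metadata_table now summarises row groups concurrently with a ThreadPoolExecutor instead of a sequential loop. A minimal, self-contained sketch of the same pattern against plain pyarrow metadata is shown below; summarize_row_groups and the demo file are assumptions for illustration, not pydala code.

import concurrent.futures
from collections import defaultdict

import pyarrow as pa
import pyarrow.parquet as pq


def summarize_row_groups(metadata: pq.FileMetaData) -> dict[str, list]:
    # Summarise each row group in a worker thread and merge the per-row-group
    # dicts into one column-oriented mapping, analogous to the diff's approach.
    def process(rg_num: int) -> dict:
        rg = metadata.row_group(rg_num)
        return {
            "row_group": rg_num,
            "num_rows": rg.num_rows,
            "total_byte_size": rg.total_byte_size,
            "compression": rg.column(0).compression,
        }

    table = defaultdict(list)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(process, i) for i in range(metadata.num_row_groups)]
        # Results arrive in completion order, so rows are unordered (as in the diff).
        for future in concurrent.futures.as_completed(futures):
            for key, value in future.result().items():
                table[key].append(value)
    return dict(table)


# Demo: write a tiny two-row-group parquet file and summarise it.
pq.write_table(pa.table({"x": list(range(10))}), "demo.parquet", row_group_size=5)
print(summarize_row_groups(pq.read_metadata("demo.parquet")))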