diff --git a/docs/build/api-reference/README.md b/docs/build/api-reference/README.md
index e88ceae216..af51fa4870 100644
--- a/docs/build/api-reference/README.md
+++ b/docs/build/api-reference/README.md
@@ -1531,7 +1531,7 @@ Create a copy of the package
 #### to\_zip

 ```python
- | to_zip(target, encoder_class=None)
+ | to_zip(target, *, resolve=[], encoder_class=None)
 ```

 Save package to a zip
@@ -1539,6 +1539,11 @@ Save package to a zip
 **Arguments**:

 - `target` _str_ - target path
+- `resolve` _str[]_ - Data sources to resolve.
+  For "inline" data it means saving them as CSV and including them in the ZIP.
+  For "remote" data it means downloading them and including them in the ZIP.
+  For example, `resolve=["inline", "remote"]`
+- `encoder_class` _object_ - json encoder class

 **Raises**:

@@ -6110,7 +6115,7 @@ Convert resource to File
 #### to\_zip

 ```python
- | to_zip(target, encoder_class=None)
+ | to_zip(target, *, resolve=[], encoder_class=None)
 ```

 Save resource to a zip
@@ -6118,6 +6123,11 @@ Save resource to a zip
 **Arguments**:

 - `target` _str_ - target path
+- `resolve` _str[]_ - Data sources to resolve.
+  For "inline" data it means saving them as CSV and including them in the ZIP.
+  For "remote" data it means downloading them and including them in the ZIP.
+  For example, `resolve=["inline", "remote"]`
+- `encoder_class` _object_ - json encoder class

 **Raises**:
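For readers of the API reference above, a minimal usage sketch. Only `to_zip` and its `resolve` argument come from this change; the inline rows and the example.com URL are hypothetical:

```python
from frictionless import Package, Resource

# A package mixing inline and remote data sources (hypothetical)
package = Package(
    resources=[
        Resource(name="cities", data=[["id", "name"], [1, "london"]]),
        Resource(name="table", path="https://example.com/data/table.csv"),
    ]
)

# With resolve=..., inline data is saved as CSV and remote data is
# downloaded, so the resulting ZIP is fully self-contained
package.to_zip("package.zip", resolve=["inline", "remote"])
```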
diff --git a/frictionless/package.py b/frictionless/package.py
index d53b9b3e1b..2b517f687e 100644
--- a/frictionless/package.py
+++ b/frictionless/package.py
@@ -2,6 +2,7 @@
 import json
 import glob
 import zipfile
+import tempfile
 from copy import deepcopy
 from .exception import FrictionlessException
 from .metadata import Metadata
@@ -382,33 +383,80 @@ def to_copy(self):
             trusted=self.__trusted,
         )

-    # NOTE: support multipart
-    def to_zip(self, target, encoder_class=None):
+    # TODO: support multipart
+    # TODO: there is 100% duplication with resource.to_zip
+    def to_zip(self, target, *, resolve=[], encoder_class=None):
         """Save package to a zip

         Parameters:
             target (str): target path
+            resolve (str[]): Data sources to resolve.
+                For "inline" data it means saving them as CSV and including them in the ZIP.
+                For "remote" data it means downloading them and including them in the ZIP.
+                For example, `resolve=["inline", "remote"]`
+            encoder_class (object): json encoder class

         Raises:
             FrictionlessException: on any error
         """
         try:
             with zipfile.ZipFile(target, "w") as zip:
-                descriptor = self.copy()
-                for resource in self.resources:
-                    if resource.inline:
-                        continue
-                    if resource.remote:
-                        continue
+                package_descriptor = self.to_dict()
+                for index, resource in enumerate(self.resources):
+                    descriptor = package_descriptor["resources"][index]
+
+                    # Multipart data
                     if resource.multipart:
-                        continue
-                    if not helpers.is_safe_path(resource.path):
-                        continue
-                    zip.write(resource.source, resource.path)
-                descriptor = json.dumps(
-                    descriptor, indent=2, ensure_ascii=False, cls=encoder_class
+                        note = "Zipping multipart resource is not yet supported"
+                        raise FrictionlessException(errors.ResourceError(note=note))
+
+                    # Inline data
+                    elif resource.inline:
+                        if "inline" in resolve:
+                            path = f"{resource.name}.csv"
+                            descriptor["path"] = path
+                            del descriptor["data"]
+                            with tempfile.NamedTemporaryFile() as file:
+                                resource.write(file.name, format="csv")
+                                zip.write(file.name, path)
+                        elif not isinstance(resource.data, list):
+                            note = f"Use resolve argument to zip {resource.data}"
+                            raise FrictionlessException(errors.ResourceError(note=note))
+
+                    # Remote data
+                    elif resource.remote:
+                        if "remote" in resolve:
+                            path = f"{resource.name}.{resource.format}"
+                            descriptor["path"] = path
+                            with tempfile.NamedTemporaryFile() as file:
+                                byte_stream = resource.read_byte_stream()
+                                while True:
+                                    chunk = byte_stream.read(1024)
+                                    if not chunk:
+                                        break
+                                    file.write(chunk)
+                                file.flush()
+                                zip.write(file.name, path)
+
+                    # Local data
+                    else:
+                        path = resource.path
+                        if not helpers.is_safe_path(path):
+                            path = f"{resource.name}.{resource.format}"
+                            descriptor["path"] = path
+                        zip.write(resource.source, path)
+
+                # Metadata
+                zip.writestr(
+                    "datapackage.json",
+                    json.dumps(
+                        package_descriptor,
+                        indent=2,
+                        ensure_ascii=False,
+                        cls=encoder_class,
+                    ),
                 )
-                zip.writestr("datapackage.json", descriptor)
+
         except Exception as exception:
             error = errors.PackageError(note=str(exception))
             raise FrictionlessException(error) from exception
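The remote branch above buffers the download through a named temporary file before handing it to `zipfile`. A distilled, standalone sketch of that pattern; the helper name `add_stream_to_zip` is hypothetical, and note that reopening a `NamedTemporaryFile` by name while it is still open works on POSIX but not on Windows:

```python
import tempfile
import zipfile


def add_stream_to_zip(zip_file, byte_stream, arcname, chunk_size=1024):
    # Buffer the stream into a named temporary file, then let zipfile
    # copy it into the archive under `arcname`
    with tempfile.NamedTemporaryFile() as file:
        while True:
            chunk = byte_stream.read(chunk_size)
            if not chunk:
                break
            file.write(chunk)
        file.flush()
        zip_file.write(file.name, arcname)


# Usage sketch with a local file standing in for a remote byte stream
with zipfile.ZipFile("target.zip", "w") as zip:
    add_stream_to_zip(zip, open("data/table.csv", "rb"), "table.csv")
```

Buffering to disk keeps memory usage flat for large remote files, at the cost of writing a second copy before it lands in the archive.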
diff --git a/frictionless/resource.py b/frictionless/resource.py
index e14e435b20..31940182db 100644
--- a/frictionless/resource.py
+++ b/frictionless/resource.py
@@ -3,6 +3,7 @@
 import json
 import petl
 import zipfile
+import tempfile
 import warnings
 from copy import deepcopy
 from importlib import import_module
@@ -791,33 +792,79 @@ def to_file(self, **options):
         options.setdefault("control", self.control)
         return module.File(**options)

-    # NOTE: support multipart
-    def to_zip(self, target, encoder_class=None):
+    # TODO: support multipart
+    # TODO: there is 100% duplication with package.to_zip
+    def to_zip(self, target, *, resolve=[], encoder_class=None):
         """Save resource to a zip

         Parameters:
             target (str): target path
+            resolve (str[]): Data sources to resolve.
+                For "inline" data it means saving them as CSV and including them in the ZIP.
+                For "remote" data it means downloading them and including them in the ZIP.
+                For example, `resolve=["inline", "remote"]`
+            encoder_class (object): json encoder class

         Raises:
             FrictionlessException: on any error
         """
         try:
             with zipfile.ZipFile(target, "w") as zip:
-                descriptor = self.copy()
                 for resource in [self]:
-                    if resource.inline:
-                        continue
-                    if resource.remote:
-                        continue
+                    descriptor = self.to_dict()
+
+                    # Multipart data
                     if resource.multipart:
-                        continue
-                    if not helpers.is_safe_path(resource.path):
-                        continue
-                    zip.write(resource.source, resource.path)
-                descriptor = json.dumps(
-                    descriptor, indent=2, ensure_ascii=False, cls=encoder_class
-                )
-                zip.writestr("dataresource.json", descriptor)
+                        note = "Zipping multipart resource is not yet supported"
+                        raise FrictionlessException(errors.ResourceError(note=note))
+
+                    # Inline data
+                    elif resource.inline:
+                        if "inline" in resolve:
+                            path = f"{resource.name}.csv"
+                            descriptor["path"] = path
+                            del descriptor["data"]
+                            with tempfile.NamedTemporaryFile() as file:
+                                resource.write(file.name, format="csv")
+                                zip.write(file.name, path)
+                        elif not isinstance(resource.data, list):
+                            note = f"Use resolve argument to zip {resource.data}"
+                            raise FrictionlessException(errors.ResourceError(note=note))
+
+                    # Remote data
+                    elif resource.remote:
+                        if "remote" in resolve:
+                            path = f"{resource.name}.{resource.format}"
+                            descriptor["path"] = path
+                            with tempfile.NamedTemporaryFile() as file:
+                                byte_stream = resource.read_byte_stream()
+                                while True:
+                                    chunk = byte_stream.read(1024)
+                                    if not chunk:
+                                        break
+                                    file.write(chunk)
+                                file.flush()
+                                zip.write(file.name, path)
+
+                    # Local data
+                    else:
+                        path = resource.path
+                        if not helpers.is_safe_path(path):
+                            path = f"{resource.name}.{resource.format}"
+                            descriptor["path"] = path
+                        zip.write(resource.source, path)
+
+                    # Metadata
+                    zip.writestr(
+                        "dataresource.json",
+                        json.dumps(
+                            descriptor,
+                            indent=2,
+                            ensure_ascii=False,
+                            cls=encoder_class,
+                        ),
+                    )
+
         except (IOError, zipfile.BadZipfile, zipfile.LargeZipFile) as exception:
             error = errors.ResourceError(note=str(exception))
             raise FrictionlessException(error) from exception
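As exercised by the new tests further down, `to_zip(..., resolve=["inline"])` swaps the descriptor's `data` for a `path` pointing at a bundled CSV. A round-trip sketch based on those tests (target path hypothetical):

```python
from frictionless import Resource

# Inline data is materialized as "<name>.csv" inside the archive
resource = Resource(name="table", data=[["id", "name"], [1, "english"], [2, "中国人"]])
resource.to_zip("resource.zip", resolve=["inline"])

# Reading the archive back picks up "dataresource.json" plus the bundled CSV
resource = Resource("resource.zip")
print(resource.path)         # table.csv
print(resource.read_rows())  # [{'id': 1, 'name': 'english'}, {'id': 2, 'name': '中国人'}]
```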
diff --git a/tests/conftest.py b/tests/conftest.py
index 57e21f5282..bd285a2140 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -12,18 +12,24 @@
 # Fixtures


-# TODO: create the same for other databases
 @pytest.fixture
 def database_url(tmpdir):
     path = str(tmpdir.join("database.db"))
     conn = sqlite3.connect(path)
-    conn.execute("CREATE TABLE data (id INTEGER PRIMARY KEY, name TEXT)")
-    conn.execute("INSERT INTO data VALUES (1, 'english'), (2, '中国人')")
+    conn.execute("CREATE TABLE 'table' (id INTEGER PRIMARY KEY, name TEXT)")
+    conn.execute("INSERT INTO 'table' VALUES (1, 'english'), (2, '中国人')")
     conn.commit()
     yield "sqlite:///%s" % path
     conn.close()


+# TODO: create the same for other databases
+@pytest.fixture
+def sqlite_url(tmpdir):
+    path = str(tmpdir.join("database.db"))
+    return "sqlite:///%s" % path
+
+
 # Settings
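The fixture split matters: `database_url` yields a database pre-seeded with a quoted `'table'` table for the Table tests, while `sqlite_url` yields a URL to an empty database so the storage tests can create their own tables. A hypothetical test illustrating the `database_url` contract:

```python
import sqlite3


# Hypothetical check of the fixture contract: database_url is pre-seeded,
# so reading the quoted 'table' table returns the two fixture rows
def test_database_url_is_seeded(database_url):
    path = database_url.replace("sqlite:///", "")
    conn = sqlite3.connect(path)
    rows = conn.execute("SELECT id, name FROM 'table' ORDER BY id").fetchall()
    conn.close()
    assert rows == [(1, "english"), (2, "中国人")]
```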
diff --git a/tests/plugins/test_sql.py b/tests/plugins/test_sql.py
index 1267d7f3d0..fbec6bc844 100644
--- a/tests/plugins/test_sql.py
+++ b/tests/plugins/test_sql.py
@@ -13,7 +13,7 @@


 def test_table_sql(database_url):
-    dialect = SqlDialect(table="data")
+    dialect = SqlDialect(table="table")
     with Table(database_url, dialect=dialect) as table:
         assert table.schema == {
             "fields": [
@@ -27,14 +27,14 @@ def test_table_sql(database_url):


 def test_table_sql_order_by(database_url):
-    dialect = SqlDialect(table="data", order_by="id")
+    dialect = SqlDialect(table="table", order_by="id")
     with Table(database_url, dialect=dialect) as table:
         assert table.header == ["id", "name"]
         assert table.read_data() == [[1, "english"], [2, "中国人"]]


 def test_table_sql_order_by_desc(database_url):
-    dialect = SqlDialect(table="data", order_by="id desc")
+    dialect = SqlDialect(table="table", order_by="id desc")
     with Table(database_url, dialect=dialect) as table:
         assert table.header == ["id", "name"]
         assert table.read_data() == [[2, "中国人"], [1, "english"]]
@@ -50,7 +50,7 @@ def test_table_sql_table_is_required_error(database_url):


 def test_table_sql_headers_false(database_url):
-    dialect = SqlDialect(table="data")
+    dialect = SqlDialect(table="table")
     with Table(database_url, dialect=dialect, headers=False) as table:
         assert table.header == []
         assert table.read_data() == [["id", "name"], [1, "english"], [2, "中国人"]]
@@ -69,8 +69,8 @@ def test_table_sql_write(database_url):
 # Storage


-def test_storage_types(database_url):
-    engine = sa.create_engine(database_url)
+def test_storage_types(sqlite_url):
+    engine = sa.create_engine(sqlite_url)
     prefix = "prefix_"

     # Export/Import
@@ -126,8 +126,8 @@ def test_storage_types(database_url):
     storage.delete_package(target.resource_names)


-def test_storage_integrity(database_url):
-    engine = sa.create_engine(database_url)
+def test_storage_integrity(sqlite_url):
+    engine = sa.create_engine(sqlite_url)
     prefix = "prefix_"

     # Export/Import
@@ -184,8 +184,8 @@ def test_storage_integrity(database_url):
     storage.delete_package(target.resource_names)


-def test_storage_constraints(database_url):
-    engine = sa.create_engine(database_url)
+def test_storage_constraints(sqlite_url):
+    engine = sa.create_engine(sqlite_url)
     prefix = "prefix_"

     # Export/Import
@@ -235,8 +235,8 @@ def test_storage_constraints(database_url):
         ("maximum", 9),
     ],
 )
-def test_storage_constraints_not_valid_error(database_url, field_name, cell):
-    engine = sa.create_engine(database_url)
+def test_storage_constraints_not_valid_error(sqlite_url, field_name, cell):
+    engine = sa.create_engine(sqlite_url)
     package = Package("data/storage/constraints.json")
     resource = package.get_resource("constraints")
     # We set an invalid cell to the data property
@@ -248,8 +248,8 @@ def test_storage_constraints_not_valid_error(database_url, field_name, cell):
         resource.to_sql(engine=engine, force=True)


-def test_storage_read_resource_not_existent_error(database_url):
-    engine = sa.create_engine(database_url)
+def test_storage_read_resource_not_existent_error(sqlite_url):
+    engine = sa.create_engine(sqlite_url)
     storage = SqlStorage(engine=engine)
     with pytest.raises(FrictionlessException) as excinfo:
         storage.read_resource("bad")
@@ -258,8 +258,8 @@ def test_storage_read_resource_not_existent_error(database_url):
     assert error.note.count("does not exist")


-def test_storage_write_resource_existent_error(database_url):
-    engine = sa.create_engine(database_url)
+def test_storage_write_resource_existent_error(sqlite_url):
+    engine = sa.create_engine(sqlite_url)
     resource = Resource(path="data/table.csv")
     storage = resource.to_sql(engine=engine)
     with pytest.raises(FrictionlessException) as excinfo:
@@ -271,8 +271,8 @@ def test_storage_write_resource_existent_error(database_url):
     storage.delete_package(list(storage))


-def test_storage_delete_resource_not_existent_error(database_url):
-    engine = sa.create_engine(database_url)
+def test_storage_delete_resource_not_existent_error(sqlite_url):
+    engine = sa.create_engine(sqlite_url)
     storage = SqlStorage(engine=engine)
     with pytest.raises(FrictionlessException) as excinfo:
         storage.delete_resource("bad")
@@ -281,11 +281,13 @@ def test_storage_delete_resource_not_existent_error(database_url):
     assert error.note.count("does not exist")


-def test_storage_views_support(database_url):
-    engine = sa.create_engine(database_url)
-    engine.execute("CREATE VIEW data_view AS SELECT * FROM data")
+def test_storage_views_support(sqlite_url):
+    engine = sa.create_engine(sqlite_url)
+    engine.execute("CREATE TABLE 'table' (id INTEGER PRIMARY KEY, name TEXT)")
+    engine.execute("INSERT INTO 'table' VALUES (1, 'english'), (2, '中国人')")
+    engine.execute("CREATE VIEW 'table_view' AS SELECT * FROM 'table'")
     storage = SqlStorage(engine=engine)
-    resource = storage.read_resource("data_view")
+    resource = storage.read_resource("table_view")
     assert resource.schema == {
         "fields": [
             {"name": "id", "type": "integer"},
@@ -298,10 +300,10 @@ def test_storage_views_support(database_url):
     ]


-def test_storage_resource_url_argument(database_url):
+def test_storage_resource_url_argument(sqlite_url):
     source = Resource(path="data/table.csv")
-    source.to_sql(url=database_url)
-    target = Resource.from_sql(name="table", url=database_url)
+    source.to_sql(url=sqlite_url)
+    target = Resource.from_sql(name="table", url=sqlite_url)
     assert target.schema == {
         "fields": [
             {"name": "id", "type": "integer"},
@@ -314,10 +316,10 @@ def test_storage_resource_url_argument(database_url):
     ]


-def test_storage_package_url_argument(database_url):
+def test_storage_package_url_argument(sqlite_url):
     source = Package(resources=[Resource(path="data/table.csv")])
-    source.to_sql(url=database_url)
-    target = Package.from_sql(url=database_url)
+    source.to_sql(url=sqlite_url)
+    target = Package.from_sql(url=sqlite_url)
     assert target.get_resource("table").schema == {
         "fields": [
             {"name": "id", "type": "integer"},
@@ -453,7 +455,7 @@ def test_postgresql_storage_integrity():
 # TODO: recover enum support
 @pytest.mark.ci
 @pytest.mark.skip
-def test_postgresql_storage_constraints(database_url):
+def test_postgresql_storage_constraints(sqlite_url):
     engine = sa.create_engine(os.environ["POSTGRESQL_URL"])
     prefix = "prefix_"

@@ -505,7 +507,7 @@ def test_postgresql_storage_constraints(database_url):
         ("maximum", 9),
     ],
 )
-def test_postgresql_storage_constraints_not_valid_error(database_url, field_name, cell):
+def test_postgresql_storage_constraints_not_valid_error(sqlite_url, field_name, cell):
     engine = sa.create_engine(os.environ["POSTGRESQL_URL"])
     package = Package("data/storage/constraints.json")
     resource = package.get_resource("constraints")
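The renamed fixture table deliberately uses the SQL reserved word `table`, which is what forces the quoting exercised throughout these tests. Outside the suite the same read looks like this; a sketch assuming a local `database.db` seeded the same way as the fixture:

```python
import sqlite3

from frictionless import Table
from frictionless.plugins.sql import SqlDialect

# Seed a throwaway database the same way the fixture does (hypothetical file)
conn = sqlite3.connect("database.db")
conn.execute("CREATE TABLE 'table' (id INTEGER PRIMARY KEY, name TEXT)")
conn.execute("INSERT INTO 'table' VALUES (1, 'english'), (2, '中国人')")
conn.commit()
conn.close()

# The dialect names the reserved-word table explicitly
dialect = SqlDialect(table="table", order_by="id")
with Table("sqlite:///database.db", dialect=dialect) as table:
    print(table.header)       # ['id', 'name']
    print(table.read_data())  # [[1, 'english'], [2, '中国人']]
```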
diff --git a/tests/test_package.py b/tests/test_package.py
index a5316887e3..ad844aea1c 100644
--- a/tests/test_package.py
+++ b/tests/test_package.py
@@ -486,16 +486,125 @@ def test_package_to_zip(tmpdir):

     # Read
     package = Package(target)
-    assert package == {
-        "name": "name",
-        "resources": [{"name": "name", "path": "table.csv"}],
-    }
+    assert package.name == "name"
+    assert package.get_resource("name").name == "name"
+    assert package.get_resource("name").path == "table.csv"
     assert package.get_resource("name").read_rows() == [
         {"id": 1, "name": "english"},
         {"id": 2, "name": "中国人"},
     ]


+def test_package_to_zip_withdir_path(tmpdir):
+
+    # Write
+    target = os.path.join(tmpdir, "package.zip")
+    resource = Resource(path="data/table.csv")
+    package = Package(resources=[resource])
+    package.to_zip(target)
+
+    # Read
+    package = Package(target)
+    assert package.get_resource("table").path == "data/table.csv"
+    assert package.get_resource("table").read_rows() == [
+        {"id": 1, "name": "english"},
+        {"id": 2, "name": "中国人"},
+    ]
+
+
+def test_package_to_zip_absolute_path(tmpdir):
+
+    # Write
+    target = os.path.join(tmpdir, "package.zip")
+    resource = Resource(path=os.path.abspath("data/table.csv"), trusted=True)
+    package = Package(resources=[resource], trusted=True)
+    package.to_zip(target)
+
+    # Read
+    package = Package(target)
+    assert package.get_resource("table").path == "table.csv"
+    assert package.get_resource("table").read_rows() == [
+        {"id": 1, "name": "english"},
+        {"id": 2, "name": "中国人"},
+    ]
+
+
+def test_package_to_zip_resolve_inline(tmpdir):
+
+    # Write
+    target = os.path.join(tmpdir, "package.zip")
+    resource = Resource(name="table", data=[["id", "name"], [1, "english"], [2, "中国人"]])
+    package = Package(resources=[resource])
+    package.to_zip(target, resolve=["inline"])
+
+    # Read
+    package = Package(target)
+    assert package.get_resource("table").path == "table.csv"
+    assert package.get_resource("table").read_rows() == [
+        {"id": 1, "name": "english"},
+        {"id": 2, "name": "中国人"},
+    ]
+
+
+def test_package_to_zip_resolve_inline_sql(tmpdir, database_url):
+
+    # Write
+    target = os.path.join(tmpdir, "package.zip")
+    resource = Resource.from_sql(name="table", url=database_url)
+    package = Package(resources=[resource])
+    package.to_zip(target, resolve=["inline"])
+
+    # Read
+    package = Package(target)
+    assert package.get_resource("table").path == "table.csv"
+    assert package.get_resource("table").read_rows() == [
+        {"id": 1, "name": "english"},
+        {"id": 2, "name": "中国人"},
+    ]
+
+
+@pytest.mark.ci
+def test_package_to_zip_resolve_remote(tmpdir):
+
+    # Write
+    target = os.path.join(tmpdir, "package.zip")
+    resource = Resource(path=BASE_URL % "data/table.csv")
+    package = Package(resources=[resource])
+    package.to_zip(target, resolve=["remote"])
+
+    # Read
+    package = Package(target)
+    assert package.get_resource("table").path == "table.csv"
+    assert package.get_resource("table").read_rows() == [
+        {"id": 1, "name": "english"},
+        {"id": 2, "name": "中国人"},
+    ]
+
+
+@pytest.mark.ci
+def test_package_to_zip_resolve_inline_and_remote(tmpdir):
+
+    # Write
+    target = os.path.join(tmpdir, "package.zip")
+    resource1 = Resource(name="name1", data=[["id", "name"], [1, "english"], [2, "中国人"]])
+    resource2 = Resource(name="name2", path=BASE_URL % "data/table.csv")
+    package = Package(resources=[resource1, resource2])
+    package.to_zip(target, resolve=["inline", "remote"])
+
+    # Read
+    package = Package(target)
+    assert package.get_resource("name1").path == "name1.csv"
+    assert package.get_resource("name1").read_rows() == [
+        {"id": 1, "name": "english"},
+        {"id": 2, "name": "中国人"},
+    ]
+    assert package.get_resource("name2").path == "name2.csv"
+    assert package.get_resource("name2").read_rows() == [
+        {"id": 1, "name": "english"},
+        {"id": 2, "name": "中国人"},
+    ]
+
+
 @pytest.mark.ci
 def test_package_to_zip_source_remote(tmpdir):
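The absolute-path tests above pin down the unsafe-path fallback in `to_zip`: a path that fails `helpers.is_safe_path` is stored inside the archive as `<name>.<format>` and the descriptor is rewritten to match. A sketch of the observable behavior (file layout hypothetical):

```python
import os

from frictionless import Package, Resource

# Absolute paths are not safe inside a ZIP, so to_zip falls back to
# storing the file as "<name>.<format>" and rewriting the descriptor
resource = Resource(path=os.path.abspath("data/table.csv"), trusted=True)
package = Package(resources=[resource], trusted=True)
package.to_zip("package.zip")

print(Package("package.zip").get_resource("table").path)  # table.csv
```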
diff --git a/tests/test_resource.py b/tests/test_resource.py
index 6cbed497b7..fcaf6e3c55 100644
--- a/tests/test_resource.py
+++ b/tests/test_resource.py
@@ -662,6 +662,90 @@ def test_resource_to_zip(tmpdir):
     ]


+def test_resource_to_zip_withdir_path(tmpdir):
+
+    # Write
+    target = os.path.join(tmpdir, "resource.zip")
+    resource = Resource(path="data/table.csv")
+    resource.to_zip(target)
+
+    # Read
+    resource = Resource(target)
+    assert resource == {"path": "data/table.csv"}
+    assert resource.read_rows() == [
+        {"id": 1, "name": "english"},
+        {"id": 2, "name": "中国人"},
+    ]
+
+
+def test_resource_to_zip_absolute_path(tmpdir):
+
+    # Write
+    target = os.path.join(tmpdir, "resource.zip")
+    resource = Resource(path=os.path.abspath("data/table.csv"), trusted=True)
+    resource.to_zip(target)
+
+    # Read
+    resource = Resource(target)
+    assert resource == {"path": "table.csv"}
+    assert resource.read_rows() == [
+        {"id": 1, "name": "english"},
+        {"id": 2, "name": "中国人"},
+    ]
+
+
+def test_resource_to_zip_resolve_inline(tmpdir):
+
+    # Write
+    target = os.path.join(tmpdir, "resource.zip")
+    resource = Resource(name="table", data=[["id", "name"], [1, "english"], [2, "中国人"]])
+    resource.to_zip(target, resolve=["inline"])
+
+    # Read
+    resource = Resource(target)
+    assert resource.name == "table"
+    assert resource.path == "table.csv"
+    assert resource.read_rows() == [
+        {"id": 1, "name": "english"},
+        {"id": 2, "name": "中国人"},
+    ]
+
+
+def test_resource_to_zip_resolve_inline_sql(tmpdir, database_url):
+
+    # Write
+    target = os.path.join(tmpdir, "resource.zip")
+    resource = Resource.from_sql(name="table", url=database_url)
+    resource.to_zip(target, resolve=["inline"])
+
+    # Read
+    resource = Resource(target)
+    assert resource.name == "table"
+    assert resource.path == "table.csv"
+    assert resource.read_rows() == [
+        {"id": 1, "name": "english"},
+        {"id": 2, "name": "中国人"},
+    ]
+
+
+@pytest.mark.ci
+def test_resource_to_zip_resolve_remote(tmpdir):
+
+    # Write
+    target = os.path.join(tmpdir, "resource.zip")
+    resource = Resource(path=BASE_URL % "data/table.csv")
+    resource.to_zip(target, resolve=["remote"])
+
+    # Read
+    resource = Resource(target)
+    assert resource.name == "table"
+    assert resource.path == "table.csv"
+    assert resource.read_rows() == [
+        {"id": 1, "name": "english"},
+        {"id": 2, "name": "中国人"},
+    ]
+
+
 @pytest.mark.ci
 def test_resource_to_zip_source_remote(tmpdir):