Skip to content

Commit

Permalink
Return data ids of test dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
JonathanFrey2003 committed Nov 21, 2024
1 parent dae30e7 commit afa79cb
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 13 deletions.
4 changes: 2 additions & 2 deletions notebooks/base.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": ".venv",
"language": "python",
"name": "python3"
},
Expand All @@ -31,7 +31,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
"version": "3.13.0"
}
},
"nbformat": 4,
Expand Down
35 changes: 25 additions & 10 deletions src/dataland_qa_lab/dataland/upload_test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,17 @@
from dataland_qa_lab.utils import config


def upload_test_data() -> None:
"""Function to upload 10 test cases for EU Taxonomy Nuclear and Gas to Dataland."""
pdf_path = Path("../data/pdfs/")
json_path = Path("../data/jsons/")
def upload_test_data(pdf_path: Path, json_path: Path) -> list[str]:
"""Upload 10 test cases.
Upload 10 test cases for EU Taxonomy Nuclear and Gas to Dataland.
:param pdf_path: absolute path to pdf files (required)
:type pdf_path: Path
:param json_path: absolute path to json files (required)
:type json_path: Path
:return: Returns a list containing the data ids of the test datasets.
"""
conf = config.get_config()
dataland_client = conf.dataland_client

Expand All @@ -31,11 +37,13 @@ def upload_test_data() -> None:
"eb119227edc8c66d672785619522cd6045b2faf37e63796207799c0e40fa66be",
"dba48e9f5e7e6fc9862dd95159960eb2a270d6975f2457f443ca422e7449e7d6",
]

new_data_ids = []

for company, pdf_id in zip(companies, pdfs, strict=False):
# upload the pdf file to Dataland if it is not already stored there
if not dataland_client.documents_api.get_document(document_id=pdf_id):
pdf_file_path = pdf_path / f"{company}.pdf"
pdf_content = pdf_file_path.read_bytes()
pdf_content = (pdf_path / f"{company}.pdf").read_bytes()

dataland_client.documents_api.post_document(document=pdf_content)

Expand All @@ -62,11 +70,18 @@ def upload_test_data() -> None:
json_str = json.dumps(json_data, indent=4)
json_file_path.write_text(json_str, encoding="utf-8")

# if needed upload document
if not dataland_client.eu_taxonomy_nuclear_and_gas_api.get_all_company_nuclear_and_gas_data(
# if needed upload dataset
old_dataset = dataland_client.eu_taxonomy_nuclear_and_gas_api.get_all_company_nuclear_and_gas_data(
company_id=company_id
):
)
if not old_dataset:
nuclear_and_gas_data = CompanyAssociatedDataNuclearAndGasData.from_json(json_str)
dataland_client.eu_taxonomy_nuclear_and_gas_api.post_company_associated_nuclear_and_gas_data(

new_dataset = dataland_client.eu_taxonomy_nuclear_and_gas_api.post_company_associated_nuclear_and_gas_data(
company_associated_data_nuclear_and_gas_data=nuclear_and_gas_data, bypass_qa=True
)
new_data_ids.append(new_dataset.data_id)
else:
new_data_ids.append(old_dataset[0].meta_info.data_id)

return new_data_ids
8 changes: 7 additions & 1 deletion tests/dataland/test_upload_test_data.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
from pathlib import Path

from src.dataland_qa_lab.dataland.upload_test_data import upload_test_data


def test_upload_test_data() -> None:
upload_test_data()
project_root = Path(__file__).resolve().parent.parent.parent
pdf_path = project_root / "data" / "pdfs"
json_path = project_root / "data" / "jsons"
test_data = upload_test_data(pdf_path=pdf_path, json_path=json_path)
assert len(test_data) == 10

0 comments on commit afa79cb

Please sign in to comment.