From 635f14f1995758de386cf2ba284c47a05c7fc5a1 Mon Sep 17 00:00:00 2001 From: NicolasH2604 Date: Thu, 21 Nov 2024 11:48:06 +0100 Subject: [PATCH] DF-24 notebook dataland api (#7) * DF-24 * DF-24 * DF-24 * DF-24 * DF-24 * DF-24 * DF-24 * DF-24 * DF-24 * DF-24 * DF-24 * Edit Dummy Test * Update Tests * Update 2 Tests * Removed Comment --- notebooks/DF-24_Notebook-API.ipynb | 208 ++++++++++++++++++ .../dataland/dataland_client.py | 5 + src/dataland_qa_lab/dataland/get_data.py | 49 +++++ tests/dataland/test_dataland_client.py | 11 + 4 files changed, 273 insertions(+) create mode 100644 notebooks/DF-24_Notebook-API.ipynb create mode 100644 src/dataland_qa_lab/dataland/get_data.py diff --git a/notebooks/DF-24_Notebook-API.ipynb b/notebooks/DF-24_Notebook-API.ipynb new file mode 100644 index 0000000..7db54d6 --- /dev/null +++ b/notebooks/DF-24_Notebook-API.ipynb @@ -0,0 +1,208 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "ac4a575e", + "metadata": {}, + "outputs": [], + "source": [ + "%run base.ipynb" + ] + }, + { + "cell_type": "markdown", + "id": "ec394145", + "metadata": {}, + "source": [ + "# EPIC 1 - User Story DF-24\n", + "## Erstellung eines Notebooks und aufrufen der Daten per Dataland-API\n", + "\n", + "Als QA-Lab-Team möchten wir ein Notebook erstellen, mit dem wir eine Anfrage an die Dataland-API senden, sodass wir anhand der Berichts-ID den Unternehmensbericht erhalten.\n", + "\n", + "Beschreibung:
\n", + "Nutzung des Dataland-Endpoints /data/nuclear-and-gas/companies/{companyId} im Nuclear & Gas Controller. Die Funktionalität wird außerhalb des Notebooks implementiert.\n", + "\n", + "Akzeptanzkriterien:
\n", + " - Notebook ist auf jedem Laptop ausführbar
\n", + " - Anfrage an Dataland gibt Daten zurück" + ] + }, + { + "cell_type": "markdown", + "id": "6b79b78e", + "metadata": {}, + "source": [ + "## 1. Schritt: Dataset mit Hilfe der Company_ID über die Dataland_API aufrufen\n", + "Mithilfe der Company_ID wird die Dataland_API aufgerufen. Nach dem Eintragen, aus welcher Periode das Dataset sein soll, ist es möglich, den Wert 1 zu erlangen." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf87363e", + "metadata": {}, + "outputs": [], + "source": [ + "import dataland_qa_lab.dataland.get_data as qa\n", + "\n", + "company_id = \"4423c691-0436-423f-abcb-0a08127ee848\"\n", + "year = \"2024\"\n", + "\n", + "# Laden aller Datasets aus jeder Periode\n", + "all_datasets = qa.get_all_company_datasets(company_id=company_id)\n", + "\n", + "# Data_ID der gewünschten Periode des Datensets erhalten\n", + "data_id = qa.get_data_id_by_year(company_id=company_id, year=year)\n", + "print(\"Data_ID = \" + data_id)\n", + "\n", + "# Laden eines bestimmten Datensets einer bestimmten Periode\n", + "dataset_by_year = qa.get_dataset_by_year(company_id=company_id, year=year)\n", + "\n", + "# Rückgabe des ersten Wertes in dem bestimmten Datenset\n", + "value1 = qa.get_value1_by_year(company_id=company_id, year=year)\n", + "print(\"Value 1 = \" + value1)\n", + "\n", + "# Erhalten der Datenreferenz in Bytes, wenn eine Datenquelle hinterlegt ist\n", + "datasource_reference_bytes = qa.get_datasource_reference_bytes(company_id=company_id, year=year)" + ] + }, + { + "cell_type": "markdown", + "id": "c42cdbaa", + "metadata": {}, + "source": [ + "## Laden aller 6 JA/NEIN Fragen\n", + "Hier werden die Daten von Dataland geladen und alle 6 Werte gesondert gespeichert." 
+ ] + }, + { + "cell_type": "markdown", + "id": "a4031931", + "metadata": {}, + "source": [ + "from dataland_qa_lab.utils import config\n", + "\n", + "conf = config.get_config()\n", + "dataland_client = conf.dataland_client\n", + "\n", + "api = dataland_client.eu_taxonomy_nuclear_and_gas_api\n", + "dataset = api.get_all_company_nuclear_and_gas_data(company_id=company_id)\n", + "\n", + "# Eintragen aus welcher Periode man das Dataset haben will -> In diesem Fall 2024\n", + "data_id = \"test\"\n", + "for t in range(len(dataset)):\n", + " if (dataset[t].meta_info.reporting_period == year):\n", + " data_id = dataset[t].meta_info.data_id\n", + " break\n", + "\n", + "data = dataland_client.eu_taxonomy_nuclear_and_gas_api.get_company_associated_nuclear_and_gas_data(data_id=data_id)\n", + "\n", + "wert1 = data.data.general.general.nuclear_energy_related_activities_section426\n", + "wert2 = data.data.general.general.nuclear_energy_related_activities_section427\n", + "wert3 = data.data.general.general.nuclear_energy_related_activities_section428\n", + "wert4 = data.data.general.general.fossil_gas_related_activities_section429\n", + "wert5 = data.data.general.general.fossil_gas_related_activities_section430\n", + "wert6 = data.data.general.general.fossil_gas_related_activities_section431\n", + "print(data)\n", + "print(wert1)\n", + "print(wert2)\n", + "print(wert3)\n", + "print(wert4)\n", + "print(wert5)\n", + "print(wert6)" + ] + }, + { + "cell_type": "markdown", + "id": "13580b00", + "metadata": {}, + "source": [ + "## Daten aus dem Dokument extrahieren und in Text umwandeln - DEMO" + ] + }, + { + "cell_type": "markdown", + "id": "a9846d68", + "metadata": {}, + "source": [ + "import io\n", + "\n", + "import pypdf\n", + "\n", + "full_document_byte_stream = io.BytesIO(document_bytes)\n", + "full_pdf = pypdf.PdfReader(full_document_byte_stream)\n", + "\n", + "partial_document_byte_stream = io.BytesIO()\n", + "partial_pdf = pypdf.PdfWriter()\n", + "\n", + 
"partial_pdf.add_page(full_pdf.get_page(int(wert1.data_source.page) - 1)) # Correct for 0 offset\n", + "partial_pdf.write(partial_document_byte_stream)\n", + "partial_document_byte_stream.seek(0)\n", + "None" + ] + }, + { + "cell_type": "markdown", + "id": "2da4ca74", + "metadata": {}, + "source": [ + "from azure.ai.documentintelligence import DocumentIntelligenceClient\n", + "from azure.ai.documentintelligence.models import AnalyzeResult, ContentFormat\n", + "from azure.core.credentials import AzureKeyCredential\n", + "\n", + "docintel_cred = AzureKeyCredential(conf.azure_docintel_api_key)\n", + "document_intelligence_client = DocumentIntelligenceClient(\n", + " endpoint=conf.azure_docintel_endpoint, credential=docintel_cred\n", + ")\n", + "\n", + "poller = document_intelligence_client.begin_analyze_document(\n", + " \"prebuilt-layout\",\n", + " analyze_request=partial_document_byte_stream,\n", + " content_type=\"application/octet-stream\",\n", + " output_content_format=ContentFormat.MARKDOWN,\n", + ")\n", + "result: AnalyzeResult = poller.result()" + ] + }, + { + "cell_type": "markdown", + "id": "4d28e7c3", + "metadata": {}, + "source": [ + "#### Das Ergebnis wird als Markdown direkt im Notebook wiedergegeben" + ] + }, + { + "cell_type": "markdown", + "id": "f3ba48bf", + "metadata": {}, + "source": [ + "from IPython.display import Markdown, display\n", + "\n", + "display(Markdown(result.content))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/dataland_qa_lab/dataland/dataland_client.py b/src/dataland_qa_lab/dataland/dataland_client.py index 7454cef..be8a4e9 100644 --- 
a/src/dataland_qa_lab/dataland/dataland_client.py
+++ b/src/dataland_qa_lab/dataland/dataland_client.py
@@ -44,6 +44,11 @@ def eu_taxonomy_nf_api(self) -> dataland_backend.EutaxonomyNonFinancialsDataCont
         """Function to run the eu-taxonomy-non-financials-data-controller API."""
         return dataland_backend.EutaxonomyNonFinancialsDataControllerApi(self.backend_client)
 
+    @property
+    def eu_taxonomy_nuclear_and_gas_api(self) -> dataland_backend.NuclearAndGasDataControllerApi:
+        """Function to run the eu-taxonomy-nuclear-and-gas-data-controller API."""
+        return dataland_backend.NuclearAndGasDataControllerApi(self.backend_client)
+
     @property
     def documents_client(self) -> dataland_documents.ApiClient:
         """Retrieves the client for accessing the documents API."""
diff --git a/src/dataland_qa_lab/dataland/get_data.py b/src/dataland_qa_lab/dataland/get_data.py
new file mode 100644
index 0000000..201ac11
--- /dev/null
+++ b/src/dataland_qa_lab/dataland/get_data.py
@@ -0,0 +1,49 @@
+from dataland_backend.models.company_associated_data_nuclear_and_gas_data import CompanyAssociatedDataNuclearAndGasData
+
+from dataland_qa_lab.utils import config
+
+conf = config.get_config()
+dataland_client = conf.dataland_client
+
+
+def get_all_company_datasets(company_id: str) -> list:
+    """Load the company's nuclear-and-gas datasets of all reporting periods."""
+    api = dataland_client.eu_taxonomy_nuclear_and_gas_api
+    return api.get_all_company_nuclear_and_gas_data(company_id=company_id)
+
+
+def get_data_id_by_year(company_id: str, year: str) -> str:
+    """Return the data ID of the company's dataset for reporting period `year`.
+
+    Falls back to the placeholder ID "test" when no dataset matches `year`.
+    """
+    for entry in get_all_company_datasets(company_id=company_id):
+        if entry.meta_info.reporting_period == year:
+            return entry.meta_info.data_id
+    return "test"
+
+
+def get_dataset_by_year(company_id: str, year: str) -> CompanyAssociatedDataNuclearAndGasData:
+    """Load the company's dataset for reporting period `year`."""
+    data_id = get_data_id_by_year(company_id=company_id, year=year)
+    api = dataland_client.eu_taxonomy_nuclear_and_gas_api
+    return api.get_company_associated_nuclear_and_gas_data(data_id=data_id)
+
+
+def get_value1_by_year(company_id: str, year: str) -> str:
+    """Return the value of the `nuclear_energy_related_activities_section426` data point."""
+    data = get_dataset_by_year(company_id=company_id, year=year)
+    return data.data.general.general.nuclear_energy_related_activities_section426.value.value
+
+
+def get_datasource_reference_bytes(company_id: str, year: str) -> bytes | bytearray | None:
+    """Download the document referenced as data source of the first data point.
+
+    Returns None when no data source is attached to that data point.
+    """
+    data = get_dataset_by_year(company_id=company_id, year=year)
+    value1 = data.data.general.general.nuclear_energy_related_activities_section426
+    data_source = value1.data_source if value1 is not None else None
+    if data_source is None:
+        return None
+    return dataland_client.documents_api.get_document(data_source.file_reference)
diff --git a/tests/dataland/test_dataland_client.py b/tests/dataland/test_dataland_client.py
index 7321b7b..c3b3896 100644
--- a/tests/dataland/test_dataland_client.py
+++ b/tests/dataland/test_dataland_client.py
@@ -1,7 +1,18 @@
+import dataland_qa_lab.dataland.get_data as qa
 from dataland_qa_lab.utils import config
 
 
 def test_dataland_connectivity() -> None:
     client = config.get_config().dataland_client
     resolved_companies = client.company_api.get_companies(chunk_size=1)
+
+    company_id = "4423c691-0436-423f-abcb-0a08127ee848"
+    year = "2024"
+    qa.get_all_company_datasets(company_id=company_id)
+    qa.get_data_id_by_year(company_id=company_id, year=year)
+    qa.get_dataset_by_year(company_id=company_id, year=year)
+    qa.get_value1_by_year(company_id=company_id, year=year)
+    qa.get_datasource_reference_bytes(company_id=company_id, year=year)
+    test_dataland = client.eu_taxonomy_nuclear_and_gas_api  # noqa: F841
+
     assert len(resolved_companies) > 0