From 24e9156c850d36724ecdf410261d7f2ae7914735 Mon Sep 17 00:00:00 2001 From: Christian Bertleff <149955753+chrisBer67@users.noreply.github.com> Date: Thu, 30 Jan 2025 10:33:06 +0100 Subject: [PATCH 1/4] Df 21 logging (#37) * DF-21 Logging tests * fixes * fix * Test notebook * fixes * DF-21 final changes * format fix * another fix * test fixes * bug fix --- notebooks/test.ipynb | 62 ------------------- .../database/database_engine.py | 10 +-- src/dataland_qa_lab/dataland/data_provider.py | 4 ++ .../dataland/dataset_provider.py | 5 ++ .../dataland/scheduled_processor.py | 5 +- .../dataland/unreviewed_datasets.py | 8 ++- .../review/dataset_reviewer.py | 24 +++++++ .../review/generate_gpt_request.py | 8 ++- .../nuclear_and_gas_report_generator.py | 1 - 9 files changed, 53 insertions(+), 74 deletions(-) delete mode 100644 notebooks/test.ipynb diff --git a/notebooks/test.ipynb b/notebooks/test.ipynb deleted file mode 100644 index ddfcde5..0000000 --- a/notebooks/test.ipynb +++ /dev/null @@ -1,62 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ChatCompletion(id='chatcmpl-ArVtqW4mKgsnP22FSigZKYzUMPaRg', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_EEG3soMwEWVsq3HVX3MGCdcE', function=Function(arguments='{\"1\": \"No\", \"2\": \"No\", \"3\": \"No\", \"4\": \"Yes\", \"5\": \"Yes\", \"6\": \"No\"}', name='requested_information_precisely_found_in_relevant_documents'), type='function')]), content_filter_results={})], created=1737317370, model='gpt-4o-2024-08-06', object='chat.completion', service_tier=None, system_fingerprint='fp_f3927aa00d', usage=CompletionUsage(completion_tokens=57, prompt_tokens=9290, total_tokens=9347, 
completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)), prompt_filter_results=[{'prompt_index': 0, 'content_filter_results': {}}])\n", - "ChatCompletion(id='chatcmpl-ArVts2AyR36a1jH2of2CdsZWZ0kx7', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_RW388dLjILMRSweDJiryA6QA', function=Function(arguments='{\"answer_value_CCM+CCA%_row1\":\"-1\",\"answer_value_CCM%_row1\":\"-1\",\"answer_value_CCA%_row1\":\"-1\",\"answer_value_CCM+CCA%_row2\":\"-1\",\"answer_value_CCM%_row2\":\"-1\",\"answer_value_CCA%_row2\":\"-1\",\"answer_value_CCM+CCA%_row3\":\"-1\",\"answer_value_CCM%_row3\":\"-1\",\"answer_value_CCA%_row3\":\"-1\",\"answer_value_CCM+CCA%_row4\":\"-1\",\"answer_value_CCM%_row4\":\"-1\",\"answer_value_CCA%_row4\":\"-1\",\"answer_value_CCM+CCA%_row5\":\"-1\",\"answer_value_CCM%_row5\":\"-1\",\"answer_value_CCA%_row5\":\"-1\",\"answer_value_CCM+CCA%_row6\":\"-1\",\"answer_value_CCM%_row6\":\"-1\",\"answer_value_CCA%_row6\":\"-1\",\"answer_value_CCM+CCA%_row7\":\"17\",\"answer_value_CCM%_row7\":\"17\",\"answer_value_CCA%_row7\":\"-1\",\"answer_value_CCM+CCA%_row8\":\"-1\",\"answer_value_CCM%_row8\":\"-1\",\"answer_value_CCA%_row8\":\"-1\"}', name='requested_information_precisely_found_in_relevant_documents'), type='function')]), content_filter_results={})], created=1737317372, model='gpt-4o-2024-08-06', object='chat.completion', service_tier=None, system_fingerprint='fp_f3927aa00d', usage=CompletionUsage(completion_tokens=319, prompt_tokens=10167, total_tokens=10486, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), 
prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)), prompt_filter_results=[{'prompt_index': 0, 'content_filter_results': {}}])\n", - "ChatCompletion(id='chatcmpl-ArVtyyeknzyrBkr8b0tSsh72c1Yp4', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_pkRwMIZDH6vL6B6cy0hIStIL', function=Function(arguments='{\"answer_value_CCM+CCA%_row1\":\"-1\",\"answer_value_CCM%_row1\":\"-1\",\"answer_value_CCA%_row1\":\"-1\",\"answer_value_CCM+CCA%_row2\":\"-1\",\"answer_value_CCM%_row2\":\"-1\",\"answer_value_CCA%_row2\":\"-1\",\"answer_value_CCM+CCA%_row3\":\"-1\",\"answer_value_CCM%_row3\":\"-1\",\"answer_value_CCA%_row3\":\"-1\",\"answer_value_CCM+CCA%_row4\":\"-1\",\"answer_value_CCM%_row4\":\"-1\",\"answer_value_CCA%_row4\":\"-1\",\"answer_value_CCM+CCA%_row5\":\"-1\",\"answer_value_CCM%_row5\":\"-1\",\"answer_value_CCA%_row5\":\"-1\",\"answer_value_CCM+CCA%_row6\":\"-1\",\"answer_value_CCM%_row6\":\"-1\",\"answer_value_CCA%_row6\":\"-1\",\"answer_value_CCM+CCA%_row7\":\"89\",\"answer_value_CCM%_row7\":\"89\",\"answer_value_CCA%_row7\":\"-1\",\"answer_value_CCM+CCA%_row8\":\"-1\",\"answer_value_CCM%_row8\":\"-1\",\"answer_value_CCA%_row8\":\"-1\"}', name='requested_information_precisely_found_in_relevant_documents'), type='function')]), content_filter_results={})], created=1737317378, model='gpt-4o-2024-08-06', object='chat.completion', service_tier=None, system_fingerprint='fp_f3927aa00d', usage=CompletionUsage(completion_tokens=319, prompt_tokens=10192, total_tokens=10511, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)), prompt_filter_results=[{'prompt_index': 0, 'content_filter_results': {}}])\n", - 
"ChatCompletion(id='chatcmpl-ArVu4tnjlTSDHDGQU3EjakjK335lC', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_OKixRVlh8618ONUm4mcaRJJh', function=Function(arguments='{\"answer_value_CCM+CCA%_row1\": \"-1\", \"answer_value_CCM%_row1\": \"-1\", \"answer_value_CCA%_row1\": \"-1\", \"answer_value_CCM+CCA%_row2\": \"-1\", \"answer_value_CCM%_row2\": \"-1\", \"answer_value_CCA%_row2\": \"-1\", \"answer_value_CCM+CCA%_row3\": \"-1\", \"answer_value_CCM%_row3\": \"-1\", \"answer_value_CCA%_row3\": \"-1\", \"answer_value_CCM+CCA%_row4\": \"-1\", \"answer_value_CCM%_row4\": \"-1\", \"answer_value_CCA%_row4\": \"-1\", \"answer_value_CCM+CCA%_row5\": \"-1\", \"answer_value_CCM%_row5\": \"-1\", \"answer_value_CCA%_row5\": \"-1\", \"answer_value_CCM+CCA%_row6\": \"-1\", \"answer_value_CCM%_row6\": \"-1\", \"answer_value_CCA%_row6\": \"-1\", \"answer_value_CCM+CCA%_row7\": \"100\", \"answer_value_CCM%_row7\": \"100\", \"answer_value_CCA%_row7\": \"-1\", \"answer_value_CCM+CCA%_row8\": \"100\", \"answer_value_CCM%_row8\": \"-1\", \"answer_value_CCA%_row8\": \"-1\"}', name='requested_information_precisely_found_in_relevant_documents'), type='function')]), content_filter_results={})], created=1737317384, model='gpt-4o-2024-08-06', object='chat.completion', service_tier=None, system_fingerprint='fp_f3927aa00d', usage=CompletionUsage(completion_tokens=334, prompt_tokens=10167, total_tokens=10501, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)), prompt_filter_results=[{'prompt_index': 0, 'content_filter_results': {}}])\n", - "ChatCompletion(id='chatcmpl-ArVu8XEuVvVIojPqKi7NbiXGXuZuR', choices=[Choice(finish_reason='stop', index=0, logprobs=None, 
message=ChatCompletionMessage(content=None, refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_dNzXEdlts0DjwajTFLP80tgc', function=Function(arguments='{\"answer_value_CCM+CCA%_row1\": \"-1\", \"answer_value_CCM%_row1\": \"-1\", \"answer_value_CCA%_row1\": \"-1\", \"answer_value_CCM+CCA%_row2\": \"-1\", \"answer_value_CCM%_row2\": \"-1\", \"answer_value_CCA%_row2\": \"-1\", \"answer_value_CCM+CCA%_row3\": \"-1\", \"answer_value_CCM%_row3\": \"-1\", \"answer_value_CCA%_row3\": \"-1\", \"answer_value_CCM+CCA%_row4\": \"-1\", \"answer_value_CCM%_row4\": \"-1\", \"answer_value_CCA%_row4\": \"-1\", \"answer_value_CCM+CCA%_row5\": \"-1\", \"answer_value_CCM%_row5\": \"-1\", \"answer_value_CCA%_row5\": \"-1\", \"answer_value_CCM+CCA%_row6\": \"-1\", \"answer_value_CCM%_row6\": \"-1\", \"answer_value_CCA%_row6\": \"-1\", \"answer_value_CCM+CCA%_row7\": \"100\", \"answer_value_CCM%_row7\": \"100\", \"answer_value_CCA%_row7\": \"-1\", \"answer_value_CCM+CCA%_row8\": \"100\", \"answer_value_CCM%_row8\": \"-1\", \"answer_value_CCA%_row8\": \"-1\"}', name='requested_information_precisely_found_in_relevant_documents'), type='function')]), content_filter_results={})], created=1737317388, model='gpt-4o-2024-08-06', object='chat.completion', service_tier=None, system_fingerprint='fp_f3927aa00d', usage=CompletionUsage(completion_tokens=334, prompt_tokens=10192, total_tokens=10526, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)), prompt_filter_results=[{'prompt_index': 0, 'content_filter_results': {}}])\n", - "ChatCompletion(id='chatcmpl-ArVuENAVhKvKGwjVk7SkG2KykBzMC', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', audio=None, function_call=None, 
tool_calls=[ChatCompletionMessageToolCall(id='call_wEAOdnT8mBClgKZqX5qwvyxn', function=Function(arguments='{\"answer_value_CCM+CCA%_row1\":\"-1\",\"answer_value_CCM%_row1\":\"-1\",\"answer_value_CCA%_row1\":\"-1\",\"answer_value_CCM+CCA%_row2\":\"-1\",\"answer_value_CCM%_row2\":\"-1\",\"answer_value_CCA%_row2\":\"-1\",\"answer_value_CCM+CCA%_row3\":\"-1\",\"answer_value_CCM%_row3\":\"-1\",\"answer_value_CCA%_row3\":\"-1\",\"answer_value_CCM+CCA%_row4\":\"94\",\"answer_value_CCM%_row4\":\"94\",\"answer_value_CCA%_row4\":\"-1\",\"answer_value_CCM+CCA%_row5\":\"1\",\"answer_value_CCM%_row5\":\"1\",\"answer_value_CCA%_row5\":\"-1\",\"answer_value_CCM+CCA%_row6\":\"-1\",\"answer_value_CCM%_row6\":\"-1\",\"answer_value_CCA%_row6\":\"-1\",\"answer_value_CCM+CCA%_row7\":\"5\",\"answer_value_CCM%_row7\":\"5\",\"answer_value_CCA%_row7\":\"-1\",\"answer_value_CCM+CCA%_row8\":\"100\",\"answer_value_CCM%_row8\":\"-1\",\"answer_value_CCA%_row8\":\"-1\"}', name='requested_information_precisely_found_in_relevant_documents'), type='function')]), content_filter_results={})], created=1737317394, model='gpt-4o-2024-08-06', object='chat.completion', service_tier=None, system_fingerprint='fp_f3927aa00d', usage=CompletionUsage(completion_tokens=314, prompt_tokens=10169, total_tokens=10483, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)), prompt_filter_results=[{'prompt_index': 0, 'content_filter_results': {}}])\n", - "ChatCompletion(id='chatcmpl-ArVuoo7TaNGd6OIMsc26mdWlbtD18', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_JG1xwui66uA8OBEDk3SkCPXs', function=Function(arguments='{\"answer_value_CCM+CCA%_row1\": \"-1\", \"answer_value_CCM%_row1\": 
\"-1\", \"answer_value_CCA%_row1\": \"-1\", \"answer_value_CCM+CCA%_row2\": \"-1\", \"answer_value_CCM%_row2\": \"-1\", \"answer_value_CCA%_row2\": \"-1\", \"answer_value_CCM+CCA%_row3\": \"-1\", \"answer_value_CCM%_row3\": \"-1\", \"answer_value_CCA%_row3\": \"-1\", \"answer_value_CCM+CCA%_row4\": \"96\", \"answer_value_CCM%_row4\": \"96\", \"answer_value_CCA%_row4\": \"-1\", \"answer_value_CCM+CCA%_row5\": \"2\", \"answer_value_CCM%_row5\": \"2\", \"answer_value_CCA%_row5\": \"-1\", \"answer_value_CCM+CCA%_row6\": \"-1\", \"answer_value_CCM%_row6\": \"-1\", \"answer_value_CCA%_row6\": \"-1\", \"answer_value_CCM+CCA%_row7\": \"2\", \"answer_value_CCM%_row7\": \"2\", \"answer_value_CCA%_row7\": \"-1\", \"answer_value_CCM+CCA%_row8\": \"100\", \"answer_value_CCM%_row8\": \"-1\", \"answer_value_CCA%_row8\": \"-1\"}', name='requested_information_precisely_found_in_relevant_documents'), type='function')]), content_filter_results={})], created=1737317430, model='gpt-4o-2024-08-06', object='chat.completion', service_tier=None, system_fingerprint='fp_f3927aa00d', usage=CompletionUsage(completion_tokens=330, prompt_tokens=10194, total_tokens=10524, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)), prompt_filter_results=[{'prompt_index': 0, 'content_filter_results': {}}])\n", - "ChatCompletion(id='chatcmpl-ArVuu62jO87Euq7rsNcRO9nF0Yk95', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_g4sMxj0RnvjIyvnIyfHR3Zwj', 
function=Function(arguments='{\"answer_value_%_row1\":\"-1\",\"answer_value_%_row2\":\"-1\",\"answer_value_%_row3\":\"2\",\"answer_value_%_row4\":\"-1\",\"answer_value_%_row5\":\"-1\",\"answer_value_%_row6\":\"-1\",\"answer_value_%_row7\":\"98\",\"answer_value_%_row8\":\"100\"}', name='requested_information_precisely_found_in_relevant_documents'), type='function')]), content_filter_results={})], created=1737317436, model='gpt-4o-2024-08-06', object='chat.completion', service_tier=None, system_fingerprint='fp_f3927aa00d', usage=CompletionUsage(completion_tokens=94, prompt_tokens=9246, total_tokens=9340, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)), prompt_filter_results=[{'prompt_index': 0, 'content_filter_results': {}}])\n", - "ChatCompletion(id='chatcmpl-ArVuxXUhkGNy8MyEOMZ81DIcItUzl', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_TfnB9JTOgwezczAGTOz66GX8', function=Function(arguments='{\"answer_value_%_row1\": \"-1\", \"answer_value_%_row2\": \"-1\", \"answer_value_%_row3\": \"-1\", \"answer_value_%_row4\": \"-1\", \"answer_value_%_row5\": \"-1\", \"answer_value_%_row6\": \"-1\", \"answer_value_%_row7\": \"96\", \"answer_value_%_row8\": \"100\"}', name='requested_information_precisely_found_in_relevant_documents'), type='function')]), content_filter_results={})], created=1737317439, model='gpt-4o-2024-08-06', object='chat.completion', service_tier=None, system_fingerprint='fp_f3927aa00d', usage=CompletionUsage(completion_tokens=111, prompt_tokens=9255, total_tokens=9366, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), 
prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)), prompt_filter_results=[{'prompt_index': 0, 'content_filter_results': {}}])\n" - ] - }, - { - "data": { - "text/plain": [ - "QaReportMetaInformation(data_id='d026358f-39e0-4d00-8395-2ce821aa38ec', data_type='nuclear-and-gas', qa_report_id='3010bf09-3131-412d-b27b-f5315e2b40b1', reporter_user_id='c8dca010-54b1-454e-8941-927bee0b744f', upload_time=1737317440974, active=True)" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from dataland_qa_lab.review.dataset_reviewer import review_dataset\n", - "\n", - "review_dataset(\"d026358f-39e0-4d00-8395-2ce821aa38ec\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/src/dataland_qa_lab/database/database_engine.py b/src/dataland_qa_lab/database/database_engine.py index 5c8b354..2688997 100644 --- a/src/dataland_qa_lab/database/database_engine.py +++ b/src/dataland_qa_lab/database/database_engine.py @@ -29,7 +29,7 @@ def add_entity(entity: any) -> bool: session.add(entity) session.commit() except SQLAlchemyError: - logger.exception("Error while adding entity to database") + logger.exception(msg="Error while adding entity to database", exc_info=SQLAlchemyError) session.rollback() return False finally: @@ -47,7 +47,7 @@ def get_entity(entity_id: str, entity_class: any) -> any: entity = session.query(entity_class).filter(primary_key_column == entity_id).first() session.commit() except SQLAlchemyError: - logger.exception("Error retrieving entity") + logger.exception(msg="Error retrieving entity", 
exc_info=SQLAlchemyError) session.rollback() return None finally: @@ -64,7 +64,7 @@ def update_entity(entity: any) -> bool: session.merge(entity) session.commit() except SQLAlchemyError: - logger.exception("Error updating entity") + logger.exception(msg="Error updating entity", exc_info=SQLAlchemyError) session.close() return False finally: @@ -83,11 +83,11 @@ def delete_entity(entity_id: str, entity_class: any) -> bool: if entity: session.delete(entity) else: - logger.error("Entity not found") + logger.error(msg="Entity not found") return False session.commit() except SQLAlchemyError: - logger.exception("Error updating entity") + logger.exception(msg="Error updating entity", exc_info=SQLAlchemyError) session.rollback() return False finally: diff --git a/src/dataland_qa_lab/dataland/data_provider.py b/src/dataland_qa_lab/dataland/data_provider.py index 6d2616a..3072317 100644 --- a/src/dataland_qa_lab/dataland/data_provider.py +++ b/src/dataland_qa_lab/dataland/data_provider.py @@ -1,3 +1,5 @@ +import logging + from dataland_backend.models.extended_document_reference import ExtendedDocumentReference from dataland_backend.models.nuclear_and_gas_aligned_denominator import NuclearAndGasAlignedDenominator from dataland_backend.models.nuclear_and_gas_aligned_numerator import NuclearAndGasAlignedNumerator @@ -7,6 +9,8 @@ from dataland_qa_lab.utils.nuclear_and_gas_data_collection import NuclearAndGasDataCollection +logger = logging.getLogger(__name__) + def get_yes_no_values_by_data(data: NuclearAndGasDataCollection) -> dict[str, YesNo | None]: """Get Yes/No values of the given dataset as a dictionary with section names as keys.""" diff --git a/src/dataland_qa_lab/dataland/dataset_provider.py b/src/dataland_qa_lab/dataland/dataset_provider.py index 5a28de0..ff08dfc 100644 --- a/src/dataland_qa_lab/dataland/dataset_provider.py +++ b/src/dataland_qa_lab/dataland/dataset_provider.py @@ -1,11 +1,16 @@ +import logging + from 
dataland_backend.models.company_associated_data_nuclear_and_gas_data import CompanyAssociatedDataNuclearAndGasData from pydantic import StrictStr from dataland_qa_lab.utils import config +logger = logging.getLogger(__name__) + def get_dataset_by_id(data_id: StrictStr) -> CompanyAssociatedDataNuclearAndGasData: """Return the nuclear and gas dataset based on the data id.""" client = config.get_config().dataland_client dataset = client.eu_taxonomy_nuclear_and_gas_api.get_company_associated_nuclear_and_gas_data(data_id=data_id) + return dataset diff --git a/src/dataland_qa_lab/dataland/scheduled_processor.py b/src/dataland_qa_lab/dataland/scheduled_processor.py index b0297b7..7efaf8b 100644 --- a/src/dataland_qa_lab/dataland/scheduled_processor.py +++ b/src/dataland_qa_lab/dataland/scheduled_processor.py @@ -3,8 +3,10 @@ from dataland_qa_lab.dataland.unreviewed_datasets import UnreviewedDatasets from dataland_qa_lab.review.dataset_reviewer import review_dataset +from dataland_qa_lab.utils import console_logger logger = logging.getLogger(__name__) +console_logger.configure_console_logger() def run_scheduled_processing(iterations: int) -> None: @@ -16,6 +18,7 @@ def run_scheduled_processing(iterations: int) -> None: try: unreviewed_datasets = UnreviewedDatasets() list_of_data_ids = unreviewed_datasets.list_of_data_ids + logger.info("Processing unreviewed datasets with the list of Data-IDs: %s", list_of_data_ids) if not list_of_data_ids: time.sleep(600) @@ -27,7 +30,7 @@ def run_scheduled_processing(iterations: int) -> None: list_of_data_ids.remove(data_id) except Exception: - logger.exception("Error processing dataset %s", data_id) + logger.exception("Error processing dataset with the Data-ID: %s", data_id) except Exception as e: logger.critical("Critical error: %s", e) diff --git a/src/dataland_qa_lab/dataland/unreviewed_datasets.py b/src/dataland_qa_lab/dataland/unreviewed_datasets.py index 66cf55c..a271aca 100644 --- 
a/src/dataland_qa_lab/dataland/unreviewed_datasets.py +++ b/src/dataland_qa_lab/dataland/unreviewed_datasets.py @@ -14,12 +14,14 @@ class UnreviewedDatasets: def __init__(self) -> None: """Initialize the unreviewed datasets with the data from the API.""" client = config.get_config().dataland_client + logger.info(msg="Initializing the unreviewed Datasets with the data from Dataland.") try: number_of_datasets = client.qa_api.get_number_of_pending_datasets() if number_of_datasets is None or number_of_datasets < 0: - msg = "Recieved an invalid number of pending datasets." - raise ValueError(msg) # noqa: TRY301 + msg_p = "Recieved an invalid number of pending datasets." + logger.error(msg=msg_p, exc_info=ValueError) + raise ValueError(msg_p) # noqa: TRY301 self.datasets = client.qa_api.get_info_on_pending_datasets( data_types=["nuclear-and-gas"], chunk_size=number_of_datasets @@ -28,5 +30,5 @@ def __init__(self) -> None: self.list_of_data_ids = [dataset.data_id for dataset in self.datasets] except Exception: - logger.exception("An error occurred") + logger.exception(msg="An error occurred", exc_info=Exception) raise diff --git a/src/dataland_qa_lab/review/dataset_reviewer.py b/src/dataland_qa_lab/review/dataset_reviewer.py index 13c9c54..80155a0 100644 --- a/src/dataland_qa_lab/review/dataset_reviewer.py +++ b/src/dataland_qa_lab/review/dataset_reviewer.py @@ -1,3 +1,4 @@ +import logging from datetime import UTC, datetime, timedelta, timezone from dataland_qa.models.qa_report_meta_information import QaReportMetaInformation @@ -10,11 +11,17 @@ from dataland_qa_lab.utils import config from dataland_qa_lab.utils.nuclear_and_gas_data_collection import NuclearAndGasDataCollection +logger = logging.getLogger(__name__) + def review_dataset(data_id: str) -> QaReportMetaInformation | None: """Review a dataset.""" + logger.info("Starting the review of the Dataset: %s", data_id) + dataset = dataset_provider.get_dataset_by_id(data_id) + logger.debug("Dataset retrieved form the 
given Data-ID.") + logger.info("Creating database.") create_tables() existing_entity = get_entity(data_id, ReviewedDataset) @@ -23,22 +30,30 @@ def review_dataset(data_id: str) -> QaReportMetaInformation | None: ger_timezone = timedelta(hours=2) if now_utc.astimezone(timezone(timedelta(hours=1))).dst() else timedelta(hours=1) formatted_german_time1 = (now_utc + ger_timezone).strftime("%Y-%m-%d %H:%M:%S") + logger.debug("Checking if the dataset is already existing in the database") if existing_entity is None: + logger.info("Dataset with the Data-ID does not exist in the database. Starting review.") review_dataset = ReviewedDataset(data_id=data_id, review_start_time=formatted_german_time1) + logger.debug("Adding the dataset in the database with the Data-ID and review start time.") add_entity(review_dataset) data_collection = NuclearAndGasDataCollection(dataset.data) + logger.debug("Data collection created.") page_numbers = pages_provider.get_relevant_page_numbers(data_collection) + logger.debug("Relevant page numbers extracted.") relevant_pages_pdf_reader = pages_provider.get_relevant_pages_of_pdf(data_collection) + logger.debug("Relevant pages extracted.") readable_text = text_to_doc_intelligence.get_markdown_from_dataset( data_id=data_id, page_numbers=page_numbers, relevant_pages_pdf_reader=relevant_pages_pdf_reader ) + logger.debug("Text extracted from the relevant pages.") report = NuclearAndGasReportGenerator().generate_report(relevant_pages=readable_text, dataset=data_collection) + logger.info("Report generated succesfully.") data = config.get_config().dataland_client.eu_taxonomy_nuclear_gas_qa_api.post_nuclear_and_gas_data_qa_report( data_id=data_id, nuclear_and_gas_data=report @@ -51,10 +66,19 @@ def review_dataset(data_id: str) -> QaReportMetaInformation | None: ger_timezone = timedelta(hours=1) formatted_german_time2 = (now_utc + ger_timezone).strftime("%Y-%m-%d %H:%M:%S") + + logger.debug("Adding review end time in the database.") 
review_dataset.review_end_time = formatted_german_time2 + + logger.debug("Adding review completed to the database.") review_dataset.review_completed = True + + logger.debug("Adding the Report-ID to the database.") review_dataset.report_id = data.qa_report_id update_entity(review_dataset) + + logger.info("Report posted successfully for dataset with ID: %s", data_id) return data + logger.info("Dataset with the Data-ID already exist in the database.") return None diff --git a/src/dataland_qa_lab/review/generate_gpt_request.py b/src/dataland_qa_lab/review/generate_gpt_request.py index 741a071..0283413 100644 --- a/src/dataland_qa_lab/review/generate_gpt_request.py +++ b/src/dataland_qa_lab/review/generate_gpt_request.py @@ -1,9 +1,12 @@ import ast +import logging from openai import AzureOpenAI from dataland_qa_lab.utils import config +logger = logging.getLogger(__name__) + class GenerateGptRequest: """Generates the actual GPT request.""" @@ -48,7 +51,8 @@ def generate_gpt_request(mainprompt: str, subprompt: str) -> list: if updated_openai_response.choices[0].message.tool_calls: tool_call = updated_openai_response.choices[0].message.tool_calls[0].function else: - msg = "No tool calls found in the GPT response." - raise ValueError(msg) + msg_p = "No tool calls found in the GPT response." 
+ logger.exception(msg=msg_p, exc_info=ValueError) + raise ValueError(msg_p) data_dict = ast.literal_eval(tool_call.arguments) return list(data_dict.values()) diff --git a/src/dataland_qa_lab/review/report_generator/nuclear_and_gas_report_generator.py b/src/dataland_qa_lab/review/report_generator/nuclear_and_gas_report_generator.py index 2cd3b09..70c49cc 100644 --- a/src/dataland_qa_lab/review/report_generator/nuclear_and_gas_report_generator.py +++ b/src/dataland_qa_lab/review/report_generator/nuclear_and_gas_report_generator.py @@ -50,5 +50,4 @@ def generate_report(self, relevant_pages: AnalyzeResult, dataset: NuclearAndGasD self.report.general.taxonomy_non_eligible = non_eligible_report_generator.build_taxonomy_non_eligible_report( dataset=dataset, relevant_pages=relevant_pages ) - return self.report From 0839485f893fc771516988c31aa0e60524064014 Mon Sep 17 00:00:00 2001 From: fschnizer Date: Thu, 30 Jan 2025 13:56:23 +0100 Subject: [PATCH 2/4] =?UTF-8?q?Df=2019=20Automatisiertes=20Pr=C3=BCfen=20d?= =?UTF-8?q?er=20Datasets=20(#38)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add run_scheduled_processing to main() * Lint * Test-Anpassung --------- Co-authored-by: jonkra20 --- data/jsons/concordia.json | 2 +- data/jsons/covestro.json | 2 +- data/jsons/deka.json | 2 +- data/jsons/enel.json | 2 +- data/jsons/eon.json | 2 +- data/jsons/iberdrola.json | 2 +- data/jsons/munichre.json | 2 +- data/jsons/rwe.json | 2 +- data/jsons/total.json | 2 +- pdm.lock | 44 +++++----- pyproject.toml | 2 +- src/dataland_qa_lab/bin/server.py | 3 + .../dataland/scheduled_processor.py | 15 ++-- .../review/dataset_reviewer.py | 4 +- tests/dataland/test_dataland_e2e.py | 85 ------------------- .../dataland/test_run_scheduled_processing.py | 29 +------ tests/end_to_end/test_report_e2e.py | 6 +- 17 files changed, 47 insertions(+), 159 deletions(-) delete mode 100644 tests/dataland/test_dataland_e2e.py diff --git a/data/jsons/concordia.json 
b/data/jsons/concordia.json index 583133a..96d3666 100644 --- a/data/jsons/concordia.json +++ b/data/jsons/concordia.json @@ -1,5 +1,5 @@ { - "companyId": "90ba9a69-1612-42e1-aeff-681d3eb683ba", + "companyId": "ef443b1e-bd8b-4d39-ad0b-a7b990faff61", "reportingPeriod": "2023", "data": { "general": { diff --git a/data/jsons/covestro.json b/data/jsons/covestro.json index 62896cf..4d3a2b4 100644 --- a/data/jsons/covestro.json +++ b/data/jsons/covestro.json @@ -1,5 +1,5 @@ { - "companyId": "0127d6ce-ba2e-44b3-ae93-c2a6b70c7b6e", + "companyId": "287108b3-965e-4adc-8c64-df5b0dc4b8ef", "reportingPeriod": "2023", "data": { "general": { diff --git a/data/jsons/deka.json b/data/jsons/deka.json index 3eba6dd..6b6de01 100644 --- a/data/jsons/deka.json +++ b/data/jsons/deka.json @@ -1,5 +1,5 @@ { - "companyId": "001dc409-8b9a-4536-87b3-c7dada9e1327", + "companyId": "7de152c1-16eb-4bd4-97c5-1f2985af5ca6", "reportingPeriod": "2023", "data": { "general": { diff --git a/data/jsons/enel.json b/data/jsons/enel.json index 8d13e81..f8fd04a 100644 --- a/data/jsons/enel.json +++ b/data/jsons/enel.json @@ -1,5 +1,5 @@ { - "companyId": "0105cba8-9606-4516-a02a-df9af5d0a156", + "companyId": "67f125ec-a9b1-49e3-9d0d-287cb6c9370c", "reportingPeriod": "2023", "data": { "general": { diff --git a/data/jsons/eon.json b/data/jsons/eon.json index 04570c6..c3ec466 100644 --- a/data/jsons/eon.json +++ b/data/jsons/eon.json @@ -1,5 +1,5 @@ { - "companyId": "9fc4ba23-9c30-4180-8e5f-58de5ed08d7e", + "companyId": "77ca0f9e-c123-4320-b5aa-e87030766e14", "reportingPeriod": "2023", "data": { "general": { diff --git a/data/jsons/iberdrola.json b/data/jsons/iberdrola.json index d100c88..aa4d36e 100644 --- a/data/jsons/iberdrola.json +++ b/data/jsons/iberdrola.json @@ -1,5 +1,5 @@ { - "companyId": "aa064795-1924-4e57-8d3d-63ff7dbd6b53", + "companyId": "916138b9-c1ac-4f5f-b2d1-df567df46809", "reportingPeriod": "2023", "data": { "general": { diff --git a/data/jsons/munichre.json b/data/jsons/munichre.json index 
c6d772e..18cafd4 100644 --- a/data/jsons/munichre.json +++ b/data/jsons/munichre.json @@ -1,5 +1,5 @@ { - "companyId": "9cef6954-ee4f-421a-b7cf-c884a1b9a080", + "companyId": "90e59d86-4ecd-4fb4-8310-9f0f91020e41", "reportingPeriod": "2023", "data": { "general": { diff --git a/data/jsons/rwe.json b/data/jsons/rwe.json index 251dcc6..bc9d6d6 100644 --- a/data/jsons/rwe.json +++ b/data/jsons/rwe.json @@ -1,5 +1,5 @@ { - "companyId": "ac54a10f-ab2a-4a68-9d68-de0779cec8a4", + "companyId": "7c9793a4-14ab-40b0-b3a3-98e8710cdc34", "reportingPeriod": "2023", "data": { "general": { diff --git a/data/jsons/total.json b/data/jsons/total.json index b5bf9e1..c38460c 100644 --- a/data/jsons/total.json +++ b/data/jsons/total.json @@ -1,5 +1,5 @@ { - "companyId": "5251cb45-ea80-4da3-8f68-5d73e30d1c6d", + "companyId": "cf6eb9ec-a117-40e9-b7f3-f287f8842b85", "reportingPeriod": "2023", "data": { "general": { diff --git a/pdm.lock b/pdm.lock index 49271f2..89f9250 100644 --- a/pdm.lock +++ b/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "linting", "notebooks", "testing"] strategy = ["inherit_metadata"] lock_version = "4.5.0" -content_hash = "sha256:4fc6fd636aab0690d56a5e824ffbb6fefabcbb557bdc0c86ebefce5e03e386b9" +content_hash = "sha256:d53b04eaeb306fbc55e1e3aa97e0b1a27e4549235f20bcd4aa2d159007a63500" [[metadata.targets]] requires_python = ">=3.12" @@ -573,7 +573,7 @@ version = "3.1.1" requires_python = ">=3.7" summary = "Lightweight in-process concurrent programming" groups = ["default"] -marker = "(platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\") and python_version < \"3.13\"" +marker = "(platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine 
== \"aarch64\") and python_version < \"3.14\"" files = [ {file = "greenlet-3.1.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:4afe7ea89de619adc868e087b4d2359282058479d7cfb94970adf4b55284574d"}, {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f406b22b7c9a9b4f8aa9d2ab13d6ae0ac3e85c9a809bd590ad53fed2bf70dc79"}, @@ -1876,34 +1876,34 @@ files = [ [[package]] name = "sqlalchemy" -version = "2.0.36" +version = "2.0.37" requires_python = ">=3.7" summary = "Database Abstraction Library" groups = ["default"] dependencies = [ - "greenlet!=0.4.17; (platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\") and python_version < \"3.13\"", + "greenlet!=0.4.17; (platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\") and python_version < \"3.14\"", "importlib-metadata; python_version < \"3.8\"", "typing-extensions>=4.6.0", ] files = [ - {file = "SQLAlchemy-2.0.36-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f7b64e6ec3f02c35647be6b4851008b26cff592a95ecb13b6788a54ef80bbdd4"}, - {file = "SQLAlchemy-2.0.36-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:46331b00096a6db1fdc052d55b101dbbfc99155a548e20a0e4a8e5e4d1362855"}, - {file = "SQLAlchemy-2.0.36-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fdf3386a801ea5aba17c6410dd1dc8d39cf454ca2565541b5ac42a84e1e28f53"}, - {file = "SQLAlchemy-2.0.36-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac9dfa18ff2a67b09b372d5db8743c27966abf0e5344c555d86cc7199f7ad83a"}, - {file = "SQLAlchemy-2.0.36-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:90812a8933df713fdf748b355527e3af257a11e415b613dd794512461eb8a686"}, - {file = "SQLAlchemy-2.0.36-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1bc330d9d29c7f06f003ab10e1eaced295e87940405afe1b110f2eb93a233588"}, - {file = "SQLAlchemy-2.0.36-cp312-cp312-win32.whl", hash = "sha256:79d2e78abc26d871875b419e1fd3c0bca31a1cb0043277d0d850014599626c2e"}, - {file = "SQLAlchemy-2.0.36-cp312-cp312-win_amd64.whl", hash = "sha256:b544ad1935a8541d177cb402948b94e871067656b3a0b9e91dbec136b06a2ff5"}, - {file = "SQLAlchemy-2.0.36-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b5cc79df7f4bc3d11e4b542596c03826063092611e481fcf1c9dfee3c94355ef"}, - {file = "SQLAlchemy-2.0.36-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3c01117dd36800f2ecaa238c65365b7b16497adc1522bf84906e5710ee9ba0e8"}, - {file = "SQLAlchemy-2.0.36-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9bc633f4ee4b4c46e7adcb3a9b5ec083bf1d9a97c1d3854b92749d935de40b9b"}, - {file = "SQLAlchemy-2.0.36-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e46ed38affdfc95d2c958de328d037d87801cfcbea6d421000859e9789e61c2"}, - {file = "SQLAlchemy-2.0.36-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:b2985c0b06e989c043f1dc09d4fe89e1616aadd35392aea2844f0458a989eacf"}, - {file = "SQLAlchemy-2.0.36-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4a121d62ebe7d26fec9155f83f8be5189ef1405f5973ea4874a26fab9f1e262c"}, - {file = "SQLAlchemy-2.0.36-cp313-cp313-win32.whl", hash = "sha256:0572f4bd6f94752167adfd7c1bed84f4b240ee6203a95e05d1e208d488d0d436"}, - {file = "SQLAlchemy-2.0.36-cp313-cp313-win_amd64.whl", hash = "sha256:8c78ac40bde930c60e0f78b3cd184c580f89456dd87fc08f9e3ee3ce8765ce88"}, - {file = "SQLAlchemy-2.0.36-py3-none-any.whl", hash = "sha256:fddbe92b4760c6f5d48162aef14824add991aeda8ddadb3c31d56eb15ca69f8e"}, - {file = "sqlalchemy-2.0.36.tar.gz", hash = "sha256:7f2767680b6d2398aea7082e45a774b2b0767b5c8d8ffb9c8b683088ea9b29c5"}, + {file = 
"SQLAlchemy-2.0.37-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2952748ecd67ed3b56773c185e85fc084f6bdcdec10e5032a7c25a6bc7d682ef"}, + {file = "SQLAlchemy-2.0.37-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3151822aa1db0eb5afd65ccfafebe0ef5cda3a7701a279c8d0bf17781a793bb4"}, + {file = "SQLAlchemy-2.0.37-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eaa8039b6d20137a4e02603aba37d12cd2dde7887500b8855356682fc33933f4"}, + {file = "SQLAlchemy-2.0.37-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1cdba1f73b64530c47b27118b7053b8447e6d6f3c8104e3ac59f3d40c33aa9fd"}, + {file = "SQLAlchemy-2.0.37-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1b2690456528a87234a75d1a1644cdb330a6926f455403c8e4f6cad6921f9098"}, + {file = "SQLAlchemy-2.0.37-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:cf5ae8a9dcf657fd72144a7fd01f243236ea39e7344e579a121c4205aedf07bb"}, + {file = "SQLAlchemy-2.0.37-cp312-cp312-win32.whl", hash = "sha256:ea308cec940905ba008291d93619d92edaf83232ec85fbd514dcb329f3192761"}, + {file = "SQLAlchemy-2.0.37-cp312-cp312-win_amd64.whl", hash = "sha256:635d8a21577341dfe4f7fa59ec394b346da12420b86624a69e466d446de16aff"}, + {file = "SQLAlchemy-2.0.37-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8c4096727193762e72ce9437e2a86a110cf081241919ce3fab8e89c02f6b6658"}, + {file = "SQLAlchemy-2.0.37-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e4fb5ac86d8fe8151966814f6720996430462e633d225497566b3996966b9bdb"}, + {file = "SQLAlchemy-2.0.37-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e56a139bfe136a22c438478a86f8204c1eb5eed36f4e15c4224e4b9db01cb3e4"}, + {file = "SQLAlchemy-2.0.37-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f95fc8e3f34b5f6b3effb49d10ac97c569ec8e32f985612d9b25dd12d0d2e94"}, + {file = "SQLAlchemy-2.0.37-cp313-cp313-musllinux_1_2_aarch64.whl", hash = 
"sha256:c505edd429abdfe3643fa3b2e83efb3445a34a9dc49d5f692dd087be966020e0"}, + {file = "SQLAlchemy-2.0.37-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:12b0f1ec623cccf058cf21cb544f0e74656618165b083d78145cafde156ea7b6"}, + {file = "SQLAlchemy-2.0.37-cp313-cp313-win32.whl", hash = "sha256:293f9ade06b2e68dd03cfb14d49202fac47b7bb94bffcff174568c951fbc7af2"}, + {file = "SQLAlchemy-2.0.37-cp313-cp313-win_amd64.whl", hash = "sha256:d70f53a0646cc418ca4853da57cf3ddddbccb8c98406791f24426f2dd77fd0e2"}, + {file = "SQLAlchemy-2.0.37-py3-none-any.whl", hash = "sha256:a8998bf9f8658bd3839cbc44ddbe982955641863da0c1efe5b00c1ab4f5c16b1"}, + {file = "sqlalchemy-2.0.37.tar.gz", hash = "sha256:12b28d99a9c14eaf4055810df1001557176716de0167b91026e648e65229bffb"}, ] [[package]] diff --git a/pyproject.toml b/pyproject.toml index b075470..87e91cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ dependencies = [ "cryptography>=43.0.1", "azure-ai-documentintelligence>=1.0.0b4", "openai>=1.51.2", - "sqlalchemy>=2.0.36", + "sqlalchemy>=2.0.37", "pg8000>=1.31.2", "ruff>=0.8.6", ] diff --git a/src/dataland_qa_lab/bin/server.py b/src/dataland_qa_lab/bin/server.py index b61a40a..88f66a7 100644 --- a/src/dataland_qa_lab/bin/server.py +++ b/src/dataland_qa_lab/bin/server.py @@ -1,6 +1,7 @@ import logging import time +from dataland_qa_lab.dataland import scheduled_processor from dataland_qa_lab.utils import console_logger logger = logging.getLogger("dataland_qa_lab.bin.server") @@ -11,6 +12,8 @@ def main(single_pass_e2e: bool = False) -> None: console_logger.configure_console_logger() logger.info("Launching the Dataland QA Lab server") + scheduled_processor.run_scheduled_processing(single_pass_e2e=single_pass_e2e) + while True: logger.info("Still running") if single_pass_e2e: diff --git a/src/dataland_qa_lab/dataland/scheduled_processor.py b/src/dataland_qa_lab/dataland/scheduled_processor.py index 7efaf8b..ef96069 100644 --- a/src/dataland_qa_lab/dataland/scheduled_processor.py 
+++ b/src/dataland_qa_lab/dataland/scheduled_processor.py @@ -9,29 +9,24 @@ console_logger.configure_console_logger() -def run_scheduled_processing(iterations: int) -> None: +def run_scheduled_processing(single_pass_e2e: bool = False) -> None: """Continuously processes unreviewed datasets at scheduled intervals.""" - max_iterations = 100 - counter = 0 - while counter < iterations and counter < max_iterations: - counter += 1 + while True: try: unreviewed_datasets = UnreviewedDatasets() list_of_data_ids = unreviewed_datasets.list_of_data_ids logger.info("Processing unreviewed datasets with the list of Data-IDs: %s", list_of_data_ids) - if not list_of_data_ids: - time.sleep(600) - continue - for data_id in reversed(list_of_data_ids[:]): try: review_dataset(data_id) list_of_data_ids.remove(data_id) - except Exception: logger.exception("Error processing dataset with the Data-ID: %s", data_id) + if single_pass_e2e: + break + time.sleep(600) except Exception as e: logger.critical("Critical error: %s", e) raise diff --git a/src/dataland_qa_lab/review/dataset_reviewer.py b/src/dataland_qa_lab/review/dataset_reviewer.py index 80155a0..25adef8 100644 --- a/src/dataland_qa_lab/review/dataset_reviewer.py +++ b/src/dataland_qa_lab/review/dataset_reviewer.py @@ -14,7 +14,7 @@ logger = logging.getLogger(__name__) -def review_dataset(data_id: str) -> QaReportMetaInformation | None: +def review_dataset(data_id: str, single_pass_e2e: bool = False) -> QaReportMetaInformation | None: """Review a dataset.""" logger.info("Starting the review of the Dataset: %s", data_id) @@ -24,7 +24,7 @@ def review_dataset(data_id: str) -> QaReportMetaInformation | None: logger.info("Creating database.") create_tables() - existing_entity = get_entity(data_id, ReviewedDataset) + existing_entity = None if single_pass_e2e else get_entity(data_id, ReviewedDataset) now_utc = datetime.now(UTC) ger_timezone = timedelta(hours=2) if now_utc.astimezone(timezone(timedelta(hours=1))).dst() else timedelta(hours=1) 
diff --git a/tests/dataland/test_dataland_e2e.py b/tests/dataland/test_dataland_e2e.py deleted file mode 100644 index 050f467..0000000 --- a/tests/dataland/test_dataland_e2e.py +++ /dev/null @@ -1,85 +0,0 @@ -from unittest.mock import ANY, MagicMock, patch - -from azure.ai.documentintelligence.models import AnalyzeResult -from dataland_backend.models.extended_data_point_nuclear_and_gas_aligned_denominator import ( - ExtendedDataPointNuclearAndGasAlignedDenominator, -) -from dataland_backend.models.extended_data_point_yes_no import ExtendedDataPointYesNo -from dataland_backend.models.nuclear_and_gas_aligned_denominator import NuclearAndGasAlignedDenominator -from dataland_backend.models.nuclear_and_gas_data import NuclearAndGasData -from dataland_backend.models.nuclear_and_gas_environmental_objective import NuclearAndGasEnvironmentalObjective -from dataland_backend.models.nuclear_and_gas_general_taxonomy_aligned_denominator import ( - NuclearAndGasGeneralTaxonomyAlignedDenominator, -) - -from dataland_qa_lab.review.dataset_reviewer import review_dataset - - -def create_document_intelligence_mock() -> AnalyzeResult: - return AnalyzeResult(content="mocked content") - - -def create_mock_nuclear_and_gas_data() -> NuclearAndGasData: - mock_data = MagicMock() - mock_data.general = MagicMock() - - mock_data.general.general = MagicMock( - nuclear_energy_related_activities_section426=ExtendedDataPointYesNo(value="Yes", data_source=None), - nuclear_energy_related_activities_section427=ExtendedDataPointYesNo(value="No", data_source=None), - nuclear_energy_related_activities_section428=ExtendedDataPointYesNo(value="Yes", data_source=None), - fossil_gas_related_activities_section429=ExtendedDataPointYesNo(value="Yes", data_source=None), - fossil_gas_related_activities_section430=ExtendedDataPointYesNo(value="Yes", data_source=None), - fossil_gas_related_activities_section431=ExtendedDataPointYesNo(value="No", data_source=None), - ) - - 
mock_data.general.taxonomy_aligned_denominator = NuclearAndGasGeneralTaxonomyAlignedDenominator( - nuclear_and_gas_taxonomy_aligned_capex_denominator=ExtendedDataPointNuclearAndGasAlignedDenominator( - value=NuclearAndGasAlignedDenominator( - taxonomyAlignedShareDenominatorNAndG426=NuclearAndGasEnvironmentalObjective() - ) - ) - ) - - return mock_data - - -@patch( - "dataland_qa_lab.pages.text_to_doc_intelligence.extract_text_of_pdf", - return_value=create_document_intelligence_mock(), -) -@patch("dataland_qa_lab.dataland.dataset_provider.get_dataset_by_id") -@patch("dataland_qa_lab.pages.pages_provider.get_relevant_pages_of_pdf") -@patch("dataland_qa_lab.utils.config.get_config") -@patch( - "dataland_qa_lab.review.report_generator.nuclear_and_gas_report_generator.NuclearAndGasReportGenerator.generate_report" -) -def test_review_dataset_with_mocked_client( - mock_generate_report: MagicMock, - mock_get_config: MagicMock, - mock_get_relevant_pages_of_pdf: MagicMock, - mock_get_dataset_by_id: MagicMock, - mock_extract_text_of_pdf: MagicMock, -) -> None: - mock_config_instance = MagicMock() - mock_get_config.return_value = mock_config_instance - - mock_dataland_client_instance = MagicMock() - mock_config_instance.dataland_client = mock_dataland_client_instance - - mock_dataset = MagicMock() - mock_dataset.data = create_mock_nuclear_and_gas_data() - mock_get_dataset_by_id.return_value = mock_dataset - - mock_get_relevant_pages_of_pdf.return_value = {"content": "mocked content"} - mock_generate_report.return_value = "mocked report" - - # Test review_dataset - data_id = "mocked_data_id" - review_dataset(data_id) - - mock_get_dataset_by_id.assert_called_once_with(data_id) - mock_get_relevant_pages_of_pdf.assert_called_once() - mock_generate_report.assert_called_once_with(relevant_pages=mock_extract_text_of_pdf.return_value, dataset=ANY) - mock_dataland_client_instance.eu_taxonomy_nuclear_gas_qa_api.post_nuclear_and_gas_data_qa_report.assert_called_once_with( - 
data_id=data_id, nuclear_and_gas_data=mock_generate_report.return_value - ) diff --git a/tests/dataland/test_run_scheduled_processing.py b/tests/dataland/test_run_scheduled_processing.py index 47ed16f..ebb7058 100644 --- a/tests/dataland/test_run_scheduled_processing.py +++ b/tests/dataland/test_run_scheduled_processing.py @@ -9,32 +9,5 @@ def test_run_scheduled_processing_unreviewed_datasets_error(mock_unreviewed_datasets: MagicMock) -> None: mock_unreviewed_datasets.side_effect = Exception("Error while creating UnreviewedDatasets") with pytest.raises(Exception) as context: # noqa: PT011 - run_scheduled_processing(iterations=1) + run_scheduled_processing(single_pass_e2e=True) assert str(context.value) == "Error while creating UnreviewedDatasets" - - -@patch("dataland_qa_lab.dataland.scheduled_processor.time.sleep") # Mock time.sleep to avoid delays -@patch("dataland_qa_lab.dataland.scheduled_processor.UnreviewedDatasets") -def test_run_scheduled_processing_loops(mock_unreviewed_datasets: MagicMock, mock_sleep) -> None: # noqa: ANN001 - mock_instance = MagicMock() - mock_instance.list_of_data_ids = [] - mock_unreviewed_datasets.return_value = mock_instance - - iterations = 5 - run_scheduled_processing(iterations=iterations) - assert mock_unreviewed_datasets.call_count == iterations - assert mock_sleep.call_count == iterations - - -@patch("dataland_qa_lab.dataland.scheduled_processor.time.sleep") -@patch("dataland_qa_lab.dataland.scheduled_processor.UnreviewedDatasets") -def test_run_scheduled_processing_max_loops(mock_unreviewed_datasets: MagicMock, mock_sleep: MagicMock) -> None: - mock_instance = MagicMock() - mock_instance.list_of_data_ids = [] - mock_unreviewed_datasets.return_value = mock_instance - - mock_sleep.side_effect = lambda x: x if x <= 5 else None - - iterations = 100 - run_scheduled_processing(iterations=iterations) - assert mock_unreviewed_datasets.call_count == 100 diff --git a/tests/end_to_end/test_report_e2e.py 
b/tests/end_to_end/test_report_e2e.py index 9012d49..bdbf698 100644 --- a/tests/end_to_end/test_report_e2e.py +++ b/tests/end_to_end/test_report_e2e.py @@ -87,15 +87,17 @@ def test_report_generator_end_to_end() -> None: @patch("dataland_qa_lab.pages.text_to_doc_intelligence.extract_text_of_pdf") +@patch("dataland_qa_lab.database.database_engine.get_entity") def mocked_review_dataset( data_id: str, mock_extract_text_of_pdf: MagicMock, + mock_get_entity: MagicMock, ) -> QaReportMetaInformation: """Review the dataset with mocked Azure calls.""" mock_extract_text_of_pdf.return_value = mock_constants.E2E_AZURE_DOCUMENT_INTELLIGENCE_MOCK - + mock_get_entity.return_value = None with patch("openai.resources.chat.Completions.create", side_effect=mock_open_ai): - report_data = review_dataset(data_id=data_id) + report_data = review_dataset(data_id=data_id, single_pass_e2e=True) return report_data From 15b9156f7f43d0d851f492d888ee25f092ae2afc Mon Sep 17 00:00:00 2001 From: TilmanNiem <117115402+TilmanNiem@users.noreply.github.com> Date: Fri, 31 Jan 2025 09:33:29 +0100 Subject: [PATCH 3/4] remove redundant console configuration (#40) * remove redundant console configuration * fix formattign * fix docker * fix template --- .env.template | 2 +- docker-compose.yml | 10 ++++++++++ src/dataland_qa_lab/dataland/scheduled_processor.py | 2 -- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/.env.template b/.env.template index f13d67c..eca2f9c 100644 --- a/.env.template +++ b/.env.template @@ -15,4 +15,4 @@ POSTGRES_USER="${POSTGRES_USER}" PGADMIN_DEFAULT_EMAIL="${PGADMIN_DEFAULT_EMAIL}" PGADMIN_DEFAULT_PASSWORD="${PGADMIN_DEFAULT_PASSWORD}" -DATABASE_CONNECTION_STRING="postgresql+pg8000://${POSTGRES_USER}:${POSTGRES_PASSWORD}@localhost:5432/dataland_qa_lab" +DATABASE_CONNECTION_STRING="postgresql+pg8000://${POSTGRES_USER}:${POSTGRES_PASSWORD}@database:5432/dataland_qa_lab" diff --git a/docker-compose.yml b/docker-compose.yml index a619adf..d38ece2 100644 --- 
a/docker-compose.yml +++ b/docker-compose.yml @@ -11,6 +11,11 @@ services: AZURE_OPENAI_ENDPOINT: ${AZURE_OPENAI_ENDPOINT} AZURE_DOCINTEL_API_KEY: ${AZURE_DOCINTEL_API_KEY} AZURE_DOCINTEL_ENDPOINT: ${AZURE_DOCINTEL_ENDPOINT} + DATABASE_CONNECTION_STRING: ${DATABASE_CONNECTION_STRING} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + POSTGRES_USER: ${POSTGRES_USER} + PGADMIN_DEFAULT_EMAIL: ${PGADMIN_DEFAULT_EMAIL} + PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_DEFAULT_PASSWORD} qa-lab-server-prod: profiles: - prod @@ -22,6 +27,11 @@ services: AZURE_OPENAI_ENDPOINT: ${AZURE_OPENAI_ENDPOINT} AZURE_DOCINTEL_API_KEY: ${AZURE_DOCINTEL_API_KEY} AZURE_DOCINTEL_ENDPOINT: ${AZURE_DOCINTEL_ENDPOINT} + DATABASE_CONNECTION_STRING: ${DATABASE_CONNECTION_STRING} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + POSTGRES_USER: ${POSTGRES_USER} + PGADMIN_DEFAULT_EMAIL: ${PGADMIN_DEFAULT_EMAIL} + PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_DEFAULT_PASSWORD} database: image: postgres:17.2 restart: always diff --git a/src/dataland_qa_lab/dataland/scheduled_processor.py b/src/dataland_qa_lab/dataland/scheduled_processor.py index ef96069..ddb6539 100644 --- a/src/dataland_qa_lab/dataland/scheduled_processor.py +++ b/src/dataland_qa_lab/dataland/scheduled_processor.py @@ -3,10 +3,8 @@ from dataland_qa_lab.dataland.unreviewed_datasets import UnreviewedDatasets from dataland_qa_lab.review.dataset_reviewer import review_dataset -from dataland_qa_lab.utils import console_logger logger = logging.getLogger(__name__) -console_logger.configure_console_logger() def run_scheduled_processing(single_pass_e2e: bool = False) -> None: From 62018e437c0c29cb2f4a76dd5a35bf36a48b63cc Mon Sep 17 00:00:00 2001 From: Si Thu <116348957+Si2-Aung@users.noreply.github.com> Date: Tue, 4 Feb 2025 22:19:46 +0100 Subject: [PATCH 4/4] Df 131 (#39) * Try catch * bug fixes * 0 to -1 * Old df-131 removed and changes taken over * Ruff error fix * ruff linting fix * denominator verdict extended * test coverage increase for unreviewed_dataset, 
data_provider and scheduled_processing * test coverage increase for numeric_generator & prompt_servic * Template 2-5 verdict extended * Template 1 verdict extended * test cases added * df-131 finalized * Error handling "get_relevant_pages_of_pdf" * Yes_No List error fixed * Pages Provider angepasst * Tests error fixes * Lint fix * Test_updated * text_to_doc output change * Float convertion replaced with regex matching * Float convertion fixed * Sonar error fixed? * No Data source fixed * Lint fix * Comments resolved * Print tests * fix: provide a not empty value to relevant pages to ensure test does not break * fix: ensure error is thrown * fix: ensure no null values are saved as markdown * fix: arrangement of parameters adapted to patches --------- Co-authored-by: aardunne Co-authored-by: fschnizer Co-authored-by: TilmanNiem --- notebooks/test_existing_company_reports.ipynb | 266 +++++------------- src/dataland_qa_lab/dataland/data_provider.py | 108 ++++--- .../dataland/unreviewed_datasets.py | 8 +- src/dataland_qa_lab/pages/pages_provider.py | 10 +- .../pages/text_to_doc_intelligence.py | 3 + .../prompting_services/prompting_service.py | 8 +- .../review/dataset_reviewer.py | 23 +- .../review/generate_gpt_request.py | 95 ++++--- .../review/numeric_value_generator.py | 116 ++++---- .../denominator_report_generator.py | 40 ++- .../eligible_not_aligned_report_generator.py | 42 ++- .../non_eligible_report_generator.py | 39 ++- .../nuclear_and_gas_report_generator.py | 6 +- .../numerator_report_generator.py | 39 ++- .../yes_no_report_generator.py | 49 +++- .../review/yes_no_value_generator.py | 32 ++- .../utils/nuclear_and_gas_data_collection.py | 36 ++- tests/dataland/test_data_provider.py | 62 +++- tests/dataland/test_prompt_services.py | 85 +++++- tests/dataland/test_unreviewed_datasets.py | 22 ++ tests/end_to_end/test_report_e2e.py | 9 +- .../test_denominator_report_generator.py | 53 +++- ...t_eligible_not_aligned_report_generator.py | 39 +++ 
tests/review/test_non_eligible_generator.py | 39 +++ .../review/test_numerator_report_generator.py | 37 +++ tests/review/test_numeric_value_generator.py | 128 +++++++++ tests/review/test_report_generator.py | 16 -- tests/review/test_yes_no_report_generator.py | 104 +++++++ 28 files changed, 1074 insertions(+), 440 deletions(-) create mode 100644 tests/review/test_numeric_value_generator.py create mode 100644 tests/review/test_yes_no_report_generator.py diff --git a/notebooks/test_existing_company_reports.ipynb b/notebooks/test_existing_company_reports.ipynb index c9b0418..33b35da 100644 --- a/notebooks/test_existing_company_reports.ipynb +++ b/notebooks/test_existing_company_reports.ipynb @@ -4,7 +4,19 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Python-dotenv could not parse statement starting at line 15\n", + "Python-dotenv could not parse statement starting at line 18\n", + "Python-dotenv could not parse statement starting at line 20\n", + "Python-dotenv could not parse statement starting at line 23\n", + "Python-dotenv could not parse statement starting at line 25\n" + ] + } + ], "source": [ "from dataland_backend.models.data_type_enum import DataTypeEnum\n", "\n", @@ -70,7 +82,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "BPCE\n" + "Aktiebolaget Electrolux\n" ] } ], @@ -79,7 +91,7 @@ "extracted_yes_no_values = {}\n", "\n", "# check yes no values\n", - "for data_id, company_info in zip(data_ids[8:9], company_infos[8:9], strict=False):\n", + "for data_id, company_info in zip(data_ids[0:1], company_infos[0:1], strict=False):\n", " print(company_info.company_name)\n", " data = dataland_client.eu_taxonomy_nuclear_and_gas_api.get_company_associated_nuclear_and_gas_data(data_id=data_id)\n", " data_collection = NuclearAndGasDataCollection(dataset=data.data)\n", @@ -107,11 +119,11 @@ "output_type": "stream", "text": [ "\n", - "Company: 
BPCE\n", + "Company: Aktiebolaget Electrolux\n", "nuclear_energy_related_activities_section426: Dataland=YesNo.NO, Extracted=YesNo.NO\n", - "nuclear_energy_related_activities_section427: Dataland=YesNo.YES, Extracted=YesNo.YES\n", - "nuclear_energy_related_activities_section428: Dataland=YesNo.YES, Extracted=YesNo.YES\n", - "fossil_gas_related_activities_section429: Dataland=YesNo.YES, Extracted=YesNo.YES\n", + "nuclear_energy_related_activities_section427: Dataland=YesNo.NO, Extracted=YesNo.NO\n", + "nuclear_energy_related_activities_section428: Dataland=YesNo.NO, Extracted=YesNo.NO\n", + "fossil_gas_related_activities_section429: Dataland=YesNo.NO, Extracted=YesNo.NO\n", "fossil_gas_related_activities_section430: Dataland=YesNo.NO, Extracted=YesNo.NO\n", "fossil_gas_related_activities_section431: Dataland=YesNo.NO, Extracted=YesNo.NO\n", "1.0\n" @@ -143,46 +155,56 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Skipping company Aktiebolaget Electrolux due to missing data from Dataland: Error retrieving taxonomy-aligned revenue denominator: 'NoneType' object has no attribute 'value'\n" + ] + } + ], "source": [ "numeric_values_dataland = {}\n", "extracted_numeric_values = {}\n", "\n", "# check numeric values\n", - "for data_id, company_info in zip(data_ids[6:7], company_infos[6:7], strict=False):\n", + "for data_id, company_info in zip(data_ids[0:1], company_infos[0:1], strict=False):\n", " data = dataland_client.eu_taxonomy_nuclear_and_gas_api.get_company_associated_nuclear_and_gas_data(data_id=data_id)\n", " data_collection = NuclearAndGasDataCollection(dataset=data.data)\n", + " try:\n", + " # get values on Dataland\n", + " if company_info.company_name not in numeric_values_dataland:\n", + " numeric_values_dataland[company_info.company_name] = {}\n", "\n", - " # get values on Dataland\n", - " if company_info.company_name 
not in numeric_values_dataland:\n", - " numeric_values_dataland[company_info.company_name] = {}\n", - "\n", - " numeric_values_dataland[company_info.company_name][\"aligned_revenue_denominator\"] = (\n", - " get_taxonomy_aligned_revenue_denominator_values_by_data(data=data_collection)\n", - " )\n", - " numeric_values_dataland[company_info.company_name][\"aligned_capex_denominator\"] = (\n", - " get_taxonomy_aligned_capex_denominator_values_by_data(data=data_collection)\n", - " )\n", - " numeric_values_dataland[company_info.company_name][\"aligned_revenue_numerator\"] = (\n", - " get_taxonomy_aligned_revenue_numerator_values_by_data(data=data_collection)\n", - " )\n", - " numeric_values_dataland[company_info.company_name][\"aligned_capex_numerator\"] = (\n", - " get_taxonomy_aligned_capex_numerator_values_by_data(data=data_collection)\n", - " )\n", - " numeric_values_dataland[company_info.company_name][\"not_aligned_revenue\"] = (\n", - " get_taxonomy_eligible_but_not_aligned_revenue_values_by_data(data=data_collection)\n", - " )\n", - " numeric_values_dataland[company_info.company_name][\"not_aligned_capex\"] = (\n", - " get_taxonomy_eligible_but_not_aligned_capex_values_by_data(data=data_collection)\n", - " )\n", - " numeric_values_dataland[company_info.company_name][\"non_eligible_revenue\"] = (\n", - " get_taxonomy_non_eligible_revenue_values_by_data(data=data_collection)\n", - " )\n", - " numeric_values_dataland[company_info.company_name][\"non_eligible_capex\"] = (\n", - " get_taxonomy_non_eligible_capex_values_by_data(data=data_collection)\n", - " )\n", + " numeric_values_dataland[company_info.company_name][\"aligned_revenue_denominator\"] = (\n", + " get_taxonomy_aligned_revenue_denominator_values_by_data(data=data_collection)\n", + " )\n", + " numeric_values_dataland[company_info.company_name][\"aligned_capex_denominator\"] = (\n", + " get_taxonomy_aligned_capex_denominator_values_by_data(data=data_collection)\n", + " )\n", + " 
numeric_values_dataland[company_info.company_name][\"aligned_revenue_numerator\"] = (\n", + " get_taxonomy_aligned_revenue_numerator_values_by_data(data=data_collection)\n", + " )\n", + " numeric_values_dataland[company_info.company_name][\"aligned_capex_numerator\"] = (\n", + " get_taxonomy_aligned_capex_numerator_values_by_data(data=data_collection)\n", + " )\n", + " numeric_values_dataland[company_info.company_name][\"not_aligned_revenue\"] = (\n", + " get_taxonomy_eligible_but_not_aligned_revenue_values_by_data(data=data_collection)\n", + " )\n", + " numeric_values_dataland[company_info.company_name][\"not_aligned_capex\"] = (\n", + " get_taxonomy_eligible_but_not_aligned_capex_values_by_data(data=data_collection)\n", + " )\n", + " numeric_values_dataland[company_info.company_name][\"non_eligible_revenue\"] = (\n", + " get_taxonomy_non_eligible_revenue_values_by_data(data=data_collection)\n", + " )\n", + " numeric_values_dataland[company_info.company_name][\"non_eligible_capex\"] = (\n", + " get_taxonomy_non_eligible_capex_values_by_data(data=data_collection)\n", + " )\n", + " except AttributeError as e:\n", + " print(f\"Skipping company {company_info.company_name} due to missing data from Dataland: {e}\")\n", "\n", " # get values from AI\n", " try:\n", @@ -209,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -217,168 +239,8 @@ "output_type": "stream", "text": [ "\n", - "Company: Berliner Volksbank eG\n", - "Section 0: Dataland=0, Extracted=0.0\n", - "Section 1: Dataland=0, Extracted=0.0\n", - "Section 2: Dataland=0, Extracted=0.0\n", - "Section 3: Dataland=0, Extracted=0.0\n", - "Section 4: Dataland=0, Extracted=0.0\n", - "Section 5: Dataland=0, Extracted=0.0\n", - "Section 6: Dataland=0, Extracted=0.0\n", - "Section 7: Dataland=0, Extracted=0.0\n", - "Section 8: Dataland=0, Extracted=0.0\n", - "Section 9: Dataland=0, Extracted=0.0\n", - "Section 10: Dataland=0, Extracted=0.0\n", - 
"Section 11: Dataland=0, Extracted=0.0\n", - "Section 12: Dataland=0, Extracted=0.0\n", - "Section 13: Dataland=0, Extracted=0.0\n", - "Section 14: Dataland=0, Extracted=0.0\n", - "Section 15: Dataland=0, Extracted=0.0\n", - "Section 16: Dataland=0, Extracted=0.0\n", - "Section 17: Dataland=0, Extracted=0.0\n", - "Section 18: Dataland=0.1, Extracted=0.1\n", - "Section 19: Dataland=0.1, Extracted=0.1\n", - "Section 20: Dataland=0, Extracted=0.0\n", - "Section 21: Dataland=0.1, Extracted=0.1\n", - "Section 22: Dataland=0.1, Extracted=0.1\n", - "Section 23: Dataland=0, Extracted=0.0\n", - "Section 24: Dataland=0, Extracted=0.0\n", - "Section 25: Dataland=0, Extracted=0.0\n", - "Section 26: Dataland=0, Extracted=0.0\n", - "Section 27: Dataland=0, Extracted=0.0\n", - "Section 28: Dataland=0, Extracted=0.0\n", - "Section 29: Dataland=0, Extracted=0.0\n", - "Section 30: Dataland=0, Extracted=0.0\n", - "Section 31: Dataland=0, Extracted=0.0\n", - "Section 32: Dataland=0, Extracted=0.0\n", - "Section 33: Dataland=0, Extracted=0.0\n", - "Section 34: Dataland=0, Extracted=0.0\n", - "Section 35: Dataland=0, Extracted=0.0\n", - "Section 36: Dataland=0, Extracted=0.0\n", - "Section 37: Dataland=0, Extracted=0.0\n", - "Section 38: Dataland=0, Extracted=0.0\n", - "Section 39: Dataland=0, Extracted=0.0\n", - "Section 40: Dataland=0, Extracted=0.0\n", - "Section 41: Dataland=0, Extracted=0.0\n", - "Section 42: Dataland=0.1, Extracted=0.1\n", - "Section 43: Dataland=0.1, Extracted=0.1\n", - "Section 44: Dataland=0, Extracted=0.0\n", - "Section 45: Dataland=0.1, Extracted=0.1\n", - "Section 46: Dataland=0.1, Extracted=0.1\n", - "Section 47: Dataland=0, Extracted=0.0\n", - "Section 48: Dataland=0, Extracted=0.0\n", - "Section 49: Dataland=0, Extracted=0.0\n", - "Section 50: Dataland=0, Extracted=0.0\n", - "Section 51: Dataland=0, Extracted=0.0\n", - "Section 52: Dataland=0, Extracted=0.0\n", - "Section 53: Dataland=0, Extracted=0.0\n", - "Section 54: Dataland=0, Extracted=0.0\n", - 
"Section 55: Dataland=0, Extracted=0.0\n", - "Section 56: Dataland=0, Extracted=0.0\n", - "Section 57: Dataland=0, Extracted=0.0\n", - "Section 58: Dataland=0, Extracted=0.0\n", - "Section 59: Dataland=0, Extracted=0.0\n", - "Section 60: Dataland=0, Extracted=0.0\n", - "Section 61: Dataland=0, Extracted=0.0\n", - "Section 62: Dataland=0, Extracted=0.0\n", - "Section 63: Dataland=0, Extracted=0.0\n", - "Section 64: Dataland=0, Extracted=0.0\n", - "Section 65: Dataland=0, Extracted=0.0\n", - "Section 66: Dataland=100, Extracted=100.0\n", - "Section 67: Dataland=100, Extracted=100.0\n", - "Section 68: Dataland=0, Extracted=0.0\n", - "Section 69: Dataland=100, Extracted=100.0\n", - "Section 70: Dataland=100, Extracted=100.0\n", - "Section 71: Dataland=0, Extracted=0.0\n", - "Section 72: Dataland=0, Extracted=0.0\n", - "Section 73: Dataland=0, Extracted=0.0\n", - "Section 74: Dataland=0, Extracted=0.0\n", - "Section 75: Dataland=0, Extracted=0.0\n", - "Section 76: Dataland=0, Extracted=0.0\n", - "Section 77: Dataland=0, Extracted=0.0\n", - "Section 78: Dataland=0, Extracted=0.0\n", - "Section 79: Dataland=0, Extracted=0.0\n", - "Section 80: Dataland=0, Extracted=0.0\n", - "Section 81: Dataland=0, Extracted=0.0\n", - "Section 82: Dataland=0, Extracted=0.0\n", - "Section 83: Dataland=0, Extracted=0.0\n", - "Section 84: Dataland=0, Extracted=0.0\n", - "Section 85: Dataland=0, Extracted=0.0\n", - "Section 86: Dataland=0, Extracted=0.0\n", - "Section 87: Dataland=0, Extracted=0.0\n", - "Section 88: Dataland=0, Extracted=0.0\n", - "Section 89: Dataland=0, Extracted=0.0\n", - "Section 90: Dataland=100, Extracted=100.0\n", - "Section 91: Dataland=100, Extracted=100.0\n", - "Section 92: Dataland=0, Extracted=0.0\n", - "Section 93: Dataland=100, Extracted=100.0\n", - "Section 94: Dataland=100, Extracted=100.0\n", - "Section 95: Dataland=0, Extracted=0.0\n", - "Section 96: Dataland=0, Extracted=0.0\n", - "Section 97: Dataland=0, Extracted=0.0\n", - "Section 98: Dataland=0, 
Extracted=0.0\n", - "Section 99: Dataland=0, Extracted=0.0\n", - "Section 100: Dataland=0, Extracted=0.0\n", - "Section 101: Dataland=0, Extracted=0.0\n", - "Section 102: Dataland=0, Extracted=0.0\n", - "Section 103: Dataland=0, Extracted=0.0\n", - "Section 104: Dataland=0, Extracted=0.0\n", - "Section 105: Dataland=0, Extracted=0.0\n", - "Section 106: Dataland=0, Extracted=0.0\n", - "Section 107: Dataland=0, Extracted=0.0\n", - "Section 108: Dataland=0, Extracted=0.0\n", - "Section 109: Dataland=0, Extracted=0.0\n", - "Section 110: Dataland=0, Extracted=0.0\n", - "Section 111: Dataland=0, Extracted=0.0\n", - "Section 112: Dataland=0, Extracted=0.0\n", - "Section 113: Dataland=0, Extracted=0.0\n", - "Section 114: Dataland=7.82, Extracted=7.82\n", - "Section 115: Dataland=7.82, Extracted=7.82\n", - "Section 116: Dataland=0, Extracted=0.0\n", - "Section 117: Dataland=7.82, Extracted=7.82\n", - "Section 118: Dataland=7.82, Extracted=7.82\n", - "Section 119: Dataland=0, Extracted=0.0\n", - "Section 120: Dataland=0, Extracted=0.0\n", - "Section 121: Dataland=0, Extracted=0.0\n", - "Section 122: Dataland=0, Extracted=0.0\n", - "Section 123: Dataland=0, Extracted=0.0\n", - "Section 124: Dataland=0, Extracted=0.0\n", - "Section 125: Dataland=0, Extracted=0.0\n", - "Section 126: Dataland=0, Extracted=0.0\n", - "Section 127: Dataland=0, Extracted=0.0\n", - "Section 128: Dataland=0, Extracted=0.0\n", - "Section 129: Dataland=0, Extracted=0.0\n", - "Section 130: Dataland=0, Extracted=0.0\n", - "Section 131: Dataland=0, Extracted=0.0\n", - "Section 132: Dataland=0, Extracted=0.0\n", - "Section 133: Dataland=0, Extracted=0.0\n", - "Section 134: Dataland=0, Extracted=0.0\n", - "Section 135: Dataland=0, Extracted=0.0\n", - "Section 136: Dataland=0, Extracted=0.0\n", - "Section 137: Dataland=0, Extracted=0.0\n", - "Section 138: Dataland=7.82, Extracted=7.82\n", - "Section 139: Dataland=7.82, Extracted=7.82\n", - "Section 140: Dataland=0, Extracted=0.0\n", - "Section 141: 
Dataland=7.82, Extracted=7.82\n", - "Section 142: Dataland=7.82, Extracted=7.82\n", - "Section 143: Dataland=0, Extracted=0.0\n", - "Section 144: Dataland=0, Extracted=0.0\n", - "Section 145: Dataland=0, Extracted=0.0\n", - "Section 146: Dataland=0, Extracted=0.0\n", - "Section 147: Dataland=0, Extracted=0.0\n", - "Section 148: Dataland=0, Extracted=0.0\n", - "Section 149: Dataland=0, Extracted=0.0\n", - "Section 150: Dataland=4.17, Extracted=4.17\n", - "Section 151: Dataland=4.17, Extracted=4.17\n", - "Section 152: Dataland=0, Extracted=0.0\n", - "Section 153: Dataland=0, Extracted=0.0\n", - "Section 154: Dataland=0, Extracted=0.0\n", - "Section 155: Dataland=0, Extracted=0.0\n", - "Section 156: Dataland=0, Extracted=0.0\n", - "Section 157: Dataland=0, Extracted=0.0\n", - "Section 158: Dataland=4.17, Extracted=4.17\n", - "Section 159: Dataland=4.17, Extracted=4.17\n", - "Matching ratio: 100.00%\n" + "Company: Aktiebolaget Electrolux\n", + "Matching ratio: 0.00%\n" ] } ], @@ -439,7 +301,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.0" + "version": "3.12.6" } }, "nbformat": 4, diff --git a/src/dataland_qa_lab/dataland/data_provider.py b/src/dataland_qa_lab/dataland/data_provider.py index 3072317..e07916d 100644 --- a/src/dataland_qa_lab/dataland/data_provider.py +++ b/src/dataland_qa_lab/dataland/data_provider.py @@ -14,87 +14,126 @@ def get_yes_no_values_by_data(data: NuclearAndGasDataCollection) -> dict[str, YesNo | None]: """Get Yes/No values of the given dataset as a dictionary with section names as keys.""" - sections = data.yes_no_data_points + try: + sections = data.yes_no_data_points + + section_values = { + key: (data.datapoint.value if data and data.datapoint and data.datapoint.value is not None else None) + for key, data in sections.items() + } + except (AttributeError, KeyError, TypeError) as e: + msg = f"Error retrieving yes/no values: {e}" + raise AttributeError(msg) from e - section_values = { 
- key: (data.datapoint.value if data and data.datapoint and data.datapoint.value is not None else None) - for key, data in sections.items() - } return section_values def get_taxonomy_aligned_revenue_denominator_values_by_data(data: NuclearAndGasDataCollection) -> dict: """Retrieve taxonomy-aligned revenue denominator values from the dataset.""" denominator_values_dict = {} - denominator_values = data.taxonomy_aligned_denominator.get("taxonomy_aligned_revenue_denominator").datapoint.value - for field_name in NuclearAndGasAlignedDenominator.model_fields: - denominator_values_dict[field_name] = extract_field_data(denominator_values, field_name) + try: + denominator_values = data.taxonomy_aligned_denominator.get( + "taxonomy_aligned_revenue_denominator" + ).datapoint.value + for field_name in NuclearAndGasAlignedDenominator.model_fields: + denominator_values_dict[field_name] = extract_field_data(denominator_values, field_name) + except (AttributeError, KeyError, TypeError) as e: + msg = f"Error retrieving taxonomy-aligned revenue denominator: {e}" + raise AttributeError(msg) from e + return denominator_values_dict def get_taxonomy_aligned_capex_denominator_values_by_data(data: NuclearAndGasDataCollection) -> dict: """Retrieve taxonomy-aligned capex denominator values from the dataset.""" denominator_values_dict = {} - denominator_values = data.taxonomy_aligned_denominator.get("taxonomy_aligned_capex_denominator").datapoint.value - for field_name in NuclearAndGasAlignedDenominator.model_fields: - denominator_values_dict[field_name] = extract_field_data(denominator_values, field_name) + try: + denominator_values = data.taxonomy_aligned_denominator.get("taxonomy_aligned_capex_denominator").datapoint.value + for field_name in NuclearAndGasAlignedDenominator.model_fields: + denominator_values_dict[field_name] = extract_field_data(denominator_values, field_name) + except (AttributeError, KeyError, TypeError) as e: + msg = f"Error retrieving taxonomy-aligned capex 
denominator: {e}" + raise AttributeError(msg) from e return denominator_values_dict def get_taxonomy_aligned_revenue_numerator_values_by_data(data: NuclearAndGasDataCollection) -> dict: """Retrieve taxonomy-aligned revenue numerator values from the dataset.""" numerator_values_dict = {} - numerator_values = data.taxonomy_aligned_numerator.get("taxonomy_aligned_revenue_numerator").datapoint.value - for field_name in NuclearAndGasAlignedNumerator.model_fields: - numerator_values_dict[field_name] = extract_field_data(numerator_values, field_name) + try: + numerator_values = data.taxonomy_aligned_numerator.get("taxonomy_aligned_revenue_numerator").datapoint.value + for field_name in NuclearAndGasAlignedNumerator.model_fields: + numerator_values_dict[field_name] = extract_field_data(numerator_values, field_name) + except (AttributeError, KeyError, TypeError) as e: + msg = f"Error retrieving taxonomy-aligned revenue numerator: {e}" + raise AttributeError(msg) from e return numerator_values_dict def get_taxonomy_aligned_capex_numerator_values_by_data(data: NuclearAndGasDataCollection) -> dict: """Retrieve taxonomy-aligned capex numerator values from the dataset.""" numerator_values_dict = {} - numerator_values = data.taxonomy_aligned_numerator.get("taxonomy_aligned_capex_numerator").datapoint.value - for field_name in NuclearAndGasAlignedNumerator.model_fields: - numerator_values_dict[field_name] = extract_field_data(numerator_values, field_name) + try: + numerator_values = data.taxonomy_aligned_numerator.get("taxonomy_aligned_capex_numerator").datapoint.value + for field_name in NuclearAndGasAlignedNumerator.model_fields: + numerator_values_dict[field_name] = extract_field_data(numerator_values, field_name) + except (AttributeError, KeyError, TypeError) as e: + msg = f"Error retrieving taxonomy-aligned capex numerator: {e}" + raise AttributeError(msg) from e return numerator_values_dict def get_taxonomy_eligible_but_not_aligned_revenue_values_by_data(data: 
NuclearAndGasDataCollection) -> dict: """Retrieve taxonomy eligible but not aligned revenue numerator values from the dataset.""" eligible_but_not_aligned_dict = {} - eligible_values = data.taxonomy_eligble_but_not_aligned.get("taxonomy_not_aligned_revenue").datapoint.value - for field_name in NuclearAndGasEligibleButNotAligned.model_fields: - eligible_but_not_aligned_dict[field_name] = extract_field_data(eligible_values, field_name) + try: + eligible_values = data.taxonomy_eligble_but_not_aligned.get("taxonomy_not_aligned_revenue").datapoint.value + for field_name in NuclearAndGasEligibleButNotAligned.model_fields: + eligible_but_not_aligned_dict[field_name] = extract_field_data(eligible_values, field_name) + except (AttributeError, KeyError, TypeError) as e: + msg = f"Error retrieving taxonomy eligible but not aligned revenue: {e}" + raise AttributeError(msg) from e return eligible_but_not_aligned_dict def get_taxonomy_eligible_but_not_aligned_capex_values_by_data(data: NuclearAndGasDataCollection) -> dict: """Retrieve taxonomy eligible but not aligned capex from the dataset.""" eligible_but_not_aligned_dict = {} - eligible_values = data.taxonomy_eligble_but_not_aligned.get("taxonomy_not_aligned_capex").datapoint.value - for field_name in NuclearAndGasEligibleButNotAligned.model_fields: - eligible_but_not_aligned_dict[field_name] = extract_field_data(eligible_values, field_name) + try: + eligible_values = data.taxonomy_eligble_but_not_aligned.get("taxonomy_not_aligned_capex").datapoint.value + for field_name in NuclearAndGasEligibleButNotAligned.model_fields: + eligible_but_not_aligned_dict[field_name] = extract_field_data(eligible_values, field_name) + except (AttributeError, KeyError, TypeError) as e: + msg = f"Error retrieving taxonomy eligible but not aligned capex: {e}" + raise AttributeError(msg) from e return eligible_but_not_aligned_dict def get_taxonomy_non_eligible_revenue_values_by_data(data: NuclearAndGasDataCollection) -> dict: - """Retrieve taxonomy 
non eligible revenue numerator values from the dataset.""" + """Retrieve taxonomy non-eligible revenue numerator values from the dataset.""" non_eligible_dict = {} - non_eligible_values = data.taxonomy_non_eligible.get("taxonomy_non_eligible_revenue").datapoint.value - for field_name in NuclearAndGasNonEligible.model_fields: - value = getattr(non_eligible_values, field_name, None) - non_eligible_dict[field_name] = -1 if value is None else value - + try: + non_eligible_values = data.taxonomy_non_eligible.get("taxonomy_non_eligible_revenue").datapoint.value + for field_name in NuclearAndGasNonEligible.model_fields: + value = getattr(non_eligible_values, field_name, None) + non_eligible_dict[field_name] = -1 if value is None else value + except (AttributeError, KeyError, TypeError) as e: + msg = f"Error retrieving taxonomy non-eligible revenue: {e}" + raise AttributeError(msg) from e return non_eligible_dict def get_taxonomy_non_eligible_capex_values_by_data(data: NuclearAndGasDataCollection) -> dict: - """Retrieve taxonomy non eligible capex numerator values from the dataset.""" + """Retrieve taxonomy non-eligible capex numerator values from the dataset.""" non_eligible_dict = {} - non_eligible_values = data.taxonomy_non_eligible.get("taxonomy_non_eligible_capex").datapoint.value - for field_name in NuclearAndGasNonEligible.model_fields: - value = getattr(non_eligible_values, field_name, None) - non_eligible_dict[field_name] = -1 if value is None else value + try: + non_eligible_values = data.taxonomy_non_eligible.get("taxonomy_non_eligible_capex").datapoint.value + for field_name in NuclearAndGasNonEligible.model_fields: + value = getattr(non_eligible_values, field_name, None) + non_eligible_dict[field_name] = -1 if value is None else value + except (AttributeError, KeyError, TypeError) as e: + msg = f"Error retrieving taxonomy non-eligible capex: {e}" + raise AttributeError(msg) from e return non_eligible_dict @@ -144,7 +183,6 @@ def 
get_datasources_of_nuclear_and_gas_numeric_values( section_list = { key: data_source for section in sections.values() for key, data_source in extract_data_source(section).items() } - return section_list diff --git a/src/dataland_qa_lab/dataland/unreviewed_datasets.py b/src/dataland_qa_lab/dataland/unreviewed_datasets.py index a271aca..c8e9e13 100644 --- a/src/dataland_qa_lab/dataland/unreviewed_datasets.py +++ b/src/dataland_qa_lab/dataland/unreviewed_datasets.py @@ -15,7 +15,10 @@ def __init__(self) -> None: """Initialize the unreviewed datasets with the data from the API.""" client = config.get_config().dataland_client logger.info(msg="Initializing the unreviewed Datasets with the data from Dataland.") - + if client is None: + logger.exception("Client Setup failed in the configuration.") + msg = "Client Setup failed in the configuration." + raise ValueError(msg) try: number_of_datasets = client.qa_api.get_number_of_pending_datasets() if number_of_datasets is None or number_of_datasets < 0: @@ -29,6 +32,9 @@ def __init__(self) -> None: self.list_of_data_ids = [dataset.data_id for dataset in self.datasets] + except RuntimeError: + logger.exception("Timeout occurred while initializing the unreviewed datasets.") + raise except Exception: logger.exception(msg="An error occurred", exc_info=Exception) raise diff --git a/src/dataland_qa_lab/pages/pages_provider.py b/src/dataland_qa_lab/pages/pages_provider.py index a0433ff..447c87b 100644 --- a/src/dataland_qa_lab/pages/pages_provider.py +++ b/src/dataland_qa_lab/pages/pages_provider.py @@ -16,14 +16,16 @@ def get_relevant_page_numbers(dataset: NuclearAndGasDataCollection) -> list[int] return sorted(set(yes_no_pages + numeric_pages)) -def get_relevant_pages_of_pdf(dataset: NuclearAndGasDataCollection) -> pypdf.PdfReader: +def get_relevant_pages_of_pdf(dataset: NuclearAndGasDataCollection) -> pypdf.PdfReader | None: """Get page numbers of relevant data.""" dataland_client = config.get_config().dataland_client page_numbers 
= get_relevant_page_numbers(dataset=dataset) - file_reference = dataset.yes_no_data_points.get( - "nuclear_energy_related_activities_section426" - ).datapoint.data_source.file_reference + try: + datapoint = dataset.yes_no_data_points.get("nuclear_energy_related_activities_section426").datapoint + file_reference = datapoint.data_source.file_reference + except AttributeError: + return None full_pdf = dataland_client.documents_api.get_document(file_reference) full_pdf_stream = io.BytesIO(full_pdf) diff --git a/src/dataland_qa_lab/pages/text_to_doc_intelligence.py b/src/dataland_qa_lab/pages/text_to_doc_intelligence.py index 51503b7..ae683a7 100644 --- a/src/dataland_qa_lab/pages/text_to_doc_intelligence.py +++ b/src/dataland_qa_lab/pages/text_to_doc_intelligence.py @@ -43,6 +43,9 @@ def get_markdown_from_dataset(data_id: str, relevant_pages_pdf_reader: pypdf.Pdf else: readable_text = extract_text_of_pdf(relevant_pages_pdf_reader) + if readable_text is None: + return None + new_document = ReviewedDatasetMarkdowns( data_id=data_id, markdown_text=readable_text, diff --git a/src/dataland_qa_lab/prompting_services/prompting_service.py b/src/dataland_qa_lab/prompting_services/prompting_service.py index a1d4b8d..019cd5f 100644 --- a/src/dataland_qa_lab/prompting_services/prompting_service.py +++ b/src/dataland_qa_lab/prompting_services/prompting_service.py @@ -21,7 +21,7 @@ def create_main_prompt(template: int, pdf: str, kpi: str) -> str: "Taxonomy-aligned economic activities (denominator)", give me the percentage of "CCM+CCA", "CCM" and "CCA" for all rows. Focus on the row numbers on the left side of the table. - If you can't find the percentage value, write "0". + If you can't find the percentage value, write "-1". Consider translating for this given task like Meldebogen instead of template. 
# Relevant Documents {pdf} @@ -31,7 +31,7 @@ def create_main_prompt(template: int, pdf: str, kpi: str) -> str: "Taxonomy-aligned economic activities (numerator)", give me the percentage of "CCM+CCA", "CCM" and "CCA" for all rows. Focus on the row numbers on the left side of the table. - If you can't find the percentage value, write "0". + If you can't find the percentage value, write "-1". Consider translating for this given task like Meldebogen instead of template. # Relevant Documents {pdf} @@ -41,7 +41,7 @@ def create_main_prompt(template: int, pdf: str, kpi: str) -> str: "Taxonomy-eligible but not taxonomy-aligned economic activities", give me the percentage of "CCM+CCA", "CCM" and "CCA" for all rows. Focus on the row numbers on the left side of the table. - If you can't find the percentage value, write "0". + If you can't find the percentage value, write "-1". Consider translating for this given task like Meldebogen instead of template. # Relevant Documents {pdf} @@ -51,7 +51,7 @@ def create_main_prompt(template: int, pdf: str, kpi: str) -> str: "Taxonomy non-eligible economic activities", give me the percentage for all rows. Focus on the row numbers on the left side of the table. - If you can't find the percentage value, write "0". + If you can't find the percentage value, write "-1". Consider translating for this given task like Meldebogen instead of template. 
# Relevant Documents {pdf} diff --git a/src/dataland_qa_lab/review/dataset_reviewer.py b/src/dataland_qa_lab/review/dataset_reviewer.py index 25adef8..fa7411d 100644 --- a/src/dataland_qa_lab/review/dataset_reviewer.py +++ b/src/dataland_qa_lab/review/dataset_reviewer.py @@ -45,15 +45,22 @@ def review_dataset(data_id: str, single_pass_e2e: bool = False) -> QaReportMetaI logger.debug("Relevant page numbers extracted.") relevant_pages_pdf_reader = pages_provider.get_relevant_pages_of_pdf(data_collection) - logger.debug("Relevant pages extracted.") + if relevant_pages_pdf_reader is None: + logger.debug("No Data source found for the relevant pages.") + report = NuclearAndGasReportGenerator().generate_report(relevant_pages=None, dataset=data_collection) + logger.info("QA not attempted report generated successfully.") - readable_text = text_to_doc_intelligence.get_markdown_from_dataset( - data_id=data_id, page_numbers=page_numbers, relevant_pages_pdf_reader=relevant_pages_pdf_reader - ) - logger.debug("Text extracted from the relevant pages.") - - report = NuclearAndGasReportGenerator().generate_report(relevant_pages=readable_text, dataset=data_collection) - logger.info("Report generated succesfully.") + else: + logger.debug("Relevant pages extracted.") + readable_text = text_to_doc_intelligence.get_markdown_from_dataset( + data_id=data_id, page_numbers=page_numbers, relevant_pages_pdf_reader=relevant_pages_pdf_reader + ) + logger.debug("Text extracted from the relevant pages.") + + report = NuclearAndGasReportGenerator().generate_report( + relevant_pages=readable_text, dataset=data_collection + ) + logger.info("Report generated succesfully.") data = config.get_config().dataland_client.eu_taxonomy_nuclear_gas_qa_api.post_nuclear_and_gas_data_qa_report( data_id=data_id, nuclear_and_gas_data=report diff --git a/src/dataland_qa_lab/review/generate_gpt_request.py b/src/dataland_qa_lab/review/generate_gpt_request.py index 0283413..535df9f 100644 --- 
a/src/dataland_qa_lab/review/generate_gpt_request.py +++ b/src/dataland_qa_lab/review/generate_gpt_request.py @@ -21,38 +21,65 @@ def generate_gpt_request(mainprompt: str, subprompt: str) -> list: Returns: List[str]: A list of extracted values from the GPT response. + + Raises: + ValueError: For any issues encountered during the process. """ - conf = config.get_config() - - client = AzureOpenAI( - api_key=conf.azure_openai_api_key, - api_version="2024-07-01-preview", - azure_endpoint=conf.azure_openai_endpoint, - ) - updated_openai_response = client.chat.completions.create( - model="gpt-4o", - temperature=0, - messages=[ - {"role": "system", "content": mainprompt}, - ], - tool_choice="required", - tools=[ - { - "type": "function", - "function": { - "name": "requested_information_precisely_found_in_relevant_documents", - "description": "Submit the requested information. " - "Use this function when the information is precisely stated in the relevant documents.", - "parameters": subprompt, - }, - } - ], - ) - if updated_openai_response.choices[0].message.tool_calls: - tool_call = updated_openai_response.choices[0].message.tool_calls[0].function - else: - msg_p = "No tool calls found in the GPT response." 
- logger.exception(msg=msg_p, exc_info=ValueError) - raise ValueError(msg_p) - data_dict = ast.literal_eval(tool_call.arguments) - return list(data_dict.values()) + try: + try: + conf = config.get_config() + except Exception as e: + msg = f"Error loading configuration in Gpt_request generator: {e}" + raise ValueError(msg) from e + + # Initialize Azure OpenAI client + try: + client = AzureOpenAI( + api_key=conf.azure_openai_api_key, + api_version="2024-07-01-preview", + azure_endpoint=conf.azure_openai_endpoint, + ) + except Exception as e: + msg = f"Error initializing AzureOpenAI client: {e}" + raise ValueError(msg) from e + + # Create GPT request + try: + updated_openai_response = client.chat.completions.create( + model="gpt-4o", + temperature=0, + messages=[ + {"role": "system", "content": mainprompt}, + ], + tool_choice="required", + tools=[ + { + "type": "function", + "function": { + "name": "requested_information_precisely_found_in_relevant_documents", + "description": "Submit the requested information. 
" + "Use this function when the information is precisely stated in the relevant documents.", + "parameters": subprompt, + }, + } + ], + ) + except Exception as e: + msg = f"Error during GPT request creation: {e}" + raise ValueError(msg) from e + + try: + if updated_openai_response.choices[0].message.tool_calls: + tool_call = updated_openai_response.choices[0].message.tool_calls[0].function + except Exception as e: + msg = f"Error extracting tool calls: {e}" + raise ValueError(e) from e + + data_dict = ast.literal_eval(tool_call.arguments) + + return list(data_dict.values()) + + except (ValueError, KeyError, TypeError) as general_error: + # General error handling + msg = f"An unexpected error occurred: {general_error}" + raise ValueError(msg) from general_error diff --git a/src/dataland_qa_lab/review/numeric_value_generator.py b/src/dataland_qa_lab/review/numeric_value_generator.py index c10d3aa..ae69705 100644 --- a/src/dataland_qa_lab/review/numeric_value_generator.py +++ b/src/dataland_qa_lab/review/numeric_value_generator.py @@ -1,4 +1,4 @@ -from azure.ai.documentintelligence.models import AnalyzeResult +import re from dataland_qa_lab.prompting_services import prompting_service from dataland_qa_lab.review import generate_gpt_request @@ -7,58 +7,76 @@ class NumericValueGenerator: """Extracts and stores all values of template 2 to 5 and compares them to the values in dataland.""" + TEMPLATE_ID_5 = 5 + + @staticmethod + def get_taxonomy_aligned_denominator(readable_text: str, kpi: str) -> list: + """Extracts information from template 2 using Azure OpenAI and returns a list of results.""" + return NumericValueGenerator.extract_values_from_template(2, readable_text, kpi) + + @staticmethod + def get_taxonomy_aligned_numerator(readable_text: str, kpi: str) -> list: + """Extracts information from template 3 using Azure OpenAI and returns a list of results.""" + return NumericValueGenerator.extract_values_from_template(3, readable_text, kpi) + + @staticmethod + def 
get_taxonomy_eligible_not_alligned(readable_text: str, kpi: str) -> list: + """Extracts information from template 4 using Azure OpenAI and returns a list of results.""" + return NumericValueGenerator.extract_values_from_template(4, readable_text, kpi) + @staticmethod - def get_taxonomy_alligned_denominator(readable_text: AnalyzeResult, kpi: str) -> list: - """Extracts information from template 2 using Azure OpenAI and returns a list of results. - - Returns: - list: A list including the etracted values of template 2 - """ - dominator_values = generate_gpt_request.GenerateGptRequest.generate_gpt_request( - prompting_service.PromptingService.create_main_prompt(2, readable_text, kpi), - prompting_service.PromptingService.create_sub_prompt_template2to4(kpi), - ) - float_results = [float(value) for value in dominator_values] - return float_results + def get_taxonomy_non_eligible(readable_text: str, kpi: str) -> list: + """Extracts information from template 5 using Azure OpenAI and returns a list of results.""" + return NumericValueGenerator.extract_values_from_template(5, readable_text, kpi) @staticmethod - def get_taxonomy_alligned_numerator(readable_text: AnalyzeResult, kpi: str) -> list: - """Extracts information from template 3 using Azure OpenAI and returns a list of results. - - Returns: - list: A list including the etracted values of template 3. 
- """ - numerator_values = generate_gpt_request.GenerateGptRequest.generate_gpt_request( - prompting_service.PromptingService.create_main_prompt(3, readable_text, kpi), - prompting_service.PromptingService.create_sub_prompt_template2to4(kpi), - ) - float_results = [float(value) for value in numerator_values] - return float_results + def extract_values_from_template(template_id: int, readable_text: str, kpi: str) -> list: + """Generic method to extract values from a given template using Azure OpenAI.""" + try: + prompt_method = ( + prompting_service.PromptingService.create_sub_prompt_template5 + if template_id == NumericValueGenerator.TEMPLATE_ID_5 + else prompting_service.PromptingService.create_sub_prompt_template2to4 + ) + + values = generate_gpt_request.GenerateGptRequest.generate_gpt_request( + prompting_service.PromptingService.create_main_prompt(template_id, readable_text, kpi), + prompt_method(kpi), + ) + + if not values: + msg = f"No results returned from GPT for template {template_id} values." + NumericValueGenerator.throw_error(msg) + + return NumericValueGenerator.convert_to_float(values, template_id) + except ValueError as e: + msg = f"Error extracting values from template {template_id}: {e}" + raise ValueError(msg) from e @staticmethod - def get_taxonomy_eligible_not_alligned(readable_text: AnalyzeResult, kpi: str) -> list: - """Extracts information from template 4 using Azure OpenAI and returns a list of results. - - Returns: - list: A list including the etracted values of template 4. 
- """ - eligible_values = generate_gpt_request.GenerateGptRequest.generate_gpt_request( - prompting_service.PromptingService.create_main_prompt(4, readable_text, kpi), - prompting_service.PromptingService.create_sub_prompt_template2to4(kpi), - ) - float_results = [float(value) for value in eligible_values] - return float_results + def throw_error(msg: str) -> ValueError: + """Raises a ValueError with the given message.""" + raise ValueError(msg) @staticmethod - def get_taxonomy_non_eligible(readable_text: AnalyzeResult, kpi: str) -> list: - """Extracts information from template 5 using Azure OpenAI and returns a list of results. - - Returns: - list: A list including the extracted values of template 5. - """ - non_eligible_values = generate_gpt_request.GenerateGptRequest.generate_gpt_request( - prompting_service.PromptingService.create_main_prompt(5, readable_text, kpi), - prompting_service.PromptingService.create_sub_prompt_template5(kpi), - ) - float_results = [float(value) for value in non_eligible_values] - return float_results + def convert_to_float(values: list, template_id: int) -> list: + """Converts extracted values to floats.""" + try: + return [NumericValueGenerator.extract_number(value) for value in values] + except Exception as e: + msg = f"Unexpected error during float conversion for template {template_id}: {e}" + raise ValueError(msg) from e + + @staticmethod + def extract_number(value: str) -> float: + """Extracts the first numeric part from a string and converts it to a float.""" + if isinstance(value, float | int): # Directly return if it's already numeric + return float(value) + + # Safe regex: Match optional negative sign, then digits, optional dot, and more digits + match = re.search(r"-?\d+(?:\.\d+)?", value) + if match: + return float(match.group(0)) # Convert directly to float + + msg = f"Could not extract a valid number from '{value}'" + raise ValueError(msg) diff --git 
a/src/dataland_qa_lab/review/report_generator/denominator_report_generator.py b/src/dataland_qa_lab/review/report_generator/denominator_report_generator.py index 67e78e1..0ae32b1 100644 --- a/src/dataland_qa_lab/review/report_generator/denominator_report_generator.py +++ b/src/dataland_qa_lab/review/report_generator/denominator_report_generator.py @@ -1,4 +1,3 @@ -from azure.ai.documentintelligence.models import AnalyzeResult from dataland_qa.models.extended_data_point_nuclear_and_gas_aligned_denominator import ( ExtendedDataPointNuclearAndGasAlignedDenominator, ) @@ -19,7 +18,7 @@ def build_taxonomy_aligned_denominator_report( - dataset: NuclearAndGasDataCollection, relevant_pages: AnalyzeResult + dataset: NuclearAndGasDataCollection, relevant_pages: str ) -> NuclearAndGasGeneralTaxonomyAlignedDenominator: """Create a report frame for the Nuclear and Gas General Taxonomy Aligned Denominator.""" return NuclearAndGasGeneralTaxonomyAlignedDenominator( @@ -31,11 +30,19 @@ def build_taxonomy_aligned_denominator_report( def build_denominator_report_frame( - dataset: NuclearAndGasDataCollection, relevant_pages: AnalyzeResult, kpi: str + dataset: NuclearAndGasDataCollection, relevant_pages: str, kpi: str ) -> QaReportDataPointExtendedDataPointNuclearAndGasAlignedDenominator: """Build a report frame for a specific KPI denominator (Revenue or CapEx).""" - prompted_values = NumericValueGenerator.get_taxonomy_alligned_denominator(relevant_pages, kpi) - dataland_values = get_dataland_values(dataset, kpi) + if relevant_pages is None: + return create_not_attempted_report("No relevant pages found") + try: + prompted_values = NumericValueGenerator.get_taxonomy_aligned_denominator(relevant_pages, kpi) + except ValueError: + return create_not_attempted_report("Error retrieving prompted values for template 2") + try: + dataland_values = get_dataland_values(dataset, kpi) + except RuntimeError: + return create_not_attempted_report("Error retrieving dataland values for template 2") 
corrected_values, verdict, comment, quality = comparator.compare_values_template_2to4( prompted_values, dataland_values, NuclearAndGasAlignedDenominator @@ -58,12 +65,27 @@ def build_denominator_report_frame( ) +def create_not_attempted_report( + error_message: str, +) -> QaReportDataPointExtendedDataPointNuclearAndGasAlignedDenominator: + """Create a not attempted report frame for the Nuclear and Gas General Taxonomy Aligned Denominator.""" + return QaReportDataPointExtendedDataPointNuclearAndGasAlignedDenominator( + comment=error_message, + verdict=QaReportDataPointVerdict.QANOTATTEMPTED, + correctedData=ExtendedDataPointNuclearAndGasAlignedDenominator(), + ) + + def get_dataland_values(dataset: NuclearAndGasDataCollection, kpi: str) -> dict: """Retrieve dataland denominator values based on KPI.""" - if kpi == "Revenue": - data = data_provider.get_taxonomy_aligned_revenue_denominator_values_by_data(dataset) - else: - data = data_provider.get_taxonomy_aligned_capex_denominator_values_by_data(dataset) + try: + if kpi == "Revenue": + data = data_provider.get_taxonomy_aligned_revenue_denominator_values_by_data(dataset) + else: + data = data_provider.get_taxonomy_aligned_capex_denominator_values_by_data(dataset) + except Exception as e: + msg = f"Error retrieving dataland values for {kpi}: {e}" + raise RuntimeError(msg) from e return data diff --git a/src/dataland_qa_lab/review/report_generator/eligible_not_aligned_report_generator.py b/src/dataland_qa_lab/review/report_generator/eligible_not_aligned_report_generator.py index efb9f6e..e35edfa 100644 --- a/src/dataland_qa_lab/review/report_generator/eligible_not_aligned_report_generator.py +++ b/src/dataland_qa_lab/review/report_generator/eligible_not_aligned_report_generator.py @@ -1,4 +1,3 @@ -from azure.ai.documentintelligence.models import AnalyzeResult from dataland_qa.models.extended_data_point_nuclear_and_gas_eligible_but_not_aligned import ( ExtendedDataPointNuclearAndGasEligibleButNotAligned, ) @@ -19,7 +18,7 
@@ def build_taxonomy_eligible_but_not_aligned_report( - dataset: NuclearAndGasDataCollection, relevant_pages: AnalyzeResult + dataset: NuclearAndGasDataCollection, relevant_pages: str ) -> NuclearAndGasGeneralTaxonomyEligibleButNotAligned: """Create Report Frame for the Nuclear and Gas General Taxonomy eligible but not alinged data.""" return NuclearAndGasGeneralTaxonomyEligibleButNotAligned( @@ -33,12 +32,19 @@ def build_taxonomy_eligible_but_not_aligned_report( def build_eligible_but_not_aligned_frame( - dataset: NuclearAndGasDataCollection, relevant_pages: AnalyzeResult, kpi: str + dataset: NuclearAndGasDataCollection, relevant_pages: str, kpi: str ) -> QaReportDataPointExtendedDataPointNuclearAndGasEligibleButNotAligned: """Build a report frame for a specific KPI (Revenue or CapEx).""" - prompted_values = NumericValueGenerator.get_taxonomy_eligible_not_alligned(relevant_pages, kpi) - dataland_values = get_dataland_values(dataset, kpi) - + if relevant_pages is None: + return create_not_attempted_report("No relevant pages found") + try: + prompted_values = NumericValueGenerator.get_taxonomy_eligible_not_alligned(relevant_pages, kpi) + except ValueError: + return create_not_attempted_report("Error retrieving prompted values for template 4") + try: + dataland_values = get_dataland_values(dataset, kpi) + except RuntimeError: + return create_not_attempted_report("Error retrieving dataland values for template 4") corrected_values, verdict, comment, quality = comparator.compare_values_template_2to4( prompted_values, dataland_values, NuclearAndGasEligibleButNotAligned ) @@ -57,13 +63,27 @@ def build_eligible_but_not_aligned_frame( ) +def create_not_attempted_report( + error_message: str, +) -> QaReportDataPointExtendedDataPointNuclearAndGasEligibleButNotAligned: + """Create a not attempted report for the Nuclear and Gas General Taxonomy eligible but not aligned Denominator.""" + return QaReportDataPointExtendedDataPointNuclearAndGasEligibleButNotAligned( + 
comment=error_message, + verdict=QaReportDataPointVerdict.QANOTATTEMPTED, + correctedData=ExtendedDataPointNuclearAndGasEligibleButNotAligned(), + ) + + def get_dataland_values(dataset: NuclearAndGasDataCollection, kpi: str) -> dict: """Retrieve dataland Eligible but not aligned values based on KPI.""" - if kpi == "Revenue": - data = data_provider.get_taxonomy_eligible_but_not_aligned_revenue_values_by_data(dataset) - else: - data = data_provider.get_taxonomy_eligible_but_not_aligned_capex_values_by_data(dataset) - + try: + if kpi == "Revenue": + data = data_provider.get_taxonomy_eligible_but_not_aligned_revenue_values_by_data(dataset) + else: + data = data_provider.get_taxonomy_eligible_but_not_aligned_capex_values_by_data(dataset) + except Exception as e: + msg = f"Error retrieving dataland values for {kpi}: {e}" + raise RuntimeError(msg) from e return data diff --git a/src/dataland_qa_lab/review/report_generator/non_eligible_report_generator.py b/src/dataland_qa_lab/review/report_generator/non_eligible_report_generator.py index 8308f9b..925a82a 100644 --- a/src/dataland_qa_lab/review/report_generator/non_eligible_report_generator.py +++ b/src/dataland_qa_lab/review/report_generator/non_eligible_report_generator.py @@ -1,4 +1,3 @@ -from azure.ai.documentintelligence.models import AnalyzeResult from dataland_qa.models.extended_data_point_nuclear_and_gas_non_eligible import ( ExtendedDataPointNuclearAndGasNonEligible, ) @@ -16,7 +15,7 @@ def build_taxonomy_non_eligible_report( - dataset: NuclearAndGasDataCollection, relevant_pages: AnalyzeResult + dataset: NuclearAndGasDataCollection, relevant_pages: str ) -> NuclearAndGasGeneralTaxonomyNonEligible: """Create Report Frame for the Nuclear and Gas General Taxonomy Non Eligible.""" return NuclearAndGasGeneralTaxonomyNonEligible( @@ -26,11 +25,19 @@ def build_taxonomy_non_eligible_report( def build_non_eligible_report_frame( - dataset: NuclearAndGasDataCollection, relevant_pages: AnalyzeResult, kpi: str + dataset: 
NuclearAndGasDataCollection, relevant_pages: str, kpi: str ) -> QaReportDataPointExtendedDataPointNuclearAndGasNonEligible: """Build report frame for the revenue non_eligible.""" - prompted_values = NumericValueGenerator.get_taxonomy_non_eligible(relevant_pages, kpi) - dataland_values = get_dataland_values(dataset, kpi) + if relevant_pages is None: + return create_not_attempted_report("No relevant pages found") + try: + prompted_values = NumericValueGenerator.get_taxonomy_non_eligible(relevant_pages, kpi) + except ValueError: + return create_not_attempted_report("Error retrieving prompted values for template 5") + try: + dataland_values = get_dataland_values(dataset, kpi) + except RuntimeError: + return create_not_attempted_report("Error retrieving dataland values for template 5") value, verdict, comment, quality = comparator.compare_non_eligible_values(prompted_values, dataland_values) if verdict == QaReportDataPointVerdict.QAACCEPTED: @@ -47,13 +54,25 @@ def build_non_eligible_report_frame( ) +def create_not_attempted_report(error_message: str) -> QaReportDataPointExtendedDataPointNuclearAndGasNonEligible: + """Create a not attempted report frame for the Nuclear and Gas General Non Eligible.""" + return QaReportDataPointExtendedDataPointNuclearAndGasNonEligible( + comment=error_message, + verdict=QaReportDataPointVerdict.QANOTATTEMPTED, + correctedData=ExtendedDataPointNuclearAndGasNonEligible(), + ) + + def get_dataland_values(dataset: NuclearAndGasDataCollection, kpi: str) -> dict: """Retrieve dataland non_eligible values based on KPI.""" - if kpi == "Revenue": - data = data_provider.get_taxonomy_non_eligible_revenue_values_by_data(dataset) - else: - data = data_provider.get_taxonomy_non_eligible_capex_values_by_data(dataset) - + try: + if kpi == "Revenue": + data = data_provider.get_taxonomy_non_eligible_revenue_values_by_data(dataset) + else: + data = data_provider.get_taxonomy_non_eligible_capex_values_by_data(dataset) + except Exception as e: + msg = 
f"Error retrieving dataland values for {kpi}: {e}" + raise RuntimeError(msg) from e return data diff --git a/src/dataland_qa_lab/review/report_generator/nuclear_and_gas_report_generator.py b/src/dataland_qa_lab/review/report_generator/nuclear_and_gas_report_generator.py index 70c49cc..65bb5c5 100644 --- a/src/dataland_qa_lab/review/report_generator/nuclear_and_gas_report_generator.py +++ b/src/dataland_qa_lab/review/report_generator/nuclear_and_gas_report_generator.py @@ -1,4 +1,3 @@ -from azure.ai.documentintelligence.models import AnalyzeResult from dataland_qa.models import NuclearAndGasGeneral, NuclearAndGasGeneralGeneral from dataland_qa.models.nuclear_and_gas_data import NuclearAndGasData @@ -16,12 +15,13 @@ class NuclearAndGasReportGenerator(ReportGenerator): """Generate a quality assurance report.""" - relevant_pages: AnalyzeResult + relevant_pages: str report: NuclearAndGasData - def generate_report(self, relevant_pages: AnalyzeResult, dataset: NuclearAndGasDataCollection) -> NuclearAndGasData: + def generate_report(self, relevant_pages: str | None, dataset: NuclearAndGasDataCollection) -> NuclearAndGasData: """Assemble the QA Report based on the corrected values from Azure.""" # Initialize report and relevant pages + self.relevant_pages = relevant_pages self.report = NuclearAndGasData(general=NuclearAndGasGeneral(general=NuclearAndGasGeneralGeneral())) diff --git a/src/dataland_qa_lab/review/report_generator/numerator_report_generator.py b/src/dataland_qa_lab/review/report_generator/numerator_report_generator.py index c2c4150..783a17c 100644 --- a/src/dataland_qa_lab/review/report_generator/numerator_report_generator.py +++ b/src/dataland_qa_lab/review/report_generator/numerator_report_generator.py @@ -1,4 +1,3 @@ -from azure.ai.documentintelligence.models import AnalyzeResult from dataland_qa.models.extended_data_point_nuclear_and_gas_aligned_numerator import ( ExtendedDataPointNuclearAndGasAlignedNumerator, ) @@ -19,7 +18,7 @@ def 
build_taxonomy_aligned_numerator_report( - dataset: NuclearAndGasDataCollection, relevant_pages: AnalyzeResult + dataset: NuclearAndGasDataCollection, relevant_pages: str ) -> NuclearAndGasGeneralTaxonomyAlignedNumerator: """Create Report Frame for the Nuclear and Gas General Taxonomy Aligned Numerator.""" return NuclearAndGasGeneralTaxonomyAlignedNumerator( @@ -29,12 +28,19 @@ def build_taxonomy_aligned_numerator_report( def build_numerator_report_frame( - dataset: NuclearAndGasDataCollection, relevant_pages: AnalyzeResult, kpi: str + dataset: NuclearAndGasDataCollection, relevant_pages: str, kpi: str ) -> QaReportDataPointExtendedDataPointNuclearAndGasAlignedNumerator: """Build a report frame for a specific KPI numerator (Revenue or CapEx).""" - prompted_values = NumericValueGenerator.get_taxonomy_alligned_numerator(relevant_pages, kpi) - dataland_values = get_dataland_values(dataset, kpi) - + if relevant_pages is None: + return create_not_attempted_report("No relevant pages found") + try: + prompted_values = NumericValueGenerator.get_taxonomy_aligned_numerator(relevant_pages, kpi) + except ValueError: + return create_not_attempted_report("Error retrieving prompted values for template 3") + try: + dataland_values = get_dataland_values(dataset, kpi) + except RuntimeError: + return create_not_attempted_report("Error retrieving dataland values for template 3") corrected_values, verdict, comment, quality = comparator.compare_values_template_2to4( prompted_values, dataland_values, NuclearAndGasAlignedNumerator ) @@ -53,12 +59,25 @@ def build_numerator_report_frame( ) +def create_not_attempted_report(error_message: str) -> QaReportDataPointExtendedDataPointNuclearAndGasAlignedNumerator: + """Create a not attempted report frame for the Nuclear and Gas General Taxonomy Aligned Numerator.""" + return QaReportDataPointExtendedDataPointNuclearAndGasAlignedNumerator( + comment=error_message, + verdict=QaReportDataPointVerdict.QANOTATTEMPTED, + 
correctedData=ExtendedDataPointNuclearAndGasAlignedNumerator(), + ) + + def get_dataland_values(dataset: NuclearAndGasDataCollection, kpi: str) -> dict: """Retrieve dataland numerator values based on KPI.""" - if kpi == "Revenue": - data = data_provider.get_taxonomy_aligned_revenue_numerator_values_by_data(dataset) - else: - data = data_provider.get_taxonomy_aligned_capex_numerator_values_by_data(dataset) + try: + if kpi == "Revenue": + data = data_provider.get_taxonomy_aligned_revenue_numerator_values_by_data(dataset) + else: + data = data_provider.get_taxonomy_aligned_capex_numerator_values_by_data(dataset) + except Exception as e: + msg = f"Error retrieving dataland values for {kpi}: {e}" + raise RuntimeError(msg) from e return data diff --git a/src/dataland_qa_lab/review/report_generator/yes_no_report_generator.py b/src/dataland_qa_lab/review/report_generator/yes_no_report_generator.py index 5ab1ecd..6aab195 100644 --- a/src/dataland_qa_lab/review/report_generator/yes_no_report_generator.py +++ b/src/dataland_qa_lab/review/report_generator/yes_no_report_generator.py @@ -1,5 +1,9 @@ -from azure.ai.documentintelligence.models import AnalyzeResult +from dataland_qa.models.extended_data_point_yes_no import ExtendedDataPointYesNo from dataland_qa.models.nuclear_and_gas_general_general import NuclearAndGasGeneralGeneral +from dataland_qa.models.qa_report_data_point_extended_data_point_yes_no import ( + QaReportDataPointExtendedDataPointYesNo, +) +from dataland_qa.models.qa_report_data_point_verdict import QaReportDataPointVerdict from dataland_qa_lab.dataland import data_provider from dataland_qa_lab.review import yes_no_value_generator @@ -8,14 +12,43 @@ def build_yes_no_report( - dataset: NuclearAndGasDataCollection, relevant_pages: AnalyzeResult + dataset: NuclearAndGasDataCollection, relevant_pages: str | None ) -> NuclearAndGasGeneralGeneral: """Create yes no report.""" report = NuclearAndGasGeneralGeneral() - yes_no_values = 
yes_no_value_generator.get_yes_no_values_from_report(relevant_pages) - yes_no_values_from_dataland = data_provider.get_yes_no_values_by_data(data=dataset) - data_sources = data_provider.get_datasources_of_nuclear_and_gas_yes_no_questions(data=dataset) - yes_no_data_points = comparator.compare_yes_no_values(yes_no_values, yes_no_values_from_dataland, data_sources) - for key, value in yes_no_data_points.items(): - setattr(report, key, value) + if relevant_pages is None: + create_not_attempted_report(report, "No relevant pages found") + + try: + yes_no_values = yes_no_value_generator.get_yes_no_values_from_report(relevant_pages) + yes_no_values_from_dataland = data_provider.get_yes_no_values_by_data(data=dataset) + data_sources = data_provider.get_datasources_of_nuclear_and_gas_yes_no_questions(data=dataset) + + yes_no_data_points = comparator.compare_yes_no_values(yes_no_values, yes_no_values_from_dataland, data_sources) + + for key, value in yes_no_data_points.items(): + setattr(report, key, value) + + except Exception as e: # noqa: BLE001 + error_message = str(e) + create_not_attempted_report(report, error_message) + return report + + +def create_not_attempted_report(report: NuclearAndGasGeneralGeneral, error_message: str) -> None: + """Populate the report with 'not attempted' data points.""" + data_point_report = QaReportDataPointExtendedDataPointYesNo( + comment=error_message, + verdict=QaReportDataPointVerdict.QANOTATTEMPTED, + correctedData=ExtendedDataPointYesNo(), + ) + for field_name in [ + "nuclear_energy_related_activities_section426", + "nuclear_energy_related_activities_section427", + "nuclear_energy_related_activities_section428", + "fossil_gas_related_activities_section429", + "fossil_gas_related_activities_section430", + "fossil_gas_related_activities_section431", + ]: + setattr(report, field_name, data_point_report) diff --git a/src/dataland_qa_lab/review/yes_no_value_generator.py b/src/dataland_qa_lab/review/yes_no_value_generator.py index 
982d788..c21c922 100644 --- a/src/dataland_qa_lab/review/yes_no_value_generator.py +++ b/src/dataland_qa_lab/review/yes_no_value_generator.py @@ -1,8 +1,15 @@ +import logging + from dataland_backend.models.yes_no import YesNo from dataland_qa_lab.prompting_services import prompting_service from dataland_qa_lab.review import generate_gpt_request +logger = logging.getLogger(__name__) + + +NUM_EXPECTED_VALUES = 6 + def get_yes_no_values_from_report(readable_text: str) -> dict[str, YesNo | None]: """Extracts information from template 1 using Azure OpenAI and returns a list of results. @@ -10,10 +17,22 @@ def get_yes_no_values_from_report(readable_text: str) -> dict[str, YesNo | None] Returns: list: A list including the etracted values of template 1 """ - extracted_list = generate_gpt_request.GenerateGptRequest.generate_gpt_request( - prompting_service.PromptingService.create_main_prompt(1, readable_text, ""), - prompting_service.PromptingService.create_sub_prompt_template1(), - ) + try: + extracted_list = generate_gpt_request.GenerateGptRequest.generate_gpt_request( + prompting_service.PromptingService.create_main_prompt(1, readable_text, ""), + prompting_service.PromptingService.create_sub_prompt_template1(), + ) + if not extracted_list: + msg = "No results returned from GPT for Yes_No values." + throw_error(msg) + + except (ValueError, TypeError) as e: + msg = f"Error extracting values from template 1: {e}" + throw_error(msg) + + if len(extracted_list) != NUM_EXPECTED_VALUES: + msg = "Yes_No values are too short or too long from GPT." 
+ throw_error(msg) sections = { "nuclear_energy_related_activities_section426": YesNo(extracted_list[0]), @@ -25,3 +44,8 @@ def get_yes_no_values_from_report(readable_text: str) -> dict[str, YesNo | None] } return sections + + +def throw_error(msg: str) -> ValueError: + """Raises a ValueError with the given message.""" + raise ValueError(msg) diff --git a/src/dataland_qa_lab/utils/nuclear_and_gas_data_collection.py b/src/dataland_qa_lab/utils/nuclear_and_gas_data_collection.py index c640396..c30cb2e 100644 --- a/src/dataland_qa_lab/utils/nuclear_and_gas_data_collection.py +++ b/src/dataland_qa_lab/utils/nuclear_and_gas_data_collection.py @@ -20,14 +20,24 @@ class NuclearAndGasDataCollection: taxonomy_non_eligible: dict[str, TaxonomyNonEligibleDatapoint | None] def __init__(self, dataset: NuclearAndGasData) -> None: - """Intialize class.""" + """Initialize class.""" self.dataset = dataset - self.map_dataset_to_yes_no_dict() - self.map_dataset_to_numeric_dict() + self.yes_no_data_points = {} + self.taxonomy_aligned_denominator = {} + self.taxonomy_aligned_numerator = {} + self.taxonomy_eligble_but_not_aligned = {} + self.taxonomy_non_eligible = {} + + # Safely map datasets + if self.dataset and self.dataset.general: + self.map_dataset_to_yes_no_dict() + self.map_dataset_to_numeric_dict() def map_dataset_to_yes_no_dict(self) -> dict[str, YesNoDatapoint | None]: """Mapper function.""" - data = self.dataset.general.general + data = getattr(self.dataset.general, "general", None) + if data is None: + return self.yes_no_data_points = { "nuclear_energy_related_activities_section426": YesNoDatapoint( @@ -53,39 +63,57 @@ def map_dataset_to_yes_no_dict(self) -> dict[str, YesNoDatapoint | None]: def map_dataset_to_numeric_dict(self) -> None: """Mapper function.""" data = self.dataset.general + if data is None: + return # Skip if numeric data is missing self.taxonomy_aligned_denominator = { "taxonomy_aligned_capex_denominator": TaxononmyAlignedDenominatorDatapoint( 
data.taxonomy_aligned_denominator.nuclear_and_gas_taxonomy_aligned_capex_denominator + if data.taxonomy_aligned_denominator + else None ), "taxonomy_aligned_revenue_denominator": TaxononmyAlignedDenominatorDatapoint( data.taxonomy_aligned_denominator.nuclear_and_gas_taxonomy_aligned_revenue_denominator + if data.taxonomy_aligned_denominator + else None ), } self.taxonomy_aligned_numerator = { "taxonomy_aligned_capex_numerator": TaxonomyAlignedNumeratorDatapoint( data.taxonomy_aligned_numerator.nuclear_and_gas_taxonomy_aligned_capex_numerator + if data.taxonomy_aligned_numerator + else None ), "taxonomy_aligned_revenue_numerator": TaxonomyAlignedNumeratorDatapoint( data.taxonomy_aligned_numerator.nuclear_and_gas_taxonomy_aligned_revenue_numerator + if data.taxonomy_aligned_numerator + else None ), } self.taxonomy_eligble_but_not_aligned = { "taxonomy_not_aligned_capex": TaxonomyEligibleButNotAlignedDatapoint( data.taxonomy_eligible_but_not_aligned.nuclear_and_gas_taxonomy_eligible_but_not_aligned_capex + if data.taxonomy_eligible_but_not_aligned + else None ), "taxonomy_not_aligned_revenue": TaxonomyEligibleButNotAlignedDatapoint( data.taxonomy_eligible_but_not_aligned.nuclear_and_gas_taxonomy_eligible_but_not_aligned_revenue + if data.taxonomy_eligible_but_not_aligned + else None ), } self.taxonomy_non_eligible = { "taxonomy_non_eligible_capex": TaxonomyNonEligibleDatapoint( data.taxonomy_non_eligible.nuclear_and_gas_taxonomy_non_eligible_capex + if data.taxonomy_non_eligible + else None ), "taxonomy_non_eligible_revenue": TaxonomyNonEligibleDatapoint( data.taxonomy_non_eligible.nuclear_and_gas_taxonomy_non_eligible_revenue + if data.taxonomy_non_eligible + else None ), } diff --git a/tests/dataland/test_data_provider.py b/tests/dataland/test_data_provider.py index c6b549c..478d559 100644 --- a/tests/dataland/test_data_provider.py +++ b/tests/dataland/test_data_provider.py @@ -1,11 +1,13 @@ +from collections.abc import Callable + import pytest from 
dataland_backend.models.nuclear_and_gas_data import NuclearAndGasData -from dataland_backend.models.nuclear_and_gas_general import NuclearAndGasGeneral from dataland_qa_lab.dataland import data_provider from dataland_qa_lab.utils.nuclear_and_gas_data_collection import NuclearAndGasDataCollection from tests.utils import provide_test_dataset from tests.utils.provide_test_data_collection import provide_test_data_collection +from tests.utils.provide_test_dataset import provide_test_dataset # noqa: F811 def test_get_yes_no_values_by_data() -> None: @@ -31,6 +33,54 @@ def test_get_datasources_of_dataset() -> None: assert values.get("fossil_gas_related_activities_section431").file_name == "test-file" +@pytest.mark.parametrize( + ("function_name", "exception_message"), + [ + ( + data_provider.get_taxonomy_aligned_revenue_denominator_values_by_data, + "Error retrieving taxonomy-aligned revenue denominator", + ), + ( + data_provider.get_taxonomy_aligned_capex_denominator_values_by_data, + "Error retrieving taxonomy-aligned capex denominator", + ), + ( + data_provider.get_taxonomy_aligned_revenue_numerator_values_by_data, + "Error retrieving taxonomy-aligned revenue numerator", + ), + ( + data_provider.get_taxonomy_aligned_capex_numerator_values_by_data, + "Error retrieving taxonomy-aligned capex numerator", + ), + ( + data_provider.get_taxonomy_eligible_but_not_aligned_revenue_values_by_data, + "Error retrieving taxonomy eligible but not aligned revenue", + ), + ( + data_provider.get_taxonomy_eligible_but_not_aligned_capex_values_by_data, + "Error retrieving taxonomy eligible but not aligned capex", + ), + ( + data_provider.get_taxonomy_non_eligible_revenue_values_by_data, + "Error retrieving taxonomy non-eligible revenue", + ), + (data_provider.get_taxonomy_non_eligible_capex_values_by_data, "Error retrieving taxonomy non-eligible capex"), + ], +) +def test_function_exceptions( + function_name: Callable, + exception_message: str, + test_data_collection: 
NuclearAndGasDataCollection, # noqa: ARG001 +) -> None: + """Retrieve taxonomy-aligned capex denominator values from the dataset.""" + + # Create a dataset with missing values to trigger exceptions + empty_data_collection = NuclearAndGasDataCollection(NuclearAndGasData()) + + with pytest.raises(AttributeError, match=exception_message): + function_name(empty_data_collection) + + def test_get_taxonomy_aligned_revenue_denominator_values_by_data( test_data_collection: NuclearAndGasDataCollection, ) -> None: @@ -105,13 +155,5 @@ def test_taxonomy_non_eligible_capex_values_by_data(test_data_collection: Nuclea @pytest.fixture def test_data_collection() -> NuclearAndGasDataCollection: - dataset = NuclearAndGasData( - general=NuclearAndGasGeneral( - general=provide_test_dataset.create_template_1_reportframe(), - taxonomyAlignedDenominator=provide_test_dataset.create_template_2_reportframe(), - taxonomyAlignedNumerator=provide_test_dataset.create_template_3_reportframe(), - taxonomyEligibleButNotAligned=provide_test_dataset.create_template_4_reportframe(), - taxonomyNonEligible=provide_test_dataset.create_template_5_reportframe(), - ) - ) + dataset = provide_test_dataset() return NuclearAndGasDataCollection(dataset) diff --git a/tests/dataland/test_prompt_services.py b/tests/dataland/test_prompt_services.py index d487285..735cc78 100644 --- a/tests/dataland/test_prompt_services.py +++ b/tests/dataland/test_prompt_services.py @@ -5,6 +5,7 @@ from dataland_qa_lab.prompting_services import prompting_service from dataland_qa_lab.review import generate_gpt_request, numeric_value_generator, yes_no_value_generator +from dataland_qa_lab.review.generate_gpt_request import GenerateGptRequest @pytest.fixture @@ -14,6 +15,14 @@ def mock_pdf() -> Mock: return pdf +@pytest.fixture +def mock_config() -> Mock: + mock_conf = Mock() + mock_conf.azure_openai_api_key = "test_key" + mock_conf.azure_openai_endpoint = "https://test.endpoint.com" + return mock_conf + + def test_template_1(mock_pdf: 
Mock) -> None: result = prompting_service.PromptingService.create_main_prompt(1, mock_pdf, "Revenue") assert "provide the answers of all 6 questions in template 1" in result @@ -194,7 +203,7 @@ def test_generate_gpt_request(mock_generate_gpt_request: Mock, mock_pdf: Mock) - def test_get_taxonomy_alligned_denominator(mock_generate_gpt_request: Mock, mock_pdf: Mock) -> None: mock_generate_gpt_request.return_value = [0.1, 0, 0, 3.2, 0, 100] - result = numeric_value_generator.NumericValueGenerator.get_taxonomy_alligned_denominator(mock_pdf, "Revenue") + result = numeric_value_generator.NumericValueGenerator.get_taxonomy_aligned_denominator(mock_pdf, "Revenue") mock_generate_gpt_request.assert_called_once_with( prompting_service.PromptingService.create_main_prompt(2, mock_pdf, "Revenue"), @@ -207,7 +216,7 @@ def test_get_taxonomy_alligned_denominator(mock_generate_gpt_request: Mock, mock def test_get_taxonomy_alligned_numerator(mock_generate_gpt_request: Mock, mock_pdf: Mock) -> None: mock_generate_gpt_request.return_value = [0.1, 0, 0, 3.2, 0, 100] - result = numeric_value_generator.NumericValueGenerator.get_taxonomy_alligned_numerator(mock_pdf, "Revenue") + result = numeric_value_generator.NumericValueGenerator.get_taxonomy_aligned_numerator(mock_pdf, "Revenue") mock_generate_gpt_request.assert_called_once_with( prompting_service.PromptingService.create_main_prompt(3, mock_pdf, "Revenue"), @@ -240,3 +249,75 @@ def test_get_taxonomy_non_eligible(mock_generate_gpt_request: Mock, mock_pdf: Mo prompting_service.PromptingService.create_sub_prompt_template5("Revenue"), ) assert result == [0.1, 0, 0, 3.2, 0, 100], "The return values do not match." 
+ + +def test_generate_gpt_request_general_error() -> None: + """Test handling of a general unexpected error.""" + with patch("dataland_qa_lab.utils.config.get_config", side_effect=Exception("Unexpected Error")): + with pytest.raises(ValueError, match="An unexpected error occurred") as exc: + GenerateGptRequest.generate_gpt_request("main_prompt", "sub_prompt") + + assert "An unexpected error occurred" in str(exc.value) + + +def test_generate_gpt_request_creation_error(mock_config: Mock) -> None: + """Test error during GPT request creation.""" + with ( + patch("dataland_qa_lab.utils.config.get_config", return_value=mock_config), + patch("openai.AzureOpenAI") as mock_client, + ): + mock_client().chat.completions.create.side_effect = Exception("GPT Request Error") + + with pytest.raises(ValueError, match="Error during GPT request creation") as exc: + GenerateGptRequest.generate_gpt_request("main_prompt", "sub_prompt") + assert "Error during GPT request creation" in str(exc.value) + + +def test_generate_gpt_request_config_error() -> None: + """Test error when loading configuration.""" + with patch("dataland_qa_lab.utils.config.get_config", side_effect=Exception("Config Error")): + with pytest.raises(ValueError, match="Error loading configuration") as exc: + GenerateGptRequest.generate_gpt_request("main_prompt", "sub_prompt") + + assert "Error loading configuration" in str(exc.value) + + +@patch("dataland_qa_lab.utils.config.get_config") +@patch("openai.AzureOpenAI") +def test_generate_gpt_request_tool_call_parsing_error(mock_client: Mock, mock_get_config: Mock) -> None: + """Test error handling during tool call argument parsing.""" + # Mock configuration + mock_get_config.return_value = Mock( + azure_openai_api_key="test_key", + azure_openai_endpoint="https://test.endpoint.com", + ) + + # Mock GPT response with invalid arguments + mock_client().chat.completions.create.return_value = Mock( + choices=[Mock(message=Mock(tool_calls=[Mock(function=Mock(arguments="Invalid 
Argument String"))]))] + ) + + # Call the function and expect a ValueError + with pytest.raises( + ValueError, match=r"An unexpected error occurred: Error during GPT request creation: Connection error." + ): + GenerateGptRequest.generate_gpt_request("main_prompt", "sub_prompt") + + +@patch("dataland_qa_lab.utils.config.get_config") +@patch("openai.AzureOpenAI") +def test_generate_gpt_request_no_tool_calls(mock_client: Mock, mock_get_config: Mock) -> None: + """Test handling when no tool calls are present in the GPT response.""" + # Mock configuration + mock_get_config.return_value = Mock( + azure_openai_api_key="test_key", + azure_openai_endpoint="https://test.endpoint.com", + ) + + # Mock GPT response with no tool calls + mock_client().chat.completions.create.return_value = Mock(choices=[Mock(message=Mock(tool_calls=None))]) + + with pytest.raises( + ValueError, match=r"An unexpected error occurred: Error during GPT request creation: Connection error." + ): + GenerateGptRequest.generate_gpt_request("main_prompt", "sub_prompt") diff --git a/tests/dataland/test_unreviewed_datasets.py b/tests/dataland/test_unreviewed_datasets.py index 750b8f6..4186565 100644 --- a/tests/dataland/test_unreviewed_datasets.py +++ b/tests/dataland/test_unreviewed_datasets.py @@ -59,3 +59,25 @@ def test_initialization_with_api_error(self, mock_get_config: MagicMock) -> None with pytest.raises(Exception): # noqa: B017, PT011 UnreviewedDatasets() + + def test_initialization_with_timeout_error(self, mock_get_config: MagicMock) -> None: + mock_conf = self.set_up_mock_client(dataset_count=1, datasets=None, exception=TimeoutError()) + mock_get_config.return_value = mock_conf + + with pytest.raises(TimeoutError): + UnreviewedDatasets() + + def test_initialization_with_no_client(self, mock_get_config: MagicMock) -> None: # noqa: PLR6301 + mock_conf = MagicMock() + mock_conf.dataland_client = None + mock_get_config.return_value = mock_conf + + with pytest.raises(ValueError, match=r"Client Setup 
failed in the configuration."): + UnreviewedDatasets() + + def test_initialization_with_runtime_error(self, mock_get_config: MagicMock) -> None: + mock_conf = self.set_up_mock_client(dataset_count=1, datasets=None, exception=RuntimeError()) + mock_get_config.return_value = mock_conf + + with pytest.raises(RuntimeError): + UnreviewedDatasets() diff --git a/tests/end_to_end/test_report_e2e.py b/tests/end_to_end/test_report_e2e.py index bdbf698..24a760b 100644 --- a/tests/end_to_end/test_report_e2e.py +++ b/tests/end_to_end/test_report_e2e.py @@ -3,9 +3,11 @@ from unittest.mock import ANY, MagicMock, patch import mock_constants +from dataland_qa.models.qa_report_data_point_verdict import QaReportDataPointVerdict from dataland_qa.models.qa_report_meta_information import QaReportMetaInformation -from clients.qa.dataland_qa.models.qa_report_data_point_verdict import QaReportDataPointVerdict +from dataland_qa_lab.database.database_engine import delete_entity +from dataland_qa_lab.database.database_tables import ReviewedDataset from dataland_qa_lab.dataland.provide_test_data import get_company_id, upload_dataset, upload_pdf from dataland_qa_lab.review.dataset_reviewer import review_dataset from dataland_qa_lab.utils import config @@ -21,9 +23,8 @@ def test_report_generator_end_to_end() -> None: # Upload test_dataset with partly wrong data data_id = upload_test_dataset() - + delete_entity(data_id, ReviewedDataset) report_metadata = mocked_review_dataset(data_id) - report_data = config.get_config().dataland_client.eu_taxonomy_nuclear_gas_qa_api.get_nuclear_and_gas_data_qa_report( data_id=data_id, qa_report_id=report_metadata.qa_report_id ) @@ -90,8 +91,8 @@ def test_report_generator_end_to_end() -> None: @patch("dataland_qa_lab.database.database_engine.get_entity") def mocked_review_dataset( data_id: str, - mock_extract_text_of_pdf: MagicMock, mock_get_entity: MagicMock, + mock_extract_text_of_pdf: MagicMock, ) -> QaReportMetaInformation: """Review the dataset with mocked 
Azure calls.""" mock_extract_text_of_pdf.return_value = mock_constants.E2E_AZURE_DOCUMENT_INTELLIGENCE_MOCK diff --git a/tests/review/test_denominator_report_generator.py b/tests/review/test_denominator_report_generator.py index 7aa0ff3..cc522b9 100644 --- a/tests/review/test_denominator_report_generator.py +++ b/tests/review/test_denominator_report_generator.py @@ -1,6 +1,5 @@ from unittest.mock import MagicMock, Mock, patch -from azure.ai.documentintelligence.models import AnalyzeResult from dataland_qa.models.qa_report_data_point_verdict import QaReportDataPointVerdict import dataland_qa_lab.review.report_generator.denominator_report_generator as report_generator @@ -8,22 +7,13 @@ from tests.utils.provide_test_dataset import provide_test_dataset -def provide_test_data_collection() -> tuple[NuclearAndGasDataCollection, AnalyzeResult]: +def provide_test_data_collection() -> tuple[NuclearAndGasDataCollection, str]: dataset = provide_test_dataset() data_collection = NuclearAndGasDataCollection(dataset) - relevant_pages = MagicMock(spec=AnalyzeResult) - - """pages= pages_provider.get_relevant_pages_of_pdf(data_collection) - relevant_pages = text_to_doc_intelligence.extract_text_of_pdf(pages)""" - + relevant_pages = MagicMock(spec=str) return data_collection, relevant_pages -"""data_collection = provide_test_data() -dataland = data_provider.get_taxonomy_aligned_revenue_denominator_values_by_data(data_collection) -print(dataland)""" - - @patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") def test_generate_taxonomy_aligned_denominator_report(mock_generate_gpt_request: Mock) -> None: dataset, relevant_pages = provide_test_data_collection() @@ -157,3 +147,42 @@ def test_generate_taxonomy_aligned_denominator_report_edge_cases(mock_generate_g assert report is not None assert report.verdict == QaReportDataPointVerdict.QAREJECTED assert report.corrected_data.quality == "NoDataFound" + + 
+@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +@patch("dataland_qa_lab.dataland.data_provider.get_taxonomy_aligned_revenue_denominator_values_by_data") +def test_generate_revenue_denominator_report_frame_not_attempted( + mock_get_dataland_values: Mock, mock_generate_gpt_request: Mock +) -> None: + dataset, relevant_pages = provide_test_data_collection() + + # Simulate an exception in prompted value retrieval + mock_generate_gpt_request.side_effect = ValueError("Mock GPT error") + report = report_generator.build_denominator_report_frame(dataset, relevant_pages, "Revenue") + + assert report is not None + assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert "Error retrieving prompted values for template 2" in report.comment + + # Simulate an exception in dataland retrieval + mock_generate_gpt_request.side_effect = None + mock_get_dataland_values.side_effect = RuntimeError("Mock dataland error") + report = report_generator.build_denominator_report_frame(dataset, relevant_pages, "Revenue") + + assert report is not None + assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert "Error retrieving dataland values for template 2" in report.comment + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_generate_taxonomy_aligned_denominator_report_edge_cases_not_attempted(mock_generate_gpt_request: Mock) -> None: + dataset, relevant_pages = provide_test_data_collection() + + # Simulate an exception in the GPT request generation + mock_generate_gpt_request.side_effect = ValueError("Mock GPT error") + + report = report_generator.build_denominator_report_frame(dataset, relevant_pages, "Revenue") + + assert report is not None + assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert "Error retrieving prompted values for template 2" in report.comment diff --git a/tests/review/test_eligible_not_aligned_report_generator.py
b/tests/review/test_eligible_not_aligned_report_generator.py index 60132e9..12f3b78 100644 --- a/tests/review/test_eligible_not_aligned_report_generator.py +++ b/tests/review/test_eligible_not_aligned_report_generator.py @@ -153,3 +153,42 @@ def test_generate_eligible_but_not_aligned_report_edge_cases(mock_generate_gpt_r assert report is not None assert report.verdict == QaReportDataPointVerdict.QAREJECTED assert report.corrected_data.quality == "NoDataFound" + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +@patch("dataland_qa_lab.dataland.data_provider.get_taxonomy_eligible_but_not_aligned_revenue_values_by_data") +def test_generate_revenue_denominator_report_frame_not_attempted( + mock_get_dataland_values: Mock, mock_generate_gpt_request: Mock +) -> None: + dataset, relevant_pages = provide_test_data_collection() + + # Simulate an exception in prompted value retrieval + mock_generate_gpt_request.side_effect = ValueError("Mock GPT error") + report = report_generator.build_eligible_but_not_aligned_frame(dataset, relevant_pages, "Revenue") + + assert report is not None + assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert "Error retrieving prompted values for template 4" in report.comment + + # Simulate an exception in dataland retrieval + mock_generate_gpt_request.side_effect = None + mock_get_dataland_values.side_effect = RuntimeError("Mock dataland error") + report = report_generator.build_eligible_but_not_aligned_frame(dataset, relevant_pages, "Revenue") + + assert report is not None + assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert "Error retrieving dataland values for template 4" in report.comment + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_generate_taxonomy_aligned_denominator_report_edge_cases_not_attempted(mock_generate_gpt_request: Mock) -> None: + dataset, relevant_pages =
provide_test_data_collection() + + # Simulate an exception in the GPT request generation + mock_generate_gpt_request.side_effect = ValueError("Mock GPT error") + + report = report_generator.build_eligible_but_not_aligned_frame(dataset, relevant_pages, "Revenue") + + assert report is not None + assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert "Error retrieving prompted values for template 4" in report.comment diff --git a/tests/review/test_non_eligible_generator.py b/tests/review/test_non_eligible_generator.py index 6dab774..d02d1f6 100644 --- a/tests/review/test_non_eligible_generator.py +++ b/tests/review/test_non_eligible_generator.py @@ -93,3 +93,42 @@ def test_compare_taxonomy_non_eligible_values_edge_cases(mock_generate_gpt_reque assert report is not None assert report.verdict == QaReportDataPointVerdict.QAREJECTED assert report.corrected_data.quality == "NoDataFound" + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +@patch("dataland_qa_lab.dataland.data_provider.get_taxonomy_non_eligible_revenue_values_by_data") +def test_generate_revenue_denominator_report_frame_not_attempted( + mock_get_dataland_values: Mock, mock_generate_gpt_request: Mock +) -> None: + dataset, relevant_pages = provide_test_data_collection() + + # Simulate an exception in prompted value retrieval + mock_generate_gpt_request.side_effect = ValueError("Mock GPT error") + report = report_generator.build_non_eligible_report_frame(dataset, relevant_pages, "Revenue") + + assert report is not None + assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert "Error retrieving prompted values for template 5" in report.comment + + # Simulate an exception in dataland retrieval + mock_generate_gpt_request.side_effect = None + mock_get_dataland_values.side_effect = RuntimeError("Mock dataland error") + report = report_generator.build_non_eligible_report_frame(dataset, relevant_pages, "Revenue") + + assert report
is not None + assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert "Error retrieving dataland values for template 5" in report.comment + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_generate_taxonomy_aligned_denominator_report_edge_cases_not_attempted(mock_generate_gpt_request: Mock) -> None: + dataset, relevant_pages = provide_test_data_collection() + + # Simulate an exception in the GPT request generation + mock_generate_gpt_request.side_effect = ValueError("Mock GPT error") + + report = report_generator.build_non_eligible_report_frame(dataset, relevant_pages, "Revenue") + + assert report is not None + assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert "Error retrieving prompted values for template 5" in report.comment diff --git a/tests/review/test_numerator_report_generator.py b/tests/review/test_numerator_report_generator.py index 6ac5693..031084a 100644 --- a/tests/review/test_numerator_report_generator.py +++ b/tests/review/test_numerator_report_generator.py @@ -152,3 +152,40 @@ def test_generate_taxonomy_aligned_numerator_report_edge_cases(mock_generate_gpt assert report is not None assert report.verdict == QaReportDataPointVerdict.QAREJECTED assert report.corrected_data.quality == "NoDataFound" + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +@patch("dataland_qa_lab.dataland.data_provider.get_taxonomy_aligned_revenue_numerator_values_by_data") +def test_generate_revenue_denominator_report_frame_not_attempted( + mock_get_dataland_values: Mock, mock_generate_gpt_request: Mock +) -> None: + dataset, relevant_pages = provide_test_data_collection() + + mock_generate_gpt_request.side_effect = ValueError("Mock GPT error") + report = report_generator.build_numerator_report_frame(dataset, relevant_pages, "Revenue") + + assert report is not None + assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED 
+ assert "Error retrieving prompted values for template 3" in report.comment + + mock_generate_gpt_request.side_effect = None + mock_get_dataland_values.side_effect = RuntimeError("Mock dataland error") + report = report_generator.build_numerator_report_frame(dataset, relevant_pages, "Revenue") + + assert report is not None + assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert "Error retrieving dataland values for template 3" in report.comment + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_generate_taxonomy_aligned_denominator_report_edge_cases_not_attempted(mock_generate_gpt_request: Mock) -> None: + dataset, relevant_pages = provide_test_data_collection() + + # Simulate an exception in the GPT request generation + mock_generate_gpt_request.side_effect = ValueError("Mock GPT error") + + report = report_generator.build_numerator_report_frame(dataset, relevant_pages, "Revenue") + + assert report is not None + assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert "Error retrieving prompted values for template 3" in report.comment diff --git a/tests/review/test_numeric_value_generator.py b/tests/review/test_numeric_value_generator.py new file mode 100644 index 0000000..244aa43 --- /dev/null +++ b/tests/review/test_numeric_value_generator.py @@ -0,0 +1,128 @@ +from unittest.mock import Mock, patch + +import pytest + +from dataland_qa_lab.prompting_services import prompting_service +from dataland_qa_lab.review.generate_gpt_request import GenerateGptRequest # noqa: F401 +from dataland_qa_lab.review.numeric_value_generator import NumericValueGenerator + + +# Mock AnalyzeResult +@pytest.fixture +def mock_analyze_result() -> Mock: + mock_result = Mock() + mock_result.content = "Test readable text content." 
+ return mock_result + + +# Mock Logger +@pytest.fixture +def mock_logger() -> Mock: + logger = Mock() + return logger + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_get_taxonomy_aligned_denominator_success(mock_generate_gpt_request: Mock, mock_analyze_result: Mock) -> None: + """Test successful extraction of taxonomy aligned denominator values.""" + mock_generate_gpt_request.return_value = ["0.1", "2.5", "3.0"] + + result = NumericValueGenerator.get_taxonomy_aligned_denominator(mock_analyze_result, "Revenue") + + mock_generate_gpt_request.assert_called_once_with( + prompting_service.PromptingService.create_main_prompt(2, mock_analyze_result, "Revenue"), + prompting_service.PromptingService.create_sub_prompt_template2to4("Revenue"), + ) + + assert result == [0.1, 2.5, 3.0] + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_get_taxonomy_aligned_denominator_empty_response(mock_generate_gpt_request: Mock) -> None: + """Test empty GPT response for taxonomy aligned denominator values.""" + mock_generate_gpt_request.return_value = [] + + with pytest.raises(ValueError, match=r"No results returned from GPT for template 2 values.") as exc: + NumericValueGenerator.get_taxonomy_aligned_denominator("Some readable text", "Revenue") + + assert "No results returned from GPT for template 2 values."
in str(exc.value) + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_get_taxonomy_aligned_denominator_conversion_error( + mock_generate_gpt_request: Mock, mock_analyze_result: Mock +) -> None: + """Test float conversion error in taxonomy aligned denominator values.""" + mock_generate_gpt_request.return_value = ["0.1", "invalid", "3.0"] + + with pytest.raises(ValueError) as exc: # noqa: PT011 + NumericValueGenerator.get_taxonomy_aligned_denominator(mock_analyze_result, "Revenue") + + assert "Unexpected error during float conversion" in str(exc.value) + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_get_taxonomy_aligned_numerator_success(mock_generate_gpt_request: Mock, mock_analyze_result: Mock) -> None: + """Test successful extraction of taxonomy aligned numerator values.""" + mock_generate_gpt_request.return_value = ["1.0", "2.0", "3.0"] + + result = NumericValueGenerator.get_taxonomy_aligned_numerator(mock_analyze_result, "Revenue") + + mock_generate_gpt_request.assert_called_once_with( + prompting_service.PromptingService.create_main_prompt(3, mock_analyze_result, "Revenue"), + prompting_service.PromptingService.create_sub_prompt_template2to4("Revenue"), + ) + + assert result == [1.0, 2.0, 3.0] + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_get_taxonomy_eligible_not_alligned_success(mock_generate_gpt_request: Mock, mock_analyze_result: Mock) -> None: + """Test successful extraction of taxonomy eligible not aligned values.""" + mock_generate_gpt_request.return_value = ["4.0", "5.0", "6.0"] + + result = NumericValueGenerator.get_taxonomy_eligible_not_alligned(mock_analyze_result, "Revenue") + + mock_generate_gpt_request.assert_called_once_with( + prompting_service.PromptingService.create_main_prompt(4, mock_analyze_result, "Revenue"), + 
prompting_service.PromptingService.create_sub_prompt_template2to4("Revenue"), + ) + + assert result == [4.0, 5.0, 6.0] + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_get_taxonomy_non_eligible_success(mock_generate_gpt_request: Mock, mock_analyze_result: Mock) -> None: + """Test successful extraction of taxonomy non-eligible values.""" + mock_generate_gpt_request.return_value = ["7.0", "8.0", "9.0"] + + result = NumericValueGenerator.get_taxonomy_non_eligible(mock_analyze_result, "Revenue") + + mock_generate_gpt_request.assert_called_once_with( + prompting_service.PromptingService.create_main_prompt(5, mock_analyze_result, "Revenue"), + prompting_service.PromptingService.create_sub_prompt_template5("Revenue"), + ) + + assert result == [7.0, 8.0, 9.0] + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_get_taxonomy_non_eligible_empty_response(mock_generate_gpt_request: Mock, mock_analyze_result: Mock) -> None: + """Test empty GPT response for taxonomy non-eligible values.""" + mock_generate_gpt_request.return_value = [] + + with pytest.raises(ValueError) as exc: # noqa: PT011 + NumericValueGenerator.get_taxonomy_non_eligible(mock_analyze_result, "Revenue") + + assert "No results returned from GPT for template 5 values." 
in str(exc.value) + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_get_taxonomy_non_eligible_conversion_error(mock_generate_gpt_request: Mock, mock_analyze_result: Mock) -> None: + """Test float conversion error in taxonomy non-eligible values.""" + mock_generate_gpt_request.return_value = ["7.0", "invalid", "9.0"] + + with pytest.raises(ValueError) as exc: # noqa: PT011 + NumericValueGenerator.get_taxonomy_non_eligible(mock_analyze_result, "Revenue") + + assert "Unexpected error during float conversion" in str(exc.value) diff --git a/tests/review/test_report_generator.py b/tests/review/test_report_generator.py index 19b6353..450fba9 100644 --- a/tests/review/test_report_generator.py +++ b/tests/review/test_report_generator.py @@ -1,11 +1,9 @@ from unittest.mock import Mock, patch -import pytest from azure.ai.documentintelligence.models import AnalyzeResult from openai.types.chat.chat_completion import ChatCompletion, ChatCompletionMessage, Choice from dataland_qa_lab.review.report_generator import yes_no_report_generator -from dataland_qa_lab.review.report_generator.nuclear_and_gas_report_generator import NuclearAndGasReportGenerator from tests.utils.provide_test_data_collection import provide_test_data_collection @@ -49,17 +47,3 @@ def test_compare_yes_no_values(mock_generate_gpt_request: Mock) -> None: assert report.nuclear_energy_related_activities_section426.corrected_data.value is None assert report.nuclear_energy_related_activities_section426.comment == "Geprüft durch AzureOpenAI" assert report.fossil_gas_related_activities_section430.corrected_data.value == "Yes" - - -@patch("openai.resources.chat.Completions.create", return_value=build_simple_openai_chat_completion()) -def test_generate_report(_mock_create: Mock) -> None: # noqa: PT019 - test_data_collection = provide_test_data_collection() - - report = None # Initialize the variable to avoid UnboundLocalError - with pytest.raises(Exception, 
match=r"No tool calls found in the GPT response."): - report = NuclearAndGasReportGenerator().generate_report( - relevant_pages=AnalyzeResult(), dataset=test_data_collection - ) - # Handle report if no exception is raised - if report: - assert report.general.general.fossil_gas_related_activities_section430.corrected_data.value == "Yes" diff --git a/tests/review/test_yes_no_report_generator.py b/tests/review/test_yes_no_report_generator.py new file mode 100644 index 0000000..835b46a --- /dev/null +++ b/tests/review/test_yes_no_report_generator.py @@ -0,0 +1,104 @@ +from unittest.mock import Mock, patch + +from azure.ai.documentintelligence.models import AnalyzeResult +from dataland_qa.models.qa_report_data_point_verdict import QaReportDataPointVerdict +from openai.types.chat.chat_completion import ChatCompletion, ChatCompletionMessage, Choice + +from dataland_qa_lab.review.report_generator import yes_no_report_generator +from dataland_qa_lab.utils.nuclear_and_gas_data_collection import NuclearAndGasDataCollection +from tests.utils.provide_test_data_collection import provide_test_data_collection +from tests.utils.provide_test_dataset import provide_test_dataset + + +def create_document_intelligence_mock() -> AnalyzeResult: + return AnalyzeResult(content="") + + +def build_simple_openai_chat_completion() -> ChatCompletion: + msg = "['Yes', 'No', 'Yes', 'Yes', 'Yes', 'No']" + return ChatCompletion( + id="test", + choices=[ + Choice( + finish_reason="stop", + index=0, + message=ChatCompletionMessage( + content=msg, + role="assistant", + ), + ) + ], + created=0, + model="test", + object="chat.completion", + ) + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_compare_yes_no_values(mock_generate_gpt_request: Mock) -> None: + test_data_collection = provide_test_data_collection() + mock_generate_gpt_request.return_value = [ + "Yes", + "No", + "Yes", + "No", + "Yes", + "No", + ] + report = 
yes_no_report_generator.build_yes_no_report(dataset=test_data_collection, relevant_pages=AnalyzeResult()) + + assert report.nuclear_energy_related_activities_section426.corrected_data.value is None + assert report.nuclear_energy_related_activities_section426.comment == "Geprüft durch AzureOpenAI" + assert report.fossil_gas_related_activities_section430.corrected_data.value == "Yes" + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_build_yes_no_report_success(mock_generate_gpt_request: Mock) -> None: + mock_generate_gpt_request.return_value = [ + "No", + "No", + "Yes", + "No", + "No", + "No", + ] + test_data_collection = NuclearAndGasDataCollection(provide_test_dataset()) + report = yes_no_report_generator.build_yes_no_report(dataset=test_data_collection, relevant_pages=AnalyzeResult()) + + # Assertions + assert report.fossil_gas_related_activities_section430.comment == ( + "Discrepancy in 'fossil_gas_related_activities_section430': YesNo.YES != YesNo.NO." 
+ ) + assert report.fossil_gas_related_activities_section430.verdict == QaReportDataPointVerdict.QAREJECTED + + +@patch("dataland_qa_lab.review.yes_no_value_generator.get_yes_no_values_from_report") +def test_build_yes_no_report_generator_error(mock_get_yes_no_values: Mock) -> None: + # Simulate an error in get_yes_no_values_from_report + mock_get_yes_no_values.side_effect = ValueError("Error in get_yes_no_values_from_report") + + test_data_collection = provide_test_data_collection() + report = yes_no_report_generator.build_yes_no_report(dataset=test_data_collection, relevant_pages="123") + + # Assertions for error handling + assert report.nuclear_energy_related_activities_section426.comment == "Error in get_yes_no_values_from_report" + assert report.nuclear_energy_related_activities_section426.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert report.nuclear_energy_related_activities_section426.corrected_data.value is None + + +@patch("dataland_qa_lab.dataland.data_provider.get_yes_no_values_by_data") +def test_build_yes_no_report_data_provider_error(mock_get_yes_no_values_by_data: Mock) -> None: + # Simulate an error in get_yes_no_values_by_data + mock_get_yes_no_values_by_data.side_effect = ValueError("Error in get_yes_no_values_by_data") + expected_comments = [ + "Error in get_yes_no_values_by_data", + "Error extracting values from template 1: An unexpected error occurred: " + "Error during GPT request creation: Connection error.", + ] + test_data_collection = provide_test_data_collection() + report = yes_no_report_generator.build_yes_no_report(dataset=test_data_collection, relevant_pages="123") + + # Assertions for error handling + assert report.nuclear_energy_related_activities_section426.comment in expected_comments + assert report.nuclear_energy_related_activities_section426.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert report.nuclear_energy_related_activities_section426.corrected_data.comment is None