diff --git a/notebooks/test_existing_company_reports.ipynb b/notebooks/test_existing_company_reports.ipynb index c9b0418..33b35da 100644 --- a/notebooks/test_existing_company_reports.ipynb +++ b/notebooks/test_existing_company_reports.ipynb @@ -4,7 +4,19 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Python-dotenv could not parse statement starting at line 15\n", + "Python-dotenv could not parse statement starting at line 18\n", + "Python-dotenv could not parse statement starting at line 20\n", + "Python-dotenv could not parse statement starting at line 23\n", + "Python-dotenv could not parse statement starting at line 25\n" + ] + } + ], "source": [ "from dataland_backend.models.data_type_enum import DataTypeEnum\n", "\n", @@ -70,7 +82,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "BPCE\n" + "Aktiebolaget Electrolux\n" ] } ], @@ -79,7 +91,7 @@ "extracted_yes_no_values = {}\n", "\n", "# check yes no values\n", - "for data_id, company_info in zip(data_ids[8:9], company_infos[8:9], strict=False):\n", + "for data_id, company_info in zip(data_ids[0:1], company_infos[0:1], strict=False):\n", " print(company_info.company_name)\n", " data = dataland_client.eu_taxonomy_nuclear_and_gas_api.get_company_associated_nuclear_and_gas_data(data_id=data_id)\n", " data_collection = NuclearAndGasDataCollection(dataset=data.data)\n", @@ -107,11 +119,11 @@ "output_type": "stream", "text": [ "\n", - "Company: BPCE\n", + "Company: Aktiebolaget Electrolux\n", "nuclear_energy_related_activities_section426: Dataland=YesNo.NO, Extracted=YesNo.NO\n", - "nuclear_energy_related_activities_section427: Dataland=YesNo.YES, Extracted=YesNo.YES\n", - "nuclear_energy_related_activities_section428: Dataland=YesNo.YES, Extracted=YesNo.YES\n", - "fossil_gas_related_activities_section429: Dataland=YesNo.YES, Extracted=YesNo.YES\n", + "nuclear_energy_related_activities_section427: 
Dataland=YesNo.NO, Extracted=YesNo.NO\n", + "nuclear_energy_related_activities_section428: Dataland=YesNo.NO, Extracted=YesNo.NO\n", + "fossil_gas_related_activities_section429: Dataland=YesNo.NO, Extracted=YesNo.NO\n", "fossil_gas_related_activities_section430: Dataland=YesNo.NO, Extracted=YesNo.NO\n", "fossil_gas_related_activities_section431: Dataland=YesNo.NO, Extracted=YesNo.NO\n", "1.0\n" @@ -143,46 +155,56 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Skipping company Aktiebolaget Electrolux due to missing data from Dataland: Error retrieving taxonomy-aligned revenue denominator: 'NoneType' object has no attribute 'value'\n" + ] + } + ], "source": [ "numeric_values_dataland = {}\n", "extracted_numeric_values = {}\n", "\n", "# check numeric values\n", - "for data_id, company_info in zip(data_ids[6:7], company_infos[6:7], strict=False):\n", + "for data_id, company_info in zip(data_ids[0:1], company_infos[0:1], strict=False):\n", " data = dataland_client.eu_taxonomy_nuclear_and_gas_api.get_company_associated_nuclear_and_gas_data(data_id=data_id)\n", " data_collection = NuclearAndGasDataCollection(dataset=data.data)\n", + " try:\n", + " # get values on Dataland\n", + " if company_info.company_name not in numeric_values_dataland:\n", + " numeric_values_dataland[company_info.company_name] = {}\n", "\n", - " # get values on Dataland\n", - " if company_info.company_name not in numeric_values_dataland:\n", - " numeric_values_dataland[company_info.company_name] = {}\n", - "\n", - " numeric_values_dataland[company_info.company_name][\"aligned_revenue_denominator\"] = (\n", - " get_taxonomy_aligned_revenue_denominator_values_by_data(data=data_collection)\n", - " )\n", - " numeric_values_dataland[company_info.company_name][\"aligned_capex_denominator\"] = (\n", - " 
get_taxonomy_aligned_capex_denominator_values_by_data(data=data_collection)\n", - " )\n", - " numeric_values_dataland[company_info.company_name][\"aligned_revenue_numerator\"] = (\n", - " get_taxonomy_aligned_revenue_numerator_values_by_data(data=data_collection)\n", - " )\n", - " numeric_values_dataland[company_info.company_name][\"aligned_capex_numerator\"] = (\n", - " get_taxonomy_aligned_capex_numerator_values_by_data(data=data_collection)\n", - " )\n", - " numeric_values_dataland[company_info.company_name][\"not_aligned_revenue\"] = (\n", - " get_taxonomy_eligible_but_not_aligned_revenue_values_by_data(data=data_collection)\n", - " )\n", - " numeric_values_dataland[company_info.company_name][\"not_aligned_capex\"] = (\n", - " get_taxonomy_eligible_but_not_aligned_capex_values_by_data(data=data_collection)\n", - " )\n", - " numeric_values_dataland[company_info.company_name][\"non_eligible_revenue\"] = (\n", - " get_taxonomy_non_eligible_revenue_values_by_data(data=data_collection)\n", - " )\n", - " numeric_values_dataland[company_info.company_name][\"non_eligible_capex\"] = (\n", - " get_taxonomy_non_eligible_capex_values_by_data(data=data_collection)\n", - " )\n", + " numeric_values_dataland[company_info.company_name][\"aligned_revenue_denominator\"] = (\n", + " get_taxonomy_aligned_revenue_denominator_values_by_data(data=data_collection)\n", + " )\n", + " numeric_values_dataland[company_info.company_name][\"aligned_capex_denominator\"] = (\n", + " get_taxonomy_aligned_capex_denominator_values_by_data(data=data_collection)\n", + " )\n", + " numeric_values_dataland[company_info.company_name][\"aligned_revenue_numerator\"] = (\n", + " get_taxonomy_aligned_revenue_numerator_values_by_data(data=data_collection)\n", + " )\n", + " numeric_values_dataland[company_info.company_name][\"aligned_capex_numerator\"] = (\n", + " get_taxonomy_aligned_capex_numerator_values_by_data(data=data_collection)\n", + " )\n", + " 
numeric_values_dataland[company_info.company_name][\"not_aligned_revenue\"] = (\n", + " get_taxonomy_eligible_but_not_aligned_revenue_values_by_data(data=data_collection)\n", + " )\n", + " numeric_values_dataland[company_info.company_name][\"not_aligned_capex\"] = (\n", + " get_taxonomy_eligible_but_not_aligned_capex_values_by_data(data=data_collection)\n", + " )\n", + " numeric_values_dataland[company_info.company_name][\"non_eligible_revenue\"] = (\n", + " get_taxonomy_non_eligible_revenue_values_by_data(data=data_collection)\n", + " )\n", + " numeric_values_dataland[company_info.company_name][\"non_eligible_capex\"] = (\n", + " get_taxonomy_non_eligible_capex_values_by_data(data=data_collection)\n", + " )\n", + " except AttributeError as e:\n", + " print(f\"Skipping company {company_info.company_name} due to missing data from Dataland: {e}\")\n", "\n", " # get values from AI\n", " try:\n", @@ -209,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -217,168 +239,8 @@ "output_type": "stream", "text": [ "\n", - "Company: Berliner Volksbank eG\n", - "Section 0: Dataland=0, Extracted=0.0\n", - "Section 1: Dataland=0, Extracted=0.0\n", - "Section 2: Dataland=0, Extracted=0.0\n", - "Section 3: Dataland=0, Extracted=0.0\n", - "Section 4: Dataland=0, Extracted=0.0\n", - "Section 5: Dataland=0, Extracted=0.0\n", - "Section 6: Dataland=0, Extracted=0.0\n", - "Section 7: Dataland=0, Extracted=0.0\n", - "Section 8: Dataland=0, Extracted=0.0\n", - "Section 9: Dataland=0, Extracted=0.0\n", - "Section 10: Dataland=0, Extracted=0.0\n", - "Section 11: Dataland=0, Extracted=0.0\n", - "Section 12: Dataland=0, Extracted=0.0\n", - "Section 13: Dataland=0, Extracted=0.0\n", - "Section 14: Dataland=0, Extracted=0.0\n", - "Section 15: Dataland=0, Extracted=0.0\n", - "Section 16: Dataland=0, Extracted=0.0\n", - "Section 17: Dataland=0, Extracted=0.0\n", - "Section 18: Dataland=0.1, Extracted=0.1\n", - "Section 19: 
Dataland=0.1, Extracted=0.1\n", - "Section 20: Dataland=0, Extracted=0.0\n", - "Section 21: Dataland=0.1, Extracted=0.1\n", - "Section 22: Dataland=0.1, Extracted=0.1\n", - "Section 23: Dataland=0, Extracted=0.0\n", - "Section 24: Dataland=0, Extracted=0.0\n", - "Section 25: Dataland=0, Extracted=0.0\n", - "Section 26: Dataland=0, Extracted=0.0\n", - "Section 27: Dataland=0, Extracted=0.0\n", - "Section 28: Dataland=0, Extracted=0.0\n", - "Section 29: Dataland=0, Extracted=0.0\n", - "Section 30: Dataland=0, Extracted=0.0\n", - "Section 31: Dataland=0, Extracted=0.0\n", - "Section 32: Dataland=0, Extracted=0.0\n", - "Section 33: Dataland=0, Extracted=0.0\n", - "Section 34: Dataland=0, Extracted=0.0\n", - "Section 35: Dataland=0, Extracted=0.0\n", - "Section 36: Dataland=0, Extracted=0.0\n", - "Section 37: Dataland=0, Extracted=0.0\n", - "Section 38: Dataland=0, Extracted=0.0\n", - "Section 39: Dataland=0, Extracted=0.0\n", - "Section 40: Dataland=0, Extracted=0.0\n", - "Section 41: Dataland=0, Extracted=0.0\n", - "Section 42: Dataland=0.1, Extracted=0.1\n", - "Section 43: Dataland=0.1, Extracted=0.1\n", - "Section 44: Dataland=0, Extracted=0.0\n", - "Section 45: Dataland=0.1, Extracted=0.1\n", - "Section 46: Dataland=0.1, Extracted=0.1\n", - "Section 47: Dataland=0, Extracted=0.0\n", - "Section 48: Dataland=0, Extracted=0.0\n", - "Section 49: Dataland=0, Extracted=0.0\n", - "Section 50: Dataland=0, Extracted=0.0\n", - "Section 51: Dataland=0, Extracted=0.0\n", - "Section 52: Dataland=0, Extracted=0.0\n", - "Section 53: Dataland=0, Extracted=0.0\n", - "Section 54: Dataland=0, Extracted=0.0\n", - "Section 55: Dataland=0, Extracted=0.0\n", - "Section 56: Dataland=0, Extracted=0.0\n", - "Section 57: Dataland=0, Extracted=0.0\n", - "Section 58: Dataland=0, Extracted=0.0\n", - "Section 59: Dataland=0, Extracted=0.0\n", - "Section 60: Dataland=0, Extracted=0.0\n", - "Section 61: Dataland=0, Extracted=0.0\n", - "Section 62: Dataland=0, Extracted=0.0\n", - "Section 63: 
Dataland=0, Extracted=0.0\n", - "Section 64: Dataland=0, Extracted=0.0\n", - "Section 65: Dataland=0, Extracted=0.0\n", - "Section 66: Dataland=100, Extracted=100.0\n", - "Section 67: Dataland=100, Extracted=100.0\n", - "Section 68: Dataland=0, Extracted=0.0\n", - "Section 69: Dataland=100, Extracted=100.0\n", - "Section 70: Dataland=100, Extracted=100.0\n", - "Section 71: Dataland=0, Extracted=0.0\n", - "Section 72: Dataland=0, Extracted=0.0\n", - "Section 73: Dataland=0, Extracted=0.0\n", - "Section 74: Dataland=0, Extracted=0.0\n", - "Section 75: Dataland=0, Extracted=0.0\n", - "Section 76: Dataland=0, Extracted=0.0\n", - "Section 77: Dataland=0, Extracted=0.0\n", - "Section 78: Dataland=0, Extracted=0.0\n", - "Section 79: Dataland=0, Extracted=0.0\n", - "Section 80: Dataland=0, Extracted=0.0\n", - "Section 81: Dataland=0, Extracted=0.0\n", - "Section 82: Dataland=0, Extracted=0.0\n", - "Section 83: Dataland=0, Extracted=0.0\n", - "Section 84: Dataland=0, Extracted=0.0\n", - "Section 85: Dataland=0, Extracted=0.0\n", - "Section 86: Dataland=0, Extracted=0.0\n", - "Section 87: Dataland=0, Extracted=0.0\n", - "Section 88: Dataland=0, Extracted=0.0\n", - "Section 89: Dataland=0, Extracted=0.0\n", - "Section 90: Dataland=100, Extracted=100.0\n", - "Section 91: Dataland=100, Extracted=100.0\n", - "Section 92: Dataland=0, Extracted=0.0\n", - "Section 93: Dataland=100, Extracted=100.0\n", - "Section 94: Dataland=100, Extracted=100.0\n", - "Section 95: Dataland=0, Extracted=0.0\n", - "Section 96: Dataland=0, Extracted=0.0\n", - "Section 97: Dataland=0, Extracted=0.0\n", - "Section 98: Dataland=0, Extracted=0.0\n", - "Section 99: Dataland=0, Extracted=0.0\n", - "Section 100: Dataland=0, Extracted=0.0\n", - "Section 101: Dataland=0, Extracted=0.0\n", - "Section 102: Dataland=0, Extracted=0.0\n", - "Section 103: Dataland=0, Extracted=0.0\n", - "Section 104: Dataland=0, Extracted=0.0\n", - "Section 105: Dataland=0, Extracted=0.0\n", - "Section 106: Dataland=0, 
Extracted=0.0\n", - "Section 107: Dataland=0, Extracted=0.0\n", - "Section 108: Dataland=0, Extracted=0.0\n", - "Section 109: Dataland=0, Extracted=0.0\n", - "Section 110: Dataland=0, Extracted=0.0\n", - "Section 111: Dataland=0, Extracted=0.0\n", - "Section 112: Dataland=0, Extracted=0.0\n", - "Section 113: Dataland=0, Extracted=0.0\n", - "Section 114: Dataland=7.82, Extracted=7.82\n", - "Section 115: Dataland=7.82, Extracted=7.82\n", - "Section 116: Dataland=0, Extracted=0.0\n", - "Section 117: Dataland=7.82, Extracted=7.82\n", - "Section 118: Dataland=7.82, Extracted=7.82\n", - "Section 119: Dataland=0, Extracted=0.0\n", - "Section 120: Dataland=0, Extracted=0.0\n", - "Section 121: Dataland=0, Extracted=0.0\n", - "Section 122: Dataland=0, Extracted=0.0\n", - "Section 123: Dataland=0, Extracted=0.0\n", - "Section 124: Dataland=0, Extracted=0.0\n", - "Section 125: Dataland=0, Extracted=0.0\n", - "Section 126: Dataland=0, Extracted=0.0\n", - "Section 127: Dataland=0, Extracted=0.0\n", - "Section 128: Dataland=0, Extracted=0.0\n", - "Section 129: Dataland=0, Extracted=0.0\n", - "Section 130: Dataland=0, Extracted=0.0\n", - "Section 131: Dataland=0, Extracted=0.0\n", - "Section 132: Dataland=0, Extracted=0.0\n", - "Section 133: Dataland=0, Extracted=0.0\n", - "Section 134: Dataland=0, Extracted=0.0\n", - "Section 135: Dataland=0, Extracted=0.0\n", - "Section 136: Dataland=0, Extracted=0.0\n", - "Section 137: Dataland=0, Extracted=0.0\n", - "Section 138: Dataland=7.82, Extracted=7.82\n", - "Section 139: Dataland=7.82, Extracted=7.82\n", - "Section 140: Dataland=0, Extracted=0.0\n", - "Section 141: Dataland=7.82, Extracted=7.82\n", - "Section 142: Dataland=7.82, Extracted=7.82\n", - "Section 143: Dataland=0, Extracted=0.0\n", - "Section 144: Dataland=0, Extracted=0.0\n", - "Section 145: Dataland=0, Extracted=0.0\n", - "Section 146: Dataland=0, Extracted=0.0\n", - "Section 147: Dataland=0, Extracted=0.0\n", - "Section 148: Dataland=0, Extracted=0.0\n", - "Section 149: 
Dataland=0, Extracted=0.0\n", - "Section 150: Dataland=4.17, Extracted=4.17\n", - "Section 151: Dataland=4.17, Extracted=4.17\n", - "Section 152: Dataland=0, Extracted=0.0\n", - "Section 153: Dataland=0, Extracted=0.0\n", - "Section 154: Dataland=0, Extracted=0.0\n", - "Section 155: Dataland=0, Extracted=0.0\n", - "Section 156: Dataland=0, Extracted=0.0\n", - "Section 157: Dataland=0, Extracted=0.0\n", - "Section 158: Dataland=4.17, Extracted=4.17\n", - "Section 159: Dataland=4.17, Extracted=4.17\n", - "Matching ratio: 100.00%\n" + "Company: Aktiebolaget Electrolux\n", + "Matching ratio: 0.00%\n" ] } ], @@ -439,7 +301,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.0" + "version": "3.12.6" } }, "nbformat": 4, diff --git a/src/dataland_qa_lab/dataland/data_provider.py b/src/dataland_qa_lab/dataland/data_provider.py index 3072317..e07916d 100644 --- a/src/dataland_qa_lab/dataland/data_provider.py +++ b/src/dataland_qa_lab/dataland/data_provider.py @@ -14,87 +14,126 @@ def get_yes_no_values_by_data(data: NuclearAndGasDataCollection) -> dict[str, YesNo | None]: """Get Yes/No values of the given dataset as a dictionary with section names as keys.""" - sections = data.yes_no_data_points + try: + sections = data.yes_no_data_points + + section_values = { + key: (data.datapoint.value if data and data.datapoint and data.datapoint.value is not None else None) + for key, data in sections.items() + } + except (AttributeError, KeyError, TypeError) as e: + msg = f"Error retrieving yes/no values: {e}" + raise AttributeError(msg) from e - section_values = { - key: (data.datapoint.value if data and data.datapoint and data.datapoint.value is not None else None) - for key, data in sections.items() - } return section_values def get_taxonomy_aligned_revenue_denominator_values_by_data(data: NuclearAndGasDataCollection) -> dict: """Retrieve taxonomy-aligned revenue denominator values from the dataset.""" denominator_values_dict = {} 
- denominator_values = data.taxonomy_aligned_denominator.get("taxonomy_aligned_revenue_denominator").datapoint.value - for field_name in NuclearAndGasAlignedDenominator.model_fields: - denominator_values_dict[field_name] = extract_field_data(denominator_values, field_name) + try: + denominator_values = data.taxonomy_aligned_denominator.get( + "taxonomy_aligned_revenue_denominator" + ).datapoint.value + for field_name in NuclearAndGasAlignedDenominator.model_fields: + denominator_values_dict[field_name] = extract_field_data(denominator_values, field_name) + except (AttributeError, KeyError, TypeError) as e: + msg = f"Error retrieving taxonomy-aligned revenue denominator: {e}" + raise AttributeError(msg) from e + return denominator_values_dict def get_taxonomy_aligned_capex_denominator_values_by_data(data: NuclearAndGasDataCollection) -> dict: """Retrieve taxonomy-aligned capex denominator values from the dataset.""" denominator_values_dict = {} - denominator_values = data.taxonomy_aligned_denominator.get("taxonomy_aligned_capex_denominator").datapoint.value - for field_name in NuclearAndGasAlignedDenominator.model_fields: - denominator_values_dict[field_name] = extract_field_data(denominator_values, field_name) + try: + denominator_values = data.taxonomy_aligned_denominator.get("taxonomy_aligned_capex_denominator").datapoint.value + for field_name in NuclearAndGasAlignedDenominator.model_fields: + denominator_values_dict[field_name] = extract_field_data(denominator_values, field_name) + except (AttributeError, KeyError, TypeError) as e: + msg = f"Error retrieving taxonomy-aligned capex denominator: {e}" + raise AttributeError(msg) from e return denominator_values_dict def get_taxonomy_aligned_revenue_numerator_values_by_data(data: NuclearAndGasDataCollection) -> dict: """Retrieve taxonomy-aligned revenue numerator values from the dataset.""" numerator_values_dict = {} - numerator_values = 
data.taxonomy_aligned_numerator.get("taxonomy_aligned_revenue_numerator").datapoint.value - for field_name in NuclearAndGasAlignedNumerator.model_fields: - numerator_values_dict[field_name] = extract_field_data(numerator_values, field_name) + try: + numerator_values = data.taxonomy_aligned_numerator.get("taxonomy_aligned_revenue_numerator").datapoint.value + for field_name in NuclearAndGasAlignedNumerator.model_fields: + numerator_values_dict[field_name] = extract_field_data(numerator_values, field_name) + except (AttributeError, KeyError, TypeError) as e: + msg = f"Error retrieving taxonomy-aligned revenue numerator: {e}" + raise AttributeError(msg) from e return numerator_values_dict def get_taxonomy_aligned_capex_numerator_values_by_data(data: NuclearAndGasDataCollection) -> dict: """Retrieve taxonomy-aligned capex numerator values from the dataset.""" numerator_values_dict = {} - numerator_values = data.taxonomy_aligned_numerator.get("taxonomy_aligned_capex_numerator").datapoint.value - for field_name in NuclearAndGasAlignedNumerator.model_fields: - numerator_values_dict[field_name] = extract_field_data(numerator_values, field_name) + try: + numerator_values = data.taxonomy_aligned_numerator.get("taxonomy_aligned_capex_numerator").datapoint.value + for field_name in NuclearAndGasAlignedNumerator.model_fields: + numerator_values_dict[field_name] = extract_field_data(numerator_values, field_name) + except (AttributeError, KeyError, TypeError) as e: + msg = f"Error retrieving taxonomy-aligned capex numerator: {e}" + raise AttributeError(msg) from e return numerator_values_dict def get_taxonomy_eligible_but_not_aligned_revenue_values_by_data(data: NuclearAndGasDataCollection) -> dict: """Retrieve taxonomy eligible but not aligned revenue numerator values from the dataset.""" eligible_but_not_aligned_dict = {} - eligible_values = data.taxonomy_eligble_but_not_aligned.get("taxonomy_not_aligned_revenue").datapoint.value - for field_name in 
NuclearAndGasEligibleButNotAligned.model_fields: - eligible_but_not_aligned_dict[field_name] = extract_field_data(eligible_values, field_name) + try: + eligible_values = data.taxonomy_eligble_but_not_aligned.get("taxonomy_not_aligned_revenue").datapoint.value + for field_name in NuclearAndGasEligibleButNotAligned.model_fields: + eligible_but_not_aligned_dict[field_name] = extract_field_data(eligible_values, field_name) + except (AttributeError, KeyError, TypeError) as e: + msg = f"Error retrieving taxonomy eligible but not aligned revenue: {e}" + raise AttributeError(msg) from e return eligible_but_not_aligned_dict def get_taxonomy_eligible_but_not_aligned_capex_values_by_data(data: NuclearAndGasDataCollection) -> dict: """Retrieve taxonomy eligible but not aligned capex from the dataset.""" eligible_but_not_aligned_dict = {} - eligible_values = data.taxonomy_eligble_but_not_aligned.get("taxonomy_not_aligned_capex").datapoint.value - for field_name in NuclearAndGasEligibleButNotAligned.model_fields: - eligible_but_not_aligned_dict[field_name] = extract_field_data(eligible_values, field_name) + try: + eligible_values = data.taxonomy_eligble_but_not_aligned.get("taxonomy_not_aligned_capex").datapoint.value + for field_name in NuclearAndGasEligibleButNotAligned.model_fields: + eligible_but_not_aligned_dict[field_name] = extract_field_data(eligible_values, field_name) + except (AttributeError, KeyError, TypeError) as e: + msg = f"Error retrieving taxonomy eligible but not aligned capex: {e}" + raise AttributeError(msg) from e return eligible_but_not_aligned_dict def get_taxonomy_non_eligible_revenue_values_by_data(data: NuclearAndGasDataCollection) -> dict: - """Retrieve taxonomy non eligible revenue numerator values from the dataset.""" + """Retrieve taxonomy non-eligible revenue numerator values from the dataset.""" non_eligible_dict = {} - non_eligible_values = data.taxonomy_non_eligible.get("taxonomy_non_eligible_revenue").datapoint.value - for field_name in 
NuclearAndGasNonEligible.model_fields: - value = getattr(non_eligible_values, field_name, None) - non_eligible_dict[field_name] = -1 if value is None else value - + try: + non_eligible_values = data.taxonomy_non_eligible.get("taxonomy_non_eligible_revenue").datapoint.value + for field_name in NuclearAndGasNonEligible.model_fields: + value = getattr(non_eligible_values, field_name, None) + non_eligible_dict[field_name] = -1 if value is None else value + except (AttributeError, KeyError, TypeError) as e: + msg = f"Error retrieving taxonomy non-eligible revenue: {e}" + raise AttributeError(msg) from e return non_eligible_dict def get_taxonomy_non_eligible_capex_values_by_data(data: NuclearAndGasDataCollection) -> dict: - """Retrieve taxonomy non eligible capex numerator values from the dataset.""" + """Retrieve taxonomy non-eligible capex numerator values from the dataset.""" non_eligible_dict = {} - non_eligible_values = data.taxonomy_non_eligible.get("taxonomy_non_eligible_capex").datapoint.value - for field_name in NuclearAndGasNonEligible.model_fields: - value = getattr(non_eligible_values, field_name, None) - non_eligible_dict[field_name] = -1 if value is None else value + try: + non_eligible_values = data.taxonomy_non_eligible.get("taxonomy_non_eligible_capex").datapoint.value + for field_name in NuclearAndGasNonEligible.model_fields: + value = getattr(non_eligible_values, field_name, None) + non_eligible_dict[field_name] = -1 if value is None else value + except (AttributeError, KeyError, TypeError) as e: + msg = f"Error retrieving taxonomy non-eligible capex: {e}" + raise AttributeError(msg) from e return non_eligible_dict @@ -144,7 +183,6 @@ def get_datasources_of_nuclear_and_gas_numeric_values( section_list = { key: data_source for section in sections.values() for key, data_source in extract_data_source(section).items() } - return section_list diff --git a/src/dataland_qa_lab/dataland/unreviewed_datasets.py 
b/src/dataland_qa_lab/dataland/unreviewed_datasets.py index a271aca..c8e9e13 100644 --- a/src/dataland_qa_lab/dataland/unreviewed_datasets.py +++ b/src/dataland_qa_lab/dataland/unreviewed_datasets.py @@ -15,7 +15,10 @@ def __init__(self) -> None: """Initialize the unreviewed datasets with the data from the API.""" client = config.get_config().dataland_client logger.info(msg="Initializing the unreviewed Datasets with the data from Dataland.") - + if client is None: + logger.exception("Client Setup failed in the configuration.") + msg = "Client Setup failed in the configuration." + raise ValueError(msg) try: number_of_datasets = client.qa_api.get_number_of_pending_datasets() if number_of_datasets is None or number_of_datasets < 0: @@ -29,6 +32,9 @@ def __init__(self) -> None: self.list_of_data_ids = [dataset.data_id for dataset in self.datasets] + except RuntimeError: + logger.exception("Timeout occurred while initializing the unreviewed datasets.") + raise except Exception: logger.exception(msg="An error occurred", exc_info=Exception) raise diff --git a/src/dataland_qa_lab/pages/pages_provider.py b/src/dataland_qa_lab/pages/pages_provider.py index a0433ff..447c87b 100644 --- a/src/dataland_qa_lab/pages/pages_provider.py +++ b/src/dataland_qa_lab/pages/pages_provider.py @@ -16,14 +16,16 @@ def get_relevant_page_numbers(dataset: NuclearAndGasDataCollection) -> list[int] return sorted(set(yes_no_pages + numeric_pages)) -def get_relevant_pages_of_pdf(dataset: NuclearAndGasDataCollection) -> pypdf.PdfReader: +def get_relevant_pages_of_pdf(dataset: NuclearAndGasDataCollection) -> pypdf.PdfReader | None: """Get page numbers of relevant data.""" dataland_client = config.get_config().dataland_client page_numbers = get_relevant_page_numbers(dataset=dataset) - file_reference = dataset.yes_no_data_points.get( - "nuclear_energy_related_activities_section426" - ).datapoint.data_source.file_reference + try: + datapoint = 
dataset.yes_no_data_points.get("nuclear_energy_related_activities_section426").datapoint + file_reference = datapoint.data_source.file_reference + except AttributeError: + return None full_pdf = dataland_client.documents_api.get_document(file_reference) full_pdf_stream = io.BytesIO(full_pdf) diff --git a/src/dataland_qa_lab/pages/text_to_doc_intelligence.py b/src/dataland_qa_lab/pages/text_to_doc_intelligence.py index 51503b7..ae683a7 100644 --- a/src/dataland_qa_lab/pages/text_to_doc_intelligence.py +++ b/src/dataland_qa_lab/pages/text_to_doc_intelligence.py @@ -43,6 +43,9 @@ def get_markdown_from_dataset(data_id: str, relevant_pages_pdf_reader: pypdf.Pdf else: readable_text = extract_text_of_pdf(relevant_pages_pdf_reader) + if readable_text is None: + return None + new_document = ReviewedDatasetMarkdowns( data_id=data_id, markdown_text=readable_text, diff --git a/src/dataland_qa_lab/prompting_services/prompting_service.py b/src/dataland_qa_lab/prompting_services/prompting_service.py index a1d4b8d..019cd5f 100644 --- a/src/dataland_qa_lab/prompting_services/prompting_service.py +++ b/src/dataland_qa_lab/prompting_services/prompting_service.py @@ -21,7 +21,7 @@ def create_main_prompt(template: int, pdf: str, kpi: str) -> str: "Taxonomy-aligned economic activities (denominator)", give me the percentage of "CCM+CCA", "CCM" and "CCA" for all rows. Focus on the row numbers on the left side of the table. - If you can't find the percentage value, write "0". + If you can't find the percentage value, write "-1". Consider translating for this given task like Meldebogen instead of template. # Relevant Documents {pdf} @@ -31,7 +31,7 @@ def create_main_prompt(template: int, pdf: str, kpi: str) -> str: "Taxonomy-aligned economic activities (numerator)", give me the percentage of "CCM+CCA", "CCM" and "CCA" for all rows. Focus on the row numbers on the left side of the table. - If you can't find the percentage value, write "0". 
+ If you can't find the percentage value, write "-1". Consider translating for this given task like Meldebogen instead of template. # Relevant Documents {pdf} @@ -41,7 +41,7 @@ def create_main_prompt(template: int, pdf: str, kpi: str) -> str: "Taxonomy-eligible but not taxonomy-aligned economic activities", give me the percentage of "CCM+CCA", "CCM" and "CCA" for all rows. Focus on the row numbers on the left side of the table. - If you can't find the percentage value, write "0". + If you can't find the percentage value, write "-1". Consider translating for this given task like Meldebogen instead of template. # Relevant Documents {pdf} @@ -51,7 +51,7 @@ def create_main_prompt(template: int, pdf: str, kpi: str) -> str: "Taxonomy non-eligible economic activities", give me the percentage for all rows. Focus on the row numbers on the left side of the table. - If you can't find the percentage value, write "0". + If you can't find the percentage value, write "-1". Consider translating for this given task like Meldebogen instead of template. 
# Relevant Documents {pdf} diff --git a/src/dataland_qa_lab/review/dataset_reviewer.py b/src/dataland_qa_lab/review/dataset_reviewer.py index 25adef8..fa7411d 100644 --- a/src/dataland_qa_lab/review/dataset_reviewer.py +++ b/src/dataland_qa_lab/review/dataset_reviewer.py @@ -45,15 +45,22 @@ def review_dataset(data_id: str, single_pass_e2e: bool = False) -> QaReportMetaI logger.debug("Relevant page numbers extracted.") relevant_pages_pdf_reader = pages_provider.get_relevant_pages_of_pdf(data_collection) - logger.debug("Relevant pages extracted.") + if relevant_pages_pdf_reader is None: + logger.debug("No Data source found for the relevant pages.") + report = NuclearAndGasReportGenerator().generate_report(relevant_pages=None, dataset=data_collection) + logger.info("QA not attempted report generated successfully.") - readable_text = text_to_doc_intelligence.get_markdown_from_dataset( - data_id=data_id, page_numbers=page_numbers, relevant_pages_pdf_reader=relevant_pages_pdf_reader - ) - logger.debug("Text extracted from the relevant pages.") - - report = NuclearAndGasReportGenerator().generate_report(relevant_pages=readable_text, dataset=data_collection) - logger.info("Report generated succesfully.") + else: + logger.debug("Relevant pages extracted.") + readable_text = text_to_doc_intelligence.get_markdown_from_dataset( + data_id=data_id, page_numbers=page_numbers, relevant_pages_pdf_reader=relevant_pages_pdf_reader + ) + logger.debug("Text extracted from the relevant pages.") + + report = NuclearAndGasReportGenerator().generate_report( + relevant_pages=readable_text, dataset=data_collection + ) + logger.info("Report generated succesfully.") data = config.get_config().dataland_client.eu_taxonomy_nuclear_gas_qa_api.post_nuclear_and_gas_data_qa_report( data_id=data_id, nuclear_and_gas_data=report diff --git a/src/dataland_qa_lab/review/generate_gpt_request.py b/src/dataland_qa_lab/review/generate_gpt_request.py index 0283413..535df9f 100644 --- 
a/src/dataland_qa_lab/review/generate_gpt_request.py +++ b/src/dataland_qa_lab/review/generate_gpt_request.py @@ -21,38 +21,65 @@ def generate_gpt_request(mainprompt: str, subprompt: str) -> list: Returns: List[str]: A list of extracted values from the GPT response. + + Raises: + ValueError: For any issues encountered during the process. """ - conf = config.get_config() - - client = AzureOpenAI( - api_key=conf.azure_openai_api_key, - api_version="2024-07-01-preview", - azure_endpoint=conf.azure_openai_endpoint, - ) - updated_openai_response = client.chat.completions.create( - model="gpt-4o", - temperature=0, - messages=[ - {"role": "system", "content": mainprompt}, - ], - tool_choice="required", - tools=[ - { - "type": "function", - "function": { - "name": "requested_information_precisely_found_in_relevant_documents", - "description": "Submit the requested information. " - "Use this function when the information is precisely stated in the relevant documents.", - "parameters": subprompt, - }, - } - ], - ) - if updated_openai_response.choices[0].message.tool_calls: - tool_call = updated_openai_response.choices[0].message.tool_calls[0].function - else: - msg_p = "No tool calls found in the GPT response." 
- logger.exception(msg=msg_p, exc_info=ValueError) - raise ValueError(msg_p) - data_dict = ast.literal_eval(tool_call.arguments) - return list(data_dict.values()) + try: + try: + conf = config.get_config() + except Exception as e: + msg = f"Error loading configuration in Gpt_request generator: {e}" + raise ValueError(msg) from e + + # Initialize Azure OpenAI client + try: + client = AzureOpenAI( + api_key=conf.azure_openai_api_key, + api_version="2024-07-01-preview", + azure_endpoint=conf.azure_openai_endpoint, + ) + except Exception as e: + msg = f"Error initializing AzureOpenAI client: {e}" + raise ValueError(msg) from e + + # Create GPT request + try: + updated_openai_response = client.chat.completions.create( + model="gpt-4o", + temperature=0, + messages=[ + {"role": "system", "content": mainprompt}, + ], + tool_choice="required", + tools=[ + { + "type": "function", + "function": { + "name": "requested_information_precisely_found_in_relevant_documents", + "description": "Submit the requested information. 
" + "Use this function when the information is precisely stated in the relevant documents.", + "parameters": subprompt, + }, + } + ], + ) + except Exception as e: + msg = f"Error during GPT request creation: {e}" + raise ValueError(msg) from e + + try: + if updated_openai_response.choices[0].message.tool_calls: + tool_call = updated_openai_response.choices[0].message.tool_calls[0].function + except Exception as e: + msg = f"Error extracting tool calls: {e}" + raise ValueError(e) from e + + data_dict = ast.literal_eval(tool_call.arguments) + + return list(data_dict.values()) + + except (ValueError, KeyError, TypeError) as general_error: + # General error handling + msg = f"An unexpected error occurred: {general_error}" + raise ValueError(msg) from general_error diff --git a/src/dataland_qa_lab/review/numeric_value_generator.py b/src/dataland_qa_lab/review/numeric_value_generator.py index c10d3aa..ae69705 100644 --- a/src/dataland_qa_lab/review/numeric_value_generator.py +++ b/src/dataland_qa_lab/review/numeric_value_generator.py @@ -1,4 +1,4 @@ -from azure.ai.documentintelligence.models import AnalyzeResult +import re from dataland_qa_lab.prompting_services import prompting_service from dataland_qa_lab.review import generate_gpt_request @@ -7,58 +7,76 @@ class NumericValueGenerator: """Extracts and stores all values of template 2 to 5 and compares them to the values in dataland.""" + TEMPLATE_ID_5 = 5 + + @staticmethod + def get_taxonomy_aligned_denominator(readable_text: str, kpi: str) -> list: + """Extracts information from template 2 using Azure OpenAI and returns a list of results.""" + return NumericValueGenerator.extract_values_from_template(2, readable_text, kpi) + + @staticmethod + def get_taxonomy_aligned_numerator(readable_text: str, kpi: str) -> list: + """Extracts information from template 3 using Azure OpenAI and returns a list of results.""" + return NumericValueGenerator.extract_values_from_template(3, readable_text, kpi) + + @staticmethod + def 
get_taxonomy_eligible_not_alligned(readable_text: str, kpi: str) -> list: + """Extracts information from template 4 using Azure OpenAI and returns a list of results.""" + return NumericValueGenerator.extract_values_from_template(4, readable_text, kpi) + @staticmethod - def get_taxonomy_alligned_denominator(readable_text: AnalyzeResult, kpi: str) -> list: - """Extracts information from template 2 using Azure OpenAI and returns a list of results. - - Returns: - list: A list including the etracted values of template 2 - """ - dominator_values = generate_gpt_request.GenerateGptRequest.generate_gpt_request( - prompting_service.PromptingService.create_main_prompt(2, readable_text, kpi), - prompting_service.PromptingService.create_sub_prompt_template2to4(kpi), - ) - float_results = [float(value) for value in dominator_values] - return float_results + def get_taxonomy_non_eligible(readable_text: str, kpi: str) -> list: + """Extracts information from template 5 using Azure OpenAI and returns a list of results.""" + return NumericValueGenerator.extract_values_from_template(5, readable_text, kpi) @staticmethod - def get_taxonomy_alligned_numerator(readable_text: AnalyzeResult, kpi: str) -> list: - """Extracts information from template 3 using Azure OpenAI and returns a list of results. - - Returns: - list: A list including the etracted values of template 3. 
- """ - numerator_values = generate_gpt_request.GenerateGptRequest.generate_gpt_request( - prompting_service.PromptingService.create_main_prompt(3, readable_text, kpi), - prompting_service.PromptingService.create_sub_prompt_template2to4(kpi), - ) - float_results = [float(value) for value in numerator_values] - return float_results + def extract_values_from_template(template_id: int, readable_text: str, kpi: str) -> list: + """Generic method to extract values from a given template using Azure OpenAI.""" + try: + prompt_method = ( + prompting_service.PromptingService.create_sub_prompt_template5 + if template_id == NumericValueGenerator.TEMPLATE_ID_5 + else prompting_service.PromptingService.create_sub_prompt_template2to4 + ) + + values = generate_gpt_request.GenerateGptRequest.generate_gpt_request( + prompting_service.PromptingService.create_main_prompt(template_id, readable_text, kpi), + prompt_method(kpi), + ) + + if not values: + msg = f"No results returned from GPT for template {template_id} values." + NumericValueGenerator.throw_error(msg) + + return NumericValueGenerator.convert_to_float(values, template_id) + except ValueError as e: + msg = f"Error extracting values from template {template_id}: {e}" + raise ValueError(msg) from e @staticmethod - def get_taxonomy_eligible_not_alligned(readable_text: AnalyzeResult, kpi: str) -> list: - """Extracts information from template 4 using Azure OpenAI and returns a list of results. - - Returns: - list: A list including the etracted values of template 4. 
- """ - eligible_values = generate_gpt_request.GenerateGptRequest.generate_gpt_request( - prompting_service.PromptingService.create_main_prompt(4, readable_text, kpi), - prompting_service.PromptingService.create_sub_prompt_template2to4(kpi), - ) - float_results = [float(value) for value in eligible_values] - return float_results + def throw_error(msg: str) -> ValueError: + """Raises a ValueError with the given message.""" + raise ValueError(msg) @staticmethod - def get_taxonomy_non_eligible(readable_text: AnalyzeResult, kpi: str) -> list: - """Extracts information from template 5 using Azure OpenAI and returns a list of results. - - Returns: - list: A list including the extracted values of template 5. - """ - non_eligible_values = generate_gpt_request.GenerateGptRequest.generate_gpt_request( - prompting_service.PromptingService.create_main_prompt(5, readable_text, kpi), - prompting_service.PromptingService.create_sub_prompt_template5(kpi), - ) - float_results = [float(value) for value in non_eligible_values] - return float_results + def convert_to_float(values: list, template_id: int) -> list: + """Converts extracted values to floats.""" + try: + return [NumericValueGenerator.extract_number(value) for value in values] + except Exception as e: + msg = f"Unexpected error during float conversion for template {template_id}: {e}" + raise ValueError(msg) from e + + @staticmethod + def extract_number(value: str) -> float: + """Extracts the first numeric part from a string and converts it to a float.""" + if isinstance(value, float | int): # Directly return if it's already numeric + return float(value) + + # Safe regex: Match optional negative sign, then digits, optional dot, and more digits + match = re.search(r"-?\d+(?:\.\d+)?", value) + if match: + return float(match.group(0)) # Convert directly to float + + msg = f"Could not extract a valid number from '{value}'" + raise ValueError(msg) diff --git 
a/src/dataland_qa_lab/review/report_generator/denominator_report_generator.py b/src/dataland_qa_lab/review/report_generator/denominator_report_generator.py index 67e78e1..0ae32b1 100644 --- a/src/dataland_qa_lab/review/report_generator/denominator_report_generator.py +++ b/src/dataland_qa_lab/review/report_generator/denominator_report_generator.py @@ -1,4 +1,3 @@ -from azure.ai.documentintelligence.models import AnalyzeResult from dataland_qa.models.extended_data_point_nuclear_and_gas_aligned_denominator import ( ExtendedDataPointNuclearAndGasAlignedDenominator, ) @@ -19,7 +18,7 @@ def build_taxonomy_aligned_denominator_report( - dataset: NuclearAndGasDataCollection, relevant_pages: AnalyzeResult + dataset: NuclearAndGasDataCollection, relevant_pages: str ) -> NuclearAndGasGeneralTaxonomyAlignedDenominator: """Create a report frame for the Nuclear and Gas General Taxonomy Aligned Denominator.""" return NuclearAndGasGeneralTaxonomyAlignedDenominator( @@ -31,11 +30,19 @@ def build_taxonomy_aligned_denominator_report( def build_denominator_report_frame( - dataset: NuclearAndGasDataCollection, relevant_pages: AnalyzeResult, kpi: str + dataset: NuclearAndGasDataCollection, relevant_pages: str, kpi: str ) -> QaReportDataPointExtendedDataPointNuclearAndGasAlignedDenominator: """Build a report frame for a specific KPI denominator (Revenue or CapEx).""" - prompted_values = NumericValueGenerator.get_taxonomy_alligned_denominator(relevant_pages, kpi) - dataland_values = get_dataland_values(dataset, kpi) + if relevant_pages is None: + return create_not_attempted_report("No relevant pages found") + try: + prompted_values = NumericValueGenerator.get_taxonomy_aligned_denominator(relevant_pages, kpi) + except ValueError: + return create_not_attempted_report("Error retrieving prompted values for template 2") + try: + dataland_values = get_dataland_values(dataset, kpi) + except RuntimeError: + return create_not_attempted_report("Error retrieving dataland values for template 2") 
corrected_values, verdict, comment, quality = comparator.compare_values_template_2to4( prompted_values, dataland_values, NuclearAndGasAlignedDenominator @@ -58,12 +65,27 @@ def build_denominator_report_frame( ) +def create_not_attempted_report( + error_message: str, +) -> QaReportDataPointExtendedDataPointNuclearAndGasAlignedDenominator: + """Create a not attempted report frame for the Nuclear and Gas General Taxonomy Aligned Denominator.""" + return QaReportDataPointExtendedDataPointNuclearAndGasAlignedDenominator( + comment=error_message, + verdict=QaReportDataPointVerdict.QANOTATTEMPTED, + correctedData=ExtendedDataPointNuclearAndGasAlignedDenominator(), + ) + + def get_dataland_values(dataset: NuclearAndGasDataCollection, kpi: str) -> dict: """Retrieve dataland denominator values based on KPI.""" - if kpi == "Revenue": - data = data_provider.get_taxonomy_aligned_revenue_denominator_values_by_data(dataset) - else: - data = data_provider.get_taxonomy_aligned_capex_denominator_values_by_data(dataset) + try: + if kpi == "Revenue": + data = data_provider.get_taxonomy_aligned_revenue_denominator_values_by_data(dataset) + else: + data = data_provider.get_taxonomy_aligned_capex_denominator_values_by_data(dataset) + except Exception as e: + msg = f"Error retrieving dataland values for {kpi}: {e}" + raise RuntimeError(msg) from e return data diff --git a/src/dataland_qa_lab/review/report_generator/eligible_not_aligned_report_generator.py b/src/dataland_qa_lab/review/report_generator/eligible_not_aligned_report_generator.py index efb9f6e..e35edfa 100644 --- a/src/dataland_qa_lab/review/report_generator/eligible_not_aligned_report_generator.py +++ b/src/dataland_qa_lab/review/report_generator/eligible_not_aligned_report_generator.py @@ -1,4 +1,3 @@ -from azure.ai.documentintelligence.models import AnalyzeResult from dataland_qa.models.extended_data_point_nuclear_and_gas_eligible_but_not_aligned import ( ExtendedDataPointNuclearAndGasEligibleButNotAligned, ) @@ -19,7 +18,7 
@@ def build_taxonomy_eligible_but_not_aligned_report( - dataset: NuclearAndGasDataCollection, relevant_pages: AnalyzeResult + dataset: NuclearAndGasDataCollection, relevant_pages: str ) -> NuclearAndGasGeneralTaxonomyEligibleButNotAligned: """Create Report Frame for the Nuclear and Gas General Taxonomy eligible but not alinged data.""" return NuclearAndGasGeneralTaxonomyEligibleButNotAligned( @@ -33,12 +32,19 @@ def build_taxonomy_eligible_but_not_aligned_report( def build_eligible_but_not_aligned_frame( - dataset: NuclearAndGasDataCollection, relevant_pages: AnalyzeResult, kpi: str + dataset: NuclearAndGasDataCollection, relevant_pages: str, kpi: str ) -> QaReportDataPointExtendedDataPointNuclearAndGasEligibleButNotAligned: """Build a report frame for a specific KPI (Revenue or CapEx).""" - prompted_values = NumericValueGenerator.get_taxonomy_eligible_not_alligned(relevant_pages, kpi) - dataland_values = get_dataland_values(dataset, kpi) - + if relevant_pages is None: + return create_not_attempted_report("No relevant pages found") + try: + prompted_values = NumericValueGenerator.get_taxonomy_eligible_not_alligned(relevant_pages, kpi) + except ValueError: + return create_not_attempted_report("Error retrieving prompted values for template 4") + try: + dataland_values = get_dataland_values(dataset, kpi) + except RuntimeError: + return create_not_attempted_report("Error retrieving dataland values for template 4") corrected_values, verdict, comment, quality = comparator.compare_values_template_2to4( prompted_values, dataland_values, NuclearAndGasEligibleButNotAligned ) @@ -57,13 +63,27 @@ def build_eligible_but_not_aligned_frame( ) +def create_not_attempted_report( + error_message: str, +) -> QaReportDataPointExtendedDataPointNuclearAndGasEligibleButNotAligned: + """Create a not attempted report for the Nuclear and Gas General Taxonomy eligible but not aligned Denominator.""" + return QaReportDataPointExtendedDataPointNuclearAndGasEligibleButNotAligned( + 
comment=error_message, + verdict=QaReportDataPointVerdict.QANOTATTEMPTED, + correctedData=ExtendedDataPointNuclearAndGasEligibleButNotAligned(), + ) + + def get_dataland_values(dataset: NuclearAndGasDataCollection, kpi: str) -> dict: """Retrieve dataland Eligible but not aligned values based on KPI.""" - if kpi == "Revenue": - data = data_provider.get_taxonomy_eligible_but_not_aligned_revenue_values_by_data(dataset) - else: - data = data_provider.get_taxonomy_eligible_but_not_aligned_capex_values_by_data(dataset) - + try: + if kpi == "Revenue": + data = data_provider.get_taxonomy_eligible_but_not_aligned_revenue_values_by_data(dataset) + else: + data = data_provider.get_taxonomy_eligible_but_not_aligned_capex_values_by_data(dataset) + except Exception as e: + msg = f"Error retrieving dataland values for {kpi}: {e}" + raise RuntimeError(msg) from e return data diff --git a/src/dataland_qa_lab/review/report_generator/non_eligible_report_generator.py b/src/dataland_qa_lab/review/report_generator/non_eligible_report_generator.py index 8308f9b..925a82a 100644 --- a/src/dataland_qa_lab/review/report_generator/non_eligible_report_generator.py +++ b/src/dataland_qa_lab/review/report_generator/non_eligible_report_generator.py @@ -1,4 +1,3 @@ -from azure.ai.documentintelligence.models import AnalyzeResult from dataland_qa.models.extended_data_point_nuclear_and_gas_non_eligible import ( ExtendedDataPointNuclearAndGasNonEligible, ) @@ -16,7 +15,7 @@ def build_taxonomy_non_eligible_report( - dataset: NuclearAndGasDataCollection, relevant_pages: AnalyzeResult + dataset: NuclearAndGasDataCollection, relevant_pages: str ) -> NuclearAndGasGeneralTaxonomyNonEligible: """Create Report Frame for the Nuclear and Gas General Taxonomy Non Eligible.""" return NuclearAndGasGeneralTaxonomyNonEligible( @@ -26,11 +25,19 @@ def build_taxonomy_non_eligible_report( def build_non_eligible_report_frame( - dataset: NuclearAndGasDataCollection, relevant_pages: AnalyzeResult, kpi: str + dataset: 
NuclearAndGasDataCollection, relevant_pages: str, kpi: str ) -> QaReportDataPointExtendedDataPointNuclearAndGasNonEligible: """Build report frame for the revenue non_eligible.""" - prompted_values = NumericValueGenerator.get_taxonomy_non_eligible(relevant_pages, kpi) - dataland_values = get_dataland_values(dataset, kpi) + if relevant_pages is None: + return create_not_attempted_report("No relevant pages found") + try: + prompted_values = NumericValueGenerator.get_taxonomy_non_eligible(relevant_pages, kpi) + except ValueError: + return create_not_attempted_report("Error retrieving prompted values for template 5") + try: + dataland_values = get_dataland_values(dataset, kpi) + except RuntimeError: + return create_not_attempted_report("Error retrieving dataland values for template 5") value, verdict, comment, quality = comparator.compare_non_eligible_values(prompted_values, dataland_values) if verdict == QaReportDataPointVerdict.QAACCEPTED: @@ -47,13 +54,25 @@ def build_non_eligible_report_frame( ) +def create_not_attempted_report(error_message: str) -> QaReportDataPointExtendedDataPointNuclearAndGasNonEligible: + """Create a not attempted report frame for the Nuclear and Gas General Non Eligible.""" + return QaReportDataPointExtendedDataPointNuclearAndGasNonEligible( + comment=error_message, + verdict=QaReportDataPointVerdict.QANOTATTEMPTED, + correctedData=ExtendedDataPointNuclearAndGasNonEligible(), + ) + + def get_dataland_values(dataset: NuclearAndGasDataCollection, kpi: str) -> dict: """Retrieve dataland non_eligible values based on KPI.""" - if kpi == "Revenue": - data = data_provider.get_taxonomy_non_eligible_revenue_values_by_data(dataset) - else: - data = data_provider.get_taxonomy_non_eligible_capex_values_by_data(dataset) - + try: + if kpi == "Revenue": + data = data_provider.get_taxonomy_non_eligible_revenue_values_by_data(dataset) + else: + data = data_provider.get_taxonomy_non_eligible_capex_values_by_data(dataset) + except Exception as e: + msg = 
f"Error retrieving dataland values for {kpi}: {e}" + raise RuntimeError(msg) from e return data diff --git a/src/dataland_qa_lab/review/report_generator/nuclear_and_gas_report_generator.py b/src/dataland_qa_lab/review/report_generator/nuclear_and_gas_report_generator.py index 70c49cc..65bb5c5 100644 --- a/src/dataland_qa_lab/review/report_generator/nuclear_and_gas_report_generator.py +++ b/src/dataland_qa_lab/review/report_generator/nuclear_and_gas_report_generator.py @@ -1,4 +1,3 @@ -from azure.ai.documentintelligence.models import AnalyzeResult from dataland_qa.models import NuclearAndGasGeneral, NuclearAndGasGeneralGeneral from dataland_qa.models.nuclear_and_gas_data import NuclearAndGasData @@ -16,12 +15,13 @@ class NuclearAndGasReportGenerator(ReportGenerator): """Generate a quality assurance report.""" - relevant_pages: AnalyzeResult + relevant_pages: str report: NuclearAndGasData - def generate_report(self, relevant_pages: AnalyzeResult, dataset: NuclearAndGasDataCollection) -> NuclearAndGasData: + def generate_report(self, relevant_pages: str | None, dataset: NuclearAndGasDataCollection) -> NuclearAndGasData: """Assemble the QA Report based on the corrected values from Azure.""" # Initialize report and relevant pages + self.relevant_pages = relevant_pages self.report = NuclearAndGasData(general=NuclearAndGasGeneral(general=NuclearAndGasGeneralGeneral())) diff --git a/src/dataland_qa_lab/review/report_generator/numerator_report_generator.py b/src/dataland_qa_lab/review/report_generator/numerator_report_generator.py index c2c4150..783a17c 100644 --- a/src/dataland_qa_lab/review/report_generator/numerator_report_generator.py +++ b/src/dataland_qa_lab/review/report_generator/numerator_report_generator.py @@ -1,4 +1,3 @@ -from azure.ai.documentintelligence.models import AnalyzeResult from dataland_qa.models.extended_data_point_nuclear_and_gas_aligned_numerator import ( ExtendedDataPointNuclearAndGasAlignedNumerator, ) @@ -19,7 +18,7 @@ def 
build_taxonomy_aligned_numerator_report( - dataset: NuclearAndGasDataCollection, relevant_pages: AnalyzeResult + dataset: NuclearAndGasDataCollection, relevant_pages: str ) -> NuclearAndGasGeneralTaxonomyAlignedNumerator: """Create Report Frame for the Nuclear and Gas General Taxonomy Aligned Numerator.""" return NuclearAndGasGeneralTaxonomyAlignedNumerator( @@ -29,12 +28,19 @@ def build_taxonomy_aligned_numerator_report( def build_numerator_report_frame( - dataset: NuclearAndGasDataCollection, relevant_pages: AnalyzeResult, kpi: str + dataset: NuclearAndGasDataCollection, relevant_pages: str, kpi: str ) -> QaReportDataPointExtendedDataPointNuclearAndGasAlignedNumerator: """Build a report frame for a specific KPI numerator (Revenue or CapEx).""" - prompted_values = NumericValueGenerator.get_taxonomy_alligned_numerator(relevant_pages, kpi) - dataland_values = get_dataland_values(dataset, kpi) - + if relevant_pages is None: + return create_not_attempted_report("No relevant pages found") + try: + prompted_values = NumericValueGenerator.get_taxonomy_aligned_numerator(relevant_pages, kpi) + except ValueError: + return create_not_attempted_report("Error retrieving prompted values for template 3") + try: + dataland_values = get_dataland_values(dataset, kpi) + except RuntimeError: + return create_not_attempted_report("Error retrieving dataland values for template 3") corrected_values, verdict, comment, quality = comparator.compare_values_template_2to4( prompted_values, dataland_values, NuclearAndGasAlignedNumerator ) @@ -53,12 +59,25 @@ def build_numerator_report_frame( ) +def create_not_attempted_report(error_message: str) -> QaReportDataPointExtendedDataPointNuclearAndGasAlignedNumerator: + """Create a not attempted report frame for the Nuclear and Gas General Taxonomy Aligned Numerator.""" + return QaReportDataPointExtendedDataPointNuclearAndGasAlignedNumerator( + comment=error_message, + verdict=QaReportDataPointVerdict.QANOTATTEMPTED, + 
correctedData=ExtendedDataPointNuclearAndGasAlignedNumerator(), + ) + + def get_dataland_values(dataset: NuclearAndGasDataCollection, kpi: str) -> dict: """Retrieve dataland numerator values based on KPI.""" - if kpi == "Revenue": - data = data_provider.get_taxonomy_aligned_revenue_numerator_values_by_data(dataset) - else: - data = data_provider.get_taxonomy_aligned_capex_numerator_values_by_data(dataset) + try: + if kpi == "Revenue": + data = data_provider.get_taxonomy_aligned_revenue_numerator_values_by_data(dataset) + else: + data = data_provider.get_taxonomy_aligned_capex_numerator_values_by_data(dataset) + except Exception as e: + msg = f"Error retrieving dataland values for {kpi}: {e}" + raise RuntimeError(msg) from e return data diff --git a/src/dataland_qa_lab/review/report_generator/yes_no_report_generator.py b/src/dataland_qa_lab/review/report_generator/yes_no_report_generator.py index 5ab1ecd..6aab195 100644 --- a/src/dataland_qa_lab/review/report_generator/yes_no_report_generator.py +++ b/src/dataland_qa_lab/review/report_generator/yes_no_report_generator.py @@ -1,5 +1,9 @@ -from azure.ai.documentintelligence.models import AnalyzeResult +from dataland_qa.models.extended_data_point_yes_no import ExtendedDataPointYesNo from dataland_qa.models.nuclear_and_gas_general_general import NuclearAndGasGeneralGeneral +from dataland_qa.models.qa_report_data_point_extended_data_point_yes_no import ( + QaReportDataPointExtendedDataPointYesNo, +) +from dataland_qa.models.qa_report_data_point_verdict import QaReportDataPointVerdict from dataland_qa_lab.dataland import data_provider from dataland_qa_lab.review import yes_no_value_generator @@ -8,14 +12,44 @@ def build_yes_no_report( - dataset: NuclearAndGasDataCollection, relevant_pages: AnalyzeResult + dataset: NuclearAndGasDataCollection, relevant_pages: str | None ) -> NuclearAndGasGeneralGeneral: """Create yes no report.""" report = NuclearAndGasGeneralGeneral() - yes_no_values = 
yes_no_value_generator.get_yes_no_values_from_report(relevant_pages) - yes_no_values_from_dataland = data_provider.get_yes_no_values_by_data(data=dataset) - data_sources = data_provider.get_datasources_of_nuclear_and_gas_yes_no_questions(data=dataset) - yes_no_data_points = comparator.compare_yes_no_values(yes_no_values, yes_no_values_from_dataland, data_sources) - for key, value in yes_no_data_points.items(): - setattr(report, key, value) + if relevant_pages is None: + create_not_attempted_report(report, "No relevant pages found") + return report + + try: + yes_no_values = yes_no_value_generator.get_yes_no_values_from_report(relevant_pages) + yes_no_values_from_dataland = data_provider.get_yes_no_values_by_data(data=dataset) + data_sources = data_provider.get_datasources_of_nuclear_and_gas_yes_no_questions(data=dataset) + + yes_no_data_points = comparator.compare_yes_no_values(yes_no_values, yes_no_values_from_dataland, data_sources) + + for key, value in yes_no_data_points.items(): + setattr(report, key, value) + + except Exception as e:  # noqa: BLE001 + error_message = str(e) + create_not_attempted_report(report, error_message) + return report + + + def create_not_attempted_report(report: NuclearAndGasGeneralGeneral, error_message: str) -> None: + """Populate the report with 'not attempted' data points.""" + data_point_report = QaReportDataPointExtendedDataPointYesNo( + comment=error_message, + verdict=QaReportDataPointVerdict.QANOTATTEMPTED, + correctedData=ExtendedDataPointYesNo(), + ) + for field_name in [ + "nuclear_energy_related_activities_section426", + "nuclear_energy_related_activities_section427", + "nuclear_energy_related_activities_section428", + "fossil_gas_related_activities_section429", + "fossil_gas_related_activities_section430", + "fossil_gas_related_activities_section431", + ]: + setattr(report, field_name, data_point_report) diff --git a/src/dataland_qa_lab/review/yes_no_value_generator.py b/src/dataland_qa_lab/review/yes_no_value_generator.py index 
982d788..c21c922 100644 --- a/src/dataland_qa_lab/review/yes_no_value_generator.py +++ b/src/dataland_qa_lab/review/yes_no_value_generator.py @@ -1,8 +1,15 @@ +import logging + from dataland_backend.models.yes_no import YesNo from dataland_qa_lab.prompting_services import prompting_service from dataland_qa_lab.review import generate_gpt_request +logger = logging.getLogger(__name__) + + +NUM_EXPECTED_VALUES = 6 + def get_yes_no_values_from_report(readable_text: str) -> dict[str, YesNo | None]: """Extracts information from template 1 using Azure OpenAI and returns a list of results. @@ -10,10 +17,22 @@ def get_yes_no_values_from_report(readable_text: str) -> dict[str, YesNo | None] Returns: list: A list including the etracted values of template 1 """ - extracted_list = generate_gpt_request.GenerateGptRequest.generate_gpt_request( - prompting_service.PromptingService.create_main_prompt(1, readable_text, ""), - prompting_service.PromptingService.create_sub_prompt_template1(), - ) + try: + extracted_list = generate_gpt_request.GenerateGptRequest.generate_gpt_request( + prompting_service.PromptingService.create_main_prompt(1, readable_text, ""), + prompting_service.PromptingService.create_sub_prompt_template1(), + ) + if not extracted_list: + msg = "No results returned from GPT for Yes_No values." + throw_error(msg) + + except (ValueError, TypeError) as e: + msg = f"Error extracting values from template 1: {e}" + throw_error(msg) + + if len(extracted_list) != NUM_EXPECTED_VALUES: + msg = "Yes_No values are too short or too long from GPT." 
+ throw_error(msg) sections = { "nuclear_energy_related_activities_section426": YesNo(extracted_list[0]), @@ -25,3 +44,8 @@ def get_yes_no_values_from_report(readable_text: str) -> dict[str, YesNo | None] } return sections + + +def throw_error(msg: str) -> ValueError: + """Raises a ValueError with the given message.""" + raise ValueError(msg) diff --git a/src/dataland_qa_lab/utils/nuclear_and_gas_data_collection.py b/src/dataland_qa_lab/utils/nuclear_and_gas_data_collection.py index c640396..c30cb2e 100644 --- a/src/dataland_qa_lab/utils/nuclear_and_gas_data_collection.py +++ b/src/dataland_qa_lab/utils/nuclear_and_gas_data_collection.py @@ -20,14 +20,24 @@ class NuclearAndGasDataCollection: taxonomy_non_eligible: dict[str, TaxonomyNonEligibleDatapoint | None] def __init__(self, dataset: NuclearAndGasData) -> None: - """Intialize class.""" + """Initialize class.""" self.dataset = dataset - self.map_dataset_to_yes_no_dict() - self.map_dataset_to_numeric_dict() + self.yes_no_data_points = {} + self.taxonomy_aligned_denominator = {} + self.taxonomy_aligned_numerator = {} + self.taxonomy_eligble_but_not_aligned = {} + self.taxonomy_non_eligible = {} + + # Safely map datasets + if self.dataset and self.dataset.general: + self.map_dataset_to_yes_no_dict() + self.map_dataset_to_numeric_dict() def map_dataset_to_yes_no_dict(self) -> dict[str, YesNoDatapoint | None]: """Mapper function.""" - data = self.dataset.general.general + data = getattr(self.dataset.general, "general", None) + if data is None: + return self.yes_no_data_points = { "nuclear_energy_related_activities_section426": YesNoDatapoint( @@ -53,39 +63,57 @@ def map_dataset_to_yes_no_dict(self) -> dict[str, YesNoDatapoint | None]: def map_dataset_to_numeric_dict(self) -> None: """Mapper function.""" data = self.dataset.general + if data is None: + return # Skip if numeric data is missing self.taxonomy_aligned_denominator = { "taxonomy_aligned_capex_denominator": TaxononmyAlignedDenominatorDatapoint( 
data.taxonomy_aligned_denominator.nuclear_and_gas_taxonomy_aligned_capex_denominator + if data.taxonomy_aligned_denominator + else None ), "taxonomy_aligned_revenue_denominator": TaxononmyAlignedDenominatorDatapoint( data.taxonomy_aligned_denominator.nuclear_and_gas_taxonomy_aligned_revenue_denominator + if data.taxonomy_aligned_denominator + else None ), } self.taxonomy_aligned_numerator = { "taxonomy_aligned_capex_numerator": TaxonomyAlignedNumeratorDatapoint( data.taxonomy_aligned_numerator.nuclear_and_gas_taxonomy_aligned_capex_numerator + if data.taxonomy_aligned_numerator + else None ), "taxonomy_aligned_revenue_numerator": TaxonomyAlignedNumeratorDatapoint( data.taxonomy_aligned_numerator.nuclear_and_gas_taxonomy_aligned_revenue_numerator + if data.taxonomy_aligned_numerator + else None ), } self.taxonomy_eligble_but_not_aligned = { "taxonomy_not_aligned_capex": TaxonomyEligibleButNotAlignedDatapoint( data.taxonomy_eligible_but_not_aligned.nuclear_and_gas_taxonomy_eligible_but_not_aligned_capex + if data.taxonomy_eligible_but_not_aligned + else None ), "taxonomy_not_aligned_revenue": TaxonomyEligibleButNotAlignedDatapoint( data.taxonomy_eligible_but_not_aligned.nuclear_and_gas_taxonomy_eligible_but_not_aligned_revenue + if data.taxonomy_eligible_but_not_aligned + else None ), } self.taxonomy_non_eligible = { "taxonomy_non_eligible_capex": TaxonomyNonEligibleDatapoint( data.taxonomy_non_eligible.nuclear_and_gas_taxonomy_non_eligible_capex + if data.taxonomy_non_eligible + else None ), "taxonomy_non_eligible_revenue": TaxonomyNonEligibleDatapoint( data.taxonomy_non_eligible.nuclear_and_gas_taxonomy_non_eligible_revenue + if data.taxonomy_non_eligible + else None ), } diff --git a/tests/dataland/test_data_provider.py b/tests/dataland/test_data_provider.py index c6b549c..478d559 100644 --- a/tests/dataland/test_data_provider.py +++ b/tests/dataland/test_data_provider.py @@ -1,11 +1,13 @@ +from collections.abc import Callable + import pytest from 
dataland_backend.models.nuclear_and_gas_data import NuclearAndGasData -from dataland_backend.models.nuclear_and_gas_general import NuclearAndGasGeneral from dataland_qa_lab.dataland import data_provider from dataland_qa_lab.utils.nuclear_and_gas_data_collection import NuclearAndGasDataCollection from tests.utils import provide_test_dataset from tests.utils.provide_test_data_collection import provide_test_data_collection +from tests.utils.provide_test_dataset import provide_test_dataset # noqa: F811 def test_get_yes_no_values_by_data() -> None: @@ -31,6 +33,54 @@ def test_get_datasources_of_dataset() -> None: assert values.get("fossil_gas_related_activities_section431").file_name == "test-file" +@pytest.mark.parametrize( + ("function_name", "exception_message"), + [ + ( + data_provider.get_taxonomy_aligned_revenue_denominator_values_by_data, + "Error retrieving taxonomy-aligned revenue denominator", + ), + ( + data_provider.get_taxonomy_aligned_capex_denominator_values_by_data, + "Error retrieving taxonomy-aligned capex denominator", + ), + ( + data_provider.get_taxonomy_aligned_revenue_numerator_values_by_data, + "Error retrieving taxonomy-aligned revenue numerator", + ), + ( + data_provider.get_taxonomy_aligned_capex_numerator_values_by_data, + "Error retrieving taxonomy-aligned capex numerator", + ), + ( + data_provider.get_taxonomy_eligible_but_not_aligned_revenue_values_by_data, + "Error retrieving taxonomy eligible but not aligned revenue", + ), + ( + data_provider.get_taxonomy_eligible_but_not_aligned_capex_values_by_data, + "Error retrieving taxonomy eligible but not aligned capex", + ), + ( + data_provider.get_taxonomy_non_eligible_revenue_values_by_data, + "Error retrieving taxonomy non-eligible revenue", + ), + (data_provider.get_taxonomy_non_eligible_capex_values_by_data, "Error retrieving taxonomy non-eligible capex"), + ], +) +def test_function_exceptions( + function_name: Callable, + exception_message: str, + test_data_collection: 
NuclearAndGasDataCollection, # noqa: ARG001 +) -> None: + """Retrieve taxonomy-aligned capex denominator values from the dataset.""" + + # Create a dataset with missing values to trigger exceptions + empty_data_collection = NuclearAndGasDataCollection(NuclearAndGasData()) + + with pytest.raises(AttributeError, match=exception_message): + function_name(empty_data_collection) + + def test_get_taxonomy_aligned_revenue_denominator_values_by_data( test_data_collection: NuclearAndGasDataCollection, ) -> None: @@ -105,13 +155,5 @@ def test_taxonomy_non_eligible_capex_values_by_data(test_data_collection: Nuclea @pytest.fixture def test_data_collection() -> NuclearAndGasDataCollection: - dataset = NuclearAndGasData( - general=NuclearAndGasGeneral( - general=provide_test_dataset.create_template_1_reportframe(), - taxonomyAlignedDenominator=provide_test_dataset.create_template_2_reportframe(), - taxonomyAlignedNumerator=provide_test_dataset.create_template_3_reportframe(), - taxonomyEligibleButNotAligned=provide_test_dataset.create_template_4_reportframe(), - taxonomyNonEligible=provide_test_dataset.create_template_5_reportframe(), - ) - ) + dataset = provide_test_dataset() return NuclearAndGasDataCollection(dataset) diff --git a/tests/dataland/test_prompt_services.py b/tests/dataland/test_prompt_services.py index d487285..735cc78 100644 --- a/tests/dataland/test_prompt_services.py +++ b/tests/dataland/test_prompt_services.py @@ -5,6 +5,7 @@ from dataland_qa_lab.prompting_services import prompting_service from dataland_qa_lab.review import generate_gpt_request, numeric_value_generator, yes_no_value_generator +from dataland_qa_lab.review.generate_gpt_request import GenerateGptRequest @pytest.fixture @@ -14,6 +15,14 @@ def mock_pdf() -> Mock: return pdf +@pytest.fixture +def mock_config() -> Mock: + mock_conf = Mock() + mock_conf.azure_openai_api_key = "test_key" + mock_conf.azure_openai_endpoint = "https://test.endpoint.com" + return mock_conf + + def test_template_1(mock_pdf: 
Mock) -> None: result = prompting_service.PromptingService.create_main_prompt(1, mock_pdf, "Revenue") assert "provide the answers of all 6 questions in template 1" in result @@ -194,7 +203,7 @@ def test_generate_gpt_request(mock_generate_gpt_request: Mock, mock_pdf: Mock) - def test_get_taxonomy_alligned_denominator(mock_generate_gpt_request: Mock, mock_pdf: Mock) -> None: mock_generate_gpt_request.return_value = [0.1, 0, 0, 3.2, 0, 100] - result = numeric_value_generator.NumericValueGenerator.get_taxonomy_alligned_denominator(mock_pdf, "Revenue") + result = numeric_value_generator.NumericValueGenerator.get_taxonomy_aligned_denominator(mock_pdf, "Revenue") mock_generate_gpt_request.assert_called_once_with( prompting_service.PromptingService.create_main_prompt(2, mock_pdf, "Revenue"), @@ -207,7 +216,7 @@ def test_get_taxonomy_alligned_denominator(mock_generate_gpt_request: Mock, mock def test_get_taxonomy_alligned_numerator(mock_generate_gpt_request: Mock, mock_pdf: Mock) -> None: mock_generate_gpt_request.return_value = [0.1, 0, 0, 3.2, 0, 100] - result = numeric_value_generator.NumericValueGenerator.get_taxonomy_alligned_numerator(mock_pdf, "Revenue") + result = numeric_value_generator.NumericValueGenerator.get_taxonomy_aligned_numerator(mock_pdf, "Revenue") mock_generate_gpt_request.assert_called_once_with( prompting_service.PromptingService.create_main_prompt(3, mock_pdf, "Revenue"), @@ -240,3 +249,75 @@ def test_get_taxonomy_non_eligible(mock_generate_gpt_request: Mock, mock_pdf: Mo prompting_service.PromptingService.create_sub_prompt_template5("Revenue"), ) assert result == [0.1, 0, 0, 3.2, 0, 100], "The return values do not match." 
+ + +def test_generate_gpt_request_general_error() -> None: + """Test handling of a general unexpected error.""" + with patch("dataland_qa_lab.utils.config.get_config", side_effect=Exception("Unexpected Error")): + with pytest.raises(ValueError, match="An unexpected error occurred") as exc: + GenerateGptRequest.generate_gpt_request("main_prompt", "sub_prompt") + + assert "An unexpected error occurred" in str(exc.value) + + +def test_generate_gpt_request_creation_error(mock_config: Mock) -> None: + """Test error during GPT request creation.""" + with ( + patch("dataland_qa_lab.utils.config.get_config", return_value=mock_config), + patch("openai.AzureOpenAI") as mock_client, + ): + mock_client().chat.completions.create.side_effect = Exception("GPT Request Error") + + with pytest.raises(ValueError, match="Error during GPT request creation") as exc: + GenerateGptRequest.generate_gpt_request("main_prompt", "sub_prompt") + assert "Error during GPT request creation" in str(exc.value) + + +def test_generate_gpt_request_config_error() -> None: + """Test error when loading configuration.""" + with patch("dataland_qa_lab.utils.config.get_config", side_effect=Exception("Config Error")): + with pytest.raises(ValueError, match="Error loading configuration") as exc: + GenerateGptRequest.generate_gpt_request("main_prompt", "sub_prompt") + + assert "Error loading configuration" in str(exc.value) + + +@patch("dataland_qa_lab.utils.config.get_config") +@patch("openai.AzureOpenAI") +def test_generate_gpt_request_tool_call_parsing_error(mock_client: Mock, mock_get_config: Mock) -> None: + """Test error handling during tool call argument parsing.""" + # Mock configuration + mock_get_config.return_value = Mock( + azure_openai_api_key="test_key", + azure_openai_endpoint="https://test.endpoint.com", + ) + + # Mock GPT response with invalid arguments + mock_client().chat.completions.create.return_value = Mock( + choices=[Mock(message=Mock(tool_calls=[Mock(function=Mock(arguments="Invalid 
Argument String"))]))] + ) + + # Call the function and expect a ValueError + with pytest.raises( + ValueError, match=r"An unexpected error occurred: Error during GPT request creation: Connection error." + ): + GenerateGptRequest.generate_gpt_request("main_prompt", "sub_prompt") + + +@patch("dataland_qa_lab.utils.config.get_config") +@patch("openai.AzureOpenAI") +def test_generate_gpt_request_no_tool_calls(mock_client: Mock, mock_get_config: Mock) -> None: + """Test handling when no tool calls are present in the GPT response.""" + # Mock configuration + mock_get_config.return_value = Mock( + azure_openai_api_key="test_key", + azure_openai_endpoint="https://test.endpoint.com", + ) + + # Mock GPT response with no tool calls + mock_client().chat.completions.create.return_value = Mock(choices=[Mock(message=Mock(tool_calls=None))]) + + with pytest.raises( + ValueError, match=r"An unexpected error occurred: Error during GPT request creation: Connection error." + ): + GenerateGptRequest.generate_gpt_request("main_prompt", "sub_prompt") diff --git a/tests/dataland/test_unreviewed_datasets.py b/tests/dataland/test_unreviewed_datasets.py index 750b8f6..4186565 100644 --- a/tests/dataland/test_unreviewed_datasets.py +++ b/tests/dataland/test_unreviewed_datasets.py @@ -59,3 +59,25 @@ def test_initialization_with_api_error(self, mock_get_config: MagicMock) -> None with pytest.raises(Exception): # noqa: B017, PT011 UnreviewedDatasets() + + def test_initialization_with_timeout_error(self, mock_get_config: MagicMock) -> None: + mock_conf = self.set_up_mock_client(dataset_count=1, datasets=None, exception=TimeoutError()) + mock_get_config.return_value = mock_conf + + with pytest.raises(TimeoutError): + UnreviewedDatasets() + + def test_initialization_with_no_client(self, mock_get_config: MagicMock) -> None: # noqa: PLR6301 + mock_conf = MagicMock() + mock_conf.dataland_client = None + mock_get_config.return_value = mock_conf + + with pytest.raises(ValueError, match=r"Client Setup 
failed in the configuration."): + UnreviewedDatasets() + + def test_initialization_with_runtime_error(self, mock_get_config: MagicMock) -> None: + mock_conf = self.set_up_mock_client(dataset_count=1, datasets=None, exception=RuntimeError()) + mock_get_config.return_value = mock_conf + + with pytest.raises(RuntimeError): + UnreviewedDatasets() diff --git a/tests/end_to_end/test_report_e2e.py b/tests/end_to_end/test_report_e2e.py index bdbf698..24a760b 100644 --- a/tests/end_to_end/test_report_e2e.py +++ b/tests/end_to_end/test_report_e2e.py @@ -3,9 +3,11 @@ from unittest.mock import ANY, MagicMock, patch import mock_constants +from dataland_qa.models.qa_report_data_point_verdict import QaReportDataPointVerdict from dataland_qa.models.qa_report_meta_information import QaReportMetaInformation -from clients.qa.dataland_qa.models.qa_report_data_point_verdict import QaReportDataPointVerdict +from dataland_qa_lab.database.database_engine import delete_entity +from dataland_qa_lab.database.database_tables import ReviewedDataset from dataland_qa_lab.dataland.provide_test_data import get_company_id, upload_dataset, upload_pdf from dataland_qa_lab.review.dataset_reviewer import review_dataset from dataland_qa_lab.utils import config @@ -21,9 +23,8 @@ def test_report_generator_end_to_end() -> None: # Upload test_dataset with partly wrong data data_id = upload_test_dataset() - + delete_entity(data_id, ReviewedDataset) report_metadata = mocked_review_dataset(data_id) - report_data = config.get_config().dataland_client.eu_taxonomy_nuclear_gas_qa_api.get_nuclear_and_gas_data_qa_report( data_id=data_id, qa_report_id=report_metadata.qa_report_id ) @@ -90,8 +91,8 @@ def test_report_generator_end_to_end() -> None: @patch("dataland_qa_lab.database.database_engine.get_entity") def mocked_review_dataset( data_id: str, - mock_extract_text_of_pdf: MagicMock, mock_get_entity: MagicMock, + mock_extract_text_of_pdf: MagicMock, ) -> QaReportMetaInformation: """Review the dataset with mocked 
Azure calls.""" mock_extract_text_of_pdf.return_value = mock_constants.E2E_AZURE_DOCUMENT_INTELLIGENCE_MOCK diff --git a/tests/review/test_denominator_report_generator.py b/tests/review/test_denominator_report_generator.py index 7aa0ff3..cc522b9 100644 --- a/tests/review/test_denominator_report_generator.py +++ b/tests/review/test_denominator_report_generator.py @@ -1,6 +1,5 @@ from unittest.mock import MagicMock, Mock, patch -from azure.ai.documentintelligence.models import AnalyzeResult from dataland_qa.models.qa_report_data_point_verdict import QaReportDataPointVerdict import dataland_qa_lab.review.report_generator.denominator_report_generator as report_generator @@ -8,22 +7,13 @@ from tests.utils.provide_test_dataset import provide_test_dataset -def provide_test_data_collection() -> tuple[NuclearAndGasDataCollection, AnalyzeResult]: +def provide_test_data_collection() -> tuple[NuclearAndGasDataCollection, str]: dataset = provide_test_dataset() data_collection = NuclearAndGasDataCollection(dataset) - relevant_pages = MagicMock(spec=AnalyzeResult) - - """pages= pages_provider.get_relevant_pages_of_pdf(data_collection) - relevant_pages = text_to_doc_intelligence.extract_text_of_pdf(pages)""" - + relevant_pages = MagicMock(spec=str) return data_collection, relevant_pages -"""data_collection = provide_test_data() -dataland = data_provider.get_taxonomy_aligned_revenue_denominator_values_by_data(data_collection) -print(dataland)""" - - @patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") def test_generate_taxonomy_aligned_denominator_report(mock_generate_gpt_request: Mock) -> None: dataset, relevant_pages = provide_test_data_collection() @@ -157,3 +147,42 @@ def test_generate_taxonomy_aligned_denominator_report_edge_cases(mock_generate_g assert report is not None assert report.verdict == QaReportDataPointVerdict.QAREJECTED assert report.corrected_data.quality == "NoDataFound" + + 
+@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +@patch("dataland_qa_lab.dataland.data_provider.get_taxonomy_aligned_revenue_denominator_values_by_data") +def test_generate_revenue_denominator_report_frame_not_attempted( + mock_get_dataland_values: Mock, mock_generate_gpt_request: Mock +) -> None: + dataset, relevant_pages = provide_test_data_collection() + + # Simulate an exception in dataland value retrieval + mock_generate_gpt_request.side_effect = ValueError("Mock GPT error") + report = report_generator.build_denominator_report_frame(dataset, relevant_pages, "Revenue") + + assert report is not None + assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert "Error retrieving prompted values for template 2" in report.comment + + # Simulate an exception in dataland retrieval + mock_generate_gpt_request.side_effect = None + mock_get_dataland_values.side_effect = RuntimeError("Mock dataland error") + report = report_generator.build_denominator_report_frame(dataset, relevant_pages, "Revenue") + + assert report is not None + assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert "Error retrieving dataland values for template 2" in report.comment + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_generate_taxonomy_aligned_denominator_report_edge_cases_not_attempted(mock_generate_gpt_request: Mock) -> None: + dataset, relevant_pages = provide_test_data_collection() + + # Simulate an exception in the GPT request generation + mock_generate_gpt_request.side_effect = ValueError("Mock GPT error") + + report = report_generator.build_denominator_report_frame(dataset, relevant_pages, "Revenue") + + assert report is not None + assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert "Error retrieving prompted values for template 2" in report.comment diff --git a/tests/review/test_eligible_not_aligned_report_generator.py 
b/tests/review/test_eligible_not_aligned_report_generator.py index 60132e9..12f3b78 100644 --- a/tests/review/test_eligible_not_aligned_report_generator.py +++ b/tests/review/test_eligible_not_aligned_report_generator.py @@ -153,3 +153,42 @@ def test_generate_eligible_but_not_aligned_report_edge_cases(mock_generate_gpt_r assert report is not None assert report.verdict == QaReportDataPointVerdict.QAREJECTED assert report.corrected_data.quality == "NoDataFound" + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +@patch("dataland_qa_lab.dataland.data_provider.get_taxonomy_eligible_but_not_aligned_revenue_values_by_data") +def test_generate_revenue_denominator_report_frame_not_attempted( + mock_get_dataland_values: Mock, mock_generate_gpt_request: Mock +) -> None: + dataset, relevant_pages = provide_test_data_collection() + + # Simulate an exception in dataland value retrieval + mock_generate_gpt_request.side_effect = ValueError("Mock GPT error") + report = report_generator.build_eligible_but_not_aligned_frame(dataset, relevant_pages, "Revenue") + + assert report is not None + assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert "Error retrieving prompted values for template 4" in report.comment + + # Simulate an exception in dataland retrieval + mock_generate_gpt_request.side_effect = None + mock_get_dataland_values.side_effect = RuntimeError("Mock dataland error") + report = report_generator.build_eligible_but_not_aligned_frame(dataset, relevant_pages, "Revenue") + + assert report is not None + assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert "Error retrieving dataland values for template 4" in report.comment + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_generate_taxonomy_aligned_denominator_report_edge_cases_not_attempted(mock_generate_gpt_request: Mock) -> None: + dataset, relevant_pages = 
provide_test_data_collection()
+
+    # Simulate an exception in the GPT request generation
+    mock_generate_gpt_request.side_effect = ValueError("Mock GPT error")
+
+    report = report_generator.build_eligible_but_not_aligned_frame(dataset, relevant_pages, "Revenue")
+
+    assert report is not None
+    assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED
+    assert "Error retrieving prompted values for template 4" in report.comment
diff --git a/tests/review/test_non_eligible_generator.py b/tests/review/test_non_eligible_generator.py
index 6dab774..d02d1f6 100644
--- a/tests/review/test_non_eligible_generator.py
+++ b/tests/review/test_non_eligible_generator.py
@@ -93,3 +93,42 @@ def test_compare_taxonomy_non_eligible_values_edge_cases(mock_generate_gpt_reque
     assert report is not None
     assert report.verdict == QaReportDataPointVerdict.QAREJECTED
     assert report.corrected_data.quality == "NoDataFound"
+
+
+@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request")
+@patch("dataland_qa_lab.dataland.data_provider.get_taxonomy_non_eligible_revenue_values_by_data")
+def test_generate_non_eligible_revenue_report_frame_not_attempted(
+    mock_get_dataland_values: Mock, mock_generate_gpt_request: Mock
+) -> None:
+    dataset, relevant_pages = provide_test_data_collection()
+
+    # Simulate an exception in dataland value retrieval
+    mock_generate_gpt_request.side_effect = ValueError("Mock GPT error")
+    report = report_generator.build_non_eligible_report_frame(dataset, relevant_pages, "Revenue")
+
+    assert report is not None
+    assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED
+    assert "Error retrieving prompted values for template 5" in report.comment
+
+    # Simulate an exception in dataland retrieval
+    mock_generate_gpt_request.side_effect = None
+    mock_get_dataland_values.side_effect = RuntimeError("Mock dataland error")
+    report = report_generator.build_non_eligible_report_frame(dataset, relevant_pages, "Revenue")
+
+    assert report
is not None
+    assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED
+    assert "Error retrieving dataland values for template 5" in report.comment
+
+
+@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request")
+def test_generate_non_eligible_report_edge_cases_not_attempted(mock_generate_gpt_request: Mock) -> None:
+    dataset, relevant_pages = provide_test_data_collection()
+
+    # Simulate an exception in the GPT request generation
+    mock_generate_gpt_request.side_effect = ValueError("Mock GPT error")
+
+    report = report_generator.build_non_eligible_report_frame(dataset, relevant_pages, "Revenue")
+
+    assert report is not None
+    assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED
+    assert "Error retrieving prompted values for template 5" in report.comment
diff --git a/tests/review/test_numerator_report_generator.py b/tests/review/test_numerator_report_generator.py
index 6ac5693..031084a 100644
--- a/tests/review/test_numerator_report_generator.py
+++ b/tests/review/test_numerator_report_generator.py
@@ -152,3 +152,40 @@ def test_generate_taxonomy_aligned_numerator_report_edge_cases(mock_generate_gpt
     assert report is not None
     assert report.verdict == QaReportDataPointVerdict.QAREJECTED
     assert report.corrected_data.quality == "NoDataFound"
+
+
+@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request")
+@patch("dataland_qa_lab.dataland.data_provider.get_taxonomy_aligned_revenue_numerator_values_by_data")
+def test_generate_revenue_numerator_report_frame_not_attempted(
+    mock_get_dataland_values: Mock, mock_generate_gpt_request: Mock
+) -> None:
+    dataset, relevant_pages = provide_test_data_collection()
+
+    mock_generate_gpt_request.side_effect = ValueError("Mock GPT error")
+    report = report_generator.build_numerator_report_frame(dataset, relevant_pages, "Revenue")
+
+    assert report is not None
+    assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED
+    assert "Error retrieving prompted values for template 3" in report.comment
+
+    mock_generate_gpt_request.side_effect = None
+    mock_get_dataland_values.side_effect = RuntimeError("Mock dataland error")
+    report = report_generator.build_numerator_report_frame(dataset, relevant_pages, "Revenue")
+
+    assert report is not None
+    assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED
+    assert "Error retrieving dataland values for template 3" in report.comment
+
+
+@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request")
+def test_generate_numerator_report_edge_cases_not_attempted(mock_generate_gpt_request: Mock) -> None:
+    dataset, relevant_pages = provide_test_data_collection()
+
+    # Simulate an exception in the GPT request generation
+    mock_generate_gpt_request.side_effect = ValueError("Mock GPT error")
+
+    report = report_generator.build_numerator_report_frame(dataset, relevant_pages, "Revenue")
+
+    assert report is not None
+    assert report.verdict == QaReportDataPointVerdict.QANOTATTEMPTED
+    assert "Error retrieving prompted values for template 3" in report.comment
diff --git a/tests/review/test_numeric_value_generator.py b/tests/review/test_numeric_value_generator.py
new file mode 100644
index 0000000..244aa43
--- /dev/null
+++ b/tests/review/test_numeric_value_generator.py
@@ -0,0 +1,128 @@
+from unittest.mock import Mock, patch
+
+import pytest
+
+from dataland_qa_lab.prompting_services import prompting_service
+from dataland_qa_lab.review.generate_gpt_request import GenerateGptRequest  # noqa: F401
+from dataland_qa_lab.review.numeric_value_generator import NumericValueGenerator
+
+
+# Mock AnalyzeResult
+@pytest.fixture
+def mock_analyze_result() -> Mock:
+    mock_result = Mock()
+    mock_result.content = "Test readable text content."
+ return mock_result + + +# Mock Config and Logger +@pytest.fixture +def mock_logger() -> Mock: + logger = Mock() + return logger + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_get_taxonomy_aligned_denominator_success(mock_generate_gpt_request: Mock, mock_analyze_result: Mock) -> None: + """Test successful extraction of taxonomy aligned denominator values.""" + mock_generate_gpt_request.return_value = ["0.1", "2.5", "3.0"] + + result = NumericValueGenerator.get_taxonomy_aligned_denominator(mock_analyze_result, "Revenue") + + mock_generate_gpt_request.assert_called_once_with( + prompting_service.PromptingService.create_main_prompt(2, mock_analyze_result, "Revenue"), + prompting_service.PromptingService.create_sub_prompt_template2to4("Revenue"), + ) + + assert result == [0.1, 2.5, 3.0] + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_get_taxonomy_aligned_denominator_empty_response(mock_generate_gpt_request: Mock) -> None: + """Test empty GPT response for taxonomy aligned denominator values.""" + mock_generate_gpt_request.return_value = [] + + with pytest.raises(ValueError, match=r"No results returned from GPT for template 2 values.") as exc: + NumericValueGenerator.get_taxonomy_aligned_denominator("Some readable text", "Revenue") + + assert "No results returned from GPT for template 2 values." 
in str(exc.value) + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_get_taxonomy_aligned_denominator_conversion_error( + mock_generate_gpt_request: Mock, mock_analyze_result: Mock +) -> None: + """Test float conversion error in taxonomy aligned denominator values.""" + mock_generate_gpt_request.return_value = ["0.1", "invalid", "3.0"] + + with pytest.raises(ValueError) as exc: # noqa: PT011 + NumericValueGenerator.get_taxonomy_aligned_denominator(mock_analyze_result, "Revenue") + + assert "Unexpected error during float conversion" in str(exc.value) + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_get_taxonomy_aligned_numerator_success(mock_generate_gpt_request: Mock, mock_analyze_result: Mock) -> None: + """Test successful extraction of taxonomy aligned numerator values.""" + mock_generate_gpt_request.return_value = ["1.0", "2.0", "3.0"] + + result = NumericValueGenerator.get_taxonomy_aligned_numerator(mock_analyze_result, "Revenue") + + mock_generate_gpt_request.assert_called_once_with( + prompting_service.PromptingService.create_main_prompt(3, mock_analyze_result, "Revenue"), + prompting_service.PromptingService.create_sub_prompt_template2to4("Revenue"), + ) + + assert result == [1.0, 2.0, 3.0] + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_get_taxonomy_eligible_not_alligned_success(mock_generate_gpt_request: Mock, mock_analyze_result: Mock) -> None: + """Test successful extraction of taxonomy eligible not aligned values.""" + mock_generate_gpt_request.return_value = ["4.0", "5.0", "6.0"] + + result = NumericValueGenerator.get_taxonomy_eligible_not_alligned(mock_analyze_result, "Revenue") + + mock_generate_gpt_request.assert_called_once_with( + prompting_service.PromptingService.create_main_prompt(4, mock_analyze_result, "Revenue"), + 
prompting_service.PromptingService.create_sub_prompt_template2to4("Revenue"), + ) + + assert result == [4.0, 5.0, 6.0] + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_get_taxonomy_non_eligible_success(mock_generate_gpt_request: Mock, mock_analyze_result: Mock) -> None: + """Test successful extraction of taxonomy non-eligible values.""" + mock_generate_gpt_request.return_value = ["7.0", "8.0", "9.0"] + + result = NumericValueGenerator.get_taxonomy_non_eligible(mock_analyze_result, "Revenue") + + mock_generate_gpt_request.assert_called_once_with( + prompting_service.PromptingService.create_main_prompt(5, mock_analyze_result, "Revenue"), + prompting_service.PromptingService.create_sub_prompt_template5("Revenue"), + ) + + assert result == [7.0, 8.0, 9.0] + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_get_taxonomy_non_eligible_empty_response(mock_generate_gpt_request: Mock, mock_analyze_result: Mock) -> None: + """Test empty GPT response for taxonomy non-eligible values.""" + mock_generate_gpt_request.return_value = [] + + with pytest.raises(ValueError) as exc: # noqa: PT011 + NumericValueGenerator.get_taxonomy_non_eligible(mock_analyze_result, "Revenue") + + assert "No results returned from GPT for template 5 values." 
in str(exc.value) + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_get_taxonomy_non_eligible_conversion_error(mock_generate_gpt_request: Mock, mock_analyze_result: Mock) -> None: + """Test float conversion error in taxonomy non-eligible values.""" + mock_generate_gpt_request.return_value = ["7.0", "invalid", "9.0"] + + with pytest.raises(ValueError) as exc: # noqa: PT011 + NumericValueGenerator.get_taxonomy_non_eligible(mock_analyze_result, "Revenue") + + assert "Unexpected error during float conversion" in str(exc.value) diff --git a/tests/review/test_report_generator.py b/tests/review/test_report_generator.py index 19b6353..450fba9 100644 --- a/tests/review/test_report_generator.py +++ b/tests/review/test_report_generator.py @@ -1,11 +1,9 @@ from unittest.mock import Mock, patch -import pytest from azure.ai.documentintelligence.models import AnalyzeResult from openai.types.chat.chat_completion import ChatCompletion, ChatCompletionMessage, Choice from dataland_qa_lab.review.report_generator import yes_no_report_generator -from dataland_qa_lab.review.report_generator.nuclear_and_gas_report_generator import NuclearAndGasReportGenerator from tests.utils.provide_test_data_collection import provide_test_data_collection @@ -49,17 +47,3 @@ def test_compare_yes_no_values(mock_generate_gpt_request: Mock) -> None: assert report.nuclear_energy_related_activities_section426.corrected_data.value is None assert report.nuclear_energy_related_activities_section426.comment == "Geprüft durch AzureOpenAI" assert report.fossil_gas_related_activities_section430.corrected_data.value == "Yes" - - -@patch("openai.resources.chat.Completions.create", return_value=build_simple_openai_chat_completion()) -def test_generate_report(_mock_create: Mock) -> None: # noqa: PT019 - test_data_collection = provide_test_data_collection() - - report = None # Initialize the variable to avoid UnboundLocalError - with pytest.raises(Exception, 
match=r"No tool calls found in the GPT response."): - report = NuclearAndGasReportGenerator().generate_report( - relevant_pages=AnalyzeResult(), dataset=test_data_collection - ) - # Handle report if no exception is raised - if report: - assert report.general.general.fossil_gas_related_activities_section430.corrected_data.value == "Yes" diff --git a/tests/review/test_yes_no_report_generator.py b/tests/review/test_yes_no_report_generator.py new file mode 100644 index 0000000..835b46a --- /dev/null +++ b/tests/review/test_yes_no_report_generator.py @@ -0,0 +1,104 @@ +from unittest.mock import Mock, patch + +from azure.ai.documentintelligence.models import AnalyzeResult +from dataland_qa.models.qa_report_data_point_verdict import QaReportDataPointVerdict +from openai.types.chat.chat_completion import ChatCompletion, ChatCompletionMessage, Choice + +from dataland_qa_lab.review.report_generator import yes_no_report_generator +from dataland_qa_lab.utils.nuclear_and_gas_data_collection import NuclearAndGasDataCollection +from tests.utils.provide_test_data_collection import provide_test_data_collection +from tests.utils.provide_test_dataset import provide_test_dataset + + +def create_document_intelligence_mock() -> AnalyzeResult: + return AnalyzeResult(content="") + + +def build_simple_openai_chat_completion() -> ChatCompletion: + msg = "['Yes', 'No', 'Yes', 'Yes', 'Yes', 'No']" + return ChatCompletion( + id="test", + choices=[ + Choice( + finish_reason="stop", + index=0, + message=ChatCompletionMessage( + content=msg, + role="assistant", + ), + ) + ], + created=0, + model="test", + object="chat.completion", + ) + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_compare_yes_no_values(mock_generate_gpt_request: Mock) -> None: + test_data_collection = provide_test_data_collection() + mock_generate_gpt_request.return_value = [ + "Yes", + "No", + "Yes", + "No", + "Yes", + "No", + ] + report = 
yes_no_report_generator.build_yes_no_report(dataset=test_data_collection, relevant_pages=AnalyzeResult()) + + assert report.nuclear_energy_related_activities_section426.corrected_data.value is None + assert report.nuclear_energy_related_activities_section426.comment == "Geprüft durch AzureOpenAI" + assert report.fossil_gas_related_activities_section430.corrected_data.value == "Yes" + + +@patch("dataland_qa_lab.review.generate_gpt_request.GenerateGptRequest.generate_gpt_request") +def test_build_yes_no_report_success(mock_generate_gpt_request: Mock) -> None: + mock_generate_gpt_request.return_value = [ + "No", + "No", + "Yes", + "No", + "No", + "No", + ] + test_data_collection = NuclearAndGasDataCollection(provide_test_dataset()) + report = yes_no_report_generator.build_yes_no_report(dataset=test_data_collection, relevant_pages=AnalyzeResult()) + + # Assertions + assert report.fossil_gas_related_activities_section430.comment == ( + "Discrepancy in 'fossil_gas_related_activities_section430': YesNo.YES != YesNo.NO." 
+ ) + assert report.fossil_gas_related_activities_section430.verdict == QaReportDataPointVerdict.QAREJECTED + + +@patch("dataland_qa_lab.review.yes_no_value_generator.get_yes_no_values_from_report") +def test_build_yes_no_report_generator_error(mock_get_yes_no_values: Mock) -> None: + # Simulate an error in get_yes_no_values_from_report + mock_get_yes_no_values.side_effect = ValueError("Error in get_yes_no_values_from_report") + + test_data_collection = provide_test_data_collection() + report = yes_no_report_generator.build_yes_no_report(dataset=test_data_collection, relevant_pages="123") + + # Assertions for error handling + assert report.nuclear_energy_related_activities_section426.comment == "Error in get_yes_no_values_from_report" + assert report.nuclear_energy_related_activities_section426.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert report.nuclear_energy_related_activities_section426.corrected_data.value is None + + +@patch("dataland_qa_lab.dataland.data_provider.get_yes_no_values_by_data") +def test_build_yes_no_report_data_provider_error(mock_get_yes_no_values_by_data: Mock) -> None: + # Simulate an error in get_yes_no_values_by_data + mock_get_yes_no_values_by_data.side_effect = ValueError("Error in get_yes_no_values_by_data") + expected_comments = [ + "Error in get_yes_no_values_by_data", + "Error extracting values from template 1: An unexpected error occurred: " + "Error during GPT request creation: Connection error.", + ] + test_data_collection = provide_test_data_collection() + report = yes_no_report_generator.build_yes_no_report(dataset=test_data_collection, relevant_pages="123") + + # Assertions for error handling + assert report.nuclear_energy_related_activities_section426.comment in expected_comments + assert report.nuclear_energy_related_activities_section426.verdict == QaReportDataPointVerdict.QANOTATTEMPTED + assert report.nuclear_energy_related_activities_section426.corrected_data.comment is None