Commit

Df 131 (#39)
* Try catch

* bug fixes

* 0 to -1

* Old df-131 removed and changes taken over

* Ruff error fix

* ruff linting fix

* denominator verdict extended

* test coverage increase for unreviewed_dataset, data_provider and scheduled_processing

* test coverage increase for numeric_generator & prompt_service

* Template 2-5 verdict extended

* Template 1 verdict extended

* test cases added

* df-131 finalized

* Error handling "get_relevant_pages_of_pdf"

* Yes_No List error fixed

* Pages Provider adjusted

* Tests error fixes

* Lint fix

* Tests updated

* text_to_doc output change

* Float conversion replaced with regex matching

* Float conversion fixed

* Sonar error fixed?

* No Data source fixed

* Lint fix

* Comments resolved

* Print tests

* fix: provide a non-empty value for relevant pages to ensure the test does not break

* fix: ensure error is thrown

* fix: ensure no null values are saved as markdown

* fix: arrangement of parameters adapted to patches

---------

Co-authored-by: aardunne <aaron.dunne@gmx.de>
Co-authored-by: fschnizer <falk.schnizer@stud.tu-darmstadt.de>
Co-authored-by: TilmanNiem <tilman.niem@gmail.com>
4 people authored Feb 4, 2025
1 parent 15b9156 commit 62018e4
Showing 28 changed files with 1,074 additions and 440 deletions.
266 changes: 64 additions & 202 deletions notebooks/test_existing_company_reports.ipynb

Large diffs are not rendered by default.

108 changes: 73 additions & 35 deletions src/dataland_qa_lab/dataland/data_provider.py
@@ -14,87 +14,126 @@

 def get_yes_no_values_by_data(data: NuclearAndGasDataCollection) -> dict[str, YesNo | None]:
     """Get Yes/No values of the given dataset as a dictionary with section names as keys."""
-    sections = data.yes_no_data_points
+    try:
+        sections = data.yes_no_data_points
 
-    section_values = {
-        key: (data.datapoint.value if data and data.datapoint and data.datapoint.value is not None else None)
-        for key, data in sections.items()
-    }
+        section_values = {
+            key: (data.datapoint.value if data and data.datapoint and data.datapoint.value is not None else None)
+            for key, data in sections.items()
+        }
+    except (AttributeError, KeyError, TypeError) as e:
+        msg = f"Error retrieving yes/no values: {e}"
+        raise AttributeError(msg) from e
     return section_values
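
The pattern above funnels every KeyError/TypeError/AttributeError hit while walking the datapoints into a single AttributeError with context, so callers only need one handler. A minimal caller-side sketch (import path assumed, not part of this commit):

from dataland_qa_lab.dataland import data_provider  # assumed import path

def yes_no_values_or_empty(collection) -> dict:
    # `collection` is a NuclearAndGasDataCollection; returns {} on any extraction failure.
    try:
        return data_provider.get_yes_no_values_by_data(collection)
    except AttributeError:
        return {}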


 def get_taxonomy_aligned_revenue_denominator_values_by_data(data: NuclearAndGasDataCollection) -> dict:
     """Retrieve taxonomy-aligned revenue denominator values from the dataset."""
     denominator_values_dict = {}
-    denominator_values = data.taxonomy_aligned_denominator.get("taxonomy_aligned_revenue_denominator").datapoint.value
-    for field_name in NuclearAndGasAlignedDenominator.model_fields:
-        denominator_values_dict[field_name] = extract_field_data(denominator_values, field_name)
+    try:
+        denominator_values = data.taxonomy_aligned_denominator.get(
+            "taxonomy_aligned_revenue_denominator"
+        ).datapoint.value
+        for field_name in NuclearAndGasAlignedDenominator.model_fields:
+            denominator_values_dict[field_name] = extract_field_data(denominator_values, field_name)
+    except (AttributeError, KeyError, TypeError) as e:
+        msg = f"Error retrieving taxonomy-aligned revenue denominator: {e}"
+        raise AttributeError(msg) from e
 
     return denominator_values_dict


 def get_taxonomy_aligned_capex_denominator_values_by_data(data: NuclearAndGasDataCollection) -> dict:
     """Retrieve taxonomy-aligned capex denominator values from the dataset."""
     denominator_values_dict = {}
-    denominator_values = data.taxonomy_aligned_denominator.get("taxonomy_aligned_capex_denominator").datapoint.value
-    for field_name in NuclearAndGasAlignedDenominator.model_fields:
-        denominator_values_dict[field_name] = extract_field_data(denominator_values, field_name)
+    try:
+        denominator_values = data.taxonomy_aligned_denominator.get("taxonomy_aligned_capex_denominator").datapoint.value
+        for field_name in NuclearAndGasAlignedDenominator.model_fields:
+            denominator_values_dict[field_name] = extract_field_data(denominator_values, field_name)
+    except (AttributeError, KeyError, TypeError) as e:
+        msg = f"Error retrieving taxonomy-aligned capex denominator: {e}"
+        raise AttributeError(msg) from e
     return denominator_values_dict


 def get_taxonomy_aligned_revenue_numerator_values_by_data(data: NuclearAndGasDataCollection) -> dict:
     """Retrieve taxonomy-aligned revenue numerator values from the dataset."""
     numerator_values_dict = {}
-    numerator_values = data.taxonomy_aligned_numerator.get("taxonomy_aligned_revenue_numerator").datapoint.value
-    for field_name in NuclearAndGasAlignedNumerator.model_fields:
-        numerator_values_dict[field_name] = extract_field_data(numerator_values, field_name)
+    try:
+        numerator_values = data.taxonomy_aligned_numerator.get("taxonomy_aligned_revenue_numerator").datapoint.value
+        for field_name in NuclearAndGasAlignedNumerator.model_fields:
+            numerator_values_dict[field_name] = extract_field_data(numerator_values, field_name)
+    except (AttributeError, KeyError, TypeError) as e:
+        msg = f"Error retrieving taxonomy-aligned revenue numerator: {e}"
+        raise AttributeError(msg) from e
     return numerator_values_dict


 def get_taxonomy_aligned_capex_numerator_values_by_data(data: NuclearAndGasDataCollection) -> dict:
     """Retrieve taxonomy-aligned capex numerator values from the dataset."""
     numerator_values_dict = {}
-    numerator_values = data.taxonomy_aligned_numerator.get("taxonomy_aligned_capex_numerator").datapoint.value
-    for field_name in NuclearAndGasAlignedNumerator.model_fields:
-        numerator_values_dict[field_name] = extract_field_data(numerator_values, field_name)
+    try:
+        numerator_values = data.taxonomy_aligned_numerator.get("taxonomy_aligned_capex_numerator").datapoint.value
+        for field_name in NuclearAndGasAlignedNumerator.model_fields:
+            numerator_values_dict[field_name] = extract_field_data(numerator_values, field_name)
+    except (AttributeError, KeyError, TypeError) as e:
+        msg = f"Error retrieving taxonomy-aligned capex numerator: {e}"
+        raise AttributeError(msg) from e
     return numerator_values_dict


 def get_taxonomy_eligible_but_not_aligned_revenue_values_by_data(data: NuclearAndGasDataCollection) -> dict:
     """Retrieve taxonomy eligible but not aligned revenue numerator values from the dataset."""
     eligible_but_not_aligned_dict = {}
-    eligible_values = data.taxonomy_eligble_but_not_aligned.get("taxonomy_not_aligned_revenue").datapoint.value
-    for field_name in NuclearAndGasEligibleButNotAligned.model_fields:
-        eligible_but_not_aligned_dict[field_name] = extract_field_data(eligible_values, field_name)
+    try:
+        eligible_values = data.taxonomy_eligble_but_not_aligned.get("taxonomy_not_aligned_revenue").datapoint.value
+        for field_name in NuclearAndGasEligibleButNotAligned.model_fields:
+            eligible_but_not_aligned_dict[field_name] = extract_field_data(eligible_values, field_name)
+    except (AttributeError, KeyError, TypeError) as e:
+        msg = f"Error retrieving taxonomy eligible but not aligned revenue: {e}"
+        raise AttributeError(msg) from e
     return eligible_but_not_aligned_dict


 def get_taxonomy_eligible_but_not_aligned_capex_values_by_data(data: NuclearAndGasDataCollection) -> dict:
     """Retrieve taxonomy eligible but not aligned capex from the dataset."""
     eligible_but_not_aligned_dict = {}
-    eligible_values = data.taxonomy_eligble_but_not_aligned.get("taxonomy_not_aligned_capex").datapoint.value
-    for field_name in NuclearAndGasEligibleButNotAligned.model_fields:
-        eligible_but_not_aligned_dict[field_name] = extract_field_data(eligible_values, field_name)
+    try:
+        eligible_values = data.taxonomy_eligble_but_not_aligned.get("taxonomy_not_aligned_capex").datapoint.value
+        for field_name in NuclearAndGasEligibleButNotAligned.model_fields:
+            eligible_but_not_aligned_dict[field_name] = extract_field_data(eligible_values, field_name)
+    except (AttributeError, KeyError, TypeError) as e:
+        msg = f"Error retrieving taxonomy eligible but not aligned capex: {e}"
+        raise AttributeError(msg) from e
     return eligible_but_not_aligned_dict


 def get_taxonomy_non_eligible_revenue_values_by_data(data: NuclearAndGasDataCollection) -> dict:
-    """Retrieve taxonomy non eligible revenue numerator values from the dataset."""
+    """Retrieve taxonomy non-eligible revenue numerator values from the dataset."""
     non_eligible_dict = {}
-    non_eligible_values = data.taxonomy_non_eligible.get("taxonomy_non_eligible_revenue").datapoint.value
-    for field_name in NuclearAndGasNonEligible.model_fields:
-        value = getattr(non_eligible_values, field_name, None)
-        non_eligible_dict[field_name] = -1 if value is None else value
-
+    try:
+        non_eligible_values = data.taxonomy_non_eligible.get("taxonomy_non_eligible_revenue").datapoint.value
+        for field_name in NuclearAndGasNonEligible.model_fields:
+            value = getattr(non_eligible_values, field_name, None)
+            non_eligible_dict[field_name] = -1 if value is None else value
+    except (AttributeError, KeyError, TypeError) as e:
+        msg = f"Error retrieving taxonomy non-eligible revenue: {e}"
+        raise AttributeError(msg) from e
     return non_eligible_dict


 def get_taxonomy_non_eligible_capex_values_by_data(data: NuclearAndGasDataCollection) -> dict:
-    """Retrieve taxonomy non eligible capex numerator values from the dataset."""
+    """Retrieve taxonomy non-eligible capex numerator values from the dataset."""
     non_eligible_dict = {}
-    non_eligible_values = data.taxonomy_non_eligible.get("taxonomy_non_eligible_capex").datapoint.value
-    for field_name in NuclearAndGasNonEligible.model_fields:
-        value = getattr(non_eligible_values, field_name, None)
-        non_eligible_dict[field_name] = -1 if value is None else value
+    try:
+        non_eligible_values = data.taxonomy_non_eligible.get("taxonomy_non_eligible_capex").datapoint.value
+        for field_name in NuclearAndGasNonEligible.model_fields:
+            value = getattr(non_eligible_values, field_name, None)
+            non_eligible_dict[field_name] = -1 if value is None else value
+    except (AttributeError, KeyError, TypeError) as e:
+        msg = f"Error retrieving taxonomy non-eligible capex: {e}"
+        raise AttributeError(msg) from e
    return non_eligible_dict
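
These two functions encode a missing numeric value as -1 rather than 0, the convention behind the "0 to -1" entry in the commit message: a reported zero stays distinguishable from "not found". A standalone sketch of the convention, illustrative only:

MISSING = -1  # sentinel for "value not found"; 0 remains a legitimate reported value

def encode(value: float | None) -> float:
    return MISSING if value is None else value

assert encode(0.0) == 0.0  # a genuine zero survives
assert encode(None) == -1  # absence is marked explicitly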


@@ -144,7 +183,6 @@ def get_datasources_of_nuclear_and_gas_numeric_values(
     section_list = {
         key: data_source for section in sections.values() for key, data_source in extract_data_source(section).items()
     }
-
     return section_list


8 changes: 7 additions & 1 deletion src/dataland_qa_lab/dataland/unreviewed_datasets.py
@@ -15,7 +15,10 @@ def __init__(self) -> None:
         """Initialize the unreviewed datasets with the data from the API."""
         client = config.get_config().dataland_client
         logger.info(msg="Initializing the unreviewed Datasets with the data from Dataland.")
-
+        if client is None:
+            logger.exception("Client Setup failed in the configuration.")
+            msg = "Client Setup failed in the configuration."
+            raise ValueError(msg)
         try:
             number_of_datasets = client.qa_api.get_number_of_pending_datasets()
             if number_of_datasets is None or number_of_datasets < 0:
@@ -29,6 +32,9 @@
 
             self.list_of_data_ids = [dataset.data_id for dataset in self.datasets]
 
+        except RuntimeError:
+            logger.exception("Timeout occurred while initializing the unreviewed datasets.")
+            raise
         except Exception:
             logger.exception(msg="An error occurred", exc_info=Exception)
             raise
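
With this change a timeout (surfaced as RuntimeError) can be told apart from a misconfigured client (ValueError). A usage sketch, assuming the class in this module is named UnreviewedDatasets and the import path matches the file path:

from dataland_qa_lab.dataland.unreviewed_datasets import UnreviewedDatasets  # assumed import path

try:
    pending = UnreviewedDatasets()
except RuntimeError:
    print("Timeout while fetching pending datasets; retry later.")
except ValueError:
    print("Dataland client is not configured; fix the configuration first.")
else:
    print(pending.list_of_data_ids)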
10 changes: 6 additions & 4 deletions src/dataland_qa_lab/pages/pages_provider.py
@@ -16,14 +16,16 @@ def get_relevant_page_numbers(dataset: NuclearAndGasDataCollection) -> list[int]
     return sorted(set(yes_no_pages + numeric_pages))
 
 
-def get_relevant_pages_of_pdf(dataset: NuclearAndGasDataCollection) -> pypdf.PdfReader:
+def get_relevant_pages_of_pdf(dataset: NuclearAndGasDataCollection) -> pypdf.PdfReader | None:
     """Get page numbers of relevant data."""
     dataland_client = config.get_config().dataland_client
 
     page_numbers = get_relevant_page_numbers(dataset=dataset)
-    file_reference = dataset.yes_no_data_points.get(
-        "nuclear_energy_related_activities_section426"
-    ).datapoint.data_source.file_reference
+    try:
+        datapoint = dataset.yes_no_data_points.get("nuclear_energy_related_activities_section426").datapoint
+        file_reference = datapoint.data_source.file_reference
+    except AttributeError:
+        return None
 
     full_pdf = dataland_client.documents_api.get_document(file_reference)
     full_pdf_stream = io.BytesIO(full_pdf)
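
The function goes on (in lines not shown here) to narrow the downloaded PDF to the relevant pages. A typical pypdf construction for that step, as a sketch assuming 1-based page numbers rather than the commit's exact code:

import io

import pypdf

def subset_reader(full_pdf: bytes, page_numbers: list[int]) -> pypdf.PdfReader:
    """Build a reader over only the selected (1-based) pages."""
    reader = pypdf.PdfReader(io.BytesIO(full_pdf))
    writer = pypdf.PdfWriter()
    for number in page_numbers:
        writer.add_page(reader.pages[number - 1])  # pypdf pages are 0-indexed
    buffer = io.BytesIO()
    writer.write(buffer)
    buffer.seek(0)
    return pypdf.PdfReader(buffer)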
3 changes: 3 additions & 0 deletions src/dataland_qa_lab/pages/text_to_doc_intelligence.py
@@ -43,6 +43,9 @@ def get_markdown_from_dataset(data_id: str, relevant_pages_pdf_reader: pypdf.Pdf
     else:
         readable_text = extract_text_of_pdf(relevant_pages_pdf_reader)
 
+    if readable_text is None:
+        return None
+
     new_document = ReviewedDatasetMarkdowns(
         data_id=data_id,
         markdown_text=readable_text,
8 changes: 4 additions & 4 deletions src/dataland_qa_lab/prompting_services/prompting_service.py
@@ -21,7 +21,7 @@ def create_main_prompt(template: int, pdf: str, kpi: str) -> str:
         "Taxonomy-aligned economic activities (denominator)",
         give me the percentage of "CCM+CCA", "CCM" and "CCA" for all rows.
         Focus on the row numbers on the left side of the table.
-        If you can't find the percentage value, write "0".
+        If you can't find the percentage value, write "-1".
         Consider translating for this given task like Meldebogen instead of template.
         # Relevant Documents
         {pdf}
@@ -31,7 +31,7 @@
         "Taxonomy-aligned economic activities (numerator)",
         give me the percentage of "CCM+CCA", "CCM" and "CCA" for all rows.
         Focus on the row numbers on the left side of the table.
-        If you can't find the percentage value, write "0".
+        If you can't find the percentage value, write "-1".
         Consider translating for this given task like Meldebogen instead of template.
         # Relevant Documents
         {pdf}
@@ -41,7 +41,7 @@
         "Taxonomy-eligible but not taxonomy-aligned economic activities",
         give me the percentage of "CCM+CCA", "CCM" and "CCA" for all rows.
         Focus on the row numbers on the left side of the table.
-        If you can't find the percentage value, write "0".
+        If you can't find the percentage value, write "-1".
         Consider translating for this given task like Meldebogen instead of template.
         # Relevant Documents
         {pdf}
@@ -51,7 +51,7 @@
         "Taxonomy non-eligible economic activities",
         give me the percentage for all rows.
         Focus on the row numbers on the left side of the table.
-        If you can't find the percentage value, write "0".
+        If you can't find the percentage value, write "-1".
         Consider translating for this given task like Meldebogen instead of template.
         # Relevant Documents
         {pdf}
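
Asking the model to write "-1" for missing percentages pairs with the "Float conversion replaced with regex matching" entry in the commit message: the response text is scanned for a numeric token instead of being fed to float() directly. A hedged sketch of that parsing step (the helper name is hypothetical):

import re

_NUMBER = re.compile(r"-?\d+(?:[.,]\d+)?")

def parse_percentage(answer: str) -> float:
    """Return the first numeric token in `answer`, or -1.0 if none is found."""
    match = _NUMBER.search(answer)
    if match is None:
        return -1.0
    return float(match.group().replace(",", "."))

assert parse_percentage("12,5 %") == 12.5
assert parse_percentage("n/a") == -1.0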
23 changes: 15 additions & 8 deletions src/dataland_qa_lab/review/dataset_reviewer.py
@@ -45,15 +45,22 @@ def review_dataset(data_id: str, single_pass_e2e: bool = False) -> QaReportMetaI
         logger.debug("Relevant page numbers extracted.")
 
         relevant_pages_pdf_reader = pages_provider.get_relevant_pages_of_pdf(data_collection)
-        logger.debug("Relevant pages extracted.")
-
-        readable_text = text_to_doc_intelligence.get_markdown_from_dataset(
-            data_id=data_id, page_numbers=page_numbers, relevant_pages_pdf_reader=relevant_pages_pdf_reader
-        )
-        logger.debug("Text extracted from the relevant pages.")
-
-        report = NuclearAndGasReportGenerator().generate_report(relevant_pages=readable_text, dataset=data_collection)
-        logger.info("Report generated succesfully.")
+        if relevant_pages_pdf_reader is None:
+            logger.debug("No data source found for the relevant pages.")
+            report = NuclearAndGasReportGenerator().generate_report(relevant_pages=None, dataset=data_collection)
+            logger.info("QA-not-attempted report generated successfully.")
+
+        else:
+            logger.debug("Relevant pages extracted.")
+            readable_text = text_to_doc_intelligence.get_markdown_from_dataset(
+                data_id=data_id, page_numbers=page_numbers, relevant_pages_pdf_reader=relevant_pages_pdf_reader
+            )
+            logger.debug("Text extracted from the relevant pages.")
+
+            report = NuclearAndGasReportGenerator().generate_report(
+                relevant_pages=readable_text, dataset=data_collection
+            )
+            logger.info("Report generated successfully.")
 
         data = config.get_config().dataland_client.eu_taxonomy_nuclear_gas_qa_api.post_nuclear_and_gas_data_qa_report(
             data_id=data_id, nuclear_and_gas_data=report
95 changes: 61 additions & 34 deletions src/dataland_qa_lab/review/generate_gpt_request.py
@@ -21,38 +21,65 @@ def generate_gpt_request(mainprompt: str, subprompt: str) -> list:
     Returns:
         List[str]: A list of extracted values from the GPT response.
     Raises:
+        ValueError: For any issues encountered during the process.
     """
-    conf = config.get_config()
-
-    client = AzureOpenAI(
-        api_key=conf.azure_openai_api_key,
-        api_version="2024-07-01-preview",
-        azure_endpoint=conf.azure_openai_endpoint,
-    )
-    updated_openai_response = client.chat.completions.create(
-        model="gpt-4o",
-        temperature=0,
-        messages=[
-            {"role": "system", "content": mainprompt},
-        ],
-        tool_choice="required",
-        tools=[
-            {
-                "type": "function",
-                "function": {
-                    "name": "requested_information_precisely_found_in_relevant_documents",
-                    "description": "Submit the requested information. "
-                    "Use this function when the information is precisely stated in the relevant documents.",
-                    "parameters": subprompt,
-                },
-            }
-        ],
-    )
-    if updated_openai_response.choices[0].message.tool_calls:
-        tool_call = updated_openai_response.choices[0].message.tool_calls[0].function
-    else:
-        msg_p = "No tool calls found in the GPT response."
-        logger.exception(msg=msg_p, exc_info=ValueError)
-        raise ValueError(msg_p)
-    data_dict = ast.literal_eval(tool_call.arguments)
-    return list(data_dict.values())
+    try:
+        try:
+            conf = config.get_config()
+        except Exception as e:
+            msg = f"Error loading configuration in Gpt_request generator: {e}"
+            raise ValueError(msg) from e
+
+        # Initialize Azure OpenAI client
+        try:
+            client = AzureOpenAI(
+                api_key=conf.azure_openai_api_key,
+                api_version="2024-07-01-preview",
+                azure_endpoint=conf.azure_openai_endpoint,
+            )
+        except Exception as e:
+            msg = f"Error initializing AzureOpenAI client: {e}"
+            raise ValueError(msg) from e
+
+        # Create GPT request
+        try:
+            updated_openai_response = client.chat.completions.create(
+                model="gpt-4o",
+                temperature=0,
+                messages=[
+                    {"role": "system", "content": mainprompt},
+                ],
+                tool_choice="required",
+                tools=[
+                    {
+                        "type": "function",
+                        "function": {
+                            "name": "requested_information_precisely_found_in_relevant_documents",
+                            "description": "Submit the requested information. "
+                            "Use this function when the information is precisely stated in the relevant documents.",
+                            "parameters": subprompt,
+                        },
+                    }
+                ],
+            )
+        except Exception as e:
+            msg = f"Error during GPT request creation: {e}"
+            raise ValueError(msg) from e
+
+        try:
+            if updated_openai_response.choices[0].message.tool_calls:
+                tool_call = updated_openai_response.choices[0].message.tool_calls[0].function
+            else:
+                msg = "No tool calls found in the GPT response."
+                raise ValueError(msg)
+        except Exception as e:
+            msg = f"Error extracting tool calls: {e}"
+            raise ValueError(msg) from e
+
+        data_dict = ast.literal_eval(tool_call.arguments)
+
+        return list(data_dict.values())
+
+    except (ValueError, KeyError, TypeError) as general_error:
+        # General error handling
+        msg = f"An unexpected error occurred: {general_error}"
+        raise ValueError(msg) from general_error
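
After this refactor, every failure mode inside the helper surfaces as a ValueError, so a single handler suffices at the call site. A usage sketch; the import path and the schema below are stand-ins, not the project's real PromptingService output:

from dataland_qa_lab.review.generate_gpt_request import generate_gpt_request  # assumed import path

main_prompt = "# Instructions: extract the requested percentages from the documents below ..."
sub_prompt = {  # passed through as the tool's `parameters`; a JSON-schema stand-in
    "type": "object",
    "properties": {"ccm_and_cca": {"type": "number"}},
    "required": ["ccm_and_cca"],
}

try:
    values = generate_gpt_request(mainprompt=main_prompt, subprompt=sub_prompt)
except ValueError as err:
    values = []
    print(f"GPT extraction failed: {err}")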