Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Df 131 #39

Merged
merged 36 commits into from
Feb 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
5338de3
Try catch
aardunne Jan 22, 2025
acf556c
bug fixes
aardunne Jan 22, 2025
9292a19
0 to -1
aardunne Jan 22, 2025
8b596a6
Old df-131 removed and changes taken over
Si2-Aung Jan 23, 2025
673eeb5
Ruff error fix
Si2-Aung Jan 23, 2025
0a25dc1
ruff linting fix
Si2-Aung Jan 23, 2025
d582013
denominator verdict extended
Si2-Aung Jan 24, 2025
3de085f
test coverage increase for unreviewed_dataset, data_provider and sche…
Si2-Aung Jan 24, 2025
b48bd9a
test coverage increase for numeric_generator & prompt_servic
Si2-Aung Jan 24, 2025
ee85ecc
Template 2-5 verdict extended
Si2-Aung Jan 24, 2025
6a1bb9e
Template 1 verdict extended
Si2-Aung Jan 24, 2025
3e8015c
test cases added
Si2-Aung Jan 26, 2025
aa8c4a6
df-131 finalized
Si2-Aung Jan 26, 2025
81a913c
Error handling "get_relevant_pages_of_pdf"
fschnizer Jan 28, 2025
e339c16
Merge remote-tracking branch 'origin/main' into df-131
Si2-Aung Jan 28, 2025
c9efcd6
Yes_No List error fixed
Si2-Aung Jan 29, 2025
2f74818
Merge branch 'main' into df-131
Si2-Aung Jan 29, 2025
6626c7e
Pages Provider angepasst
Si2-Aung Jan 30, 2025
eebf87c
Merge branch 'main' into df-131 (loggin)
Si2-Aung Jan 30, 2025
f6d23a7
Tests error fixes
Si2-Aung Jan 30, 2025
2cad86a
Lint fix
Si2-Aung Jan 30, 2025
099d1c9
Merge branch 'main' into df-131
Si2-Aung Jan 30, 2025
fa6b75f
Test_updated
Si2-Aung Jan 30, 2025
e96d202
text_to_doc output change
Si2-Aung Jan 30, 2025
f63dfd7
Float convertion replaced with regex matching
Si2-Aung Jan 31, 2025
965a864
Float convertion fixed
Si2-Aung Jan 31, 2025
e8996d1
Sonar error fixed?
Si2-Aung Jan 31, 2025
569e44c
No Data source fixed
Si2-Aung Feb 1, 2025
ad9b3d4
Lint fix
Si2-Aung Feb 1, 2025
96f811e
Comments resolved
Si2-Aung Feb 3, 2025
fa87701
Print tests
Si2-Aung Feb 4, 2025
5b40981
fix: provide a not empty value to relevant pages to ensure test does …
TilmanNiem Feb 4, 2025
3d3ae22
fix: ensure error is thrown
TilmanNiem Feb 4, 2025
7068dba
fix: ensure no null values are saved as markdown
TilmanNiem Feb 4, 2025
2b9c55e
fix: arrangement of parameters adapted to patches
TilmanNiem Feb 4, 2025
5921bb3
Merge branch 'main' into df-131
Si2-Aung Feb 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
266 changes: 64 additions & 202 deletions notebooks/test_existing_company_reports.ipynb

Large diffs are not rendered by default.

108 changes: 73 additions & 35 deletions src/dataland_qa_lab/dataland/data_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,87 +14,126 @@

def get_yes_no_values_by_data(data: NuclearAndGasDataCollection) -> dict[str, YesNo | None]:
    """Get Yes/No values of the given dataset as a dictionary with section names as keys.

    Args:
        data: Collection of nuclear-and-gas datapoints to read from.

    Returns:
        Mapping from section name to its Yes/No value, or ``None`` when the
        datapoint (or its value) is missing.

    Raises:
        AttributeError: If the yes/no values cannot be retrieved from the dataset.
    """
    try:
        sections = data.yes_no_data_points
        # Loop variable renamed from `data` to `point` so the parameter is not shadowed.
        section_values = {
            key: (point.datapoint.value if point and point.datapoint and point.datapoint.value is not None else None)
            for key, point in sections.items()
        }
    except (AttributeError, KeyError, TypeError) as e:
        # Re-raise with context so callers see which lookup failed.
        msg = f"Error retrieving yes/no values: {e}"
        raise AttributeError(msg) from e
    return section_values


def get_taxonomy_aligned_revenue_denominator_values_by_data(data: NuclearAndGasDataCollection) -> dict:
    """Retrieve taxonomy-aligned revenue denominator values from the dataset.

    Args:
        data: Collection of nuclear-and-gas datapoints to read from.

    Returns:
        Mapping from each ``NuclearAndGasAlignedDenominator`` field name to the
        value extracted from the revenue-denominator datapoint.

    Raises:
        AttributeError: If the denominator datapoint is missing or malformed.
    """
    denominator_values_dict = {}
    try:
        denominator_values = data.taxonomy_aligned_denominator.get(
            "taxonomy_aligned_revenue_denominator"
        ).datapoint.value
        for field_name in NuclearAndGasAlignedDenominator.model_fields:
            denominator_values_dict[field_name] = extract_field_data(denominator_values, field_name)
    except (AttributeError, KeyError, TypeError) as e:
        msg = f"Error retrieving taxonomy-aligned revenue denominator: {e}"
        raise AttributeError(msg) from e

    return denominator_values_dict


def get_taxonomy_aligned_capex_denominator_values_by_data(data: NuclearAndGasDataCollection) -> dict:
    """Retrieve taxonomy-aligned capex denominator values from the dataset.

    Args:
        data: Collection of nuclear-and-gas datapoints to read from.

    Returns:
        Mapping from each ``NuclearAndGasAlignedDenominator`` field name to the
        value extracted from the capex-denominator datapoint.

    Raises:
        AttributeError: If the denominator datapoint is missing or malformed.
    """
    denominator_values_dict = {}
    try:
        denominator_values = data.taxonomy_aligned_denominator.get("taxonomy_aligned_capex_denominator").datapoint.value
        for field_name in NuclearAndGasAlignedDenominator.model_fields:
            denominator_values_dict[field_name] = extract_field_data(denominator_values, field_name)
    except (AttributeError, KeyError, TypeError) as e:
        msg = f"Error retrieving taxonomy-aligned capex denominator: {e}"
        raise AttributeError(msg) from e
    return denominator_values_dict


def get_taxonomy_aligned_revenue_numerator_values_by_data(data: NuclearAndGasDataCollection) -> dict:
    """Retrieve taxonomy-aligned revenue numerator values from the dataset.

    Args:
        data: Collection of nuclear-and-gas datapoints to read from.

    Returns:
        Mapping from each ``NuclearAndGasAlignedNumerator`` field name to the
        value extracted from the revenue-numerator datapoint.

    Raises:
        AttributeError: If the numerator datapoint is missing or malformed.
    """
    numerator_values_dict = {}
    try:
        numerator_values = data.taxonomy_aligned_numerator.get("taxonomy_aligned_revenue_numerator").datapoint.value
        for field_name in NuclearAndGasAlignedNumerator.model_fields:
            numerator_values_dict[field_name] = extract_field_data(numerator_values, field_name)
    except (AttributeError, KeyError, TypeError) as e:
        msg = f"Error retrieving taxonomy-aligned revenue numerator: {e}"
        raise AttributeError(msg) from e
    return numerator_values_dict


def get_taxonomy_aligned_capex_numerator_values_by_data(data: NuclearAndGasDataCollection) -> dict:
    """Retrieve taxonomy-aligned capex numerator values from the dataset.

    Args:
        data: Collection of nuclear-and-gas datapoints to read from.

    Returns:
        Mapping from each ``NuclearAndGasAlignedNumerator`` field name to the
        value extracted from the capex-numerator datapoint.

    Raises:
        AttributeError: If the numerator datapoint is missing or malformed.
    """
    numerator_values_dict = {}
    try:
        numerator_values = data.taxonomy_aligned_numerator.get("taxonomy_aligned_capex_numerator").datapoint.value
        for field_name in NuclearAndGasAlignedNumerator.model_fields:
            numerator_values_dict[field_name] = extract_field_data(numerator_values, field_name)
    except (AttributeError, KeyError, TypeError) as e:
        msg = f"Error retrieving taxonomy-aligned capex numerator: {e}"
        raise AttributeError(msg) from e
    return numerator_values_dict


def get_taxonomy_eligible_but_not_aligned_revenue_values_by_data(data: NuclearAndGasDataCollection) -> dict:
    """Retrieve taxonomy eligible but not aligned revenue numerator values from the dataset.

    Args:
        data: Collection of nuclear-and-gas datapoints to read from.

    Returns:
        Mapping from each ``NuclearAndGasEligibleButNotAligned`` field name to
        the value extracted from the not-aligned revenue datapoint.

    Raises:
        AttributeError: If the datapoint is missing or malformed.
    """
    eligible_but_not_aligned_dict = {}
    try:
        # NOTE(review): attribute name "taxonomy_eligble_but_not_aligned" is misspelled
        # in the data collection class — kept as-is to match that interface.
        eligible_values = data.taxonomy_eligble_but_not_aligned.get("taxonomy_not_aligned_revenue").datapoint.value
        for field_name in NuclearAndGasEligibleButNotAligned.model_fields:
            eligible_but_not_aligned_dict[field_name] = extract_field_data(eligible_values, field_name)
    except (AttributeError, KeyError, TypeError) as e:
        msg = f"Error retrieving taxonomy eligible but not aligned revenue: {e}"
        raise AttributeError(msg) from e
    return eligible_but_not_aligned_dict


def get_taxonomy_eligible_but_not_aligned_capex_values_by_data(data: NuclearAndGasDataCollection) -> dict:
    """Retrieve taxonomy eligible but not aligned capex from the dataset.

    Args:
        data: Collection of nuclear-and-gas datapoints to read from.

    Returns:
        Mapping from each ``NuclearAndGasEligibleButNotAligned`` field name to
        the value extracted from the not-aligned capex datapoint.

    Raises:
        AttributeError: If the datapoint is missing or malformed.
    """
    eligible_but_not_aligned_dict = {}
    try:
        # NOTE(review): attribute name "taxonomy_eligble_but_not_aligned" is misspelled
        # in the data collection class — kept as-is to match that interface.
        eligible_values = data.taxonomy_eligble_but_not_aligned.get("taxonomy_not_aligned_capex").datapoint.value
        for field_name in NuclearAndGasEligibleButNotAligned.model_fields:
            eligible_but_not_aligned_dict[field_name] = extract_field_data(eligible_values, field_name)
    except (AttributeError, KeyError, TypeError) as e:
        msg = f"Error retrieving taxonomy eligible but not aligned capex: {e}"
        raise AttributeError(msg) from e
    return eligible_but_not_aligned_dict


def get_taxonomy_non_eligible_revenue_values_by_data(data: NuclearAndGasDataCollection) -> dict:
    """Retrieve taxonomy non-eligible revenue numerator values from the dataset.

    Args:
        data: Collection of nuclear-and-gas datapoints to read from.

    Returns:
        Mapping from each ``NuclearAndGasNonEligible`` field name to its value,
        with ``-1`` substituted for missing values.

    Raises:
        AttributeError: If the non-eligible revenue datapoint is missing or malformed.
    """
    non_eligible_dict = {}
    try:
        non_eligible_values = data.taxonomy_non_eligible.get("taxonomy_non_eligible_revenue").datapoint.value
        for field_name in NuclearAndGasNonEligible.model_fields:
            value = getattr(non_eligible_values, field_name, None)
            # -1 is the sentinel for "value not found" used across this module.
            non_eligible_dict[field_name] = -1 if value is None else value
    except (AttributeError, KeyError, TypeError) as e:
        msg = f"Error retrieving taxonomy non-eligible revenue: {e}"
        raise AttributeError(msg) from e
    return non_eligible_dict


def get_taxonomy_non_eligible_capex_values_by_data(data: NuclearAndGasDataCollection) -> dict:
    """Retrieve taxonomy non-eligible capex numerator values from the dataset.

    Args:
        data: Collection of nuclear-and-gas datapoints to read from.

    Returns:
        Mapping from each ``NuclearAndGasNonEligible`` field name to its value,
        with ``-1`` substituted for missing values.

    Raises:
        AttributeError: If the non-eligible capex datapoint is missing or malformed.
    """
    non_eligible_dict = {}
    try:
        non_eligible_values = data.taxonomy_non_eligible.get("taxonomy_non_eligible_capex").datapoint.value
        for field_name in NuclearAndGasNonEligible.model_fields:
            value = getattr(non_eligible_values, field_name, None)
            # -1 is the sentinel for "value not found" used across this module.
            non_eligible_dict[field_name] = -1 if value is None else value
    except (AttributeError, KeyError, TypeError) as e:
        msg = f"Error retrieving taxonomy non-eligible capex: {e}"
        raise AttributeError(msg) from e
    return non_eligible_dict


Expand Down Expand Up @@ -144,7 +183,6 @@ def get_datasources_of_nuclear_and_gas_numeric_values(
section_list = {
key: data_source for section in sections.values() for key, data_source in extract_data_source(section).items()
}

return section_list


Expand Down
8 changes: 7 additions & 1 deletion src/dataland_qa_lab/dataland/unreviewed_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@ def __init__(self) -> None:
"""Initialize the unreviewed datasets with the data from the API."""
client = config.get_config().dataland_client
logger.info(msg="Initializing the unreviewed Datasets with the data from Dataland.")

if client is None:
logger.exception("Client Setup failed in the configuration.")
msg = "Client Setup failed in the configuration."
raise ValueError(msg)
try:
number_of_datasets = client.qa_api.get_number_of_pending_datasets()
if number_of_datasets is None or number_of_datasets < 0:
Expand All @@ -29,6 +32,9 @@ def __init__(self) -> None:

self.list_of_data_ids = [dataset.data_id for dataset in self.datasets]

except RuntimeError:
logger.exception("Timeout occurred while initializing the unreviewed datasets.")
raise
except Exception:
logger.exception(msg="An error occurred", exc_info=Exception)
raise
10 changes: 6 additions & 4 deletions src/dataland_qa_lab/pages/pages_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,16 @@ def get_relevant_page_numbers(dataset: NuclearAndGasDataCollection) -> list[int]
return sorted(set(yes_no_pages + numeric_pages))


def get_relevant_pages_of_pdf(dataset: NuclearAndGasDataCollection) -> pypdf.PdfReader:
def get_relevant_pages_of_pdf(dataset: NuclearAndGasDataCollection) -> pypdf.PdfReader | None:
"""Get page numbers of relevant data."""
dataland_client = config.get_config().dataland_client

page_numbers = get_relevant_page_numbers(dataset=dataset)
file_reference = dataset.yes_no_data_points.get(
"nuclear_energy_related_activities_section426"
).datapoint.data_source.file_reference
try:
datapoint = dataset.yes_no_data_points.get("nuclear_energy_related_activities_section426").datapoint
file_reference = datapoint.data_source.file_reference
except AttributeError:
return None

full_pdf = dataland_client.documents_api.get_document(file_reference)
full_pdf_stream = io.BytesIO(full_pdf)
Expand Down
3 changes: 3 additions & 0 deletions src/dataland_qa_lab/pages/text_to_doc_intelligence.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ def get_markdown_from_dataset(data_id: str, relevant_pages_pdf_reader: pypdf.Pdf
else:
readable_text = extract_text_of_pdf(relevant_pages_pdf_reader)

if readable_text is None:
return None

new_document = ReviewedDatasetMarkdowns(
data_id=data_id,
markdown_text=readable_text,
Expand Down
8 changes: 4 additions & 4 deletions src/dataland_qa_lab/prompting_services/prompting_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def create_main_prompt(template: int, pdf: str, kpi: str) -> str:
"Taxonomy-aligned economic activities (denominator)",
give me the percentage of "CCM+CCA", "CCM" and "CCA" for all rows.
Focus on the row numbers on the left side of the table.
If you can't find the percentage value, write "0".
If you can't find the percentage value, write "-1".
Consider translating for this given task like Meldebogen instead of template.
# Relevant Documents
{pdf}
Expand All @@ -31,7 +31,7 @@ def create_main_prompt(template: int, pdf: str, kpi: str) -> str:
"Taxonomy-aligned economic activities (numerator)",
give me the percentage of "CCM+CCA", "CCM" and "CCA" for all rows.
Focus on the row numbers on the left side of the table.
If you can't find the percentage value, write "0".
If you can't find the percentage value, write "-1".
Consider translating for this given task like Meldebogen instead of template.
# Relevant Documents
{pdf}
Expand All @@ -41,7 +41,7 @@ def create_main_prompt(template: int, pdf: str, kpi: str) -> str:
"Taxonomy-eligible but not taxonomy-aligned economic activities",
give me the percentage of "CCM+CCA", "CCM" and "CCA" for all rows.
Focus on the row numbers on the left side of the table.
If you can't find the percentage value, write "0".
If you can't find the percentage value, write "-1".
Consider translating for this given task like Meldebogen instead of template.
# Relevant Documents
{pdf}
Expand All @@ -51,7 +51,7 @@ def create_main_prompt(template: int, pdf: str, kpi: str) -> str:
"Taxonomy non-eligible economic activities",
give me the percentage for all rows.
Focus on the row numbers on the left side of the table.
If you can't find the percentage value, write "0".
If you can't find the percentage value, write "-1".
Consider translating for this given task like Meldebogen instead of template.
# Relevant Documents
{pdf}
Expand Down
23 changes: 15 additions & 8 deletions src/dataland_qa_lab/review/dataset_reviewer.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,22 @@ def review_dataset(data_id: str, single_pass_e2e: bool = False) -> QaReportMetaI
logger.debug("Relevant page numbers extracted.")

relevant_pages_pdf_reader = pages_provider.get_relevant_pages_of_pdf(data_collection)
logger.debug("Relevant pages extracted.")
if relevant_pages_pdf_reader is None:
logger.debug("No Data source found for the relevant pages.")
report = NuclearAndGasReportGenerator().generate_report(relevant_pages=None, dataset=data_collection)
logger.info("QA not attempted report generated successfully.")

readable_text = text_to_doc_intelligence.get_markdown_from_dataset(
data_id=data_id, page_numbers=page_numbers, relevant_pages_pdf_reader=relevant_pages_pdf_reader
)
logger.debug("Text extracted from the relevant pages.")

report = NuclearAndGasReportGenerator().generate_report(relevant_pages=readable_text, dataset=data_collection)
logger.info("Report generated succesfully.")
else:
logger.debug("Relevant pages extracted.")
readable_text = text_to_doc_intelligence.get_markdown_from_dataset(
data_id=data_id, page_numbers=page_numbers, relevant_pages_pdf_reader=relevant_pages_pdf_reader
)
logger.debug("Text extracted from the relevant pages.")

report = NuclearAndGasReportGenerator().generate_report(
relevant_pages=readable_text, dataset=data_collection
)
logger.info("Report generated succesfully.")

data = config.get_config().dataland_client.eu_taxonomy_nuclear_gas_qa_api.post_nuclear_and_gas_data_qa_report(
data_id=data_id, nuclear_and_gas_data=report
Expand Down
95 changes: 61 additions & 34 deletions src/dataland_qa_lab/review/generate_gpt_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,38 +21,65 @@ def generate_gpt_request(mainprompt: str, subprompt: str) -> list:

Returns:
List[str]: A list of extracted values from the GPT response.

Raises:
ValueError: For any issues encountered during the process.
"""
conf = config.get_config()

client = AzureOpenAI(
api_key=conf.azure_openai_api_key,
api_version="2024-07-01-preview",
azure_endpoint=conf.azure_openai_endpoint,
)
updated_openai_response = client.chat.completions.create(
model="gpt-4o",
temperature=0,
messages=[
{"role": "system", "content": mainprompt},
],
tool_choice="required",
tools=[
{
"type": "function",
"function": {
"name": "requested_information_precisely_found_in_relevant_documents",
"description": "Submit the requested information. "
"Use this function when the information is precisely stated in the relevant documents.",
"parameters": subprompt,
},
}
],
)
if updated_openai_response.choices[0].message.tool_calls:
tool_call = updated_openai_response.choices[0].message.tool_calls[0].function
else:
msg_p = "No tool calls found in the GPT response."
logger.exception(msg=msg_p, exc_info=ValueError)
raise ValueError(msg_p)
data_dict = ast.literal_eval(tool_call.arguments)
return list(data_dict.values())
try:
try:
conf = config.get_config()
except Exception as e:
msg = f"Error loading configuration in Gpt_request generator: {e}"
raise ValueError(msg) from e

# Initialize Azure OpenAI client
try:
client = AzureOpenAI(
api_key=conf.azure_openai_api_key,
api_version="2024-07-01-preview",
azure_endpoint=conf.azure_openai_endpoint,
)
except Exception as e:
msg = f"Error initializing AzureOpenAI client: {e}"
raise ValueError(msg) from e

# Create GPT request
try:
updated_openai_response = client.chat.completions.create(
model="gpt-4o",
temperature=0,
messages=[
{"role": "system", "content": mainprompt},
],
tool_choice="required",
tools=[
{
"type": "function",
"function": {
"name": "requested_information_precisely_found_in_relevant_documents",
"description": "Submit the requested information. "
"Use this function when the information is precisely stated in the relevant documents.",
"parameters": subprompt,
},
}
],
)
except Exception as e:
msg = f"Error during GPT request creation: {e}"
raise ValueError(msg) from e

try:
if updated_openai_response.choices[0].message.tool_calls:
tool_call = updated_openai_response.choices[0].message.tool_calls[0].function
except Exception as e:
msg = f"Error extracting tool calls: {e}"
raise ValueError(e) from e

data_dict = ast.literal_eval(tool_call.arguments)

return list(data_dict.values())

except (ValueError, KeyError, TypeError) as general_error:
# General error handling
msg = f"An unexpected error occurred: {general_error}"
raise ValueError(msg) from general_error
Loading
Loading