
Commit

DF-21 final changes
chrisBer67 committed Jan 29, 2025
1 parent 7d580f2 commit 7aa5013
Showing 3 changed files with 65 additions and 36 deletions.
4 changes: 2 additions & 2 deletions src/dataland_qa_lab/dataland/scheduled_processor.py
@@ -16,7 +16,7 @@ def run_scheduled_processing(iterations: int) -> None:
         try:
             unreviewed_datasets = UnreviewedDatasets()
             list_of_data_ids = unreviewed_datasets.list_of_data_ids
-            logger.info("Processing unreviewed datasets with the list of Data ids: %s", list_of_data_ids)
+            logger.info("Processing unreviewed datasets with the list of Data-IDs: %s", list_of_data_ids)
 
             if not list_of_data_ids:
                 time.sleep(600)
@@ -28,7 +28,7 @@ def run_scheduled_processing(iterations: int) -> None:
                     list_of_data_ids.remove(data_id)
 
                 except Exception:
-                    logger.exception("Error processing dataset %s", data_id)
+                    logger.exception("Error processing dataset with the Data-ID: %s", data_id)
 
         except Exception as e:
             logger.critical("Critical error: %s", e)
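
The logger.info, logger.exception, and logger.critical calls above (and the module logger added in dataset_reviewer.py below) go through the standard library logging module. A minimal sketch of enabling that output at DEBUG level follows; this configuration is an assumption for illustration and is not part of the commit:

    import logging

    # Illustration only, not part of commit 7aa5013: make the logger.debug()
    # and logger.info() calls emitted by the QA-lab modules visible on stdout.
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s %(name)s %(levelname)s: %(message)s",
    )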
96 changes: 63 additions & 33 deletions src/dataland_qa_lab/review/dataset_reviewer.py
@@ -1,3 +1,4 @@
+import logging
 from datetime import UTC, datetime, timedelta, timezone
 
 from dataland_qa.models.qa_report_meta_information import QaReportMetaInformation
@@ -10,51 +11,80 @@
 from dataland_qa_lab.utils import config
 from dataland_qa_lab.utils.nuclear_and_gas_data_collection import NuclearAndGasDataCollection
 
+logger = logging.getLogger(__name__)
+
 
 def review_dataset(data_id: str) -> QaReportMetaInformation | None:
     """Review a dataset."""
-    dataset = dataset_provider.get_dataset_by_id(data_id)
+    try:
+        logger.info("Starting the review of the Dataset: %s", data_id)
 
-    create_tables()
+        dataset = dataset_provider.get_dataset_by_id(data_id)
+        logger.debug("Dataset retrieved form the given Data-ID.")
 
-    existing_entity = get_entity(data_id, ReviewedDataset)
+        logger.info("Creating database.")
+        create_tables()
 
-    now_utc = datetime.now(UTC)
-    ger_timezone = timedelta(hours=2) if now_utc.astimezone(timezone(timedelta(hours=1))).dst() else timedelta(hours=1)
-    formatted_german_time1 = (now_utc + ger_timezone).strftime("%Y-%m-%d %H:%M:%S")
+        existing_entity = get_entity(data_id, ReviewedDataset)
 
-    if existing_entity is None:
-        review_dataset = ReviewedDataset(data_id=data_id, review_start_time=formatted_german_time1)
+        now_utc = datetime.now(UTC)
+        ger_timezone = timedelta(hours=2) if now_utc.astimezone(
+            timezone(timedelta(hours=1))).dst() else timedelta(hours=1)
+        formatted_german_time1 = (now_utc + ger_timezone).strftime("%Y-%m-%d %H:%M:%S")
 
-        add_entity(review_dataset)
+        logger.debug("Checking if the dataset is already existing in the database")
+        if existing_entity is None:
+            logger.info("Dataset with the Data-ID does not exist in the database. Starting review.")
+            review_dataset = ReviewedDataset(data_id=data_id, review_start_time=formatted_german_time1)
 
-        data_collection = NuclearAndGasDataCollection(dataset.data)
+            logger.debug("Adding the dataset in the database with the Data-ID and review start time.")
+            add_entity(review_dataset)
 
-        page_numbers = pages_provider.get_relevant_page_numbers(data_collection)
+            data_collection = NuclearAndGasDataCollection(dataset.data)
+            logger.debug("Data collection created.")
 
-        relevant_pages_pdf_reader = pages_provider.get_relevant_pages_of_pdf(data_collection)
+            page_numbers = pages_provider.get_relevant_page_numbers(data_collection)
+            logger.debug("Relevant page numbers extracted.")
 
-        readable_text = text_to_doc_intelligence.get_markdown_from_dataset(
-            data_id=data_id, page_numbers=page_numbers, relevant_pages_pdf_reader=relevant_pages_pdf_reader
-        )
+            relevant_pages_pdf_reader = pages_provider.get_relevant_pages_of_pdf(data_collection)
+            logger.debug("Relevant pages extracted.")
 
-        report = NuclearAndGasReportGenerator().generate_report(relevant_pages=readable_text, dataset=data_collection)
+            readable_text = text_to_doc_intelligence.get_markdown_from_dataset(
+                data_id=data_id, page_numbers=page_numbers, relevant_pages_pdf_reader=relevant_pages_pdf_reader
+            )
+            logger.debug("Text extracted from the relevant pages.")
 
-        data = config.get_config().dataland_client.eu_taxonomy_nuclear_gas_qa_api.post_nuclear_and_gas_data_qa_report(
-            data_id=data_id, nuclear_and_gas_data=report
-        )
+            report = NuclearAndGasReportGenerator().generate_report(
+                relevant_pages=readable_text, dataset=data_collection)
+            logger.info("Report generated succesfully.")
 
-        now_utc = datetime.now(UTC)
-        if now_utc.astimezone(timezone(timedelta(hours=1))).dst():
-            ger_timezone = timedelta(hours=2)
-        else:
-            ger_timezone = timedelta(hours=1)
-
-        formatted_german_time2 = (now_utc + ger_timezone).strftime("%Y-%m-%d %H:%M:%S")
-        review_dataset.review_end_time = formatted_german_time2
-        review_dataset.review_completed = True
-        review_dataset.report_id = data.qa_report_id
-
-        update_entity(review_dataset)
-        return data
-    return None
+            data = config.get_config(
+            ).dataland_client.eu_taxonomy_nuclear_gas_qa_api.post_nuclear_and_gas_data_qa_report(
+                data_id=data_id, nuclear_and_gas_data=report
+            )
+
+            now_utc = datetime.now(UTC)
+            if now_utc.astimezone(timezone(timedelta(hours=1))).dst():
+                ger_timezone = timedelta(hours=2)
+            else:
+                ger_timezone = timedelta(hours=1)
+
+            formatted_german_time2 = (now_utc + ger_timezone).strftime("%Y-%m-%d %H:%M:%S")
+
+            logger.debug("Adding review end time in the database.")
+            review_dataset.review_end_time = formatted_german_time2
+
+            logger.debug("Adding review completed to the database.")
+            review_dataset.review_completed = True
+
+            logger.debug("Adding the Report-ID to the database.")
+            review_dataset.report_id = data.qa_report_id
+
+            update_entity(review_dataset)
+
+            logger.info("Report posted successfully for dataset with ID: %s", data_id)
+            return data
+        logger.info("Dataset with the Data-ID already exist in the database.")
+    except Exception as e:
+        logger.exception(msg="An error occured: ", exc_info=e)
+    return None
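
The two timestamp blocks above derive German local time by adding a manual offset of one or two hours to UTC depending on whether DST is active. For illustration only (not part of this commit), the same conversion with the standard-library zoneinfo module, assuming Python 3.9+ for zoneinfo and 3.11+ for datetime.UTC as the existing imports suggest:

    from datetime import UTC, datetime
    from zoneinfo import ZoneInfo  # standard library since Python 3.9

    # Illustration only, not part of commit 7aa5013: the tz database handles the
    # +1h (CET) / +2h (CEST) switch that the code above derives via .dst().
    german_time = datetime.now(UTC).astimezone(ZoneInfo("Europe/Berlin"))
    formatted_german_time = german_time.strftime("%Y-%m-%d %H:%M:%S")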
1 change: 0 additions & 1 deletion src/dataland_qa_lab/review/generate_gpt_request.py
@@ -11,7 +11,6 @@
 class GenerateGptRequest:
     """Generates the actual GPT request."""
 
-
     @staticmethod
     def generate_gpt_request(mainprompt: str, subprompt: str) -> list:
         """Generates the actual GPT request.