-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
6e5a5c8
commit e4c5058
Showing
1 changed file
with
195 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,195 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"[2025-01-28 12:41:34] INFO [dataland_qa_lab.dataland.unreviewed_datasets.__init__:17] Initializing the unreviewed Datasets with the data from Dataland.\n", | ||
"[2025-01-28 12:41:34] INFO [dataland_qa_lab.dataland.scheduled_processor.run_scheduled_processing:19] Processing unreviewed datasets with the list of Data ids: ['00a96594-ef13-40bc-bfb7-5a7b71e57ad8', 'd1fcbf68-99d9-48c8-8734-8d9937e37abd', '677ec4dc-993e-4fe7-865a-b5a357eb693b', 'ddb9dadd-9d3f-42bb-bc7b-5395136ae4a3', '9a3ec569-3a29-4450-8ba3-95bc352f5db3', 'c8b4fc6b-631f-4876-bd86-e326c50c64ac', '75287a14-099e-40e0-a8cc-eed8dd4e3a0a']\n", | ||
"[2025-01-28 12:41:34] INFO [dataland_qa_lab.review.dataset_reviewer.review_dataset:16] Starting the review of the Dataset: 75287a14-099e-40e0-a8cc-eed8dd4e3a0a\n", | ||
"[2025-01-28 12:41:35] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:19] Dataset retrieved: 75287a14-099e-40e0-a8cc-eed8dd4e3a0a\n", | ||
"[2025-01-28 12:41:35] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:22] Data collection created.\n", | ||
"[2025-01-28 12:41:35] ERROR [dataland_qa_lab.dataland.scheduled_processor.run_scheduled_processing:32] Error processing dataset 75287a14-099e-40e0-a8cc-eed8dd4e3a0a\n", | ||
"Traceback (most recent call last):\n", | ||
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\dataland\\scheduled_processor.py\", line 28, in run_scheduled_processing\n", | ||
" review_dataset(data_id)\n", | ||
" ~~~~~~~~~~~~~~^^^^^^^^^\n", | ||
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\review\\dataset_reviewer.py\", line 24, in review_dataset\n", | ||
" relevant_pages_pdf_reader = pages_provider.get_relevant_pages_of_pdf(data_collection)\n", | ||
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\pages\\pages_provider.py\", line 25, in get_relevant_pages_of_pdf\n", | ||
" ).datapoint.data_source.file_reference\n", | ||
" ^^^^^^^^^^^^^^\n", | ||
"AttributeError: 'NoneType' object has no attribute 'file_reference'\n", | ||
"[2025-01-28 12:41:35] INFO [dataland_qa_lab.review.dataset_reviewer.review_dataset:16] Starting the review of the Dataset: c8b4fc6b-631f-4876-bd86-e326c50c64ac\n", | ||
"[2025-01-28 12:41:35] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:19] Dataset retrieved: c8b4fc6b-631f-4876-bd86-e326c50c64ac\n", | ||
"[2025-01-28 12:41:35] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:22] Data collection created.\n", | ||
"[2025-01-28 12:41:35] ERROR [dataland_qa_lab.dataland.scheduled_processor.run_scheduled_processing:32] Error processing dataset c8b4fc6b-631f-4876-bd86-e326c50c64ac\n", | ||
"Traceback (most recent call last):\n", | ||
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\dataland\\scheduled_processor.py\", line 28, in run_scheduled_processing\n", | ||
" review_dataset(data_id)\n", | ||
" ~~~~~~~~~~~~~~^^^^^^^^^\n", | ||
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\review\\dataset_reviewer.py\", line 24, in review_dataset\n", | ||
" relevant_pages_pdf_reader = pages_provider.get_relevant_pages_of_pdf(data_collection)\n", | ||
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\pages\\pages_provider.py\", line 25, in get_relevant_pages_of_pdf\n", | ||
" ).datapoint.data_source.file_reference\n", | ||
" ^^^^^^^^^^^^^^\n", | ||
"AttributeError: 'NoneType' object has no attribute 'file_reference'\n", | ||
"[2025-01-28 12:41:35] INFO [dataland_qa_lab.review.dataset_reviewer.review_dataset:16] Starting the review of the Dataset: 9a3ec569-3a29-4450-8ba3-95bc352f5db3\n", | ||
"[2025-01-28 12:41:35] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:19] Dataset retrieved: 9a3ec569-3a29-4450-8ba3-95bc352f5db3\n", | ||
"[2025-01-28 12:41:35] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:22] Data collection created.\n", | ||
"[2025-01-28 12:41:35] ERROR [dataland_qa_lab.dataland.scheduled_processor.run_scheduled_processing:32] Error processing dataset 9a3ec569-3a29-4450-8ba3-95bc352f5db3\n", | ||
"Traceback (most recent call last):\n", | ||
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\dataland\\scheduled_processor.py\", line 28, in run_scheduled_processing\n", | ||
" review_dataset(data_id)\n", | ||
" ~~~~~~~~~~~~~~^^^^^^^^^\n", | ||
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\review\\dataset_reviewer.py\", line 24, in review_dataset\n", | ||
" relevant_pages_pdf_reader = pages_provider.get_relevant_pages_of_pdf(data_collection)\n", | ||
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\pages\\pages_provider.py\", line 25, in get_relevant_pages_of_pdf\n", | ||
" ).datapoint.data_source.file_reference\n", | ||
" ^^^^^^^^^^^^^^\n", | ||
"AttributeError: 'NoneType' object has no attribute 'file_reference'\n", | ||
"[2025-01-28 12:41:35] INFO [dataland_qa_lab.review.dataset_reviewer.review_dataset:16] Starting the review of the Dataset: ddb9dadd-9d3f-42bb-bc7b-5395136ae4a3\n", | ||
"[2025-01-28 12:41:35] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:19] Dataset retrieved: ddb9dadd-9d3f-42bb-bc7b-5395136ae4a3\n", | ||
"[2025-01-28 12:41:35] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:22] Data collection created.\n", | ||
"[2025-01-28 12:41:35] ERROR [dataland_qa_lab.dataland.scheduled_processor.run_scheduled_processing:32] Error processing dataset ddb9dadd-9d3f-42bb-bc7b-5395136ae4a3\n", | ||
"Traceback (most recent call last):\n", | ||
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\dataland\\scheduled_processor.py\", line 28, in run_scheduled_processing\n", | ||
" review_dataset(data_id)\n", | ||
" ~~~~~~~~~~~~~~^^^^^^^^^\n", | ||
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\review\\dataset_reviewer.py\", line 24, in review_dataset\n", | ||
" relevant_pages_pdf_reader = pages_provider.get_relevant_pages_of_pdf(data_collection)\n", | ||
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\pages\\pages_provider.py\", line 25, in get_relevant_pages_of_pdf\n", | ||
" ).datapoint.data_source.file_reference\n", | ||
" ^^^^^^^^^^^^^^\n", | ||
"AttributeError: 'NoneType' object has no attribute 'file_reference'\n", | ||
"[2025-01-28 12:41:35] INFO [dataland_qa_lab.review.dataset_reviewer.review_dataset:16] Starting the review of the Dataset: 677ec4dc-993e-4fe7-865a-b5a357eb693b\n", | ||
"[2025-01-28 12:41:36] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:19] Dataset retrieved: 677ec4dc-993e-4fe7-865a-b5a357eb693b\n", | ||
"[2025-01-28 12:41:36] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:22] Data collection created.\n", | ||
"[2025-01-28 12:41:36] ERROR [dataland_qa_lab.dataland.scheduled_processor.run_scheduled_processing:32] Error processing dataset 677ec4dc-993e-4fe7-865a-b5a357eb693b\n", | ||
"Traceback (most recent call last):\n", | ||
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\dataland\\scheduled_processor.py\", line 28, in run_scheduled_processing\n", | ||
" review_dataset(data_id)\n", | ||
" ~~~~~~~~~~~~~~^^^^^^^^^\n", | ||
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\review\\dataset_reviewer.py\", line 24, in review_dataset\n", | ||
" relevant_pages_pdf_reader = pages_provider.get_relevant_pages_of_pdf(data_collection)\n", | ||
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\pages\\pages_provider.py\", line 25, in get_relevant_pages_of_pdf\n", | ||
" ).datapoint.data_source.file_reference\n", | ||
" ^^^^^^^^^^^^^^\n", | ||
"AttributeError: 'NoneType' object has no attribute 'file_reference'\n", | ||
"[2025-01-28 12:41:36] INFO [dataland_qa_lab.review.dataset_reviewer.review_dataset:16] Starting the review of the Dataset: d1fcbf68-99d9-48c8-8734-8d9937e37abd\n", | ||
"[2025-01-28 12:41:36] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:19] Dataset retrieved: d1fcbf68-99d9-48c8-8734-8d9937e37abd\n", | ||
"[2025-01-28 12:41:36] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:22] Data collection created.\n", | ||
"[2025-01-28 12:41:36] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:25] Relevant pages extracted.\n", | ||
"[2025-01-28 12:41:37] ERROR [dataland_qa_lab.dataland.scheduled_processor.run_scheduled_processing:32] Error processing dataset d1fcbf68-99d9-48c8-8734-8d9937e37abd\n", | ||
"Traceback (most recent call last):\n", | ||
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\dataland\\scheduled_processor.py\", line 28, in run_scheduled_processing\n", | ||
" review_dataset(data_id)\n", | ||
" ~~~~~~~~~~~~~~^^^^^^^^^\n", | ||
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\review\\dataset_reviewer.py\", line 27, in review_dataset\n", | ||
" readable_text = text_to_doc_intelligence.extract_text_of_pdf(relevant_pages_pdf_reader)\n", | ||
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\pages\\text_to_doc_intelligence.py\", line 21, in extract_text_of_pdf\n", | ||
" poller = document_intelligence_client.begin_analyze_document(\n", | ||
" \"prebuilt-layout\",\n", | ||
" ...<2 lines>...\n", | ||
" output_content_format=DocumentContentFormat.MARKDOWN,\n", | ||
" )\n", | ||
" File \"c:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\.venv\\Lib\\site-packages\\azure\\core\\tracing\\decorator.py\", line 105, in wrapper_use_tracer\n", | ||
" return func(*args, **kwargs)\n", | ||
" File \"c:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\.venv\\Lib\\site-packages\\azure\\ai\\documentintelligence\\_operations\\_patch.py\", line 596, in begin_analyze_document\n", | ||
" raw_result = self._analyze_document_initial(\n", | ||
" model_id=model_id,\n", | ||
" ...<12 lines>...\n", | ||
" **kwargs,\n", | ||
" )\n", | ||
" File \"c:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\.venv\\Lib\\site-packages\\azure\\ai\\documentintelligence\\_operations\\_operations.py\", line 819, in _analyze_document_initial\n", | ||
" raise HttpResponseError(response=response, model=error)\n", | ||
"azure.core.exceptions.HttpResponseError: (InvalidArgument) Invalid argument.\n", | ||
"Code: InvalidArgument\n", | ||
"Message: Invalid argument.\n", | ||
"Inner error: {\n", | ||
" \"code\": \"InvalidParameter\",\n", | ||
" \"message\": \"The parameter pages is invalid: The page range exceeds the number of pages in the document.\"\n", | ||
"}\n", | ||
"[2025-01-28 12:41:37] INFO [dataland_qa_lab.review.dataset_reviewer.review_dataset:16] Starting the review of the Dataset: 00a96594-ef13-40bc-bfb7-5a7b71e57ad8\n", | ||
"[2025-01-28 12:41:37] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:19] Dataset retrieved: 00a96594-ef13-40bc-bfb7-5a7b71e57ad8\n", | ||
"[2025-01-28 12:41:37] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:22] Data collection created.\n", | ||
"[2025-01-28 12:41:37] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:25] Relevant pages extracted.\n", | ||
"[2025-01-28 12:41:38] ERROR [dataland_qa_lab.dataland.scheduled_processor.run_scheduled_processing:32] Error processing dataset 00a96594-ef13-40bc-bfb7-5a7b71e57ad8\n", | ||
"Traceback (most recent call last):\n", | ||
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\dataland\\scheduled_processor.py\", line 28, in run_scheduled_processing\n", | ||
" review_dataset(data_id)\n", | ||
" ~~~~~~~~~~~~~~^^^^^^^^^\n", | ||
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\review\\dataset_reviewer.py\", line 27, in review_dataset\n", | ||
" readable_text = text_to_doc_intelligence.extract_text_of_pdf(relevant_pages_pdf_reader)\n", | ||
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\pages\\text_to_doc_intelligence.py\", line 21, in extract_text_of_pdf\n", | ||
" poller = document_intelligence_client.begin_analyze_document(\n", | ||
" \"prebuilt-layout\",\n", | ||
" ...<2 lines>...\n", | ||
" output_content_format=DocumentContentFormat.MARKDOWN,\n", | ||
" )\n", | ||
" File \"c:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\.venv\\Lib\\site-packages\\azure\\core\\tracing\\decorator.py\", line 105, in wrapper_use_tracer\n", | ||
" return func(*args, **kwargs)\n", | ||
" File \"c:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\.venv\\Lib\\site-packages\\azure\\ai\\documentintelligence\\_operations\\_patch.py\", line 596, in begin_analyze_document\n", | ||
" raw_result = self._analyze_document_initial(\n", | ||
" model_id=model_id,\n", | ||
" ...<12 lines>...\n", | ||
" **kwargs,\n", | ||
" )\n", | ||
" File \"c:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\.venv\\Lib\\site-packages\\azure\\ai\\documentintelligence\\_operations\\_operations.py\", line 819, in _analyze_document_initial\n", | ||
" raise HttpResponseError(response=response, model=error)\n", | ||
"azure.core.exceptions.HttpResponseError: (InvalidArgument) Invalid argument.\n", | ||
"Code: InvalidArgument\n", | ||
"Message: Invalid argument.\n", | ||
"Inner error: {\n", | ||
" \"code\": \"InvalidParameter\",\n", | ||
" \"message\": \"The parameter pages is invalid: The page range exceeds the number of pages in the document.\"\n", | ||
"}\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from dataland_qa_lab.database.database_engine import create_tables, delete_entity\n", | ||
"from dataland_qa_lab.database.database_tables import ReviewedDataset\n", | ||
"from dataland_qa_lab.review.dataset_reviewer import review_dataset\n", | ||
"import logging\n", | ||
"from dataland_qa_lab.dataland.scheduled_processor import run_scheduled_processing\n", | ||
"\n", | ||
"# delete_entity(\"b8cdc985-a81d-4c55-b49a-f61a145e9ea5\", ReviewedDataset)\n", | ||
"# delete_entity(\"ef015dcd-674c-4ddb-b3c7-0a1bc25f0ce0\", ReviewedDataset)\n", | ||
"# delete_entity(\"7b7c7ea2-7d74-4161-afc8-4aa6bcde66c7\", ReviewedDataset)\n", | ||
"run_scheduled_processing(1)\n", | ||
"# print(datetime.now().strftime(\"%H:%M:%S\"))\n", | ||
"\n", | ||
"# logger = logging.getLogger(__name__)\n", | ||
"\n", | ||
"# logger.info(\"Hier beginnt das logging.\")" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": ".venv", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.13.0" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |