Skip to content

Commit

Permalink
Test notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
chrisBer67 committed Jan 29, 2025
1 parent 6e5a5c8 commit e4c5058
Showing 1 changed file with 195 additions and 0 deletions.
195 changes: 195 additions & 0 deletions notebooks/test.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2025-01-28 12:41:34] INFO [dataland_qa_lab.dataland.unreviewed_datasets.__init__:17] Initializing the unreviewed Datasets with the data from Dataland.\n",
"[2025-01-28 12:41:34] INFO [dataland_qa_lab.dataland.scheduled_processor.run_scheduled_processing:19] Processing unreviewed datasets with the list of Data ids: ['00a96594-ef13-40bc-bfb7-5a7b71e57ad8', 'd1fcbf68-99d9-48c8-8734-8d9937e37abd', '677ec4dc-993e-4fe7-865a-b5a357eb693b', 'ddb9dadd-9d3f-42bb-bc7b-5395136ae4a3', '9a3ec569-3a29-4450-8ba3-95bc352f5db3', 'c8b4fc6b-631f-4876-bd86-e326c50c64ac', '75287a14-099e-40e0-a8cc-eed8dd4e3a0a']\n",
"[2025-01-28 12:41:34] INFO [dataland_qa_lab.review.dataset_reviewer.review_dataset:16] Starting the review of the Dataset: 75287a14-099e-40e0-a8cc-eed8dd4e3a0a\n",
"[2025-01-28 12:41:35] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:19] Dataset retrieved: 75287a14-099e-40e0-a8cc-eed8dd4e3a0a\n",
"[2025-01-28 12:41:35] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:22] Data collection created.\n",
"[2025-01-28 12:41:35] ERROR [dataland_qa_lab.dataland.scheduled_processor.run_scheduled_processing:32] Error processing dataset 75287a14-099e-40e0-a8cc-eed8dd4e3a0a\n",
"Traceback (most recent call last):\n",
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\dataland\\scheduled_processor.py\", line 28, in run_scheduled_processing\n",
" review_dataset(data_id)\n",
" ~~~~~~~~~~~~~~^^^^^^^^^\n",
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\review\\dataset_reviewer.py\", line 24, in review_dataset\n",
" relevant_pages_pdf_reader = pages_provider.get_relevant_pages_of_pdf(data_collection)\n",
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\pages\\pages_provider.py\", line 25, in get_relevant_pages_of_pdf\n",
" ).datapoint.data_source.file_reference\n",
" ^^^^^^^^^^^^^^\n",
"AttributeError: 'NoneType' object has no attribute 'file_reference'\n",
"[2025-01-28 12:41:35] INFO [dataland_qa_lab.review.dataset_reviewer.review_dataset:16] Starting the review of the Dataset: c8b4fc6b-631f-4876-bd86-e326c50c64ac\n",
"[2025-01-28 12:41:35] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:19] Dataset retrieved: c8b4fc6b-631f-4876-bd86-e326c50c64ac\n",
"[2025-01-28 12:41:35] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:22] Data collection created.\n",
"[2025-01-28 12:41:35] ERROR [dataland_qa_lab.dataland.scheduled_processor.run_scheduled_processing:32] Error processing dataset c8b4fc6b-631f-4876-bd86-e326c50c64ac\n",
"Traceback (most recent call last):\n",
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\dataland\\scheduled_processor.py\", line 28, in run_scheduled_processing\n",
" review_dataset(data_id)\n",
" ~~~~~~~~~~~~~~^^^^^^^^^\n",
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\review\\dataset_reviewer.py\", line 24, in review_dataset\n",
" relevant_pages_pdf_reader = pages_provider.get_relevant_pages_of_pdf(data_collection)\n",
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\pages\\pages_provider.py\", line 25, in get_relevant_pages_of_pdf\n",
" ).datapoint.data_source.file_reference\n",
" ^^^^^^^^^^^^^^\n",
"AttributeError: 'NoneType' object has no attribute 'file_reference'\n",
"[2025-01-28 12:41:35] INFO [dataland_qa_lab.review.dataset_reviewer.review_dataset:16] Starting the review of the Dataset: 9a3ec569-3a29-4450-8ba3-95bc352f5db3\n",
"[2025-01-28 12:41:35] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:19] Dataset retrieved: 9a3ec569-3a29-4450-8ba3-95bc352f5db3\n",
"[2025-01-28 12:41:35] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:22] Data collection created.\n",
"[2025-01-28 12:41:35] ERROR [dataland_qa_lab.dataland.scheduled_processor.run_scheduled_processing:32] Error processing dataset 9a3ec569-3a29-4450-8ba3-95bc352f5db3\n",
"Traceback (most recent call last):\n",
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\dataland\\scheduled_processor.py\", line 28, in run_scheduled_processing\n",
" review_dataset(data_id)\n",
" ~~~~~~~~~~~~~~^^^^^^^^^\n",
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\review\\dataset_reviewer.py\", line 24, in review_dataset\n",
" relevant_pages_pdf_reader = pages_provider.get_relevant_pages_of_pdf(data_collection)\n",
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\pages\\pages_provider.py\", line 25, in get_relevant_pages_of_pdf\n",
" ).datapoint.data_source.file_reference\n",
" ^^^^^^^^^^^^^^\n",
"AttributeError: 'NoneType' object has no attribute 'file_reference'\n",
"[2025-01-28 12:41:35] INFO [dataland_qa_lab.review.dataset_reviewer.review_dataset:16] Starting the review of the Dataset: ddb9dadd-9d3f-42bb-bc7b-5395136ae4a3\n",
"[2025-01-28 12:41:35] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:19] Dataset retrieved: ddb9dadd-9d3f-42bb-bc7b-5395136ae4a3\n",
"[2025-01-28 12:41:35] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:22] Data collection created.\n",
"[2025-01-28 12:41:35] ERROR [dataland_qa_lab.dataland.scheduled_processor.run_scheduled_processing:32] Error processing dataset ddb9dadd-9d3f-42bb-bc7b-5395136ae4a3\n",
"Traceback (most recent call last):\n",
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\dataland\\scheduled_processor.py\", line 28, in run_scheduled_processing\n",
" review_dataset(data_id)\n",
" ~~~~~~~~~~~~~~^^^^^^^^^\n",
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\review\\dataset_reviewer.py\", line 24, in review_dataset\n",
" relevant_pages_pdf_reader = pages_provider.get_relevant_pages_of_pdf(data_collection)\n",
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\pages\\pages_provider.py\", line 25, in get_relevant_pages_of_pdf\n",
" ).datapoint.data_source.file_reference\n",
" ^^^^^^^^^^^^^^\n",
"AttributeError: 'NoneType' object has no attribute 'file_reference'\n",
"[2025-01-28 12:41:35] INFO [dataland_qa_lab.review.dataset_reviewer.review_dataset:16] Starting the review of the Dataset: 677ec4dc-993e-4fe7-865a-b5a357eb693b\n",
"[2025-01-28 12:41:36] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:19] Dataset retrieved: 677ec4dc-993e-4fe7-865a-b5a357eb693b\n",
"[2025-01-28 12:41:36] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:22] Data collection created.\n",
"[2025-01-28 12:41:36] ERROR [dataland_qa_lab.dataland.scheduled_processor.run_scheduled_processing:32] Error processing dataset 677ec4dc-993e-4fe7-865a-b5a357eb693b\n",
"Traceback (most recent call last):\n",
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\dataland\\scheduled_processor.py\", line 28, in run_scheduled_processing\n",
" review_dataset(data_id)\n",
" ~~~~~~~~~~~~~~^^^^^^^^^\n",
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\review\\dataset_reviewer.py\", line 24, in review_dataset\n",
" relevant_pages_pdf_reader = pages_provider.get_relevant_pages_of_pdf(data_collection)\n",
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\pages\\pages_provider.py\", line 25, in get_relevant_pages_of_pdf\n",
" ).datapoint.data_source.file_reference\n",
" ^^^^^^^^^^^^^^\n",
"AttributeError: 'NoneType' object has no attribute 'file_reference'\n",
"[2025-01-28 12:41:36] INFO [dataland_qa_lab.review.dataset_reviewer.review_dataset:16] Starting the review of the Dataset: d1fcbf68-99d9-48c8-8734-8d9937e37abd\n",
"[2025-01-28 12:41:36] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:19] Dataset retrieved: d1fcbf68-99d9-48c8-8734-8d9937e37abd\n",
"[2025-01-28 12:41:36] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:22] Data collection created.\n",
"[2025-01-28 12:41:36] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:25] Relevant pages extracted.\n",
"[2025-01-28 12:41:37] ERROR [dataland_qa_lab.dataland.scheduled_processor.run_scheduled_processing:32] Error processing dataset d1fcbf68-99d9-48c8-8734-8d9937e37abd\n",
"Traceback (most recent call last):\n",
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\dataland\\scheduled_processor.py\", line 28, in run_scheduled_processing\n",
" review_dataset(data_id)\n",
" ~~~~~~~~~~~~~~^^^^^^^^^\n",
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\review\\dataset_reviewer.py\", line 27, in review_dataset\n",
" readable_text = text_to_doc_intelligence.extract_text_of_pdf(relevant_pages_pdf_reader)\n",
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\pages\\text_to_doc_intelligence.py\", line 21, in extract_text_of_pdf\n",
" poller = document_intelligence_client.begin_analyze_document(\n",
" \"prebuilt-layout\",\n",
" ...<2 lines>...\n",
" output_content_format=DocumentContentFormat.MARKDOWN,\n",
" )\n",
" File \"c:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\.venv\\Lib\\site-packages\\azure\\core\\tracing\\decorator.py\", line 105, in wrapper_use_tracer\n",
" return func(*args, **kwargs)\n",
" File \"c:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\.venv\\Lib\\site-packages\\azure\\ai\\documentintelligence\\_operations\\_patch.py\", line 596, in begin_analyze_document\n",
" raw_result = self._analyze_document_initial(\n",
" model_id=model_id,\n",
" ...<12 lines>...\n",
" **kwargs,\n",
" )\n",
" File \"c:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\.venv\\Lib\\site-packages\\azure\\ai\\documentintelligence\\_operations\\_operations.py\", line 819, in _analyze_document_initial\n",
" raise HttpResponseError(response=response, model=error)\n",
"azure.core.exceptions.HttpResponseError: (InvalidArgument) Invalid argument.\n",
"Code: InvalidArgument\n",
"Message: Invalid argument.\n",
"Inner error: {\n",
" \"code\": \"InvalidParameter\",\n",
" \"message\": \"The parameter pages is invalid: The page range exceeds the number of pages in the document.\"\n",
"}\n",
"[2025-01-28 12:41:37] INFO [dataland_qa_lab.review.dataset_reviewer.review_dataset:16] Starting the review of the Dataset: 00a96594-ef13-40bc-bfb7-5a7b71e57ad8\n",
"[2025-01-28 12:41:37] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:19] Dataset retrieved: 00a96594-ef13-40bc-bfb7-5a7b71e57ad8\n",
"[2025-01-28 12:41:37] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:22] Data collection created.\n",
"[2025-01-28 12:41:37] DEBUG [dataland_qa_lab.review.dataset_reviewer.review_dataset:25] Relevant pages extracted.\n",
"[2025-01-28 12:41:38] ERROR [dataland_qa_lab.dataland.scheduled_processor.run_scheduled_processing:32] Error processing dataset 00a96594-ef13-40bc-bfb7-5a7b71e57ad8\n",
"Traceback (most recent call last):\n",
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\dataland\\scheduled_processor.py\", line 28, in run_scheduled_processing\n",
" review_dataset(data_id)\n",
" ~~~~~~~~~~~~~~^^^^^^^^^\n",
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\review\\dataset_reviewer.py\", line 27, in review_dataset\n",
" readable_text = text_to_doc_intelligence.extract_text_of_pdf(relevant_pages_pdf_reader)\n",
" File \"C:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\src\\dataland_qa_lab\\pages\\text_to_doc_intelligence.py\", line 21, in extract_text_of_pdf\n",
" poller = document_intelligence_client.begin_analyze_document(\n",
" \"prebuilt-layout\",\n",
" ...<2 lines>...\n",
" output_content_format=DocumentContentFormat.MARKDOWN,\n",
" )\n",
" File \"c:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\.venv\\Lib\\site-packages\\azure\\core\\tracing\\decorator.py\", line 105, in wrapper_use_tracer\n",
" return func(*args, **kwargs)\n",
" File \"c:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\.venv\\Lib\\site-packages\\azure\\ai\\documentintelligence\\_operations\\_patch.py\", line 596, in begin_analyze_document\n",
" raw_result = self._analyze_document_initial(\n",
" model_id=model_id,\n",
" ...<12 lines>...\n",
" **kwargs,\n",
" )\n",
" File \"c:\\Users\\chrbe\\Desktop\\Studium\\dfine Projekt\\DatalandQALab\\.venv\\Lib\\site-packages\\azure\\ai\\documentintelligence\\_operations\\_operations.py\", line 819, in _analyze_document_initial\n",
" raise HttpResponseError(response=response, model=error)\n",
"azure.core.exceptions.HttpResponseError: (InvalidArgument) Invalid argument.\n",
"Code: InvalidArgument\n",
"Message: Invalid argument.\n",
"Inner error: {\n",
" \"code\": \"InvalidParameter\",\n",
" \"message\": \"The parameter pages is invalid: The page range exceeds the number of pages in the document.\"\n",
"}\n"
]
}
],
"source": [
"# Manual smoke test: run the scheduled review of Dataland's unreviewed datasets once\n",
"# and inspect the log output produced below the cell.\n",
"import logging\n",
"\n",
"from dataland_qa_lab.database.database_engine import create_tables, delete_entity\n",
"from dataland_qa_lab.database.database_tables import ReviewedDataset\n",
"from dataland_qa_lab.dataland.scheduled_processor import run_scheduled_processing\n",
"from dataland_qa_lab.review.dataset_reviewer import review_dataset\n",
"\n",
"# Optional cleanup, disabled by default.\n",
"# NOTE(review): presumably removes stored ReviewedDataset entries so these ids are\n",
"# reviewed again - confirm against database_engine.delete_entity before enabling.\n",
"# delete_entity(\"b8cdc985-a81d-4c55-b49a-f61a145e9ea5\", ReviewedDataset)\n",
"# delete_entity(\"ef015dcd-674c-4ddb-b3c7-0a1bc25f0ce0\", ReviewedDataset)\n",
"# delete_entity(\"7b7c7ea2-7d74-4161-afc8-4aa6bcde66c7\", ReviewedDataset)\n",
"\n",
"# Failures of individual datasets are logged with their traceback and the run continues\n",
"# with the next dataset (visible in the recorded output of this cell).\n",
"# NOTE(review): the meaning of the argument 1 is not visible from this notebook\n",
"# (possibly the number of iterations) - confirm in scheduled_processor.\n",
"run_scheduled_processing(1)\n",
"\n",
"# Disabled debugging helpers (the first one would additionally need `from datetime import datetime`).\n",
"# print(datetime.now().strftime(\"%H:%M:%S\"))\n",
"# logger = logging.getLogger(__name__)\n",
"# logger.info(\"Logging starts here.\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit e4c5058

Please sign in to comment.