From a28107a40689f750467f85175ef96a5a15acafd1 Mon Sep 17 00:00:00 2001 From: Quang Date: Tue, 19 Nov 2024 15:47:41 +0700 Subject: [PATCH] Add error handling --- dsst_etl/upload_pdfs.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/dsst_etl/upload_pdfs.py b/dsst_etl/upload_pdfs.py index 540365f..d73eacd 100644 --- a/dsst_etl/upload_pdfs.py +++ b/dsst_etl/upload_pdfs.py @@ -4,6 +4,7 @@ from typing import List, Optional, Tuple import boto3 from botocore.exceptions import ClientError +import sqlalchemy from dsst_etl._utils import get_compute_context_id, get_bucket_name from dsst_etl.models import Documents, Provenance, Works from dsst_etl.db import get_db_session @@ -100,7 +101,7 @@ def create_document_records( self.db_session.add(document) self.db_session.commit() documents.append(document) - except psycopg2.errors.UniqueViolation: + except (psycopg2.errors.UniqueViolation, sqlalchemy.exc.IntegrityError) as e: self.db_session.rollback() logger.warning(f"Document with hash {hash_data} already exists. Skipping.") @@ -187,15 +188,13 @@ def upload_directory( Args: pdf_directory_path (str): Path to directory containing PDFs - bucket_name (str): S3 bucket name comment (Optional[str]): Comment for provenance record """ - # Get list of PDF files - pdf_files = [ - os.path.join(pdf_directory_path, f) - for f in os.listdir(pdf_directory_path) - if f.lower().endswith('.pdf') - ] + # Convert string path to Path object + pdf_directory = Path(pdf_directory_path) + + # Get list of PDF files using glob + pdf_files = [str(pdf_file) for pdf_file in pdf_directory.glob("*.pdf")] if not pdf_files: logger.warning(f"No PDF files found in {pdf_directory_path}") @@ -214,6 +213,10 @@ def upload_directory( # Create document records documents = uploader.create_document_records(successful_uploads) + if not documents: + logger.warning("No documents created") + return + # Create provenance record provenance = uploader.create_provenance_record(documents, comment)