Skip to content

Commit

Permalink
Add error handling
Browse files Browse the repository at this point in the history
  • Loading branch information
quang-ng committed Nov 19, 2024
1 parent ea2e1ae commit a28107a
Showing 1 changed file with 11 additions and 8 deletions.
19 changes: 11 additions & 8 deletions dsst_etl/upload_pdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import List, Optional, Tuple
import boto3
from botocore.exceptions import ClientError
import sqlalchemy
from dsst_etl._utils import get_compute_context_id, get_bucket_name
from dsst_etl.models import Documents, Provenance, Works
from dsst_etl.db import get_db_session
Expand Down Expand Up @@ -100,7 +101,7 @@ def create_document_records(
self.db_session.add(document)
self.db_session.commit()
documents.append(document)
except psycopg2.errors.UniqueViolation:
except (psycopg2.errors.UniqueViolation, sqlalchemy.exc.IntegrityError) as e:
self.db_session.rollback()
logger.warning(f"Document with hash {hash_data} already exists. Skipping.")

Expand Down Expand Up @@ -187,15 +188,13 @@ def upload_directory(
Args:
pdf_directory_path (str): Path to directory containing PDFs
bucket_name (str): S3 bucket name
comment (Optional[str]): Comment for provenance record
"""
# Get list of PDF files
pdf_files = [
os.path.join(pdf_directory_path, f)
for f in os.listdir(pdf_directory_path)
if f.lower().endswith('.pdf')
]
# Convert string path to Path object
pdf_directory = Path(pdf_directory_path)

# Get list of PDF files using glob
pdf_files = [str(pdf_file) for pdf_file in pdf_directory.glob("*.pdf")]

if not pdf_files:
logger.warning(f"No PDF files found in {pdf_directory_path}")
Expand All @@ -214,6 +213,10 @@ def upload_directory(
# Create document records
documents = uploader.create_document_records(successful_uploads)

if not documents:
logger.warning("No documents created")
return

# Create provenance record
provenance = uploader.create_provenance_record(documents, comment)

Expand Down

0 comments on commit a28107a

Please sign in to comment.