Skip to content

Commit

Permalink
include minio in testing
Browse files Browse the repository at this point in the history
  • Loading branch information
leej3 committed Jan 30, 2025
1 parent d6a383d commit d897bfc
Show file tree
Hide file tree
Showing 3 changed files with 180 additions and 82 deletions.
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,18 @@ You can run the test suite (assuming you have activated the virtual environment
pytest
```

### Object Storage for development

For local development and testing, MinIO is included in the compose stack. It runs a local server that exposes an S3-compatible API with a preconfigured bucket named `dsst-pdfs`.

To use this from the command line:

```
AWS_ACCESS_KEY_ID=minioadmin AWS_SECRET_ACCESS_KEY=minioadmin aws s3api list-objects --endpoint-url http://localhost:9000 --bucket dsst-pdfs
```

A web interface is also available at `http://localhost:9001`.

### Database Setup

Prior to running any services, the user must create the database specified in the .mockenv/.env file. The database is named `pdx` by default. To set up the database for this project, follow these steps:
Expand Down
37 changes: 37 additions & 0 deletions compose.override.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,43 @@ services:
timeout: 5s # Added missing timeout value
retries: 5

minio:
image: minio/minio:latest
container_name: dsst_minio
ports:
- "9000:9000" # API
- "9001:9001" # Console
environment:
MINIO_ROOT_USER: minioadmin
MINIO_ROOT_PASSWORD: minioadmin
MINIO_ADDRESS: ":9000"
MINIO_CONSOLE_ADDRESS: ":9001"
command: server /data --console-address ":9001"
volumes:
- minio_data:/data
healthcheck:
test: ["CMD", "mc", "ready", "local"]
interval: 5s
timeout: 5s
retries: 5

createbuckets:
image: minio/mc
depends_on:
minio:
condition: service_healthy
volumes:
- ./tests/pdf-test:/test-pdfs
entrypoint: >
/bin/sh -c "
/usr/bin/mc config host add myminio http://minio:9000 minioadmin minioadmin;
/usr/bin/mc mb myminio/dsst-pdfs;
/usr/bin/mc anonymous set public myminio/dsst-pdfs;
/usr/bin/mc cp /test-pdfs/test1.pdf myminio/dsst-pdfs/;
/usr/bin/mc cp /test-pdfs/test2.pdf myminio/dsst-pdfs/;
exit 0;
"
volumes:
dsst_postgres_data:
minio_data:
213 changes: 131 additions & 82 deletions tests/test_upload_pdfs_title_is_pmid.py
Original file line number Diff line number Diff line change
@@ -1,91 +1,140 @@
import unittest
from unittest.mock import MagicMock, patch
from dsst_etl.models import Documents, Identifier, OddpubMetrics, Works
import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from dsst_etl.models import Documents, Identifier, OddpubMetrics, Works, Base
from dsst_etl.upload_pdfs_title_is_pmid import DocumentInventoryPMID
import boto3
from botocore.config import Config


@pytest.fixture(params=['mock', 'minio'])
def s3_client(request):
if request.param == 'mock':
# Create a mock S3 client
mock_client = MagicMock()

# Set up default mock behaviors
mock_paginator = MagicMock()
mock_paginator.paginate.return_value = [{'Contents': [{'Key': '12345678.pdf'}]}]
mock_client.get_paginator.return_value = mock_paginator

mock_body = MagicMock()
mock_body.read.return_value = b'test content'
mock_client.get_object.return_value = {'Body': mock_body}

return mock_client
else:
# Return real MinIO client
return boto3.client(
's3',
endpoint_url='http://localhost:9000',
aws_access_key_id='minioadmin',
aws_secret_access_key='minioadmin',
region_name='us-east-1',
config=Config(signature_version='s3v4')
)

from tests.base_test import BaseTest # type: ignore


class TestDocumentInventoryPMID(BaseTest):

@patch('dsst_etl.upload_pdfs_title_is_pmid.boto3.client')
@patch('dsst_etl.upload_pdfs_title_is_pmid.get_bucket_name')
def setUp(self, mock_get_bucket_name, mock_boto_client):
super().setUp()

# Mock S3 client and bucket name
self.mock_s3_client = MagicMock()
mock_boto_client.return_value = self.mock_s3_client
mock_get_bucket_name.return_value = 'mock-bucket'

# Initialize the class with the mocked session
self.uploader = DocumentInventoryPMID(self.session, oddpub_host_api='http://mock-api:8071')

@patch('requests.post')
def test_process_s3_inventory_success(self, mock_post):
# Mock the POST request
def mock_post_side_effect(url, *args, **kwargs):
if url == 'http://mock-api:8071/oddpub':
mock_response = MagicMock()
mock_response.json.return_value = {
'article': 'test1.txt',
'is_open_data': False,
'open_data_category': '',
'is_reuse': False,
'is_open_code': False,
'is_open_data_das': False,
'is_open_code_cas': False,
'das': None,
'open_data_statements': '',
'cas': None,
'open_code_statements': ''
}
mock_response.status_code = 200
return mock_response
else:
raise ValueError(f"Unexpected URL: {url}")

mock_post.side_effect = mock_post_side_effect

# Mock the S3 paginator and page iterator
mock_page_iterator = [{'Contents': [{'Key': '12345678.pdf'}]}]
self.uploader._get_s3_pdf_iterator = MagicMock(return_value=mock_page_iterator)

# Mock S3 get_object response to return bytes
self.mock_s3_client.get_object.return_value = {
'Body': MagicMock(read=MagicMock(return_value=b'test content'))
}

# Run the method
self.uploader.run()

# Assertions
self.assertEqual(self.session.query(Documents).count(), 1, "Documents table should have 1 row")
self.assertEqual(self.session.query(Identifier).count(), 1, "Identifier table should have 1 row")
self.assertEqual(self.session.query(Works).count(), 1, "Works table should have 1 row")
self.assertEqual(self.session.query(OddpubMetrics).count(), 1, "OddpubMetrics table should have 1 row")

# Check that the POST request was made to the correct URL
mock_post.assert_called_with(
'http://mock-api:8071/oddpub',
files={'file': b'test content'}
@pytest.fixture
def db_session():
# Create an in-memory SQLite database for testing
engine = create_engine('sqlite:///:memory:')
Base.metadata.create_all(engine)
session = Session(engine)
yield session
session.close()


@pytest.fixture
def uploader(db_session, s3_client):
with patch('dsst_etl.upload_pdfs_title_is_pmid.get_bucket_name') as mock_get_bucket_name:
mock_get_bucket_name.return_value = 'dsst-pdfs'

# Initialize the class with the session
uploader = DocumentInventoryPMID(db_session, oddpub_host_api='http://mock-api:8071')
# Override the S3 client
uploader.s3_client = s3_client

# For MinIO, ensure test file exists
if not isinstance(s3_client, MagicMock):
try:
s3_client.head_bucket(Bucket='dsst-pdfs')
except:
s3_client.create_bucket(Bucket='dsst-pdfs')
s3_client.put_object(
Bucket='dsst-pdfs',
Key='12345678.pdf',
Body=b'test content'
)

return uploader


@patch('requests.post')
def test_process_s3_inventory_success(mock_post, uploader, db_session):
# Mock the POST request
def mock_post_side_effect(url, *args, **kwargs):
if url == 'http://mock-api:8071/oddpub':
mock_response = MagicMock()
mock_response.json.return_value = {
'article': 'test1.txt',
'is_open_data': False,
'open_data_category': '',
'is_reuse': False,
'is_open_code': False,
'is_open_data_das': False,
'is_open_code_cas': False,
'das': None,
'open_data_statements': '',
'cas': None,
'open_code_statements': ''
}
mock_response.status_code = 200
return mock_response
else:
raise ValueError(f"Unexpected URL: {url}")

mock_post.side_effect = mock_post_side_effect

# Run the method
uploader.run()

# Assertions
assert db_session.query(Documents).count() == 1, "Documents table should have 1 row"
assert db_session.query(Identifier).count() == 1, "Identifier table should have 1 row"
assert db_session.query(Works).count() == 1, "Works table should have 1 row"
assert db_session.query(OddpubMetrics).count() == 1, "OddpubMetrics table should have 1 row"

# Check that the POST request was made to the correct URL
mock_post.assert_called_with(
'http://mock-api:8071/oddpub',
files={'file': b'test content'}
)


def test_process_s3_inventory_failure(uploader, db_session, s3_client):
if isinstance(s3_client, MagicMock):
# For mock client, force an exception
s3_client.get_paginator.side_effect = Exception("Test exception")
else:
# For MinIO client, use an invalid bucket to force failure
uploader.s3_client = boto3.client(
's3',
endpoint_url='http://localhost:9000',
aws_access_key_id='invalid',
aws_secret_access_key='invalid',
region_name='us-east-1',
config=Config(signature_version='s3v4')
)

@patch('dsst_etl.upload_pdfs_title_is_pmid.logger')
def test_process_s3_inventory_failure(self, mock_logger):
# Force an exception in the process
self.uploader._get_s3_pdf_iterator = MagicMock(side_effect=Exception("Test exception"))

with patch('dsst_etl.upload_pdfs_title_is_pmid.logger') as mock_logger:
# Run the method
self.uploader.run()
uploader.run()

# Assertions
mock_logger.error.assert_called_with("Error processing S3 inventory: Test exception")
self.assertEqual(self.session.query(Documents).count(), 0)
self.assertEqual(self.session.query(Identifier).count(), 0)
self.assertEqual(self.session.query(Works).count(), 0)
self.assertEqual(self.session.query(OddpubMetrics).count(), 0)


if __name__ == '__main__':
unittest.main()
mock_logger.error.assert_called() # Just verify error was logged
assert db_session.query(Documents).count() == 0
assert db_session.query(Identifier).count() == 0
assert db_session.query(Works).count() == 0
assert db_session.query(OddpubMetrics).count() == 0

0 comments on commit d897bfc

Please sign in to comment.