Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add AWS Bedrock models native support as LLM judge #1426

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions deepeval/key_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ class KeyValues(Enum):
LOCAL_EMBEDDING_BASE_URL = "LOCAL_EMBEDDING_BASE_URL"
LOCAL_EMBEDDING_API_KEY = "LOCAL_EMBEDDING_API_KEY"
USE_LOCAL_EMBEDDINGS = "USE_LOCAL_EMBEDDINGS"
# AWS Cloud support
AWS_ACCESS_KEY_ID = "AWS_ACCESS_KEY_ID"
AWS_SECRET_ACCESS_KEY = "AWS_SECRET_ACCESS_KEY"
AWS_SESSION_TOKEN = "AWS_SESSION_TOKEN"
AWS_REGION = "AWS_REGION"


class KeyFileHandler:
Expand Down
2 changes: 1 addition & 1 deletion deepeval/metrics/answer_relevancy/template.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,4 +121,4 @@ def generate_reason(
{input}

JSON:
"""
"""
1 change: 1 addition & 0 deletions deepeval/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from deepeval.models.gpt_model import GPTModel, MultimodalGPTModel
from deepeval.models.gpt_model_schematic import SchematicGPTModel
from deepeval.models.openai_embedding_model import OpenAIEmbeddingModel
from deepeval.models.bedrock_model import BedrockModel, MultimodalBedrockModel

# TODO: uncomment out once fixed
# from deepeval.models.summac_model import SummaCModels
Expand Down
382 changes: 382 additions & 0 deletions deepeval/models/bedrock_model.py

Large diffs are not rendered by default.

92 changes: 92 additions & 0 deletions docs/aws_setup.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# AWS Bedrock Setup for DeepEval

To use AWS Bedrock models (like LLaMA, Claude, etc.) with DeepEval, you'll need to set up your AWS credentials. Here's how:

## Option 1: Environment Variables

Set the following environment variables:

```bash
export AWS_ACCESS_KEY_ID="your-access-key-id"
export AWS_SECRET_ACCESS_KEY="your-secret-access-key"
export AWS_SESSION_TOKEN="your-session-token" # Optional: If using temporary credentials
export AWS_REGION="your-region" # e.g., "us-west-2"
```

## Option 2: DeepEval Configuration

Use the DeepEval CLI to set your credentials:

```bash
deepeval config set AWS_ACCESS_KEY_ID "your-access-key-id"
deepeval config set AWS_SECRET_ACCESS_KEY "your-secret-access-key"
deepeval config set AWS_SESSION_TOKEN "your-session-token" # Optional
deepeval config set AWS_REGION "your-region"
```

## Option 3: Direct Initialization

You can also pass the credentials directly when initializing the models:

```python
from deepeval.models import BedrockModel

# Initialize Bedrock model with explicit credentials
model = BedrockModel(
model_id="your-model-id",
access_key_id="your-access-key-id",
secret_access_key="your-secret-access-key",
session_token="your-session-token", # Optional
region="your-region"
)
```

## Authentication

Make sure you have:

1. An AWS account with Bedrock service enabled
2. AWS credentials configured (either via environment variables, AWS credentials file, or direct initialization as shown above)

## Available Models

- claude-3-7-sonnet-20250219-v1:0
- claude-3-5-haiku-20241022-v1:0
- claude-3-5-sonnet-20241022-v2:0
- claude-3-5-sonnet-20240620-v1:0
- claude-3-opus-20240229-v1:0
- claude-3-sonnet-20240229-v1:0
- claude-3-haiku-20240307-v1:0

## Default Models

- Text-Only: claude-3-7-sonnet-20250219-v1:0
- Multimodal: claude-3-7-sonnet-20250219-v1:0

## Example Usage

```python
from deepeval.models import BedrockModel
from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric

# Initialize the model
model = BedrockModel(
model_id="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
access_key_id="your-access-key-id",
secret_access_key="your-secret-access-key",
region="us-west-2"
)

# Create a test case
test_case = LLMTestCase(
input="What is the capital of France?",
actual_output=model.generate("What is the capital of France?")
)

# Evaluate using DeepEval metrics
metric = AnswerRelevancyMetric(threshold=0.7)
metric.measure(test_case)
print(f"Score: {metric.score}")
```

2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@
"nest-asyncio",
"datasets",
"ollama",
"boto3",
"pillow"
],
extras_require={
"dev": ["black"],
Expand Down
Binary file added tests/data/test.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
215 changes: 215 additions & 0 deletions tests/test_bedrock_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
"""Tests for Amazon Bedrock model implementations
"""

import pytest
from unittest.mock import patch, MagicMock
import base64
from botocore.response import StreamingBody

from deepeval.models import BedrockModel, MultimodalBedrockModel
from deepeval.test_case import MLLMImage
from deepeval.key_handler import KeyValues, KEY_FILE_HANDLER

# Mock credentials for testing
# Region the mock_key_handler fixture reports as stored configuration.
TEST_REGION = "us-east-1"
# Raw Anthropic-style response body that the stubbed invoke_model returns.
TEST_RESPONSE_JSON = '{"content": [{"type": "text", "text": "This is a test response"}]}'
# Plain-text payload expected after the model extracts it from the JSON above.
TEST_RESPONSE = "This is a test response"
# Remote image used for non-local multimodal inputs.
TEST_IMAGE_URL = "https://www.shutterstock.com/image-photo/funny-large-longhair-gray-kitten-600nw-1842198919.jpg"
# Local JPEG fixture used for base64-encoded multimodal inputs.
TEST_LOCAL_IMAGE = "tests/data/test.jpg"

@pytest.fixture
def mock_boto3_client():
    """Patch ``boto3.client`` with a stub Bedrock runtime client.

    The stub's ``invoke_model`` yields a dict whose ``body`` behaves like a
    ``StreamingBody`` and reads back ``TEST_RESPONSE_JSON`` as UTF-8 bytes.
    """
    with patch('boto3.client') as patched:
        stub_client = MagicMock()
        stub_body = MagicMock(
            spec=StreamingBody,
            read=MagicMock(return_value=TEST_RESPONSE_JSON.encode("utf-8")),
        )
        stub_client.invoke_model.return_value = {"body": stub_body}
        patched.return_value = stub_client
        yield patched

@pytest.fixture
def mock_key_handler():
    """Patch ``KEY_FILE_HANDLER.fetch_data`` to serve only the AWS region.

    Any key other than ``KeyValues.AWS_REGION`` resolves to ``None``, which
    mimics an otherwise-empty key file.
    """
    stored_keys = {KeyValues.AWS_REGION: TEST_REGION}
    with patch('deepeval.key_handler.KEY_FILE_HANDLER.fetch_data') as patched:
        patched.side_effect = stored_keys.get
        yield patched

class TestBedrockModel:
    """Test suite for the text-only Amazon Bedrock model."""

    def test_initialization(self, mock_boto3_client, mock_key_handler):
        """Defaults: Claude 3.7 Sonnet and the region stored in the key handler."""
        model = BedrockModel()

        assert model.model_id == "us.anthropic.claude-3-7-sonnet-20250219-v1:0"
        assert model.region == TEST_REGION

        # Assert against TEST_REGION rather than a duplicated 'us-east-1'
        # literal so this stays in sync with the mock_key_handler fixture.
        mock_boto3_client.assert_called_once_with(
            'bedrock-runtime',
            region_name=TEST_REGION,
            aws_access_key_id=None,
            aws_secret_access_key=None,
            aws_session_token=None,
        )

    def test_initialization_with_custom_params(self, mock_boto3_client):
        """Explicit model_id and region override the defaults."""
        model = BedrockModel(
            model_id="us.anthropic.claude-3-5-haiku-20241022-v1:0",
            region="us-west-2"
        )

        assert model.model_id == "us.anthropic.claude-3-5-haiku-20241022-v1:0"
        assert model.region == "us-west-2"

    def test_invalid_model_name(self):
        """An unsupported model id raises ValueError at construction time."""
        with pytest.raises(ValueError, match="Invalid model"):
            BedrockModel(model_id="invalid-model")

    def test_generate(self, mock_boto3_client, mock_key_handler):
        """generate() returns the text extracted from the stubbed invoke_model body."""
        model = BedrockModel()
        response = model.generate("Test prompt")

        assert response == TEST_RESPONSE
        mock_boto3_client.return_value.invoke_model.assert_called_once()

    @pytest.mark.asyncio
    async def test_a_generate(self, mock_boto3_client, mock_key_handler):
        """a_generate() mirrors generate() on the async path."""
        model = BedrockModel()
        response = await model.a_generate("Test prompt")

        assert response == TEST_RESPONSE
        mock_boto3_client.return_value.invoke_model.assert_called_once()


class TestBedrockMultimodalModel:
    """Test suite for the Bedrock multimodal model (Anthropic Claude 3.7 Sonnet)."""

    def test_initialization(self, mock_boto3_client, mock_key_handler):
        """Defaults: Claude 3.7 Sonnet and the region stored in the key handler."""
        model = MultimodalBedrockModel()

        assert model.model_id == "us.anthropic.claude-3-7-sonnet-20250219-v1:0"
        assert model.region == TEST_REGION

        # Assert against TEST_REGION rather than a duplicated 'us-east-1'
        # literal so this stays in sync with the mock_key_handler fixture.
        mock_boto3_client.assert_called_once_with(
            'bedrock-runtime',
            region_name=TEST_REGION,
            aws_access_key_id=None,
            aws_secret_access_key=None,
            aws_session_token=None,
        )

    def test_initialization_with_custom_params(self, mock_boto3_client):
        """Explicit model_id and region override the defaults."""
        model = MultimodalBedrockModel(
            model_id="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
            region="us-west-2"
        )

        assert model.model_id == "us.anthropic.claude-3-7-sonnet-20250219-v1:0"
        assert model.region == "us-west-2"

    def test_invalid_model_name(self):
        """An unsupported model id raises ValueError at construction time."""
        with pytest.raises(ValueError, match="Invalid model"):
            MultimodalBedrockModel(model_id="invalid-model")

    def test_generate_prompt_local_image(self, mock_boto3_client, mock_key_handler):
        """A local image is inlined into the prompt as its exact base64 encoding."""
        model = MultimodalBedrockModel()

        # Encode the fixture ourselves so we can assert exact equality below
        # (the original test computed this but never used it).
        with open(TEST_LOCAL_IMAGE, "rb") as image_file:
            base64_image = base64.b64encode(image_file.read()).decode("utf-8")

        multimodal_input = [
            "What's in these images?",
            MLLMImage(url=TEST_LOCAL_IMAGE, local=True)
        ]

        prompt = model.generate_prompt(multimodal_input)

        assert isinstance(prompt, list)
        assert len(prompt) == 2

        assert isinstance(prompt[0], dict)
        assert prompt[0]['content'][0]["type"] == "text"
        assert prompt[0]['content'][0]["text"] == "What's in these images?"

        assert isinstance(prompt[1], dict)
        assert prompt[1]['content'][0]["type"] == "image"
        assert "source" in prompt[1]['content'][0]
        assert prompt[1]['content'][0]["source"]["type"] == "base64"
        assert prompt[1]['content'][0]["source"]["media_type"] == "image/jpeg"
        # Exact match against our own encoding is stronger than the previous
        # startswith("/") check (base64 of a JPEG begins with "/9j/").
        assert prompt[1]['content'][0]["source"]["data"] == base64_image

    def test_generate_prompt_remote_image(self, mock_boto3_client, mock_key_handler):
        """A remote image URL is fetched and embedded as base64 JPEG data."""
        model = MultimodalBedrockModel()

        multimodal_input = [
            "Describe this image:",
            MLLMImage(url=TEST_IMAGE_URL, local=False)
        ]

        prompt = model.generate_prompt(multimodal_input)

        assert isinstance(prompt, list)
        assert len(prompt) == 2

        assert prompt[0]['content'][0]["type"] == "text"
        assert prompt[0]['content'][0]["text"] == "Describe this image:"

        assert isinstance(prompt[1], dict)
        assert prompt[1]['content'][0]["type"] == "image"
        assert "source" in prompt[1]['content'][0]
        assert prompt[1]['content'][0]["source"]["type"] == "base64"
        assert prompt[1]['content'][0]["source"]["media_type"] == "image/jpeg"
        assert isinstance(prompt[1]['content'][0]["source"]["data"], str)
        # base64 of JPEG magic bytes (FF D8 FF) always starts with "/9j/".
        assert prompt[1]['content'][0]["source"]["data"].startswith("/")

    def test_generate(self, mock_boto3_client, mock_key_handler):
        """generate() with image+text returns the stubbed response text."""
        model = MultimodalBedrockModel()

        multimodal_input = [
            "Describe this image:",
            MLLMImage(url=TEST_LOCAL_IMAGE, local=True)
        ]

        response = model.generate(multimodal_input)

        assert response == TEST_RESPONSE
        mock_boto3_client.return_value.invoke_model.assert_called_once()

    @pytest.mark.asyncio
    async def test_a_generate(self, mock_boto3_client, mock_key_handler):
        """a_generate() mirrors generate() on the async path."""
        model = MultimodalBedrockModel()

        multimodal_input = [
            "Describe this image:",
            MLLMImage(url=TEST_IMAGE_URL, local=False)
        ]

        response = await model.a_generate(multimodal_input)

        assert response == TEST_RESPONSE
        mock_boto3_client.return_value.invoke_model.assert_called_once()

    def test_invalid_input_type(self, mock_boto3_client, mock_key_handler):
        """A raw dict (not an MLLMImage) in the input raises ValueError."""
        model = MultimodalBedrockModel()

        multimodal_input = [
            "Describe this image:",
            {"url": TEST_IMAGE_URL}
        ]

        with pytest.raises(ValueError, match="Invalid input type"):
            model.generate_prompt(multimodal_input)
Loading
Loading