Skip to content
This repository has been archived by the owner on Jun 9, 2024. It is now read-only.

quality of life improvements & fixes #75

Merged
merged 8 commits into from
Jul 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
agbenchmark/mocks/workspace/
agbenchmark/workspace/

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
5 changes: 2 additions & 3 deletions agbenchmark/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,7 @@ import os
class TestWriteFile(BasicChallenge):
"""Testing if LLM can write to a file"""

@pytest.mark.depends(on=[], name="basic_write_file")
def test_method(self, workspace):
def test_method(self, config):
# implement scoring logic by looking at workspace
```

Expand Down Expand Up @@ -82,7 +81,7 @@ Add the below to create a file in the workspace prior to running a challenge. On

## Workspace

If `--mock` flag is used it is at `agbenchmark/mocks/workspace`. Otherwise for mini-agi it is at `C:/Users/<name>/miniagi` - it will be automitcally set on config
If the `--mock` flag is used it is at `agbenchmark/workspace`. Otherwise for mini-agi it is at `C:/Users/<name>/miniagi` - it will be automatically set in the config

#### Dataset

Expand Down
21 changes: 4 additions & 17 deletions agbenchmark/agent_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,37 +3,27 @@
import subprocess
import sys
import time
from typing import Any, Dict, Optional
from typing import Any, Dict

from dotenv import load_dotenv

from agbenchmark.mocks.mock_manager import MockManager

load_dotenv()

MOCK_FLAG = os.getenv("MOCK_TEST")
mock_test_str = os.getenv("MOCK_TEST")
MOCK_FLAG = mock_test_str.lower() == "true" if mock_test_str else False


def run_agent(
task: str,
mock_func: Optional[str],
config: Dict[str, Any],
challenge_location: str,
) -> None:
"""Calling to get a response"""

if MOCK_FLAG == "True":
if MOCK_FLAG:
copy_artifacts_into_workspace(
config["workspace"], "artifacts_out", challenge_location
)
if mock_func is None:
print("No mock provided")
return
mock_manager = MockManager(
task, config
) # workspace doesn't need to be passed in, stays the same
print("Server unavailable, using mock", mock_func)
mock_manager.delegate(mock_func)
else:
timeout = config["cutoff"]
print(
Expand Down Expand Up @@ -99,6 +89,3 @@ def copy_artifacts_into_workspace(
full_file_name = os.path.join(source_dir, file_name)
if os.path.isfile(full_file_name):
shutil.copy(full_file_name, workspace)


ENVIRONMENT = os.getenv("ENVIRONMENT") or "production"
39 changes: 18 additions & 21 deletions agbenchmark/challenge.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,8 @@
import subprocess
import types
from abc import ABC, ABCMeta
from typing import Any, Dict, List, Optional, Tuple, Type, cast
from typing import Any, Dict, List, Tuple, Type, cast

import pytest
from dotenv import load_dotenv

from agbenchmark.challenges.define_task_types import ChallengeData, Ground
Expand All @@ -19,7 +18,6 @@

class ChallengeMeta(ABCMeta):
def __init__(self, name: str, bases: Tuple[Type, ...], dct: Dict[str, Any]) -> None:

super().__init__(name, bases, dct)
try:
frame = cast(types.FrameType, inspect.currentframe())
Expand All @@ -40,18 +38,13 @@ class Challenge(ABC, metaclass=ChallengeMeta):
@property
def data(self) -> ChallengeData:
file_path = f"{self.CHALLENGE_LOCATION}/data.json"
Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path)
if file_path not in Challenge._data_cache:
Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path)
return Challenge._data_cache[file_path]

@property
def mock(self) -> Optional[str]:
return self.data.mock.mock_func if self.data.mock else None

@property
def task(self) -> str:
return str(
self.data.mock.mock_task if self.data.mock and MOCK_TEST else self.data.task
)
return self.data.task

@property
def dependencies(self) -> list:
Expand All @@ -64,17 +57,8 @@ def setup_challenge(self, config: Dict[str, Any]) -> None:
config["workspace"], "artifacts_in", self.__class__.CHALLENGE_LOCATION
)

run_agent(self.task, self.mock, config, self.__class__.CHALLENGE_LOCATION)
run_agent(self.task, config, self.__class__.CHALLENGE_LOCATION)

@property
def name(self) -> str:
return self.data.name

@pytest.mark.parametrize(
"challenge_data",
[data],
indirect=True,
)
def test_method(self, config: Dict[str, Any]) -> None:
raise NotImplementedError

Expand Down Expand Up @@ -151,3 +135,16 @@ def scoring(self, content: str, ground: Ground) -> float:
)

return 1.0

def get_scores(self, config: Dict[str, Any]) -> List[float]:
    """Score each output artifact against this challenge's ground truth.

    Reads every matching file from the configured workspace, scores it,
    prints the score for visibility in test output, and returns the list
    of scores (one per artifact file).
    """
    contents = self.get_artifacts_out(config["workspace"], self.data.ground.files)

    results: List[float] = []
    for content in contents:
        result = self.scoring(content, self.data.ground)
        print("Your score is:", result)
        results.append(result)

    return results
7 changes: 1 addition & 6 deletions agbenchmark/challenges/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,16 @@ Example:

```python
{
"name": "basic_write_file",
"category": ["basic"],
    "task": "Print the capital of America to a .txt file",
"dependencies": [],
"dependencies": ["TestWriteFile"], # the class name of the test
"ground": {
"answer": "Washington",
"should_contain": ["Washington"],
"should_not_contain": ["New York", "Los Angeles", "San Francisco"],
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_write_file_mock",
"mock_task": "What is the capital of America?"
},
"info": {
"difficulty": "basic",
"description": "Tests the writing to file",
Expand Down
8 changes: 0 additions & 8 deletions agbenchmark/challenges/code/code.py

This file was deleted.

7 changes: 1 addition & 6 deletions agbenchmark/challenges/code/d1/data.json
Original file line number Diff line number Diff line change
@@ -1,19 +1,14 @@
{
"name": "debug_simple_typo_with_guidance",
"category": ["code"],
"task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
"dependencies": [],
"dependencies": ["TestReadFile", "TestWriteFile"],
"ground": {
"answer": "[0, 1] [2, 5] [0, 3]",
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
"should_not_contain": [],
"files": ["test.py"],
"type": "execute_python_code"
},
"mock": {
"mock_func": null,
"mock_task": null
},
"info": {
"difficulty": "basic",
"description": "Tests ability for the agent to debug python code with a simple typo in it.",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,25 +1,13 @@
from typing import Any, Dict

import pytest
from agbenchmark.challenge import Challenge

from agbenchmark.challenges.code.code import CodeChallenge


class TestDebugSimpleTypoWithGuidance(CodeChallenge):
class TestDebugSimpleTypoWithGuidance(Challenge):
"""The first memory challenge"""

@pytest.mark.depends(name="test_debug_simple_typo_with_guidance")
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)

files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)

scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)

scores = self.get_scores(config)
assert 1 in scores
20 changes: 3 additions & 17 deletions agbenchmark/challenges/code/d2/d2_test.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,14 @@
from typing import Any, Dict

import pytest
from agbenchmark.challenge import Challenge

from agbenchmark.challenges.code.code import CodeChallenge


class TestDebugSimpleTypoWithoutGuidance(CodeChallenge):
class TestDebugSimpleTypoWithoutGuidance(Challenge):
"""The first memory challenge"""

@pytest.mark.depends(
name="test_debug_simple_typo_without_guidance",
depends=["test_debug_simple_typo_with_guidance"],
)
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)

files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)

scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
scores = self.get_scores(config)

assert 1 in scores
9 changes: 2 additions & 7 deletions agbenchmark/challenges/code/d2/data.json
Original file line number Diff line number Diff line change
@@ -1,21 +1,16 @@
{
"name": "debug_simple_typo_without_guidance",
"category": ["code"],
"task": "Make test.py run without errors.",
"dependencies": [],
"dependencies": ["TestDebugSimpleTypoWithGuidance"],
"ground": {
"answer": "[0, 1] [2, 5] [0, 3]",
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
"should_not_contain": [],
"files": ["test.py"],
"type": "execute_python_code"
},
"mock": {
"mock_func": null,
"mock_task": null
},
"info": {
"difficulty": "basic",
"difficulty": "medium",
"description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance",
"side_effects": ["tests if there is in fact an LLM attached"]
}
Expand Down
7 changes: 0 additions & 7 deletions agbenchmark/challenges/define_task_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,6 @@
from pydantic import BaseModel


class Mock(BaseModel):
mock_func: Optional[str] = None
mock_task: Optional[str] = None


class Info(BaseModel):
difficulty: str
description: str
Expand All @@ -24,12 +19,10 @@ class Ground(BaseModel):


class ChallengeData(BaseModel):
name: str
category: List[str]
task: str
dependencies: List[str]
ground: Ground
mock: Optional[Mock] = None
info: Info

def serialize(self, path: str) -> None:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,14 @@
{
"name": "basic_read_file",
"category": ["basic"],
"name": "ReadFile",
"category": ["interface"],
"task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
"dependencies": ["basic_write_file"],
"dependencies": ["TestWriteFile"],
"ground": {
"answer": "random string Hello World!",
"should_contain": ["random string", "Hello World!"],
"files": ["file_to_check.txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_read_file_mock"
},
"info": {
"description": "This reads the file quickly",
"difficulty": "basic",
Expand Down
12 changes: 12 additions & 0 deletions agbenchmark/challenges/interface/read_file/read_file_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from typing import Any, Dict

from agbenchmark.challenge import Challenge


class TestReadFile(Challenge):
    """Interface challenge: verifies the agent can read a file."""

    def test_method(self, config: Dict[str, Any]) -> None:
        """Run the challenge, then require at least one perfect score."""
        self.setup_challenge(config)
        assert 1 in self.get_scores(config)
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "basic_write_file",
"category": ["basic"],
"name": "WriteFile",
"category": ["interface"],
  "task": "Print the capital of America to a .txt file",
"dependencies": [],
"ground": {
Expand All @@ -10,10 +10,6 @@
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_write_file_mock",
"mock_task": "What is the capital of America?"
},
"info": {
"difficulty": "basic",
"description": "Tests the writing to file",
Expand Down
13 changes: 13 additions & 0 deletions agbenchmark/challenges/interface/write_file/write_file_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from typing import Any, Dict

from agbenchmark.challenge import Challenge


class TestWriteFile(Challenge):
    """Interface challenge: verifies the agent can write to a file."""

    def test_method(self, config: Dict[str, Any]) -> None:
        """Run the challenge, then require at least one perfect score."""
        self.setup_challenge(config)
        assert 1 in self.get_scores(config)
3 changes: 1 addition & 2 deletions agbenchmark/challenges/memory/m1/data.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
{
"name": "basic_memory",
"category": ["memory"],
"task": "Follow the instructions in the instructions_1.txt file",
"dependencies": [],
"dependencies": ["TestReadFile", "TestWriteFile"],
"ground": {
"answer": "2314",
"should_contain": ["2314"],
Expand Down
Loading