diff --git a/agbenchmark/RegressionManager.py b/agbenchmark/RegressionManager.py index a1379ecaee8..e289a478780 100644 --- a/agbenchmark/RegressionManager.py +++ b/agbenchmark/RegressionManager.py @@ -11,9 +11,18 @@ def __init__(self, filename: str): def load(self) -> None: try: with open(self.filename, "r") as f: - self.tests = json.load(f) - except (FileNotFoundError, json.decoder.JSONDecodeError): + file_content = ( + f.read().strip() + ) # read the content and remove any leading/trailing whitespace + if file_content: # if file is not empty, load the json + self.tests = json.loads(file_content) + else: # if file is empty, assign an empty dictionary + self.tests = {} + except FileNotFoundError: self.tests = {} + except json.decoder.JSONDecodeError: # If JSON is invalid + self.tests = {} + self.save() def save(self) -> None: with open(self.filename, "w") as f: diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index ddf69f42d3e..cf7ce104c57 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -1,10 +1,8 @@ import glob -import inspect import os import subprocess -import types -from abc import ABC, ABCMeta -from typing import Any, Dict, List, Tuple, Type, cast +from abc import ABC +from typing import Any, Dict, List from dotenv import load_dotenv @@ -16,24 +14,12 @@ MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False -class ChallengeMeta(ABCMeta): - def __init__(self, name: str, bases: Tuple[Type, ...], dct: Dict[str, Any]) -> None: - super().__init__(name, bases, dct) - try: - frame = cast(types.FrameType, inspect.currentframe()) - assert frame.f_back is not None - self.CHALLENGE_LOCATION = os.path.dirname(inspect.getfile(frame.f_back)) - except Exception as e: - print(f"Unable to get the file from 8 frames back due to: {str(e)}") - raise e - - -class Challenge(ABC, metaclass=ChallengeMeta): +class Challenge(ABC): """The parent class to all specific challenges classes. 
Defines helper methods for running a challenge""" _data_cache: Dict[str, ChallengeData] = {} - CHALLENGE_LOCATION: str + CHALLENGE_LOCATION: str = "" @property def data(self) -> ChallengeData: @@ -54,10 +40,10 @@ def setup_challenge(self, config: Dict[str, Any]) -> None: from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent copy_artifacts_into_workspace( - config["workspace"], "artifacts_in", self.__class__.CHALLENGE_LOCATION + config["workspace"], "artifacts_in", self.CHALLENGE_LOCATION ) - run_agent(self.task, config, self.__class__.CHALLENGE_LOCATION) + run_agent(self.task, config, self.CHALLENGE_LOCATION) def test_method(self, config: Dict[str, Any]) -> None: raise NotImplementedError diff --git a/agbenchmark/challenges/code/d1/data.json b/agbenchmark/challenges/code/d1/data.json index 6ac284b81f1..0c724600050 100644 --- a/agbenchmark/challenges/code/d1/data.json +++ b/agbenchmark/challenges/code/d1/data.json @@ -1,4 +1,5 @@ { + "name": "TestDebugSimpleTypoWithGuidance", "category": ["code"], "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", "dependencies": ["TestReadFile", "TestWriteFile"], diff --git a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py deleted file mode 100644 index d104b337450..00000000000 --- a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestDebugSimpleTypoWithGuidance(Challenge): - """The first memory challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/code/d2/d2_test.py b/agbenchmark/challenges/code/d2/d2_test.py deleted file mode 100644 index 
b02114a753d..00000000000 --- a/agbenchmark/challenges/code/d2/d2_test.py +++ /dev/null @@ -1,14 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestDebugSimpleTypoWithoutGuidance(Challenge): - """The first memory challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - - assert 1 in scores diff --git a/agbenchmark/challenges/code/d2/data.json b/agbenchmark/challenges/code/d2/data.json index 3de5111f568..2923010949b 100644 --- a/agbenchmark/challenges/code/d2/data.json +++ b/agbenchmark/challenges/code/d2/data.json @@ -1,4 +1,5 @@ { + "name": "TestDebugSimpleTypoWithoutGuidance", "category": ["code"], "task": "Make test.py run without errors.", "dependencies": ["TestDebugSimpleTypoWithGuidance"], diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 308cb5ea6f1..94cba5b724c 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -19,6 +19,7 @@ class Ground(BaseModel): class ChallengeData(BaseModel): + name: str category: List[str] task: str dependencies: List[str] diff --git a/agbenchmark/challenges/interface/read_file/data.json b/agbenchmark/challenges/interface/read_file/data.json index dd399fabf5e..c827581b6bf 100644 --- a/agbenchmark/challenges/interface/read_file/data.json +++ b/agbenchmark/challenges/interface/read_file/data.json @@ -1,5 +1,5 @@ { - "name": "ReadFile", + "name": "TestReadFile", "category": ["interface"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", "dependencies": ["TestWriteFile"], diff --git a/agbenchmark/challenges/interface/read_file/read_file_test.py b/agbenchmark/challenges/interface/read_file/read_file_test.py deleted file mode 100644 index 591d0a744a7..00000000000 --- a/agbenchmark/challenges/interface/read_file/read_file_test.py +++ 
/dev/null @@ -1,12 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestReadFile(Challenge): - """Testing if LLM can read a file""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/interface/write_file/data.json b/agbenchmark/challenges/interface/write_file/data.json index b3e4b6f0270..2be2d0dfe8d 100644 --- a/agbenchmark/challenges/interface/write_file/data.json +++ b/agbenchmark/challenges/interface/write_file/data.json @@ -1,5 +1,5 @@ { - "name": "WriteFile", + "name": "TestWriteFile", "category": ["interface"], "task": "Print the the capital of America to a .txt file", "dependencies": [], diff --git a/agbenchmark/challenges/interface/write_file/write_file_test.py b/agbenchmark/challenges/interface/write_file/write_file_test.py deleted file mode 100644 index 4a52b097965..00000000000 --- a/agbenchmark/challenges/interface/write_file/write_file_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestWriteFile(Challenge): - """Testing if LLM can write to a file""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/memory/m1/data.json b/agbenchmark/challenges/memory/m1/data.json index f771a2669b4..506b246ad02 100644 --- a/agbenchmark/challenges/memory/m1/data.json +++ b/agbenchmark/challenges/memory/m1/data.json @@ -1,4 +1,5 @@ { + "name": "TestBasicMemory", "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestReadFile", "TestWriteFile"], diff --git a/agbenchmark/challenges/memory/m1/m1_test.py b/agbenchmark/challenges/memory/m1/m1_test.py deleted file mode 100644 index 0fc537eeb43..00000000000 --- 
a/agbenchmark/challenges/memory/m1/m1_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestBasicMemory(Challenge): - """The first memory challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/memory/m2/data.json b/agbenchmark/challenges/memory/m2/data.json index 998e894b16c..7ef2552d1cc 100644 --- a/agbenchmark/challenges/memory/m2/data.json +++ b/agbenchmark/challenges/memory/m2/data.json @@ -1,4 +1,5 @@ { + "name": "TestRememberMultipleIds", "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestBasicMemory"], diff --git a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py deleted file mode 100644 index c88f288311a..00000000000 --- a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestRememberMultipleIds(Challenge): - """The first memory challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/memory/m3/data.json b/agbenchmark/challenges/memory/m3/data.json index d5d95b1de26..720cce93c0c 100644 --- a/agbenchmark/challenges/memory/m3/data.json +++ b/agbenchmark/challenges/memory/m3/data.json @@ -1,4 +1,5 @@ { + "name": "TestRememberMultipleIdsWithNoise", "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestRememberMultipleIds"], diff --git a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py 
deleted file mode 100644 index 0e35dd2f47b..00000000000 --- a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestRememberMultipleIdsWithNoise(Challenge): - """The first memory challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/memory/m4/data.json b/agbenchmark/challenges/memory/m4/data.json index 49831537e5a..61965206bdb 100644 --- a/agbenchmark/challenges/memory/m4/data.json +++ b/agbenchmark/challenges/memory/m4/data.json @@ -1,4 +1,5 @@ { + "name": "TestRememberMultiplePhrasesWithNoise", "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestRememberMultipleIdsWithNoise"], diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py deleted file mode 100644 index 4c4bdce55e6..00000000000 --- a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestRememberMultiplePhrasesWithNoise(Challenge): - """The first memory challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/retrieval/r1/data.json b/agbenchmark/challenges/retrieval/r1/data.json index 6e1344b8bdc..7812c21dae7 100644 --- a/agbenchmark/challenges/retrieval/r1/data.json +++ b/agbenchmark/challenges/retrieval/r1/data.json @@ -1,4 +1,5 @@ { + "name": "TestBasicRetrieval", "category": ["retrieval"], "task": "Write the price of the book in this url 
'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", "dependencies": ["TestWriteFile"], diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py deleted file mode 100644 index 9845a7b2a04..00000000000 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestRetrieval(Challenge): - """The first information-retrieval challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/retrieval/r2/data.json b/agbenchmark/challenges/retrieval/r2/data.json index 05846b9f37d..5bc2e96b4a5 100644 --- a/agbenchmark/challenges/retrieval/r2/data.json +++ b/agbenchmark/challenges/retrieval/r2/data.json @@ -1,7 +1,8 @@ { + "name": "TestRetrieval2", "category": ["retrieval"], "task": "Write tesla's revenue in 2022 into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", - "dependencies": ["TestRetrieval"], + "dependencies": ["TestBasicRetrieval"], "ground": { "answer": "81,462", "should_contain": ["81,462"], diff --git a/agbenchmark/challenges/retrieval/r2/r2_test.py b/agbenchmark/challenges/retrieval/r2/r2_test.py deleted file mode 100644 index f0f13ffbf42..00000000000 --- a/agbenchmark/challenges/retrieval/r2/r2_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestRetrieval2(Challenge): - """The first information-retrieval challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/retrieval/r3/data.json b/agbenchmark/challenges/retrieval/r3/data.json index 763c963ec4a..b918d3d4e81 100644 --- a/agbenchmark/challenges/retrieval/r3/data.json +++ b/agbenchmark/challenges/retrieval/r3/data.json @@ -1,4 +1,5 @@ { + "name": "TestRetrieval3", "category": ["retrieval"], "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", "dependencies": ["TestRetrieval2"], diff --git a/agbenchmark/challenges/retrieval/r3/r3_test.py b/agbenchmark/challenges/retrieval/r3/r3_test.py deleted file mode 100644 index 5887c0b43a5..00000000000 --- a/agbenchmark/challenges/retrieval/r3/r3_test.py +++ /dev/null @@ -1,14 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestRetrieval3(Challenge): - """The first information-retrieval challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - - assert 1 in scores diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py new file mode 100644 index 00000000000..4f9e5b7f828 --- /dev/null +++ b/agbenchmark/challenges/test_all.py @@ -0,0 +1,78 @@ +import glob +import importlib +import json +import os +import types +from pathlib import Path +from typing import Any, Dict + +import pytest +from dotenv import load_dotenv + +from agbenchmark.challenge import Challenge + +load_dotenv() + +IMPROVE = os.getenv("IMPROVE", "False") + + +json_files = glob.glob("agbenchmark/challenges/**/data.json", recursive=True) + + +def get_test_path(json_file: str) -> str: + abs_location = os.path.dirname(os.path.abspath(json_file)) + + path = Path(abs_location) + + # Find the index of "agbenchmark" in the path parts + try: + agbenchmark_index = path.parts.index("agbenchmark") + except ValueError: + raise ValueError("Invalid challenge location.") + + # Create the path from "agbenchmark" onwards + challenge_location = Path(*path.parts[agbenchmark_index:]) + + return str(challenge_location) + + +def generate_tests() -> None: + print("Generating tests...") + # Dynamic class creation + for json_file in json_files: + with open(json_file, "r") as f: + data = json.load(f) + + class_name = data.get("name", "") + + challenge_location = 
get_test_path(json_file) + + # Define test class dynamically + challenge_class = types.new_class(class_name, (Challenge,)) + + setattr(challenge_class, "CHALLENGE_LOCATION", challenge_location) + + # Define test method within the dynamically created class + def test_method(self, config: Dict[str, Any]) -> None: # type: ignore + self.setup_challenge(config) + + scores = self.get_scores(config) + assert 1 in scores + + # Parametrize the method here + test_method = pytest.mark.parametrize( + "challenge_data", + [data], + indirect=True, + )(test_method) + + setattr(challenge_class, "test_method", test_method) + + # Attach the new class to a module so it can be discovered by pytest + module = importlib.import_module(__name__) + setattr(module, class_name, challenge_class) + + print(f"Generated test for {class_name}.") + + +generate_tests() diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 7d3dd8ed310..e321f5a26c8 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -88,13 +88,16 @@ def check_regression(request: Any) -> None: test_name = request.node.parent.name data = get_regression_data() + # Get the true location of the test + challenge_location = getattr(request.node.parent.cls, "CHALLENGE_LOCATION", "") + + skip_string = f"Skipping {test_name} at {challenge_location}" + # Check if the test name exists in the regression tests if request.config.getoption("--improve") and data.get(test_name, None): - pytest.skip("Skipping test because it's a regression test and --improve is set") + pytest.skip(f"{skip_string} because it's a regression test") elif request.config.getoption("--maintain") and not data.get(test_name, None): - pytest.skip( - "Skipping test because it's not a regression test and --maintain is set" - ) + pytest.skip(f"{skip_string} because it's not a regression test") # this is to get the challenge_data from every test @@ -109,15 +112,19 @@ def challenge_data(request: Any) -> None: def pytest_runtest_makereport(item: Any, 
call: Any) -> None: if call.when == "call": challenge_data = item.funcargs.get("challenge_data", None) - difficulty = challenge_data.info.difficulty if challenge_data else "unknown" - dependencies = challenge_data.dependencies if challenge_data else [] - parts = item.nodeid.split("::")[0].split("/") - agbenchmark_index = parts.index("agbenchmark") - file_path = "/".join(parts[agbenchmark_index:]) + difficulty = ( + challenge_data["info"]["difficulty"] if challenge_data else "unknown" + ) + dependencies = ( + challenge_data["dependencies"] if challenge_data else [] + ) + # Extract the challenge_location from the class + challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "") + test_details = { "difficulty": difficulty, "dependencies": dependencies, - "test": file_path, + "test": challenge_location, } print("pytest_runtest_makereport", test_details) @@ -132,19 +139,6 @@ def pytest_sessionfinish() -> None: regression_manager.save() -# this is so that all tests can inherit from the Challenge class -def pytest_generate_tests(metafunc: Any) -> None: - if "challenge_data" in metafunc.fixturenames: - # Get the instance of the test class - test_class = metafunc.cls() - - # Generate the parameters - params = test_class.data - - # Add the parameters to the test function - metafunc.parametrize("challenge_data", [params], indirect=True) - - # this is adding the dependency marker and category markers automatically from the json def pytest_collection_modifyitems(items: Any, config: Any) -> None: data = get_regression_data() diff --git a/regression_tests.json b/regression_tests.json index 44334801e5d..6132079179d 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -1,59 +1,64 @@ { - "TestBasicMemory": { + "TestWriteFile": { "difficulty": "basic", "dependencies": [], - "test": "agbenchmark/challenges/memory/m1/m1_test.py" + "test": "agbenchmark\\challenges\\interface\\write_file" }, - "TestRememberMultipleIds": {
"difficulty": "basic", "dependencies": [ - "TestBasicMemory" + "TestWriteFile" ], - "test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py" + "test": "agbenchmark\\challenges\\interface\\read_file" }, - "TestRememberMultipleIdsWithNoise": { - "difficulty": "medium", + "TestBasicMemory": { + "difficulty": "basic", "dependencies": [ - "TestRememberMultipleIds" + "TestReadFile", + "TestWriteFile" ], - "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py" + "test": "agbenchmark\\challenges\\memory\\m1" }, - "TestRememberMultiplePhrasesWithNoise": { - "difficulty": "medium", + "TestBasicRetrieval": { + "difficulty": "basic", "dependencies": [ - "TestRememberMultipleIdsWithNoise" + "TestWriteFile" ], - "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py" + "test": "agbenchmark\\challenges\\retrieval\\r1" }, - "TestRetrieval": { + "TestRememberMultipleIds": { "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/retrieval/r1/r1_test.py" + "dependencies": [ + "TestBasicMemory" + ], + "test": "agbenchmark\\challenges\\memory\\m2" }, "TestRetrieval2": { "difficulty": "basic", "dependencies": [ - "TestRetrieval" + "TestBasicRetrieval" + ], + "test": "agbenchmark\\challenges\\retrieval\\r2" + }, + "TestRememberMultipleIdsWithNoise": { + "difficulty": "medium", + "dependencies": [ + "TestRememberMultipleIds" ], - "test": "agbenchmark/challenges/retrieval/r2/r2_test.py" + "test": "agbenchmark\\challenges\\memory\\m3" }, "TestRetrieval3": { "difficulty": "basic", "dependencies": [ "TestRetrieval2" ], - "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" + "test": "agbenchmark\\challenges\\retrieval\\r3" }, - "TestWriteFile": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/interface/write_file/write_file_test.py" - }, - "TestReadFile": { - "difficulty": "basic", + "TestRememberMultiplePhrasesWithNoise": { + "difficulty": "medium", 
"dependencies": [ - "TestWriteFile" + "TestRememberMultipleIdsWithNoise" ], - "test": "agbenchmark/challenges/interface/read_file/read_file_test.py" + "test": "agbenchmark\\challenges\\memory\\m4" } } \ No newline at end of file