Skip to content
This repository has been archived by the owner on Jun 9, 2024. It is now read-only.

quality of life improvements & fixes #75

Merged
merged 8 commits into from
Jul 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
agbenchmark/mocks/workspace/
agbenchmark/workspace/

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
5 changes: 2 additions & 3 deletions agbenchmark/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,7 @@ import os
class TestWriteFile(BasicChallenge):
"""Testing if LLM can write to a file"""

@pytest.mark.depends(on=[], name="basic_write_file")
def test_method(self, workspace):
def test_method(self, config):
# implement scoring logic by looking at workspace
```

Expand Down Expand Up @@ -82,7 +81,7 @@ Add the below to create a file in the workspace prior to running a challenge. On

## Workspace

If `--mock` flag is used it is at `agbenchmark/mocks/workspace`. Otherwise for mini-agi it is at `C:/Users/<name>/miniagi` - it will be automitcally set on config
If the `--mock` flag is used it is at `agbenchmark/workspace`. Otherwise for mini-agi it is at `C:/Users/<name>/miniagi` - it will be automatically set in the config

#### Dataset

Expand Down
21 changes: 4 additions & 17 deletions agbenchmark/agent_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,37 +3,27 @@
import subprocess
import sys
import time
from typing import Any, Dict, Optional
from typing import Any, Dict

from dotenv import load_dotenv

from agbenchmark.mocks.mock_manager import MockManager

load_dotenv()

MOCK_FLAG = os.getenv("MOCK_TEST")
mock_test_str = os.getenv("MOCK_TEST")
MOCK_FLAG = mock_test_str.lower() == "true" if mock_test_str else False


def run_agent(
task: str,
mock_func: Optional[str],
config: Dict[str, Any],
challenge_location: str,
) -> None:
"""Calling to get a response"""

if MOCK_FLAG == "True":
if MOCK_FLAG:
copy_artifacts_into_workspace(
config["workspace"], "artifacts_out", challenge_location
)
if mock_func is None:
print("No mock provided")
return
mock_manager = MockManager(
task, config
) # workspace doesn't need to be passed in, stays the same
print("Server unavailable, using mock", mock_func)
mock_manager.delegate(mock_func)
else:
timeout = config["cutoff"]
print(
Expand Down Expand Up @@ -99,6 +89,3 @@ def copy_artifacts_into_workspace(
full_file_name = os.path.join(source_dir, file_name)
if os.path.isfile(full_file_name):
shutil.copy(full_file_name, workspace)


ENVIRONMENT = os.getenv("ENVIRONMENT") or "production"
39 changes: 18 additions & 21 deletions agbenchmark/challenge.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,8 @@
import subprocess
import types
from abc import ABC, ABCMeta
from typing import Any, Dict, List, Optional, Tuple, Type, cast
from typing import Any, Dict, List, Tuple, Type, cast

import pytest
from dotenv import load_dotenv

from agbenchmark.challenges.define_task_types import ChallengeData, Ground
Expand All @@ -19,7 +18,6 @@

class ChallengeMeta(ABCMeta):
def __init__(self, name: str, bases: Tuple[Type, ...], dct: Dict[str, Any]) -> None:

super().__init__(name, bases, dct)
try:
frame = cast(types.FrameType, inspect.currentframe())
Expand All @@ -40,18 +38,13 @@ class Challenge(ABC, metaclass=ChallengeMeta):
@property
def data(self) -> ChallengeData:
file_path = f"{self.CHALLENGE_LOCATION}/data.json"
Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path)
if file_path not in Challenge._data_cache:
Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path)
return Challenge._data_cache[file_path]

@property
def mock(self) -> Optional[str]:
return self.data.mock.mock_func if self.data.mock else None

@property
def task(self) -> str:
return str(
self.data.mock.mock_task if self.data.mock and MOCK_TEST else self.data.task
)
return self.data.task

@property
def dependencies(self) -> list:
Expand All @@ -64,17 +57,8 @@ def setup_challenge(self, config: Dict[str, Any]) -> None:
config["workspace"], "artifacts_in", self.__class__.CHALLENGE_LOCATION
)

run_agent(self.task, self.mock, config, self.__class__.CHALLENGE_LOCATION)
run_agent(self.task, config, self.__class__.CHALLENGE_LOCATION)

@property
def name(self) -> str:
return self.data.name

@pytest.mark.parametrize(
"challenge_data",
[data],
indirect=True,
)
def test_method(self, config: Dict[str, Any]) -> None:
raise NotImplementedError

Expand Down Expand Up @@ -151,3 +135,16 @@ def scoring(self, content: str, ground: Ground) -> float:
)

return 1.0

def get_scores(self, config: Dict[str, Any]) -> List[float]:
    """Score each output artifact against this challenge's ground truth.

    Reads every matching file from the configured workspace, scores it,
    prints the score for visibility in test output, and returns the list
    of scores (one per artifact file).
    """
    contents = self.get_artifacts_out(config["workspace"], self.data.ground.files)

    results: List[float] = []
    for content in contents:
        result = self.scoring(content, self.data.ground)
        print("Your score is:", result)
        results.append(result)

    return results
7 changes: 1 addition & 6 deletions agbenchmark/challenges/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,16 @@ Example:

```python
{
"name": "basic_write_file",
"category": ["basic"],
    "task": "Print the capital of America to a .txt file",
"dependencies": [],
"dependencies": ["TestWriteFile"], # the class name of the test
"ground": {
"answer": "Washington",
"should_contain": ["Washington"],
"should_not_contain": ["New York", "Los Angeles", "San Francisco"],
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_write_file_mock",
"mock_task": "What is the capital of America?"
},
"info": {
"difficulty": "basic",
"description": "Tests the writing to file",
Expand Down
8 changes: 0 additions & 8 deletions agbenchmark/challenges/code/code.py

This file was deleted.

7 changes: 1 addition & 6 deletions agbenchmark/challenges/code/d1/data.json
Original file line number Diff line number Diff line change
@@ -1,19 +1,14 @@
{
"name": "debug_simple_typo_with_guidance",
"category": ["code"],
"task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
"dependencies": [],
"dependencies": ["TestReadFile", "TestWriteFile"],
"ground": {
"answer": "[0, 1] [2, 5] [0, 3]",
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
"should_not_contain": [],
"files": ["test.py"],
"type": "execute_python_code"
},
"mock": {
"mock_func": null,
"mock_task": null
},
"info": {
"difficulty": "basic",
"description": "Tests ability for the agent to debug python code with a simple typo in it.",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,25 +1,13 @@
from typing import Any, Dict

import pytest
from agbenchmark.challenge import Challenge

from agbenchmark.challenges.code.code import CodeChallenge


class TestDebugSimpleTypoWithGuidance(CodeChallenge):
class TestDebugSimpleTypoWithGuidance(Challenge):
"""The first memory challenge"""

@pytest.mark.depends(name="test_debug_simple_typo_with_guidance")
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)

files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)

scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)

scores = self.get_scores(config)
assert 1 in scores
20 changes: 3 additions & 17 deletions agbenchmark/challenges/code/d2/d2_test.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,14 @@
from typing import Any, Dict

import pytest
from agbenchmark.challenge import Challenge

from agbenchmark.challenges.code.code import CodeChallenge


class TestDebugSimpleTypoWithoutGuidance(CodeChallenge):
class TestDebugSimpleTypoWithoutGuidance(Challenge):
"""The first memory challenge"""

@pytest.mark.depends(
name="test_debug_simple_typo_without_guidance",
depends=["test_debug_simple_typo_with_guidance"],
)
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)

files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)

scores = []
for file_content in files_contents:
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)
scores = self.get_scores(config)

assert 1 in scores
9 changes: 2 additions & 7 deletions agbenchmark/challenges/code/d2/data.json
Original file line number Diff line number Diff line change
@@ -1,21 +1,16 @@
{
"name": "debug_simple_typo_without_guidance",
"category": ["code"],
"task": "Make test.py run without errors.",
"dependencies": [],
"dependencies": ["TestDebugSimpleTypoWithGuidance"],
"ground": {
"answer": "[0, 1] [2, 5] [0, 3]",
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
"should_not_contain": [],
"files": ["test.py"],
"type": "execute_python_code"
},
"mock": {
"mock_func": null,
"mock_task": null
},
"info": {
"difficulty": "basic",
"difficulty": "medium",
"description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance",
"side_effects": ["tests if there is in fact an LLM attached"]
}
Expand Down
7 changes: 0 additions & 7 deletions agbenchmark/challenges/define_task_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,6 @@
from pydantic import BaseModel


class Mock(BaseModel):
mock_func: Optional[str] = None
mock_task: Optional[str] = None


class Info(BaseModel):
difficulty: str
description: str
Expand All @@ -24,12 +19,10 @@ class Ground(BaseModel):


class ChallengeData(BaseModel):
name: str
category: List[str]
task: str
dependencies: List[str]
ground: Ground
mock: Optional[Mock] = None
info: Info

def serialize(self, path: str) -> None:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,14 @@
{
"name": "basic_read_file",
"category": ["basic"],
"name": "ReadFile",
"category": ["interface"],
"task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
"dependencies": ["basic_write_file"],
"dependencies": ["TestWriteFile"],
"ground": {
"answer": "random string Hello World!",
"should_contain": ["random string", "Hello World!"],
"files": ["file_to_check.txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_read_file_mock"
},
"info": {
"description": "This reads the file quickly",
"difficulty": "basic",
Expand Down
12 changes: 12 additions & 0 deletions agbenchmark/challenges/interface/read_file/read_file_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from typing import Any, Dict

from agbenchmark.challenge import Challenge


class TestReadFile(Challenge):
    """Interface challenge: verifies the agent can read a file."""

    def test_method(self, config: Dict[str, Any]) -> None:
        """Run the challenge, then require at least one perfect score."""
        self.setup_challenge(config)
        assert 1 in self.get_scores(config)
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "basic_write_file",
"category": ["basic"],
"name": "WriteFile",
"category": ["interface"],
  "task": "Print the capital of America to a .txt file",
"dependencies": [],
"ground": {
Expand All @@ -10,10 +10,6 @@
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_write_file_mock",
"mock_task": "What is the capital of America?"
},
"info": {
"difficulty": "basic",
"description": "Tests the writing to file",
Expand Down
13 changes: 13 additions & 0 deletions agbenchmark/challenges/interface/write_file/write_file_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from typing import Any, Dict

from agbenchmark.challenge import Challenge


class TestWriteFile(Challenge):
    """Interface challenge: verifies the agent can write to a file."""

    def test_method(self, config: Dict[str, Any]) -> None:
        """Run the challenge, then require at least one perfect score."""
        self.setup_challenge(config)
        assert 1 in self.get_scores(config)
3 changes: 1 addition & 2 deletions agbenchmark/challenges/memory/m1/data.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
{
"name": "basic_memory",
"category": ["memory"],
"task": "Follow the instructions in the instructions_1.txt file",
"dependencies": [],
"dependencies": ["TestReadFile", "TestWriteFile"],
"ground": {
"answer": "2314",
"should_contain": ["2314"],
Expand Down
Loading