From ac5af736963dac95969f0cb3d0f99480a0a4f401 Mon Sep 17 00:00:00 2001
From: Silen Naihin <silen.naihin@gmail.com>
Date: Wed, 28 Jun 2023 21:28:46 -0400
Subject: [PATCH 1/4] trying to get process kill working

---
 agbenchmark/config.json                       |  8 +-
 agbenchmark/conftest.py                       | 70 ++++++++++----
 .../tests/regression/regression_tests.json    |  7 ++
 poetry.lock                                   | 93 ++++++++++++++++++-
 pyproject.toml                                |  2 +
 5 files changed, 161 insertions(+), 19 deletions(-)

diff --git a/agbenchmark/config.json b/agbenchmark/config.json
index 3de1dd64386..d95b8e44399 100644
--- a/agbenchmark/config.json
+++ b/agbenchmark/config.json
@@ -1,3 +1,9 @@
 {
-  "hostname": "localhost"
+  "workspace": "C:\\Users\\silen\\miniagi",
+  "cutoff": {
+    "type": "time",
+    "user_prompt": "Press enter to continue or abort this action by typing feedback:",
+    "user_input": "\n",
+    "count": 5
+  }
 }
diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index 4edd4b5e0b6..2590ce78187 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -2,11 +2,10 @@
 import os
 import pytest
 import shutil
+import subprocess
+import sys
 from agbenchmark.tests.regression.RegressionManager import RegressionManager
-import requests
 from agbenchmark.mocks.MockManager import MockManager
-import subprocess
-from agbenchmark.Challenge import Challenge
 from dotenv import load_dotenv
 
 load_dotenv()
@@ -44,8 +43,16 @@ def pytest_addoption(parser):
     parser.addoption("--mock", action="store_true", default=False)
 
 
+def check_cycle_count(cycle_count: int, cutoff: int, proc):
+    """Increment, print, and check cycle count."""
+    cycle_count += 1
+    print(f"Cycle count: {cycle_count}")
+    if cycle_count >= cutoff:
+        proc.terminate(force=True)
+    return cycle_count
+
+
 AGENT_NAME = os.getenv("AGENT_NAME")
-AGENT_TIMEOUT = os.getenv("AGENT_TIMEOUT")
 
 
 @pytest.fixture(autouse=True)
@@ -70,19 +77,48 @@ def run_agent(request, config):
     else:
         path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}")
 
-        try:
-            timeout = int(AGENT_TIMEOUT) if AGENT_TIMEOUT is not None else 60
-
-            subprocess.run(
-                ["python", "miniagi.py", task],
-                check=True,
-                cwd=path,
-                timeout=timeout
-                # text=True,
-                # capture_output=True
-            )
-        except subprocess.TimeoutExpired:
-            print("The subprocess has exceeded the time limit and was terminated.")
+        timeout = sys.maxsize
+
+        if config["cutoff"]["type"] == "time":
+            timeout = config["cutoff"]["count"] or 60
+            
+        from pexpect.popen_spawn import PopenSpawn
+
+        print(f"Running {task} with timeout {timeout}")
+
+        # Starting the subprocess using pexpect
+        proc = PopenSpawn("python", ["miniagi.py", task], timeout=timeout, cwd=path)
+
+        print("proc", proc)
+
+        cycle_count = 0
+
+        while True:
+            try:
+                # If we get the prompt for user input, we send "\n"
+                if config["cutoff"]["type"] == "user_input":
+                    proc.expect([config["cutoff"]["user_prompt"]])
+                    proc.sendline(config["cutoff"]["user_input"])
+                    cycle_count = check_cycle_count(
+                        cycle_count, config["cutoff"]["count"], proc
+                    )
+                elif config["cutoff"]["type"] == "cycle_count":
+                    match = proc.expect([r"Cycle count: (\d+)"])
+                    if match is not None:
+                        cycle_count = int(match.group(1))  # type: ignore
+                        cycle_count = check_cycle_count(
+                            cycle_count, config["cutoff"]["count"], proc
+                        )
+
+                # for cutoff type "time", just let it run until timeout
+            except expect.TIMEOUT:
+                print("The subprocess has exceeded the time limit and was terminated.")
+                break
+            except expect.EOF:
+                print("The subprocess has finished running.")
+                break
+
+        proc.close()
 
 
 regression_json = "agbenchmark/tests/regression/regression_tests.json"
diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json
index 384f9e7c61a..8a6278fea13 100644
--- a/agbenchmark/tests/regression/regression_tests.json
+++ b/agbenchmark/tests/regression/regression_tests.json
@@ -3,5 +3,12 @@
         "difficulty": "basic",
         "dependencies": [],
         "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]"
+    },
+    "TestReadFile": {
+        "difficulty": "basic",
+        "dependencies": [
+            "basic_write_file"
+        ],
+        "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]"
     }
 }
\ No newline at end of file
diff --git a/poetry.lock b/poetry.lock
index 7b2477bc6c9..a460f988da2 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -538,6 +538,20 @@ files = [
     {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"},
 ]
 
+[[package]]
+name = "pexpect"
+version = "4.8.0"
+description = "Pexpect allows easy control of interactive console applications."
+optional = false
+python-versions = "*"
+files = [
+    {file = "pexpect-4.8.0-py2.py3-none-any.whl", hash = "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937"},
+    {file = "pexpect-4.8.0.tar.gz", hash = "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"},
+]
+
+[package.dependencies]
+ptyprocess = ">=0.5"
+
 [[package]]
 name = "pluggy"
 version = "1.0.0"
@@ -553,6 +567,43 @@ files = [
 dev = ["pre-commit", "tox"]
 testing = ["pytest", "pytest-benchmark"]
 
+[[package]]
+name = "psutil"
+version = "5.9.5"
+description = "Cross-platform lib for process and system monitoring in Python."
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+    {file = "psutil-5.9.5-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:be8929ce4313f9f8146caad4272f6abb8bf99fc6cf59344a3167ecd74f4f203f"},
+    {file = "psutil-5.9.5-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:ab8ed1a1d77c95453db1ae00a3f9c50227ebd955437bcf2a574ba8adbf6a74d5"},
+    {file = "psutil-5.9.5-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:4aef137f3345082a3d3232187aeb4ac4ef959ba3d7c10c33dd73763fbc063da4"},
+    {file = "psutil-5.9.5-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:ea8518d152174e1249c4f2a1c89e3e6065941df2fa13a1ab45327716a23c2b48"},
+    {file = "psutil-5.9.5-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:acf2aef9391710afded549ff602b5887d7a2349831ae4c26be7c807c0a39fac4"},
+    {file = "psutil-5.9.5-cp27-none-win32.whl", hash = "sha256:5b9b8cb93f507e8dbaf22af6a2fd0ccbe8244bf30b1baad6b3954e935157ae3f"},
+    {file = "psutil-5.9.5-cp27-none-win_amd64.whl", hash = "sha256:8c5f7c5a052d1d567db4ddd231a9d27a74e8e4a9c3f44b1032762bd7b9fdcd42"},
+    {file = "psutil-5.9.5-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:3c6f686f4225553615612f6d9bc21f1c0e305f75d7d8454f9b46e901778e7217"},
+    {file = "psutil-5.9.5-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7a7dd9997128a0d928ed4fb2c2d57e5102bb6089027939f3b722f3a210f9a8da"},
+    {file = "psutil-5.9.5-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89518112647f1276b03ca97b65cc7f64ca587b1eb0278383017c2a0dcc26cbe4"},
+    {file = "psutil-5.9.5-cp36-abi3-win32.whl", hash = "sha256:104a5cc0e31baa2bcf67900be36acde157756b9c44017b86b2c049f11957887d"},
+    {file = "psutil-5.9.5-cp36-abi3-win_amd64.whl", hash = "sha256:b258c0c1c9d145a1d5ceffab1134441c4c5113b2417fafff7315a917a026c3c9"},
+    {file = "psutil-5.9.5-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:c607bb3b57dc779d55e1554846352b4e358c10fff3abf3514a7a6601beebdb30"},
+    {file = "psutil-5.9.5.tar.gz", hash = "sha256:5410638e4df39c54d957fc51ce03048acd8e6d60abc0f5107af51e5fb566eb3c"},
+]
+
+[package.extras]
+test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"]
+
+[[package]]
+name = "ptyprocess"
+version = "0.7.0"
+description = "Run a subprocess in a pseudo terminal"
+optional = false
+python-versions = "*"
+files = [
+    {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"},
+    {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"},
+]
+
 [[package]]
 name = "pydantic"
 version = "1.10.9"
@@ -658,6 +709,29 @@ files = [
 [package.extras]
 cli = ["click (>=5.0)"]
 
+[[package]]
+name = "pywin32"
+version = "306"
+description = "Python for Window Extensions"
+optional = false
+python-versions = "*"
+files = [
+    {file = "pywin32-306-cp310-cp310-win32.whl", hash = "sha256:06d3420a5155ba65f0b72f2699b5bacf3109f36acbe8923765c22938a69dfc8d"},
+    {file = "pywin32-306-cp310-cp310-win_amd64.whl", hash = "sha256:84f4471dbca1887ea3803d8848a1616429ac94a4a8d05f4bc9c5dcfd42ca99c8"},
+    {file = "pywin32-306-cp311-cp311-win32.whl", hash = "sha256:e65028133d15b64d2ed8f06dd9fbc268352478d4f9289e69c190ecd6818b6407"},
+    {file = "pywin32-306-cp311-cp311-win_amd64.whl", hash = "sha256:a7639f51c184c0272e93f244eb24dafca9b1855707d94c192d4a0b4c01e1100e"},
+    {file = "pywin32-306-cp311-cp311-win_arm64.whl", hash = "sha256:70dba0c913d19f942a2db25217d9a1b726c278f483a919f1abfed79c9cf64d3a"},
+    {file = "pywin32-306-cp312-cp312-win32.whl", hash = "sha256:383229d515657f4e3ed1343da8be101000562bf514591ff383ae940cad65458b"},
+    {file = "pywin32-306-cp312-cp312-win_amd64.whl", hash = "sha256:37257794c1ad39ee9be652da0462dc2e394c8159dfd913a8a4e8eb6fd346da0e"},
+    {file = "pywin32-306-cp312-cp312-win_arm64.whl", hash = "sha256:5821ec52f6d321aa59e2db7e0a35b997de60c201943557d108af9d4ae1ec7040"},
+    {file = "pywin32-306-cp37-cp37m-win32.whl", hash = "sha256:1c73ea9a0d2283d889001998059f5eaaba3b6238f767c9cf2833b13e6a685f65"},
+    {file = "pywin32-306-cp37-cp37m-win_amd64.whl", hash = "sha256:72c5f621542d7bdd4fdb716227be0dd3f8565c11b280be6315b06ace35487d36"},
+    {file = "pywin32-306-cp38-cp38-win32.whl", hash = "sha256:e4c092e2589b5cf0d365849e73e02c391c1349958c5ac3e9d5ccb9a28e017b3a"},
+    {file = "pywin32-306-cp38-cp38-win_amd64.whl", hash = "sha256:e8ac1ae3601bee6ca9f7cb4b5363bf1c0badb935ef243c4733ff9a393b1690c0"},
+    {file = "pywin32-306-cp39-cp39-win32.whl", hash = "sha256:e25fd5b485b55ac9c057f67d94bc203f3f6595078d1fb3b458c9c28b7153a802"},
+    {file = "pywin32-306-cp39-cp39-win_amd64.whl", hash = "sha256:39b61c15272833b5c329a2989999dcae836b1eed650252ab1b7bfbe1d59f30f4"},
+]
+
 [[package]]
 name = "requests"
 version = "2.31.0"
@@ -738,6 +812,23 @@ secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.
 socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"]
 zstd = ["zstandard (>=0.18.0)"]
 
+[[package]]
+name = "wexpect"
+version = "4.0.0"
+description = "Windows alternative of pexpect"
+optional = false
+python-versions = "*"
+files = [
+    {file = "wexpect-4.0.0.tar.gz", hash = "sha256:de9e739e78ec4d74a39bf8499904dacb6c594007a674fb7e10752c9b131f6522"},
+]
+
+[package.dependencies]
+psutil = ">=5.0.0"
+pywin32 = ">=220"
+
+[package.extras]
+test = ["codecov", "coverage", "pyinstaller", "setuptools (>=38.0)", "tox", "twine"]
+
 [[package]]
 name = "yarl"
 version = "1.9.2"
@@ -828,4 +919,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "f8de5e973c92360108aaca1cecc2fdd505f10a9c2975b46c83ea9c24b4af3cfe"
+content-hash = "8ab722acade739b9fb841ecae3b8cabd4f1d8a355864573a93d9faa11dcffb90"
diff --git a/pyproject.toml b/pyproject.toml
index 043fe68a2a1..af9688d1432 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,6 +16,8 @@ openai = "^0.27.8"
 pydantic = "^1.10.9"
 pytest-depends = "^1.0.1"
 python-dotenv = "^1.0.0"
+pexpect = "^4.8.0"
+wexpect = "^4.0.0"
 
 
 [build-system]

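For context, a minimal standalone sketch (not part of the patch above) of the pexpect idiom the monitoring loop is built around: expect() returns the index of the pattern that matched, the regex match object itself is exposed on proc.match, and listing pexpect.EOF / pexpect.TIMEOUT among the patterns avoids exception handling. On Windows, where pexpect.spawn is unavailable, pexpect.popen_spawn.PopenSpawn is the usual substitute and takes the whole command as a single string or list. The command and task below are illustrative only.

    import pexpect

    proc = pexpect.spawn(
        "python", ["miniagi.py", "some task"], timeout=60, encoding="utf-8"
    )
    cycle_count = 0
    while True:
        index = proc.expect([r"Cycle count: (\d+)", pexpect.EOF, pexpect.TIMEOUT])
        if index == 0:
            cycle_count = int(proc.match.group(1))  # match object lives on proc.match
        else:
            break  # EOF or TIMEOUT: the agent finished or hit the cutoff
    proc.close()
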
From fce421fb335107cddd9fd60b32e91902be7b5eae Mon Sep 17 00:00:00 2001
From: Silen Naihin <silen.naihin@gmail.com>
Date: Thu, 29 Jun 2023 20:51:23 -0400
Subject: [PATCH 2/4] moving logic to benchmark.py file

---
 agbenchmark/benchmark.py | 65 ++++++++++++++++++++++++++++++++++++++++
 agbenchmark/conftest.py  | 61 ++-----------------------------------
 2 files changed, 67 insertions(+), 59 deletions(-)
 create mode 100644 agbenchmark/benchmark.py

diff --git a/agbenchmark/benchmark.py b/agbenchmark/benchmark.py
new file mode 100644
index 00000000000..6dc3b231282
--- /dev/null
+++ b/agbenchmark/benchmark.py
@@ -0,0 +1,65 @@
+import os
+import sys
+import pexpect as expect
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+def check_cycle_count(cycle_count: int, cutoff: int, proc):
+    """Increment, print, and check cycle count."""
+    cycle_count += 1
+    print(f"Cycle count: {cycle_count}")
+    if cycle_count >= cutoff:
+        proc.terminate(force=True)
+    return cycle_count
+
+
+AGENT_NAME = os.getenv("AGENT_NAME")
+
+
+def run_agnostic(config, task):
+    path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}")
+
+    timeout = sys.maxsize
+
+    if config["cutoff"]["type"] == "time":
+        timeout = config["cutoff"]["count"] or 60
+
+    # from pexpect.popen_spawn import PopenSpawn
+
+    print(f"Running {task} with timeout {timeout}")
+
+    # Starting the subprocess using pexpect
+    proc = expect.spawn("python", ["miniagi.py", task], timeout=timeout, cwd=path)
+
+    print("proc", proc)
+
+    cycle_count = 0
+
+    while True:
+        try:
+            # If we get the prompt for user input, we send "\n"
+            if config["cutoff"]["type"] == "user_input":
+                proc.expect([config["cutoff"]["user_prompt"]])
+                proc.sendline(config["cutoff"]["user_input"])
+                cycle_count = check_cycle_count(
+                    cycle_count, config["cutoff"]["count"], proc
+                )
+            elif config["cutoff"]["type"] == "cycle_count":
+                match = proc.expect([r"Cycle count: (\d+)"])
+                if match is not None:
+                    cycle_count = int(match.group(1))  # type: ignore
+                    cycle_count = check_cycle_count(
+                        cycle_count, config["cutoff"]["count"], proc
+                    )
+
+            # for cutoff type "time", just let it run until timeout
+        except expect.TIMEOUT:
+            print("The subprocess has exceeded the time limit and was terminated.")
+            break
+        except expect.EOF:
+            print("The subprocess has finished running.")
+            break
+
+    proc.close()
diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index 2590ce78187..25510e42b67 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -6,9 +6,7 @@
 import sys
 from agbenchmark.tests.regression.RegressionManager import RegressionManager
 from agbenchmark.mocks.MockManager import MockManager
-from dotenv import load_dotenv
-
-load_dotenv()
+from agbenchmark.benchmark import run_agnostic
 
 
 @pytest.fixture(scope="module")
@@ -43,18 +41,6 @@ def pytest_addoption(parser):
     parser.addoption("--mock", action="store_true", default=False)
 
 
-def check_cycle_count(cycle_count: int, cutoff: int, proc):
-    """Increment, print, and check cycle count."""
-    cycle_count += 1
-    print(f"Cycle count: {cycle_count}")
-    if cycle_count >= cutoff:
-        proc.terminate(force=True)
-    return cycle_count
-
-
-AGENT_NAME = os.getenv("AGENT_NAME")
-
-
 @pytest.fixture(autouse=True)
 def run_agent(request, config):
     """Calling to get a response"""
@@ -75,50 +61,7 @@ def run_agent(request, config):
         else:
             print("No mock provided")
     else:
-        path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}")
-
-        timeout = sys.maxsize
-
-        if config["cutoff"]["type"] == "time":
-            timeout = config["cutoff"]["count"] or 60
-            
-        from pexpect.popen_spawn import PopenSpawn
-
-        print(f"Running {task} with timeout {timeout}")
-
-        # Starting the subprocess using pexpect
-        proc = PopenSpawn("python", ["miniagi.py", task], timeout=timeout, cwd=path)
-
-        print("proc", proc)
-
-        cycle_count = 0
-
-        while True:
-            try:
-                # If we get the prompt for user input, we send "\n"
-                if config["cutoff"]["type"] == "user_input":
-                    proc.expect([config["cutoff"]["user_prompt"]])
-                    proc.sendline(config["cutoff"]["user_input"])
-                    cycle_count = check_cycle_count(
-                        cycle_count, config["cutoff"]["count"], proc
-                    )
-                elif config["cutoff"]["type"] == "cycle_count":
-                    match = proc.expect([r"Cycle count: (\d+)"])
-                    if match is not None:
-                        cycle_count = int(match.group(1))  # type: ignore
-                        cycle_count = check_cycle_count(
-                            cycle_count, config["cutoff"]["count"], proc
-                        )
-
-                # for cutoff type "time", just let it run until timeout
-            except expect.TIMEOUT:
-                print("The subprocess has exceeded the time limit and was terminated.")
-                break
-            except expect.EOF:
-                print("The subprocess has finished running.")
-                break
-
-        proc.close()
+        run_agnostic(config, task)
 
 
 regression_json = "agbenchmark/tests/regression/regression_tests.json"

From 2987d71264c7ffb0b6184e28e17c503aef5b4681 Mon Sep 17 00:00:00 2001
From: Silen Naihin <silen.naihin@gmail.com>
Date: Fri, 30 Jun 2023 10:50:54 -0400
Subject: [PATCH 3/4] moving run agent to tests & agnostic run working

---
 .env.example                                  |   2 +-
 agbenchmark/Challenge.py                      |  16 +--
 agbenchmark/agent_interface.py                | 108 ++++++++++++++++++
 agbenchmark/benchmark.py                      |  65 -----------
 .../challenges/retrieval/r1/r1_test.py        |   7 +-
 agbenchmark/config.json                       |   9 +-
 agbenchmark/conftest.py                       |  37 ------
 agbenchmark/mocks/workspace/file_to_check.txt |   1 +
 .../read_file/read_file_test.py               |   7 +-
 .../write_file/write_file_test.py             |   6 +-
 .../tests/regression/regression_tests.json    |   9 +-
 agent/hook.py                                 |  10 ++
 pyproject.toml                                |   2 -
 13 files changed, 144 insertions(+), 135 deletions(-)
 create mode 100644 agbenchmark/agent_interface.py
 delete mode 100644 agbenchmark/benchmark.py
 create mode 100644 agbenchmark/mocks/workspace/file_to_check.txt
 create mode 100644 agent/hook.py

diff --git a/.env.example b/.env.example
index 7782d048e23..e50ed58a5f0 100644
--- a/.env.example
+++ b/.env.example
@@ -1,3 +1,3 @@
 AGENT_NAME=mini-agi
-AGENT_TIMEOUT=60
+ENVIRONMENT=local
 MOCK_TEST=False
\ No newline at end of file
diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py
index f644abc4a6d..7b1e4df0425 100644
--- a/agbenchmark/Challenge.py
+++ b/agbenchmark/Challenge.py
@@ -4,7 +4,7 @@
 from abc import ABC, abstractmethod
 from agbenchmark.challenges.define_task_types import Ground
 from agbenchmark.challenges.define_task_types import ChallengeData
-from dotenv import load_dotenv, set_key
+from dotenv import load_dotenv
 
 load_dotenv()
 
@@ -40,22 +40,24 @@ def dependencies(self) -> list:
         print("self.data.dependencies", self.data.dependencies)
         return self.data.dependencies
 
+    def setup_challenge(self, config):
+        from agbenchmark.agent_interface import run_agent
+
+        print("SETTING UP CHALLENGE...")
+
+        run_agent(self.task, self.mock, config)
+
     @property
     def name(self) -> str:
         print("self.data.name", self.data.name)
         return self.data.name
 
-    @pytest.mark.parametrize(
-        "run_agent",
-        [(task, mock)],
-        indirect=True,
-    )
     @pytest.mark.parametrize(
         "challenge_data",
         [data],
         indirect=True,
     )
-    def test_method(self, workspace):
+    def test_method(self, config):
         raise NotImplementedError
 
     @staticmethod
diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py
new file mode 100644
index 00000000000..eba26fc189d
--- /dev/null
+++ b/agbenchmark/agent_interface.py
@@ -0,0 +1,108 @@
+import os
+import sys
+import subprocess
+import time
+from agbenchmark.mocks.MockManager import MockManager
+from multiprocessing import Process, Pipe
+
+from agent.hook import run_specific_agent
+
+from dotenv import load_dotenv
+
+load_dotenv()
+
+MOCK_FLAG = os.getenv("MOCK_TEST")
+
+
+def run_agent(task, mock_func, config):
+    """Calling to get a response"""
+
+    if mock_func == None and MOCK_FLAG == "True":
+        print("No mock provided")
+    elif MOCK_FLAG == "True":
+        mock_manager = MockManager(
+            task
+        )  # workspace doesn't need to be passed in, stays the same
+        print("Server unavailable, using mock", mock_func)
+        mock_manager.delegate(mock_func)
+    else:
+        if config["agent"]["type"] == "python":
+            run_agent_function(config, task)
+        elif config["agent"]["type"] == "script":
+            run_agent_command(config, task)
+
+
+ENVIRONMENT = os.getenv("ENVIRONMENT") or "production"
+
+
+def run_agent_command(config, task):
+    path = config["agent"]["path"]
+
+    if ENVIRONMENT == "local":
+        AGENT_NAME = os.getenv("AGENT_NAME")
+        path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}")
+
+    timeout = config["agent"]["cutoff"] or sys.maxsize
+    print(f"Running {task} with timeout {timeout}")
+
+    command_from_config = config["agent"]["script"]
+    command_list = command_from_config.split()
+
+    # replace '{}' with the task
+    command_list = [cmd if cmd != "{}" else task for cmd in command_list]
+    print("path, command_list", path, command_list)
+    start_time = time.time()
+    proc = subprocess.Popen(
+        command_list,
+        cwd=path,
+        shell=True,
+    )
+
+    while True:
+        if time.time() - start_time > timeout:
+            print("The subprocess has exceeded the time limit and was terminated.")
+            proc.terminate()
+            break
+
+        if proc.poll() is not None:
+            print("The subprocess has finished running.")
+            break
+
+
+def run_agent_function(config, task):
+    timeout = (
+        config["cutoff"]["count"] if config["cutoff"]["type"] == "time" else sys.maxsize
+    )
+    print(
+        f"Running Python function '{config['agent']['function']}' with timeout {timeout}"
+    )
+
+    parent_conn, child_conn = Pipe()
+    process = Process(target=run_specific_agent, args=(task, child_conn))
+    process.start()
+    start_time = time.time()
+
+    while True:
+        if parent_conn.poll():  # Check if there's a new message from the child process
+            response, cycle_count = parent_conn.recv()
+            print(f"Cycle {cycle_count}: {response}")
+
+            if cycle_count >= config["cutoff"]["count"]:
+                print(
+                    f"Cycle count has reached the limit of {config['cutoff']['count']}. Terminating."
+                )
+                child_conn.send("terminate")
+                break
+
+        if time.time() - start_time > timeout:
+            print("The Python function has exceeded the time limit and was terminated.")
+            child_conn.send(
+                "terminate"
+            )  # Send a termination signal to the child process
+            break
+
+        if not process.is_alive():
+            print("The Python function has finished running.")
+            break
+
+    process.join()
diff --git a/agbenchmark/benchmark.py b/agbenchmark/benchmark.py
deleted file mode 100644
index 6dc3b231282..00000000000
--- a/agbenchmark/benchmark.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import os
-import sys
-import pexpect as expect
-from dotenv import load_dotenv
-
-load_dotenv()
-
-
-def check_cycle_count(cycle_count: int, cutoff: int, proc):
-    """Increment, print, and check cycle count."""
-    cycle_count += 1
-    print(f"Cycle count: {cycle_count}")
-    if cycle_count >= cutoff:
-        proc.terminate(force=True)
-    return cycle_count
-
-
-AGENT_NAME = os.getenv("AGENT_NAME")
-
-
-def run_agnostic(config, task):
-    path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}")
-
-    timeout = sys.maxsize
-
-    if config["cutoff"]["type"] == "time":
-        timeout = config["cutoff"]["count"] or 60
-
-    # from pexpect.popen_spawn import PopenSpawn
-
-    print(f"Running {task} with timeout {timeout}")
-
-    # Starting the subprocess using pexpect
-    proc = expect.spawn("python", ["miniagi.py", task], timeout=timeout, cwd=path)
-
-    print("proc", proc)
-
-    cycle_count = 0
-
-    while True:
-        try:
-            # If we get the prompt for user input, we send "\n"
-            if config["cutoff"]["type"] == "user_input":
-                proc.expect([config["cutoff"]["user_prompt"]])
-                proc.sendline(config["cutoff"]["user_input"])
-                cycle_count = check_cycle_count(
-                    cycle_count, config["cutoff"]["count"], proc
-                )
-            elif config["cutoff"]["type"] == "cycle_count":
-                match = proc.expect([r"Cycle count: (\d+)"])
-                if match is not None:
-                    cycle_count = int(match.group(1))  # type: ignore
-                    cycle_count = check_cycle_count(
-                        cycle_count, config["cutoff"]["count"], proc
-                    )
-
-            # for cutoff type "time", just let it run until timeout
-        except expect.TIMEOUT:
-            print("The subprocess has exceeded the time limit and was terminated.")
-            break
-        except expect.EOF:
-            print("The subprocess has finished running.")
-            break
-
-    proc.close()
diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py
index 0bd907d8a0b..b679a731dc9 100644
--- a/agbenchmark/challenges/retrieval/r1/r1_test.py
+++ b/agbenchmark/challenges/retrieval/r1/r1_test.py
@@ -1,6 +1,4 @@
-import pytest
 from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge
-from agbenchmark.challenges.define_task_types import ChallengeData, Ground
 import os
 
 
@@ -10,8 +8,9 @@ class TestRetrieval1(RetrievalChallenge):
     def get_file_path(self) -> str:  # all tests must implement this method
         return os.path.join(os.path.dirname(__file__), "r1_data.json")
 
-    def test_method(self, workspace):
-        files_contents = self.open_files(workspace, self.data.ground.files)
+    def test_method(self, config):
+        self.setup_challenge(config)
+        files_contents = self.open_files(config["workspace"], self.data.ground.files)
 
         scores = []
         for file_content in files_contents:
diff --git a/agbenchmark/config.json b/agbenchmark/config.json
index d95b8e44399..7388085dc89 100644
--- a/agbenchmark/config.json
+++ b/agbenchmark/config.json
@@ -1,9 +1,10 @@
 {
   "workspace": "C:\\Users\\silen\\miniagi",
-  "cutoff": {
-    "type": "time",
-    "user_prompt": "Press enter to continue or abort this action by typing feedback:",
+  "agent": {
+    "type": "script",
+    "path": "",
+    "script": "python miniagi.py {}",
     "user_input": "\n",
-    "count": 5
+    "cutoff": 60
   }
 }
diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index 25510e42b67..0f1fc7bb2f3 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -2,11 +2,7 @@
 import os
 import pytest
 import shutil
-import subprocess
-import sys
 from agbenchmark.tests.regression.RegressionManager import RegressionManager
-from agbenchmark.mocks.MockManager import MockManager
-from agbenchmark.benchmark import run_agnostic
 
 
 @pytest.fixture(scope="module")
@@ -41,29 +37,6 @@ def pytest_addoption(parser):
     parser.addoption("--mock", action="store_true", default=False)
 
 
-@pytest.fixture(autouse=True)
-def run_agent(request, config):
-    """Calling to get a response"""
-    if isinstance(request.param, tuple):
-        task = request.param[0]  # The task is passed in indirectly
-        mock_function_name = request.param[1] or None
-    else:
-        task = request.param
-        mock_function_name = None
-
-    if mock_function_name != None and (request.config.getoption("--mock")):
-        if mock_function_name:
-            mock_manager = MockManager(
-                task
-            )  # workspace doesn't need to be passed in, stays the same
-            print("Server unavailable, using mock", mock_function_name)
-            mock_manager.delegate(mock_function_name)
-        else:
-            print("No mock provided")
-    else:
-        run_agnostic(config, task)
-
-
 regression_json = "agbenchmark/tests/regression/regression_tests.json"
 
 regression_manager = RegressionManager(regression_json)
@@ -120,13 +93,3 @@ def pytest_generate_tests(metafunc):
 
         # Add the parameters to the test function
         metafunc.parametrize("challenge_data", [params], indirect=True)
-
-    if "run_agent" in metafunc.fixturenames:
-        # Get the instance of the test class
-        test_class = metafunc.cls()
-
-        # Generate the parameters
-        params = [(test_class.task, test_class.mock)]
-
-        # Add the parameters to the test function
-        metafunc.parametrize("run_agent", params, indirect=True)
diff --git a/agbenchmark/mocks/workspace/file_to_check.txt b/agbenchmark/mocks/workspace/file_to_check.txt
new file mode 100644
index 00000000000..48dc8cff1a4
--- /dev/null
+++ b/agbenchmark/mocks/workspace/file_to_check.txt
@@ -0,0 +1 @@
+Washington DC is the capital of the United States of America
\ No newline at end of file
diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
index f99ae608c82..c0aaa7f933b 100644
--- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
+++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
@@ -17,10 +17,9 @@ def get_file_path(self) -> str:  # all tests must implement this method
         return os.path.join(os.path.dirname(__file__), "r_file_data.json")
 
     @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file")
-    def test_method(
-        self, workspace
-    ):  # run_test is a common name that all tests must implement
-        files_contents = self.open_files(workspace, self.data.ground.files)
+    def test_method(self, config):
+        self.setup_challenge(config)
+        files_contents = self.open_files(config["workspace"], self.data.ground.files)
 
         scores = []
         for file_content in files_contents:
diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
index 39c73b163dd..306375ddd3c 100644
--- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
+++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
@@ -10,9 +10,9 @@ def get_file_path(self) -> str:  # all tests must implement this method
         return os.path.join(os.path.dirname(__file__), "w_file_data.json")
 
     @pytest.mark.depends(on=[], name="basic_write_file")
-    def test_method(self, workspace):
-        print("my workspace is ", workspace)
-        files_contents = self.open_files(workspace, self.data.ground.files)
+    def test_method(self, config):
+        self.setup_challenge(config)
+        files_contents = self.open_files(config["workspace"], self.data.ground.files)
 
         scores = []
         for file_content in files_contents:
diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json
index 8a6278fea13..d13b763c7cc 100644
--- a/agbenchmark/tests/regression/regression_tests.json
+++ b/agbenchmark/tests/regression/regression_tests.json
@@ -2,13 +2,6 @@
     "TestWriteFile": {
         "difficulty": "basic",
         "dependencies": [],
-        "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]"
-    },
-    "TestReadFile": {
-        "difficulty": "basic",
-        "dependencies": [
-            "basic_write_file"
-        ],
-        "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]"
+        "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0]"
     }
 }
\ No newline at end of file
diff --git a/agent/hook.py b/agent/hook.py
new file mode 100644
index 00000000000..6fa5341800b
--- /dev/null
+++ b/agent/hook.py
@@ -0,0 +1,10 @@
+async def run_specific_agent(task, conn):
+    while (
+        not conn.poll()
+    ):  # Check if there's a termination signal from the main process
+        response, cycle_count = await run_agent(
+            task
+        )  # run the agent and get the response and cycle count
+
+        # Send response and cycle count back to the main process
+        conn.send((response, cycle_count))
diff --git a/pyproject.toml b/pyproject.toml
index af9688d1432..043fe68a2a1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,8 +16,6 @@ openai = "^0.27.8"
 pydantic = "^1.10.9"
 pytest-depends = "^1.0.1"
 python-dotenv = "^1.0.0"
-pexpect = "^4.8.0"
-wexpect = "^4.0.0"
 
 
 [build-system]

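For context, the hook contract introduced above (a run_specific_agent(task, conn) target launched via multiprocessing) hinges on the duplex Pipe: conn.poll() in the child only becomes true once the benchmark side has sent something, so the parent's "terminate" message is what ends the loop. A minimal standalone sketch of that protocol, independent of any particular agent:

    import time
    from multiprocessing import Pipe, Process


    def child(task, conn):
        cycles = 0
        while not conn.poll():  # becomes True once the parent sends "terminate"
            cycles += 1
            conn.send((f"step for {task}", cycles))
            time.sleep(0.1)


    if __name__ == "__main__":
        parent_conn, child_conn = Pipe()
        proc = Process(target=child, args=("demo task", child_conn))
        proc.start()
        response, cycles = parent_conn.recv()  # first (response, cycle_count) pair
        print(response, cycles)
        parent_conn.send("terminate")  # observed by the child through conn.poll()
        proc.join()
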
From 7c352b745ec90486826289ed735800197e95cd80 Mon Sep 17 00:00:00 2001
From: Silen Naihin <silen.naihin@gmail.com>
Date: Fri, 30 Jun 2023 11:55:43 -0400
Subject: [PATCH 4/4] integrate config, agent_interface just func, hook

---
 agbenchmark/Challenge.py                      |   5 +-
 agbenchmark/agent_interface.py                | 118 ++++++------------
 agbenchmark/config.json                       |   9 +-
 agbenchmark/start_benchmark.py                |  12 +-
 .../tests/regression/regression_tests.json    |   8 +-
 agent/benchmarks.py                           |  15 +++
 agent/hook.py                                 |  10 --
 7 files changed, 70 insertions(+), 107 deletions(-)
 create mode 100644 agent/benchmarks.py
 delete mode 100644 agent/hook.py

diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py
index 7b1e4df0425..d7a2bdc9b38 100644
--- a/agbenchmark/Challenge.py
+++ b/agbenchmark/Challenge.py
@@ -23,6 +23,7 @@ def get_file_path(self) -> str:
 
     @property
     def data(self) -> ChallengeData:
+        # TODO: make it so that this is cached somewhere to just call self.deserialized_data
         return ChallengeData.deserialize(self.get_file_path())
 
     @property
@@ -37,19 +38,15 @@ def task(self):
 
     @property
     def dependencies(self) -> list:
-        print("self.data.dependencies", self.data.dependencies)
         return self.data.dependencies
 
     def setup_challenge(self, config):
         from agbenchmark.agent_interface import run_agent
 
-        print("SETTING UP CHALLENGE...")
-
         run_agent(self.task, self.mock, config)
 
     @property
     def name(self) -> str:
-        print("self.data.name", self.data.name)
         return self.data.name
 
     @pytest.mark.parametrize(
diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py
index eba26fc189d..2ff2acf3011 100644
--- a/agbenchmark/agent_interface.py
+++ b/agbenchmark/agent_interface.py
@@ -1,12 +1,9 @@
 import os
-import sys
-import subprocess
+import importlib
 import time
 from agbenchmark.mocks.MockManager import MockManager
 from multiprocessing import Process, Pipe
 
-from agent.hook import run_specific_agent
-
 from dotenv import load_dotenv
 
 load_dotenv()
@@ -26,83 +23,48 @@ def run_agent(task, mock_func, config):
         print("Server unavailable, using mock", mock_func)
         mock_manager.delegate(mock_func)
     else:
-        if config["agent"]["type"] == "python":
-            run_agent_function(config, task)
-        elif config["agent"]["type"] == "script":
-            run_agent_command(config, task)
-
-
-ENVIRONMENT = os.getenv("ENVIRONMENT") or "production"
-
-
-def run_agent_command(config, task):
-    path = config["agent"]["path"]
-
-    if ENVIRONMENT == "local":
-        AGENT_NAME = os.getenv("AGENT_NAME")
-        path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}")
-
-    timeout = config["agent"]["cutoff"] or sys.maxsize
-    print(f"Running {task} with timeout {timeout}")
-
-    command_from_config = config["agent"]["script"]
-    command_list = command_from_config.split()
-
-    # replace '{}' with the task
-    command_list = [cmd if cmd != "{}" else task for cmd in command_list]
-    print("path, command_list", path, command_list)
-    start_time = time.time()
-    proc = subprocess.Popen(
-        command_list,
-        cwd=path,
-        shell=True,
-    )
-
-    while True:
-        if time.time() - start_time > timeout:
-            print("The subprocess has exceeded the time limit and was terminated.")
-            proc.terminate()
-            break
-
-        if proc.poll() is not None:
-            print("The subprocess has finished running.")
-            break
-
-
-def run_agent_function(config, task):
-    timeout = (
-        config["cutoff"]["count"] if config["cutoff"]["type"] == "time" else sys.maxsize
-    )
-    print(
-        f"Running Python function '{config['agent']['function']}' with timeout {timeout}"
-    )
-
-    parent_conn, child_conn = Pipe()
-    process = Process(target=run_specific_agent, args=(task, child_conn))
-    process.start()
-    start_time = time.time()
-
-    while True:
-        if parent_conn.poll():  # Check if there's a new message from the child process
-            response, cycle_count = parent_conn.recv()
-            print(f"Cycle {cycle_count}: {response}")
-
-            if cycle_count >= config["cutoff"]["count"]:
+        timeout = config["cutoff"]
+        print(f"Running Python function '{config['func_path']}' with timeout {timeout}")
+
+        parent_conn, child_conn = Pipe()
+
+        # Import the specific agent dynamically
+        module_name = config["func_path"].replace("/", ".").rstrip(".py")
+        module = importlib.import_module(module_name)
+        run_specific_agent = getattr(module, "run_specific_agent")
+
+        process = Process(target=run_specific_agent, args=(task, child_conn))
+        process.start()
+        start_time = time.time()
+
+        while True:
+            if (
+                parent_conn.poll()
+            ):  # Check if there's a new message from the child process
+                response, cycle_count = parent_conn.recv()
+                print(f"Cycle {cycle_count}: {response}")
+
+                if cycle_count >= config["cutoff"]:
+                    print(
+                        f"Cycle count has reached the limit of {config['cutoff']}. Terminating."
+                    )
+                    child_conn.send("terminate")
+                    break
+
+            if time.time() - start_time > timeout:
                 print(
-                    f"Cycle count has reached the limit of {config['cutoff']['count']}. Terminating."
+                    "The Python function has exceeded the time limit and was terminated."
                 )
-                child_conn.send("terminate")
+                child_conn.send(
+                    "terminate"
+                )  # Send a termination signal to the child process
                 break
 
-        if time.time() - start_time > timeout:
-            print("The Python function has exceeded the time limit and was terminated.")
-            child_conn.send(
-                "terminate"
-            )  # Send a termination signal to the child process
-            break
+            if not process.is_alive():
+                print("The Python function has finished running.")
+                break
 
-        if not process.is_alive():
-            print("The Python function has finished running.")
-            break
+        process.join()
 
-    process.join()
+
+ENVIRONMENT = os.getenv("ENVIRONMENT") or "production"
diff --git a/agbenchmark/config.json b/agbenchmark/config.json
index 7388085dc89..d9b42ca4283 100644
--- a/agbenchmark/config.json
+++ b/agbenchmark/config.json
@@ -1,10 +1,5 @@
 {
   "workspace": "C:\\Users\\silen\\miniagi",
-  "agent": {
-    "type": "script",
-    "path": "",
-    "script": "python miniagi.py {}",
-    "user_input": "\n",
-    "cutoff": 60
-  }
+  "func_path": "agent/benchmarks.py",
+  "cutoff": 60
 }
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index c9f3643cc02..fe395cd2169 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -29,7 +29,17 @@ def start(category, noreg, mock):
 
         config["workspace"] = click.prompt(
             "Please enter a new workspace path",
-            default=os.path.join(Path.home(), "miniagi"),
+            default=os.path.join(Path.home(), "workspace"),
+        )
+
+        config["func_path"] = click.prompt(
+            "Please enter a the path to your run_specific_agent function implementation",
+            default="/benchmarks.py",
+        )
+
+        config["cutoff"] = click.prompt(
+            "Please enter a hard cutoff runtime for your agent",
+            default="60",
         )
 
         with open(config_dir, "w") as f:
diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json
index d13b763c7cc..9e26dfeeb6e 100644
--- a/agbenchmark/tests/regression/regression_tests.json
+++ b/agbenchmark/tests/regression/regression_tests.json
@@ -1,7 +1 @@
-{
-    "TestWriteFile": {
-        "difficulty": "basic",
-        "dependencies": [],
-        "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0]"
-    }
-}
\ No newline at end of file
+{}
\ No newline at end of file
diff --git a/agent/benchmarks.py b/agent/benchmarks.py
new file mode 100644
index 00000000000..eb66412c143
--- /dev/null
+++ b/agent/benchmarks.py
@@ -0,0 +1,15 @@
+# import subprocess
+
+
+def run_specific_agent(task, conn):
+    cycle_count = 0
+    while (
+        not conn.poll()
+    ):  # Check if there's a termination signal from the main process
+        response = run_agent(task)  # run the agent and get the response and cycle count
+
+        if response:
+            cycle_count += 1
+
+        # Send response and cycle count back to the main process
+        conn.send((response, cycle_count))
diff --git a/agent/hook.py b/agent/hook.py
deleted file mode 100644
index 6fa5341800b..00000000000
--- a/agent/hook.py
+++ /dev/null
@@ -1,10 +0,0 @@
-async def run_specific_agent(task, conn):
-    while (
-        not conn.poll()
-    ):  # Check if there's a termination signal from the main process
-        response, cycle_count = await run_agent(
-            task
-        )  # run the agent and get the response and cycle count
-
-        # Send response and cycle count back to the main process
-        conn.send((response, cycle_count))
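
For context, a standalone sketch (not part of the patches) of the dynamic-import step that the new "func_path" config drives. Note that str.rstrip(".py") strips a character set rather than a suffix, so an explicit suffix removal is the safer way to derive the module name; the helper name below is illustrative only, and it assumes the repository root is on sys.path.

    import importlib
    from pathlib import PurePosixPath


    def import_run_specific_agent(func_path: str):
        # "agent/benchmarks.py" -> "agent.benchmarks"
        module_name = ".".join(PurePosixPath(func_path).with_suffix("").parts)
        module = importlib.import_module(module_name)
        return getattr(module, "run_specific_agent")


    run_specific_agent = import_run_specific_agent("agent/benchmarks.py")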