From ac5af736963dac95969f0cb3d0f99480a0a4f401 Mon Sep 17 00:00:00 2001 From: Silen Naihin <silen.naihin@gmail.com> Date: Wed, 28 Jun 2023 21:28:46 -0400 Subject: [PATCH 1/4] trying to get kill process --- agbenchmark/config.json | 8 +- agbenchmark/conftest.py | 70 ++++++++++---- .../tests/regression/regression_tests.json | 7 ++ poetry.lock | 93 ++++++++++++++++++- pyproject.toml | 2 + 5 files changed, 161 insertions(+), 19 deletions(-) diff --git a/agbenchmark/config.json b/agbenchmark/config.json index 3de1dd64386..d95b8e44399 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,3 +1,9 @@ { - "hostname": "localhost" + "workspace": "C:\\Users\\silen\\miniagi", + "cutoff": { + "type": "time", + "user_prompt": "Press enter to continue or abort this action by typing feedback:", + "user_input": "\n", + "count": 5 + } } diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 4edd4b5e0b6..2590ce78187 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -2,11 +2,10 @@ import os import pytest import shutil +import subprocess +import sys from agbenchmark.tests.regression.RegressionManager import RegressionManager -import requests from agbenchmark.mocks.MockManager import MockManager -import subprocess -from agbenchmark.Challenge import Challenge from dotenv import load_dotenv load_dotenv() @@ -44,8 +43,16 @@ def pytest_addoption(parser): parser.addoption("--mock", action="store_true", default=False) +def check_cycle_count(cycle_count: int, cutoff: int, proc): + """Increment, print, and check cycle count.""" + cycle_count += 1 + print(f"Cycle count: {cycle_count}") + if cycle_count >= cutoff: + proc.terminate(force=True) + return cycle_count + + AGENT_NAME = os.getenv("AGENT_NAME") -AGENT_TIMEOUT = os.getenv("AGENT_TIMEOUT") @pytest.fixture(autouse=True) @@ -70,19 +77,48 @@ def run_agent(request, config): else: path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}") - try: - timeout = int(AGENT_TIMEOUT) if AGENT_TIMEOUT is not None else 60 - - subprocess.run( - ["python", "miniagi.py", task], - check=True, - cwd=path, - timeout=timeout - # text=True, - # capture_output=True - ) - except subprocess.TimeoutExpired: - print("The subprocess has exceeded the time limit and was terminated.") + timeout = sys.maxsize + + if config["cutoff"]["type"] == "time": + timeout = config["cutoff"]["count"] or 60 + + from pexpect.popen_spawn import PopenSpawn + + print(f"Running {task} with timeout {timeout}") + + # Starting the subprocess using pexpect + proc = PopenSpawn("python", ["miniagi.py", task], timeout=timeout, cwd=path) + + print("proc", proc) + + cycle_count = 0 + + while True: + try: + # If we get the prompt for user input, we send "\n" + if config["cutoff"]["type"] == "user_input": + proc.expect([config["cutoff"]["user_prompt"]]) + proc.sendline(config["cutoff"]["user_input"]) + cycle_count = check_cycle_count( + cycle_count, config["cutoff"]["count"], proc + ) + elif config["cutoff"]["type"] == "cycle_count": + match = proc.expect([r"Cycle count: (\d+)"]) + if match is not None: + cycle_count = int(match.group(1)) # type: ignore + cycle_count = check_cycle_count( + cycle_count, config["cutoff"]["count"], proc + ) + + # for cutoff type "time", just let it run until timeout + except expect.TIMEOUT: + print("The subprocess has exceeded the time limit and was terminated.") + break + except expect.EOF: + print("The subprocess has finished running.") + break + + proc.close() regression_json = "agbenchmark/tests/regression/regression_tests.json" diff --git 
a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index 384f9e7c61a..8a6278fea13 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -3,5 +3,12 @@ "difficulty": "basic", "dependencies": [], "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]" + }, + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "basic_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]" } } \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 7b2477bc6c9..a460f988da2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -538,6 +538,20 @@ files = [ {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, ] +[[package]] +name = "pexpect" +version = "4.8.0" +description = "Pexpect allows easy control of interactive console applications." +optional = false +python-versions = "*" +files = [ + {file = "pexpect-4.8.0-py2.py3-none-any.whl", hash = "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937"}, + {file = "pexpect-4.8.0.tar.gz", hash = "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"}, +] + +[package.dependencies] +ptyprocess = ">=0.5" + [[package]] name = "pluggy" version = "1.0.0" @@ -553,6 +567,43 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "psutil" +version = "5.9.5" +description = "Cross-platform lib for process and system monitoring in Python." +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "psutil-5.9.5-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:be8929ce4313f9f8146caad4272f6abb8bf99fc6cf59344a3167ecd74f4f203f"}, + {file = "psutil-5.9.5-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:ab8ed1a1d77c95453db1ae00a3f9c50227ebd955437bcf2a574ba8adbf6a74d5"}, + {file = "psutil-5.9.5-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:4aef137f3345082a3d3232187aeb4ac4ef959ba3d7c10c33dd73763fbc063da4"}, + {file = "psutil-5.9.5-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:ea8518d152174e1249c4f2a1c89e3e6065941df2fa13a1ab45327716a23c2b48"}, + {file = "psutil-5.9.5-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:acf2aef9391710afded549ff602b5887d7a2349831ae4c26be7c807c0a39fac4"}, + {file = "psutil-5.9.5-cp27-none-win32.whl", hash = "sha256:5b9b8cb93f507e8dbaf22af6a2fd0ccbe8244bf30b1baad6b3954e935157ae3f"}, + {file = "psutil-5.9.5-cp27-none-win_amd64.whl", hash = "sha256:8c5f7c5a052d1d567db4ddd231a9d27a74e8e4a9c3f44b1032762bd7b9fdcd42"}, + {file = "psutil-5.9.5-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:3c6f686f4225553615612f6d9bc21f1c0e305f75d7d8454f9b46e901778e7217"}, + {file = "psutil-5.9.5-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7a7dd9997128a0d928ed4fb2c2d57e5102bb6089027939f3b722f3a210f9a8da"}, + {file = "psutil-5.9.5-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89518112647f1276b03ca97b65cc7f64ca587b1eb0278383017c2a0dcc26cbe4"}, + {file = "psutil-5.9.5-cp36-abi3-win32.whl", hash = "sha256:104a5cc0e31baa2bcf67900be36acde157756b9c44017b86b2c049f11957887d"}, + {file = "psutil-5.9.5-cp36-abi3-win_amd64.whl", hash = 
"sha256:b258c0c1c9d145a1d5ceffab1134441c4c5113b2417fafff7315a917a026c3c9"}, + {file = "psutil-5.9.5-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:c607bb3b57dc779d55e1554846352b4e358c10fff3abf3514a7a6601beebdb30"}, + {file = "psutil-5.9.5.tar.gz", hash = "sha256:5410638e4df39c54d957fc51ce03048acd8e6d60abc0f5107af51e5fb566eb3c"}, +] + +[package.extras] +test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] + +[[package]] +name = "ptyprocess" +version = "0.7.0" +description = "Run a subprocess in a pseudo terminal" +optional = false +python-versions = "*" +files = [ + {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, + {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, +] + [[package]] name = "pydantic" version = "1.10.9" @@ -658,6 +709,29 @@ files = [ [package.extras] cli = ["click (>=5.0)"] +[[package]] +name = "pywin32" +version = "306" +description = "Python for Window Extensions" +optional = false +python-versions = "*" +files = [ + {file = "pywin32-306-cp310-cp310-win32.whl", hash = "sha256:06d3420a5155ba65f0b72f2699b5bacf3109f36acbe8923765c22938a69dfc8d"}, + {file = "pywin32-306-cp310-cp310-win_amd64.whl", hash = "sha256:84f4471dbca1887ea3803d8848a1616429ac94a4a8d05f4bc9c5dcfd42ca99c8"}, + {file = "pywin32-306-cp311-cp311-win32.whl", hash = "sha256:e65028133d15b64d2ed8f06dd9fbc268352478d4f9289e69c190ecd6818b6407"}, + {file = "pywin32-306-cp311-cp311-win_amd64.whl", hash = "sha256:a7639f51c184c0272e93f244eb24dafca9b1855707d94c192d4a0b4c01e1100e"}, + {file = "pywin32-306-cp311-cp311-win_arm64.whl", hash = "sha256:70dba0c913d19f942a2db25217d9a1b726c278f483a919f1abfed79c9cf64d3a"}, + {file = "pywin32-306-cp312-cp312-win32.whl", hash = "sha256:383229d515657f4e3ed1343da8be101000562bf514591ff383ae940cad65458b"}, + {file = "pywin32-306-cp312-cp312-win_amd64.whl", hash = "sha256:37257794c1ad39ee9be652da0462dc2e394c8159dfd913a8a4e8eb6fd346da0e"}, + {file = "pywin32-306-cp312-cp312-win_arm64.whl", hash = "sha256:5821ec52f6d321aa59e2db7e0a35b997de60c201943557d108af9d4ae1ec7040"}, + {file = "pywin32-306-cp37-cp37m-win32.whl", hash = "sha256:1c73ea9a0d2283d889001998059f5eaaba3b6238f767c9cf2833b13e6a685f65"}, + {file = "pywin32-306-cp37-cp37m-win_amd64.whl", hash = "sha256:72c5f621542d7bdd4fdb716227be0dd3f8565c11b280be6315b06ace35487d36"}, + {file = "pywin32-306-cp38-cp38-win32.whl", hash = "sha256:e4c092e2589b5cf0d365849e73e02c391c1349958c5ac3e9d5ccb9a28e017b3a"}, + {file = "pywin32-306-cp38-cp38-win_amd64.whl", hash = "sha256:e8ac1ae3601bee6ca9f7cb4b5363bf1c0badb935ef243c4733ff9a393b1690c0"}, + {file = "pywin32-306-cp39-cp39-win32.whl", hash = "sha256:e25fd5b485b55ac9c057f67d94bc203f3f6595078d1fb3b458c9c28b7153a802"}, + {file = "pywin32-306-cp39-cp39-win_amd64.whl", hash = "sha256:39b61c15272833b5c329a2989999dcae836b1eed650252ab1b7bfbe1d59f30f4"}, +] + [[package]] name = "requests" version = "2.31.0" @@ -738,6 +812,23 @@ secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17. 
socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] +[[package]] +name = "wexpect" +version = "4.0.0" +description = "Windows alternative of pexpect" +optional = false +python-versions = "*" +files = [ + {file = "wexpect-4.0.0.tar.gz", hash = "sha256:de9e739e78ec4d74a39bf8499904dacb6c594007a674fb7e10752c9b131f6522"}, +] + +[package.dependencies] +psutil = ">=5.0.0" +pywin32 = ">=220" + +[package.extras] +test = ["codecov", "coverage", "pyinstaller", "setuptools (>=38.0)", "tox", "twine"] + [[package]] name = "yarl" version = "1.9.2" @@ -828,4 +919,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "f8de5e973c92360108aaca1cecc2fdd505f10a9c2975b46c83ea9c24b4af3cfe" +content-hash = "8ab722acade739b9fb841ecae3b8cabd4f1d8a355864573a93d9faa11dcffb90" diff --git a/pyproject.toml b/pyproject.toml index 043fe68a2a1..af9688d1432 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,8 @@ openai = "^0.27.8" pydantic = "^1.10.9" pytest-depends = "^1.0.1" python-dotenv = "^1.0.0" +pexpect = "^4.8.0" +wexpect = "^4.0.0" [build-system] From fce421fb335107cddd9fd60b32e91902be7b5eae Mon Sep 17 00:00:00 2001 From: Silen Naihin <silen.naihin@gmail.com> Date: Thu, 29 Jun 2023 20:51:23 -0400 Subject: [PATCH 2/4] moving logic to benchmark.py file --- agbenchmark/benchmark.py | 65 ++++++++++++++++++++++++++++++++++++++++ agbenchmark/conftest.py | 61 ++----------------------------------- 2 files changed, 67 insertions(+), 59 deletions(-) create mode 100644 agbenchmark/benchmark.py diff --git a/agbenchmark/benchmark.py b/agbenchmark/benchmark.py new file mode 100644 index 00000000000..6dc3b231282 --- /dev/null +++ b/agbenchmark/benchmark.py @@ -0,0 +1,65 @@ +import os +import sys +import pexpect as expect +from dotenv import load_dotenv + +load_dotenv() + + +def check_cycle_count(cycle_count: int, cutoff: int, proc): + """Increment, print, and check cycle count.""" + cycle_count += 1 + print(f"Cycle count: {cycle_count}") + if cycle_count >= cutoff: + proc.terminate(force=True) + return cycle_count + + +AGENT_NAME = os.getenv("AGENT_NAME") + + +def run_agnostic(config, task): + path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}") + + timeout = sys.maxsize + + if config["cutoff"]["type"] == "time": + timeout = config["cutoff"]["count"] or 60 + + # from pexpect.popen_spawn import PopenSpawn + + print(f"Running {task} with timeout {timeout}") + + # Starting the subprocess using pexpect + proc = expect.spawn("python", ["miniagi.py", task], timeout=timeout, cwd=path) + + print("proc", proc) + + cycle_count = 0 + + while True: + try: + # If we get the prompt for user input, we send "\n" + if config["cutoff"]["type"] == "user_input": + proc.expect([config["cutoff"]["user_prompt"]]) + proc.sendline(config["cutoff"]["user_input"]) + cycle_count = check_cycle_count( + cycle_count, config["cutoff"]["count"], proc + ) + elif config["cutoff"]["type"] == "cycle_count": + match = proc.expect([r"Cycle count: (\d+)"]) + if match is not None: + cycle_count = int(match.group(1)) # type: ignore + cycle_count = check_cycle_count( + cycle_count, config["cutoff"]["count"], proc + ) + + # for cutoff type "time", just let it run until timeout + except expect.TIMEOUT: + print("The subprocess has exceeded the time limit and was terminated.") + break + except expect.EOF: + print("The subprocess has finished running.") + break + + proc.close() diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 2590ce78187..25510e42b67 100644 --- 
a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -6,9 +6,7 @@ import sys from agbenchmark.tests.regression.RegressionManager import RegressionManager from agbenchmark.mocks.MockManager import MockManager -from dotenv import load_dotenv - -load_dotenv() +from agbenchmark.benchmark import run_agnostic @pytest.fixture(scope="module") @@ -43,18 +41,6 @@ def pytest_addoption(parser): parser.addoption("--mock", action="store_true", default=False) -def check_cycle_count(cycle_count: int, cutoff: int, proc): - """Increment, print, and check cycle count.""" - cycle_count += 1 - print(f"Cycle count: {cycle_count}") - if cycle_count >= cutoff: - proc.terminate(force=True) - return cycle_count - - -AGENT_NAME = os.getenv("AGENT_NAME") - - @pytest.fixture(autouse=True) def run_agent(request, config): """Calling to get a response""" @@ -75,50 +61,7 @@ def run_agent(request, config): else: print("No mock provided") else: - path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}") - - timeout = sys.maxsize - - if config["cutoff"]["type"] == "time": - timeout = config["cutoff"]["count"] or 60 - - from pexpect.popen_spawn import PopenSpawn - - print(f"Running {task} with timeout {timeout}") - - # Starting the subprocess using pexpect - proc = PopenSpawn("python", ["miniagi.py", task], timeout=timeout, cwd=path) - - print("proc", proc) - - cycle_count = 0 - - while True: - try: - # If we get the prompt for user input, we send "\n" - if config["cutoff"]["type"] == "user_input": - proc.expect([config["cutoff"]["user_prompt"]]) - proc.sendline(config["cutoff"]["user_input"]) - cycle_count = check_cycle_count( - cycle_count, config["cutoff"]["count"], proc - ) - elif config["cutoff"]["type"] == "cycle_count": - match = proc.expect([r"Cycle count: (\d+)"]) - if match is not None: - cycle_count = int(match.group(1)) # type: ignore - cycle_count = check_cycle_count( - cycle_count, config["cutoff"]["count"], proc - ) - - # for cutoff type "time", just let it run until timeout - except expect.TIMEOUT: - print("The subprocess has exceeded the time limit and was terminated.") - break - except expect.EOF: - print("The subprocess has finished running.") - break - - proc.close() + run_agnostic(config, task) regression_json = "agbenchmark/tests/regression/regression_tests.json" From 2987d71264c7ffb0b6184e28e17c503aef5b4681 Mon Sep 17 00:00:00 2001 From: Silen Naihin <silen.naihin@gmail.com> Date: Fri, 30 Jun 2023 10:50:54 -0400 Subject: [PATCH 3/4] moving run agent to tests & agnostic run working --- .env.example | 2 +- agbenchmark/Challenge.py | 16 +-- agbenchmark/agent_interface.py | 108 ++++++++++++++++++ agbenchmark/benchmark.py | 65 ----------- .../challenges/retrieval/r1/r1_test.py | 7 +- agbenchmark/config.json | 9 +- agbenchmark/conftest.py | 37 ------ agbenchmark/mocks/workspace/file_to_check.txt | 1 + .../read_file/read_file_test.py | 7 +- .../write_file/write_file_test.py | 6 +- .../tests/regression/regression_tests.json | 9 +- agent/hook.py | 10 ++ pyproject.toml | 2 - 13 files changed, 144 insertions(+), 135 deletions(-) create mode 100644 agbenchmark/agent_interface.py delete mode 100644 agbenchmark/benchmark.py create mode 100644 agbenchmark/mocks/workspace/file_to_check.txt create mode 100644 agent/hook.py diff --git a/.env.example b/.env.example index 7782d048e23..e50ed58a5f0 100644 --- a/.env.example +++ b/.env.example @@ -1,3 +1,3 @@ AGENT_NAME=mini-agi -AGENT_TIMEOUT=60 +ENVIRONMENT=local MOCK_TEST=False \ No newline at end of file diff --git a/agbenchmark/Challenge.py 
b/agbenchmark/Challenge.py index f644abc4a6d..7b1e4df0425 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -4,7 +4,7 @@ from abc import ABC, abstractmethod from agbenchmark.challenges.define_task_types import Ground from agbenchmark.challenges.define_task_types import ChallengeData -from dotenv import load_dotenv, set_key +from dotenv import load_dotenv load_dotenv() @@ -40,22 +40,24 @@ def dependencies(self) -> list: print("self.data.dependencies", self.data.dependencies) return self.data.dependencies + def setup_challenge(self, config): + from agbenchmark.agent_interface import run_agent + + print("SETTING UP CHALLENGE...") + + run_agent(self.task, self.mock, config) + @property def name(self) -> str: print("self.data.name", self.data.name) return self.data.name - @pytest.mark.parametrize( - "run_agent", - [(task, mock)], - indirect=True, - ) @pytest.mark.parametrize( "challenge_data", [data], indirect=True, ) - def test_method(self, workspace): + def test_method(self, config): raise NotImplementedError @staticmethod diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py new file mode 100644 index 00000000000..eba26fc189d --- /dev/null +++ b/agbenchmark/agent_interface.py @@ -0,0 +1,108 @@ +import os +import sys +import subprocess +import time +from agbenchmark.mocks.MockManager import MockManager +from multiprocessing import Process, Pipe + +from agent.hook import run_specific_agent + +from dotenv import load_dotenv + +load_dotenv() + +MOCK_FLAG = os.getenv("MOCK_TEST") + + +def run_agent(task, mock_func, config): + """Calling to get a response""" + + if mock_func == None and MOCK_FLAG == "True": + print("No mock provided") + elif MOCK_FLAG == "True": + mock_manager = MockManager( + task + ) # workspace doesn't need to be passed in, stays the same + print("Server unavailable, using mock", mock_func) + mock_manager.delegate(mock_func) + else: + if config["agent"]["type"] == "python": + run_agent_function(config, task) + elif config["agent"]["type"] == "script": + run_agent_command(config, task) + + +ENVIRONMENT = os.getenv("ENVIRONMENT") or "production" + + +def run_agent_command(config, task): + path = config["agent"]["path"] + + if ENVIRONMENT == "local": + AGENT_NAME = os.getenv("AGENT_NAME") + path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}") + + timeout = config["agent"]["cutoff"] or sys.maxsize + print(f"Running {task} with timeout {timeout}") + + command_from_config = config["agent"]["script"] + command_list = command_from_config.split() + + # replace '{}' with the task + command_list = [cmd if cmd != "{}" else task for cmd in command_list] + print("path, command_list", path, command_list) + start_time = time.time() + proc = subprocess.Popen( + command_list, + cwd=path, + shell=True, + ) + + while True: + if time.time() - start_time > timeout: + print("The subprocess has exceeded the time limit and was terminated.") + proc.terminate() + break + + if proc.poll() is not None: + print("The subprocess has finished running.") + break + + +def run_agent_function(config, task): + timeout = ( + config["cutoff"]["count"] if config["cutoff"]["type"] == "time" else sys.maxsize + ) + print( + f"Running Python function '{config['agent']['function']}' with timeout {timeout}" + ) + + parent_conn, child_conn = Pipe() + process = Process(target=run_specific_agent, args=(task, child_conn)) + process.start() + start_time = time.time() + + while True: + if parent_conn.poll(): # Check if there's a new message from the child process + response, 
cycle_count = parent_conn.recv() + print(f"Cycle {cycle_count}: {response}") + + if cycle_count >= config["cutoff"]["count"]: + print( + f"Cycle count has reached the limit of {config['cutoff']['count']}. Terminating." + ) + child_conn.send("terminate") + break + + if time.time() - start_time > timeout: + print("The Python function has exceeded the time limit and was terminated.") + child_conn.send( + "terminate" + ) # Send a termination signal to the child process + break + + if not process.is_alive(): + print("The Python function has finished running.") + break + + process.join() diff --git a/agbenchmark/benchmark.py b/agbenchmark/benchmark.py deleted file mode 100644 index 6dc3b231282..00000000000 --- a/agbenchmark/benchmark.py +++ /dev/null @@ -1,65 +0,0 @@ -import os -import sys -import pexpect as expect -from dotenv import load_dotenv - -load_dotenv() - - -def check_cycle_count(cycle_count: int, cutoff: int, proc): - """Increment, print, and check cycle count.""" - cycle_count += 1 - print(f"Cycle count: {cycle_count}") - if cycle_count >= cutoff: - proc.terminate(force=True) - return cycle_count - - -AGENT_NAME = os.getenv("AGENT_NAME") - - -def run_agnostic(config, task): - path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}") - - timeout = sys.maxsize - - if config["cutoff"]["type"] == "time": - timeout = config["cutoff"]["count"] or 60 - - # from pexpect.popen_spawn import PopenSpawn - - print(f"Running {task} with timeout {timeout}") - - # Starting the subprocess using pexpect - proc = expect.spawn("python", ["miniagi.py", task], timeout=timeout, cwd=path) - - print("proc", proc) - - cycle_count = 0 - - while True: - try: - # If we get the prompt for user input, we send "\n" - if config["cutoff"]["type"] == "user_input": - proc.expect([config["cutoff"]["user_prompt"]]) - proc.sendline(config["cutoff"]["user_input"]) - cycle_count = check_cycle_count( - cycle_count, config["cutoff"]["count"], proc - ) - elif config["cutoff"]["type"] == "cycle_count": - match = proc.expect([r"Cycle count: (\d+)"]) - if match is not None: - cycle_count = int(match.group(1)) # type: ignore - cycle_count = check_cycle_count( - cycle_count, config["cutoff"]["count"], proc - ) - - # for cutoff type "time", just let it run until timeout - except expect.TIMEOUT: - print("The subprocess has exceeded the time limit and was terminated.") - break - except expect.EOF: - print("The subprocess has finished running.") - break - - proc.close() diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 0bd907d8a0b..b679a731dc9 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -1,6 +1,4 @@ -import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge -from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os @@ -10,8 +8,9 @@ class TestRetrieval1(RetrievalChallenge): def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "r1_data.json") - def test_method(self, workspace): - files_contents = self.open_files(workspace, self.data.ground.files) + def test_method(self, config): + self.setup_challenge(config) + files_contents = self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/config.json b/agbenchmark/config.json index d95b8e44399..7388085dc89 100644 --- a/agbenchmark/config.json +++ 
b/agbenchmark/config.json @@ -1,9 +1,10 @@ { "workspace": "C:\\Users\\silen\\miniagi", - "cutoff": { - "type": "time", - "user_prompt": "Press enter to continue or abort this action by typing feedback:", + "agent": { + "type": "script", + "path": "", + "script": "python miniagi.py {}", "user_input": "\n", - "count": 5 + "cutoff": 60 } } diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 25510e42b67..0f1fc7bb2f3 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -2,11 +2,7 @@ import os import pytest import shutil -import subprocess -import sys from agbenchmark.tests.regression.RegressionManager import RegressionManager -from agbenchmark.mocks.MockManager import MockManager -from agbenchmark.benchmark import run_agnostic @pytest.fixture(scope="module") @@ -41,29 +37,6 @@ def pytest_addoption(parser): parser.addoption("--mock", action="store_true", default=False) -@pytest.fixture(autouse=True) -def run_agent(request, config): - """Calling to get a response""" - if isinstance(request.param, tuple): - task = request.param[0] # The task is passed in indirectly - mock_function_name = request.param[1] or None - else: - task = request.param - mock_function_name = None - - if mock_function_name != None and (request.config.getoption("--mock")): - if mock_function_name: - mock_manager = MockManager( - task - ) # workspace doesn't need to be passed in, stays the same - print("Server unavailable, using mock", mock_function_name) - mock_manager.delegate(mock_function_name) - else: - print("No mock provided") - else: - run_agnostic(config, task) - - regression_json = "agbenchmark/tests/regression/regression_tests.json" regression_manager = RegressionManager(regression_json) @@ -120,13 +93,3 @@ def pytest_generate_tests(metafunc): # Add the parameters to the test function metafunc.parametrize("challenge_data", [params], indirect=True) - - if "run_agent" in metafunc.fixturenames: - # Get the instance of the test class - test_class = metafunc.cls() - - # Generate the parameters - params = [(test_class.task, test_class.mock)] - - # Add the parameters to the test function - metafunc.parametrize("run_agent", params, indirect=True) diff --git a/agbenchmark/mocks/workspace/file_to_check.txt b/agbenchmark/mocks/workspace/file_to_check.txt new file mode 100644 index 00000000000..48dc8cff1a4 --- /dev/null +++ b/agbenchmark/mocks/workspace/file_to_check.txt @@ -0,0 +1 @@ +Washington DC is the capital of the United States of America \ No newline at end of file diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index f99ae608c82..c0aaa7f933b 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -17,10 +17,9 @@ def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "r_file_data.json") @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file") - def test_method( - self, workspace - ): # run_test is a common name that all tests must implement - files_contents = self.open_files(workspace, self.data.ground.files) + def test_method(self, config): + self.setup_challenge(config) + files_contents = self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py 
b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 39c73b163dd..306375ddd3c 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -10,9 +10,9 @@ def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "w_file_data.json") @pytest.mark.depends(on=[], name="basic_write_file") - def test_method(self, workspace): - print("my workspace is ", workspace) - files_contents = self.open_files(workspace, self.data.ground.files) + def test_method(self, config): + self.setup_challenge(config) + files_contents = self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index 8a6278fea13..d13b763c7cc 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -2,13 +2,6 @@ "TestWriteFile": { "difficulty": "basic", "dependencies": [], - "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]" - }, - "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "basic_write_file" - ], - "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]" + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0]" } } \ No newline at end of file diff --git a/agent/hook.py b/agent/hook.py new file mode 100644 index 00000000000..6fa5341800b --- /dev/null +++ b/agent/hook.py @@ -0,0 +1,10 @@ +async def run_specific_agent(task, conn): + while ( + not conn.poll() + ): # Check if there's a termination signal from the main process + response, cycle_count = await run_agent( + task + ) # run the agent and get the response and cycle count + + # Send response and cycle count back to the main process + conn.send((response, cycle_count)) diff --git a/pyproject.toml b/pyproject.toml index af9688d1432..043fe68a2a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,8 +16,6 @@ openai = "^0.27.8" pydantic = "^1.10.9" pytest-depends = "^1.0.1" python-dotenv = "^1.0.0" -pexpect = "^4.8.0" -wexpect = "^4.0.0" [build-system] From 7c352b745ec90486826289ed735800197e95cd80 Mon Sep 17 00:00:00 2001 From: Silen Naihin <silen.naihin@gmail.com> Date: Fri, 30 Jun 2023 11:55:43 -0400 Subject: [PATCH 4/4] integrate config, agent_interface just func, hook --- agbenchmark/Challenge.py | 5 +- agbenchmark/agent_interface.py | 118 ++++++------------ agbenchmark/config.json | 9 +- agbenchmark/start_benchmark.py | 12 +- .../tests/regression/regression_tests.json | 8 +- agent/benchmarks.py | 15 +++ agent/hook.py | 10 -- 7 files changed, 70 insertions(+), 107 deletions(-) create mode 100644 agent/benchmarks.py delete mode 100644 agent/hook.py diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index 7b1e4df0425..d7a2bdc9b38 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -23,6 +23,7 @@ def get_file_path(self) -> str: @property def data(self) -> ChallengeData: + # TODO: make it so that this is cached somewhere to just call self.deserialized_data return ChallengeData.deserialize(self.get_file_path()) @property @@ -37,19 +38,15 @@ def task(self): @property def dependencies(self) -> list: - 
print("self.data.dependencies", self.data.dependencies) return self.data.dependencies def setup_challenge(self, config): from agbenchmark.agent_interface import run_agent - print("SETTING UP CHALLENGE...") - run_agent(self.task, self.mock, config) @property def name(self) -> str: - print("self.data.name", self.data.name) return self.data.name @pytest.mark.parametrize( diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index eba26fc189d..2ff2acf3011 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -1,12 +1,9 @@ import os -import sys -import subprocess +import importlib import time from agbenchmark.mocks.MockManager import MockManager from multiprocessing import Process, Pipe -from agent.hook import run_specific_agent - from dotenv import load_dotenv load_dotenv() @@ -26,83 +23,48 @@ def run_agent(task, mock_func, config): print("Server unavailable, using mock", mock_func) mock_manager.delegate(mock_func) else: - if config["agent"]["type"] == "python": - run_agent_function(config, task) - elif config["agent"]["type"] == "script": - run_agent_command(config, task) - - -ENVIRONMENT = os.getenv("ENVIRONMENT") or "production" - - -def run_agent_command(config, task): - path = config["agent"]["path"] - - if ENVIRONMENT == "local": - AGENT_NAME = os.getenv("AGENT_NAME") - path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}") - - timeout = config["agent"]["cutoff"] or sys.maxsize - print(f"Running {task} with timeout {timeout}") - - command_from_config = config["agent"]["script"] - command_list = command_from_config.split() - - # replace '{}' with the task - command_list = [cmd if cmd != "{}" else task for cmd in command_list] - print("path, command_list", path, command_list) - start_time = time.time() - proc = subprocess.Popen( - command_list, - cwd=path, - shell=True, - ) - - while True: - if time.time() - start_time > timeout: - print("The subprocess has exceeded the time limit and was terminated.") - proc.terminate() - break - - if proc.poll() is not None: - print("The subprocess has finished running.") - break - - -def run_agent_function(config, task): - timeout = ( - config["cutoff"]["count"] if config["cutoff"]["type"] == "time" else sys.maxsize - ) - print( - f"Running Python function '{config['agent']['function']}' with timeout {timeout}" - ) - - parent_conn, child_conn = Pipe() - process = Process(target=run_specific_agent, args=(task, child_conn)) - process.start() - start_time = time.time() - - while True: - if parent_conn.poll(): # Check if there's a new message from the child process - response, cycle_count = parent_conn.recv() - print(f"Cycle {cycle_count}: {response}") - - if cycle_count >= config["cutoff"]["count"]: + timeout = config["cutoff"] + print(f"Running Python function '{config['func_path']}' with timeout {timeout}") + + parent_conn, child_conn = Pipe() + + # Import the specific agent dynamically + module_name = config["func_path"].replace("/", ".").rstrip(".py") + module = importlib.import_module(module_name) + run_specific_agent = getattr(module, "run_specific_agent") + + process = Process(target=run_specific_agent, args=(task, child_conn)) + process.start() + start_time = time.time() + + while True: + if ( + parent_conn.poll() + ): # Check if there's a new message from the child process + response, cycle_count = parent_conn.recv() + print(f"Cycle {cycle_count}: {response}") + + if cycle_count >= config["cutoff"]: + print( + f"Cycle count has reached the limit of {config['cutoff']}. Terminating." 
+ ) + child_conn.send("terminate") + break + + if time.time() - start_time > timeout: print( - f"Cycle count has reached the limit of {config['cutoff']['count']}. Terminating." + "The Python function has exceeded the time limit and was terminated." ) - child_conn.send("terminate") + child_conn.send( + "terminate" + ) # Send a termination signal to the child process break - if time.time() - start_time > timeout: - print("The Python function has exceeded the time limit and was terminated.") - child_conn.send( - "terminate" - ) # Send a termination signal to the child process - break + if not process.is_alive(): + print("The Python function has finished running.") + break - if not process.is_alive(): - print("The Python function has finished running.") - break + process.join() - process.join() + +ENVIRONMENT = os.getenv("ENVIRONMENT") or "production" diff --git a/agbenchmark/config.json index 7388085dc89..d9b42ca4283 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,10 +1,5 @@ { "workspace": "C:\\Users\\silen\\miniagi", - "agent": { - "type": "script", - "path": "", - "script": "python miniagi.py {}", - "user_input": "\n", - "cutoff": 60 - } + "func_path": "agent/benchmarks.py", + "cutoff": 60 } diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index c9f3643cc02..fe395cd2169 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -29,7 +29,17 @@ def start(category, noreg, mock): config["workspace"] = click.prompt( "Please enter a new workspace path", - default=os.path.join(Path.home(), "miniagi"), + default=os.path.join(Path.home(), "workspace"), + ) + + config["func_path"] = click.prompt( + "Please enter the path to your run_specific_agent function implementation", + default="/benchmarks.py", + ) + + config["cutoff"] = click.prompt( + "Please enter a hard cutoff runtime for your agent", + default=60, ) with open(config_dir, "w") as f: diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index d13b763c7cc..9e26dfeeb6e 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -1,7 +1 @@ -{ - "TestWriteFile": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0]" - } -} \ No newline at end of file +{} \ No newline at end of file diff --git a/agent/benchmarks.py b/agent/benchmarks.py new file mode 100644 index 00000000000..eb66412c143 --- /dev/null +++ b/agent/benchmarks.py @@ -0,0 +1,15 @@ +# import subprocess + + +def run_specific_agent(task, conn): + cycle_count = 0 + while ( + not conn.poll() + ): # Check if there's a termination signal from the main process + response = run_agent(task) # run the agent and get the response + + if response: + cycle_count += 1 + + # Send response and cycle count back to the main process + conn.send((response, cycle_count)) diff --git a/agent/hook.py b/agent/hook.py deleted file mode 100644 index 6fa5341800b..00000000000 --- a/agent/hook.py +++ /dev/null @@ -1,10 +0,0 @@ -async def run_specific_agent(task, conn): - while ( - not conn.poll() - ): # Check if there's a termination signal from the main process - response, cycle_count = await run_agent( - task - ) # run the agent and get the response and cycle count - - # Send response and cycle count back to the main process - conn.send((response,
cycle_count))