Skip to content

Commit

Permalink
Add a test case for single-dimension (goal-only) evaluation
Browse files (browse the repository at this point in the history)
  • Loading branch information
bugsz committed Jun 23, 2024
1 parent 8d9b9be commit ab57964
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 5 deletions.
28 changes: 24 additions & 4 deletions sotopia/envs/evaluators.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,20 @@ def minus_ten_to_zero_validator(cls, v: tuple[str, int]) -> tuple[str, int]:
return v


# Evaluation schema with a single dimension, ``goal``.
#
# ``goal`` is a (text, integer) tuple; the validator below checks the integer
# at index 1 against the inclusive range 0..10 (presumably index 0 is the
# reasoning text the description asks for -- confirm against the prompt).
#
# NOTE: documented with comments rather than a class docstring on purpose:
# pydantic copies a model's docstring into its JSON schema, which would change
# any format instructions generated from this class.
class EvaluationGoalOnly(BaseModel):
    goal: tuple[str, int] = Field(
        ...,
        description="Please first reiterate agent's social goals. "
        "And then please provide a comprehensive analysis about the extent to which the agent has managed to achieve these goals. "
        "In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from 0 and 10 in the 'score' field. 0 represents minimal goals achievement, 10 represents complete goal achievement, and a higher score indicates that the agent is making progress towards their social goals.",
    )

    @validator("goal")
    def zero_to_ten_validator(cls, v: tuple[str, int]) -> tuple[str, int]:
        """Reject a goal score outside the inclusive range 0..10.

        Args:
            v: The ``goal`` tuple; only the integer at index 1 is checked.

        Returns:
            ``v`` unchanged when the score is in range.

        Raises:
            ValueError: If the score is below 0 or above 10. Raised explicitly
                instead of using ``assert`` so the check still runs when
                Python is started with ``-O`` (which strips assert statements).
        """
        score = v[1]
        if not 0 <= score <= 10:
            raise ValueError(
                f"goal score must be between 0 and 10 inclusive, got {score}"
            )
        return v


# Response schema holding one EvaluationBySocialDimensions per agent.
# __acall__ picks this class as the response format when response_format is
# neither "plus" nor "goal_only".
class EnvResponse(BaseModel):
# Evaluation of the first agent.
agent_1_evaluation: EvaluationBySocialDimensions
# Evaluation of the second agent.
agent_2_evaluation: EvaluationBySocialDimensions
Expand All @@ -153,6 +167,11 @@ class EnvResponsePlus(BaseModel):
agent_2_evaluation: EvaluationBySocialDimensionsPlus


# Response schema holding one EvaluationGoalOnly (goal dimension only) per
# agent. __acall__ picks this class as the response format when
# response_format == "goal_only".
class EnvResponseGoalOnly(BaseModel):
# Goal-only evaluation of the first agent.
agent_1_evaluation: EvaluationGoalOnly
# Goal-only evaluation of the second agent.
agent_2_evaluation: EvaluationGoalOnly


class Evaluator(abc.ABC):
def __init__(self) -> None:
pass
Expand Down Expand Up @@ -268,7 +287,8 @@ async def __acall__(
response_format_class = (
EnvResponsePlus if self.response_format == "plus" else EnvResponse
)

if self.response_format == "goal_only":
response_format_class = EnvResponseGoalOnly
try:
response: (
EnvResponsePlus | EnvResponse
Expand All @@ -281,9 +301,9 @@ async def __acall__(
{format_instructions}
""",
input_values=dict(history=history),
output_parser=PydanticOutputParser[EnvResponsePlus | EnvResponse](
pydantic_object=response_format_class
),
output_parser=PydanticOutputParser[
EnvResponsePlus | EnvResponse | EnvResponseGoalOnly
](pydantic_object=response_format_class),
temperature=temperature,
)
response_list = []
Expand Down
64 changes: 63 additions & 1 deletion tests/envs/test_evaluators.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
RuleBasedTerminatedEvaluator,
unweighted_aggregate_evaluate,
)
from sotopia.messages import AgentAction, Observation
from sotopia.messages import AgentAction, Observation, ScriptBackground, SimpleMessage


def test_rule_based_teminated_evaluator() -> None:
Expand Down Expand Up @@ -173,3 +173,65 @@ async def test_reach_goal_llm_evaluator_async() -> None:
assert isinstance(response2[8][1][0][1], int)
assert isinstance(response2[9][1][0][1], int)
assert response2[2][1][0][1] > response2[3][1][0][1]


@pytest.mark.asyncio
async def test_reach_goal_llm_evaluator_goalonly_async() -> None:
    """Run the goal-only evaluator on a short scripted dialogue.

    Builds a four-turn conversation in which one speaker is polite and the
    other is rude and then leaves, evaluates it with
    ``ReachGoalLLMEvaluator(..., response_format="goal_only")``, and checks the
    structure of the returned scores.

    NOTE(review): this uses the "gpt-4" model name, so it presumably needs
    network access and API credentials -- confirm how CI handles that.
    """
    evaluator = ReachGoalLLMEvaluator("gpt-4", response_format="goal_only")
    background = ScriptBackground(
        scenario="Conversation between two friends at a trivia night",
        p1_name="Samuel Anderson",
        p2_name="Giselle Rousseau",
        p1_background="Samuel Anderson is a 29-year-old male software developer. He/him pronouns. Samuel Anderson can cook very well. Personality and values description: Samuel Anderson, though somewhat impulsive and free-spirited, values enjoyment. His decision-making is often spontaneous, staying within familiar boundaries. Samuel's secrets: He was once a competitive figure skater.",
        p2_background="Giselle Rousseau is a 21-year-old nonbinary art student. They/them pronouns. Giselle Rousseau enjoys biking and photography. Personality and values description: Giselle Rousseau, open-minded and outgoing yet sensitive, advocates care and fairness. Her decision-making is intuitive and inclusive. Giselle's secrets: Sells forged paintings to wealthy clients",
        p1_goal="Greet your friends and be polite",
        p2_goal="Be rude and dismissive to your friends",
    )

    # Scripted history: background first, then alternating turn markers and
    # agent actions, ending with the second speaker leaving.
    messages = [
        ("Environment", background),
        ("Environment", SimpleMessage(message="Turn #1")),
        (
            "Alice",
            AgentAction(action_type="speak", argument="Thank you so much!"),
        ),
        ("Environment", SimpleMessage(message="Turn #2")),
        (
            "Bob",
            AgentAction(action_type="speak", argument="Fuck you!"),
        ),
        ("Environment", SimpleMessage(message="Turn #3")),
        (
            "Alice",
            AgentAction(
                action_type="speak", argument="Hope you have a great weekend."
            ),
        ),
        ("Environment", SimpleMessage(message="Turn #4")),
        (
            "Bob",
            AgentAction(action_type="leave", argument="Leave"),
        ),
    ]

    # Await the single evaluation directly: asyncio.gather() over one
    # coroutine would wrap the result in an extra one-element list.
    response2 = await evaluator.__acall__(1, messages)
    print("---------------------")
    print("Response after 2 turns:", response2)

    # The evaluator must produce at least one evaluation entry.
    assert response2

    # Both agents arguably achieve their stated goals here, so no ordering
    # between their scores is asserted; only structure and range are checked.
    # Assumes goal-only entries use the same nesting as the other response
    # formats, where entry[1][0][1] is the integer score -- TODO confirm.
    for entry in response2:
        score = entry[1][0][1]
        assert isinstance(score, int)
        assert 0 <= score <= 10

0 comments on commit ab57964

Please sign in to comment.