Moved to extract_answer from #148 and back to gpt-4o-mini #161

Merged
merged 3 commits on Dec 18, 2024
2 changes: 2 additions & 0 deletions src/aviary/core.py
@@ -40,6 +40,7 @@
     EvalAnswerMode,
     encode_image_to_base64,
     eval_answer,
+    extract_answer,
     is_coroutine_callable,
     partial_format,
 )
@@ -82,6 +83,7 @@
     "encode_image_to_base64",
     "eval_answer",
+    "extract_answer",
     "fenv",
     "is_coroutine_callable",
     "join",
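With this change, extract_answer is imported in core.py and re-exported via __all__, so it can be pulled straight from aviary.core. A minimal usage sketch, assuming an installed aviary package (the option strings are invented for illustration; only an exact option match avoids the LLM call, anything else needs an OpenAI key for the gpt-4o-mini evaluator):

import asyncio

from aviary.core import extract_answer

async def main() -> None:
    # Exact (case-insensitive) matches are resolved locally, without calling the LLM.
    match = await extract_answer(proposed_answer="42", options=["-84", "cheesecake", "11", "42"])
    print(match)  # "42"
    # A free-form proposal like "the answer is forty-two" would instead be routed
    # through the prompt defined in LLM_EXTRACT_CONFIG below.

asyncio.run(main())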
105 changes: 57 additions & 48 deletions src/aviary/utils.py
@@ -3,10 +3,9 @@
 import inspect
 import io
 import random
-import re
 import string
 from ast import literal_eval
-from collections.abc import Awaitable, Callable, Sequence
+from collections.abc import Sequence
 from enum import StrEnum
 from typing import TYPE_CHECKING, Any, ClassVar, Literal, Self, cast

@@ -21,8 +20,8 @@
 import numpy as np


-DEFAULT_EVAL_MODEL_NAME = "gpt-4o"
-LLM_BOOL_EVAL_CONFIG = {
+DEFAULT_EVAL_MODEL_NAME = "gpt-4o-mini"
+LLM_BOOL_EVAL_CONFIG: dict[str, Any] = {
     "prompt": (
         "Here is a question, the correct answer to the question, and a proposed answer"
         " to the question. Please tell me if the proposed answer is correct, given the"
@@ -35,6 +34,18 @@
     "temperature": 0,
 }

+LLM_EXTRACT_CONFIG = LLM_BOOL_EVAL_CONFIG | {
+    "prompt": (
+        "You are evaluating answers for a test which has fixed options. "
+        "Repeat back which option the proposed answer matches. "
+        "GIVE ONLY THE VERBATIM TEXT OF A FIXED OPTION. "
+        "If the proposed answer is empty, invalid, or ambiguous, "
+        "return an empty string."
+        "\n\nOptions:\n{options}"
+        "\n\nProposed answer: {proposed_answer}"
+    )
+}
+
 LLM_SCORE_EVAL_CONFIG = LLM_BOOL_EVAL_CONFIG | {
     "prompt": (
         "Here is a question, the correct answer to the question, and a rubric for"
@@ -175,21 +186,36 @@ async def eval_answer(
     raise RuntimeError(f"Invalid evaluation mode: {eval_mode}")


+async def extract_answer(
+    proposed_answer: str, options: Sequence[str], llm_eval_config: dict | None = None
+) -> str | None:
+    """Extract the answer matching a proposal from a list of options using an LLM."""
+    for option in options:
+        if proposed_answer.strip().casefold() == option.strip().casefold():
+            return option
+
+    default_config = LLM_EXTRACT_CONFIG
+    config = llm_eval_config or default_config
+    response_msg = await run_prompt(
+        prompt=config.get("prompt", default_config["prompt"]).format(
+            options="\n".join(options),
+            proposed_answer=proposed_answer,
+        ),
+        model=config.get("model", default_config["model"]),
+        temperature=config.get("temperature", default_config["temperature"]),
+    )
+    answer = response_msg.strip().casefold()  # noqa: FURB184
+    for option in options:
+        if answer == option.strip().casefold():
+            return option
+    return None
+
+
 _CAPITAL_A_INDEX = ord("A")


 class MultipleChoiceQuestion(BaseModel):
     QUESTION_PROMPT_TEMPLATE: ClassVar[str] = "Q: {question}\n\nOptions:\n{options}"
-    # TODO: combine with above eval_answer and its prompts
-    EVALUATION_PROMPT_TEMPLATE: ClassVar[str] = (
-        "Given the following question and a proposed answer to the question, return the"
-        " single-letter choice in the question that matches the proposed answer."
-        " If the proposed answer is blank or an empty string,"
-        " or multiple options are matched, respond with '0'."
-        "\n\nQuestion: {qa_prompt}"
-        "\n\nProposed Answer: {qa_answer}"
-        "\n\nSingle Letter Answer:"
-    )
     DEFAULT_UNSURE_OPTION: ClassVar[str] = (
         "Insufficient information to answer this question"
     )
@@ -280,18 +306,14 @@ def split_options(options: str) -> list[str]:
         return split_options

     async def grade(
-        self, answer: str, prompt_runner: Callable[[str], Awaitable[str]] | None = None
-    ) -> "tuple[MultipleChoiceEvaluation, str, str]":
-        if prompt_runner is None:
-            prompt_runner = run_prompt
-        eval_prompt = self.EVALUATION_PROMPT_TEMPLATE.format(
-            qa_prompt=self.question_prompt, qa_answer=answer
-        )
-        raw_evaluation = await prompt_runner(eval_prompt)
Review comment (Collaborator Author): Also thanks to this change we no longer need raw_evaluation

-        evaluation, parsed_answer = MultipleChoiceEvaluation.from_answer(
-            raw_evaluation, self
+        self, proposed_answer: str
+    ) -> "tuple[MultipleChoiceEvaluation, str | None]":
+        extracted_answer = await extract_answer(
+            proposed_answer=proposed_answer, options=self.options
         )
-        return evaluation, raw_evaluation, parsed_answer
+        return MultipleChoiceEvaluation.from_answer(
+            extracted_answer, self
+        ), extracted_answer


 class MultipleChoiceEvaluation(StrEnum):
@@ -323,32 +345,19 @@ def calculate_accuracy_precision(

     @classmethod
     def from_answer(
-        cls, answer: str, question: MultipleChoiceQuestion
-    ) -> "tuple[MultipleChoiceEvaluation, str]":
+        cls, extracted_answer: str | None, question: MultipleChoiceQuestion
+    ) -> "MultipleChoiceEvaluation":
         """Make an evaluation from the input answer and multiple choice question.

         Returns:
-            Two-tuple of answer enum and the raw answer extracted from the input answer.
+            Evaluation corresponding to the parsed answer.
         """
-        # SEE: https://regex101.com/r/vcE9Hb/1
-        letter_search = re.search(r"([A-Z])\)?", answer, re.DOTALL)
-        # Get the letter answer, or fail over to the first non-whitespace char
-        answer_char = (
-            letter_search.group(1)
-            if letter_search is not None
-            else answer.split()[0][0].upper()
-        )
-        answer_letter_index = ord(answer_char[0]) - _CAPITAL_A_INDEX
-        if answer_letter_index < 0 or answer_letter_index > len(question.options):
-            # The result extracted was not in the options (e.g. '0')
-            return cls.INCORRECT, answer_char
+        if extracted_answer is None:
+            return MultipleChoiceEvaluation.INCORRECT
         # From here, if we don't match either the ideal or the unsure multiple choice
         # options then we declare the answer as incorrect.
-        if (
-            question.unsure_answer_index is not None
-            and answer_letter_index == question.unsure_answer_index
-        ):
-            return cls.UNSURE, cast(str, question.unsure_answer)
-        if answer_letter_index == question.ideal_answer_index:
-            return cls.CORRECT, question.ideal_answer
-        return cls.INCORRECT, question.options[answer_letter_index]
+        if extracted_answer == question.ideal_answer:
+            return MultipleChoiceEvaluation.CORRECT
+        if question.unsure_answer and extracted_answer == question.unsure_answer:
+            return MultipleChoiceEvaluation.UNSURE
+        return MultipleChoiceEvaluation.INCORRECT
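Pulling the utils.py changes together: grade now delegates option matching to extract_answer and returns a two-tuple of the evaluation and the extracted option (or None), instead of the old three-tuple that carried raw_evaluation. A rough usage sketch, assuming the MultipleChoiceQuestion constructor takes question, options, and ideal_answer fields (field names inferred from the surrounding diff, not confirmed by it; a non-exact proposal triggers the gpt-4o-mini evaluator, so an OpenAI key is needed at runtime):

import asyncio

from aviary.utils import MultipleChoiceEvaluation, MultipleChoiceQuestion

async def demo() -> None:
    mcq = MultipleChoiceQuestion(
        question="What is the meaning of life?",
        options=["-84", "cheesecake", "11", "42"],
        ideal_answer="42",
    )
    # The proposed answer is matched against the verbatim options by extract_answer,
    # then from_answer maps the match onto CORRECT / INCORRECT / UNSURE.
    evaluation, extracted = await mcq.grade("I believe the answer is 42")
    assert isinstance(evaluation, MultipleChoiceEvaluation)
    print(evaluation, extracted)

asyncio.run(demo())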
@@ -1,13 +1,12 @@
 interactions:
 - request:
     body:
-      '{"messages": [{"content": "Given the following question and a proposed
-        answer to the question, return the single-letter choice in the question that
-        matches the proposed answer. If the proposed answer is blank or an empty string,
-        or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is
-        the meaning of life?\n\nOptions:\nA) -84\nB) Insufficient information to answer
-        this question\nC) cheesecake\nD) 11\nE) 42\n\nProposed Answer: 14\n\nSingle
-        Letter Answer:", "role": "user"}], "model": "gpt-4o"}'
+      '{"messages": [{"content": "You are evaluating answers for a test which
+        has fixed options. Repeat back which option the proposed answer matches. GIVE
+        ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
+        or ambiguous, return an empty string.\n\nOptions:\n-84\nInsufficient information
+        to answer this question\ncheesecake\n11\n42\n\nProposed answer: 14", "role":
+        "user"}], "model": "gpt-4o-mini", "temperature": 0}'
     headers:
       accept:
       - application/json
@@ -16,7 +15,7 @@ interactions:
       connection:
       - keep-alive
       content-length:
-      - "513"
+      - "442"
       content-type:
       - application/json
       host:
@@ -36,7 +35,7 @@
       x-stainless-raw-response:
       - "true"
       x-stainless-retry-count:
-      - "1"
+      - "0"
       x-stainless-runtime:
       - CPython
       x-stainless-runtime-version:
@@ -46,28 +45,34 @@
   response:
     body:
       string: !!binary |
-        H4sIAAAAAAAAAwAAAP//jJJLa8MwEITv/hVC57goqamd3HpooPQBORVSilGktaNW1gpJoY+Q/15k
-        u7FDU+jFh/l2xrNr7xNCqJJ0QajY8iAaq9Prav24NO7rNmMKH55ulvd8VaxXHu/ejaOT6MDNK4jw
-        47oQ2FgNQaHpsHDAA8TUaX6ZZTnLi3kLGpSgo622Ic0wnbFZlrIiZVe9cYtKgKcL8pwQQsi+fcaK
-        RsIHXRA2+VEa8J7XQBfHIUKoQx0Vyr1XPnAT6GSAAk0A07ZmY91BtfM81jI7rXv9cHyRxto63Pie
-        H/VKGeW3pQPu0cRQH9DSlh4SQl7ahXYnHal12NhQBnwDEwOnrOjy6HDCEe1ZwMD12DSfnIkrJQSu
-        tB9dhAoutiAH63A+vpMKRyAZLf27zLnsbnFl6v/ED0AIsAFkaR1IJU4XHsYcxB/sr7HjkdvC1H/6
-        AE1ZKVODs05137iyJc/nspBcTCuaHJJvAAAA//8DAGY5XevsAgAA
+        H4sIAAAAAAAAAwAAAP//jFLLTsMwELznK6w9Nyh9P24V0AMHBBK9gFDk2pvU4NiWveVV9d+R00da
+        tUhcfJjZGc+svU4YAyVhwkAsOYnK6XRafF7fzuWPeLwZP4v7p5maTwd3GD6yB5pBKyrs4g0F7VVX
+        wlZOIylrtrTwyAmja3vY7fX7o357VBOVlaijrHSU9mxaKaPSTtbppdkwbY926qVVAgNM2EvCGGPr
+        +ow5jcQvmLCstUcqDIGXCJPDEGPgrY4I8BBUIG4IWg0prCE0dfRj2GOxCjxGMyutd/jmcI+2pfN2
+        EXb8AS+UUWGZe+TBmugZyDqo2U3C2GvdZ3USEZy3laOc7DuaaDjqb+2g2WJD7qoCWeL6gubELJdI
+        XOlwtA4QXCxRnhkyBnwllT0ikqPK51kueW9rK1P+x74hhEBHKHPnUSpxsW9tHr/YX2OHFdeBIXwH
+        wiovlCnRO6+2D1y4vDvmvUyMBzyDZJP8AgAA//8DADaBBszuAgAA
     headers:
       CF-Cache-Status:
       - DYNAMIC
       CF-RAY:
-      - 8f39fde1cf88cf1b-SJC
+      - 8f425bb2ac70f953-SJC
       Connection:
       - keep-alive
       Content-Encoding:
       - gzip
       Content-Type:
       - application/json
       Date:
-      - Tue, 17 Dec 2024 21:26:29 GMT
+      - Wed, 18 Dec 2024 21:48:38 GMT
       Server:
       - cloudflare
+      Set-Cookie:
+      - __cf_bm=Z3Wkkk2LQA2GKAPZVirKPYLTJfmm9Luttv26RxPBKro-1734558518-1.0.1.1-4BZR47qupd.QCWRMrfyj_F2lS0fqBEuzxwPZTqYPUxSKwdzL4S_8YWk9ofOPXhFEnkMN6nwgWjBLjAR4nioxiQ;
+        path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly;
+        Secure; SameSite=None
+      - _cfuvid=B7CeJKL1WXveU2pmeUGy_AFjPsbf25SvdiSN_4fxTXE-1734558518441-0.0.1.1-604800000;
+        path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
       Transfer-Encoding:
       - chunked
       X-Content-Type-Options:
@@ -79,25 +84,25 @@
       openai-organization:
       - future-house-xr4tdh
       openai-processing-ms:
-      - "363"
+      - "144"
      openai-version:
       - "2020-10-01"
       strict-transport-security:
       - max-age=31536000; includeSubDomains; preload
       x-ratelimit-limit-requests:
-      - "10000"
+      - "30000"
       x-ratelimit-limit-tokens:
-      - "30000000"
+      - "150000000"
       x-ratelimit-remaining-requests:
-      - "9999"
+      - "29999"
       x-ratelimit-remaining-tokens:
-      - "29999874"
+      - "149999896"
       x-ratelimit-reset-requests:
-      - 6ms
+      - 2ms
       x-ratelimit-reset-tokens:
       - 0s
       x-request-id:
-      - req_aff8daa48aa43d3df077f97da6136e5a
+      - req_503cd8163bd0d3b634eb723d6874b1da
     status:
       code: 200
       message: OK
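As a sanity check, the new request body recorded in this cassette is just LLM_EXTRACT_CONFIG's prompt formatted with the test's options and the proposed answer "14", sent to gpt-4o-mini at temperature 0. Roughly, as a sketch assuming LLM_EXTRACT_CONFIG is imported from aviary.utils:

from aviary.utils import LLM_EXTRACT_CONFIG

options = [
    "-84",
    "Insufficient information to answer this question",
    "cheesecake",
    "11",
    "42",
]
# Reproduces the "content" field of the recorded request body above.
prompt = LLM_EXTRACT_CONFIG["prompt"].format(
    options="\n".join(options),
    proposed_answer="14",
)
print(prompt)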