Skip to content

Commit

Permalink
Exported #148 contents
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesbraza committed Dec 18, 2024
1 parent c705773 commit 9653adb
Show file tree
Hide file tree
Showing 7 changed files with 491 additions and 2 deletions.
2 changes: 2 additions & 0 deletions src/aviary/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
EvalAnswerMode,
encode_image_to_base64,
eval_answer,
extract_answer,
is_coroutine_callable,
partial_format,
)
Expand Down Expand Up @@ -82,6 +83,7 @@
"encode_image_to_base64",
"eval_answer",
"eval_answer",
"extract_answer",
"fenv",
"is_coroutine_callable",
"join",
Expand Down
39 changes: 38 additions & 1 deletion src/aviary/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@


DEFAULT_EVAL_MODEL_NAME = "gpt-4o"
LLM_BOOL_EVAL_CONFIG = {
LLM_BOOL_EVAL_CONFIG: dict[str, Any] = {
"prompt": (
"Here is a question, the correct answer to the question, and a proposed answer"
" to the question. Please tell me if the proposed answer is correct, given the"
Expand All @@ -35,6 +35,18 @@
"temperature": 0,
}

# Config for extract_answer: inherits model/temperature from
# LLM_BOOL_EVAL_CONFIG and overrides only the prompt. Annotated to match
# the sibling configs (see LLM_BOOL_EVAL_CONFIG above).
LLM_EXTRACT_CONFIG: dict[str, Any] = LLM_BOOL_EVAL_CONFIG | {
    "prompt": (
        "You are evaluating answers for a test which has fixed options. "
        "Repeat back which option the proposed answer matches. "
        "GIVE ONLY THE VERBATIM TEXT OF A FIXED OPTION. "
        "If the proposed answer is empty, invalid, or ambiguous, "
        "return an empty string."
        "\n\nOptions:\n{options}"
        "\n\nProposed answer: {proposed_answer}"
    )
}

LLM_SCORE_EVAL_CONFIG = LLM_BOOL_EVAL_CONFIG | {
"prompt": (
"Here is a question, the correct answer to the question, and a rubric for"
Expand Down Expand Up @@ -175,6 +187,31 @@ async def eval_answer(
raise RuntimeError(f"Invalid evaluation mode: {eval_mode}")


async def extract_answer(
proposed_answer: str, options: Sequence[str], llm_eval_config: dict | None = None
) -> str | None:
"""Extract the answer matching a proposal from a list of options using an LLM."""
for option in options:
if proposed_answer.strip().casefold() == option.strip().casefold():
return option

default_config = LLM_EXTRACT_CONFIG
config = llm_eval_config or default_config
response_msg = await run_prompt(
prompt=config.get("prompt", default_config["prompt"]).format(
options="\n".join(options),
proposed_answer=proposed_answer,
),
model=config.get("model", default_config["model"]),
temperature=config.get("temperature", default_config["temperature"]),
)
answer = response_msg.strip().casefold()
for option in options:
if answer == option.strip().casefold():
return option
return None


_CAPITAL_A_INDEX = ord("A")


Expand Down
109 changes: 109 additions & 0 deletions tests/cassettes/test_extract_answer[complex].yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
interactions:
- request:
body:
'{"messages": [{"content": "You are evaluating answers for a test which
has fixed options. Repeat back which option the proposed answer matches. GIVE
ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
or ambiguous, return an empty string.\n\nOptions:\nEconomic factors\nSocial
unrest\nPolitical corruption\n\nProposed answer: Based on the context given,
Serif et al. (2026) claim that the overwhelming cause of regime collapse arises
from economic factors. Yet, most other scholars (Gerald and Robinson for example)
believe the collapse was due to social unrest because of the prolonged epidemic
of 2025. I tend to agree with the majority - although I can see both sides.
Thus my response is that the social unrest was the significant factor in the
collapse of the regime.", "role": "user"}], "model": "gpt-4o", "temperature":
0}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- "861"
content-type:
- application/json
host:
- api.openai.com
user-agent:
- AsyncOpenAI/Python 1.57.4
x-stainless-arch:
- arm64
x-stainless-async:
- async:asyncio
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.57.4
x-stainless-raw-response:
- "true"
x-stainless-retry-count:
- "1"
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.7
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAAwAAAP//jJI/T8MwEMX3fArLc4Oa9E9KtrJ1REyAUOTal8TF8RnbgaKq3x05TZNW
FInFw/3uPb87+xARQqWgOaG8Zp43RsXr8nP/9VCtNKzbGuYvJSab3cdj+rySiw2dBAVud8D9WXXH
sTEKvER9wtwC8xBck2w2Xyyy5SztQIMCVJBVxsdzjNNpOo+nq3i67IU1Sg6O5uQ1IoSQQ3eGiFrA
nuZkOjlXGnCOVUDzoYkQalGFCmXOSeeZ9nQyQo7ag+5SPyGXTJFWW3BXPRbK1rEQUbdK9fXjcKnC
yljcup4P9VJq6erCAnOowwXOo6EdPUaEvHXDtVd5qbHYGF94fAcdDJPl4uRHx3WONO2ZR8/UpSib
3LArBHgmlbvYDuWM1yBG6bhK1gqJFyC6GPp3mFvep8Glrv5jPwLOwXgQhbEgJL8eeGyzED7bX23D
krvA1H07D01RSl2BNVae3rs0BcvuxUownpQ0OkY/AAAA//8DAOEzla34AgAA
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8f42461018c5eb29-SJC
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Wed, 18 Dec 2024 21:33:52 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
openai-organization:
- future-house-xr4tdh
openai-processing-ms:
- "235"
openai-version:
- "2020-10-01"
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- "10000"
x-ratelimit-limit-tokens:
- "30000000"
x-ratelimit-remaining-requests:
- "9999"
x-ratelimit-remaining-tokens:
- "29999790"
x-ratelimit-reset-requests:
- 6ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_366dfd5f505d08facd0f7d10e64a9f5e
status:
code: 200
message: OK
version: 1
102 changes: 102 additions & 0 deletions tests/cassettes/test_extract_answer[empty-proposal].yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
interactions:
- request:
body:
'{"messages": [{"content": "You are evaluating answers for a test which
has fixed options. Repeat back which option the proposed answer matches. GIVE
ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
or ambiguous, return an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer:
", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- "369"
content-type:
- application/json
host:
- api.openai.com
user-agent:
- AsyncOpenAI/Python 1.57.4
x-stainless-arch:
- arm64
x-stainless-async:
- async:asyncio
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.57.4
x-stainless-raw-response:
- "true"
x-stainless-retry-count:
- "1"
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.7
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAAwAAAP//jFLLTsMwELznK6w9Nyht0pb2xokLIEDiAkKRa29Sg+O1bBcFqv47cpq+
1CJx8WFmZzyz9jphDJSEOQOx5EE0Vqc31Vfb3mXF/fOoeDG3Ol+0D4WZPD79uOoVBlFBiw8UYae6
EtRYjUGR2dLCIQ8YXYfTvBiPp5M874iGJOooq21IC0pH2ahIs+s0m/TCJSmBHubsLWGMsXV3xohG
Ygtzlg12SIPe8xphvh9iDBzpiAD3XvnATYDBgRRkApou9THssFp5HlOZldY9vtnfo6m2jha+5/d4
pYzyy9Ih92Sipw9koWM3CWPvXZ/VSUSwjhobykCfaKLhdLi1g8MCD2RfFQIFri9oTsxKiYEr7Y/W
AYKLJcozQ8aAr6SiIyI5qnye5ZL3trYy9X/sD4QQaAPK0jqUSlzs25nH3/XX2H7FXWDw3z5gU1bK
1OisU9sHrmw5q/iCz6osv4Zkk/wCAAD//wMA7iORIukCAAA=
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8f424615ca81eb32-SJC
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Wed, 18 Dec 2024 21:33:53 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
openai-organization:
- future-house-xr4tdh
openai-processing-ms:
- "171"
openai-version:
- "2020-10-01"
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- "10000"
x-ratelimit-limit-tokens:
- "30000000"
x-ratelimit-remaining-requests:
- "9999"
x-ratelimit-remaining-tokens:
- "29999912"
x-ratelimit-reset-requests:
- 6ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_de2070d3e02afd584ac618042c22382d
status:
code: 200
message: OK
version: 1
102 changes: 102 additions & 0 deletions tests/cassettes/test_extract_answer[gave-two].yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
interactions:
- request:
body:
'{"messages": [{"content": "You are evaluating answers for a test which
has fixed options. Repeat back which option the proposed answer matches. GIVE
ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
or ambiguous, return an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer:
A or B", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- "375"
content-type:
- application/json
host:
- api.openai.com
user-agent:
- AsyncOpenAI/Python 1.57.4
x-stainless-arch:
- arm64
x-stainless-async:
- async:asyncio
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.57.4
x-stainless-raw-response:
- "true"
x-stainless-retry-count:
- "1"
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.7
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAA4xSTWsCMRS8768I7+wWv7XeChVKoR4KPbSlLDF5u5s2m5cmWVHE/16yWl3RQi85
zLyZzLxkmzAGSsKMgSh5EJXV6V2+Wq82z+Ut0n3xstBv8+H8cWFK9/r08A2dqKDlJ4rwq7oRVFmN
QZHZ08IhDxhde5PBcDSajAe9hqhIoo6ywoZ0SGm/2x+m3WnaHR+EJSmBHmbsPWGMsW1zxohG4hpm
rNv5RSr0nhcIs+MQY+BIRwS498oHbgJ0TqQgE9A0qduww7z2PKYytdYHfHe8R1NhHS39gT/iuTLK
l5lD7slETx/IQsPuEsY+mj71WUSwjiobskBfaKLhZLC3g9MCT+ShKgQKXF/RnJllEgNX2rfWAYKL
EuWFIWPAa6moRSStypdZrnnvaytT/Mf+RAiBNqDMrEOpxNW+jXn8XX+NHVfcBAa/8QGrLFemQGed
2j9wbjPZny4Fx8m0D8ku+QEAAP//AwDYqi3B6QIAAA==
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8f42460a68a467f1-SJC
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Wed, 18 Dec 2024 21:33:51 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
openai-organization:
- future-house-xr4tdh
openai-processing-ms:
- "241"
openai-version:
- "2020-10-01"
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- "10000"
x-ratelimit-limit-tokens:
- "30000000"
x-ratelimit-remaining-requests:
- "9999"
x-ratelimit-remaining-tokens:
- "29999911"
x-ratelimit-reset-requests:
- 6ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_83d07d0983e1d4d1995bfa068db503dd
status:
code: 200
message: OK
version: 1
Loading

0 comments on commit 9653adb

Please sign in to comment.