Skip to content

Commit

Permalink
Exported #148 contents
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesbraza committed Dec 18, 2024
1 parent c705773 commit 9653adb
Show file tree
Hide file tree
Showing 7 changed files with 491 additions and 2 deletions.
2 changes: 2 additions & 0 deletions src/aviary/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
EvalAnswerMode,
encode_image_to_base64,
eval_answer,
extract_answer,
is_coroutine_callable,
partial_format,
)
Expand Down Expand Up @@ -82,6 +83,7 @@
"encode_image_to_base64",
"eval_answer",
"eval_answer",
"extract_answer",
"fenv",
"is_coroutine_callable",
"join",
Expand Down
39 changes: 38 additions & 1 deletion src/aviary/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@


DEFAULT_EVAL_MODEL_NAME = "gpt-4o"
LLM_BOOL_EVAL_CONFIG = {
LLM_BOOL_EVAL_CONFIG: dict[str, Any] = {
"prompt": (
"Here is a question, the correct answer to the question, and a proposed answer"
" to the question. Please tell me if the proposed answer is correct, given the"
Expand All @@ -35,6 +35,18 @@
"temperature": 0,
}

# Config for extract_answer: inherits model/temperature from
# LLM_BOOL_EVAL_CONFIG and overrides only the prompt. Annotated to match
# the sibling configs (see LLM_BOOL_EVAL_CONFIG above).
LLM_EXTRACT_CONFIG: dict[str, Any] = LLM_BOOL_EVAL_CONFIG | {
    "prompt": (
        "You are evaluating answers for a test which has fixed options. "
        "Repeat back which option the proposed answer matches. "
        "GIVE ONLY THE VERBATIM TEXT OF A FIXED OPTION. "
        "If the proposed answer is empty, invalid, or ambiguous, "
        "return an empty string."
        "\n\nOptions:\n{options}"
        "\n\nProposed answer: {proposed_answer}"
    )
}

LLM_SCORE_EVAL_CONFIG = LLM_BOOL_EVAL_CONFIG | {
"prompt": (
"Here is a question, the correct answer to the question, and a rubric for"
Expand Down Expand Up @@ -175,6 +187,31 @@ async def eval_answer(
raise RuntimeError(f"Invalid evaluation mode: {eval_mode}")


async def extract_answer(
proposed_answer: str, options: Sequence[str], llm_eval_config: dict | None = None
) -> str | None:
"""Extract the answer matching a proposal from a list of options using an LLM."""
for option in options:
if proposed_answer.strip().casefold() == option.strip().casefold():
return option

default_config = LLM_EXTRACT_CONFIG
config = llm_eval_config or default_config
response_msg = await run_prompt(
prompt=config.get("prompt", default_config["prompt"]).format(
options="\n".join(options),
proposed_answer=proposed_answer,
),
model=config.get("model", default_config["model"]),
temperature=config.get("temperature", default_config["temperature"]),
)
answer = response_msg.strip().casefold()
for option in options:
if answer == option.strip().casefold():
return option
return None


_CAPITAL_A_INDEX = ord("A")


Expand Down
109 changes: 109 additions & 0 deletions tests/cassettes/test_extract_answer[complex].yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
interactions:
- request:
body:
'{"messages": [{"content": "You are evaluating answers for a test which
has fixed options. Repeat back which option the proposed answer matches. GIVE
ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
or ambiguous, return an empty string.\n\nOptions:\nEconomic factors\nSocial
unrest\nPolitical corruption\n\nProposed answer: Based on the context given,
Serif et al. (2026) claim that the overwhelming cause of regime collapse arises
from economic factors. Yet, most other scholars (Gerald and Robinson for example)
believe the collapse was due to social unrest because of the prolonged epidemic
of 2025. I tend to agree with the majority - although I can see both sides.
Thus my response is that the social unrest was the significant factor in the
collapse of the regime.", "role": "user"}], "model": "gpt-4o", "temperature":
0}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- "861"
content-type:
- application/json
host:
- api.openai.com
user-agent:
- AsyncOpenAI/Python 1.57.4
x-stainless-arch:
- arm64
x-stainless-async:
- async:asyncio
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.57.4
x-stainless-raw-response:
- "true"
x-stainless-retry-count:
- "1"
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.7
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAAwAAAP//jJI/T8MwEMX3fArLc4Oa9E9KtrJ1REyAUOTal8TF8RnbgaKq3x05TZNW
FInFw/3uPb87+xARQqWgOaG8Zp43RsXr8nP/9VCtNKzbGuYvJSab3cdj+rySiw2dBAVud8D9WXXH
sTEKvER9wtwC8xBck2w2Xyyy5SztQIMCVJBVxsdzjNNpOo+nq3i67IU1Sg6O5uQ1IoSQQ3eGiFrA
nuZkOjlXGnCOVUDzoYkQalGFCmXOSeeZ9nQyQo7ag+5SPyGXTJFWW3BXPRbK1rEQUbdK9fXjcKnC
yljcup4P9VJq6erCAnOowwXOo6EdPUaEvHXDtVd5qbHYGF94fAcdDJPl4uRHx3WONO2ZR8/UpSib
3LArBHgmlbvYDuWM1yBG6bhK1gqJFyC6GPp3mFvep8Glrv5jPwLOwXgQhbEgJL8eeGyzED7bX23D
krvA1H07D01RSl2BNVae3rs0BcvuxUownpQ0OkY/AAAA//8DAOEzla34AgAA
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8f42461018c5eb29-SJC
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Wed, 18 Dec 2024 21:33:52 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
openai-organization:
- future-house-xr4tdh
openai-processing-ms:
- "235"
openai-version:
- "2020-10-01"
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- "10000"
x-ratelimit-limit-tokens:
- "30000000"
x-ratelimit-remaining-requests:
- "9999"
x-ratelimit-remaining-tokens:
- "29999790"
x-ratelimit-reset-requests:
- 6ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_366dfd5f505d08facd0f7d10e64a9f5e
status:
code: 200
message: OK
version: 1
102 changes: 102 additions & 0 deletions tests/cassettes/test_extract_answer[empty-proposal].yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
interactions:
- request:
body:
'{"messages": [{"content": "You are evaluating answers for a test which
has fixed options. Repeat back which option the proposed answer matches. GIVE
ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
or ambiguous, return an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer:
", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- "369"
content-type:
- application/json
host:
- api.openai.com
user-agent:
- AsyncOpenAI/Python 1.57.4
x-stainless-arch:
- arm64
x-stainless-async:
- async:asyncio
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.57.4
x-stainless-raw-response:
- "true"
x-stainless-retry-count:
- "1"
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.7
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAAwAAAP//jFLLTsMwELznK6w9Nyht0pb2xokLIEDiAkKRa29Sg+O1bBcFqv47cpq+
1CJx8WFmZzyz9jphDJSEOQOx5EE0Vqc31Vfb3mXF/fOoeDG3Ol+0D4WZPD79uOoVBlFBiw8UYae6
EtRYjUGR2dLCIQ8YXYfTvBiPp5M874iGJOooq21IC0pH2ahIs+s0m/TCJSmBHubsLWGMsXV3xohG
Ygtzlg12SIPe8xphvh9iDBzpiAD3XvnATYDBgRRkApou9THssFp5HlOZldY9vtnfo6m2jha+5/d4
pYzyy9Ih92Sipw9koWM3CWPvXZ/VSUSwjhobykCfaKLhdLi1g8MCD2RfFQIFri9oTsxKiYEr7Y/W
AYKLJcozQ8aAr6SiIyI5qnye5ZL3trYy9X/sD4QQaAPK0jqUSlzs25nH3/XX2H7FXWDw3z5gU1bK
1OisU9sHrmw5q/iCz6osv4Zkk/wCAAD//wMA7iORIukCAAA=
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8f424615ca81eb32-SJC
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Wed, 18 Dec 2024 21:33:53 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
openai-organization:
- future-house-xr4tdh
openai-processing-ms:
- "171"
openai-version:
- "2020-10-01"
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- "10000"
x-ratelimit-limit-tokens:
- "30000000"
x-ratelimit-remaining-requests:
- "9999"
x-ratelimit-remaining-tokens:
- "29999912"
x-ratelimit-reset-requests:
- 6ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_de2070d3e02afd584ac618042c22382d
status:
code: 200
message: OK
version: 1
102 changes: 102 additions & 0 deletions tests/cassettes/test_extract_answer[gave-two].yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
interactions:
- request:
body:
'{"messages": [{"content": "You are evaluating answers for a test which
has fixed options. Repeat back which option the proposed answer matches. GIVE
ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
or ambiguous, return an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer:
A or B", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- "375"
content-type:
- application/json
host:
- api.openai.com
user-agent:
- AsyncOpenAI/Python 1.57.4
x-stainless-arch:
- arm64
x-stainless-async:
- async:asyncio
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.57.4
x-stainless-raw-response:
- "true"
x-stainless-retry-count:
- "1"
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.7
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAA4xSTWsCMRS8768I7+wWv7XeChVKoR4KPbSlLDF5u5s2m5cmWVHE/16yWl3RQi85
zLyZzLxkmzAGSsKMgSh5EJXV6V2+Wq82z+Ut0n3xstBv8+H8cWFK9/r08A2dqKDlJ4rwq7oRVFmN
QZHZ08IhDxhde5PBcDSajAe9hqhIoo6ywoZ0SGm/2x+m3WnaHR+EJSmBHmbsPWGMsW1zxohG4hpm
rNv5RSr0nhcIs+MQY+BIRwS498oHbgJ0TqQgE9A0qduww7z2PKYytdYHfHe8R1NhHS39gT/iuTLK
l5lD7slETx/IQsPuEsY+mj71WUSwjiobskBfaKLhZLC3g9MCT+ShKgQKXF/RnJllEgNX2rfWAYKL
EuWFIWPAa6moRSStypdZrnnvaytT/Mf+RAiBNqDMrEOpxNW+jXn8XX+NHVfcBAa/8QGrLFemQGed
2j9wbjPZny4Fx8m0D8ku+QEAAP//AwDYqi3B6QIAAA==
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8f42460a68a467f1-SJC
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Wed, 18 Dec 2024 21:33:51 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
openai-organization:
- future-house-xr4tdh
openai-processing-ms:
- "241"
openai-version:
- "2020-10-01"
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- "10000"
x-ratelimit-limit-tokens:
- "30000000"
x-ratelimit-remaining-requests:
- "9999"
x-ratelimit-remaining-tokens:
- "29999911"
x-ratelimit-reset-requests:
- 6ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_83d07d0983e1d4d1995bfa068db503dd
status:
code: 200
message: OK
version: 1
Loading

0 comments on commit 9653adb

Please sign in to comment.