From 5852aa1574e369f383931a8319abace4052eff79 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 4 Mar 2024 10:48:57 -0800 Subject: [PATCH 001/149] llm-claude-3 !stable-docs --- docs/plugins/directory.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/plugins/directory.md b/docs/plugins/directory.md index f241dc8a..f04d595f 100644 --- a/docs/plugins/directory.md +++ b/docs/plugins/directory.md @@ -19,7 +19,8 @@ These plugins can be used to interact with remotely hosted models via their API: - **[llm-mistral](https://github.com/simonw/llm-mistral)** adds support for [Mistral AI](https://mistral.ai/)'s language and embedding models. - **[llm-gemini](https://github.com/simonw/llm-gemini)** adds support for Google's [Gemini](https://ai.google.dev/docs) models. -- **[llm-claude](https://github.com/tomviner/llm-claude)** by Tom Viner adds support for Claude and Claude Instant by Anthropic. +- **[llm-claude](https://github.com/tomviner/llm-claude)** by Tom Viner adds support for Claude 2.1 and Claude Instant 2.1 by Anthropic. +- **[llm-claude-3](https://github.com/simonw/llm-claude-3)** supports Anthropic's [Claude 3 family](https://www.anthropic.com/news/claude-3-family) of models. - **[llm-anyscale-endpoints](https://github.com/simonw/llm-anyscale-endpoints)** supports models hosted on the [Anyscale Endpoints](https://app.endpoints.anyscale.com/) platform, including Llama 2 70B. - **[llm-replicate](https://github.com/simonw/llm-replicate)** adds support for remote models hosted on [Replicate](https://replicate.com/), including Llama 2 from Meta AI. - **[llm-palm](https://github.com/simonw/llm-palm)** adds support for Google's [PaLM 2 model](https://developers.generativeai.google/). From fb63c92cd27053700daa5420a0d1ad8fdfb718bd Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 4 Mar 2024 13:28:59 -0800 Subject: [PATCH 002/149] llm logs -r/--response option, closes #431 --- docs/help.md | 1 + docs/logging.md | 8 +++++++- llm/cli.py | 11 ++++++++++- tests/test_llm.py | 11 +++++++++++ 4 files changed, 29 insertions(+), 2 deletions(-) diff --git a/docs/help.md b/docs/help.md index 6813ef7a..9cd2927c 100644 --- a/docs/help.md +++ b/docs/help.md @@ -253,6 +253,7 @@ Options: -m, --model TEXT Filter by model or model alias -q, --query TEXT Search for logs matching this string -t, --truncate Truncate long strings in output + -r, --response Just output the last response -c, --current Show logs from the current conversation --cid, --conversation TEXT Show logs for this conversation ID --json Output logs as JSON diff --git a/docs/logging.md b/docs/logging.md index b24c71fe..ceca824f 100644 --- a/docs/logging.md +++ b/docs/logging.md @@ -54,7 +54,13 @@ You can view the logs using the `llm logs` command: ```bash llm logs ``` -This will output the three most recent logged items in Markdown format +This will output the three most recent logged items in Markdown format, showing both the prompt and the response formatted using Markdown. 
+ +To get back just the most recent prompt response as plain text, add `-r/--response`: + +```bash +llm logs -r +``` Add `--json` to get the log messages in JSON instead: diff --git a/llm/cli.py b/llm/cli.py index 7e72283a..966c2e47 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -628,6 +628,7 @@ def logs_turn_off(): @click.option("-m", "--model", help="Filter by model or model alias") @click.option("-q", "--query", help="Search for logs matching this string") @click.option("-t", "--truncate", is_flag=True, help="Truncate long strings in output") +@click.option("-r", "--response", is_flag=True, help="Just output the last response") @click.option( "current_conversation", "-c", @@ -654,6 +655,7 @@ def logs_list( model, query, truncate, + response, current_conversation, conversation_id, json_output, @@ -665,6 +667,9 @@ def logs_list( db = sqlite_utils.Database(path) migrate(db) + if response and not current_conversation and not conversation_id: + current_conversation = True + if current_conversation: try: conversation_id = next( @@ -738,9 +743,13 @@ def logs_list( else: row[key] = json.loads(row[key]) - # Output as JSON if request if json_output: + # Output as JSON if requested click.echo(json.dumps(list(rows), indent=2)) + elif response: + # Just output the last response + if rows: + click.echo(rows[-1]["response"]) else: # Output neatly formatted human-readable logs current_system = None diff --git a/tests/test_llm.py b/tests/test_llm.py index acfdc2bc..795c4146 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -97,6 +97,17 @@ def test_logs_json(n, log_path): assert len(logs) == expected_length +@pytest.mark.parametrize( + "args", (["-r"], ["--response"], ["list", "-r"], ["list", "--response"]) +) +def test_logs_response_only(args, log_path): + "Test that logs -r/--response returns just the last response" + runner = CliRunner() + result = runner.invoke(cli, ["logs"] + args, catch_exceptions=False) + assert result.exit_code == 0 + assert result.output == "response\n" + + @pytest.mark.xfail(sys.platform == "win32", reason="Expected to fail on Windows") @pytest.mark.parametrize("env", ({}, {"LLM_USER_PATH": "/tmp/llm-user-path"})) def test_logs_path(monkeypatch, env, user_path): From 008efae86a86c0a4782acbd15688aad2f4942bf5 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Tue, 26 Mar 2024 08:58:48 -0700 Subject: [PATCH 003/149] llm-cmd !stable-docs Refs https://github.com/simonw/llm-cmd/issues/1 --- docs/plugins/directory.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/plugins/directory.md b/docs/plugins/directory.md index f04d595f..37324f58 100644 --- a/docs/plugins/directory.md +++ b/docs/plugins/directory.md @@ -43,6 +43,7 @@ If an API model host provides an OpenAI-compatible API you can also [configure L ## Extra commands +- **[llm-cmd](https://github.com/simonw/llm-cmd)** accepts a prompt for a shell command, runs that prompt and populates the result in your shell so you can review it, edit it and then hit `` to execute or `ctrl+c` to cancel. - **[llm-python](https://github.com/simonw/llm-python)** adds a `llm python` command for running a Python interpreter in the same virtual environment as LLM. This is useful for debugging, and also provides a convenient way to interact with the LLM {ref}`python-api` if you installed LLM using Homebrew or `pipx`. - **[llm-cluster](https://github.com/simonw/llm-cluster)** adds a `llm cluster` command for calculating clusters for a collection of embeddings. 
Calculated clusters can then be passed to a Large Language Model to generate a summary description. From 12e027d3e48cf3615396e4190a02ee04392771fe Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 4 Apr 2024 07:41:03 -0700 Subject: [PATCH 004/149] llm-command-r !stable-docs Refs https://github.com/simonw/llm-command-r/issues/1 --- docs/plugins/directory.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/plugins/directory.md b/docs/plugins/directory.md index 37324f58..327db4e6 100644 --- a/docs/plugins/directory.md +++ b/docs/plugins/directory.md @@ -21,6 +21,7 @@ These plugins can be used to interact with remotely hosted models via their API: - **[llm-gemini](https://github.com/simonw/llm-gemini)** adds support for Google's [Gemini](https://ai.google.dev/docs) models. - **[llm-claude](https://github.com/tomviner/llm-claude)** by Tom Viner adds support for Claude 2.1 and Claude Instant 2.1 by Anthropic. - **[llm-claude-3](https://github.com/simonw/llm-claude-3)** supports Anthropic's [Claude 3 family](https://www.anthropic.com/news/claude-3-family) of models. +- **[llm-command-r](https://github.com/simonw/llm-command-r)** supports Cohere's Command R and [Command R Plus](https://txt.cohere.com/command-r-plus-microsoft-azure/) API models. - **[llm-anyscale-endpoints](https://github.com/simonw/llm-anyscale-endpoints)** supports models hosted on the [Anyscale Endpoints](https://app.endpoints.anyscale.com/) platform, including Llama 2 70B. - **[llm-replicate](https://github.com/simonw/llm-replicate)** adds support for remote models hosted on [Replicate](https://replicate.com/), including Llama 2 from Meta AI. - **[llm-palm](https://github.com/simonw/llm-palm)** adds support for Google's [PaLM 2 model](https://developers.generativeai.google/). From 9ad9ac68dccf60c4eef737c1638c5aedee9f024a Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Wed, 17 Apr 2024 19:38:41 -0700 Subject: [PATCH 005/149] llm-reka in plugin directory !stable-docs --- docs/plugins/directory.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/plugins/directory.md b/docs/plugins/directory.md index 327db4e6..27546b01 100644 --- a/docs/plugins/directory.md +++ b/docs/plugins/directory.md @@ -22,6 +22,7 @@ These plugins can be used to interact with remotely hosted models via their API: - **[llm-claude](https://github.com/tomviner/llm-claude)** by Tom Viner adds support for Claude 2.1 and Claude Instant 2.1 by Anthropic. - **[llm-claude-3](https://github.com/simonw/llm-claude-3)** supports Anthropic's [Claude 3 family](https://www.anthropic.com/news/claude-3-family) of models. - **[llm-command-r](https://github.com/simonw/llm-command-r)** supports Cohere's Command R and [Command R Plus](https://txt.cohere.com/command-r-plus-microsoft-azure/) API models. +- **[llm-reka](https://github.com/simonw/llm-reka)** supports the [Reka](https://www.reka.ai/) family of models via their API. - **[llm-anyscale-endpoints](https://github.com/simonw/llm-anyscale-endpoints)** supports models hosted on the [Anyscale Endpoints](https://app.endpoints.anyscale.com/) platform, including Llama 2 70B. - **[llm-replicate](https://github.com/simonw/llm-replicate)** adds support for remote models hosted on [Replicate](https://replicate.com/), including Llama 2 from Meta AI. - **[llm-palm](https://github.com/simonw/llm-palm)** adds support for Google's [PaLM 2 model](https://developers.generativeai.google/). 
From 99a2836638739ab85b1b86aa407c60c98685791c Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 18 Apr 2024 17:20:09 -0700 Subject: [PATCH 006/149] llm-fireworks Refs https://github.com/simonw/llm-fireworks/issues/1 !stable-docs --- docs/plugins/directory.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/plugins/directory.md b/docs/plugins/directory.md index 27546b01..35fa25b1 100644 --- a/docs/plugins/directory.md +++ b/docs/plugins/directory.md @@ -25,6 +25,7 @@ These plugins can be used to interact with remotely hosted models via their API: - **[llm-reka](https://github.com/simonw/llm-reka)** supports the [Reka](https://www.reka.ai/) family of models via their API. - **[llm-anyscale-endpoints](https://github.com/simonw/llm-anyscale-endpoints)** supports models hosted on the [Anyscale Endpoints](https://app.endpoints.anyscale.com/) platform, including Llama 2 70B. - **[llm-replicate](https://github.com/simonw/llm-replicate)** adds support for remote models hosted on [Replicate](https://replicate.com/), including Llama 2 from Meta AI. +- **[llm-fireworks](https://github.com/simonw/llm-fireworks)** supports models hosted by [Fireworks AI](https://fireworks.ai/). - **[llm-palm](https://github.com/simonw/llm-palm)** adds support for Google's [PaLM 2 model](https://developers.generativeai.google/). - **[llm-openrouter](https://github.com/simonw/llm-openrouter)** provides access to models hosted on [OpenRouter](https://openrouter.ai/). - **[llm-cohere](https://github.com/Accudio/llm-cohere)** by Alistair Shepherd provides `cohere-generate` and `cohere-summarize` API models, powered by [Cohere](https://cohere.com/). From 2a9b6113f591e36594aba31523e04825d2affb65 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 21 Apr 2024 16:18:37 -0700 Subject: [PATCH 007/149] llm-perplexity Refs https://github.com/hex/llm-perplexity/issues/2 !stable-docs --- docs/plugins/directory.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/plugins/directory.md b/docs/plugins/directory.md index 35fa25b1..dfb21fe9 100644 --- a/docs/plugins/directory.md +++ b/docs/plugins/directory.md @@ -23,6 +23,7 @@ These plugins can be used to interact with remotely hosted models via their API: - **[llm-claude-3](https://github.com/simonw/llm-claude-3)** supports Anthropic's [Claude 3 family](https://www.anthropic.com/news/claude-3-family) of models. - **[llm-command-r](https://github.com/simonw/llm-command-r)** supports Cohere's Command R and [Command R Plus](https://txt.cohere.com/command-r-plus-microsoft-azure/) API models. - **[llm-reka](https://github.com/simonw/llm-reka)** supports the [Reka](https://www.reka.ai/) family of models via their API. +- **[llm-perplexity](https://github.com/hex/llm-perplexity)** by Alexandru Geana supports the [Perplexity Labs](https://docs.perplexity.ai/) API models, including `sonar-medium-online` which can search for things online and `llama-3-70b-instruct`. - **[llm-anyscale-endpoints](https://github.com/simonw/llm-anyscale-endpoints)** supports models hosted on the [Anyscale Endpoints](https://app.endpoints.anyscale.com/) platform, including Llama 2 70B. - **[llm-replicate](https://github.com/simonw/llm-replicate)** adds support for remote models hosted on [Replicate](https://replicate.com/), including Llama 2 from Meta AI. - **[llm-fireworks](https://github.com/simonw/llm-fireworks)** supports models hosted by [Fireworks AI](https://fireworks.ai/). 
From 3696897db5a73806c7c65ed03809a03cd7b0ca0f Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 21 Apr 2024 20:30:35 -0700 Subject: [PATCH 008/149] Link to plugins directory from README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index e7e485ab..f027d36a 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,8 @@ A CLI utility and Python library for interacting with Large Language Models, bot [Run prompts from the command-line](https://llm.datasette.io/en/stable/usage.html#executing-a-prompt), [store the results in SQLite](https://llm.datasette.io/en/stable/logging.html), [generate embeddings](https://llm.datasette.io/en/stable/embeddings/index.html) and more. +Consult the **[LLM plugins directory](https://llm.datasette.io/en/stable/plugins/directory.html)** for plugins that provide access to remote and local models. + Full documentation: **[llm.datasette.io](https://llm.datasette.io/)** Background on this project: From 04915e95f8ab490d6a82900ccba92ad3fe93d2ab Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 21 Apr 2024 20:33:23 -0700 Subject: [PATCH 009/149] llm-groq !stable-docs --- docs/plugins/directory.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/plugins/directory.md b/docs/plugins/directory.md index dfb21fe9..dc1b6f38 100644 --- a/docs/plugins/directory.md +++ b/docs/plugins/directory.md @@ -24,6 +24,7 @@ These plugins can be used to interact with remotely hosted models via their API: - **[llm-command-r](https://github.com/simonw/llm-command-r)** supports Cohere's Command R and [Command R Plus](https://txt.cohere.com/command-r-plus-microsoft-azure/) API models. - **[llm-reka](https://github.com/simonw/llm-reka)** supports the [Reka](https://www.reka.ai/) family of models via their API. - **[llm-perplexity](https://github.com/hex/llm-perplexity)** by Alexandru Geana supports the [Perplexity Labs](https://docs.perplexity.ai/) API models, including `sonar-medium-online` which can search for things online and `llama-3-70b-instruct`. +- **[llm-groq](https://github.com/angerman/llm-groq)** by Moritz Angermann provides access to fast models hosted by [Groq](https://console.groq.com/docs/models). - **[llm-anyscale-endpoints](https://github.com/simonw/llm-anyscale-endpoints)** supports models hosted on the [Anyscale Endpoints](https://app.endpoints.anyscale.com/) platform, including Llama 2 70B. - **[llm-replicate](https://github.com/simonw/llm-replicate)** adds support for remote models hosted on [Replicate](https://replicate.com/), including Llama 2 from Meta AI. - **[llm-fireworks](https://github.com/simonw/llm-fireworks)** supports models hosted by [Fireworks AI](https://fireworks.ai/). 
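The remote-API plugins added to the directory in the patches above all follow the same workflow once installed: add the plugin, set any API key it needs, then address its models by ID. A minimal sketch using the llm-claude-3 plugin from earlier in this series — the key name and model IDs here are the plugin's own choices and will differ for other plugins, so check `llm models` for what is actually registered:

```bash
# Install a plugin from the directory above
llm install llm-claude-3

# Store the API key the plugin expects (the key name is defined by the plugin)
llm keys set claude

# See which model IDs the plugin registered, then prompt one of them
llm models
llm -m claude-3-opus 'Three reasons pelicans are excellent birds'
```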
From 73bbbec3722bca37ba329c7fdf7da9e408251ed3 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 13 May 2024 12:49:45 -0700 Subject: [PATCH 010/149] gpt-4o model, refs #490 --- docs/aliases.md | 1 + docs/openai-models.md | 14 +++++++++++--- docs/usage.md | 10 ++++++++++ llm/default_plugins/openai_models.py | 2 ++ 4 files changed, 24 insertions(+), 3 deletions(-) diff --git a/docs/aliases.md b/docs/aliases.md index b334cd8c..85d52e50 100644 --- a/docs/aliases.md +++ b/docs/aliases.md @@ -29,6 +29,7 @@ gpt4 : gpt-4 gpt-4-turbo : gpt-4-turbo-preview 4-turbo : gpt-4-turbo-preview 4t : gpt-4-turbo-preview +4o : gpt-4o 3.5-instruct : gpt-3.5-turbo-instruct chatgpt-instruct : gpt-3.5-turbo-instruct ada : ada-002 (embedding) diff --git a/docs/openai-models.md b/docs/openai-models.md index 62a45f92..a8d9ae49 100644 --- a/docs/openai-models.md +++ b/docs/openai-models.md @@ -23,16 +23,24 @@ Then paste in the API key. Run `llm models` for a full list of available models. The OpenAI models supported by LLM are: + ``` OpenAI Chat: gpt-3.5-turbo (aliases: 3.5, chatgpt) -OpenAI Chat: gpt-3.5-turbo-16k (aliases: chatgpt-16k, 3.5-16k, turbo) +OpenAI Chat: gpt-3.5-turbo-16k (aliases: chatgpt-16k, 3.5-16k) OpenAI Chat: gpt-4 (aliases: 4, gpt4) OpenAI Chat: gpt-4-32k (aliases: 4-32k) OpenAI Chat: gpt-4-1106-preview OpenAI Chat: gpt-4-0125-preview OpenAI Chat: gpt-4-turbo-preview (aliases: gpt-4-turbo, 4-turbo, 4t) -OpenAI Completion: gpt-3.5-turbo-instruct (aliases: 3.5-instruct, chatgpt-instruct, instruct) -``` +OpenAI Chat: gpt-4o (aliases: 4o) +OpenAI Completion: gpt-3.5-turbo-instruct (aliases: 3.5-instruct, chatgpt-instruct)``` + See [the OpenAI models documentation](https://platform.openai.com/docs/models) for details of each of these. diff --git a/docs/usage.md b/docs/usage.md index 99974641..e0371860 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -315,6 +315,16 @@ OpenAI Chat: gpt-4-turbo-preview (aliases: gpt-4-turbo, 4-turbo, 4t) logit_bias: dict, str seed: int json_object: boolean +OpenAI Chat: gpt-4o (aliases: 4o) + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean OpenAI Completion: gpt-3.5-turbo-instruct (aliases: 3.5-instruct, chatgpt-instruct) temperature: float What sampling temperature to use, between 0 and 2. 
Higher values like diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index 88958e94..817919a6 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -31,6 +31,8 @@ def register_models(register): register(Chat("gpt-4-1106-preview")) register(Chat("gpt-4-0125-preview")) register(Chat("gpt-4-turbo-preview"), aliases=("gpt-4-turbo", "4-turbo", "4t")) + # GPT-4o + register(Chat("gpt-4o"), aliases=("4o",)) # The -instruct completion model register( Completion("gpt-3.5-turbo-instruct", default_max_tokens=256), From 8171c9a6bf2df1c19f0265e160ea468a547bf922 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 13 May 2024 12:53:31 -0700 Subject: [PATCH 011/149] Update help for GPT-4o, closes #490 --- docs/openai-models.md | 2 +- docs/usage.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/openai-models.md b/docs/openai-models.md index a8d9ae49..038d139a 100644 --- a/docs/openai-models.md +++ b/docs/openai-models.md @@ -44,7 +44,7 @@ OpenAI Completion: gpt-3.5-turbo-instruct (aliases: 3.5-instruct, chatgpt-instru See [the OpenAI models documentation](https://platform.openai.com/docs/models) for details of each of these. -The best balance of price and capacity are the `-turbo` models. `gpt-3.5-turbo` (aliased to `3.5`) is the least expensive. `gpt-4-turbo-preview` (aliased to `4t`) is the cheapest of the GPT-4 models. +`gpt-3.5-turbo` (aliased to `3.5`) is the least expensive model. `gpt-4o` (aliased to `4o`) is the newest, cheapest and fastest of the GPT-4 family of models. The `gpt-3.5-turbo-instruct` model is a little different - it is a completion model rather than a chat model, described in [the OpenAI completions documentation](https://platform.openai.com/docs/api-reference/completions/create). diff --git a/docs/usage.md b/docs/usage.md index e0371860..568b5259 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -18,9 +18,9 @@ To disable streaming and only return the response once it has completed: ```bash llm 'Ten names for cheesecakes' --no-stream ``` -To switch from ChatGPT 3.5 (the default) to GPT-4 Turbo: +To switch from ChatGPT 3.5 (the default) to GPT-4o: ```bash -llm 'Ten names for cheesecakes' -m gpt-4-turbo +llm 'Ten names for cheesecakes' -m gpt-4o ``` You can use `-m 4t` as an even shorter shortcut. @@ -361,6 +361,6 @@ OpenAI Completion: gpt-3.5-turbo-instruct (aliases: 3.5-instruct, chatgpt-instru When running a prompt you can pass the full model name or any of the aliases to the `-m/--model` option: ```bash -llm -m chatgpt-16k \ +llm -m 4o \ 'As many names for cheesecakes as you can think of, with detailed descriptions' ``` From 3cc588f247300eca5e6c0c855f1907a2cceefbd5 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 13 May 2024 12:55:22 -0700 Subject: [PATCH 012/149] List llm-llamafile in plugins directory, closes #470 --- docs/plugins/directory.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/plugins/directory.md b/docs/plugins/directory.md index dc1b6f38..53762726 100644 --- a/docs/plugins/directory.md +++ b/docs/plugins/directory.md @@ -12,6 +12,7 @@ These plugins all help you run LLMs directly on your own computer: - **[llm-gpt4all](https://github.com/simonw/llm-gpt4all)** adds support for various models released by the [GPT4All](https://gpt4all.io/) project that are optimized to run locally on your own machine. 
These models include versions of Vicuna, Orca, Falcon and MPT - here's [a full list of models](https://observablehq.com/@simonw/gpt4all-models). - **[llm-mpt30b](https://github.com/simonw/llm-mpt30b)** adds support for the [MPT-30B](https://huggingface.co/mosaicml/mpt-30b) local model. - **[llm-ollama](https://github.com/taketwo/llm-ollama)** adds support for local models run using [Ollama](https://ollama.ai/). +- **[llm-llamafile](https://github.com/simonw/llm-llamafile)** adds support for local models that are running locally using [llamafile](https://github.com/Mozilla-Ocho/llamafile). ## Remote APIs From 0a8fd77b2615128f099e1f2cb4e610229b634a82 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 13 May 2024 13:00:03 -0700 Subject: [PATCH 013/149] Fix for mypy error, closes #491 This is why tests failed for #490 --- llm/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/models.py b/llm/models.py index e3e54b87..0e47bb60 100644 --- a/llm/models.py +++ b/llm/models.py @@ -87,7 +87,7 @@ def __iter__(self) -> Iterator[str]: self._start = time.monotonic() self._start_utcnow = datetime.datetime.utcnow() if self._done: - return self._chunks + yield from self._chunks for chunk in self.model.execute( self.prompt, stream=self.stream, From 2040af8974d9aee4a2eb01160e1fac4d2931e05e Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 13 May 2024 13:00:37 -0700 Subject: [PATCH 014/149] Pin minimum mypy version, refs #491 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2ad17a89..a644d7b9 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ def get_long_description(): "numpy", "pytest-httpx", "cogapp", - "mypy", + "mypy>=1.10.0", "black>=24.1.0", "ruff", "types-click", From 6cdc29c8d6192cc22775b970e52d4ea319b7fb21 Mon Sep 17 00:00:00 2001 From: Fabian Labat Date: Mon, 13 May 2024 16:01:33 -0400 Subject: [PATCH 015/149] Update directory.md (#486) * Update directory.md Added support for Bedrock Llama 3 --- docs/plugins/directory.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/plugins/directory.md b/docs/plugins/directory.md index 53762726..a399f43b 100644 --- a/docs/plugins/directory.md +++ b/docs/plugins/directory.md @@ -33,7 +33,7 @@ These plugins can be used to interact with remotely hosted models via their API: - **[llm-openrouter](https://github.com/simonw/llm-openrouter)** provides access to models hosted on [OpenRouter](https://openrouter.ai/). - **[llm-cohere](https://github.com/Accudio/llm-cohere)** by Alistair Shepherd provides `cohere-generate` and `cohere-summarize` API models, powered by [Cohere](https://cohere.com/). - **[llm-bedrock-anthropic](https://github.com/sblakey/llm-bedrock-anthropic)** by Sean Blakey adds support for Claude and Claude Instant by Anthropic via Amazon Bedrock. -- **[llm-bedrock-meta](https://github.com/flabat/llm-bedrock-meta)** by Fabian Labat adds support for Llama 2 by Meta via Amazon Bedrock. +- **[llm-bedrock-meta](https://github.com/flabat/llm-bedrock-meta)** by Fabian Labat adds support for Llama 2 and Llama 3 by Meta via Amazon Bedrock. - **[llm-together](https://github.com/wearedevx/llm-together)** adds support for the [Together AI](https://www.together.ai/) extensive family of hosted openly licensed models. 
If an API model host provides an OpenAI-compatible API you can also [configure LLM to talk to it](https://llm.datasette.io/en/stable/other-models.html#openai-compatible-models) without needing an extra plugin. From ab1cc4fd1f5ea1c037a8d8c56b455d11448e5577 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 13 May 2024 13:26:33 -0700 Subject: [PATCH 016/149] Release 0.14 Refs #404, #431, #470, #490, #491 --- docs/changelog.md | 16 ++++++++++++++++ docs/plugins/directory.md | 2 +- setup.py | 2 +- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index dad9fec4..9ca3afc3 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,21 @@ # Changelog +(v0_14)= +## 0.14 (2024-05-13) + +- Support for OpenAI's [new GPT-4o](https://openai.com/index/hello-gpt-4o/) model: `llm -m gpt-4o 'say hi in Spanish'` [#490](https://github.com/simonw/llm/issues/490) +- New `llm logs -r/--response` option for outputting just the last captured response, without wrapping it in Markdown and accompanying it with the prompt. [#431](https://github.com/simonw/llm/issues/431) +- Nine new {ref}`plugins ` since version 0.13: + - **[llm-claude-3](https://github.com/simonw/llm-claude-3)** supporting Anthropic's [Claude 3 family](https://www.anthropic.com/news/claude-3-family) of models. + - **[llm-command-r](https://github.com/simonw/llm-command-r)** supporting Cohere's Command R and [Command R Plus](https://txt.cohere.com/command-r-plus-microsoft-azure/) API models. + - **[llm-reka](https://github.com/simonw/llm-reka)** supports the [Reka](https://www.reka.ai/) family of models via their API. + - **[llm-perplexity](https://github.com/hex/llm-perplexity)** by Alexandru Geana supporting the [Perplexity Labs](https://docs.perplexity.ai/) API models, including `llama-3-sonar-large-32k-online` which can search for things online and `llama-3-70b-instruct`. + - **[llm-groq](https://github.com/angerman/llm-groq)** by Moritz Angermann providing access to fast models hosted by [Groq](https://console.groq.com/docs/models). + - **[llm-fireworks](https://github.com/simonw/llm-fireworks)** supporting models hosted by [Fireworks AI](https://fireworks.ai/). + - **[llm-together](https://github.com/wearedevx/llm-together)** adds support for the [Together AI](https://www.together.ai/) extensive family of hosted openly licensed models. + - **[llm-embed-onnx](https://github.com/simonw/llm-embed-onnx)** provides seven embedding models that can be executed using the ONNX model framework. + - **[llm-cmd](https://github.com/simonw/llm-cmd)** accepts a prompt for a shell command, runs that prompt and populates the result in your shell so you can review it, edit it and then hit `` to execute or `ctrl+c` to cancel, see [this post for details](https://simonwillison.net/2024/Mar/26/llm-cmd/). + (v0_13_1)= ## 0.13.1 (2024-01-26) diff --git a/docs/plugins/directory.md b/docs/plugins/directory.md index a399f43b..af0c88d6 100644 --- a/docs/plugins/directory.md +++ b/docs/plugins/directory.md @@ -24,7 +24,7 @@ These plugins can be used to interact with remotely hosted models via their API: - **[llm-claude-3](https://github.com/simonw/llm-claude-3)** supports Anthropic's [Claude 3 family](https://www.anthropic.com/news/claude-3-family) of models. 
- **[llm-command-r](https://github.com/simonw/llm-command-r)** supports Cohere's Command R and [Command R Plus](https://txt.cohere.com/command-r-plus-microsoft-azure/) API models. - **[llm-reka](https://github.com/simonw/llm-reka)** supports the [Reka](https://www.reka.ai/) family of models via their API. -- **[llm-perplexity](https://github.com/hex/llm-perplexity)** by Alexandru Geana supports the [Perplexity Labs](https://docs.perplexity.ai/) API models, including `sonar-medium-online` which can search for things online and `llama-3-70b-instruct`. +- **[llm-perplexity](https://github.com/hex/llm-perplexity)** by Alexandru Geana supports the [Perplexity Labs](https://docs.perplexity.ai/) API models, including `llama-3-sonar-large-32k-online` which can search for things online and `llama-3-70b-instruct`. - **[llm-groq](https://github.com/angerman/llm-groq)** by Moritz Angermann provides access to fast models hosted by [Groq](https://console.groq.com/docs/models). - **[llm-anyscale-endpoints](https://github.com/simonw/llm-anyscale-endpoints)** supports models hosted on the [Anyscale Endpoints](https://app.endpoints.anyscale.com/) platform, including Llama 2 70B. - **[llm-replicate](https://github.com/simonw/llm-replicate)** adds support for remote models hosted on [Replicate](https://replicate.com/), including Llama 2 from Meta AI. diff --git a/setup.py b/setup.py index a644d7b9..a569a070 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages import os -VERSION = "0.13.1" +VERSION = "0.14" def get_long_description(): From 9a3236db61d93cb6d4a5148a8aa8651d734e51a2 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 13 May 2024 13:37:23 -0700 Subject: [PATCH 017/149] gpt-4-turbo model ID, closes #493 --- docs/aliases.md | 28 ++++++++++++++-------------- docs/changelog.md | 1 + docs/openai-models.md | 3 ++- docs/usage.md | 12 +++++++++++- llm/default_plugins/openai_models.py | 5 +++-- 5 files changed, 31 insertions(+), 18 deletions(-) diff --git a/docs/aliases.md b/docs/aliases.md index 85d52e50..4ed6499a 100644 --- a/docs/aliases.md +++ b/docs/aliases.md @@ -19,20 +19,20 @@ result = CliRunner().invoke(cli, ["aliases", "list"]) cog.out("```\n{}```".format(result.output)) ]]] --> ``` -3.5 : gpt-3.5-turbo -chatgpt : gpt-3.5-turbo -chatgpt-16k : gpt-3.5-turbo-16k -3.5-16k : gpt-3.5-turbo-16k -4 : gpt-4 -gpt4 : gpt-4 -4-32k : gpt-4-32k -gpt-4-turbo : gpt-4-turbo-preview -4-turbo : gpt-4-turbo-preview -4t : gpt-4-turbo-preview -4o : gpt-4o -3.5-instruct : gpt-3.5-turbo-instruct -chatgpt-instruct : gpt-3.5-turbo-instruct -ada : ada-002 (embedding) +3.5 : gpt-3.5-turbo +chatgpt : gpt-3.5-turbo +chatgpt-16k : gpt-3.5-turbo-16k +3.5-16k : gpt-3.5-turbo-16k +4 : gpt-4 +gpt4 : gpt-4 +4-32k : gpt-4-32k +gpt-4-turbo-preview : gpt-4-turbo +4-turbo : gpt-4-turbo +4t : gpt-4-turbo +4o : gpt-4o +3.5-instruct : gpt-3.5-turbo-instruct +chatgpt-instruct : gpt-3.5-turbo-instruct +ada : ada-002 (embedding) ``` diff --git a/docs/changelog.md b/docs/changelog.md index 9ca3afc3..b92847df 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -4,6 +4,7 @@ ## 0.14 (2024-05-13) - Support for OpenAI's [new GPT-4o](https://openai.com/index/hello-gpt-4o/) model: `llm -m gpt-4o 'say hi in Spanish'` [#490](https://github.com/simonw/llm/issues/490) +- The `gpt-4-turbo` alias is now a model ID, which indicates the latest version of OpenAI's GPT-4 Turbo text and image model. 
Your existing `logs.db` database may contain records under the previous model ID of `gpt-4-turbo-preview`. [#493](https://github.com/simonw/llm/issues/493) - New `llm logs -r/--response` option for outputting just the last captured response, without wrapping it in Markdown and accompanying it with the prompt. [#431](https://github.com/simonw/llm/issues/431) - Nine new {ref}`plugins ` since version 0.13: - **[llm-claude-3](https://github.com/simonw/llm-claude-3)** supporting Anthropic's [Claude 3 family](https://www.anthropic.com/news/claude-3-family) of models. diff --git a/docs/openai-models.md b/docs/openai-models.md index 038d139a..a022236a 100644 --- a/docs/openai-models.md +++ b/docs/openai-models.md @@ -37,7 +37,8 @@ OpenAI Chat: gpt-4 (aliases: 4, gpt4) OpenAI Chat: gpt-4-32k (aliases: 4-32k) OpenAI Chat: gpt-4-1106-preview OpenAI Chat: gpt-4-0125-preview -OpenAI Chat: gpt-4-turbo-preview (aliases: gpt-4-turbo, 4-turbo, 4t) +OpenAI Chat: gpt-4-turbo-2024-04-09 +OpenAI Chat: gpt-4-turbo (aliases: gpt-4-turbo-preview, 4-turbo, 4t) OpenAI Chat: gpt-4o (aliases: 4o) OpenAI Completion: gpt-3.5-turbo-instruct (aliases: 3.5-instruct, chatgpt-instruct)``` diff --git a/docs/usage.md b/docs/usage.md index 568b5259..62aefabe 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -305,7 +305,17 @@ OpenAI Chat: gpt-4-0125-preview logit_bias: dict, str seed: int json_object: boolean -OpenAI Chat: gpt-4-turbo-preview (aliases: gpt-4-turbo, 4-turbo, 4t) +OpenAI Chat: gpt-4-turbo-2024-04-09 + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean +OpenAI Chat: gpt-4-turbo (aliases: gpt-4-turbo-preview, 4-turbo, 4t) temperature: float max_tokens: int top_p: float diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index 817919a6..7591f742 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -27,10 +27,11 @@ def register_models(register): register(Chat("gpt-3.5-turbo-16k"), aliases=("chatgpt-16k", "3.5-16k")) register(Chat("gpt-4"), aliases=("4", "gpt4")) register(Chat("gpt-4-32k"), aliases=("4-32k",)) - # GPT-4 turbo models + # GPT-4 Turbo models register(Chat("gpt-4-1106-preview")) register(Chat("gpt-4-0125-preview")) - register(Chat("gpt-4-turbo-preview"), aliases=("gpt-4-turbo", "4-turbo", "4t")) + register(Chat("gpt-4-turbo-2024-04-09")) + register(Chat("gpt-4-turbo"), aliases=("gpt-4-turbo-preview", "4-turbo", "4t")) # GPT-4o register(Chat("gpt-4o"), aliases=("4o",)) # The -instruct completion model From 8573b8ecc8359019c36a04fdb321032da0307819 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 13 May 2024 13:42:17 -0700 Subject: [PATCH 018/149] Publish releases using pypa/gh-action-pypi-publish@release/v1 --- .github/workflows/publish.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 9f96d927..43654d6b 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -29,23 +29,23 @@ jobs: pytest deploy: runs-on: ubuntu-latest + environment: release + permissions: + id-token: write needs: [test] steps: - uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v5 with: - python-version: "3.12" + python-version: '3.12' cache: pip cache-dependency-path: setup.py - name: Install dependencies run: | - pip install setuptools wheel twine build - 
- name: Publish - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} + pip install setuptools wheel build + - name: Build run: | python -m build - twine upload dist/* - + - name: Publish + uses: pypa/gh-action-pypi-publish@release/v1 From 45245413bd5093579315243b9034d8fa01cbba6a Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 13 May 2024 15:09:56 -0700 Subject: [PATCH 019/149] GitHub stars badge !stable-docs --- docs/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/index.md b/docs/index.md index 8d975b74..48cc007e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,5 +1,6 @@ # LLM +![GitHub Repo stars](https://img.shields.io/github/stars/simonw/llm) [![PyPI](https://img.shields.io/pypi/v/llm.svg)](https://pypi.org/project/llm/) [![Changelog](https://img.shields.io/github/v/release/simonw/llm?include_prereleases&label=changelog)](https://llm.datasette.io/en/stable/changelog.html) [![Tests](https://github.com/simonw/llm/workflows/Test/badge.svg)](https://github.com/simonw/llm/actions?query=workflow%3ATest) From 68df9721def4820e1b54311a032c19bac9e0e8ea Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 13 May 2024 18:41:07 -0700 Subject: [PATCH 020/149] github repo static badge !stable-docs --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 48cc007e..0285fa92 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,6 +1,6 @@ # LLM -![GitHub Repo stars](https://img.shields.io/github/stars/simonw/llm) +[![GitHub repo](https://img.shields.io/badge/github-repo-green)](https://github.com/simonw/llm) [![PyPI](https://img.shields.io/pypi/v/llm.svg)](https://pypi.org/project/llm/) [![Changelog](https://img.shields.io/github/v/release/simonw/llm?include_prereleases&label=changelog)](https://llm.datasette.io/en/stable/changelog.html) [![Tests](https://github.com/simonw/llm/workflows/Test/badge.svg)](https://github.com/simonw/llm/actions?query=workflow%3ATest) From 964f4d99348d69e88f1c8952f3cd3d24ba3aa3a6 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 16 Jun 2024 14:35:23 -0700 Subject: [PATCH 021/149] Fix for llm logs -q plus -m bug, closes #515 --- llm/cli.py | 3 ++- tests/test_llm.py | 15 +++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/llm/cli.py b/llm/cli.py index 966c2e47..9b0109fa 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -716,7 +716,8 @@ def logs_list( if conversation_id: where_bits.append("responses.conversation_id = :conversation_id") if where_bits: - sql_format["extra_where"] = " where " + " and ".join(where_bits) + where_ = " and " if query else " where " + sql_format["extra_where"] = where_ + " and ".join(where_bits) final_sql = sql.format(**sql_format) rows = list( diff --git a/tests/test_llm.py b/tests/test_llm.py index 795c4146..e86a44e9 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -146,16 +146,19 @@ def test_logs_filtered(user_path, model): @pytest.mark.parametrize( - "query,expected", + "query,extra_args,expected", ( # With no search term order should be by datetime - ("", ["doc1", "doc2", "doc3"]), + ("", [], ["doc1", "doc2", "doc3"]), # With a search it's order by rank instead - ("llama", ["doc1", "doc3"]), - ("alpaca", ["doc2"]), + ("llama", [], ["doc1", "doc3"]), + ("alpaca", [], ["doc2"]), + # Model filter should work too + ("llama", ["-m", "davinci"], ["doc1", "doc3"]), + ("llama", ["-m", "davinci2"], []), ), ) -def test_logs_search(user_path, query, expected): +def 
test_logs_search(user_path, query, extra_args, expected): log_path = str(user_path / "logs.db") db = sqlite_utils.Database(log_path) migrate(db) @@ -175,7 +178,7 @@ def _insert(id, text): _insert("doc2", "alpaca") _insert("doc3", "llama llama") runner = CliRunner() - result = runner.invoke(cli, ["logs", "list", "-q", query, "--json"]) + result = runner.invoke(cli, ["logs", "list", "-q", query, "--json"] + extra_args) assert result.exit_code == 0 records = json.loads(result.output.strip()) assert [record["id"] for record in records] == expected From 96db13f53774154a10fde9f41e659937ebe2ea01 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 17 Jun 2024 10:18:24 -0700 Subject: [PATCH 022/149] Link to new video !stable-docs --- docs/index.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/index.md b/docs/index.md index 0285fa92..263949f0 100644 --- a/docs/index.md +++ b/docs/index.md @@ -12,6 +12,8 @@ A CLI utility and Python library for interacting with Large Language Models, bot {ref}`Run prompts from the command-line `, {ref}`store the results in SQLite `, {ref}`generate embeddings ` and more. +Here's a [YouTube video demo](https://www.youtube.com/watch?v=QUXQNi6jQ30) and [accompanying detailed notes](https://simonwillison.net/2024/Jun/17/cli-language-models/). + Background on this project: - [llm, ttok and strip-tags—CLI tools for working with ChatGPT and other LLMs](https://simonwillison.net/2023/May/18/cli-tools-for-llms/) - [The LLM CLI tool now supports self-hosted language models via plugins](https://simonwillison.net/2023/Jul/12/llm/) From aa8d2b9322a42a2ed69c9b6b6ad444bbaabbcf10 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 18 Jul 2024 11:49:31 -0700 Subject: [PATCH 023/149] sqlite-utils>=3.37 Closes #531 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a569a070..386de3a6 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ def get_long_description(): "click", "openai>=1.0", "click-default-group>=1.2.3", - "sqlite-utils>=3.35.0", + "sqlite-utils>=3.37", "sqlite-migrate>=0.1a2", "pydantic>=1.10.2", "PyYAML", From 963a5ba46714d397a08af350f859e777baff2452 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 18 Jul 2024 11:53:00 -0700 Subject: [PATCH 024/149] gpt-4o-mini with alias 4o-mini, refs #536 --- llm/default_plugins/openai_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index 7591f742..12465477 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -34,6 +34,7 @@ def register_models(register): register(Chat("gpt-4-turbo"), aliases=("gpt-4-turbo-preview", "4-turbo", "4t")) # GPT-4o register(Chat("gpt-4o"), aliases=("4o",)) + register(Chat("gpt-4o-mini"), aliases=("4o-mini",)) # The -instruct completion model register( Completion("gpt-3.5-turbo-instruct", default_max_tokens=256), From a83421607a6629edcd5ff3b524368839e7d9d1b6 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 18 Jul 2024 11:57:19 -0700 Subject: [PATCH 025/149] Switch default model to gpt-4o-mini (from gpt-3.5-turbo), refs #536 --- llm/cli.py | 2 +- tests/conftest.py | 2 +- tests/test_llm.py | 10 +++++----- tests/test_templates.py | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/llm/cli.py b/llm/cli.py index 9b0109fa..967764d9 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -41,7 +41,7 @@ warnings.simplefilter("ignore", ResourceWarning) -DEFAULT_MODEL = "gpt-3.5-turbo" 
+DEFAULT_MODEL = "gpt-4o-mini" DEFAULT_EMBEDDING_MODEL = "ada-002" DEFAULT_TEMPLATE = "prompt: " diff --git a/tests/conftest.py b/tests/conftest.py index 8113e2e7..120fe35c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -145,7 +145,7 @@ def mocked_openai_chat(httpx_mock): method="POST", url="https://api.openai.com/v1/chat/completions", json={ - "model": "gpt-3.5-turbo", + "model": "gpt-4o-mini", "usage": {}, "choices": [{"message": {"content": "Bob, Alice, Eve"}}], }, diff --git a/tests/test_llm.py b/tests/test_llm.py index e86a44e9..eb418cae 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -260,7 +260,7 @@ def test_llm_default_prompt( assert len(rows) == 1 expected = { - "model": "gpt-3.5-turbo", + "model": "gpt-4o-mini", "prompt": "three names \nfor a pet pelican", "system": None, "options_json": "{}", @@ -274,7 +274,7 @@ def test_llm_default_prompt( "messages": [{"role": "user", "content": "three names \nfor a pet pelican"}] } assert json.loads(row["response_json"]) == { - "model": "gpt-3.5-turbo", + "model": "gpt-4o-mini", "choices": [{"message": {"content": "Bob, Alice, Eve"}}], } @@ -288,7 +288,7 @@ def test_llm_default_prompt( assert ( log_json[0].items() >= { - "model": "gpt-3.5-turbo", + "model": "gpt-4o-mini", "prompt": "three names \nfor a pet pelican", "system": None, "prompt_json": { @@ -299,12 +299,12 @@ def test_llm_default_prompt( "options_json": {}, "response": "Bob, Alice, Eve", "response_json": { - "model": "gpt-3.5-turbo", + "model": "gpt-4o-mini", "choices": [{"message": {"content": "Bob, Alice, Eve"}}], }, # This doesn't have the \n after three names: "conversation_name": "three names for a pet pelican", - "conversation_model": "gpt-3.5-turbo", + "conversation_model": "gpt-4o-mini", }.items() ) diff --git a/tests/test_templates.py b/tests/test_templates.py index 04654f99..6289c38b 100644 --- a/tests/test_templates.py +++ b/tests/test_templates.py @@ -115,7 +115,7 @@ def test_templates_prompt_save(templates_path, args, expected_prompt, expected_e ( "'Summarize this: $input'", [], - "gpt-3.5-turbo", + "gpt-4o-mini", "Summarize this: Input text", None, ), @@ -150,7 +150,7 @@ def test_templates_prompt_save(templates_path, args, expected_prompt, expected_e ( "prompt: 'Say $hello'", ["-p", "hello", "Blah"], - "gpt-3.5-turbo", + "gpt-4o-mini", "Say Blah", None, ), From 2881576dd0924af65dbc640098ed33d08dcc91c7 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 18 Jul 2024 12:00:35 -0700 Subject: [PATCH 026/149] Re-ran cog, refs #536 --- docs/aliases.md | 1 + docs/openai-models.md | 1 + docs/usage.md | 10 ++++++++++ 3 files changed, 12 insertions(+) diff --git a/docs/aliases.md b/docs/aliases.md index 4ed6499a..fe86288e 100644 --- a/docs/aliases.md +++ b/docs/aliases.md @@ -30,6 +30,7 @@ gpt-4-turbo-preview : gpt-4-turbo 4-turbo : gpt-4-turbo 4t : gpt-4-turbo 4o : gpt-4o +4o-mini : gpt-4o-mini 3.5-instruct : gpt-3.5-turbo-instruct chatgpt-instruct : gpt-3.5-turbo-instruct ada : ada-002 (embedding) diff --git a/docs/openai-models.md b/docs/openai-models.md index a022236a..12690b29 100644 --- a/docs/openai-models.md +++ b/docs/openai-models.md @@ -40,6 +40,7 @@ OpenAI Chat: gpt-4-0125-preview OpenAI Chat: gpt-4-turbo-2024-04-09 OpenAI Chat: gpt-4-turbo (aliases: gpt-4-turbo-preview, 4-turbo, 4t) OpenAI Chat: gpt-4o (aliases: 4o) +OpenAI Chat: gpt-4o-mini (aliases: 4o-mini) OpenAI Completion: gpt-3.5-turbo-instruct (aliases: 3.5-instruct, chatgpt-instruct)``` diff --git a/docs/usage.md b/docs/usage.md index 62aefabe..64ffdb95 100644 --- a/docs/usage.md 
+++ b/docs/usage.md @@ -335,6 +335,16 @@ OpenAI Chat: gpt-4o (aliases: 4o) logit_bias: dict, str seed: int json_object: boolean +OpenAI Chat: gpt-4o-mini (aliases: 4o-mini) + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean OpenAI Completion: gpt-3.5-turbo-instruct (aliases: 3.5-instruct, chatgpt-instruct) temperature: float What sampling temperature to use, between 0 and 2. Higher values like From c9fab1150c07138a245ab7dfff72a1e332ee2780 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 18 Jul 2024 12:06:41 -0700 Subject: [PATCH 027/149] Update to ruff check . --- .github/workflows/test.yml | 2 +- Justfile | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9d6a34d1..1884fc9a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -45,7 +45,7 @@ jobs: - name: Run ruff if: matrix.os != 'windows-latest' run: | - ruff . + ruff check . - name: Run test-llm-load-plugins.sh if: matrix.os != 'windows-latest' run: | diff --git a/Justfile b/Justfile index a83d3600..868e7ea7 100644 --- a/Justfile +++ b/Justfile @@ -21,7 +21,7 @@ echo " mypy" pipenv run mypy llm echo " ruff" - pipenv run ruff . + pipenv run ruff check . # Run mypy @mypy: @@ -42,7 +42,7 @@ # Run automatic fixes @fix: cog - pipenv run ruff . --fix + pipenv run ruff check . --fix pipenv run black . # Push commit if tests pass From 50454c195723ba99e995b173ad4936112018c18d Mon Sep 17 00:00:00 2001 From: Simon Donohue Date: Thu, 18 Jul 2024 20:10:40 +0100 Subject: [PATCH 028/149] Update outdated reference to gpt-4-turbo (#525) Looks like this alias was overlooked in 8171c9a. This commit makes it match with the usage of gpt-4o in the associated example. --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 64ffdb95..4d6ef5bd 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -22,7 +22,7 @@ To switch from ChatGPT 3.5 (the default) to GPT-4o: ```bash llm 'Ten names for cheesecakes' -m gpt-4o ``` -You can use `-m 4t` as an even shorter shortcut. +You can use `-m 4o` as an even shorter shortcut. Pass `--model ` to use a different model. Run `llm models` to see a list of available models. From fcba89d73ba4669202dc816ab49e23a6a0a66c53 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 18 Jul 2024 12:15:56 -0700 Subject: [PATCH 029/149] Update docs to reflect new gpt-4o-mini default, refs #536 --- docs/index.md | 4 ++-- docs/openai-models.md | 2 +- docs/setup.md | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/index.md b/docs/index.md index 263949f0..6b580015 100644 --- a/docs/index.md +++ b/docs/index.md @@ -62,10 +62,10 @@ llm -m orca-mini-3b-gguf2-q4_0 'What is the capital of France?' 
``` To start {ref}`an interactive chat ` with a model, use `llm chat`: ```bash -llm chat -m chatgpt +llm chat -m gpt-4o-mini ``` ``` -Chatting with gpt-3.5-turbo +Chatting with gpt-4o-mini Type 'exit' or 'quit' to exit Type '!multi' to enter multiple lines, then '!end' to finish > Tell me a joke about a pelican diff --git a/docs/openai-models.md b/docs/openai-models.md index 12690b29..a5c642d9 100644 --- a/docs/openai-models.md +++ b/docs/openai-models.md @@ -46,7 +46,7 @@ OpenAI Completion: gpt-3.5-turbo-instruct (aliases: 3.5-instruct, chatgpt-instru See [the OpenAI models documentation](https://platform.openai.com/docs/models) for details of each of these. -`gpt-3.5-turbo` (aliased to `3.5`) is the least expensive model. `gpt-4o` (aliased to `4o`) is the newest, cheapest and fastest of the GPT-4 family of models. +`gpt-4o-mini` (aliased to `4o-mini`) is the least expensive model, and is the default for if you don't specify a model at all. `gpt-4o` (aliased to `4o`) is the newest, cheapest and fastest of the GPT-4 family of models. The `gpt-3.5-turbo-instruct` model is a little different - it is a completion model rather than a chat model, described in [the OpenAI completions documentation](https://platform.openai.com/docs/api-reference/completions/create). diff --git a/docs/setup.md b/docs/setup.md index a21b5663..0e955392 100644 --- a/docs/setup.md +++ b/docs/setup.md @@ -136,7 +136,7 @@ You can configure LLM in a number of different ways. ### Setting a custom default model -The model used when calling `llm` without the `-m/--model` option defaults to `gpt-3.5-turbo` - the fastest and least expensive OpenAI model, and the same model family that powers ChatGPT. +The model used when calling `llm` without the `-m/--model` option defaults to `gpt-4o-mini` - the fastest and least expensive OpenAI model. You can use the `llm models default` command to set a different default model. 
For GPT-4 (slower and more expensive, but more capable) run this: From 0ef5037b001b84f887e66baff9cc3f9f9d0d9b9e Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 18 Jul 2024 12:22:17 -0700 Subject: [PATCH 030/149] Remove obsolete DEFAULT_EMBEDDING_MODEL, closes #537 --- llm/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llm/cli.py b/llm/cli.py index 967764d9..96034ca7 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -42,7 +42,6 @@ warnings.simplefilter("ignore", ResourceWarning) DEFAULT_MODEL = "gpt-4o-mini" -DEFAULT_EMBEDDING_MODEL = "ada-002" DEFAULT_TEMPLATE = "prompt: " From 562fefb374f13382653f594ab3620ea1d037951a Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 18 Jul 2024 12:23:49 -0700 Subject: [PATCH 031/149] Use 3-small in docs instead of ada-002 Spotted while working on #537 --- docs/embeddings/python-api.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/embeddings/python-api.md b/docs/embeddings/python-api.md index 4d635d4f..ca586991 100644 --- a/docs/embeddings/python-api.md +++ b/docs/embeddings/python-api.md @@ -5,7 +5,7 @@ You can load an embedding model using its model ID or alias like this: ```python import llm -embedding_model = llm.get_embedding_model("ada-002") +embedding_model = llm.get_embedding_model("3-small") ``` To embed a string, returning a Python list of floating point numbers, use the `.embed()` method: ```python @@ -45,14 +45,14 @@ import llm # This collection will use an in-memory database that will be # discarded when the Python process exits -collection = llm.Collection("entries", model_id="ada-002") +collection = llm.Collection("entries", model_id="3-small") # Or you can persist the database to disk like this: db = sqlite_utils.Database("my-embeddings.db") -collection = llm.Collection("entries", db, model_id="ada-002") +collection = llm.Collection("entries", db, model_id="3-small") # You can pass a model directly using model= instead of model_id= -embedding_model = llm.get_embedding_model("ada-002") +embedding_model = llm.get_embedding_model("3-small") collection = llm.Collection("entries", db, model=embedding_model) ``` If the collection already exists in the database you can omit the `model` or `model_id` argument - the model ID will be read from the `collections` table. From d075336c6900768b664277a7ca7c76a79de9a0cf Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 18 Jul 2024 12:31:14 -0700 Subject: [PATCH 032/149] Release 0.15 Refs #515, #525, #536, #537 --- docs/changelog.md | 7 +++++++ docs/setup.md | 5 +++-- setup.py | 2 +- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index b92847df..c3ed2b2e 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,12 @@ # Changelog +(v0_15)= +## 0.15 (2024-07-18) + +- Support for OpenAI's [new GPT-4o mini](https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/) model: `llm -m gpt-4o-mini 'rave about pelicans in French'` [#536](https://github.com/simonw/llm/issues/536) +- `gpt-4o-mini` is now the default model if you do not {ref}`specify your own default `, replacing GPT-3.5 Turbo. GPT-4o mini is both cheaper and better than GPT-3.5 Turbo. +- Fixed a bug where `llm logs -q 'flourish' -m haiku` could not combine both the `-q` search query and the `-m` model specifier. 
[#515](https://github.com/simonw/llm/issues/515) + (v0_14)= ## 0.14 (2024-05-13) diff --git a/docs/setup.md b/docs/setup.md index 0e955392..3fb20c87 100644 --- a/docs/setup.md +++ b/docs/setup.md @@ -134,14 +134,15 @@ llm 'my prompt' --key $OPENAI_API_KEY You can configure LLM in a number of different ways. +(setup-default-model)= ### Setting a custom default model The model used when calling `llm` without the `-m/--model` option defaults to `gpt-4o-mini` - the fastest and least expensive OpenAI model. -You can use the `llm models default` command to set a different default model. For GPT-4 (slower and more expensive, but more capable) run this: +You can use the `llm models default` command to set a different default model. For GPT-4o (slower and more expensive, but more capable) run this: ```bash -llm models default gpt-4 +llm models default gpt-4o ``` You can view the current model by running this: ``` diff --git a/setup.py b/setup.py index 386de3a6..9d8d8aef 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages import os -VERSION = "0.14" +VERSION = "0.15" def get_long_description(): From 24cc042e59e43b196484002ea5d0c7906da3f185 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 18 Aug 2024 17:01:28 -0700 Subject: [PATCH 033/149] Use self.get_key() in OpenAI Chat class, refs #552 --- llm/default_plugins/openai_models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index 12465477..28e3e9f7 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -339,8 +339,7 @@ def get_client(self): if self.api_engine: kwargs["engine"] = self.api_engine if self.needs_key: - if self.key: - kwargs["api_key"] = self.key + kwargs["api_key"] = self.get_key() else: # OpenAI-compatible models don't need a key, but the # openai client library requires one From 6deed8f97680281433abb158fe023c2bf225d99d Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 18 Aug 2024 17:36:22 -0700 Subject: [PATCH 034/149] get_model() improvement, get_default_model() / set_default_wodel() now documented Refs #553 --- docs/python-api.md | 63 +++++++++++++++++++++++++++++++++++----------- llm/__init__.py | 28 ++++++++++++++++++++- llm/cli.py | 30 +++------------------- tests/test_llm.py | 14 +++++++++++ 4 files changed, 94 insertions(+), 41 deletions(-) diff --git a/docs/python-api.md b/docs/python-api.md index 576f6a15..dccf46bf 100644 --- a/docs/python-api.md +++ b/docs/python-api.md @@ -7,22 +7,25 @@ Understanding this API is also important for writing {ref}`plugins`. ## Basic prompt execution -To run a prompt against the `gpt-3.5-turbo` model, run this: +To run a prompt against the `gpt-4o-mini` model, run this: ```python import llm -model = llm.get_model("gpt-3.5-turbo") -model.key = 'YOUR_API_KEY_HERE' +model = llm.get_model("gpt-4o-mini") +# Optional, you can configure the key in other ways: +model.key = "sk-..." response = model.prompt("Five surprising names for a pet pelican") print(response.text()) ``` -The `llm.get_model()` function accepts model names or aliases - so `chatgpt` would work here too. +The `llm.get_model()` function accepts model names or aliases. You can also omit it to use the currently configured default model, which is `gpt-4o-mini` if you have not changed the default. + +In this example the key is set by Python code. 
You can also provide the key using the `OPENAI_API_KEY` environment variable, or use the `llm keys set openai` command to store it in a `keys.json` file, see {ref}`api-keys`. The `__str__()` method of `response` also returns the text of the response, so you can do this instead: ```python -print(response) +print(llm.get_model().prompt("Five surprising names for a pet pelican")) ``` You can run this command to see a list of available models and their aliases: @@ -52,27 +55,28 @@ response = model.prompt( For models that support options (view those with `llm models --options`) you can pass options as keyword arguments to the `.prompt()` method: ```python -model = llm.get_model("gpt-3.5-turbo") -model.key = "... key here ..." +model = llm.get_model() print(model.prompt("Names for otters", temperature=0.2)) ``` ### Models from plugins -Any models you have installed as plugins will also be available through this mechanism, for example to use Google's PaLM 2 model with [llm-palm](https://github.com/simonw/llm-palm) +Any models you have installed as plugins will also be available through this mechanism, for example to use Anthropic's Claude 3.5 Sonnet model with [llm-claude-3](https://github.com/simonw/llm-claude-3): ```bash -pip install llm-palm +pip install llm-claude-3 ``` +Then in your Python code: ```python import llm -model = llm.get_model("palm") +model = llm.get_model("claude-3.5-sonnet") +# Use this if you have not set the key using 'llm keys set claude': model.key = 'YOUR_API_KEY_HERE' response = model.prompt("Five surprising names for a pet pelican") print(response.text()) ``` -You can omit the `model.key = ` line for models that do not use an API key +Some models do not use API keys at all. ## Streaming responses @@ -94,8 +98,7 @@ LLM supports *conversations*, where you ask follow-up questions of a model as pa To start a new conversation, use the `model.conversation()` method: ```python -model = llm.get_model("gpt-3.5-turbo") -model.key = 'YOUR_API_KEY_HERE' +model = llm.get_model() conversation = model.conversation() ``` You can then use the `conversation.prompt()` method to execute prompts against this conversation: @@ -124,7 +127,7 @@ The `llm.set_alias()` function can be used to define a new alias: ```python import llm -llm.set_alias("turbo", "gpt-3.5-turbo") +llm.set_alias("mini", "gpt-4o-mini") ``` The second argument can be a model identifier or another alias, in which case that alias will be resolved. @@ -141,3 +144,35 @@ import llm llm.remove_alias("turbo") ``` + +### set_default_model(alias) + +This sets the default model to the given model ID or alias. Any changes to defaults will be persisted in the LLM configuration folder, and will affect all programs using LLM on the system, including the `llm` CLI tool. + +```python +import llm + +llm.set_default_model("claude-3.5-sonnet") +``` + +### get_default_model() + +This returns the currently configured default model, or `gpt-4o-mini` if no default has been set. + +```python +import llm + +model_id = llm.get_default_model() +``` + +To detect if no default has been set you can use this pattern: + +```python +if llm.get_default_model(default=None) is None: + print("No default has been set") +``` +Here the `default=` parameter specifies the value that should be returned if there is no configured default. + +### set_default_embedding_model(alias) and get_default_embedding_model() + +These two methods work the same as `set_default_model()` and `get_default_model()` but for the default {ref}`embedding model ` instead. 
\ No newline at end of file diff --git a/llm/__init__.py b/llm/__init__.py index 6b96a93f..9e8afacb 100644 --- a/llm/__init__.py +++ b/llm/__init__.py @@ -38,6 +38,7 @@ "ModelError", "NeedsKeyException", ] +DEFAULT_MODEL = "gpt-4o-mini" def get_plugins(all=False): @@ -144,8 +145,9 @@ class UnknownModelError(KeyError): pass -def get_model(name): +def get_model(name: Optional[str] = None) -> Model: aliases = get_model_aliases() + name = name or get_default_model() try: return aliases[name] except KeyError: @@ -256,3 +258,27 @@ def cosine_similarity(a, b): magnitude_a = sum(x * x for x in a) ** 0.5 magnitude_b = sum(x * x for x in b) ** 0.5 return dot_product / (magnitude_a * magnitude_b) + + +def get_default_model(filename="default_model.txt", default=DEFAULT_MODEL): + path = user_dir() / filename + if path.exists(): + return path.read_text().strip() + else: + return default + + +def set_default_model(model, filename="default_model.txt"): + path = user_dir() / filename + if model is None and path.exists(): + path.unlink() + else: + path.write_text(model) + + +def get_default_embedding_model(): + return get_default_model("default_embedding_model.txt", None) + + +def set_default_embedding_model(model): + set_default_model(model, "default_embedding_model.txt") diff --git a/llm/cli.py b/llm/cli.py index 96034ca7..a1b14576 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -10,6 +10,8 @@ Template, UnknownModelError, encode, + get_default_model, + get_default_embedding_model, get_embedding_models_with_aliases, get_embedding_model_aliases, get_embedding_model, @@ -20,6 +22,8 @@ get_models_with_aliases, user_dir, set_alias, + set_default_model, + set_default_embedding_model, remove_alias, ) @@ -41,8 +45,6 @@ warnings.simplefilter("ignore", ResourceWarning) -DEFAULT_MODEL = "gpt-4o-mini" - DEFAULT_TEMPLATE = "prompt: " @@ -1574,30 +1576,6 @@ def _truncate_string(s, max_length=100): return s -def get_default_model(filename="default_model.txt", default=DEFAULT_MODEL): - path = user_dir() / filename - if path.exists(): - return path.read_text().strip() - else: - return default - - -def set_default_model(model, filename="default_model.txt"): - path = user_dir() / filename - if model is None and path.exists(): - path.unlink() - else: - path.write_text(model) - - -def get_default_embedding_model(): - return get_default_model("default_embedding_model.txt", None) - - -def set_default_embedding_model(model): - set_default_model(model, "default_embedding_model.txt") - - def logs_db_path(): return user_dir() / "logs.db" diff --git a/tests/test_llm.py b/tests/test_llm.py index eb418cae..c303061d 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -5,6 +5,7 @@ from llm.migrations import migrate import json import os +import pathlib import pytest import re import sqlite_utils @@ -556,3 +557,16 @@ def test_llm_user_dir(tmpdir, monkeypatch): user_dir2 = llm.user_dir() assert user_dir == str(user_dir2) assert os.path.exists(user_dir) + + +def test_model_defaults(tmpdir, monkeypatch): + user_dir = str(tmpdir / "u") + monkeypatch.setenv("LLM_USER_PATH", user_dir) + config_path = pathlib.Path(user_dir) / "default_model.txt" + assert not config_path.exists() + assert llm.get_default_model() == "gpt-4o-mini" + assert llm.get_model().model_id == "gpt-4o-mini" + llm.set_default_model("gpt-4o") + assert config_path.exists() + assert llm.get_default_model() == "gpt-4o" + assert llm.get_model().model_id == "gpt-4o" From e867e13d1bb06d14fca5176e8bfe4dae2db000a3 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 18 Aug 2024 
17:39:09 -0700 Subject: [PATCH 035/149] Use model_dump() instead of dict() Closes #554 --- llm/default_plugins/openai_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index 28e3e9f7..e8ca566d 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -325,7 +325,7 @@ def execute(self, prompt, stream, response, conversation=None): stream=False, **kwargs, ) - response.response_json = remove_dict_none_values(completion.dict()) + response.response_json = remove_dict_none_values(completion.model_dump()) yield completion.choices[0].message.content def get_client(self): @@ -412,7 +412,7 @@ def execute(self, prompt, stream, response, conversation=None): stream=False, **kwargs, ) - response.response_json = remove_dict_none_values(completion.dict()) + response.response_json = remove_dict_none_values(completion.model_dump()) yield completion.choices[0].text From 7d6ece2a31f15bcd2d2302b7aed2fd269fdd3b91 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 25 Aug 2024 18:03:46 -0700 Subject: [PATCH 036/149] Fix for broken markdown on openai-models page Refs #558 !stable-docs --- docs/openai-models.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/openai-models.md b/docs/openai-models.md index a5c642d9..9d9a8daa 100644 --- a/docs/openai-models.md +++ b/docs/openai-models.md @@ -28,7 +28,7 @@ from click.testing import CliRunner from llm.cli import cli result = CliRunner().invoke(cli, ["models", "list"]) models = [line for line in result.output.split("\n") if line.startswith("OpenAI ")] -cog.out("```\n{}```".format("\n".join(models))) +cog.out("```\n{}\n```".format("\n".join(models))) ]]] --> ``` OpenAI Chat: gpt-3.5-turbo (aliases: 3.5, chatgpt) @@ -41,7 +41,8 @@ OpenAI Chat: gpt-4-turbo-2024-04-09 OpenAI Chat: gpt-4-turbo (aliases: gpt-4-turbo-preview, 4-turbo, 4t) OpenAI Chat: gpt-4o (aliases: 4o) OpenAI Chat: gpt-4o-mini (aliases: 4o-mini) -OpenAI Completion: gpt-3.5-turbo-instruct (aliases: 3.5-instruct, chatgpt-instruct)``` +OpenAI Completion: gpt-3.5-turbo-instruct (aliases: 3.5-instruct, chatgpt-instruct) +``` See [the OpenAI models documentation](https://platform.openai.com/docs/models) for details of each of these. From 50520c7c1cb4e3f9353a96cb33cc8b9d18fc0ef8 Mon Sep 17 00:00:00 2001 From: Kian-Meng Ang Date: Sun, 8 Sep 2024 23:44:43 +0800 Subject: [PATCH 037/149] Fix typos (#567) Found via `codespell -H -L wit,thre` !stable-docs --- docs/changelog.md | 2 +- docs/plugins/tutorial-model-plugin.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index c3ed2b2e..f8e38b4c 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -177,7 +177,7 @@ To create embeddings for every JPEG in a directory stored in a `photos` collecti llm install llm-clip llm embed-multi photos --files photos/ '*.jpg' --binary -m clip ``` -Now you can search for photos of racoons using: +Now you can search for photos of raccoons using: ``` llm similar photos -c 'raccoon' ``` diff --git a/docs/plugins/tutorial-model-plugin.md b/docs/plugins/tutorial-model-plugin.md index a2f78df7..ff9c17fb 100644 --- a/docs/plugins/tutorial-model-plugin.md +++ b/docs/plugins/tutorial-model-plugin.md @@ -135,7 +135,7 @@ We can try that out by pasting it into the interactive Python interpreter and ru To execute the model, we start with a word. 
We look at the options for words that might come next and pick one of those at random. Then we repeat that process until we have produced the desired number of output words. -Some words might not have any following words from our training sentence. For our implementation we wil fall back on picking a random word from our collection. +Some words might not have any following words from our training sentence. For our implementation we will fall back on picking a random word from our collection. We will implement this as a [Python generator](https://realpython.com/introduction-to-python-generators/), using the yield keyword to produce each token: ```python From bfcfd2c91b5396d2aa23e9d2474806b63f40ce86 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 12 Sep 2024 16:08:04 -0700 Subject: [PATCH 038/149] o1-preview and o1-mini, refs #570 (#573) --- docs/openai-models.md | 2 ++ docs/usage.md | 20 ++++++++++++++++++++ llm/default_plugins/openai_models.py | 10 +++++++++- 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/docs/openai-models.md b/docs/openai-models.md index 9d9a8daa..d0fc6fcd 100644 --- a/docs/openai-models.md +++ b/docs/openai-models.md @@ -41,6 +41,8 @@ OpenAI Chat: gpt-4-turbo-2024-04-09 OpenAI Chat: gpt-4-turbo (aliases: gpt-4-turbo-preview, 4-turbo, 4t) OpenAI Chat: gpt-4o (aliases: 4o) OpenAI Chat: gpt-4o-mini (aliases: 4o-mini) +OpenAI Chat: o1-preview +OpenAI Chat: o1-mini OpenAI Completion: gpt-3.5-turbo-instruct (aliases: 3.5-instruct, chatgpt-instruct) ``` diff --git a/docs/usage.md b/docs/usage.md index 4d6ef5bd..005a1690 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -345,6 +345,26 @@ OpenAI Chat: gpt-4o-mini (aliases: 4o-mini) logit_bias: dict, str seed: int json_object: boolean +OpenAI Chat: o1-preview + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean +OpenAI Chat: o1-mini + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean OpenAI Completion: gpt-3.5-turbo-instruct (aliases: 3.5-instruct, chatgpt-instruct) temperature: float What sampling temperature to use, between 0 and 2. 
Higher values like diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index e8ca566d..657c0d20 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -35,6 +35,9 @@ def register_models(register): # GPT-4o register(Chat("gpt-4o"), aliases=("4o",)) register(Chat("gpt-4o-mini"), aliases=("4o-mini",)) + # o1 + register(Chat("o1-preview", can_stream=False, allows_system_prompt=False)) + register(Chat("o1-mini", can_stream=False, allows_system_prompt=False)) # The -instruct completion model register( Completion("gpt-3.5-turbo-instruct", default_max_tokens=256), @@ -248,7 +251,6 @@ def validate_logit_bias(cls, logit_bias): class Chat(Model): needs_key = "openai" key_env_var = "OPENAI_API_KEY" - can_stream: bool = True default_max_tokens = None @@ -268,6 +270,8 @@ def __init__( api_version=None, api_engine=None, headers=None, + can_stream=True, + allows_system_prompt=True, ): self.model_id = model_id self.key = key @@ -277,12 +281,16 @@ def __init__( self.api_version = api_version self.api_engine = api_engine self.headers = headers + self.can_stream = can_stream + self.allows_system_prompt = allows_system_prompt def __str__(self): return "OpenAI Chat: {}".format(self.model_id) def execute(self, prompt, stream, response, conversation=None): messages = [] + if prompt.system and not self.allows_system_prompt: + raise NotImplementedError("Model does not support system prompts") current_system = None if conversation is not None: for prev_response in conversation.responses: From 38239839aec306f132034b49cbc0a227f9c76e83 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 12 Sep 2024 16:09:46 -0700 Subject: [PATCH 039/149] Release 0.16 Refs #552, #553, #554, #558, #567, #570, #573 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9d8d8aef..1f6adcd7 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages import os -VERSION = "0.15" +VERSION = "0.16" def get_long_description(): From d654c9521235a737e59a4f1d77cf4682589123ec Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 12 Sep 2024 16:18:46 -0700 Subject: [PATCH 040/149] Release notes for 0.16 --- docs/changelog.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/changelog.md b/docs/changelog.md index f8e38b4c..161317cb 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,16 @@ # Changelog +(v0_16)= +## 0.16 (2024-09-12) + +- OpenAI models now use the internal `self.get_key()` mechanism, which means they can be used from Python code in a way that will pick up keys that have been configured using `llm keys set` or the `OPENAI_API_KEY` environment variable. [#552](https://github.com/simonw/llm/issues/552). This code now works correctly: + ```python + import llm + print(llm.get_model("gpt-4o-mini").prompt("hi")) + ``` +- New documented API methods: `llm.get_default_model()`, `llm.set_default_model(alias)`, `llm.get_default_embedding_model(alias)`, `llm.set_default_embedding_model()`. [#553](https://github.com/simonw/llm/issues/553) +- Support for OpenAI's new [o1 family](https://openai.com/o1/) of preview models, `llm -m o1-preview "prompt"` and `llm -m o1-mini "prompt"`. These models are currently only available to [tier 5](https://platform.openai.com/docs/guides/rate-limits/usage-tiers?context=tier-five) OpenAI API users, though this may change in the future. 
[#570](https://github.com/simonw/llm/issues/570) + (v0_15)= ## 0.15 (2024-07-18) From 7e6031e3821f602ebcb503113501446d6f7ee930 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sat, 26 Oct 2024 22:44:06 -0700 Subject: [PATCH 041/149] llm-gguf, llm-jq !stable-docs --- docs/plugins/directory.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/plugins/directory.md b/docs/plugins/directory.md index af0c88d6..22438d6a 100644 --- a/docs/plugins/directory.md +++ b/docs/plugins/directory.md @@ -7,7 +7,8 @@ The following plugins are available for LLM. Here's {ref}`how to install them ` to execute or `ctrl+c` to cancel. - **[llm-python](https://github.com/simonw/llm-python)** adds a `llm python` command for running a Python interpreter in the same virtual environment as LLM. This is useful for debugging, and also provides a convenient way to interact with the LLM {ref}`python-api` if you installed LLM using Homebrew or `pipx`. - **[llm-cluster](https://github.com/simonw/llm-cluster)** adds a `llm cluster` command for calculating clusters for a collection of embeddings. Calculated clusters can then be passed to a Large Language Model to generate a summary description. +- **[llm-jq](https://github.com/simonw/llm-jq)** lets you pipe in JSON data and a prompt describing a `jq` program, then executes the generated program against the JSON. ## Just for fun From a466ddf3cd4a6157f8674cc1ba0205de6df35c5a Mon Sep 17 00:00:00 2001 From: Andrew Wason Date: Sun, 27 Oct 2024 14:26:47 -0400 Subject: [PATCH 042/149] Fix broken tests. Drop python 3.8. (#580) * Fix broken tests. Drop python 3.8. * Test on Python 3.13 too --------- Co-authored-by: Simon Willison --- .github/workflows/publish.yml | 4 ++-- .github/workflows/test.yml | 2 +- setup.py | 2 +- tests/test_keys.py | 3 +++ tests/test_templates.py | 8 +++++--- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 43654d6b..1b63d02c 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} @@ -38,7 +38,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.12' + python-version: '3.13' cache: pip cache-dependency-path: setup.py - name: Install dependencies diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1884fc9a..1a4d3926 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -11,7 +11,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] pydantic: ["==1.10.2", ">=2.0.0"] steps: - uses: actions/checkout@v4 diff --git a/setup.py b/setup.py index 1f6adcd7..bc77c22b 100644 --- a/setup.py +++ b/setup.py @@ -63,5 +63,5 @@ def get_long_description(): "types-setuptools", ] }, - python_requires=">=3.8", + python_requires=">=3.9", ) diff --git a/tests/test_keys.py b/tests/test_keys.py index 5c1d7d5a..5a5649a0 100644 --- a/tests/test_keys.py +++ b/tests/test_keys.py @@ -52,6 +52,9 @@ def test_keys_list(monkeypatch, tmpdir, args): assert result2.output.strip() == "openai" +@pytest.mark.httpx_mock( + assert_all_requests_were_expected=False, 
can_send_already_matched_responses=True +) def test_uses_correct_key(mocked_openai_chat, monkeypatch, tmpdir): user_dir = tmpdir / "user-dir" pathlib.Path(user_dir).mkdir() diff --git a/tests/test_templates.py b/tests/test_templates.py index 6289c38b..57c8f836 100644 --- a/tests/test_templates.py +++ b/tests/test_templates.py @@ -133,19 +133,21 @@ def test_templates_prompt_save(templates_path, args, expected_prompt, expected_e "Summarize this: Input text", None, ), - ( + pytest.param( "boo", ["-s", "s"], None, None, "Error: Cannot use -t/--template and --system together", + marks=pytest.mark.httpx_mock(assert_all_responses_were_requested=False), ), - ( + pytest.param( "prompt: 'Say $hello'", [], None, None, "Error: Missing variables: hello", + marks=pytest.mark.httpx_mock(assert_all_responses_were_requested=False), ), ( "prompt: 'Say $hello'", @@ -183,4 +185,4 @@ def test_template_basic( else: assert result.exit_code == 1 assert result.output.strip() == expected_error - mocked_openai_chat.reset(assert_all_responses_were_requested=False) + mocked_openai_chat.reset() From 6df00f92ffaf9333ed829ca0566043c1f5f7e225 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sat, 26 Oct 2024 17:40:23 -0700 Subject: [PATCH 043/149] First working prototype of new attachments feature, refs #587 --- llm/__init__.py | 1 + llm/cli.py | 84 ++++++++++++++++++++++++- llm/default_plugins/openai_models.py | 27 +++++++- llm/models.py | 94 +++++++++++++++++++++++++--- setup.py | 1 + 5 files changed, 193 insertions(+), 14 deletions(-) diff --git a/llm/__init__.py b/llm/__init__.py index 9e8afacb..f76e2728 100644 --- a/llm/__init__.py +++ b/llm/__init__.py @@ -4,6 +4,7 @@ NeedsKeyException, ) from .models import ( + Attachment, Conversation, Model, ModelWithAliases, diff --git a/llm/cli.py b/llm/cli.py index a1b14576..33e14f09 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -4,6 +4,7 @@ import io import json from llm import ( + Attachment, Collection, Conversation, Response, @@ -30,7 +31,9 @@ from .migrations import migrate from .plugins import pm import base64 +import httpx import pathlib +import puremagic import pydantic import readline from runpy import run_module @@ -48,6 +51,54 @@ DEFAULT_TEMPLATE = "prompt: " +class AttachmentType(click.ParamType): + name = "attachment" + + def convert(self, value, param, ctx): + if value == "-": + content = sys.stdin.buffer.read() + # Try to guess type + try: + mimetype = puremagic.from_string(content, mime=True) + except puremagic.PureError: + raise click.BadParameter("Could not determine mimetype of stdin") + return Attachment(mimetype, None, None, content) + if "://" in value: + # Confirm URL exists and try to guess type + try: + response = httpx.head(value) + response.raise_for_status() + mimetype = response.headers.get("content-type") + except httpx.HTTPError as ex: + raise click.BadParameter(str(ex)) + return Attachment(mimetype, None, value, None) + # Check that the file exists + path = pathlib.Path(value) + if not path.exists(): + self.fail(f"File {value} does not exist", param, ctx) + # Try to guess type + mimetype = puremagic.from_file(str(path), mime=True) + return Attachment(mimetype, str(path), None, None) + + +def attachment_types_callback(ctx, param, values): + collected = [] + for value, mimetype in values: + if "://" in value: + attachment = Attachment(mimetype, None, value, None) + elif value == "-": + content = sys.stdin.buffer.read() + attachment = Attachment(mimetype, None, None, content) + else: + # Look for file + path = pathlib.Path(value) + if not 
path.exists(): + raise click.BadParameter(f"File {value} does not exist") + attachment = Attachment(mimetype, str(path), None, None) + collected.append(attachment) + return collected + + def _validate_metadata_json(ctx, param, value): if value is None: return value @@ -88,6 +139,23 @@ def cli(): @click.argument("prompt", required=False) @click.option("-s", "--system", help="System prompt to use") @click.option("model_id", "-m", "--model", help="Model to use") +@click.option( + "attachments", + "-a", + "--attachment", + type=AttachmentType(), + multiple=True, + help="Attachment path or URL or -", +) +@click.option( + "attachment_types", + "--at", + "--attachment-type", + type=(str, str), + multiple=True, + callback=attachment_types_callback, + help="Attachment with explicit mimetype", +) @click.option( "options", "-o", @@ -127,6 +195,8 @@ def prompt( prompt, system, model_id, + attachments, + attachment_types, options, template, param, @@ -142,6 +212,14 @@ def prompt( Execute a prompt Documentation: https://llm.datasette.io/en/stable/usage.html + + Examples: + + \b + llm 'Capital of France?' + llm 'Capital of France?' -m gpt-4o + llm 'Capital of France?' -s 'answer in Spanish' + llm 'Extract text from this image' -a image.jpg """ if log and no_log: raise click.ClickException("--log and --no-log are mutually exclusive") @@ -262,6 +340,8 @@ def read_prompt(): except pydantic.ValidationError as ex: raise click.ClickException(render_errors(ex.errors())) + resolved_attachments = [*attachments, *attachment_types] + should_stream = model.can_stream and not no_stream if not should_stream: validated_options["stream"] = False @@ -273,7 +353,9 @@ def read_prompt(): prompt_method = conversation.prompt try: - response = prompt_method(prompt, system, **validated_options) + response = prompt_method( + prompt, *resolved_attachments, system=system, **validated_options + ) if should_stream: for chunk in response: print(chunk, end="") diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index 657c0d20..913e7545 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -33,8 +33,8 @@ def register_models(register): register(Chat("gpt-4-turbo-2024-04-09")) register(Chat("gpt-4-turbo"), aliases=("gpt-4-turbo-preview", "4-turbo", "4t")) # GPT-4o - register(Chat("gpt-4o"), aliases=("4o",)) - register(Chat("gpt-4o-mini"), aliases=("4o-mini",)) + register(Chat("gpt-4o", vision=True), aliases=("4o",)) + register(Chat("gpt-4o-mini", vision=True), aliases=("4o-mini",)) # o1 register(Chat("o1-preview", can_stream=False, allows_system_prompt=False)) register(Chat("o1-mini", can_stream=False, allows_system_prompt=False)) @@ -271,6 +271,7 @@ def __init__( api_engine=None, headers=None, can_stream=True, + vision=False, allows_system_prompt=True, ): self.model_id = model_id @@ -282,8 +283,17 @@ def __init__( self.api_engine = api_engine self.headers = headers self.can_stream = can_stream + self.vision = vision self.allows_system_prompt = allows_system_prompt + if vision: + self.attachment_types = { + "image/png", + "image/jpeg", + "image/webp", + "image/gif", + } + def __str__(self): return "OpenAI Chat: {}".format(self.model_id) @@ -308,7 +318,18 @@ def execute(self, prompt, stream, response, conversation=None): messages.append({"role": "assistant", "content": prev_response.text()}) if prompt.system and prompt.system != current_system: messages.append({"role": "system", "content": prompt.system}) - messages.append({"role": "user", "content": 
prompt.prompt}) + if not prompt.attachments: + messages.append({"role": "user", "content": prompt.prompt}) + else: + vision_message = [{"type": "text", "text": prompt.prompt}] + for attachment in prompt.attachments: + url = attachment.url + if not url: + base64_image = attachment.base64_content() + url = f"data:{attachment.resolve_type()};base64,{base64_image}" + vision_message.append({"type": "image_url", "image_url": {"url": url}}) + messages.append({"role": "user", "content": vision_message}) + response._prompt_json = {"messages": messages} kwargs = self.build_kwargs(prompt) client = self.get_client() diff --git a/llm/models.py b/llm/models.py index 0e47bb60..77bdb8e9 100644 --- a/llm/models.py +++ b/llm/models.py @@ -1,7 +1,10 @@ +import base64 from dataclasses import dataclass, field import datetime from .errors import NeedsKeyException +import httpx from itertools import islice +import puremagic import re import time from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Union @@ -13,17 +16,52 @@ CONVERSATION_NAME_LENGTH = 32 +@dataclass +class Attachment: + type: Optional[str] = None + path: Optional[str] = None + url: Optional[str] = None + content: Optional[bytes] = None + + def resolve_type(self): + if self.type: + return self.type + # Derive it from path or url or content + if self.path: + return puremagic.from_file(self.path, mime=True) + if self.url: + return puremagic.from_url(self.url, mime=True) + if self.content: + return puremagic.from_string(self.content, mime=True) + raise ValueError("Attachment has no type and no content to derive it from") + + def base64_content(self): + content = self.content + if not content: + if self.path: + content = open(self.path, "rb").read() + elif self.url: + response = httpx.get(self.url) + response.raise_for_status() + content = response.content + return base64.b64encode(content).decode("utf-8") + + @dataclass class Prompt: prompt: str model: "Model" - system: Optional[str] - prompt_json: Optional[str] - options: "Options" + attachments: Optional[List[Attachment]] = field(default_factory=list) + system: Optional[str] = None + prompt_json: Optional[str] = None + options: "Options" = field(default_factory=dict) - def __init__(self, prompt, model, system=None, prompt_json=None, options=None): + def __init__( + self, prompt, model, attachments, system=None, prompt_json=None, options=None + ): self.prompt = prompt self.model = model + self.attachments = list(attachments) self.system = system self.prompt_json = prompt_json self.options = options or {} @@ -39,6 +77,7 @@ class Conversation: def prompt( self, prompt: Optional[str], + *attachments: Attachment, system: Optional[str] = None, stream: bool = True, **options @@ -46,8 +85,9 @@ def prompt( return Response( Prompt( prompt, - system=system, model=self.model, + attachments=attachments, + system=system, options=self.model.Options(**options), ), self.model, @@ -158,14 +198,22 @@ def log_to_db(self, db): db["responses"].insert(response) @classmethod - def fake(cls, model: "Model", prompt: str, system: str, response: str): + def fake( + cls, + model: "Model", + prompt: str, + *attachments: List[Attachment], + system: str, + response: str + ): "Utility method to help with writing tests" response_obj = cls( model=model, prompt=Prompt( prompt, - system=system, model=model, + attachments=attachments, + system=system, ), stream=False, ) @@ -183,8 +231,9 @@ def from_row(cls, row): model=model, prompt=Prompt( prompt=row["prompt"], - system=row["system"], model=model, + attachments=[], + 
system=row["system"], options=model.Options(**json.loads(row["options_json"])), ), stream=False, @@ -242,10 +291,15 @@ def get_key(self): class Model(ABC, _get_key_mixin): model_id: str + + # API key handling key: Optional[str] = None needs_key: Optional[str] = None key_env_var: Optional[str] = None + + # Model characteristics can_stream: bool = False + attachment_types = set() class Options(_Options): pass @@ -269,13 +323,33 @@ def execute( def prompt( self, - prompt: Optional[str], + prompt: str, + *attachments: Attachment, system: Optional[str] = None, stream: bool = True, **options ): + # Validate attachments + if attachments and not self.attachment_types: + raise ValueError( + "This model does not support attachments, but some were provided" + ) + for attachment in attachments: + attachment_type = attachment.resolve_type() + if attachment_type not in self.attachment_types: + raise ValueError( + "This model does not support attachments of type '{}', only {}".format( + attachment_type, ", ".join(self.attachment_types) + ) + ) return self.response( - Prompt(prompt, system=system, model=self, options=self.Options(**options)), + Prompt( + prompt, + attachments=attachments, + system=system, + model=self, + options=self.Options(**options), + ), stream=stream, ) diff --git a/setup.py b/setup.py index bc77c22b..188d5df4 100644 --- a/setup.py +++ b/setup.py @@ -48,6 +48,7 @@ def get_long_description(): "setuptools", "pip", "pyreadline3; sys_platform == 'win32'", + "puremagic", ], extras_require={ "test": [ From c0fe719df65232665b071abd29f2df26e99c8b9d Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sat, 26 Oct 2024 19:48:57 -0700 Subject: [PATCH 044/149] Store prompt attachments in attachments and prompt_attachments tables Refs https://github.com/simonw/llm/issues/587#issuecomment-2439791231 --- llm/cli.py | 2 ++ llm/migrations.py | 26 ++++++++++++++++++++++++++ llm/models.py | 35 ++++++++++++++++++++++++++++++++++- 3 files changed, 62 insertions(+), 1 deletion(-) diff --git a/llm/cli.py b/llm/cli.py index 33e14f09..454e56ca 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -76,6 +76,7 @@ def convert(self, value, param, ctx): path = pathlib.Path(value) if not path.exists(): self.fail(f"File {value} does not exist", param, ctx) + path = path.resolve() # Try to guess type mimetype = puremagic.from_file(str(path), mime=True) return Attachment(mimetype, str(path), None, None) @@ -94,6 +95,7 @@ def attachment_types_callback(ctx, param, values): path = pathlib.Path(value) if not path.exists(): raise click.BadParameter(f"File {value} does not exist") + path = path.resolve() attachment = Attachment(mimetype, str(path), None, None) collected.append(attachment) return collected diff --git a/llm/migrations.py b/llm/migrations.py index 008ae976..91da6429 100644 --- a/llm/migrations.py +++ b/llm/migrations.py @@ -201,3 +201,29 @@ def m010_create_new_log_tables(db): @migration def m011_fts_for_responses(db): db["responses"].enable_fts(["prompt", "response"], create_triggers=True) + + +@migration +def m012_attachments_tables(db): + db["attachments"].create( + { + "id": str, + "type": str, + "path": str, + "url": str, + "content": bytes, + }, + pk="id", + ) + db["prompt_attachments"].create( + { + "response_id": str, + "attachment_id": str, + "order": int, + }, + foreign_keys=( + ("response_id", "responses", "id"), + ("attachment_id", "attachments", "id"), + ), + pk=("response_id", "attachment_id"), + ) diff --git a/llm/models.py b/llm/models.py index 77bdb8e9..3b7d4dad 100644 --- a/llm/models.py +++ 
b/llm/models.py @@ -2,6 +2,7 @@ from dataclasses import dataclass, field import datetime from .errors import NeedsKeyException +import hashlib import httpx from itertools import islice import puremagic @@ -23,6 +24,17 @@ class Attachment: url: Optional[str] = None content: Optional[bytes] = None + def hash_id(self): + # Hash of the binary content, or of '{"url": "https://..."}' for URL attachments + if self.content: + return hashlib.sha256(self.content).hexdigest() + elif self.path: + return hashlib.sha256(open(self.path, "rb").read()).hexdigest() + else: + return hashlib.sha256( + json.dumps({"url": self.url}).encode("utf-8") + ).hexdigest() + def resolve_type(self): if self.type: return self.type @@ -178,8 +190,9 @@ def log_to_db(self, db): }, ignore=True, ) + response_id = str(ULID()).lower() response = { - "id": str(ULID()).lower(), + "id": response_id, "model": self.model.model_id, "prompt": self.prompt.prompt, "system": self.prompt.system, @@ -196,6 +209,26 @@ def log_to_db(self, db): "datetime_utc": self.datetime_utc(), } db["responses"].insert(response) + # Persist any attachments - loop through with index + for index, attachment in enumerate(self.prompt.attachments): + attachment_id = attachment.hash_id() + db["attachments"].insert( + { + "id": attachment_id, + "type": attachment.resolve_type(), + "path": attachment.path, + "url": attachment.url, + "content": attachment.content, + }, + replace=True, + ) + db["prompt_attachments"].insert( + { + "response_id": response_id, + "attachment_id": attachment_id, + "order": index, + }, + ) @classmethod def fake( From dff5b456fdbcd17dc7159a9e0d9ca69e6e7e9f61 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 28 Oct 2024 12:16:54 -0700 Subject: [PATCH 045/149] Got llm --continue to work with images, refs #587 --- llm/cli.py | 6 ++-- llm/default_plugins/openai_models.py | 28 ++++++++++++---- llm/models.py | 48 +++++++++++++++++++++------- 3 files changed, 62 insertions(+), 20 deletions(-) diff --git a/llm/cli.py b/llm/cli.py index 454e56ca..53087ade 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -62,7 +62,7 @@ def convert(self, value, param, ctx): mimetype = puremagic.from_string(content, mime=True) except puremagic.PureError: raise click.BadParameter("Could not determine mimetype of stdin") - return Attachment(mimetype, None, None, content) + return Attachment(type=mimetype, path=None, url=None, content=content) if "://" in value: # Confirm URL exists and try to guess type try: @@ -79,7 +79,7 @@ def convert(self, value, param, ctx): path = path.resolve() # Try to guess type mimetype = puremagic.from_file(str(path), mime=True) - return Attachment(mimetype, str(path), None, None) + return Attachment(type=mimetype, path=str(path), url=None, content=None) def attachment_types_callback(ctx, param, values): @@ -552,7 +552,7 @@ def load_conversation(conversation_id: Optional[str]) -> Optional[Conversation]: for response in db["responses"].rows_where( "conversation_id = ?", [conversation_id] ): - conversation.responses.append(Response.from_row(response)) + conversation.responses.append(Response.from_row(db, response)) return conversation diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index 913e7545..651d37f1 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -312,23 +312,39 @@ def execute(self, prompt, stream, response, conversation=None): {"role": "system", "content": prev_response.prompt.system} ) current_system = prev_response.prompt.system - 
messages.append( - {"role": "user", "content": prev_response.prompt.prompt} - ) + if prev_response.attachments: + attachment_message = [ + {"type": "text", "text": prev_response.prompt.prompt} + ] + for attachment in prev_response.attachments: + url = attachment.url + if not url: + base64_image = attachment.base64_content() + url = f"data:{attachment.resolve_type()};base64,{base64_image}" + attachment_message.append( + {"type": "image_url", "image_url": {"url": url}} + ) + messages.append({"role": "user", "content": attachment_message}) + else: + messages.append( + {"role": "user", "content": prev_response.prompt.prompt} + ) messages.append({"role": "assistant", "content": prev_response.text()}) if prompt.system and prompt.system != current_system: messages.append({"role": "system", "content": prompt.system}) if not prompt.attachments: messages.append({"role": "user", "content": prompt.prompt}) else: - vision_message = [{"type": "text", "text": prompt.prompt}] + attachment_message = [{"type": "text", "text": prompt.prompt}] for attachment in prompt.attachments: url = attachment.url if not url: base64_image = attachment.base64_content() url = f"data:{attachment.resolve_type()};base64,{base64_image}" - vision_message.append({"type": "image_url", "image_url": {"url": url}}) - messages.append({"role": "user", "content": vision_message}) + attachment_message.append( + {"type": "image_url", "image_url": {"url": url}} + ) + messages.append({"role": "user", "content": attachment_message}) response._prompt_json = {"messages": messages} kwargs = self.build_kwargs(prompt) diff --git a/llm/models.py b/llm/models.py index 3b7d4dad..fc58a074 100644 --- a/llm/models.py +++ b/llm/models.py @@ -23,17 +23,20 @@ class Attachment: path: Optional[str] = None url: Optional[str] = None content: Optional[bytes] = None + _id: Optional[str] = None - def hash_id(self): + def id(self): # Hash of the binary content, or of '{"url": "https://..."}' for URL attachments - if self.content: - return hashlib.sha256(self.content).hexdigest() - elif self.path: - return hashlib.sha256(open(self.path, "rb").read()).hexdigest() - else: - return hashlib.sha256( - json.dumps({"url": self.url}).encode("utf-8") - ).hexdigest() + if self._id is None: + if self.content: + self._id = hashlib.sha256(self.content).hexdigest() + elif self.path: + self._id = hashlib.sha256(open(self.path, "rb").read()).hexdigest() + else: + self._id = hashlib.sha256( + json.dumps({"url": self.url}).encode("utf-8") + ).hexdigest() + return self._id def resolve_type(self): if self.type: @@ -58,6 +61,16 @@ def base64_content(self): content = response.content return base64.b64encode(content).decode("utf-8") + @classmethod + def from_row(cls, row): + return cls( + _id=row["id"], + type=row["type"], + path=row["path"], + url=row["url"], + content=row["content"], + ) + @dataclass class Prompt: @@ -211,7 +224,7 @@ def log_to_db(self, db): db["responses"].insert(response) # Persist any attachments - loop through with index for index, attachment in enumerate(self.prompt.attachments): - attachment_id = attachment.hash_id() + attachment_id = attachment.id() db["attachments"].insert( { "id": attachment_id, @@ -255,7 +268,7 @@ def fake( return response_obj @classmethod - def from_row(cls, row): + def from_row(cls, db, row): from llm import get_model model = get_model(row["model"]) @@ -276,6 +289,19 @@ def from_row(cls, row): response.response_json = json.loads(row["response_json"] or "null") response._done = True response._chunks = [row["response"]] + # Attachments + 
response.attachments = [ + Attachment.from_row(arow) + for arow in db.query( + """ + select attachments.* from attachments + join prompt_attachments on attachments.id = prompt_attachments.attachment_id + where prompt_attachments.response_id = ? + order by prompt_attachments."order" + """, + [row["id"]], + ) + ] return response def __repr__(self): From cd722f653bf0d3d43aea5383b3477253be5bfbd7 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 28 Oct 2024 12:35:51 -0700 Subject: [PATCH 046/149] Redact base64 data from _prompt_json, refs #587 --- llm/default_plugins/openai_models.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index 651d37f1..b7b1c465 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -346,7 +346,6 @@ def execute(self, prompt, stream, response, conversation=None): ) messages.append({"role": "user", "content": attachment_message}) - response._prompt_json = {"messages": messages} kwargs = self.build_kwargs(prompt) client = self.get_client() if stream: @@ -372,6 +371,7 @@ def execute(self, prompt, stream, response, conversation=None): ) response.response_json = remove_dict_none_values(completion.model_dump()) yield completion.choices[0].message.content + response._prompt_json = redact_data_urls({"messages": messages}) def get_client(self): kwargs = {} @@ -431,7 +431,6 @@ def execute(self, prompt, stream, response, conversation=None): messages.append(prev_response.prompt.prompt) messages.append(prev_response.text()) messages.append(prompt.prompt) - response._prompt_json = {"messages": messages} kwargs = self.build_kwargs(prompt) client = self.get_client() if stream: @@ -459,6 +458,7 @@ def execute(self, prompt, stream, response, conversation=None): ) response.response_json = remove_dict_none_values(completion.model_dump()) yield completion.choices[0].text + response._prompt_json = redact_data_urls({"messages": messages}) def not_nulls(data) -> dict: @@ -506,3 +506,20 @@ def combine_chunks(chunks: List) -> dict: combined[key] = value return combined + + +def redact_data_urls(input_dict): + """ + Recursively search through the input dictionary for any 'image_url' keys + and modify the 'url' value to be just 'data:...'. + """ + if isinstance(input_dict, dict): + for key, value in input_dict.items(): + if key == "image_url" and isinstance(value, dict) and "url" in value: + value["url"] = "data:..." + else: + redact_data_urls(value) + elif isinstance(input_dict, list): + for item in input_dict: + redact_data_urls(item) + return input_dict From a68af9c8e659f5743f400855a51785c8b45cb89c Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 28 Oct 2024 13:27:28 -0700 Subject: [PATCH 047/149] Don't redact non-data URLs for OpenAI models Refs #587 --- llm/default_plugins/openai_models.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index b7b1c465..bce1fb04 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -515,7 +515,12 @@ def redact_data_urls(input_dict): """ if isinstance(input_dict, dict): for key, value in input_dict.items(): - if key == "image_url" and isinstance(value, dict) and "url" in value: + if ( + key == "image_url" + and isinstance(value, dict) + and "url" in value + and value["url"].startswith("data:") + ): value["url"] = "data:..." 
else: redact_data_urls(value) From 1126393ba1540bddc816a65b58b3b0789e0791c9 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 28 Oct 2024 13:46:06 -0700 Subject: [PATCH 048/149] Docs for writing models that accept attachments, refs #587 --- docs/plugins/advanced-model-plugins.md | 102 +++++++++++++++++++++++++ docs/plugins/index.md | 1 + docs/plugins/tutorial-model-plugin.md | 2 +- llm/models.py | 7 +- 4 files changed, 109 insertions(+), 3 deletions(-) create mode 100644 docs/plugins/advanced-model-plugins.md diff --git a/docs/plugins/advanced-model-plugins.md b/docs/plugins/advanced-model-plugins.md new file mode 100644 index 00000000..fdbdc232 --- /dev/null +++ b/docs/plugins/advanced-model-plugins.md @@ -0,0 +1,102 @@ +(advanced-model-plugins)= +# Advanced model plugins + +The {ref}`model plugin tutorial ` covers the basics of developing a plugin that adds support for a new model. + +This document covers more advanced topics. + +(advanced-model-plugins-attachments)= +## Attachments for multi-modal models + +Models such as GPT-4o, Claude 3.5 Sonnet and Google's Gemini 1.5 are multi-modal: they accept input in the form of images and maybe even audio, video and other formats. + +LLM calls these **attachments**. Models can specify the types of attachments they accept and then implement special code in the `.execute()` method to handle them. + +### Specifying attachment types + +A `Model` subclass can list the types of attachments it accepts by defining a `attachment_types` class attribute: + +```python +class NewModel(llm.Model): + model_id = "new-model" + attachment_types = { + "image/png", + "image/jpeg", + "image/webp", + "image/gif", + } +``` +These content types are detected when an attachment is passed to LLM using `llm -a filename`, or can be specified by the user using the `--attachment-type filename image/png` option. + +**Note:** *MP3 files will have their attachment type detected as `audio/mpeg`, not `audio/mp3`. + +LLM will use the `attachment_types` attribute to validate that provided attachments should be accepted before passing them to the model. + +### Handling attachments + +The `prompt` object passed to the `execute()` method will have an `attachments` attribute containing a list of `Attachment` objects provided by the user. + +An `Attachment` instance has the following properties: + +- `url (str)`: The URL of the attachment, if it was provided as a URL +- `path (str)`: The resolved file path of the attachment, if it was provided as a file +- `type (str)`: The content type of the attachment, if it was provided +- `content (bytes)`: The binary content of the attachment, if it was provided + +Generally only one of `url`, `path` or `content` will be set. + +You should usually access the type and the content through one of these methods: + +- `attachment.resolve_type() -> str`: Returns the `type` if it is available, otherwise attempts to guess the type by looking at the first few bytes of content +- `attachment.content_bytes() -> bytes`: Returns the binary content, which it may need to read from a file or fetch from a URL +- `attachment.base64_content() -> str`: Returns that content as a base64-encoded string + +A `id()` method returns a database ID for this content, which is either a SHA256 hash of the binary content or, in the case of attachments hosted at an external URL, a hash of `{"url": url}` instead. This is an implementation detail which you should not need to access directly. 
+ +Here's how the OpenAI plugin handles attachments: + +```python +messages = [] +if not prompt.attachments: + messages.append({"role": "user", "content": prompt.prompt}) +else: + attachment_message = [{"type": "text", "text": prompt.prompt}] + for attachment in prompt.attachments: + url = attachment.url + if not url: + base64_image = attachment.base64_content() + url = f"data:{attachment.resolve_type()};base64,{base64_image}" + attachment_message.append( + {"type": "image_url", "image_url": {"url": url}} + ) + messages.append({"role": "user", "content": attachment_message}) +``` +As you can see, it uses `attachment.url` if that is available and otherwise falls back to using the `base64_content()` method to embed the image directly in the JSON sent to the API. + +### Attachments from previous conversations + +Models that implement the ability to continue a conversation can reconstruct the previous message JSON using the `response.attachments` attribute. + +Here's how the OpenAI plugin does that: + +```python +for prev_response in conversation.responses: + if prev_response.attachments: + attachment_message = [ + {"type": "text", "text": prev_response.prompt.prompt} + ] + for attachment in prev_response.attachments: + url = attachment.url + if not url: + base64_image = attachment.base64_content() + url = f"data:{attachment.resolve_type()};base64,{base64_image}" + attachment_message.append( + {"type": "image_url", "image_url": {"url": url}} + ) + messages.append({"role": "user", "content": attachment_message}) + else: + messages.append( + {"role": "user", "content": prev_response.prompt.prompt} + ) + messages.append({"role": "assistant", "content": prev_response.text()}) +``` diff --git a/docs/plugins/index.md b/docs/plugins/index.md index 96ae62fd..2a08844d 100644 --- a/docs/plugins/index.md +++ b/docs/plugins/index.md @@ -17,5 +17,6 @@ installing-plugins directory plugin-hooks tutorial-model-plugin +advanced-model-plugins plugin-utilities ``` diff --git a/docs/plugins/tutorial-model-plugin.md b/docs/plugins/tutorial-model-plugin.md index ff9c17fb..6f1bcbbc 100644 --- a/docs/plugins/tutorial-model-plugin.md +++ b/docs/plugins/tutorial-model-plugin.md @@ -1,5 +1,5 @@ (tutorial-model-plugin)= -# Writing a plugin to support a new model +# Model plugin tutorial This tutorial will walk you through developing a new plugin for LLM that adds support for a new Large Language Model. 
diff --git a/llm/models.py b/llm/models.py index fc58a074..06440eed 100644 --- a/llm/models.py +++ b/llm/models.py @@ -50,7 +50,7 @@ def resolve_type(self): return puremagic.from_string(self.content, mime=True) raise ValueError("Attachment has no type and no content to derive it from") - def base64_content(self): + def content_bytes(self): content = self.content if not content: if self.path: @@ -59,7 +59,10 @@ def base64_content(self): response = httpx.get(self.url) response.raise_for_status() content = response.content - return base64.b64encode(content).decode("utf-8") + return content + + def base64_content(self): + return base64.b64encode(self.content_bytes()).decode("utf-8") @classmethod def from_row(cls, row): From bb5b802d4fa0f5ae0e0be9fb655ef4ae04035503 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 28 Oct 2024 14:06:22 -0700 Subject: [PATCH 049/149] llm logs markdown support for attachments, refs #587 --- llm/cli.py | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/llm/cli.py b/llm/cli.py index 53087ade..c15e3ac0 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -695,6 +695,21 @@ def logs_turn_off(): order by responses_fts.rank desc{limit} """ +ATTACHMENTS_SQL = """ +select + response_id, + attachments.id, + attachments.type, + attachments.path, + attachments.url, + length(attachments.content) as content_length +from attachments +join prompt_attachments + on attachments.id = prompt_attachments.attachment_id +where prompt_attachments.response_id in ({}) +order by prompt_attachments."order" +""" + @logs.command(name="list") @click.option( @@ -816,6 +831,14 @@ def logs_list( # ... except for searches where we don't do this if not query: rows.reverse() + + # Fetch any attachments + ids = [row["id"] for row in rows] + attachments = list(db.query(ATTACHMENTS_SQL.format(",".join("?" * len(ids))), ids)) + attachments_by_id = {} + for attachment in attachments: + attachments_by_id.setdefault(attachment["response_id"], []).append(attachment) + for row in rows: if truncate: row["prompt"] = _truncate_string(row["prompt"]) @@ -864,6 +887,35 @@ def logs_list( if row["system"] is not None: click.echo("\n## System:\n\n{}".format(row["system"])) current_system = row["system"] + attachments = attachments_by_id.get(row["id"]) + # ### Attachments + + # 1. **image/jpeg**: `/path/example.jpg` + # 2. **image/png**: `https://example.com/image.png` + # 3. **application/pdf**: `` + if attachments: + click.echo("\n### Attachments\n") + for i, attachment in enumerate(attachments, 1): + if attachment["path"]: + path = attachment["path"] + click.echo( + "{}. **{}**: `{}`".format(i, attachment["type"], path) + ) + elif attachment["url"]: + click.echo( + "{}. **{}**: {}".format( + i, attachment["type"], attachment["url"] + ) + ) + elif attachment["content_length"]: + click.echo( + "{}. 
**{}**: ``".format( + i, + attachment["type"], + f"{attachment['content_length']:,}", + ) + ) + click.echo("\n## Response:\n\n{}\n".format(row["response"])) From 2384fd52a052607ba28f759f21cf87d425d2906a Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 28 Oct 2024 14:07:01 -0700 Subject: [PATCH 050/149] Tighter log output for binary content --- llm/cli.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/llm/cli.py b/llm/cli.py index c15e3ac0..96b7d331 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -888,11 +888,6 @@ def logs_list( click.echo("\n## System:\n\n{}".format(row["system"])) current_system = row["system"] attachments = attachments_by_id.get(row["id"]) - # ### Attachments - - # 1. **image/jpeg**: `/path/example.jpg` - # 2. **image/png**: `https://example.com/image.png` - # 3. **application/pdf**: `` if attachments: click.echo("\n### Attachments\n") for i, attachment in enumerate(attachments, 1): @@ -909,7 +904,7 @@ def logs_list( ) elif attachment["content_length"]: click.echo( - "{}. **{}**: ``".format( + "{}. **{}**: `<{} bytes>`".format( i, attachment["type"], f"{attachment['content_length']:,}", From a9bc1c7329d7f9df138c2b66fa3661e5f37a5af2 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 28 Oct 2024 14:10:54 -0700 Subject: [PATCH 051/149] llm logs --json for attachments, refs #587 --- llm/cli.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llm/cli.py b/llm/cli.py index 96b7d331..b3c144bf 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -854,6 +854,11 @@ def logs_list( if json_output: # Output as JSON if requested + for row in rows: + row["attachments"] = [ + {k: v for k, v in attachment.items() if k != "response_id"} + for attachment in attachments_by_id.get(row["id"], []) + ] click.echo(json.dumps(list(rows), indent=2)) elif response: # Just output the last response From 286cf9fcd9005df07ad2318b4f6b9eba5cb608f1 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 28 Oct 2024 14:22:02 -0700 Subject: [PATCH 052/149] attachments= keyword argument, tests pass again - refs #587 --- llm/cli.py | 2 +- llm/models.py | 19 ++++++++++++++----- tests/test_chat.py | 5 ++++- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/llm/cli.py b/llm/cli.py index b3c144bf..1a7519cf 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -521,7 +521,7 @@ def chat( raise click.ClickException(str(ex)) if prompt.strip() in ("exit", "quit"): break - response = conversation.prompt(prompt, system, **validated_options) + response = conversation.prompt(prompt, system=system, **validated_options) # System prompt only sent for the first message: system = None for chunk in response: diff --git a/llm/models.py b/llm/models.py index 06440eed..6364c27e 100644 --- a/llm/models.py +++ b/llm/models.py @@ -85,11 +85,18 @@ class Prompt: options: "Options" = field(default_factory=dict) def __init__( - self, prompt, model, attachments, system=None, prompt_json=None, options=None + self, + prompt, + model, + *, + attachments=None, + system=None, + prompt_json=None, + options=None ): self.prompt = prompt self.model = model - self.attachments = list(attachments) + self.attachments = list(attachments or []) self.system = system self.prompt_json = prompt_json self.options = options or {} @@ -105,7 +112,8 @@ class Conversation: def prompt( self, prompt: Optional[str], - *attachments: Attachment, + *, + attachments: Attachment = None, system: Optional[str] = None, stream: bool = True, **options @@ -386,7 +394,8 @@ def execute( def prompt( self, prompt: str, - *attachments: 
Attachment, + *, + attachments: Attachment = None, system: Optional[str] = None, stream: bool = True, **options @@ -396,7 +405,7 @@ def prompt( raise ValueError( "This model does not support attachments, but some were provided" ) - for attachment in attachments: + for attachment in attachments or []: attachment_type = attachment.resolve_type() if attachment_type not in self.attachment_types: raise ValueError( diff --git a/tests/test_chat.py b/tests/test_chat.py index cf7ddeff..01b2a0c0 100644 --- a/tests/test_chat.py +++ b/tests/test_chat.py @@ -23,7 +23,10 @@ def test_chat_basic(mock_model, logs_db): mock_model.enqueue(["one world"]) mock_model.enqueue(["one again"]) result = runner.invoke( - llm.cli.cli, ["chat", "-m", "mock"], input="Hi\nHi two\nquit\n" + llm.cli.cli, + ["chat", "-m", "mock"], + input="Hi\nHi two\nquit\n", + catch_exceptions=False, ) assert result.exit_code == 0 assert result.output == ( From 570a3eccae3d51ef2da8f90cf9ea45255ef2b2e5 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 28 Oct 2024 14:37:11 -0700 Subject: [PATCH 053/149] Python attachment documentation, plus fixed a mimetype detection bug Refs #587 --- docs/python-api.md | 28 ++++++++++++++++++++++++++++ llm/models.py | 4 +++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/docs/python-api.md b/docs/python-api.md index dccf46bf..dd553101 100644 --- a/docs/python-api.md +++ b/docs/python-api.md @@ -49,6 +49,24 @@ response = model.prompt( system="Answer like GlaDOS" ) ``` +### Attachments + +Model that accept multi-modal input (images, audio, video etc) can be passed attachments using the `attachments=` keyword argument. This accepts a list of `llm.Attachment()` instances. + +This example shows two attachments - one from a file path and one from a URL: +```python +import llm + +model = llm.get_model("gpt-4o-mini") +response = model.prompt( + "Describe these images", + attachments=[ + llm.Attachment(path="pelican.jpg"), + llm.Attachment(url="https://static.simonwillison.net/static/2024/pelicans.jpg"), + ] +) +``` +Use `llm.Attachment(content=b"binary image content here")` to pass binary content directly. ### Model options @@ -114,6 +132,16 @@ print(response2.text()) ``` You will get back five fun facts about skunks. +The `conversation.prompt()` method supports attachments as well: +```python +response = conversation.prompt( + "Describe these birds", + attachments=[ + llm.Attachment(url="https://static.simonwillison.net/static/2024/pelicans.jpg") + ] +) +``` + Access `conversation.responses` for a list of all of the responses that have so far been returned during the conversation. 
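For illustration, here is a minimal sketch of a full conversation that mixes attachments and follow-up prompts; the model name and file path are assumptions, not taken from the documentation above:

```python
import llm

model = llm.get_model("gpt-4o-mini")  # assumed vision-capable model
conversation = model.conversation()

# First turn: send an image attachment alongside the prompt
first = conversation.prompt(
    "Describe this image",
    attachments=[llm.Attachment(path="pelican.jpg")],
)
print(first.text())

# Second turn: no need to re-attach the image, since previous prompts,
# responses and their attachments are replayed as part of the conversation
second = conversation.prompt("Now write a haiku about it")
print(second.text())
```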
## Other functions diff --git a/llm/models.py b/llm/models.py index 6364c27e..7fe54cc4 100644 --- a/llm/models.py +++ b/llm/models.py @@ -45,7 +45,9 @@ def resolve_type(self): if self.path: return puremagic.from_file(self.path, mime=True) if self.url: - return puremagic.from_url(self.url, mime=True) + response = httpx.head(self.url) + response.raise_for_status() + return response.headers.get("content-type") if self.content: return puremagic.from_string(self.content, mime=True) raise ValueError("Attachment has no type and no content to derive it from") From f0ed54abf13e0a2b33bec3ee8a063174eebd75f1 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 28 Oct 2024 15:03:30 -0700 Subject: [PATCH 054/149] Docs for CLI attachments, refs #587 --- docs/help.md | 45 +++++++++++++++++++--------- docs/index.md | 11 ++++--- docs/usage.md | 81 ++++++++++++++++++++++++++++++++------------------- llm/cli.py | 10 ++++++- 4 files changed, 98 insertions(+), 49 deletions(-) diff --git a/docs/help.md b/docs/help.md index 9cd2927c..b787676f 100644 --- a/docs/help.md +++ b/docs/help.md @@ -86,20 +86,37 @@ Usage: llm prompt [OPTIONS] [PROMPT] Documentation: https://llm.datasette.io/en/stable/usage.html -Options: - -s, --system TEXT System prompt to use - -m, --model TEXT Model to use - -o, --option ... key/value options for the model - -t, --template TEXT Template to use - -p, --param ... Parameters for template - --no-stream Do not stream output - -n, --no-log Don't log to database - --log Log prompt and response to the database - -c, --continue Continue the most recent conversation. - --cid, --conversation TEXT Continue the conversation with the given ID. - --key TEXT API key to use - --save TEXT Save prompt with this template name - --help Show this message and exit. + Examples: + + llm 'Capital of France?' + llm 'Capital of France?' -m gpt-4o + llm 'Capital of France?' -s 'answer in Spanish' + + Multi-modal models can be called with attachments like this: + + llm 'Extract text from this image' -a image.jpg + llm 'Describe' -a https://static.simonwillison.net/static/2024/pelicans.jpg + cat image | llm 'describe image' -a - + # With an explicit content type: + cat image | llm 'describe image' --at - image/jpeg + +Options: + -s, --system TEXT System prompt to use + -m, --model TEXT Model to use + -a, --attachment ATTACHMENT Attachment path or URL or - + --at, --attachment-type ... + Attachment with explicit mimetype + -o, --option ... key/value options for the model + -t, --template TEXT Template to use + -p, --param ... Parameters for template + --no-stream Do not stream output + -n, --no-log Don't log to database + --log Log prompt and response to the database + -c, --continue Continue the most recent conversation. + --cid, --conversation TEXT Continue the conversation with the given ID. + --key TEXT API key to use + --save TEXT Save prompt with this template name + --help Show this message and exit. 
``` (help-chat)= diff --git a/docs/index.md b/docs/index.md index 6b580015..9b0ad47e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -46,10 +46,13 @@ If you have an [OpenAI API key](https://platform.openai.com/api-keys) key you ca # Paste your OpenAI API key into this llm keys set openai -# Run a prompt +# Run a prompt (with the default gpt-4o-mini model) llm "Ten fun names for a pet pelican" -# Run a system prompt against a file +# Extract text from an image +llm "extract text" -a scanned-document.jpg + +# Use a system prompt against a file cat myfile.py | llm -s "Explain this code" ``` Or you can {ref}`install a plugin ` and use models that can run on your local device: @@ -62,10 +65,10 @@ llm -m orca-mini-3b-gguf2-q4_0 'What is the capital of France?' ``` To start {ref}`an interactive chat ` with a model, use `llm chat`: ```bash -llm chat -m gpt-4o-mini +llm chat -m gpt-4o ``` ``` -Chatting with gpt-4o-mini +Chatting with gpt-4o Type 'exit' or 'quit' to exit Type '!multi' to enter multiple lines, then '!end' to finish > Tell me a joke about a pelican diff --git a/docs/usage.md b/docs/usage.md index 005a1690..6b825245 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -45,49 +45,30 @@ Some models support options. You can pass these using `-o/--option name value` - ```bash llm 'Ten names for cheesecakes' -o temperature 1.5 ``` +### Attachments -(usage-completion-prompts)= -## Completion prompts - -Some models are completion models - rather than being tuned to respond to chat style prompts, they are designed to complete a sentence or paragraph. - -An example of this is the `gpt-3.5-turbo-instruct` OpenAI model. +Some models are multi-modal, which means they can accept input in more than just text. GPT-4o and GPT-4o mini can accept images, and models such as Google Gemini 1.5 can accept audio and video as well. -You can prompt that model the same way as the chat models, but be aware that the prompt format that works best is likely to differ. +LLM calls these **attachments**. You can pass attachments using the `-a` option like this: ```bash -llm -m gpt-3.5-turbo-instruct 'Reasons to tame a wild beaver:' +llm "describe this image" -a https://static.simonwillison.net/static/2024/pelicans.jpg ``` - -(conversation)= -## Continuing a conversation - -By default, the tool will start a new conversation each time you run it. - -You can opt to continue the previous conversation by passing the `-c/--continue` option: +Attachments can be passed using URLs or file paths, and you can attach more than one attachment to a single prompt: ```bash -llm 'More names' -c +llm "describe these images" -a image1.jpg -a image2.jpg ``` -This will re-send the prompts and responses for the previous conversation as part of the call to the language model. Note that this can add up quickly in terms of tokens, especially if you are using expensive models. - -`--continue` will automatically use the same model as the conversation that you are continuing, even if you omit the `-m/--model` option. - -To continue a conversation that is not the most recent one, use the `--cid/--conversation ` option: +You can also pipe an attachment to LLM by using `-` as the filename: ```bash -llm 'More names' --cid 01h53zma5txeby33t1kbe3xk8q +cat image.jpg | llm "describe this image" -a - ``` -You can find these conversation IDs using the `llm logs` command. 
- -## Using with a shell - -To learn more about your computer's operating system based on the output of `uname -a`, run this: +LLM will attempt to automatically detect the content type of the image. If this doesn't work you can instead use the `--attachment-type` option (`--at` for short) which takes the URL/path plus an explicit content type: ```bash -llm "Tell me about my operating system: $(uname -a)" +cat myfile | llm "describe this image" --at - image/jpeg ``` -This pattern of using `$(command)` inside a double quoted string is a useful way to quickly assemble prompts. (system-prompts)= -## System prompts +### System prompts You can use `-s/--system '...'` to set a system prompt. ```bash @@ -120,6 +101,46 @@ cat llm/utils.py | llm -t pytest ``` See {ref}`prompt templates ` for more. +(conversation)= +### Continuing a conversation + +By default, the tool will start a new conversation each time you run it. + +You can opt to continue the previous conversation by passing the `-c/--continue` option: +```bash +llm 'More names' -c +``` +This will re-send the prompts and responses for the previous conversation as part of the call to the language model. Note that this can add up quickly in terms of tokens, especially if you are using expensive models. + +`--continue` will automatically use the same model as the conversation that you are continuing, even if you omit the `-m/--model` option. + +To continue a conversation that is not the most recent one, use the `--cid/--conversation ` option: +```bash +llm 'More names' --cid 01h53zma5txeby33t1kbe3xk8q +``` +You can find these conversation IDs using the `llm logs` command. + +### Tips for using LLM with Bash or Zsh + +To learn more about your computer's operating system based on the output of `uname -a`, run this: +```bash +llm "Tell me about my operating system: $(uname -a)" +``` +This pattern of using `$(command)` inside a double quoted string is a useful way to quickly assemble prompts. + +(usage-completion-prompts)= +### Completion prompts + +Some models are completion models - rather than being tuned to respond to chat style prompts, they are designed to complete a sentence or paragraph. + +An example of this is the `gpt-3.5-turbo-instruct` OpenAI model. + +You can prompt that model the same way as the chat models, but be aware that the prompt format that works best is likely to differ. + +```bash +llm -m gpt-3.5-turbo-instruct 'Reasons to tame a wild beaver:' +``` + (usage-chat)= ## Starting an interactive chat diff --git a/llm/cli.py b/llm/cli.py index 1a7519cf..941831c5 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -221,7 +221,15 @@ def prompt( llm 'Capital of France?' llm 'Capital of France?' -m gpt-4o llm 'Capital of France?' 
-s 'answer in Spanish' + + Multi-modal models can be called with attachments like this: + + \b llm 'Extract text from this image' -a image.jpg + llm 'Describe' -a https://static.simonwillison.net/static/2024/pelicans.jpg + cat image | llm 'describe image' -a - + # With an explicit mimetype: + cat image | llm 'describe image' --at - image/jpeg """ if log and no_log: raise click.ClickException("--log and --no-log are mutually exclusive") @@ -356,7 +364,7 @@ def read_prompt(): try: response = prompt_method( - prompt, *resolved_attachments, system=system, **validated_options + prompt, attachments=resolved_attachments, system=system, **validated_options ) if should_stream: for chunk in response: From 1f822d820b5ea7a8cc182792fb9e1825e8dc5894 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 28 Oct 2024 15:06:17 -0700 Subject: [PATCH 055/149] Update docs with cog --- docs/help.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/help.md b/docs/help.md index b787676f..0e28494a 100644 --- a/docs/help.md +++ b/docs/help.md @@ -97,7 +97,7 @@ Usage: llm prompt [OPTIONS] [PROMPT] llm 'Extract text from this image' -a image.jpg llm 'Describe' -a https://static.simonwillison.net/static/2024/pelicans.jpg cat image | llm 'describe image' -a - - # With an explicit content type: + # With an explicit mimetype: cat image | llm 'describe image' --at - image/jpeg Options: From db1d77f486b011b5dadfbeb46007c4966c13d95e Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 28 Oct 2024 15:29:17 -0700 Subject: [PATCH 056/149] assert_all_responses_were_requested=False --- tests/test_templates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_templates.py b/tests/test_templates.py index 57c8f836..b33d44cb 100644 --- a/tests/test_templates.py +++ b/tests/test_templates.py @@ -185,4 +185,4 @@ def test_template_basic( else: assert result.exit_code == 1 assert result.output.strip() == expected_error - mocked_openai_chat.reset() + mocked_openai_chat.reset(assert_all_responses_were_requested=False) From be2953e6ab5b3677c8f2d36f26d53a5f5b6b48c7 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 28 Oct 2024 15:33:28 -0700 Subject: [PATCH 057/149] Ruff and mypy fixes --- llm/__init__.py | 1 + llm/models.py | 14 +++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/llm/__init__.py b/llm/__init__.py index f76e2728..0ea6c242 100644 --- a/llm/__init__.py +++ b/llm/__init__.py @@ -29,6 +29,7 @@ "get_model", "get_key", "user_dir", + "Attachment", "Collection", "Conversation", "Model", diff --git a/llm/models.py b/llm/models.py index 7fe54cc4..6a602f52 100644 --- a/llm/models.py +++ b/llm/models.py @@ -81,10 +81,10 @@ def from_row(cls, row): class Prompt: prompt: str model: "Model" - attachments: Optional[List[Attachment]] = field(default_factory=list) - system: Optional[str] = None - prompt_json: Optional[str] = None - options: "Options" = field(default_factory=dict) + attachments: Optional[List[Attachment]] + system: Optional[str] + prompt_json: Optional[str] + options: "Options" def __init__( self, @@ -115,7 +115,7 @@ def prompt( self, prompt: Optional[str], *, - attachments: Attachment = None, + attachments: Optional[List[Attachment]] = None, system: Optional[str] = None, stream: bool = True, **options @@ -371,7 +371,7 @@ class Model(ABC, _get_key_mixin): # Model characteristics can_stream: bool = False - attachment_types = set() + attachment_types: Set = set() class Options(_Options): pass @@ -397,7 +397,7 @@ def prompt( self, prompt: str, 
*, - attachments: Attachment = None, + attachments: Optional[List[Attachment]] = None, system: Optional[str] = None, stream: bool = True, **options From 758ff9ac173a17464af0ed14d250438389881eb2 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 28 Oct 2024 15:36:12 -0700 Subject: [PATCH 058/149] Upgrade to pytest-httpx>=0.33.0 --- setup.py | 2 +- tests/test_templates.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 188d5df4..4c77a9e3 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,7 @@ def get_long_description(): "test": [ "pytest", "numpy", - "pytest-httpx", + "pytest-httpx>=0.33.0", "cogapp", "mypy>=1.10.0", "black>=24.1.0", diff --git a/tests/test_templates.py b/tests/test_templates.py index b33d44cb..488a8f55 100644 --- a/tests/test_templates.py +++ b/tests/test_templates.py @@ -139,7 +139,7 @@ def test_templates_prompt_save(templates_path, args, expected_prompt, expected_e None, None, "Error: Cannot use -t/--template and --system together", - marks=pytest.mark.httpx_mock(assert_all_responses_were_requested=False), + marks=pytest.mark.httpx_mock(), ), pytest.param( "prompt: 'Say $hello'", @@ -147,7 +147,7 @@ def test_templates_prompt_save(templates_path, args, expected_prompt, expected_e None, None, "Error: Missing variables: hello", - marks=pytest.mark.httpx_mock(assert_all_responses_were_requested=False), + marks=pytest.mark.httpx_mock(), ), ( "prompt: 'Say $hello'", @@ -185,4 +185,4 @@ def test_template_basic( else: assert result.exit_code == 1 assert result.output.strip() == expected_error - mocked_openai_chat.reset(assert_all_responses_were_requested=False) + mocked_openai_chat.reset() From ba1ccb3a4a8dbef7fc17c8fdec6c9f78a4ab137d Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 28 Oct 2024 15:46:52 -0700 Subject: [PATCH 059/149] Release 0.17a0 Refs #587, #590 --- docs/changelog.md | 34 ++++++++++++++++++++++++++++++++++ docs/python-api.md | 3 +++ docs/usage.md | 3 ++- setup.py | 2 +- 4 files changed, 40 insertions(+), 2 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 161317cb..970b2e9b 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,39 @@ # Changelog +(v0_17a0)= +## 0.17a0 (2024-10-28) + +Alpha support for **attachments**, allowing multi-modal models to accept images, audio, video and other formats. [#578](https://github.com/simonw/llm/issues/578) + +Attachments {ref}`in the CLI ` can be URLs: + +```bash +llm "describe this image" \ + -a https://static.simonwillison.net/static/2024/pelicans.jpg +``` +Or file paths: +```bash +llm "extract text" -a image1.jpg -a image2.jpg +``` +Or binary data, which may need to use `--attachment-type` to specify the MIME type: +```bash +cat image | llm "extract text" --attachment-type - image/jpeg +``` + +Attachments are also available {ref}`in the Python API `: + +```python +model = llm.get_model("gpt-4o-mini") +response = model.prompt( + "Describe these images", + attachments=[ + llm.Attachment(path="pelican.jpg"), + llm.Attachment(url="https://static.simonwillison.net/static/2024/pelicans.jpg"), + ] +) +``` +Plugins that provide alternative models can support attachments, see {ref}`advanced-model-plugins-attachments` for details. 
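As a rough sketch of what that plugin support involves (the class and model names here are invented, not from the changelog): a plugin model declares the MIME types it accepts and then reads each attachment inside its `execute()` method.

```python
import llm


class FakeVisionModel(llm.Model):
    # Declaring attachment_types is what allows attachments of these
    # MIME types to be passed to this model
    model_id = "fake-vision"
    attachment_types = {"image/png", "image/jpeg"}

    def execute(self, prompt, stream, response, conversation=None):
        described = []
        for attachment in prompt.attachments:
            # resolve_type() and base64_content() are provided by llm.Attachment
            described.append(
                "{}: {} base64 characters".format(
                    attachment.resolve_type(), len(attachment.base64_content())
                )
            )
        yield "Received {} attachment(s): {}".format(
            len(described), ", ".join(described)
        )
```

A real plugin would also register the model through the `register_models()` plugin hook, as described in the plugin documentation.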
+ (v0_16)= ## 0.16 (2024-09-12) diff --git a/docs/python-api.md b/docs/python-api.md index dd553101..ae135a68 100644 --- a/docs/python-api.md +++ b/docs/python-api.md @@ -49,6 +49,9 @@ response = model.prompt( system="Answer like GlaDOS" ) ``` + +(python-api-attachments)= + ### Attachments Model that accept multi-modal input (images, audio, video etc) can be passed attachments using the `attachments=` keyword argument. This accepts a list of `llm.Attachment()` instances. diff --git a/docs/usage.md b/docs/usage.md index 6b825245..94cb5ca5 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -45,6 +45,7 @@ Some models support options. You can pass these using `-o/--option name value` - ```bash llm 'Ten names for cheesecakes' -o temperature 1.5 ``` +(usage-attachments)= ### Attachments Some models are multi-modal, which means they can accept input in more than just text. GPT-4o and GPT-4o mini can accept images, and models such as Google Gemini 1.5 can accept audio and video as well. @@ -56,7 +57,7 @@ llm "describe this image" -a https://static.simonwillison.net/static/2024/pelica ``` Attachments can be passed using URLs or file paths, and you can attach more than one attachment to a single prompt: ```bash -llm "describe these images" -a image1.jpg -a image2.jpg +llm "extract text" -a image1.jpg -a image2.jpg ``` You can also pipe an attachment to LLM by using `-` as the filename: ```bash diff --git a/setup.py b/setup.py index 4c77a9e3..44669127 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages import os -VERSION = "0.16" +VERSION = "0.17a0" def get_long_description(): From 389acdf52c3aaea9470b1307a017bdec21a524e2 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 28 Oct 2024 17:40:40 -0700 Subject: [PATCH 060/149] Track usage on OpenAI stream requests, closes #591 --- llm/default_plugins/openai_models.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index bce1fb04..5cbb02bb 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -346,7 +346,7 @@ def execute(self, prompt, stream, response, conversation=None): ) messages.append({"role": "user", "content": attachment_message}) - kwargs = self.build_kwargs(prompt) + kwargs = self.build_kwargs(prompt, stream) client = self.get_client() if stream: completion = client.chat.completions.create( @@ -358,7 +358,10 @@ def execute(self, prompt, stream, response, conversation=None): chunks = [] for chunk in completion: chunks.append(chunk) - content = chunk.choices[0].delta.content + try: + content = chunk.choices[0].delta.content + except IndexError: + content = None if content is not None: yield content response.response_json = remove_dict_none_values(combine_chunks(chunks)) @@ -395,13 +398,15 @@ def get_client(self): kwargs["http_client"] = logging_client() return openai.OpenAI(**kwargs) - def build_kwargs(self, prompt): + def build_kwargs(self, prompt, stream): kwargs = dict(not_nulls(prompt.options)) json_object = kwargs.pop("json_object", None) if "max_tokens" not in kwargs and self.default_max_tokens is not None: kwargs["max_tokens"] = self.default_max_tokens if json_object: kwargs["response_format"] = {"type": "json_object"} + if stream: + kwargs["stream_options"] = {"include_usage": True} return kwargs @@ -431,7 +436,7 @@ def execute(self, prompt, stream, response, conversation=None): messages.append(prev_response.prompt.prompt) 
messages.append(prev_response.text()) messages.append(prompt.prompt) - kwargs = self.build_kwargs(prompt) + kwargs = self.build_kwargs(prompt, stream) client = self.get_client() if stream: completion = client.completions.create( @@ -443,7 +448,10 @@ def execute(self, prompt, stream, response, conversation=None): chunks = [] for chunk in completion: chunks.append(chunk) - content = chunk.choices[0].text + try: + content = chunk.choices[0].text + except IndexError: + content = None if content is not None: yield content combined = combine_chunks(chunks) @@ -472,8 +480,11 @@ def combine_chunks(chunks: List) -> dict: # If any of them have log probability, we're going to persist # those later on logprobs = [] + usage = {} for item in chunks: + if item.usage: + usage = dict(item.usage) for choice in item.choices: if choice.logprobs and hasattr(choice.logprobs, "top_logprobs"): logprobs.append( @@ -497,6 +508,7 @@ def combine_chunks(chunks: List) -> dict: "content": content, "role": role, "finish_reason": finish_reason, + "usage": usage, } if logprobs: combined["logprobs"] = logprobs From 39d61d433ac1b7638a521d32a3f98f0a00ca5352 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 28 Oct 2024 19:21:11 -0700 Subject: [PATCH 061/149] Automated tests for attachments, refs #587 --- tests/conftest.py | 1 + tests/test_attachments.py | 48 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 tests/test_attachments.py diff --git a/tests/conftest.py b/tests/conftest.py index 120fe35c..bcdb8854 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -49,6 +49,7 @@ def env_setup(monkeypatch, user_path): class MockModel(llm.Model): model_id = "mock" + attachment_types = {"image/png"} class Options(llm.Options): max_tokens: Optional[int] = Field( diff --git a/tests/test_attachments.py b/tests/test_attachments.py new file mode 100644 index 00000000..89a5b81a --- /dev/null +++ b/tests/test_attachments.py @@ -0,0 +1,48 @@ +from click.testing import CliRunner +from unittest.mock import ANY +import llm + +TINY_PNG = ( + b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\xa6\x00\x00\x01\x1a" + b"\x02\x03\x00\x00\x00\xe6\x99\xc4^\x00\x00\x00\tPLTE\xff\xff\xff" + b"\x00\xff\x00\xfe\x01\x00\x12t\x01J\x00\x00\x00GIDATx\xda\xed\xd81\x11" + b"\x000\x08\xc0\xc0.]\xea\xaf&Q\x89\x04V\xe0>\xf3+\xc8\x91Z\xf4\xa2\x08EQ\x14E" + b"Q\x14EQ\x14EQ\xd4B\x91$I3\xbb\xbf\x08EQ\x14EQ\x14EQ\x14E\xd1\xa5" + b"\xd4\x17\x91\xc6\x95\x05\x15\x0f\x9f\xc5\t\x9f\xa4\x00\x00\x00\x00IEND\xaeB`" + b"\x82" +) + + +def test_prompt_image(mock_model, logs_db): + runner = CliRunner() + mock_model.enqueue(["two boxes"]) + result = runner.invoke( + llm.cli.cli, + ["prompt", "-m", "mock", "describe image", "-a", "-"], + input=TINY_PNG, + catch_exceptions=False, + ) + assert result.exit_code == 0 + assert result.output == "two boxes\n" + assert mock_model.history[0][0].attachments[0] == llm.Attachment( + type="image/png", path=None, url=None, content=TINY_PNG, _id=ANY + ) + + # Check it was logged correctly + conversations = list(logs_db["conversations"].rows) + assert len(conversations) == 1 + conversation = conversations[0] + assert conversation["model"] == "mock" + assert conversation["name"] == "describe image" + response = list(logs_db["responses"].rows)[0] + attachment = list(logs_db["attachments"].rows)[0] + assert attachment == { + "id": ANY, + "type": "image/png", + "path": None, + "url": None, + "content": TINY_PNG, + } + prompt_attachment = list(logs_db["prompt_attachments"].rows)[0] + assert 
prompt_attachment["attachment_id"] == attachment["id"] + assert prompt_attachment["response_id"] == response["id"] From a44ba49c21f8d4ac30c8e41bfa5599c258ce53cc Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 28 Oct 2024 19:34:25 -0700 Subject: [PATCH 062/149] Release 0.17 Refs #587, #590, #591 --- docs/changelog.md | 23 +++++++++++++++++------ setup.py | 2 +- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 970b2e9b..31507afb 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,23 +1,25 @@ # Changelog -(v0_17a0)= -## 0.17a0 (2024-10-28) +(v0_17)= +## 0.17 (2024-10-29) + +Support for **attachments**, allowing multi-modal models to accept images, audio, video and other formats. [#578](https://github.com/simonw/llm/issues/578) -Alpha support for **attachments**, allowing multi-modal models to accept images, audio, video and other formats. [#578](https://github.com/simonw/llm/issues/578) +The default OpenAI `gpt-4o` and `gpt-4o-mini` models can both now be prompted with JPEG, GIF, PNG and WEBP images. Attachments {ref}`in the CLI ` can be URLs: ```bash -llm "describe this image" \ +llm -m gpt-4o "describe this image" \ -a https://static.simonwillison.net/static/2024/pelicans.jpg ``` Or file paths: ```bash -llm "extract text" -a image1.jpg -a image2.jpg +llm -m gpt-4o-mini "extract text" -a image1.jpg -a image2.jpg ``` Or binary data, which may need to use `--attachment-type` to specify the MIME type: ```bash -cat image | llm "extract text" --attachment-type - image/jpeg +cat image | llm -m gpt-4o-mini "extract text" --attachment-type - image/jpeg ``` Attachments are also available {ref}`in the Python API `: @@ -34,6 +36,15 @@ response = model.prompt( ``` Plugins that provide alternative models can support attachments, see {ref}`advanced-model-plugins-attachments` for details. +The latest **[llm-claude-3](https://github.com/simonw/llm-claude-3)** plugin now supports attachments for Anthropic's Claude 3 and 3.5 models. The **[llm-gemini](https://github.com/simonw/llm-gemini)** plugin supports attachments for Google's Gemini 1.5 models. + +Also in this release: OpenAI models now record their `"usage"` data in the database even when the response was streamed. These records can be viewed using `llm logs --json`. [#591](https://github.com/simonw/llm/issues/591) + +(v0_17a0)= +## 0.17a0 (2024-10-28) + +Alpha support for **attachments**. 
[#578](https://github.com/simonw/llm/issues/578) + (v0_16)= ## 0.16 (2024-09-12) diff --git a/setup.py b/setup.py index 44669127..b906cdf2 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages import os -VERSION = "0.17a0" +VERSION = "0.17" def get_long_description(): From 122265a3d2131a7140911c2a3d99ad045d61c847 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Fri, 1 Nov 2024 14:17:25 -0700 Subject: [PATCH 063/149] Fix for chat continuation in llm chat, refs #601 --- llm/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llm/models.py b/llm/models.py index 6a602f52..f0f79eb6 100644 --- a/llm/models.py +++ b/llm/models.py @@ -160,6 +160,7 @@ def __init__( self._done = False self.response_json = None self.conversation = conversation + self.attachments = [] def __iter__(self) -> Iterator[str]: self._start = time.monotonic() From d7b395bb5b05d1ddf78a750f5515f5049d98adf6 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Fri, 1 Nov 2024 14:19:39 -0700 Subject: [PATCH 064/149] Release 0.17.1 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b906cdf2..6f500815 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages import os -VERSION = "0.17" +VERSION = "0.17.1" def get_long_description(): From a7eedd1d2ea50b3d2df95c85e489579d124d9548 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Fri, 1 Nov 2024 14:21:01 -0700 Subject: [PATCH 065/149] mypy fix, refs #601 --- llm/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/models.py b/llm/models.py index f0f79eb6..838e25b1 100644 --- a/llm/models.py +++ b/llm/models.py @@ -160,7 +160,7 @@ def __init__( self._done = False self.response_json = None self.conversation = conversation - self.attachments = [] + self.attachments: List[Attachment] = [] def __iter__(self) -> Iterator[str]: self._start = time.monotonic() From fe1e09706f69a4934430d2f2dadc354ffc4aa4dc Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 4 Nov 2024 10:26:02 -0800 Subject: [PATCH 066/149] llm-lambda-labs !stable-docs --- docs/plugins/directory.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/plugins/directory.md b/docs/plugins/directory.md index 22438d6a..c94795e4 100644 --- a/docs/plugins/directory.md +++ b/docs/plugins/directory.md @@ -36,6 +36,7 @@ These plugins can be used to interact with remotely hosted models via their API: - **[llm-bedrock-anthropic](https://github.com/sblakey/llm-bedrock-anthropic)** by Sean Blakey adds support for Claude and Claude Instant by Anthropic via Amazon Bedrock. - **[llm-bedrock-meta](https://github.com/flabat/llm-bedrock-meta)** by Fabian Labat adds support for Llama 2 and Llama 3 by Meta via Amazon Bedrock. - **[llm-together](https://github.com/wearedevx/llm-together)** adds support for the [Together AI](https://www.together.ai/) extensive family of hosted openly licensed models. +- **[llm-lambda-labs](https://github.com/simonw/llm-lambda-labs)** provides access to models hosted by [Lambda Labs](https://docs.lambdalabs.com/public-cloud/lambda-chat-api/), including the Nous Hermes 3 series. If an API model host provides an OpenAI-compatible API you can also [configure LLM to talk to it](https://llm.datasette.io/en/stable/other-models.html#openai-compatible-models) without needing an extra plugin. 
From 336ab1013d76638a5516e0f83e73a4546f27a98c Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Tue, 5 Nov 2024 21:08:52 -0800 Subject: [PATCH 067/149] gpt-4o-audio-preview audio input, refs #608 --- llm/default_plugins/openai_models.py | 75 ++++++++++++++++++---------- 1 file changed, 50 insertions(+), 25 deletions(-) diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index 5cbb02bb..81d0cc07 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -35,6 +35,7 @@ def register_models(register): # GPT-4o register(Chat("gpt-4o", vision=True), aliases=("4o",)) register(Chat("gpt-4o-mini", vision=True), aliases=("4o-mini",)) + register(Chat("gpt-4o-audio-preview", audio=True)) # o1 register(Chat("o1-preview", can_stream=False, allows_system_prompt=False)) register(Chat("o1-mini", can_stream=False, allows_system_prompt=False)) @@ -248,6 +249,25 @@ def validate_logit_bias(cls, logit_bias): return validated_logit_bias +def _attachment(attachment): + url = attachment.url + base64_content = "" + if not url or attachment.resolve_type().startswith("audio/"): + base64_content = attachment.base64_content() + url = f"data:{attachment.resolve_type()};base64,{base64_content}" + if attachment.resolve_type().startswith("image/"): + return {"type": "image_url", "image_url": {"url": url}} + else: + format_ = "wav" if attachment.resolve_type() == "audio/wave" else "mp3" + return { + "type": "input_audio", + "input_audio": { + "data": base64_content, + "format": format_, + }, + } + + class Chat(Model): needs_key = "openai" key_env_var = "OPENAI_API_KEY" @@ -272,6 +292,7 @@ def __init__( headers=None, can_stream=True, vision=False, + audio=False, allows_system_prompt=True, ): self.model_id = model_id @@ -286,13 +307,25 @@ def __init__( self.vision = vision self.allows_system_prompt = allows_system_prompt + self.attachment_types = set() + if vision: - self.attachment_types = { - "image/png", - "image/jpeg", - "image/webp", - "image/gif", - } + self.attachment_types.update( + { + "image/png", + "image/jpeg", + "image/webp", + "image/gif", + } + ) + + if audio: + self.attachment_types.update( + { + "audio/wave", + "audio/mpeg", + } + ) def __str__(self): return "OpenAI Chat: {}".format(self.model_id) @@ -317,13 +350,7 @@ def execute(self, prompt, stream, response, conversation=None): {"type": "text", "text": prev_response.prompt.prompt} ] for attachment in prev_response.attachments: - url = attachment.url - if not url: - base64_image = attachment.base64_content() - url = f"data:{attachment.resolve_type()};base64,{base64_image}" - attachment_message.append( - {"type": "image_url", "image_url": {"url": url}} - ) + attachment_message.append(_attachment(attachment)) messages.append({"role": "user", "content": attachment_message}) else: messages.append( @@ -337,13 +364,7 @@ def execute(self, prompt, stream, response, conversation=None): else: attachment_message = [{"type": "text", "text": prompt.prompt}] for attachment in prompt.attachments: - url = attachment.url - if not url: - base64_image = attachment.base64_content() - url = f"data:{attachment.resolve_type()};base64,{base64_image}" - attachment_message.append( - {"type": "image_url", "image_url": {"url": url}} - ) + attachment_message.append(_attachment(attachment)) messages.append({"role": "user", "content": attachment_message}) kwargs = self.build_kwargs(prompt, stream) @@ -374,7 +395,7 @@ def execute(self, prompt, stream, response, conversation=None): ) response.response_json = 
remove_dict_none_values(completion.model_dump()) yield completion.choices[0].message.content - response._prompt_json = redact_data_urls({"messages": messages}) + response._prompt_json = redact_data({"messages": messages}) def get_client(self): kwargs = {} @@ -466,7 +487,7 @@ def execute(self, prompt, stream, response, conversation=None): ) response.response_json = remove_dict_none_values(completion.model_dump()) yield completion.choices[0].text - response._prompt_json = redact_data_urls({"messages": messages}) + response._prompt_json = redact_data({"messages": messages}) def not_nulls(data) -> dict: @@ -520,10 +541,12 @@ def combine_chunks(chunks: List) -> dict: return combined -def redact_data_urls(input_dict): +def redact_data(input_dict): """ Recursively search through the input dictionary for any 'image_url' keys and modify the 'url' value to be just 'data:...'. + + Also redact input_audio.data keys """ if isinstance(input_dict, dict): for key, value in input_dict.items(): @@ -534,9 +557,11 @@ def redact_data_urls(input_dict): and value["url"].startswith("data:") ): value["url"] = "data:..." + elif key == "input_audio" and isinstance(value, dict) and "data" in value: + value["data"] = "..." else: - redact_data_urls(value) + redact_data(value) elif isinstance(input_dict, list): for item in input_dict: - redact_data_urls(item) + redact_data(item) return input_dict From 41cb5c3387d62f3318fc25f2f47f5ea942d21569 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Tue, 5 Nov 2024 21:13:36 -0800 Subject: [PATCH 068/149] Ran cog, refs #608 --- docs/openai-models.md | 1 + docs/usage.md | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/docs/openai-models.md b/docs/openai-models.md index d0fc6fcd..e1d90083 100644 --- a/docs/openai-models.md +++ b/docs/openai-models.md @@ -41,6 +41,7 @@ OpenAI Chat: gpt-4-turbo-2024-04-09 OpenAI Chat: gpt-4-turbo (aliases: gpt-4-turbo-preview, 4-turbo, 4t) OpenAI Chat: gpt-4o (aliases: 4o) OpenAI Chat: gpt-4o-mini (aliases: 4o-mini) +OpenAI Chat: gpt-4o-audio-preview OpenAI Chat: o1-preview OpenAI Chat: o1-mini OpenAI Completion: gpt-3.5-turbo-instruct (aliases: 3.5-instruct, chatgpt-instruct) diff --git a/docs/usage.md b/docs/usage.md index 94cb5ca5..ea15e453 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -367,6 +367,16 @@ OpenAI Chat: gpt-4o-mini (aliases: 4o-mini) logit_bias: dict, str seed: int json_object: boolean +OpenAI Chat: gpt-4o-audio-preview + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean OpenAI Chat: o1-preview temperature: float max_tokens: int From 0cc4072bcd9af4e4c9f030955179e7614dcd9d00 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Tue, 5 Nov 2024 21:27:18 -0800 Subject: [PATCH 069/149] Support attachments without prompts, closes #611 --- docs/plugins/advanced-model-plugins.md | 53 ++++++++++++++++---------- llm/cli.py | 8 +++- llm/default_plugins/openai_models.py | 12 ++++-- 3 files changed, 48 insertions(+), 25 deletions(-) diff --git a/docs/plugins/advanced-model-plugins.md b/docs/plugins/advanced-model-plugins.md index fdbdc232..a201237c 100644 --- a/docs/plugins/advanced-model-plugins.md +++ b/docs/plugins/advanced-model-plugins.md @@ -53,25 +53,42 @@ You should usually access the type and the content through one of these methods: A `id()` method returns a database ID for this content, which is either a SHA256 hash of the binary content or, in the case of attachments hosted at an external URL, 
a hash of `{"url": url}` instead. This is an implementation detail which you should not need to access directly. -Here's how the OpenAI plugin handles attachments: +Note that it's possible for a prompt with an attachments to not include a text prompt at all, in which case `prompt.prompt` will be `None`. + +Here's how the OpenAI plugin handles attachments, including the case where no `prompt.prompt` was provided: ```python -messages = [] if not prompt.attachments: messages.append({"role": "user", "content": prompt.prompt}) else: - attachment_message = [{"type": "text", "text": prompt.prompt}] + attachment_message = [] + if prompt.prompt: + attachment_message.append({"type": "text", "text": prompt.prompt}) for attachment in prompt.attachments: - url = attachment.url - if not url: - base64_image = attachment.base64_content() - url = f"data:{attachment.resolve_type()};base64,{base64_image}" - attachment_message.append( - {"type": "image_url", "image_url": {"url": url}} - ) + attachment_message.append(_attachment(attachment)) messages.append({"role": "user", "content": attachment_message}) + + +# And the code for creating the attachment message +def _attachment(attachment): + url = attachment.url + base64_content = "" + if not url or attachment.resolve_type().startswith("audio/"): + base64_content = attachment.base64_content() + url = f"data:{attachment.resolve_type()};base64,{base64_content}" + if attachment.resolve_type().startswith("image/"): + return {"type": "image_url", "image_url": {"url": url}} + else: + format_ = "wav" if attachment.resolve_type() == "audio/wave" else "mp3" + return { + "type": "input_audio", + "input_audio": { + "data": base64_content, + "format": format_, + }, + } ``` -As you can see, it uses `attachment.url` if that is available and otherwise falls back to using the `base64_content()` method to embed the image directly in the JSON sent to the API. +As you can see, it uses `attachment.url` if that is available and otherwise falls back to using the `base64_content()` method to embed the image directly in the JSON sent to the API. For the OpenAI API audio attachments are always included as base64-encoded strings. 
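The same pattern carries over to plugins for APIs that only accept inline data. A minimal sketch (the payload keys are invented for illustration) might collapse everything down to base64 regardless of whether a URL is available:

```python
def build_attachment_part(attachment):
    # base64_content() will fetch the bytes from a path or URL
    # if the content is not already loaded
    return {
        "mime_type": attachment.resolve_type(),
        "data": attachment.base64_content(),
    }
```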
### Attachments from previous conversations @@ -82,17 +99,13 @@ Here's how the OpenAI plugin does that: ```python for prev_response in conversation.responses: if prev_response.attachments: - attachment_message = [ - {"type": "text", "text": prev_response.prompt.prompt} - ] - for attachment in prev_response.attachments: - url = attachment.url - if not url: - base64_image = attachment.base64_content() - url = f"data:{attachment.resolve_type()};base64,{base64_image}" + attachment_message = [] + if prev_response.prompt.prompt: attachment_message.append( - {"type": "image_url", "image_url": {"url": url}} + {"type": "text", "text": prev_response.prompt.prompt} ) + for attachment in prev_response.attachments: + attachment_message.append(_attachment(attachment)) messages.append({"role": "user", "content": attachment_message}) else: messages.append( diff --git a/llm/cli.py b/llm/cli.py index 941831c5..d4547199 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -250,7 +250,13 @@ def read_prompt(): bits.append(prompt) prompt = " ".join(bits) - if prompt is None and not save and sys.stdin.isatty(): + if ( + prompt is None + and not save + and sys.stdin.isatty() + and not attachments + and not attachment_types + ): # Hang waiting for input to stdin (unless --save) prompt = sys.stdin.read() return prompt diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index 81d0cc07..6944df6c 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -346,9 +346,11 @@ def execute(self, prompt, stream, response, conversation=None): ) current_system = prev_response.prompt.system if prev_response.attachments: - attachment_message = [ - {"type": "text", "text": prev_response.prompt.prompt} - ] + attachment_message = [] + if prev_response.prompt.prompt: + attachment_message.append( + {"type": "text", "text": prev_response.prompt.prompt} + ) for attachment in prev_response.attachments: attachment_message.append(_attachment(attachment)) messages.append({"role": "user", "content": attachment_message}) @@ -362,7 +364,9 @@ def execute(self, prompt, stream, response, conversation=None): if not prompt.attachments: messages.append({"role": "user", "content": prompt.prompt}) else: - attachment_message = [{"type": "text", "text": prompt.prompt}] + attachment_message = [] + if prompt.prompt: + attachment_message.append({"type": "text", "text": prompt.prompt}) for attachment in prompt.attachments: attachment_message.append(_attachment(attachment)) messages.append({"role": "user", "content": attachment_message}) From 1a60fa16674ee5d70a359018dbfd545f6689cbfe Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Tue, 5 Nov 2024 21:50:00 -0800 Subject: [PATCH 070/149] Test to exercise gpt-4o-audio-preview, closes #608 --- tests/test_cli_openai_models.py | 88 +++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/tests/test_cli_openai_models.py b/tests/test_cli_openai_models.py index 56acd113..f341e385 100644 --- a/tests/test_cli_openai_models.py +++ b/tests/test_cli_openai_models.py @@ -56,3 +56,91 @@ def test_openai_options_min_max(): result2 = runner.invoke(cli, ["-m", "chatgpt", "-o", option, "10"]) assert result2.exit_code == 1 assert f"less than or equal to {max_val}" in result2.output + + +@pytest.mark.parametrize("model", ("gpt-4o-mini", "gpt-4o-audio-preview")) +@pytest.mark.parametrize("filetype", ("mp3", "wav")) +def test_only_gpt4_audio_preview_allows_mp3_or_wav(httpx_mock, model, filetype): + httpx_mock.add_response( + method="HEAD", + 
url=f"https://www.example.com/example.{filetype}", + content=b"binary-data", + headers={"Content-Type": "audio/mpeg" if filetype == "mp3" else "audio/wave"}, + ) + # Another mock for the correct model + if model == "gpt-4o-audio-preview": + httpx_mock.add_response( + method="POST", + # chat completion request + url="https://api.openai.com/v1/chat/completions", + json={ + "id": "chatcmpl-AQT9a30kxEaM1bqxRPepQsPlCyGJh", + "object": "chat.completion", + "created": 1730871958, + "model": "gpt-4o-audio-preview-2024-10-01", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "Why did the pelican get kicked out of the restaurant?\n\nBecause he had a big bill and no way to pay it!", + "refusal": None, + }, + "finish_reason": "stop", + } + ], + "usage": { + "prompt_tokens": 55, + "completion_tokens": 25, + "total_tokens": 80, + "prompt_tokens_details": { + "cached_tokens": 0, + "audio_tokens": 44, + "text_tokens": 11, + "image_tokens": 0, + }, + "completion_tokens_details": { + "reasoning_tokens": 0, + "audio_tokens": 0, + "text_tokens": 25, + "accepted_prediction_tokens": 0, + "rejected_prediction_tokens": 0, + }, + }, + "system_fingerprint": "fp_49254d0e9b", + }, + headers={"Content-Type": "application/json"}, + ) + httpx_mock.add_response( + method="GET", + url=f"https://www.example.com/example.{filetype}", + content=b"binary-data", + headers={ + "Content-Type": "audio/mpeg" if filetype == "mp3" else "audio/wave" + }, + ) + runner = CliRunner() + result = runner.invoke( + cli, + [ + "-m", + model, + "-a", + f"https://www.example.com/example.{filetype}", + "--no-stream", + "--key", + "x", + ], + ) + if model == "gpt-4o-audio-preview": + assert result.exit_code == 0 + assert result.output == ( + "Why did the pelican get kicked out of the restaurant?\n\n" + "Because he had a big bill and no way to pay it!\n" + ) + else: + assert result.exit_code == 1 + long = "audio/mpeg" if filetype == "mp3" else "audio/wave" + assert ( + f"This model does not support attachments of type '{long}'" in result.output + ) From 12df1a3b2a5df23adacfd9a64ca6df6f7b105d3c Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Tue, 5 Nov 2024 22:49:26 -0800 Subject: [PATCH 071/149] Show attachment types in llm models --options, closes #612 --- docs/aliases.md | 4 +- docs/openai-models.md | 6 +- docs/usage.md | 358 ++++++++++++++------------- llm/cli.py | 29 ++- llm/default_plugins/openai_models.py | 9 +- tests/test_llm.py | 59 +++-- 6 files changed, 250 insertions(+), 215 deletions(-) diff --git a/docs/aliases.md b/docs/aliases.md index fe86288e..787a3034 100644 --- a/docs/aliases.md +++ b/docs/aliases.md @@ -19,6 +19,8 @@ result = CliRunner().invoke(cli, ["aliases", "list"]) cog.out("```\n{}```".format(result.output)) ]]] --> ``` +4o : gpt-4o +4o-mini : gpt-4o-mini 3.5 : gpt-3.5-turbo chatgpt : gpt-3.5-turbo chatgpt-16k : gpt-3.5-turbo-16k @@ -29,8 +31,6 @@ gpt4 : gpt-4 gpt-4-turbo-preview : gpt-4-turbo 4-turbo : gpt-4-turbo 4t : gpt-4-turbo -4o : gpt-4o -4o-mini : gpt-4o-mini 3.5-instruct : gpt-3.5-turbo-instruct chatgpt-instruct : gpt-3.5-turbo-instruct ada : ada-002 (embedding) diff --git a/docs/openai-models.md b/docs/openai-models.md index e1d90083..eda613ad 100644 --- a/docs/openai-models.md +++ b/docs/openai-models.md @@ -31,6 +31,9 @@ models = [line for line in result.output.split("\n") if line.startswith("OpenAI cog.out("```\n{}\n```".format("\n".join(models))) ]]] --> ``` +OpenAI Chat: gpt-4o (aliases: 4o) +OpenAI Chat: gpt-4o-mini (aliases: 4o-mini) +OpenAI Chat: gpt-4o-audio-preview 
OpenAI Chat: gpt-3.5-turbo (aliases: 3.5, chatgpt) OpenAI Chat: gpt-3.5-turbo-16k (aliases: chatgpt-16k, 3.5-16k) OpenAI Chat: gpt-4 (aliases: 4, gpt4) @@ -39,9 +42,6 @@ OpenAI Chat: gpt-4-1106-preview OpenAI Chat: gpt-4-0125-preview OpenAI Chat: gpt-4-turbo-2024-04-09 OpenAI Chat: gpt-4-turbo (aliases: gpt-4-turbo-preview, 4-turbo, 4t) -OpenAI Chat: gpt-4o (aliases: 4o) -OpenAI Chat: gpt-4o-mini (aliases: 4o-mini) -OpenAI Chat: gpt-4o-audio-preview OpenAI Chat: o1-preview OpenAI Chat: o1-mini OpenAI Completion: gpt-3.5-turbo-instruct (aliases: 3.5-instruct, chatgpt-instruct) diff --git a/docs/usage.md b/docs/usage.md index ea15e453..f5d80d11 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -247,186 +247,206 @@ result = CliRunner().invoke(cli, ["models", "list", "--options"]) cog.out("```\n{}\n```".format(result.output)) ]]] --> ``` +OpenAI Chat: gpt-4o (aliases: 4o) + Options: + temperature: float + What sampling temperature to use, between 0 and 2. Higher values like + 0.8 will make the output more random, while lower values like 0.2 will + make it more focused and deterministic. + max_tokens: int + Maximum number of tokens to generate. + top_p: float + An alternative to sampling with temperature, called nucleus sampling, + where the model considers the results of the tokens with top_p + probability mass. So 0.1 means only the tokens comprising the top 10% + probability mass are considered. Recommended to use top_p or + temperature but not both. + frequency_penalty: float + Number between -2.0 and 2.0. Positive values penalize new tokens based + on their existing frequency in the text so far, decreasing the model's + likelihood to repeat the same line verbatim. + presence_penalty: float + Number between -2.0 and 2.0. Positive values penalize new tokens based + on whether they appear in the text so far, increasing the model's + likelihood to talk about new topics. + stop: str + A string where the API will stop generating further tokens. + logit_bias: dict, str + Modify the likelihood of specified tokens appearing in the completion. + Pass a JSON string like '{"1712":-100, "892":-100, "1489":-100}' + seed: int + Integer seed to attempt to sample deterministically + json_object: boolean + Output a valid JSON object {...}. Prompt must mention JSON. + Attachment types: + image/png, image/gif, image/webp, image/jpeg +OpenAI Chat: gpt-4o-mini (aliases: 4o-mini) + Options: + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean + Attachment types: + image/png, image/gif, image/webp, image/jpeg +OpenAI Chat: gpt-4o-audio-preview + Options: + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean + Attachment types: + audio/mpeg, audio/wave OpenAI Chat: gpt-3.5-turbo (aliases: 3.5, chatgpt) - temperature: float - What sampling temperature to use, between 0 and 2. Higher values like - 0.8 will make the output more random, while lower values like 0.2 will - make it more focused and deterministic. - max_tokens: int - Maximum number of tokens to generate. - top_p: float - An alternative to sampling with temperature, called nucleus sampling, - where the model considers the results of the tokens with top_p - probability mass. So 0.1 means only the tokens comprising the top 10% - probability mass are considered. 
Recommended to use top_p or - temperature but not both. - frequency_penalty: float - Number between -2.0 and 2.0. Positive values penalize new tokens based - on their existing frequency in the text so far, decreasing the model's - likelihood to repeat the same line verbatim. - presence_penalty: float - Number between -2.0 and 2.0. Positive values penalize new tokens based - on whether they appear in the text so far, increasing the model's - likelihood to talk about new topics. - stop: str - A string where the API will stop generating further tokens. - logit_bias: dict, str - Modify the likelihood of specified tokens appearing in the completion. - Pass a JSON string like '{"1712":-100, "892":-100, "1489":-100}' - seed: int - Integer seed to attempt to sample deterministically - json_object: boolean - Output a valid JSON object {...}. Prompt must mention JSON. + Options: + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean OpenAI Chat: gpt-3.5-turbo-16k (aliases: chatgpt-16k, 3.5-16k) - temperature: float - max_tokens: int - top_p: float - frequency_penalty: float - presence_penalty: float - stop: str - logit_bias: dict, str - seed: int - json_object: boolean + Options: + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean OpenAI Chat: gpt-4 (aliases: 4, gpt4) - temperature: float - max_tokens: int - top_p: float - frequency_penalty: float - presence_penalty: float - stop: str - logit_bias: dict, str - seed: int - json_object: boolean + Options: + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean OpenAI Chat: gpt-4-32k (aliases: 4-32k) - temperature: float - max_tokens: int - top_p: float - frequency_penalty: float - presence_penalty: float - stop: str - logit_bias: dict, str - seed: int - json_object: boolean + Options: + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean OpenAI Chat: gpt-4-1106-preview - temperature: float - max_tokens: int - top_p: float - frequency_penalty: float - presence_penalty: float - stop: str - logit_bias: dict, str - seed: int - json_object: boolean + Options: + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean OpenAI Chat: gpt-4-0125-preview - temperature: float - max_tokens: int - top_p: float - frequency_penalty: float - presence_penalty: float - stop: str - logit_bias: dict, str - seed: int - json_object: boolean + Options: + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean OpenAI Chat: gpt-4-turbo-2024-04-09 - temperature: float - max_tokens: int - top_p: float - frequency_penalty: float - presence_penalty: float - stop: str - logit_bias: dict, str - seed: int - json_object: boolean + Options: + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean OpenAI Chat: gpt-4-turbo (aliases: 
gpt-4-turbo-preview, 4-turbo, 4t) - temperature: float - max_tokens: int - top_p: float - frequency_penalty: float - presence_penalty: float - stop: str - logit_bias: dict, str - seed: int - json_object: boolean -OpenAI Chat: gpt-4o (aliases: 4o) - temperature: float - max_tokens: int - top_p: float - frequency_penalty: float - presence_penalty: float - stop: str - logit_bias: dict, str - seed: int - json_object: boolean -OpenAI Chat: gpt-4o-mini (aliases: 4o-mini) - temperature: float - max_tokens: int - top_p: float - frequency_penalty: float - presence_penalty: float - stop: str - logit_bias: dict, str - seed: int - json_object: boolean -OpenAI Chat: gpt-4o-audio-preview - temperature: float - max_tokens: int - top_p: float - frequency_penalty: float - presence_penalty: float - stop: str - logit_bias: dict, str - seed: int - json_object: boolean + Options: + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean OpenAI Chat: o1-preview - temperature: float - max_tokens: int - top_p: float - frequency_penalty: float - presence_penalty: float - stop: str - logit_bias: dict, str - seed: int - json_object: boolean + Options: + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean OpenAI Chat: o1-mini - temperature: float - max_tokens: int - top_p: float - frequency_penalty: float - presence_penalty: float - stop: str - logit_bias: dict, str - seed: int - json_object: boolean + Options: + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean OpenAI Completion: gpt-3.5-turbo-instruct (aliases: 3.5-instruct, chatgpt-instruct) - temperature: float - What sampling temperature to use, between 0 and 2. Higher values like - 0.8 will make the output more random, while lower values like 0.2 will - make it more focused and deterministic. - max_tokens: int - Maximum number of tokens to generate. - top_p: float - An alternative to sampling with temperature, called nucleus sampling, - where the model considers the results of the tokens with top_p - probability mass. So 0.1 means only the tokens comprising the top 10% - probability mass are considered. Recommended to use top_p or - temperature but not both. - frequency_penalty: float - Number between -2.0 and 2.0. Positive values penalize new tokens based - on their existing frequency in the text so far, decreasing the model's - likelihood to repeat the same line verbatim. - presence_penalty: float - Number between -2.0 and 2.0. Positive values penalize new tokens based - on whether they appear in the text so far, increasing the model's - likelihood to talk about new topics. - stop: str - A string where the API will stop generating further tokens. - logit_bias: dict, str - Modify the likelihood of specified tokens appearing in the completion. - Pass a JSON string like '{"1712":-100, "892":-100, "1489":-100}' - seed: int - Integer seed to attempt to sample deterministically - logprobs: int - Include the log probabilities of most likely N per token + Options: + temperature: float + What sampling temperature to use, between 0 and 2. Higher values like + 0.8 will make the output more random, while lower values like 0.2 will + make it more focused and deterministic. 
+ max_tokens: int + Maximum number of tokens to generate. + top_p: float + An alternative to sampling with temperature, called nucleus sampling, + where the model considers the results of the tokens with top_p + probability mass. So 0.1 means only the tokens comprising the top 10% + probability mass are considered. Recommended to use top_p or + temperature but not both. + frequency_penalty: float + Number between -2.0 and 2.0. Positive values penalize new tokens based + on their existing frequency in the text so far, decreasing the model's + likelihood to repeat the same line verbatim. + presence_penalty: float + Number between -2.0 and 2.0. Positive values penalize new tokens based + on whether they appear in the text so far, increasing the model's + likelihood to talk about new topics. + stop: str + A string where the API will stop generating further tokens. + logit_bias: dict, str + Modify the likelihood of specified tokens appearing in the completion. + Pass a JSON string like '{"1712":-100, "892":-100, "1489":-100}' + seed: int + Integer seed to attempt to sample deterministically + logprobs: int + Include the log probabilities of most likely N per token ``` diff --git a/llm/cli.py b/llm/cli.py index d4547199..3506b877 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -961,11 +961,11 @@ def models_list(options): extra = "" if model_with_aliases.aliases: extra = " (aliases: {})".format(", ".join(model_with_aliases.aliases)) - output = str(model_with_aliases.model) + extra - if options and model_with_aliases.model.Options.schema()["properties"]: - for name, field in model_with_aliases.model.Options.schema()[ - "properties" - ].items(): + model = model_with_aliases.model + output = str(model) + extra + if options and model.Options.model_json_schema()["properties"]: + output += "\n Options:" + for name, field in model.Options.model_json_schema()["properties"].items(): any_of = field.get("anyOf") if any_of is None: any_of = [{"type": field["type"]}] @@ -976,17 +976,24 @@ def models_list(options): if item["type"] != "null" ] ) - bits = ["\n ", name, ": ", types] + bits = ["\n ", name, ": ", types] description = field.get("description", "") if description and ( - model_with_aliases.model.__class__ - not in models_that_have_shown_options + model.__class__ not in models_that_have_shown_options ): wrapped = textwrap.wrap(description, 70) - bits.append("\n ") - bits.extend("\n ".join(wrapped)) + bits.append("\n ") + bits.extend("\n ".join(wrapped)) output += "".join(bits) - models_that_have_shown_options.add(model_with_aliases.model.__class__) + models_that_have_shown_options.add(model.__class__) + if options and model.attachment_types: + attachment_types = ", ".join(sorted(model.attachment_types)) + wrapper = textwrap.TextWrapper( + width=min(max(shutil.get_terminal_size().columns, 30), 70), + initial_indent=" ", + subsequent_indent=" ", + ) + output += "\n Attachment types:\n{}".format(wrapper.fill(attachment_types)) click.echo(output) diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index 6944df6c..4229553c 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -23,6 +23,11 @@ @hookimpl def register_models(register): + # GPT-4o + register(Chat("gpt-4o", vision=True), aliases=("4o",)) + register(Chat("gpt-4o-mini", vision=True), aliases=("4o-mini",)) + register(Chat("gpt-4o-audio-preview", audio=True)) + # 3.5 and 4 register(Chat("gpt-3.5-turbo"), aliases=("3.5", "chatgpt")) register(Chat("gpt-3.5-turbo-16k"), 
aliases=("chatgpt-16k", "3.5-16k")) register(Chat("gpt-4"), aliases=("4", "gpt4")) @@ -32,10 +37,6 @@ def register_models(register): register(Chat("gpt-4-0125-preview")) register(Chat("gpt-4-turbo-2024-04-09")) register(Chat("gpt-4-turbo"), aliases=("gpt-4-turbo-preview", "4-turbo", "4t")) - # GPT-4o - register(Chat("gpt-4o", vision=True), aliases=("4o",)) - register(Chat("gpt-4o-mini", vision=True), aliases=("4o-mini",)) - register(Chat("gpt-4o-audio-preview", audio=True)) # o1 register(Chat("o1-preview", can_stream=False, allows_system_prompt=False)) register(Chat("o1-mini", can_stream=False, allows_system_prompt=False)) diff --git a/tests/test_llm.py b/tests/test_llm.py index c303061d..a0058713 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -514,32 +514,39 @@ def test_openai_localai_configuration(mocked_localai, user_path): EXPECTED_OPTIONS = """ -OpenAI Chat: gpt-3.5-turbo (aliases: 3.5, chatgpt) - temperature: float - What sampling temperature to use, between 0 and 2. Higher values like - 0.8 will make the output more random, while lower values like 0.2 will - make it more focused and deterministic. - max_tokens: int - Maximum number of tokens to generate. - top_p: float - An alternative to sampling with temperature, called nucleus sampling, - where the model considers the results of the tokens with top_p - probability mass. So 0.1 means only the tokens comprising the top 10% - probability mass are considered. Recommended to use top_p or - temperature but not both. - frequency_penalty: float - Number between -2.0 and 2.0. Positive values penalize new tokens based - on their existing frequency in the text so far, decreasing the model's - likelihood to repeat the same line verbatim. - presence_penalty: float - Number between -2.0 and 2.0. Positive values penalize new tokens based - on whether they appear in the text so far, increasing the model's - likelihood to talk about new topics. - stop: str - A string where the API will stop generating further tokens. - logit_bias: dict, str - Modify the likelihood of specified tokens appearing in the completion. - Pass a JSON string like '{"1712":-100, "892":-100, "1489":-100}' +OpenAI Chat: gpt-4o (aliases: 4o) + Options: + temperature: float + What sampling temperature to use, between 0 and 2. Higher values like + 0.8 will make the output more random, while lower values like 0.2 will + make it more focused and deterministic. + max_tokens: int + Maximum number of tokens to generate. + top_p: float + An alternative to sampling with temperature, called nucleus sampling, + where the model considers the results of the tokens with top_p + probability mass. So 0.1 means only the tokens comprising the top 10% + probability mass are considered. Recommended to use top_p or + temperature but not both. + frequency_penalty: float + Number between -2.0 and 2.0. Positive values penalize new tokens based + on their existing frequency in the text so far, decreasing the model's + likelihood to repeat the same line verbatim. + presence_penalty: float + Number between -2.0 and 2.0. Positive values penalize new tokens based + on whether they appear in the text so far, increasing the model's + likelihood to talk about new topics. + stop: str + A string where the API will stop generating further tokens. + logit_bias: dict, str + Modify the likelihood of specified tokens appearing in the completion. 
+ Pass a JSON string like '{"1712":-100, "892":-100, "1489":-100}' + seed: int + Integer seed to attempt to sample deterministically + json_object: boolean + Output a valid JSON object {...}. Prompt must mention JSON. + Attachment types: + image/gif, image/jpeg, image/png, image/webp """ From 7146fe82d1380e63ab25677f1655ba74b23e6085 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Tue, 5 Nov 2024 23:03:31 -0800 Subject: [PATCH 072/149] Back to the deprecated Pydantic thing to get tests passing Refs #612 --- llm/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm/cli.py b/llm/cli.py index 3506b877..1c9b642e 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -963,9 +963,9 @@ def models_list(options): extra = " (aliases: {})".format(", ".join(model_with_aliases.aliases)) model = model_with_aliases.model output = str(model) + extra - if options and model.Options.model_json_schema()["properties"]: + if options and model.Options.schema()["properties"]: output += "\n Options:" - for name, field in model.Options.model_json_schema()["properties"].items(): + for name, field in model.Options.schema()["properties"].items(): any_of = field.get("anyOf") if any_of is None: any_of = [{"type": field["type"]}] From 3b2e5263a3330299017df5ff4d240591a02d706c Mon Sep 17 00:00:00 2001 From: Chris Mungall Date: Tue, 5 Nov 2024 23:04:13 -0800 Subject: [PATCH 073/149] Allow passing of can_stream in openai_models.py (#600) * Allow passing of can_stream in openai_models.py Fixes #599 * Only set can_stream: false if it is false Refs https://github.com/simonw/llm/pull/600#issuecomment-2458825866 * Docs for can_stream: false --------- Co-authored-by: Simon Willison --- docs/other-models.md | 2 ++ llm/default_plugins/openai_models.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/docs/other-models.md b/docs/other-models.md index 5149b43a..4e511681 100644 --- a/docs/other-models.md +++ b/docs/other-models.md @@ -45,6 +45,8 @@ You can set `api_key_name` to the name of a key stored using the {ref}`api-keys` Add `completion: true` if the model is a completion model that uses a `/completion` as opposed to a `/completion/chat` endpoint. +If a model does not support streaming, add `can_stream: false` to disable the streaming option. + Having configured the model like this, run `llm models` to check that it installed correctly. 
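For illustration only, a minimal sketch of what such a configuration entry might look like for a non-streaming model. The file name and the `model_id`/`model_name`/`api_base` key names are assumptions based on the surrounding configuration code in `llm/default_plugins/openai_models.py`; only `api_key_name`, `completion: true` and `can_stream: false` are named explicitly on this page:

```yaml
# Hypothetical entry - assumed to live in the extra OpenAI-compatible models
# YAML file described on this page (typically extra-openai-models.yaml in the
# LLM configuration directory)
- model_id: my-local-model        # assumed key: name used with llm -m
  model_name: my-local-model      # assumed key: name sent to the API
  api_base: "http://localhost:8000/v1"
  api_key_name: my-local-host     # optional, as described above
  can_stream: false               # this endpoint does not support streaming
```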
You can then run prompts against it like so: ```bash diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index 4229553c..fc0c73e0 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -61,6 +61,9 @@ def register_models(register): api_version = extra_model.get("api_version") api_engine = extra_model.get("api_engine") headers = extra_model.get("headers") + kwargs = {} + if extra_model.get("can_stream") is False: + kwargs["can_stream"] = False if extra_model.get("completion"): klass = Completion else: @@ -73,6 +76,7 @@ def register_models(register): api_version=api_version, api_engine=api_engine, headers=headers, + **kwargs, ) if api_base: chat_model.needs_key = None From 245e025270109db2ad63359b62f24d907bea47de Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Tue, 5 Nov 2024 23:45:17 -0800 Subject: [PATCH 074/149] Ran cog, refs #612 --- docs/usage.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index f5d80d11..942f2d6a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -279,7 +279,7 @@ OpenAI Chat: gpt-4o (aliases: 4o) json_object: boolean Output a valid JSON object {...}. Prompt must mention JSON. Attachment types: - image/png, image/gif, image/webp, image/jpeg + image/gif, image/jpeg, image/png, image/webp OpenAI Chat: gpt-4o-mini (aliases: 4o-mini) Options: temperature: float @@ -292,7 +292,7 @@ OpenAI Chat: gpt-4o-mini (aliases: 4o-mini) seed: int json_object: boolean Attachment types: - image/png, image/gif, image/webp, image/jpeg + image/gif, image/jpeg, image/png, image/webp OpenAI Chat: gpt-4o-audio-preview Options: temperature: float From 3352eb9f577d4360529b29fbaef725536d5a8882 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Wed, 6 Nov 2024 03:27:25 -0800 Subject: [PATCH 075/149] Serialize usage to JSON properly, closes #614 --- llm/default_plugins/openai_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index fc0c73e0..d7e29e05 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -514,7 +514,7 @@ def combine_chunks(chunks: List) -> dict: for item in chunks: if item.usage: - usage = dict(item.usage) + usage = item.usage.dict() for choice in item.choices: if choice.logprobs and hasattr(choice.logprobs, "top_logprobs"): logprobs.append( From 98d2c198761f94a37399446f04adef783d5e951f Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Wed, 6 Nov 2024 06:38:53 -0800 Subject: [PATCH 076/149] Promote alternative model providers in llm --help --- docs/help.md | 9 +++++++-- llm/cli.py | 9 +++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/docs/help.md b/docs/help.md index 0e28494a..f25927e9 100644 --- a/docs/help.md +++ b/docs/help.md @@ -41,11 +41,16 @@ cog.out(all_help(cli)) ``` Usage: llm [OPTIONS] COMMAND [ARGS]... - Access large language models from the command-line + Access Large Language Models from the command-line Documentation: https://llm.datasette.io/ - To get started, obtain an OpenAI key and set it like this: + LLM can run models from many different providers. Consult the plugin directory + for a list of available models: + + https://llm.datasette.io/en/stable/plugins/directory.html + + To get started with OpenAI, obtain an API key from them and: $ llm keys set openai Enter key: ... 
diff --git a/llm/cli.py b/llm/cli.py index 1c9b642e..ad7aeb4f 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -121,11 +121,16 @@ def _validate_metadata_json(ctx, param, value): @click.version_option() def cli(): """ - Access large language models from the command-line + Access Large Language Models from the command-line Documentation: https://llm.datasette.io/ - To get started, obtain an OpenAI key and set it like this: + LLM can run models from many different providers. Consult the + plugin directory for a list of available models: + + https://llm.datasette.io/en/stable/plugins/directory.html + + To get started with OpenAI, obtain an API key from them and: \b $ llm keys set openai From febbc04fb6d847bd35ce5ba0156ef010f5894564 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Wed, 6 Nov 2024 06:56:19 -0800 Subject: [PATCH 077/149] Run cog -r in PRs, use that to update logging.md with new tables (#616) * Create cog.yml * Document attachments and prompt_attachments table schemas Closes #615 --------- Co-authored-by: github-actions[bot] --- .github/workflows/cog.yml | 48 +++++++++++++++++++++++++++++++++++++++ docs/logging.md | 18 +++++++++++++-- 2 files changed, 64 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/cog.yml diff --git a/.github/workflows/cog.yml b/.github/workflows/cog.yml new file mode 100644 index 00000000..56ce1b27 --- /dev/null +++ b/.github/workflows/cog.yml @@ -0,0 +1,48 @@ +name: Run Cog + +on: + pull_request: + types: [opened, synchronize] + +permissions: + contents: write + pull-requests: write + +jobs: + run-cog: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} + + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install dependencies + run: pip install -e '.[test]' + + - name: Run cog + run: | + cog -r -p "import sys, os; sys._called_from_test=True; os.environ['LLM_USER_PATH'] = '/tmp'" docs/**/*.md docs/*.md + + - name: Check for changes + id: check-changes + run: | + if [ -n "$(git diff)" ]; then + echo "changes=true" >> $GITHUB_OUTPUT + else + echo "changes=false" >> $GITHUB_OUTPUT + fi + + - name: Commit and push if changed + if: steps.check-changes.outputs.changes == 'true' + run: | + git config --local user.email "github-actions[bot]@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + git add -A + git commit -m "Ran cog" + git push diff --git a/docs/logging.md b/docs/logging.md index ceca824f..63722e01 100644 --- a/docs/logging.md +++ b/docs/logging.md @@ -136,7 +136,7 @@ def cleanup_sql(sql): return first_line + '(\n ' + ',\n '.join(columns) + '\n);' cog.out("```sql\n") -for table in ("conversations", "responses", "responses_fts"): +for table in ("conversations", "responses", "responses_fts", "attachments", "prompt_attachments"): schema = db[table].schema cog.out(format(cleanup_sql(schema))) cog.out("\n") @@ -166,6 +166,20 @@ CREATE VIRTUAL TABLE [responses_fts] USING FTS5 ( [response], content=[responses] ); +CREATE TABLE [attachments] ( + [id] TEXT PRIMARY KEY, + [type] TEXT, + [path] TEXT, + [url] TEXT, + [content] BLOB +); +CREATE TABLE [prompt_attachments] ( + [response_id] TEXT REFERENCES [responses]([id]), + [attachment_id] TEXT REFERENCES [attachments]([id]), + [order] INTEGER, + PRIMARY KEY ([response_id], + [attachment_id]) +); ``` -`responses_fts` configures [SQLite full-text search](https://www.sqlite.org/fts5.html) against the `prompt` and `response` columns in the `responses` table. 
\ No newline at end of file +`responses_fts` configures [SQLite full-text search](https://www.sqlite.org/fts5.html) against the `prompt` and `response` columns in the `responses` table. From 5d1d723d4beb546eab4deb8bb8f740b2fe20e065 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 7 Nov 2024 17:13:54 -0800 Subject: [PATCH 078/149] Special case treat audio/wave as audio/wav, closes #603 --- llm/cli.py | 11 ++++++----- llm/models.py | 6 +++--- llm/utils.py | 23 ++++++++++++++++++++++- tests/conftest.py | 2 +- tests/test_attachments.py | 29 ++++++++++++++++++++--------- 5 files changed, 52 insertions(+), 19 deletions(-) diff --git a/llm/cli.py b/llm/cli.py index ad7aeb4f..6a6fb2cf 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -30,10 +30,10 @@ from .migrations import migrate from .plugins import pm +from .utils import mimetype_from_path, mimetype_from_string import base64 import httpx import pathlib -import puremagic import pydantic import readline from runpy import run_module @@ -58,9 +58,8 @@ def convert(self, value, param, ctx): if value == "-": content = sys.stdin.buffer.read() # Try to guess type - try: - mimetype = puremagic.from_string(content, mime=True) - except puremagic.PureError: + mimetype = mimetype_from_string(content) + if mimetype is None: raise click.BadParameter("Could not determine mimetype of stdin") return Attachment(type=mimetype, path=None, url=None, content=content) if "://" in value: @@ -78,7 +77,9 @@ def convert(self, value, param, ctx): self.fail(f"File {value} does not exist", param, ctx) path = path.resolve() # Try to guess type - mimetype = puremagic.from_file(str(path), mime=True) + mimetype = mimetype_from_path(str(path)) + if mimetype is None: + raise click.BadParameter(f"Could not determine mimetype of {value}") return Attachment(type=mimetype, path=str(path), url=None, content=None) diff --git a/llm/models.py b/llm/models.py index 838e25b1..485d9720 100644 --- a/llm/models.py +++ b/llm/models.py @@ -5,10 +5,10 @@ import hashlib import httpx from itertools import islice -import puremagic import re import time from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Union +from .utils import mimetype_from_path, mimetype_from_string from abc import ABC, abstractmethod import json from pydantic import BaseModel @@ -43,13 +43,13 @@ def resolve_type(self): return self.type # Derive it from path or url or content if self.path: - return puremagic.from_file(self.path, mime=True) + return mimetype_from_path(self.path) if self.url: response = httpx.head(self.url) response.raise_for_status() return response.headers.get("content-type") if self.content: - return puremagic.from_string(self.content, mime=True) + return mimetype_from_string(self.content) raise ValueError("Attachment has no type and no content to derive it from") def content_bytes(self): diff --git a/llm/utils.py b/llm/utils.py index 2ea9870a..d2618dd4 100644 --- a/llm/utils.py +++ b/llm/utils.py @@ -1,8 +1,29 @@ import click import httpx import json +import puremagic import textwrap -from typing import List, Dict +from typing import List, Dict, Optional + +MIME_TYPE_FIXES = { + "audio/wave": "audio/wav", +} + + +def mimetype_from_string(content) -> Optional[str]: + try: + type_ = puremagic.from_string(content, mime=True) + return MIME_TYPE_FIXES.get(type_, type_) + except puremagic.PureError: + return None + + +def mimetype_from_path(path) -> Optional[str]: + try: + type_ = puremagic.from_file(path, mime=True) + return MIME_TYPE_FIXES.get(type_, type_) + except puremagic.PureError: + 
return None def dicts_to_table_string( diff --git a/tests/conftest.py b/tests/conftest.py index bcdb8854..7d44b757 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -49,7 +49,7 @@ def env_setup(monkeypatch, user_path): class MockModel(llm.Model): model_id = "mock" - attachment_types = {"image/png"} + attachment_types = {"image/png", "audio/wav"} class Options(llm.Options): max_tokens: Optional[int] = Field( diff --git a/tests/test_attachments.py b/tests/test_attachments.py index 89a5b81a..e5417d47 100644 --- a/tests/test_attachments.py +++ b/tests/test_attachments.py @@ -1,6 +1,8 @@ from click.testing import CliRunner from unittest.mock import ANY import llm +from llm import cli +import pytest TINY_PNG = ( b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\xa6\x00\x00\x01\x1a" @@ -12,20 +14,29 @@ b"\x82" ) +TINY_WAV = b"RIFF$\x00\x00\x00WAVEfmt \x10\x00\x00\x00\x01\x00\x01\x00D\xac\x00\x00" -def test_prompt_image(mock_model, logs_db): + +@pytest.mark.parametrize( + "attachment_type,attachment_content", + [ + ("image/png", TINY_PNG), + ("audio/wav", TINY_WAV), + ], +) +def test_prompt_attachment(mock_model, logs_db, attachment_type, attachment_content): runner = CliRunner() mock_model.enqueue(["two boxes"]) result = runner.invoke( - llm.cli.cli, - ["prompt", "-m", "mock", "describe image", "-a", "-"], - input=TINY_PNG, + cli.cli, + ["prompt", "-m", "mock", "describe file", "-a", "-"], + input=attachment_content, catch_exceptions=False, ) - assert result.exit_code == 0 + assert result.exit_code == 0, result.output assert result.output == "two boxes\n" assert mock_model.history[0][0].attachments[0] == llm.Attachment( - type="image/png", path=None, url=None, content=TINY_PNG, _id=ANY + type=attachment_type, path=None, url=None, content=attachment_content, _id=ANY ) # Check it was logged correctly @@ -33,15 +44,15 @@ def test_prompt_image(mock_model, logs_db): assert len(conversations) == 1 conversation = conversations[0] assert conversation["model"] == "mock" - assert conversation["name"] == "describe image" + assert conversation["name"] == "describe file" response = list(logs_db["responses"].rows)[0] attachment = list(logs_db["attachments"].rows)[0] assert attachment == { "id": ANY, - "type": "image/png", + "type": attachment_type, "path": None, "url": None, - "content": TINY_PNG, + "content": attachment_content, } prompt_attachment = list(logs_db["prompt_attachments"].rows)[0] assert prompt_attachment["attachment_id"] == attachment["id"] From 561784df6edf2ab415c784d571b434f9f81bf03e Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 11 Nov 2024 09:47:13 -0800 Subject: [PATCH 079/149] llm keys get command, refs #623 --- docs/help.md | 12 ++++++++++++ llm/cli.py | 14 ++++++++++++++ tests/test_keys.py | 12 ++++++++++++ 3 files changed, 38 insertions(+) diff --git a/docs/help.md b/docs/help.md index f25927e9..6d9ac6da 100644 --- a/docs/help.md +++ b/docs/help.md @@ -156,6 +156,7 @@ Options: Commands: list* List names of all stored keys + get Return the value of a stored key path Output the path to the keys.json file set Save a key in the keys.json file ``` @@ -182,6 +183,17 @@ Options: --help Show this message and exit. ``` +(help-keys-get)= +#### llm keys get --help +``` +Usage: llm keys get [OPTIONS] NAME + + Return the value of a stored key + +Options: + --help Show this message and exit. 
+``` + (help-keys-set)= #### llm keys set --help ``` diff --git a/llm/cli.py b/llm/cli.py index 6a6fb2cf..c864e235 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -604,6 +604,20 @@ def keys_path_command(): click.echo(user_dir() / "keys.json") +@keys.command(name="get") +@click.argument("name") +def keys_get(name): + "Return the value of a stored key" + path = user_dir() / "keys.json" + if not path.exists(): + raise click.ClickException("No keys found") + keys = json.loads(path.read_text()) + try: + click.echo(keys[name]) + except KeyError: + raise click.ClickException("No key found with name '{}'".format(name)) + + @keys.command(name="set") @click.argument("name") @click.option("--value", prompt="Enter key", hide_input=True, help="Value to set") diff --git a/tests/test_keys.py b/tests/test_keys.py index 5a5649a0..ae142d00 100644 --- a/tests/test_keys.py +++ b/tests/test_keys.py @@ -40,6 +40,18 @@ def test_keys_set(monkeypatch, tmpdir): } +@pytest.mark.xfail(sys.platform == "win32", reason="Expected to fail on Windows") +def test_keys_get(monkeypatch, tmpdir): + user_path = tmpdir / "user/keys" + monkeypatch.setenv("LLM_USER_PATH", str(user_path)) + runner = CliRunner() + result = runner.invoke(cli, ["keys", "set", "openai"], input="fx") + assert result.exit_code == 0 + result2 = runner.invoke(cli, ["keys", "get", "openai"]) + assert result2.exit_code == 0 + assert result2.output.strip() == "fx" + + @pytest.mark.parametrize("args", (["keys", "list"], ["keys"])) def test_keys_list(monkeypatch, tmpdir, args): user_path = str(tmpdir / "user/keys") From dff53a9caebd01acdb8c79f6f52e0b5f8b038475 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 11 Nov 2024 09:53:24 -0800 Subject: [PATCH 080/149] Better --help for llm keys get, refs #623 --- docs/help.md | 4 ++++ llm/cli.py | 9 ++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/docs/help.md b/docs/help.md index 6d9ac6da..e5ad9317 100644 --- a/docs/help.md +++ b/docs/help.md @@ -190,6 +190,10 @@ Usage: llm keys get [OPTIONS] NAME Return the value of a stored key + Example usage: + + export OPENAI_API_KEY=$(llm keys get openai) + Options: --help Show this message and exit. ``` diff --git a/llm/cli.py b/llm/cli.py index c864e235..69cd44e6 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -607,7 +607,14 @@ def keys_path_command(): @keys.command(name="get") @click.argument("name") def keys_get(name): - "Return the value of a stored key" + """ + Return the value of a stored key + + Example usage: + + \b + export OPENAI_API_KEY=$(llm keys get openai) + """ path = user_dir() / "keys.json" if not path.exists(): raise click.ClickException("No keys found") From c0cb1697bcf58a2de01c0fe78c9d20dff2eaa714 Mon Sep 17 00:00:00 2001 From: Travis Northcutt Date: Tue, 12 Nov 2024 20:06:16 -0700 Subject: [PATCH 081/149] Update default model information (#622) The default model is now 4o-mini; this change updates the usage page of the docs to reflect that --- docs/usage.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 942f2d6a..564e4382 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -6,7 +6,7 @@ The command to run a prompt is `llm prompt 'your prompt'`. This is the default c (usage-executing-prompts)= ## Executing a prompt -These examples use the default OpenAI `gpt-3.5-turbo` model, which requires you to first {ref}`set an OpenAI API key `. +These examples use the default OpenAI `gpt-4o-mini` model, which requires you to first {ref}`set an OpenAI API key `. 
You can {ref}`install LLM plugins ` to use models from other providers, including openly licensed models you can run directly on your own computer. @@ -18,7 +18,7 @@ To disable streaming and only return the response once it has completed: ```bash llm 'Ten names for cheesecakes' --no-stream ``` -To switch from ChatGPT 3.5 (the default) to GPT-4o: +To switch from ChatGPT 4o-mini (the default) to GPT-4o: ```bash llm 'Ten names for cheesecakes' -m gpt-4o ``` From d34eac57d39c20d829a0c090cdda9e64705d9291 Mon Sep 17 00:00:00 2001 From: gabriel pita Date: Wed, 13 Nov 2024 00:07:28 -0300 Subject: [PATCH 082/149] Update README.md (#621) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f027d36a..797b5162 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ Type '!multi' to enter multiple lines, then '!end' to finish You can use the `-s/--system` option to set a system prompt, providing instructions for processing other input to the tool. -To describe how the code a file works, try this: +To describe how the code in a file works, try this: ```bash cat mycode.py | llm -s "Explain this code" From 330e171e8689f3f856d3821d8f76f1825e9a4bec Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Tue, 12 Nov 2024 21:10:03 -0800 Subject: [PATCH 083/149] Fix for not loading plugins during tests, refs #626 --- llm/__init__.py | 5 ++++- llm/cli.py | 4 ++++ llm/plugins.py | 57 ++++++++++++++++++++++++++++--------------------- 3 files changed, 41 insertions(+), 25 deletions(-) diff --git a/llm/__init__.py b/llm/__init__.py index 0ea6c242..49eff551 100644 --- a/llm/__init__.py +++ b/llm/__init__.py @@ -16,7 +16,7 @@ ) from .embeddings import Collection from .templates import Template -from .plugins import pm +from .plugins import pm, load_plugins import click from typing import Dict, List, Optional import json @@ -80,6 +80,7 @@ def register(model, aliases=None): alias_list.extend(extra_model_aliases[model.model_id]) model_aliases.append(ModelWithAliases(model, alias_list)) + load_plugins() pm.hook.register_models(register=register) return model_aliases @@ -102,6 +103,7 @@ def register(model, aliases=None): alias_list.extend(extra_model_aliases[model.model_id]) model_aliases.append(EmbeddingModelWithAliases(model, alias_list)) + load_plugins() pm.hook.register_embedding_models(register=register) return model_aliases @@ -113,6 +115,7 @@ def get_embedding_models(): def register(model, aliases=None): models.append(model) + load_plugins() pm.hook.register_embedding_models(register=register) return models diff --git a/llm/cli.py b/llm/cli.py index 69cd44e6..2cc06395 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -1810,6 +1810,10 @@ def render_errors(errors): return "\n".join(output) +from .plugins import load_plugins + +load_plugins() + pm.hook.register_commands(cli=cli) diff --git a/llm/plugins.py b/llm/plugins.py index 933725c7..5c00b9e6 100644 --- a/llm/plugins.py +++ b/llm/plugins.py @@ -12,27 +12,36 @@ LLM_LOAD_PLUGINS = os.environ.get("LLM_LOAD_PLUGINS", None) -if not hasattr(sys, "_called_from_test") and LLM_LOAD_PLUGINS is None: - # Only load plugins if not running tests - pm.load_setuptools_entrypoints("llm") - - -# Load any plugins specified in LLM_LOAD_PLUGINS") -if LLM_LOAD_PLUGINS is not None: - for package_name in [name for name in LLM_LOAD_PLUGINS.split(",") if name.strip()]: - try: - distribution = metadata.distribution(package_name) # Updated call - llm_entry_points = [ - ep for ep in distribution.entry_points if ep.group == "llm" - ] - for 
entry_point in llm_entry_points: - mod = entry_point.load() - pm.register(mod, name=entry_point.name) - # Ensure name can be found in plugin_to_distinfo later: - pm._plugin_distinfo.append((mod, distribution)) # type: ignore - except metadata.PackageNotFoundError: - sys.stderr.write(f"Plugin {package_name} could not be found\n") - -for plugin in DEFAULT_PLUGINS: - mod = importlib.import_module(plugin) - pm.register(mod, plugin) +_loaded = False + + +def load_plugins(): + global _loaded + if _loaded: + return + _loaded = True + if not hasattr(sys, "_called_from_test") and LLM_LOAD_PLUGINS is None: + # Only load plugins if not running tests + pm.load_setuptools_entrypoints("llm") + + # Load any plugins specified in LLM_LOAD_PLUGINS") + if LLM_LOAD_PLUGINS is not None: + for package_name in [ + name for name in LLM_LOAD_PLUGINS.split(",") if name.strip() + ]: + try: + distribution = metadata.distribution(package_name) # Updated call + llm_entry_points = [ + ep for ep in distribution.entry_points if ep.group == "llm" + ] + for entry_point in llm_entry_points: + mod = entry_point.load() + pm.register(mod, name=entry_point.name) + # Ensure name can be found in plugin_to_distinfo later: + pm._plugin_distinfo.append((mod, distribution)) # type: ignore + except metadata.PackageNotFoundError: + sys.stderr.write(f"Plugin {package_name} could not be found\n") + + for plugin in DEFAULT_PLUGINS: + mod = importlib.import_module(plugin) + pm.register(mod, plugin) From 7520671176ad24680632b47138862c2c548a9368 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Tue, 12 Nov 2024 21:11:05 -0800 Subject: [PATCH 084/149] audio/wav not audio/wave, refs #603 --- docs/plugins/advanced-model-plugins.md | 2 +- docs/usage.md | 2 +- llm/default_plugins/openai_models.py | 4 ++-- tests/test_cli_openai_models.py | 7 +++---- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/docs/plugins/advanced-model-plugins.md b/docs/plugins/advanced-model-plugins.md index a201237c..b9a16885 100644 --- a/docs/plugins/advanced-model-plugins.md +++ b/docs/plugins/advanced-model-plugins.md @@ -79,7 +79,7 @@ def _attachment(attachment): if attachment.resolve_type().startswith("image/"): return {"type": "image_url", "image_url": {"url": url}} else: - format_ = "wav" if attachment.resolve_type() == "audio/wave" else "mp3" + format_ = "wav" if attachment.resolve_type() == "audio/wav" else "mp3" return { "type": "input_audio", "input_audio": { diff --git a/docs/usage.md b/docs/usage.md index 564e4382..dd44ff10 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -305,7 +305,7 @@ OpenAI Chat: gpt-4o-audio-preview seed: int json_object: boolean Attachment types: - audio/mpeg, audio/wave + audio/mpeg, audio/wav OpenAI Chat: gpt-3.5-turbo (aliases: 3.5, chatgpt) Options: temperature: float diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index d7e29e05..cc68df03 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -263,7 +263,7 @@ def _attachment(attachment): if attachment.resolve_type().startswith("image/"): return {"type": "image_url", "image_url": {"url": url}} else: - format_ = "wav" if attachment.resolve_type() == "audio/wave" else "mp3" + format_ = "wav" if attachment.resolve_type() == "audio/wav" else "mp3" return { "type": "input_audio", "input_audio": { @@ -327,7 +327,7 @@ def __init__( if audio: self.attachment_types.update( { - "audio/wave", + "audio/wav", "audio/mpeg", } ) diff --git a/tests/test_cli_openai_models.py 
b/tests/test_cli_openai_models.py index f341e385..7cbab726 100644 --- a/tests/test_cli_openai_models.py +++ b/tests/test_cli_openai_models.py @@ -65,9 +65,8 @@ def test_only_gpt4_audio_preview_allows_mp3_or_wav(httpx_mock, model, filetype): method="HEAD", url=f"https://www.example.com/example.{filetype}", content=b"binary-data", - headers={"Content-Type": "audio/mpeg" if filetype == "mp3" else "audio/wave"}, + headers={"Content-Type": "audio/mpeg" if filetype == "mp3" else "audio/wav"}, ) - # Another mock for the correct model if model == "gpt-4o-audio-preview": httpx_mock.add_response( method="POST", @@ -116,7 +115,7 @@ def test_only_gpt4_audio_preview_allows_mp3_or_wav(httpx_mock, model, filetype): url=f"https://www.example.com/example.{filetype}", content=b"binary-data", headers={ - "Content-Type": "audio/mpeg" if filetype == "mp3" else "audio/wave" + "Content-Type": "audio/mpeg" if filetype == "mp3" else "audio/wav" }, ) runner = CliRunner() @@ -140,7 +139,7 @@ def test_only_gpt4_audio_preview_allows_mp3_or_wav(httpx_mock, model, filetype): ) else: assert result.exit_code == 1 - long = "audio/mpeg" if filetype == "mp3" else "audio/wave" + long = "audio/mpeg" if filetype == "mp3" else "audio/wav" assert ( f"This model does not support attachments of type '{long}'" in result.output ) From bc96e1c7390440f9dc7fed3d00aa9905a4016ba1 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Wed, 13 Nov 2024 06:37:31 -0800 Subject: [PATCH 085/149] Ruff fix for #626 --- llm/cli.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llm/cli.py b/llm/cli.py index 2cc06395..afaa663c 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -29,7 +29,7 @@ ) from .migrations import migrate -from .plugins import pm +from .plugins import pm, load_plugins from .utils import mimetype_from_path, mimetype_from_string import base64 import httpx @@ -1810,8 +1810,6 @@ def render_errors(errors): return "\n".join(output) -from .plugins import load_plugins - load_plugins() pm.hook.register_commands(cli=cli) From 5a984d0c87c5f4ce21c8f3c5b499faaccccc10e9 Mon Sep 17 00:00:00 2001 From: Hiepler Date: Thu, 14 Nov 2024 02:21:04 +0100 Subject: [PATCH 086/149] docs: add llm-grok (#629) Adds`llm-grok` xAI API (https://github.com/Hiepler/llm-grok) to the plugin directory. !stable-docs --- docs/plugins/directory.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/plugins/directory.md b/docs/plugins/directory.md index c94795e4..4da2bbf5 100644 --- a/docs/plugins/directory.md +++ b/docs/plugins/directory.md @@ -27,6 +27,7 @@ These plugins can be used to interact with remotely hosted models via their API: - **[llm-reka](https://github.com/simonw/llm-reka)** supports the [Reka](https://www.reka.ai/) family of models via their API. - **[llm-perplexity](https://github.com/hex/llm-perplexity)** by Alexandru Geana supports the [Perplexity Labs](https://docs.perplexity.ai/) API models, including `llama-3-sonar-large-32k-online` which can search for things online and `llama-3-70b-instruct`. - **[llm-groq](https://github.com/angerman/llm-groq)** by Moritz Angermann provides access to fast models hosted by [Groq](https://console.groq.com/docs/models). +- **[llm-grok](https://github.com/Hiepler/llm-grok)** by Benedikt Hiepler providing access to Grok model using the xAI API [Grok](https://x.ai/api). 
- **[llm-anyscale-endpoints](https://github.com/simonw/llm-anyscale-endpoints)** supports models hosted on the [Anyscale Endpoints](https://app.endpoints.anyscale.com/) platform, including Llama 2 70B. - **[llm-replicate](https://github.com/simonw/llm-replicate)** adds support for remote models hosted on [Replicate](https://replicate.com/), including Llama 2 from Meta AI. - **[llm-fireworks](https://github.com/simonw/llm-fireworks)** supports models hosted by [Fireworks AI](https://fireworks.ai/). From ba75c674cb3f357a5ab7f2cfae02853e70091e45 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Wed, 13 Nov 2024 17:51:00 -0800 Subject: [PATCH 087/149] llm.get_async_model(), llm.AsyncModel base class and OpenAI async models (#613) - https://github.com/simonw/llm/issues/507#issuecomment-2458639308 * register_model is now async aware Refs https://github.com/simonw/llm/issues/507#issuecomment-2458658134 * Refactor Chat and AsyncChat to use _Shared base class Refs https://github.com/simonw/llm/issues/507#issuecomment-2458692338 * fixed function name * Fix for infinite loop * Applied Black * Ran cog * Applied Black * Add Response.from_row() classmethod back again It does not matter that this is a blocking call, since it is a classmethod * Made mypy happy with llm/models.py * mypy fixes for openai_models.py I am unhappy with this, had to duplicate some code. * First test for AsyncModel * Still have not quite got this working * Fix for not loading plugins during tests, refs #626 * audio/wav not audio/wave, refs #603 * Black and mypy and ruff all happy * Refactor to avoid generics * Removed obsolete response() method * Support text = await async_mock_model.prompt("hello") * Initial docs for llm.get_async_model() and await model.prompt() Refs #507 * Initial async model plugin creation docs * duration_ms ANY to pass test * llm models --async option Refs https://github.com/simonw/llm/pull/613#issuecomment-2474724406 * Removed obsolete TypeVars * Expanded register_models() docs for async * await model.prompt() now returns AsyncResponse Refs https://github.com/simonw/llm/pull/613#issuecomment-2475157822 --------- Co-authored-by: github-actions[bot] --- docs/help.md | 2 + docs/plugins/advanced-model-plugins.md | 51 +++ docs/plugins/plugin-hooks.md | 17 +- docs/python-api.md | 30 +- llm/__init__.py | 60 +++- llm/cli.py | 69 ++++- llm/default_plugins/openai_models.py | 200 ++++++++---- llm/models.py | 410 +++++++++++++++++-------- pytest.ini | 3 +- setup.py | 1 + tests/conftest.py | 32 +- tests/test_async.py | 17 + tests/test_chat.py | 7 +- tests/test_llm.py | 8 + 14 files changed, 688 insertions(+), 219 deletions(-) create mode 100644 tests/test_async.py diff --git a/docs/help.md b/docs/help.md index e5ad9317..9db540a3 100644 --- a/docs/help.md +++ b/docs/help.md @@ -121,6 +121,7 @@ Options: --cid, --conversation TEXT Continue the conversation with the given ID. --key TEXT API key to use --save TEXT Save prompt with this template name + --async Run prompt asynchronously --help Show this message and exit. ``` @@ -322,6 +323,7 @@ Usage: llm models list [OPTIONS] Options: --options Show options for each model, if available + --async List async models --help Show this message and exit. 
``` diff --git a/docs/plugins/advanced-model-plugins.md b/docs/plugins/advanced-model-plugins.md index b9a16885..1793c751 100644 --- a/docs/plugins/advanced-model-plugins.md +++ b/docs/plugins/advanced-model-plugins.md @@ -5,13 +5,64 @@ The {ref}`model plugin tutorial ` covers the basics of de This document covers more advanced topics. +(advanced-model-plugins-async)= + +## Async models + +Plugins can optionally provide an asynchronous version of their model, suitable for use with Python [asyncio](https://docs.python.org/3/library/asyncio.html). This is particularly useful for remote models accessible by an HTTP API. + +The async version of a model subclasses `llm.AsyncModel` instead of `llm.Model`. It must implement an `async def execute()` async generator method instead of `def execute()`. + +This example shows a subset of the OpenAI default plugin illustrating how this method might work: + + +```python +from typing import AsyncGenerator +import llm + +class MyAsyncModel(llm.AsyncModel): + # This cn duplicate the model_id of the sync model: + model_id = "my-model-id" + + async def execute( + self, prompt, stream, response, conversation=None + ) -> AsyncGenerator[str, None]: + if stream: + completion = await client.chat.completions.create( + model=self.model_id, + messages=messages, + stream=True, + ) + async for chunk in completion: + yield chunk.choices[0].delta.content + else: + completion = await client.chat.completions.create( + model=self.model_name or self.model_id, + messages=messages, + stream=False, + ) + yield completion.choices[0].message.content +``` +This async model instance should then be passed to the `register()` method in the `register_models()` plugin hook: + +```python +@hookimpl +def register_models(register): + register( + MyModel(), MyAsyncModel(), aliases=("my-model-aliases",) + ) +``` + (advanced-model-plugins-attachments)= + ## Attachments for multi-modal models Models such as GPT-4o, Claude 3.5 Sonnet and Google's Gemini 1.5 are multi-modal: they accept input in the form of images and maybe even audio, video and other formats. LLM calls these **attachments**. Models can specify the types of attachments they accept and then implement special code in the `.execute()` method to handle them. +See {ref}`the Python attachments documentation ` for details on using attachments in the Python API. + ### Specifying attachment types A `Model` subclass can list the types of attachments it accepts by defining a `attachment_types` class attribute: diff --git a/docs/plugins/plugin-hooks.md b/docs/plugins/plugin-hooks.md index 1d7d58f6..0f38cd64 100644 --- a/docs/plugins/plugin-hooks.md +++ b/docs/plugins/plugin-hooks.md @@ -42,5 +42,20 @@ class HelloWorld(llm.Model): def execute(self, prompt, stream, response): return ["hello world"] ``` +If your model includes an async version, you can register that too: + +```python +class AsyncHelloWorld(llm.AsyncModel): + model_id = "helloworld" + + async def execute(self, prompt, stream, response): + return ["hello world"] + +@llm.hookimpl +def register_models(register): + register(HelloWorld(), AsyncHelloWorld(), aliases=("hw",)) +``` +This demonstrates how to register a model with both sync and async versions, and how to specify an alias for that model. + +The {ref}`model plugin tutorial ` describes how to use this hook in detail. Asynchronous models {ref}`are described here `. -{ref}`tutorial-model-plugin` describes how to use this hook in detail. 
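As a brief sketch of how the async model registered above might then be called from ordinary synchronous code, assuming the example plugin is installed and using the `llm.get_async_model()` and `AsyncResponse` API introduced in this change:

```python
import asyncio

import llm


async def main():
    # Resolve the async variant registered under the "hw" alias above
    model = llm.get_async_model("hw")
    # await model.prompt(...) returns an AsyncResponse
    response = await model.prompt("Say hello")
    print(await response.text())


# Drive the coroutine from regular synchronous code
asyncio.run(main())
```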
diff --git a/docs/python-api.md b/docs/python-api.md index ae135a68..0450031a 100644 --- a/docs/python-api.md +++ b/docs/python-api.md @@ -99,7 +99,7 @@ print(response.text()) ``` Some models do not use API keys at all. -## Streaming responses +### Streaming responses For models that support it you can stream responses as they are generated, like this: @@ -112,6 +112,34 @@ The `response.text()` method described earlier does this for you - it runs throu If a response has been evaluated, `response.text()` will continue to return the same string. +(python-api-async)= + +## Async models + +Some plugins provide async versions of their supported models, suitable for use with Python [asyncio](https://docs.python.org/3/library/asyncio.html). + +To use an async model, use the `llm.get_async_model()` function instead of `llm.get_model()`: + +```python +import llm +model = llm.get_async_model("gpt-4o") +``` +You can then run a prompt using `await model.prompt(...)`: + +```python +response = await model.prompt( + "Five surprising names for a pet pelican" +) +print(await response.text()) +``` +Or use `async for chunk in ...` to stream the response as it is generated: +```python +async for chunk in model.prompt( + "Five surprising names for a pet pelican" +): + print(chunk, end="", flush=True) +``` + ## Conversations LLM supports *conversations*, where you ask follow-up questions of a model as part of an ongoing conversation. diff --git a/llm/__init__.py b/llm/__init__.py index 49eff551..d6df280f 100644 --- a/llm/__init__.py +++ b/llm/__init__.py @@ -4,6 +4,8 @@ NeedsKeyException, ) from .models import ( + AsyncModel, + AsyncResponse, Attachment, Conversation, Model, @@ -26,9 +28,11 @@ __all__ = [ "hookimpl", + "get_async_model", "get_model", "get_key", "user_dir", + "AsyncResponse", "Attachment", "Collection", "Conversation", @@ -74,11 +78,11 @@ def get_models_with_aliases() -> List["ModelWithAliases"]: for alias, model_id in configured_aliases.items(): extra_model_aliases.setdefault(model_id, []).append(alias) - def register(model, aliases=None): + def register(model, async_model=None, aliases=None): alias_list = list(aliases or []) if model.model_id in extra_model_aliases: alias_list.extend(extra_model_aliases[model.model_id]) - model_aliases.append(ModelWithAliases(model, alias_list)) + model_aliases.append(ModelWithAliases(model, async_model, alias_list)) load_plugins() pm.hook.register_models(register=register) @@ -137,12 +141,25 @@ def get_embedding_model_aliases() -> Dict[str, EmbeddingModel]: return model_aliases +def get_async_model_aliases() -> Dict[str, AsyncModel]: + async_model_aliases = {} + for model_with_aliases in get_models_with_aliases(): + if model_with_aliases.async_model: + for alias in model_with_aliases.aliases: + async_model_aliases[alias] = model_with_aliases.async_model + async_model_aliases[model_with_aliases.model.model_id] = ( + model_with_aliases.async_model + ) + return async_model_aliases + + def get_model_aliases() -> Dict[str, Model]: model_aliases = {} for model_with_aliases in get_models_with_aliases(): - for alias in model_with_aliases.aliases: - model_aliases[alias] = model_with_aliases.model - model_aliases[model_with_aliases.model.model_id] = model_with_aliases.model + if model_with_aliases.model: + for alias in model_with_aliases.aliases: + model_aliases[alias] = model_with_aliases.model + model_aliases[model_with_aliases.model.model_id] = model_with_aliases.model return model_aliases @@ -150,13 +167,42 @@ class UnknownModelError(KeyError): pass -def 
get_model(name: Optional[str] = None) -> Model: +def get_async_model(name: Optional[str] = None) -> AsyncModel: + aliases = get_async_model_aliases() + name = name or get_default_model() + try: + return aliases[name] + except KeyError: + # Does a sync model exist? + sync_model = None + try: + sync_model = get_model(name, _skip_async=True) + except UnknownModelError: + pass + if sync_model: + raise UnknownModelError("Unknown async model (sync model exists): " + name) + else: + raise UnknownModelError("Unknown model: " + name) + + +def get_model(name: Optional[str] = None, _skip_async: bool = False) -> Model: aliases = get_model_aliases() name = name or get_default_model() try: return aliases[name] except KeyError: - raise UnknownModelError("Unknown model: " + name) + # Does an async model exist? + if _skip_async: + raise UnknownModelError("Unknown model: " + name) + async_model = None + try: + async_model = get_async_model(name) + except UnknownModelError: + pass + if async_model: + raise UnknownModelError("Unknown model (async model exists): " + name) + else: + raise UnknownModelError("Unknown model: " + name) def get_key( diff --git a/llm/cli.py b/llm/cli.py index afaa663c..5a9f20b4 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -1,3 +1,4 @@ +import asyncio import click from click_default_group import DefaultGroup from dataclasses import asdict @@ -11,6 +12,7 @@ Template, UnknownModelError, encode, + get_async_model, get_default_model, get_default_embedding_model, get_embedding_models_with_aliases, @@ -199,6 +201,7 @@ def cli(): ) @click.option("--key", help="API key to use") @click.option("--save", help="Save prompt with this template name") +@click.option("async_", "--async", is_flag=True, help="Run prompt asynchronously") def prompt( prompt, system, @@ -215,6 +218,7 @@ def prompt( conversation_id, key, save, + async_, ): """ Execute a prompt @@ -337,9 +341,12 @@ def read_prompt(): # Now resolve the model try: - model = model_aliases[model_id] - except KeyError: - raise click.ClickException("'{}' is not a known model".format(model_id)) + if async_: + model = get_async_model(model_id) + else: + model = get_model(model_id) + except UnknownModelError as ex: + raise click.ClickException(ex) # Provide the API key, if one is needed and has been provided if model.needs_key: @@ -375,21 +382,48 @@ def read_prompt(): prompt_method = conversation.prompt try: - response = prompt_method( - prompt, attachments=resolved_attachments, system=system, **validated_options - ) - if should_stream: - for chunk in response: - print(chunk, end="") - sys.stdout.flush() - print("") + if async_: + + async def inner(): + if should_stream: + async for chunk in prompt_method( + prompt, + attachments=resolved_attachments, + system=system, + **validated_options, + ): + print(chunk, end="") + sys.stdout.flush() + print("") + else: + response = prompt_method( + prompt, + attachments=resolved_attachments, + system=system, + **validated_options, + ) + print(await response.text()) + + asyncio.run(inner()) else: - print(response.text()) + response = prompt_method( + prompt, + attachments=resolved_attachments, + system=system, + **validated_options, + ) + if should_stream: + for chunk in response: + print(chunk, end="") + sys.stdout.flush() + print("") + else: + print(response.text()) except Exception as ex: raise click.ClickException(str(ex)) # Log to the database - if (logs_on() or log) and not no_log: + if (logs_on() or log) and not no_log and not async_: log_path = logs_db_path() (log_path.parent).mkdir(parents=True, 
exist_ok=True) db = sqlite_utils.Database(log_path) @@ -981,14 +1015,19 @@ def models(): @click.option( "--options", is_flag=True, help="Show options for each model, if available" ) -def models_list(options): +@click.option("async_", "--async", is_flag=True, help="List async models") +def models_list(options, async_): "List available models" models_that_have_shown_options = set() for model_with_aliases in get_models_with_aliases(): + if async_ and not model_with_aliases.async_model: + continue extra = "" if model_with_aliases.aliases: extra = " (aliases: {})".format(", ".join(model_with_aliases.aliases)) - model = model_with_aliases.model + model = ( + model_with_aliases.model if not async_ else model_with_aliases.async_model + ) output = str(model) + extra if options and model.Options.schema()["properties"]: output += "\n Options:" diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index cc68df03..82f737c5 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -1,4 +1,4 @@ -from llm import EmbeddingModel, Model, hookimpl +from llm import AsyncModel, EmbeddingModel, Model, hookimpl import llm from llm.utils import dicts_to_table_string, remove_dict_none_values, logging_client import click @@ -16,7 +16,7 @@ from pydantic.fields import Field from pydantic.class_validators import validator as field_validator # type: ignore [no-redef] -from typing import List, Iterable, Iterator, Optional, Union +from typing import AsyncGenerator, List, Iterable, Iterator, Optional, Union import json import yaml @@ -24,22 +24,47 @@ @hookimpl def register_models(register): # GPT-4o - register(Chat("gpt-4o", vision=True), aliases=("4o",)) - register(Chat("gpt-4o-mini", vision=True), aliases=("4o-mini",)) - register(Chat("gpt-4o-audio-preview", audio=True)) + register( + Chat("gpt-4o", vision=True), AsyncChat("gpt-4o", vision=True), aliases=("4o",) + ) + register( + Chat("gpt-4o-mini", vision=True), + AsyncChat("gpt-4o-mini", vision=True), + aliases=("4o-mini",), + ) + register( + Chat("gpt-4o-audio-preview", audio=True), + AsyncChat("gpt-4o-audio-preview", audio=True), + ) # 3.5 and 4 - register(Chat("gpt-3.5-turbo"), aliases=("3.5", "chatgpt")) - register(Chat("gpt-3.5-turbo-16k"), aliases=("chatgpt-16k", "3.5-16k")) - register(Chat("gpt-4"), aliases=("4", "gpt4")) - register(Chat("gpt-4-32k"), aliases=("4-32k",)) + register( + Chat("gpt-3.5-turbo"), AsyncChat("gpt-3.5-turbo"), aliases=("3.5", "chatgpt") + ) + register( + Chat("gpt-3.5-turbo-16k"), + AsyncChat("gpt-3.5-turbo-16k"), + aliases=("chatgpt-16k", "3.5-16k"), + ) + register(Chat("gpt-4"), AsyncChat("gpt-4"), aliases=("4", "gpt4")) + register(Chat("gpt-4-32k"), AsyncChat("gpt-4-32k"), aliases=("4-32k",)) # GPT-4 Turbo models - register(Chat("gpt-4-1106-preview")) - register(Chat("gpt-4-0125-preview")) - register(Chat("gpt-4-turbo-2024-04-09")) - register(Chat("gpt-4-turbo"), aliases=("gpt-4-turbo-preview", "4-turbo", "4t")) + register(Chat("gpt-4-1106-preview"), AsyncChat("gpt-4-1106-preview")) + register(Chat("gpt-4-0125-preview"), AsyncChat("gpt-4-0125-preview")) + register(Chat("gpt-4-turbo-2024-04-09"), AsyncChat("gpt-4-turbo-2024-04-09")) + register( + Chat("gpt-4-turbo"), + AsyncChat("gpt-4-turbo"), + aliases=("gpt-4-turbo-preview", "4-turbo", "4t"), + ) # o1 - register(Chat("o1-preview", can_stream=False, allows_system_prompt=False)) - register(Chat("o1-mini", can_stream=False, allows_system_prompt=False)) + register( + Chat("o1-preview", can_stream=False, 
allows_system_prompt=False), + AsyncChat("o1-preview", can_stream=False, allows_system_prompt=False), + ) + register( + Chat("o1-mini", can_stream=False, allows_system_prompt=False), + AsyncChat("o1-mini", can_stream=False, allows_system_prompt=False), + ) # The -instruct completion model register( Completion("gpt-3.5-turbo-instruct", default_max_tokens=256), @@ -273,18 +298,7 @@ def _attachment(attachment): } -class Chat(Model): - needs_key = "openai" - key_env_var = "OPENAI_API_KEY" - - default_max_tokens = None - - class Options(SharedOptions): - json_object: Optional[bool] = Field( - description="Output a valid JSON object {...}. Prompt must mention JSON.", - default=None, - ) - +class _Shared: def __init__( self, model_id, @@ -335,10 +349,8 @@ def __init__( def __str__(self): return "OpenAI Chat: {}".format(self.model_id) - def execute(self, prompt, stream, response, conversation=None): + def build_messages(self, prompt, conversation): messages = [] - if prompt.system and not self.allows_system_prompt: - raise NotImplementedError("Model does not support system prompts") current_system = None if conversation is not None: for prev_response in conversation.responses: @@ -375,7 +387,60 @@ def execute(self, prompt, stream, response, conversation=None): for attachment in prompt.attachments: attachment_message.append(_attachment(attachment)) messages.append({"role": "user", "content": attachment_message}) + return messages + + def get_client(self, async_=False): + kwargs = {} + if self.api_base: + kwargs["base_url"] = self.api_base + if self.api_type: + kwargs["api_type"] = self.api_type + if self.api_version: + kwargs["api_version"] = self.api_version + if self.api_engine: + kwargs["engine"] = self.api_engine + if self.needs_key: + kwargs["api_key"] = self.get_key() + else: + # OpenAI-compatible models don't need a key, but the + # openai client library requires one + kwargs["api_key"] = "DUMMY_KEY" + if self.headers: + kwargs["default_headers"] = self.headers + if os.environ.get("LLM_OPENAI_SHOW_RESPONSES"): + kwargs["http_client"] = logging_client() + if async_: + return openai.AsyncOpenAI(**kwargs) + else: + return openai.OpenAI(**kwargs) + + def build_kwargs(self, prompt, stream): + kwargs = dict(not_nulls(prompt.options)) + json_object = kwargs.pop("json_object", None) + if "max_tokens" not in kwargs and self.default_max_tokens is not None: + kwargs["max_tokens"] = self.default_max_tokens + if json_object: + kwargs["response_format"] = {"type": "json_object"} + if stream: + kwargs["stream_options"] = {"include_usage": True} + return kwargs + + +class Chat(_Shared, Model): + needs_key = "openai" + key_env_var = "OPENAI_API_KEY" + default_max_tokens = None + + class Options(SharedOptions): + json_object: Optional[bool] = Field( + description="Output a valid JSON object {...}. 
Prompt must mention JSON.", + default=None, + ) + def execute(self, prompt, stream, response, conversation=None): + if prompt.system and not self.allows_system_prompt: + raise NotImplementedError("Model does not support system prompts") + messages = self.build_messages(prompt, conversation) kwargs = self.build_kwargs(prompt, stream) client = self.get_client() if stream: @@ -406,38 +471,53 @@ def execute(self, prompt, stream, response, conversation=None): yield completion.choices[0].message.content response._prompt_json = redact_data({"messages": messages}) - def get_client(self): - kwargs = {} - if self.api_base: - kwargs["base_url"] = self.api_base - if self.api_type: - kwargs["api_type"] = self.api_type - if self.api_version: - kwargs["api_version"] = self.api_version - if self.api_engine: - kwargs["engine"] = self.api_engine - if self.needs_key: - kwargs["api_key"] = self.get_key() - else: - # OpenAI-compatible models don't need a key, but the - # openai client library requires one - kwargs["api_key"] = "DUMMY_KEY" - if self.headers: - kwargs["default_headers"] = self.headers - if os.environ.get("LLM_OPENAI_SHOW_RESPONSES"): - kwargs["http_client"] = logging_client() - return openai.OpenAI(**kwargs) - def build_kwargs(self, prompt, stream): - kwargs = dict(not_nulls(prompt.options)) - json_object = kwargs.pop("json_object", None) - if "max_tokens" not in kwargs and self.default_max_tokens is not None: - kwargs["max_tokens"] = self.default_max_tokens - if json_object: - kwargs["response_format"] = {"type": "json_object"} +class AsyncChat(_Shared, AsyncModel): + needs_key = "openai" + key_env_var = "OPENAI_API_KEY" + default_max_tokens = None + + class Options(SharedOptions): + json_object: Optional[bool] = Field( + description="Output a valid JSON object {...}. 
Prompt must mention JSON.", + default=None, + ) + + async def execute( + self, prompt, stream, response, conversation=None + ) -> AsyncGenerator[str, None]: + if prompt.system and not self.allows_system_prompt: + raise NotImplementedError("Model does not support system prompts") + messages = self.build_messages(prompt, conversation) + kwargs = self.build_kwargs(prompt, stream) + client = self.get_client(async_=True) if stream: - kwargs["stream_options"] = {"include_usage": True} - return kwargs + completion = await client.chat.completions.create( + model=self.model_name or self.model_id, + messages=messages, + stream=True, + **kwargs, + ) + chunks = [] + async for chunk in completion: + chunks.append(chunk) + try: + content = chunk.choices[0].delta.content + except IndexError: + content = None + if content is not None: + yield content + response.response_json = remove_dict_none_values(combine_chunks(chunks)) + else: + completion = await client.chat.completions.create( + model=self.model_name or self.model_id, + messages=messages, + stream=False, + **kwargs, + ) + response.response_json = remove_dict_none_values(completion.model_dump()) + yield completion.choices[0].message.content + response._prompt_json = redact_data({"messages": messages}) class Completion(Chat): diff --git a/llm/models.py b/llm/models.py index 485d9720..cb9c7ab3 100644 --- a/llm/models.py +++ b/llm/models.py @@ -7,7 +7,17 @@ from itertools import islice import re import time -from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Union +from typing import ( + Any, + AsyncGenerator, + Dict, + Iterable, + Iterator, + List, + Optional, + Set, + Union, +) from .utils import mimetype_from_path, mimetype_from_string from abc import ABC, abstractmethod import json @@ -94,7 +104,7 @@ def __init__( attachments=None, system=None, prompt_json=None, - options=None + options=None, ): self.prompt = prompt self.model = model @@ -105,12 +115,25 @@ def __init__( @dataclass -class Conversation: - model: "Model" +class _BaseConversation: + model: "_BaseModel" id: str = field(default_factory=lambda: str(ULID()).lower()) name: Optional[str] = None - responses: List["Response"] = field(default_factory=list) + responses: List["_BaseResponse"] = field(default_factory=list) + + @classmethod + def from_row(cls, row): + from llm import get_model + return cls( + model=get_model(row["model"]), + id=row["id"], + name=row["name"], + ) + + +@dataclass +class Conversation(_BaseConversation): def prompt( self, prompt: Optional[str], @@ -118,8 +141,8 @@ def prompt( attachments: Optional[List[Attachment]] = None, system: Optional[str] = None, stream: bool = True, - **options - ): + **options, + ) -> "Response": return Response( Prompt( prompt, @@ -133,24 +156,45 @@ def prompt( conversation=self, ) - @classmethod - def from_row(cls, row): - from llm import get_model - return cls( - model=get_model(row["model"]), - id=row["id"], - name=row["name"], +@dataclass +class AsyncConversation(_BaseConversation): + def prompt( + self, + prompt: Optional[str], + *, + attachments: Optional[List[Attachment]] = None, + system: Optional[str] = None, + stream: bool = True, + **options, + ) -> "AsyncResponse": + return AsyncResponse( + Prompt( + prompt, + model=self.model, + attachments=attachments, + system=system, + options=self.model.Options(**options), + ), + self.model, + stream, + conversation=self, ) -class Response(ABC): +class _BaseResponse: + """Base response class shared between sync and async responses""" + + prompt: "Prompt" + stream: bool + 
conversation: Optional["_BaseConversation"] = None + def __init__( self, prompt: Prompt, - model: "Model", + model: "_BaseModel", stream: bool, - conversation: Optional[Conversation] = None, + conversation: Optional[_BaseConversation] = None, ): self.prompt = prompt self._prompt_json = None @@ -161,47 +205,46 @@ def __init__( self.response_json = None self.conversation = conversation self.attachments: List[Attachment] = [] + self._start: Optional[float] = None + self._end: Optional[float] = None + self._start_utcnow: Optional[datetime.datetime] = None - def __iter__(self) -> Iterator[str]: - self._start = time.monotonic() - self._start_utcnow = datetime.datetime.utcnow() - if self._done: - yield from self._chunks - for chunk in self.model.execute( - self.prompt, - stream=self.stream, - response=self, - conversation=self.conversation, - ): - yield chunk - self._chunks.append(chunk) - if self.conversation: - self.conversation.responses.append(self) - self._end = time.monotonic() - self._done = True - - def _force(self): - if not self._done: - list(self) - - def __str__(self) -> str: - return self.text() - - def text(self) -> str: - self._force() - return "".join(self._chunks) - - def json(self) -> Optional[Dict[str, Any]]: - self._force() - return self.response_json + @classmethod + def from_row(cls, db, row): + from llm import get_model - def duration_ms(self) -> int: - self._force() - return int((self._end - self._start) * 1000) + model = get_model(row["model"]) - def datetime_utc(self) -> str: - self._force() - return self._start_utcnow.isoformat() + response = cls( + model=model, + prompt=Prompt( + prompt=row["prompt"], + model=model, + attachments=[], + system=row["system"], + options=model.Options(**json.loads(row["options_json"])), + ), + stream=False, + ) + response.id = row["id"] + response._prompt_json = json.loads(row["prompt_json"] or "null") + response.response_json = json.loads(row["response_json"] or "null") + response._done = True + response._chunks = [row["response"]] + # Attachments + response.attachments = [ + Attachment.from_row(arow) + for arow in db.query( + """ + select attachments.* from attachments + join prompt_attachments on attachments.id = prompt_attachments.attachment_id + where prompt_attachments.response_id = ? 
+ order by prompt_attachments."order" + """, + [row["id"]], + ) + ] + return response def log_to_db(self, db): conversation = self.conversation @@ -257,14 +300,126 @@ def log_to_db(self, db): }, ) + +class Response(_BaseResponse): + model: "Model" + conversation: Optional["Conversation"] = None + + def __str__(self) -> str: + return self.text() + + def _force(self): + if not self._done: + list(self) + + def text(self) -> str: + self._force() + return "".join(self._chunks) + + def json(self) -> Optional[Dict[str, Any]]: + self._force() + return self.response_json + + def duration_ms(self) -> int: + self._force() + return int(((self._end or 0) - (self._start or 0)) * 1000) + + def datetime_utc(self) -> str: + self._force() + return self._start_utcnow.isoformat() if self._start_utcnow else "" + + def __iter__(self) -> Iterator[str]: + self._start = time.monotonic() + self._start_utcnow = datetime.datetime.utcnow() + if self._done: + yield from self._chunks + return + + for chunk in self.model.execute( + self.prompt, + stream=self.stream, + response=self, + conversation=self.conversation, + ): + yield chunk + self._chunks.append(chunk) + + if self.conversation: + self.conversation.responses.append(self) + self._end = time.monotonic() + self._done = True + + +class AsyncResponse(_BaseResponse): + model: "AsyncModel" + conversation: Optional["AsyncConversation"] = None + + def __aiter__(self): + self._start = time.monotonic() + self._start_utcnow = datetime.datetime.utcnow() + return self + + async def __anext__(self) -> str: + if self._done: + if not self._chunks: + raise StopAsyncIteration + chunk = self._chunks.pop(0) + if not self._chunks: + raise StopAsyncIteration + return chunk + + if not hasattr(self, "_generator"): + self._generator = self.model.execute( + self.prompt, + stream=self.stream, + response=self, + conversation=self.conversation, + ) + + try: + chunk = await self._generator.__anext__() + self._chunks.append(chunk) + return chunk + except StopAsyncIteration: + if self.conversation: + self.conversation.responses.append(self) + self._end = time.monotonic() + self._done = True + raise + + async def _force(self): + if not self._done: + async for _ in self: + pass + return self + + async def text(self) -> str: + await self._force() + return "".join(self._chunks) + + async def json(self) -> Optional[Dict[str, Any]]: + await self._force() + return self.response_json + + async def duration_ms(self) -> int: + await self._force() + return int(((self._end or 0) - (self._start or 0)) * 1000) + + async def datetime_utc(self) -> str: + await self._force() + return self._start_utcnow.isoformat() if self._start_utcnow else "" + + def __await__(self): + return self._force().__await__() + @classmethod def fake( cls, - model: "Model", + model: "AsyncModel", prompt: str, *attachments: List[Attachment], system: str, - response: str + response: str, ): "Utility method to help with writing tests" response_obj = cls( @@ -281,47 +436,11 @@ def fake( response_obj._chunks = [response] return response_obj - @classmethod - def from_row(cls, db, row): - from llm import get_model - - model = get_model(row["model"]) - - response = cls( - model=model, - prompt=Prompt( - prompt=row["prompt"], - model=model, - attachments=[], - system=row["system"], - options=model.Options(**json.loads(row["options_json"])), - ), - stream=False, - ) - response.id = row["id"] - response._prompt_json = json.loads(row["prompt_json"] or "null") - response.response_json = json.loads(row["response_json"] or "null") - response._done = 
True - response._chunks = [row["response"]] - # Attachments - response.attachments = [ - Attachment.from_row(arow) - for arow in db.query( - """ - select attachments.* from attachments - join prompt_attachments on attachments.id = prompt_attachments.attachment_id - where prompt_attachments.response_id = ? - order by prompt_attachments."order" - """, - [row["id"]], - ) - ] - return response - def __repr__(self): - return "".format( - self.prompt.prompt, self.text() - ) + text = "... not yet awaited ..." + if self._done: + text = "".join(self._chunks) + return "".format(self.prompt.prompt, text) class Options(BaseModel): @@ -362,22 +481,39 @@ def get_key(self): raise NeedsKeyException(message) -class Model(ABC, _get_key_mixin): +class _BaseModel(ABC, _get_key_mixin): model_id: str - - # API key handling key: Optional[str] = None needs_key: Optional[str] = None key_env_var: Optional[str] = None - - # Model characteristics can_stream: bool = False attachment_types: Set = set() class Options(_Options): pass - def conversation(self): + def _validate_attachments( + self, attachments: Optional[List[Attachment]] = None + ) -> None: + if attachments and not self.attachment_types: + raise ValueError("This model does not support attachments") + for attachment in attachments or []: + attachment_type = attachment.resolve_type() + if attachment_type not in self.attachment_types: + raise ValueError( + f"This model does not support attachments of type '{attachment_type}', " + f"only {', '.join(self.attachment_types)}" + ) + + def __str__(self) -> str: + return "{}: {}".format(self.__class__.__name__, self.model_id) + + def __repr__(self): + return "<{} '{}'>".format(self.__class__.__name__, self.model_id) + + +class Model(_BaseModel): + def conversation(self) -> Conversation: return Conversation(model=self) @abstractmethod @@ -388,10 +524,6 @@ def execute( response: Response, conversation: Optional[Conversation], ) -> Iterator[str]: - """ - Execute a prompt and yield chunks of text, or yield a single big chunk. - Any additional useful information about the execution should be assigned to the response. 
- """ pass def prompt( @@ -401,22 +533,10 @@ def prompt( attachments: Optional[List[Attachment]] = None, system: Optional[str] = None, stream: bool = True, - **options - ): - # Validate attachments - if attachments and not self.attachment_types: - raise ValueError( - "This model does not support attachments, but some were provided" - ) - for attachment in attachments or []: - attachment_type = attachment.resolve_type() - if attachment_type not in self.attachment_types: - raise ValueError( - "This model does not support attachments of type '{}', only {}".format( - attachment_type, ", ".join(self.attachment_types) - ) - ) - return self.response( + **options, + ) -> Response: + self._validate_attachments(attachments) + return Response( Prompt( prompt, attachments=attachments, @@ -424,17 +544,46 @@ def prompt( model=self, options=self.Options(**options), ), - stream=stream, + self, + stream, ) - def response(self, prompt: Prompt, stream: bool = True) -> Response: - return Response(prompt, self, stream) - def __str__(self) -> str: - return "{}: {}".format(self.__class__.__name__, self.model_id) +class AsyncModel(_BaseModel): + def conversation(self) -> AsyncConversation: + return AsyncConversation(model=self) - def __repr__(self): - return "".format(self.model_id) + @abstractmethod + async def execute( + self, + prompt: Prompt, + stream: bool, + response: AsyncResponse, + conversation: Optional[AsyncConversation], + ) -> AsyncGenerator[str, None]: + yield "" + + def prompt( + self, + prompt: str, + *, + attachments: Optional[List[Attachment]] = None, + system: Optional[str] = None, + stream: bool = True, + **options, + ) -> AsyncResponse: + self._validate_attachments(attachments) + return AsyncResponse( + Prompt( + prompt, + attachments=attachments, + system=system, + model=self, + options=self.Options(**options), + ), + self, + stream, + ) class EmbeddingModel(ABC, _get_key_mixin): @@ -495,6 +644,7 @@ def embed_batch(self, items: Iterable[Union[str, bytes]]) -> Iterator[List[float @dataclass class ModelWithAliases: model: Model + async_model: AsyncModel aliases: Set[str] diff --git a/pytest.ini b/pytest.ini index 8658fc91..ba352d26 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,5 @@ [pytest] filterwarnings = ignore:The `schema` method is deprecated.*:DeprecationWarning - ignore:Support for class-based `config` is deprecated*:DeprecationWarning \ No newline at end of file + ignore:Support for class-based `config` is deprecated*:DeprecationWarning +asyncio_default_fixture_loop_scope = function \ No newline at end of file diff --git a/setup.py b/setup.py index 6f500815..24b5acd2 100644 --- a/setup.py +++ b/setup.py @@ -55,6 +55,7 @@ def get_long_description(): "pytest", "numpy", "pytest-httpx>=0.33.0", + "pytest-asyncio", "cogapp", "mypy>=1.10.0", "black>=24.1.0", diff --git a/tests/conftest.py b/tests/conftest.py index 7d44b757..6fb8bf75 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -75,6 +75,29 @@ def execute(self, prompt, stream, response, conversation): break +class AsyncMockModel(llm.AsyncModel): + model_id = "mock" + + def __init__(self): + self.history = [] + self._queue = [] + + def enqueue(self, messages): + assert isinstance(messages, list) + self._queue.append(messages) + + async def execute(self, prompt, stream, response, conversation): + self.history.append((prompt, stream, response, conversation)) + while True: + try: + messages = self._queue.pop(0) + for message in messages: + yield message + break + except IndexError: + break + + class 
EmbedDemo(llm.EmbeddingModel): model_id = "embed-demo" batch_size = 10 @@ -118,8 +141,13 @@ def mock_model(): return MockModel() +@pytest.fixture +def async_mock_model(): + return AsyncMockModel() + + @pytest.fixture(autouse=True) -def register_embed_demo_model(embed_demo, mock_model): +def register_embed_demo_model(embed_demo, mock_model, async_mock_model): class MockModelsPlugin: __name__ = "MockModelsPlugin" @@ -131,7 +159,7 @@ def register_embedding_models(self, register): @llm.hookimpl def register_models(self, register): - register(mock_model) + register(mock_model, async_model=async_mock_model) pm.register(MockModelsPlugin(), name="undo-mock-models-plugin") try: diff --git a/tests/test_async.py b/tests/test_async.py new file mode 100644 index 00000000..a84dd97d --- /dev/null +++ b/tests/test_async.py @@ -0,0 +1,17 @@ +import llm +import pytest + + +@pytest.mark.asyncio +async def test_async_model(async_mock_model): + gathered = [] + async_mock_model.enqueue(["hello world"]) + async for chunk in async_mock_model.prompt("hello"): + gathered.append(chunk) + assert gathered == ["hello world"] + # Not as an iterator + async_mock_model.enqueue(["hello world"]) + response = await async_mock_model.prompt("hello") + text = await response.text() + assert text == "hello world" + assert isinstance(response, llm.AsyncResponse) diff --git a/tests/test_chat.py b/tests/test_chat.py index 01b2a0c0..285fa476 100644 --- a/tests/test_chat.py +++ b/tests/test_chat.py @@ -80,7 +80,10 @@ def test_chat_basic(mock_model, logs_db): # Now continue that conversation mock_model.enqueue(["continued"]) result2 = runner.invoke( - llm.cli.cli, ["chat", "-m", "mock", "-c"], input="Continue\nquit\n" + llm.cli.cli, + ["chat", "-m", "mock", "-c"], + input="Continue\nquit\n", + catch_exceptions=False, ) assert result2.exit_code == 0 assert result2.output == ( @@ -176,7 +179,7 @@ def test_chat_options(mock_model, logs_db): "response": "Some text", "response_json": None, "conversation_id": ANY, - "duration_ms": 0, + "duration_ms": ANY, "datetime_utc": ANY, } ] diff --git a/tests/test_llm.py b/tests/test_llm.py index a0058713..0e54cc91 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -555,6 +555,14 @@ def test_llm_models_options(user_path): result = runner.invoke(cli, ["models", "--options"], catch_exceptions=False) assert result.exit_code == 0 assert EXPECTED_OPTIONS.strip() in result.output + assert "AsyncMockModel: mock" not in result.output + + +def test_llm_models_async(user_path): + runner = CliRunner() + result = runner.invoke(cli, ["models", "--async"], catch_exceptions=False) + assert result.exit_code == 0 + assert "AsyncMockModel: mock" in result.output def test_llm_user_dir(tmpdir, monkeypatch): From 041730d8b2bc12f62cfe41c44b62a03ef4790117 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Wed, 13 Nov 2024 17:55:28 -0800 Subject: [PATCH 088/149] Release 0.18a0 Refs #507, #599, #600, #603, #608, #611, #612, #613, #614, #615, #616, #621, #622, #623, #626, #629 --- docs/changelog.md | 7 +++++++ setup.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index 31507afb..54a45a87 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,12 @@ # Changelog +(v0_18a0)= +## 0.18a0 (2024-11-13) + +Alpha support for **async models**. [#507](https://github.com/simonw/llm/issues/507) + +Multiple [smaller changes](https://github.com/simonw/llm/compare/0.17.1...0.18a0). 
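The 0.18a0 entry above introduces the async API that the rest of this series builds on. As a rough usage sketch (assuming an OpenAI API key is configured, and using `gpt-4o-mini` purely as an example model ID), the new `AsyncModel` path added here can be driven like this:

```python
import asyncio
import llm


async def main():
    # get_async_model() resolves an AsyncModel, such as the AsyncChat
    # wrappers registered for the OpenAI models in this patch series
    model = llm.get_async_model("gpt-4o-mini")

    # Stream chunks as they are produced ...
    async for chunk in model.prompt("Describe a good dog"):
        print(chunk, end="", flush=True)
    print()

    # ... or await the AsyncResponse and read the full text afterwards
    response = await model.prompt("Describe a good dog")
    print(await response.text())


asyncio.run(main())
```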
+ (v0_17)= ## 0.17 (2024-10-29) diff --git a/setup.py b/setup.py index 24b5acd2..d06c9b2a 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages import os -VERSION = "0.17.1" +VERSION = "0.18a0" def get_long_description(): From 157b29ddebe6d17cb72dd64a612adc047a811646 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 14 Nov 2024 14:28:17 -0800 Subject: [PATCH 089/149] Test for basic async conversation, refs #632 --- llm/models.py | 8 ++++++++ tests/test_async.py | 13 +++++++++++++ 2 files changed, 21 insertions(+) diff --git a/llm/models.py b/llm/models.py index cb9c7ab3..9ce5f293 100644 --- a/llm/models.py +++ b/llm/models.py @@ -660,3 +660,11 @@ def _conversation_name(text): if len(text) <= CONVERSATION_NAME_LENGTH: return text return text[: CONVERSATION_NAME_LENGTH - 1] + "…" + + +@dataclass +class Usage: + model_id: str + input_tokens: int + output_tokens: int + details: Dict[str, int] diff --git a/tests/test_async.py b/tests/test_async.py index a84dd97d..9623f054 100644 --- a/tests/test_async.py +++ b/tests/test_async.py @@ -15,3 +15,16 @@ async def test_async_model(async_mock_model): text = await response.text() assert text == "hello world" assert isinstance(response, llm.AsyncResponse) + + +@pytest.mark.asyncio +async def test_async_model_conversation(async_mock_model): + async_mock_model.enqueue(["joke 1"]) + conversation = async_mock_model.conversation() + response = await conversation.prompt("joke") + text = await response.text() + assert text == "joke 1" + async_mock_model.enqueue(["joke 2"]) + response2 = await conversation.prompt("again") + text2 = await response2.text() + assert text2 == "joke 2" From f90f29dec9ff10c765c6d997adb11372d5b8dfaa Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 14 Nov 2024 14:29:05 -0800 Subject: [PATCH 090/149] Removed accidental commit of Usage class --- llm/models.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/llm/models.py b/llm/models.py index 9ce5f293..cb9c7ab3 100644 --- a/llm/models.py +++ b/llm/models.py @@ -660,11 +660,3 @@ def _conversation_name(text): if len(text) <= CONVERSATION_NAME_LENGTH: return text return text[: CONVERSATION_NAME_LENGTH - 1] + "…" - - -@dataclass -class Usage: - model_id: str - input_tokens: int - output_tokens: int - details: Dict[str, int] From 3b6e73445cf99112ba5017b2f7e65dd33673f221 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 14 Nov 2024 14:42:40 -0800 Subject: [PATCH 091/149] Better __repr__ for Response and AsyncResponse --- llm/models.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/llm/models.py b/llm/models.py index cb9c7ab3..c10a3336 100644 --- a/llm/models.py +++ b/llm/models.py @@ -349,6 +349,12 @@ def __iter__(self) -> Iterator[str]: self._end = time.monotonic() self._done = True + def __repr__(self): + text = "... not yet done ..." + if self._done: + text = "".join(self._chunks) + return "".format(self.prompt.prompt, text) + class AsyncResponse(_BaseResponse): model: "AsyncModel" @@ -440,7 +446,7 @@ def __repr__(self): text = "... not yet awaited ..." 
if self._done: text = "".join(self._chunks) - return "".format(self.prompt.prompt, text) + return "".format(self.prompt.prompt, text) class Options(BaseModel): From cf172cc70a92d0c3d17451da7590770e64cb174a Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 14 Nov 2024 15:08:41 -0800 Subject: [PATCH 092/149] response.text_or_raise() workaround Closes https://github.com/simonw/llm/issues/632 --- docs/plugins/advanced-model-plugins.md | 5 ++++- llm/default_plugins/openai_models.py | 4 +++- llm/models.py | 5 +++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/docs/plugins/advanced-model-plugins.md b/docs/plugins/advanced-model-plugins.md index 1793c751..f0efcfd1 100644 --- a/docs/plugins/advanced-model-plugins.md +++ b/docs/plugins/advanced-model-plugins.md @@ -162,5 +162,8 @@ for prev_response in conversation.responses: messages.append( {"role": "user", "content": prev_response.prompt.prompt} ) - messages.append({"role": "assistant", "content": prev_response.text()}) + messages.append({"role": "assistant", "content": prev_response.text_or_raise()}) ``` +The `response.text_or_raise()` method used there will return the text from the response or raise a `ValueError` exception if the response is an `AsyncResponse` instance that has not yet been fully resolved. + +This is a slightly weird hack to work around the common need to share logic for building up the `messages` list across both sync and async models. diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index 82f737c5..6234d5b1 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -375,7 +375,9 @@ def build_messages(self, prompt, conversation): messages.append( {"role": "user", "content": prev_response.prompt.prompt} ) - messages.append({"role": "assistant", "content": prev_response.text()}) + messages.append( + {"role": "assistant", "content": prev_response.text_or_raise()} + ) if prompt.system and prompt.system != current_system: messages.append({"role": "system", "content": prompt.system}) if not prompt.attachments: diff --git a/llm/models.py b/llm/models.py index c10a3336..f5c8fd3b 100644 --- a/llm/models.py +++ b/llm/models.py @@ -399,6 +399,11 @@ async def _force(self): pass return self + def text_or_raise(self) -> str: + if not self._done: + raise ValueError("Response not yet awaited") + return "".join(self._chunks) + async def text(self) -> str: await self._force() return "".join(self._chunks) From 73823012cabefe678588715b3d7269f6828f7f33 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 14 Nov 2024 15:10:39 -0800 Subject: [PATCH 093/149] Release 0.18a1 Refs #632 --- docs/changelog.md | 6 ++++++ setup.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index 54a45a87..4651d8b9 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,11 @@ # Changelog +(v0_18a1)= +## 0.18a1 (2024-11-14) + +- Fixed bug where conversations did not work for async OpenAI models. [#632](https://github.com/simonw/llm/issues/632) +- `__repr__` methods for `Response` and `AsyncResponse`. 
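The conversation fix noted in 0.18a1 is what makes multi-turn async prompting work end to end. A minimal sketch of that pattern, again assuming a configured async OpenAI model and treating `gpt-4o-mini` as an example model ID:

```python
import asyncio
import llm


async def main():
    model = llm.get_async_model("gpt-4o-mini")
    conversation = model.conversation()

    # Each response is fully awaited before the next prompt is sent, so
    # build_messages() can safely call text_or_raise() on earlier responses
    first = await conversation.prompt("Tell me a joke about pelicans")
    print(await first.text())

    second = await conversation.prompt("Now make it shorter")
    print(await second.text())


asyncio.run(main())
```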
+ (v0_18a0)= ## 0.18a0 (2024-11-13) diff --git a/setup.py b/setup.py index d06c9b2a..15617e74 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages import os -VERSION = "0.18a0" +VERSION = "0.18a1" def get_long_description(): From 0fec9746f4ae863e82bac1312b75b0f5f38767dd Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 17 Nov 2024 12:20:20 -0800 Subject: [PATCH 094/149] text_or_raise() on sync Response too Refs #632 --- llm/models.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llm/models.py b/llm/models.py index f5c8fd3b..70d19377 100644 --- a/llm/models.py +++ b/llm/models.py @@ -316,6 +316,9 @@ def text(self) -> str: self._force() return "".join(self._chunks) + def text_or_raise(self) -> str: + return self.text() + def json(self) -> Optional[Dict[str, Any]]: self._force() return self.response_json From a6d62b7ec97d8fe3658c53cee32e777f1e23d776 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 17 Nov 2024 12:31:48 -0800 Subject: [PATCH 095/149] Release 0.18 Refs #507, #600, #603, #608, #611, #612, #614 --- docs/changelog.md | 13 +++++++++++++ setup.py | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index 4651d8b9..4440bd55 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,18 @@ # Changelog +(v0_18)= +## 0.18 (2024-11-17) + +- Initial support for async models. Plugins can now provide an `AsyncModel` subclass that can be accessed in the Python API using the new `llm.get_async_model(model_id)` method. See {ref}`async models in the Python API docs` and {ref}`implementing async models in plugins `. [#507](https://github.com/simonw/llm/issues/507) +- OpenAI models all now include async models, so function calls such as `llm.get_async_model("gpt-4o-mini")` will return an async model. +- `gpt-4o-audio-preview` model can be used to send audio attachments to the GPT-4o audio model. [#608](https://github.com/simonw/llm/issues/608) +- Attachments can now be sent without requiring a prompt. [#611](https://github.com/simonw/llm/issues/611) +- `llm models --options` now includes information on whether a model supports attachments. [#612](https://github.com/simonw/llm/issues/612) +- `llm models --async` shows available async models. +- Custom OpenAI-compatible models can now be marked as `can_stream: false` in the YAML if they do not support streaming. Thanks, [Chris Mungall](https://github.com/cmungall). [#600](https://github.com/simonw/llm/pull/600) +- Fixed bug where OpenAI usage data was incorrectly serialized to JSON. [#614](https://github.com/simonw/llm/issues/614) +- Standardized on `audio/wav` MIME type for audio attachments rather than `audio/wave`. 
[#603](https://github.com/simonw/llm/issues/603) + (v0_18a1)= ## 0.18a1 (2024-11-14) diff --git a/setup.py b/setup.py index 15617e74..63bfc1e1 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages import os -VERSION = "0.18a1" +VERSION = "0.18" def get_long_description(): From 4a059d722b6d2898aadb08e200ddb385c9da513c Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Tue, 19 Nov 2024 18:11:52 -0800 Subject: [PATCH 096/149] Log --async responses to DB, closes #641 Refs #507 --- llm/cli.py | 14 ++++++--- llm/models.py | 15 ++++++++++ tests/test_cli_openai_models.py | 51 +++++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+), 4 deletions(-) diff --git a/llm/cli.py b/llm/cli.py index 5a9f20b4..c75e0e3e 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -6,6 +6,7 @@ import json from llm import ( Attachment, + AsyncResponse, Collection, Conversation, Response, @@ -376,6 +377,7 @@ def read_prompt(): validated_options["stream"] = False prompt = read_prompt() + response = None prompt_method = model.prompt if conversation: @@ -386,12 +388,13 @@ def read_prompt(): async def inner(): if should_stream: - async for chunk in prompt_method( + response = prompt_method( prompt, attachments=resolved_attachments, system=system, **validated_options, - ): + ) + async for chunk in response: print(chunk, end="") sys.stdout.flush() print("") @@ -403,8 +406,9 @@ async def inner(): **validated_options, ) print(await response.text()) + return response - asyncio.run(inner()) + response = asyncio.run(inner()) else: response = prompt_method( prompt, @@ -423,11 +427,13 @@ async def inner(): raise click.ClickException(str(ex)) # Log to the database - if (logs_on() or log) and not no_log and not async_: + if (logs_on() or log) and not no_log: log_path = logs_db_path() (log_path.parent).mkdir(parents=True, exist_ok=True) db = sqlite_utils.Database(log_path) migrate(db) + if isinstance(response, AsyncResponse): + response = asyncio.run(response.to_sync_response()) response.log_to_db(db) diff --git a/llm/models.py b/llm/models.py index 70d19377..c160798b 100644 --- a/llm/models.py +++ b/llm/models.py @@ -426,6 +426,21 @@ async def datetime_utc(self) -> str: def __await__(self): return self._force().__await__() + async def to_sync_response(self) -> Response: + await self._force() + response = Response( + self.prompt, + self.model, + self.stream, + conversation=self.conversation, + ) + response._chunks = self._chunks + response._done = True + response._end = self._end + response._start = self._start + response._start_utcnow = self._start_utcnow + return response + @classmethod def fake( cls, diff --git a/tests/test_cli_openai_models.py b/tests/test_cli_openai_models.py index 7cbab726..b65ad078 100644 --- a/tests/test_cli_openai_models.py +++ b/tests/test_cli_openai_models.py @@ -1,6 +1,7 @@ from click.testing import CliRunner from llm.cli import cli import pytest +import sqlite_utils @pytest.fixture @@ -143,3 +144,53 @@ def test_only_gpt4_audio_preview_allows_mp3_or_wav(httpx_mock, model, filetype): assert ( f"This model does not support attachments of type '{long}'" in result.output ) + + +@pytest.mark.parametrize("async_", (False, True)) +def test_gpt4o_mini_sync_and_async(monkeypatch, tmpdir, httpx_mock, async_): + user_path = tmpdir / "user_dir" + log_db = user_path / "logs.db" + monkeypatch.setenv("LLM_USER_PATH", str(user_path)) + assert not log_db.exists() + httpx_mock.add_response( + method="POST", + # chat completion request + 
url="https://api.openai.com/v1/chat/completions", + json={ + "id": "chatcmpl-AQT9a30kxEaM1bqxRPepQsPlCyGJh", + "object": "chat.completion", + "created": 1730871958, + "model": "gpt-4o-mini", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "Ho ho ho", + "refusal": None, + }, + "finish_reason": "stop", + } + ], + "usage": { + "prompt_tokens": 10, + "completion_tokens": 2, + "total_tokens": 12, + }, + "system_fingerprint": "fp_49254d0e9b", + }, + headers={"Content-Type": "application/json"}, + ) + runner = CliRunner() + args = ["-m", "gpt-4o-mini", "--key", "x", "--no-stream"] + if async_: + args.append("--async") + result = runner.invoke(cli, args, catch_exceptions=False) + assert result.exit_code == 0 + assert result.output == "Ho ho ho\n" + # Confirm it was correctly logged + assert log_db.exists() + db = sqlite_utils.Database(str(log_db)) + assert db["responses"].count == 1 + row = next(db["responses"].rows) + assert row["response"] == "Ho ho ho" From cfb10f4afd2a37c3e120d691d419ea7223f11800 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Tue, 19 Nov 2024 20:21:59 -0800 Subject: [PATCH 097/149] Log input tokens, output tokens and token details (#642) * Store input_tokens, output_tokens, token_details on Response, closes #610 * llm prompt -u/--usage option * llm logs -u/--usage option * Docs on tracking token usage in plugins * OpenAI default plugin logs usage --- docs/help.md | 2 ++ docs/logging.md | 5 ++- docs/plugins/advanced-model-plugins.md | 16 ++++++++++ llm/cli.py | 31 +++++++++++++++++-- llm/default_plugins/openai_models.py | 27 ++++++++++++++++- llm/migrations.py | 7 +++++ llm/models.py | 31 +++++++++++++++++-- llm/utils.py | 26 ++++++++++++++++ tests/conftest.py | 6 +++- tests/test_chat.py | 15 +++++++++ tests/test_cli_openai_models.py | 13 +++++--- tests/test_llm.py | 13 +++++++- tests/test_migrate.py | 3 ++ tests/test_utils.py | 42 ++++++++++++++++++++++++++ 14 files changed, 224 insertions(+), 13 deletions(-) create mode 100644 tests/test_utils.py diff --git a/docs/help.md b/docs/help.md index 9db540a3..ba5d3f0d 100644 --- a/docs/help.md +++ b/docs/help.md @@ -122,6 +122,7 @@ Options: --key TEXT API key to use --save TEXT Save prompt with this template name --async Run prompt asynchronously + -u, --usage Show token usage --help Show this message and exit. 
``` @@ -292,6 +293,7 @@ Options: -m, --model TEXT Filter by model or model alias -q, --query TEXT Search for logs matching this string -t, --truncate Truncate long strings in output + -u, --usage Include token usage -r, --response Just output the last response -c, --current Show logs from the current conversation --cid, --conversation TEXT Show logs for this conversation ID diff --git a/docs/logging.md b/docs/logging.md index 63722e01..56c0379d 100644 --- a/docs/logging.md +++ b/docs/logging.md @@ -159,7 +159,10 @@ CREATE TABLE [responses] ( [response_json] TEXT, [conversation_id] TEXT REFERENCES [conversations]([id]), [duration_ms] INTEGER, - [datetime_utc] TEXT + [datetime_utc] TEXT, + [input_tokens] INTEGER, + [output_tokens] INTEGER, + [token_details] TEXT ); CREATE VIRTUAL TABLE [responses_fts] USING FTS5 ( [prompt], diff --git a/docs/plugins/advanced-model-plugins.md b/docs/plugins/advanced-model-plugins.md index f0efcfd1..9342d355 100644 --- a/docs/plugins/advanced-model-plugins.md +++ b/docs/plugins/advanced-model-plugins.md @@ -167,3 +167,19 @@ for prev_response in conversation.responses: The `response.text_or_raise()` method used there will return the text from the response or raise a `ValueError` exception if the response is an `AsyncResponse` instance that has not yet been fully resolved. This is a slightly weird hack to work around the common need to share logic for building up the `messages` list across both sync and async models. + +(advanced-model-plugins-usage)= + +## Tracking token usage + +Models that charge by the token should track the number of tokens used by each prompt. The ``response.set_usage()`` method can be used to record the number of tokens used by a response - these will then be made available through the Python API and logged to the SQLite database for command-line users. + +`response` here is the response object that is passed to `.execute()` as an argument. + +Call ``response.set_usage()`` at the end of your `.execute()` method. It accepts keyword arguments `input=`, `output=` and `details=` - all three are optional. `input` and `output` should be integers, and `details` should be a dictionary that provides additional information beyond the input and output token counts. 
+ +This example logs 15 input tokens, 340 output tokens and notes that 37 tokens were cached: + +```python +response.set_usage(input=15, output=340, details={"cached": 37}) +``` diff --git a/llm/cli.py b/llm/cli.py index c75e0e3e..e0c8e47c 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -33,7 +33,7 @@ from .migrations import migrate from .plugins import pm, load_plugins -from .utils import mimetype_from_path, mimetype_from_string +from .utils import mimetype_from_path, mimetype_from_string, token_usage_string import base64 import httpx import pathlib @@ -203,6 +203,7 @@ def cli(): @click.option("--key", help="API key to use") @click.option("--save", help="Save prompt with this template name") @click.option("async_", "--async", is_flag=True, help="Run prompt asynchronously") +@click.option("-u", "--usage", is_flag=True, help="Show token usage") def prompt( prompt, system, @@ -220,6 +221,7 @@ def prompt( key, save, async_, + usage, ): """ Execute a prompt @@ -426,14 +428,24 @@ async def inner(): except Exception as ex: raise click.ClickException(str(ex)) + if isinstance(response, AsyncResponse): + response = asyncio.run(response.to_sync_response()) + + if usage: + # Show token usage to stderr in yellow + click.echo( + click.style( + "Token usage: {}".format(response.token_usage()), fg="yellow", bold=True + ), + err=True, + ) + # Log to the database if (logs_on() or log) and not no_log: log_path = logs_db_path() (log_path.parent).mkdir(parents=True, exist_ok=True) db = sqlite_utils.Database(log_path) migrate(db) - if isinstance(response, AsyncResponse): - response = asyncio.run(response.to_sync_response()) response.log_to_db(db) @@ -754,6 +766,9 @@ def logs_turn_off(): responses.conversation_id, responses.duration_ms, responses.datetime_utc, + responses.input_tokens, + responses.output_tokens, + responses.token_details, conversations.name as conversation_name, conversations.model as conversation_model""" @@ -809,6 +824,7 @@ def logs_turn_off(): @click.option("-m", "--model", help="Filter by model or model alias") @click.option("-q", "--query", help="Search for logs matching this string") @click.option("-t", "--truncate", is_flag=True, help="Truncate long strings in output") +@click.option("-u", "--usage", is_flag=True, help="Include token usage") @click.option("-r", "--response", is_flag=True, help="Just output the last response") @click.option( "current_conversation", @@ -836,6 +852,7 @@ def logs_list( model, query, truncate, + usage, response, current_conversation, conversation_id, @@ -998,6 +1015,14 @@ def logs_list( ) click.echo("\n## Response:\n\n{}\n".format(row["response"])) + if usage: + token_usage = token_usage_string( + row["input_tokens"], + row["output_tokens"], + json.loads(row["token_details"]) if row["token_details"] else None, + ) + if token_usage: + click.echo("## Token usage:\n\n{}\n".format(token_usage)) @cli.group( diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index 6234d5b1..ab33d1b4 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -1,6 +1,11 @@ from llm import AsyncModel, EmbeddingModel, Model, hookimpl import llm -from llm.utils import dicts_to_table_string, remove_dict_none_values, logging_client +from llm.utils import ( + dicts_to_table_string, + remove_dict_none_values, + logging_client, + simplify_usage_dict, +) import click import datetime import httpx @@ -391,6 +396,16 @@ def build_messages(self, prompt, conversation): messages.append({"role": "user", "content": 
attachment_message}) return messages + def set_usage(self, response, usage): + if not usage: + return + input_tokens = usage.pop("prompt_tokens") + output_tokens = usage.pop("completion_tokens") + usage.pop("total_tokens") + response.set_usage( + input=input_tokens, output=output_tokens, details=simplify_usage_dict(usage) + ) + def get_client(self, async_=False): kwargs = {} if self.api_base: @@ -445,6 +460,7 @@ def execute(self, prompt, stream, response, conversation=None): messages = self.build_messages(prompt, conversation) kwargs = self.build_kwargs(prompt, stream) client = self.get_client() + usage = None if stream: completion = client.chat.completions.create( model=self.model_name or self.model_id, @@ -455,6 +471,8 @@ def execute(self, prompt, stream, response, conversation=None): chunks = [] for chunk in completion: chunks.append(chunk) + if chunk.usage: + usage = chunk.usage.model_dump() try: content = chunk.choices[0].delta.content except IndexError: @@ -469,8 +487,10 @@ def execute(self, prompt, stream, response, conversation=None): stream=False, **kwargs, ) + usage = completion.usage.model_dump() response.response_json = remove_dict_none_values(completion.model_dump()) yield completion.choices[0].message.content + self.set_usage(response, usage) response._prompt_json = redact_data({"messages": messages}) @@ -493,6 +513,7 @@ async def execute( messages = self.build_messages(prompt, conversation) kwargs = self.build_kwargs(prompt, stream) client = self.get_client(async_=True) + usage = None if stream: completion = await client.chat.completions.create( model=self.model_name or self.model_id, @@ -502,6 +523,8 @@ async def execute( ) chunks = [] async for chunk in completion: + if chunk.usage: + usage = chunk.usage.model_dump() chunks.append(chunk) try: content = chunk.choices[0].delta.content @@ -518,7 +541,9 @@ async def execute( **kwargs, ) response.response_json = remove_dict_none_values(completion.model_dump()) + usage = completion.usage.model_dump() yield completion.choices[0].message.content + self.set_usage(response, usage) response._prompt_json = redact_data({"messages": messages}) diff --git a/llm/migrations.py b/llm/migrations.py index 91da6429..b8ac8b13 100644 --- a/llm/migrations.py +++ b/llm/migrations.py @@ -227,3 +227,10 @@ def m012_attachments_tables(db): ), pk=("response_id", "attachment_id"), ) + + +@migration +def m013_usage(db): + db["responses"].add_column("input_tokens", int) + db["responses"].add_column("output_tokens", int) + db["responses"].add_column("token_details", str) diff --git a/llm/models.py b/llm/models.py index c160798b..5bf9f11c 100644 --- a/llm/models.py +++ b/llm/models.py @@ -18,7 +18,7 @@ Set, Union, ) -from .utils import mimetype_from_path, mimetype_from_string +from .utils import mimetype_from_path, mimetype_from_string, token_usage_string from abc import ABC, abstractmethod import json from pydantic import BaseModel @@ -208,6 +208,20 @@ def __init__( self._start: Optional[float] = None self._end: Optional[float] = None self._start_utcnow: Optional[datetime.datetime] = None + self.input_tokens: Optional[int] = None + self.output_tokens: Optional[int] = None + self.token_details: Optional[dict] = None + + def set_usage( + self, + *, + input: Optional[int] = None, + output: Optional[int] = None, + details: Optional[dict] = None, + ): + self.input_tokens = input + self.output_tokens = output + self.token_details = details @classmethod def from_row(cls, db, row): @@ -246,6 +260,11 @@ def from_row(cls, db, row): ] return response + def 
token_usage(self) -> str: + return token_usage_string( + self.input_tokens, self.output_tokens, self.token_details + ) + def log_to_db(self, db): conversation = self.conversation if not conversation: @@ -272,11 +291,16 @@ def log_to_db(self, db): for key, value in dict(self.prompt.options).items() if value is not None }, - "response": self.text(), + "response": self.text_or_raise(), "response_json": self.json(), "conversation_id": conversation.id, "duration_ms": self.duration_ms(), "datetime_utc": self.datetime_utc(), + "input_tokens": self.input_tokens, + "output_tokens": self.output_tokens, + "token_details": ( + json.dumps(self.token_details) if self.token_details else None + ), } db["responses"].insert(response) # Persist any attachments - loop through with index @@ -439,6 +463,9 @@ async def to_sync_response(self) -> Response: response._end = self._end response._start = self._start response._start_utcnow = self._start_utcnow + response.input_tokens = self.input_tokens + response.output_tokens = self.output_tokens + response.token_details = self.token_details return response @classmethod diff --git a/llm/utils.py b/llm/utils.py index d2618dd4..e9853185 100644 --- a/llm/utils.py +++ b/llm/utils.py @@ -127,3 +127,29 @@ def logging_client() -> httpx.Client: transport=_LogTransport(httpx.HTTPTransport()), event_hooks={"request": [_no_accept_encoding], "response": [_log_response]}, ) + + +def simplify_usage_dict(d): + # Recursively remove keys with value 0 and empty dictionaries + def remove_empty_and_zero(obj): + if isinstance(obj, dict): + cleaned = { + k: remove_empty_and_zero(v) + for k, v in obj.items() + if v != 0 and v != {} + } + return {k: v for k, v in cleaned.items() if v is not None and v != {}} + return obj + + return remove_empty_and_zero(d) or {} + + +def token_usage_string(input_tokens, output_tokens, token_details) -> str: + bits = [] + if input_tokens is not None: + bits.append(f"{format(input_tokens, ',')} input") + if output_tokens is not None: + bits.append(f"{format(output_tokens, ',')} output") + if token_details: + bits.append(json.dumps(token_details)) + return ", ".join(bits) diff --git a/tests/conftest.py b/tests/conftest.py index 6fb8bf75..447e1caa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -66,13 +66,17 @@ def enqueue(self, messages): def execute(self, prompt, stream, response, conversation): self.history.append((prompt, stream, response, conversation)) + gathered = [] while True: try: messages = self._queue.pop(0) - yield from messages + for message in messages: + gathered.append(message) + yield message break except IndexError: break + response.set_usage(input=len(prompt.prompt.split()), output=len(gathered)) class AsyncMockModel(llm.AsyncModel): diff --git a/tests/test_chat.py b/tests/test_chat.py index 285fa476..1a31f290 100644 --- a/tests/test_chat.py +++ b/tests/test_chat.py @@ -62,6 +62,9 @@ def test_chat_basic(mock_model, logs_db): "conversation_id": conversation_id, "duration_ms": ANY, "datetime_utc": ANY, + "input_tokens": 1, + "output_tokens": 1, + "token_details": None, }, { "id": ANY, @@ -75,6 +78,9 @@ def test_chat_basic(mock_model, logs_db): "conversation_id": conversation_id, "duration_ms": ANY, "datetime_utc": ANY, + "input_tokens": 2, + "output_tokens": 1, + "token_details": None, }, ] # Now continue that conversation @@ -116,6 +122,9 @@ def test_chat_basic(mock_model, logs_db): "conversation_id": conversation_id, "duration_ms": ANY, "datetime_utc": ANY, + "input_tokens": 1, + "output_tokens": 1, + "token_details": None, } ] @@ 
-153,6 +162,9 @@ def test_chat_system(mock_model, logs_db): "conversation_id": ANY, "duration_ms": ANY, "datetime_utc": ANY, + "input_tokens": 1, + "output_tokens": 1, + "token_details": None, } ] @@ -181,6 +193,9 @@ def test_chat_options(mock_model, logs_db): "conversation_id": ANY, "duration_ms": ANY, "datetime_utc": ANY, + "input_tokens": 1, + "output_tokens": 1, + "token_details": None, } ] diff --git a/tests/test_cli_openai_models.py b/tests/test_cli_openai_models.py index b65ad078..3d0a7c16 100644 --- a/tests/test_cli_openai_models.py +++ b/tests/test_cli_openai_models.py @@ -147,7 +147,8 @@ def test_only_gpt4_audio_preview_allows_mp3_or_wav(httpx_mock, model, filetype): @pytest.mark.parametrize("async_", (False, True)) -def test_gpt4o_mini_sync_and_async(monkeypatch, tmpdir, httpx_mock, async_): +@pytest.mark.parametrize("usage", (None, "-u", "--usage")) +def test_gpt4o_mini_sync_and_async(monkeypatch, tmpdir, httpx_mock, async_, usage): user_path = tmpdir / "user_dir" log_db = user_path / "logs.db" monkeypatch.setenv("LLM_USER_PATH", str(user_path)) @@ -173,21 +174,25 @@ def test_gpt4o_mini_sync_and_async(monkeypatch, tmpdir, httpx_mock, async_): } ], "usage": { - "prompt_tokens": 10, - "completion_tokens": 2, + "prompt_tokens": 1000, + "completion_tokens": 2000, "total_tokens": 12, }, "system_fingerprint": "fp_49254d0e9b", }, headers={"Content-Type": "application/json"}, ) - runner = CliRunner() + runner = CliRunner(mix_stderr=False) args = ["-m", "gpt-4o-mini", "--key", "x", "--no-stream"] + if usage: + args.append(usage) if async_: args.append("--async") result = runner.invoke(cli, args, catch_exceptions=False) assert result.exit_code == 0 assert result.output == "Ho ho ho\n" + if usage: + assert result.stderr == "Token usage: 1,000 input, 2,000 output\n" # Confirm it was correctly logged assert log_db.exists() db = sqlite_utils.Database(str(log_db)) diff --git a/tests/test_llm.py b/tests/test_llm.py index 0e54cc91..b83ff842 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -37,6 +37,8 @@ def log_path(user_path): "model": "davinci", "datetime_utc": (start + datetime.timedelta(seconds=i)).isoformat(), "conversation_id": "abc123", + "input_tokens": 2, + "output_tokens": 5, } for i in range(100) ) @@ -46,9 +48,12 @@ def log_path(user_path): datetime_re = re.compile(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}") -def test_logs_text(log_path): +@pytest.mark.parametrize("usage", (False, True)) +def test_logs_text(log_path, usage): runner = CliRunner() args = ["logs", "-p", str(log_path)] + if usage: + args.append("-u") result = runner.invoke(cli, args, catch_exceptions=False) assert result.exit_code == 0 output = result.output @@ -64,18 +69,24 @@ def test_logs_text(log_path): "system\n\n" "## Response:\n\n" "response\n\n" + ) + ("## Token usage:\n\n2 input, 5 output\n\n" if usage else "") + ( "# YYYY-MM-DDTHH:MM:SS conversation: abc123\n\n" "Model: **davinci**\n\n" "## Prompt:\n\n" "prompt\n\n" "## Response:\n\n" "response\n\n" + ) + ( + "## Token usage:\n\n2 input, 5 output\n\n" if usage else "" + ) + ( "# YYYY-MM-DDTHH:MM:SS conversation: abc123\n\n" "Model: **davinci**\n\n" "## Prompt:\n\n" "prompt\n\n" "## Response:\n\n" "response\n\n" + ) + ( + "## Token usage:\n\n2 input, 5 output\n\n" if usage else "" ) diff --git a/tests/test_migrate.py b/tests/test_migrate.py index 1c68de93..d1da5571 100644 --- a/tests/test_migrate.py +++ b/tests/test_migrate.py @@ -17,6 +17,9 @@ "conversation_id": str, "duration_ms": int, "datetime_utc": str, + "input_tokens": int, + "output_tokens": int, + 
"token_details": str, } diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 00000000..85ed54ae --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,42 @@ +import pytest +from llm.utils import simplify_usage_dict + + +@pytest.mark.parametrize( + "input_data,expected_output", + [ + ( + { + "prompt_tokens_details": {"cached_tokens": 0, "audio_tokens": 0}, + "completion_tokens_details": { + "reasoning_tokens": 0, + "audio_tokens": 1, + "accepted_prediction_tokens": 0, + "rejected_prediction_tokens": 0, + }, + }, + {"completion_tokens_details": {"audio_tokens": 1}}, + ), + ( + { + "details": {"tokens": 5, "audio_tokens": 2}, + "more_details": {"accepted_tokens": 3}, + }, + { + "details": {"tokens": 5, "audio_tokens": 2}, + "more_details": {"accepted_tokens": 3}, + }, + ), + ({"details": {"tokens": 0, "audio_tokens": 0}, "more_details": {}}, {}), + ({"level1": {"level2": {"value": 0, "another_value": {}}}}, {}), + ( + { + "level1": {"level2": {"value": 0, "another_value": 1}}, + "level3": {"empty_dict": {}, "valid_token": 10}, + }, + {"level1": {"level2": {"another_value": 1}}, "level3": {"valid_token": 10}}, + ), + ], +) +def test_simplify_usage_dict(input_data, expected_output): + assert simplify_usage_dict(input_data) == expected_output From 02852fe1a53b5039f1c6e4b2c02c490978d87d08 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Tue, 19 Nov 2024 20:23:54 -0800 Subject: [PATCH 098/149] Release 0.19a0 Refs #610, #641 --- docs/changelog.md | 8 ++++++++ setup.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index 4440bd55..298a7b32 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,13 @@ # Changelog +(v0_19a0)= +## 0.19a0 (2024-11-19) + +- Tokens used by a response are now logged to new `input_tokens` and `output_tokens` integer columns and a `token_details` JSON string column, for the default OpenAI models and models from other plugins that {ref}`implement this feature `. [#610](https://github.com/simonw/llm/issues/610) +- `llm prompt` now takes a `-u/--usage` flag to display token usage at the end of the response. +- `llm logs -u/--usage` shows token usage information for logged responses. +- `llm prompt ... --async` responses are now logged to the database. 
[#641](https://github.com/simonw/llm/issues/641) + (v0_18)= ## 0.18 (2024-11-17) diff --git a/setup.py b/setup.py index 63bfc1e1..2ec9897f 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages import os -VERSION = "0.18" +VERSION = "0.19a0" def get_long_description(): From 8a7b0c4f5d65baa53f31d830630ed515e571b755 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Tue, 19 Nov 2024 21:25:37 -0800 Subject: [PATCH 099/149] response.usage() and await aresponse.usage(), closes #644 --- llm/models.py | 23 +++++++++++++++++++++++ tests/conftest.py | 3 +++ tests/test_async.py | 4 ++++ tests/test_chat.py | 3 +++ 4 files changed, 33 insertions(+) diff --git a/llm/models.py b/llm/models.py index 5bf9f11c..3799c4d2 100644 --- a/llm/models.py +++ b/llm/models.py @@ -27,6 +27,13 @@ CONVERSATION_NAME_LENGTH = 32 +@dataclass +class Usage: + input: Optional[int] = None + output: Optional[int] = None + details: Optional[Dict[str, Any]] = None + + @dataclass class Attachment: type: Optional[str] = None @@ -355,6 +362,14 @@ def datetime_utc(self) -> str: self._force() return self._start_utcnow.isoformat() if self._start_utcnow else "" + def usage(self) -> Usage: + self._force() + return Usage( + input=self.input_tokens, + output=self.output_tokens, + details=self.token_details, + ) + def __iter__(self) -> Iterator[str]: self._start = time.monotonic() self._start_utcnow = datetime.datetime.utcnow() @@ -447,6 +462,14 @@ async def datetime_utc(self) -> str: await self._force() return self._start_utcnow.isoformat() if self._start_utcnow else "" + async def usage(self) -> Usage: + await self._force() + return Usage( + input=self.input_tokens, + output=self.output_tokens, + details=self.token_details, + ) + def __await__(self): return self._force().__await__() diff --git a/tests/conftest.py b/tests/conftest.py index 447e1caa..bfcbae88 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -92,14 +92,17 @@ def enqueue(self, messages): async def execute(self, prompt, stream, response, conversation): self.history.append((prompt, stream, response, conversation)) + gathered = [] while True: try: messages = self._queue.pop(0) for message in messages: + gathered.append(message) yield message break except IndexError: break + response.set_usage(input=len(prompt.prompt.split()), output=len(gathered)) class EmbedDemo(llm.EmbeddingModel): diff --git a/tests/test_async.py b/tests/test_async.py index 9623f054..d34396a1 100644 --- a/tests/test_async.py +++ b/tests/test_async.py @@ -15,6 +15,10 @@ async def test_async_model(async_mock_model): text = await response.text() assert text == "hello world" assert isinstance(response, llm.AsyncResponse) + usage = await response.usage() + assert usage.input == 1 + assert usage.output == 1 + assert usage.details is None @pytest.mark.asyncio diff --git a/tests/test_chat.py b/tests/test_chat.py index 1a31f290..e20478a3 100644 --- a/tests/test_chat.py +++ b/tests/test_chat.py @@ -1,4 +1,5 @@ from click.testing import CliRunner +from llm.models import Usage import llm.cli from unittest.mock import ANY import pytest @@ -13,8 +14,10 @@ def test_mock_model(mock_model): assert response.text() == "hello world" assert str(response) == "hello world" assert model.history[0][0].prompt == "hello" + assert response.usage() == Usage(input=1, output=1, details=None) response2 = model.prompt(prompt="hello again") assert response2.text() == "second" + assert response2.usage() == Usage(input=2, output=1, details=None) @pytest.mark.xfail(sys.platform == 
"win32", reason="Expected to fail on Windows") From 845322e97097368ff4e6b5290d29c0b8785adc85 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Tue, 19 Nov 2024 21:28:01 -0800 Subject: [PATCH 100/149] Release 0.19a1 Refs #644 --- docs/changelog.md | 5 +++++ setup.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index 298a7b32..32a78412 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,10 @@ # Changelog +(v0_19a1)= +## 0.19a1 (2024-11-19) + +- `response.usage()` and async response `await response.usage()` methods, returning a `Usage(input=2, output=1, details=None)` dataclass. [#644](https://github.com/simonw/llm/issues/644) + (v0_19a0)= ## 0.19a0 (2024-11-19) diff --git a/setup.py b/setup.py index 2ec9897f..d09d2d84 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages import os -VERSION = "0.19a0" +VERSION = "0.19a1" def get_long_description(): From c52cfee881d2150643bd3fa80cd82a3f81b76b59 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Wed, 20 Nov 2024 20:09:06 -0800 Subject: [PATCH 101/149] llm.get_models() and llm.get_async_models(), closes #640 --- docs/python-api.md | 22 ++++++++++++++++++++-- llm/__init__.py | 12 ++++++++++++ tests/test_llm.py | 14 ++++++++++++++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/docs/python-api.md b/docs/python-api.md index 0450031a..9710698f 100644 --- a/docs/python-api.md +++ b/docs/python-api.md @@ -18,7 +18,7 @@ model.key = "sk-..." response = model.prompt("Five surprising names for a pet pelican") print(response.text()) ``` -The `llm.get_model()` function accepts model names or aliases. You can also omit it to use the currently configured default model, which is `gpt-4o-mini` if you have not changed the default. +The `llm.get_model()` function accepts model IDs or aliases. You can also omit it to use the currently configured default model, which is `gpt-4o-mini` if you have not changed the default. In this example the key is set by Python code. You can also provide the key using the `OPENAI_API_KEY` environment variable, or use the `llm keys set openai` command to store it in a `keys.json` file, see {ref}`api-keys`. @@ -35,7 +35,7 @@ llm models ``` If you have set a `OPENAI_API_KEY` environment variable you can omit the `model.key = ` line. -Calling `llm.get_model()` with an invalid model name will raise a `llm.UnknownModelError` exception. +Calling `llm.get_model()` with an invalid model ID will raise a `llm.UnknownModelError` exception. (python-api-system-prompts)= @@ -99,6 +99,24 @@ print(response.text()) ``` Some models do not use API keys at all. +### Listing models + +The `llm.get_models()` list returns a list of all available models, including those from plugins. 
+ +```python +import llm + +for model in llm.get_models(): + print(model.model_id) +``` + +Use `llm.get_async_models()` to list async models: + +```python +for model in llm.get_async_models(): + print(model.model_id) +``` + ### Streaming responses For models that support it you can stream responses as they are generated, like this: diff --git a/llm/__init__.py b/llm/__init__.py index d6df280f..6b0f7fe1 100644 --- a/llm/__init__.py +++ b/llm/__init__.py @@ -167,7 +167,18 @@ class UnknownModelError(KeyError): pass +def get_models() -> List[Model]: + "Get all registered models" + return [model for model in get_model_aliases().values()] + + +def get_async_models() -> List[AsyncModel]: + "Get all registered async models" + return [model for model in get_async_model_aliases().values()] + + def get_async_model(name: Optional[str] = None) -> AsyncModel: + "Get an async model by name or alias" aliases = get_async_model_aliases() name = name or get_default_model() try: @@ -186,6 +197,7 @@ def get_async_model(name: Optional[str] = None) -> AsyncModel: def get_model(name: Optional[str] = None, _skip_async: bool = False) -> Model: + "Get a model by name or alias" aliases = get_model_aliases() name = name or get_default_model() try: diff --git a/tests/test_llm.py b/tests/test_llm.py index b83ff842..8bba3f2c 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -596,3 +596,17 @@ def test_model_defaults(tmpdir, monkeypatch): assert config_path.exists() assert llm.get_default_model() == "gpt-4o" assert llm.get_model().model_id == "gpt-4o" + + +def test_get_models(): + models = llm.get_models() + assert all(isinstance(model, llm.Model) for model in models) + model_ids = [model.model_id for model in models] + assert "gpt-4o-mini" in model_ids + + +def test_get_async_models(): + models = llm.get_async_models() + assert all(isinstance(model, llm.AsyncModel) for model in models) + model_ids = [model.model_id for model in models] + assert "gpt-4o-mini" in model_ids From 335b3e635aa1439edafb13b0c2a225ce5840cc98 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Wed, 20 Nov 2024 20:12:43 -0800 Subject: [PATCH 102/149] Release 0.19a2 Refs #640 --- docs/changelog.md | 5 +++++ docs/python-api.md | 2 ++ setup.py | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index 32a78412..d652c4eb 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,10 @@ # Changelog +(v0_19a2)= +## 0.19a2 (2024-11-20) + +- `llm.get_models()` and `llm.get_async_models()` functions, {ref}`documented here `. [#640](https://github.com/simonw/llm/issues/640) + (v0_19a1)= ## 0.19a1 (2024-11-19) diff --git a/docs/python-api.md b/docs/python-api.md index 9710698f..a7ad2ab3 100644 --- a/docs/python-api.md +++ b/docs/python-api.md @@ -99,6 +99,8 @@ print(response.text()) ``` Some models do not use API keys at all. +(python-api-listing-models)= + ### Listing models The `llm.get_models()` list returns a list of all available models, including those from plugins. 
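To make the `llm.get_model()` behaviour described in the Python API docs above concrete: an unknown model ID raises `llm.UnknownModelError`, so a caller can catch that and fall back to the configured default. This is a small sketch; the fallback pattern itself is illustrative rather than something the library prescribes.

```python
import llm

# Sketch: resolve a model ID defensively.
# "not-a-real-model" is a deliberately invalid ID used only for illustration.
try:
    model = llm.get_model("not-a-real-model")
except llm.UnknownModelError:
    # Fall back to the currently configured default model
    model = llm.get_model()

print(model.model_id)
```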
diff --git a/setup.py b/setup.py index d09d2d84..7dca8b22 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages import os -VERSION = "0.19a1" +VERSION = "0.19a2" def get_long_description(): From f9af563df554fea5637e3c106836d7b1ada79c01 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 1 Dec 2024 15:47:23 -0800 Subject: [PATCH 103/149] response.on_done() mechanism, closes #653 --- docs/python-api.md | 41 +++++++++++++++++++++++++++++++++++++++++ llm/models.py | 31 +++++++++++++++++++++++++++++++ tests/test_async.py | 16 ++++++++++++++++ tests/test_chat.py | 15 --------------- tests/test_llm.py | 30 ++++++++++++++++++++++++++++++ 5 files changed, 118 insertions(+), 15 deletions(-) diff --git a/docs/python-api.md b/docs/python-api.md index a7ad2ab3..dff051e7 100644 --- a/docs/python-api.md +++ b/docs/python-api.md @@ -195,6 +195,47 @@ response = conversation.prompt( Access `conversation.responses` for a list of all of the responses that have so far been returned during the conversation. +## Running code when a response has completed + +For some applications, such as tracking the tokens used by an application, it may be useful to execute code as soon as a response has finished being executed + +You can do this using the `response.on_done(callback)` method, which causes your callback function to be called as soon as the response has finished (all tokens have been returned). + +The signature of the method you provide is `def callback(response)` - it can be optionally an `async def` method when working with asynchronous models. + +Example usage: + +```python +import llm + +model = llm.get_model("gpt-4o-mini") +response = model.prompt("a poem about a hippo") +response.on_done(lambda response: print(response.usage())) +print(response.text()) +``` +Which outputs: +``` +Usage(input=20, output=494, details={}) +In a sunlit glade by a bubbling brook, +Lived a hefty hippo, with a curious look. +... +``` +Or using an `asyncio` model, where you need to `await response.on_done(done)` to queue up the callback: +```python +import asyncio, llm + +async def run(): + model = llm.get_async_model("gpt-4o-mini") + response = model.prompt("a short poem about a brick") + async def done(response): + print(await response.usage()) + print(await response.text()) + await response.on_done(done) + print(await response.text()) + +asyncio.run(run()) +``` + ## Other functions The `llm` top level package includes some useful utility functions. 
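As a rough sketch of the token-tracking use case mentioned for `response.on_done()` above: the callback can read `response.usage()` and accumulate totals across several prompts. The running-total bookkeeping and the specific prompts are illustrative; only `get_model()`, `on_done()` and `usage()` come from the documented API.

```python
import llm

# Sketch: keep a running total of tokens across prompts using on_done().
totals = {"input": 0, "output": 0}

def track(response):
    usage = response.usage()
    totals["input"] += usage.input or 0
    totals["output"] += usage.output or 0

model = llm.get_model("gpt-4o-mini")
for prompt in ("a haiku about otters", "a haiku about herons"):
    response = model.prompt(prompt)
    response.on_done(track)
    response.text()  # consuming the response is what triggers the callback

print(totals)
```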
diff --git a/llm/models.py b/llm/models.py index 3799c4d2..8f7849e0 100644 --- a/llm/models.py +++ b/llm/models.py @@ -1,3 +1,4 @@ +import asyncio import base64 from dataclasses import dataclass, field import datetime @@ -10,6 +11,7 @@ from typing import ( Any, AsyncGenerator, + Callable, Dict, Iterable, Iterator, @@ -218,6 +220,7 @@ def __init__( self.input_tokens: Optional[int] = None self.output_tokens: Optional[int] = None self.token_details: Optional[dict] = None + self.done_callbacks: List[Callable] = [] def set_usage( self, @@ -336,6 +339,16 @@ class Response(_BaseResponse): model: "Model" conversation: Optional["Conversation"] = None + def on_done(self, callback): + if not self._done: + self.done_callbacks.append(callback) + else: + callback(self) + + def _on_done(self): + for callback in self.done_callbacks: + callback(self) + def __str__(self) -> str: return self.text() @@ -390,6 +403,7 @@ def __iter__(self) -> Iterator[str]: self.conversation.responses.append(self) self._end = time.monotonic() self._done = True + self._on_done() def __repr__(self): text = "... not yet done ..." @@ -402,6 +416,22 @@ class AsyncResponse(_BaseResponse): model: "AsyncModel" conversation: Optional["AsyncConversation"] = None + async def on_done(self, callback): + if not self._done: + self.done_callbacks.append(callback) + else: + if callable(callback): + callback = callback(self) + if asyncio.iscoroutine(callback): + await callback + + async def _on_done(self): + for callback in self.done_callbacks: + if callable(callback): + callback = callback(self) + if asyncio.iscoroutine(callback): + await callback + def __aiter__(self): self._start = time.monotonic() self._start_utcnow = datetime.datetime.utcnow() @@ -433,6 +463,7 @@ async def __anext__(self) -> str: self.conversation.responses.append(self) self._end = time.monotonic() self._done = True + await self._on_done() raise async def _force(self): diff --git a/tests/test_async.py b/tests/test_async.py index d34396a1..cc6b517f 100644 --- a/tests/test_async.py +++ b/tests/test_async.py @@ -32,3 +32,19 @@ async def test_async_model_conversation(async_mock_model): response2 = await conversation.prompt("again") text2 = await response2.text() assert text2 == "joke 2" + + +@pytest.mark.asyncio +async def test_async_on_done(async_mock_model): + async_mock_model.enqueue(["hello world"]) + response = await async_mock_model.prompt(prompt="hello") + caught = [] + + def done(response): + caught.append(response) + + assert len(caught) == 0 + await response.on_done(done) + await response.text() + assert response._done + assert len(caught) == 1 diff --git a/tests/test_chat.py b/tests/test_chat.py index e20478a3..728ed04a 100644 --- a/tests/test_chat.py +++ b/tests/test_chat.py @@ -1,25 +1,10 @@ from click.testing import CliRunner -from llm.models import Usage import llm.cli from unittest.mock import ANY import pytest import sys -def test_mock_model(mock_model): - mock_model.enqueue(["hello world"]) - mock_model.enqueue(["second"]) - model = llm.get_model("mock") - response = model.prompt(prompt="hello") - assert response.text() == "hello world" - assert str(response) == "hello world" - assert model.history[0][0].prompt == "hello" - assert response.usage() == Usage(input=1, output=1, details=None) - response2 = model.prompt(prompt="hello again") - assert response2.text() == "second" - assert response2.usage() == Usage(input=2, output=1, details=None) - - @pytest.mark.xfail(sys.platform == "win32", reason="Expected to fail on Windows") def test_chat_basic(mock_model, 
logs_db): runner = CliRunner() diff --git a/tests/test_llm.py b/tests/test_llm.py index 8bba3f2c..3166e4d3 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -3,6 +3,7 @@ import llm from llm.cli import cli from llm.migrations import migrate +from llm.models import Usage import json import os import pathlib @@ -610,3 +611,32 @@ def test_get_async_models(): assert all(isinstance(model, llm.AsyncModel) for model in models) model_ids = [model.model_id for model in models] assert "gpt-4o-mini" in model_ids + + +def test_mock_model(mock_model): + mock_model.enqueue(["hello world"]) + mock_model.enqueue(["second"]) + model = llm.get_model("mock") + response = model.prompt(prompt="hello") + assert response.text() == "hello world" + assert str(response) == "hello world" + assert model.history[0][0].prompt == "hello" + assert response.usage() == Usage(input=1, output=1, details=None) + response2 = model.prompt(prompt="hello again") + assert response2.text() == "second" + assert response2.usage() == Usage(input=2, output=1, details=None) + + +def test_sync_on_done(mock_model): + mock_model.enqueue(["hello world"]) + model = llm.get_model("mock") + response = model.prompt(prompt="hello") + caught = [] + + def done(response): + caught.append(response) + + response.on_done(done) + assert len(caught) == 0 + str(response) + assert len(caught) == 1 From ac3d0089d0c29975a31d08a43d2f153c61c4a367 Mon Sep 17 00:00:00 2001 From: Sukhbinder Singh Date: Mon, 2 Dec 2024 05:27:24 +0530 Subject: [PATCH 104/149] Fix windows bug where llm doesn't run <> on Windows issue #495 (#646) * Fix windows bug where llm doesn't run <> on Windows issue #495 * Applied Black --------- Co-authored-by: Sukhbinder Singh Co-authored-by: Simon Willison --- llm/cli.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llm/cli.py b/llm/cli.py index e0c8e47c..de828bb8 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -499,8 +499,12 @@ def chat( Hold an ongoing chat with a model. """ # Left and right arrow keys to move cursor: - readline.parse_and_bind("\\e[D: backward-char") - readline.parse_and_bind("\\e[C: forward-char") + if sys.platform != "win32": + readline.parse_and_bind("\\e[D: backward-char") + readline.parse_and_bind("\\e[C: forward-char") + else: + readline.parse_and_bind("bind -x '\\e[D: backward-char'") + readline.parse_and_bind("bind -x '\\e[C: forward-char'") log_path = logs_db_path() (log_path.parent).mkdir(parents=True, exist_ok=True) db = sqlite_utils.Database(log_path) From c018104083a30b67f54c4743c138265591e52cf2 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 1 Dec 2024 15:58:27 -0800 Subject: [PATCH 105/149] Release 0.19 Refs #495, #610, #640, #641, #644, #653 --- docs/changelog.md | 11 +++++++++++ docs/python-api.md | 4 ++++ setup.py | 2 +- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index d652c4eb..2fa88f50 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,16 @@ # Changelog +## 0.19 (2024-12-01) + +- Tokens used by a response are now logged to new `input_tokens` and `output_tokens` integer columns and a `token_details` JSON string column, for the default OpenAI models and models from other plugins that {ref}`implement this feature `. [#610](https://github.com/simonw/llm/issues/610) +- `llm prompt` now takes a `-u/--usage` flag to display token usage at the end of the response. +- `llm logs -u/--usage` shows token usage information for logged responses. +- `llm prompt ... 
--async` responses are now logged to the database. [#641](https://github.com/simonw/llm/issues/641) +- `llm.get_models()` and `llm.get_async_models()` functions, {ref}`documented here `. [#640](https://github.com/simonw/llm/issues/640) +- `response.usage()` and async response `await response.usage()` methods, returning a `Usage(input=2, output=1, details=None)` dataclass. [#644](https://github.com/simonw/llm/issues/644) +- `response.on_done(callback)` and `await response.on_done(callback)` methods for specifying a callback to be executed when a response has completed, {ref}`documented here `. [#653](https://github.com/simonw/llm/issues/653) +- Fix for bug running `llm chat` on Windows 11. Thanks, [Sukhbinder Singh](https://github.com/sukhbinder). [#495](https://github.com/simonw/llm/issues/495) + (v0_19a2)= ## 0.19a2 (2024-11-20) diff --git a/docs/python-api.md b/docs/python-api.md index dff051e7..d261baf8 100644 --- a/docs/python-api.md +++ b/docs/python-api.md @@ -160,6 +160,8 @@ async for chunk in model.prompt( print(chunk, end="", flush=True) ``` +(python-api-conversations)= + ## Conversations LLM supports *conversations*, where you ask follow-up questions of a model as part of an ongoing conversation. @@ -195,6 +197,8 @@ response = conversation.prompt( Access `conversation.responses` for a list of all of the responses that have so far been returned during the conversation. +(python-api-response-on-done)= + ## Running code when a response has completed For some applications, such as tracking the tokens used by an application, it may be useful to execute code as soon as a response has finished being executed diff --git a/setup.py b/setup.py index 7dca8b22..a4aaf901 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages import os -VERSION = "0.19a2" +VERSION = "0.19" def get_long_description(): From e78fea17dfbf36121375fb2274296e3f26b179f6 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 1 Dec 2024 16:09:55 -0800 Subject: [PATCH 106/149] Fragment hash on 0.19 release !stable-docs --- docs/changelog.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/changelog.md b/docs/changelog.md index 2fa88f50..8396e576 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,6 @@ # Changelog +(v0_19)= ## 0.19 (2024-12-01) - Tokens used by a response are now logged to new `input_tokens` and `output_tokens` integer columns and a `token_details` JSON string column, for the default OpenAI models and models from other plugins that {ref}`implement this feature `. 
[#610](https://github.com/simonw/llm/issues/610) From b6be09aa281492f3d2feb003db735e84d0d58737 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 5 Dec 2024 13:44:07 -0800 Subject: [PATCH 107/149] Fix get_models() and get_async_models() duplicates bug Closes #667, refs #640 --- llm/__init__.py | 6 ++++-- llm/default_plugins/openai_models.py | 4 ++++ tests/test_llm.py | 3 +++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/llm/__init__.py b/llm/__init__.py index 6b0f7fe1..b16a004c 100644 --- a/llm/__init__.py +++ b/llm/__init__.py @@ -169,12 +169,14 @@ class UnknownModelError(KeyError): def get_models() -> List[Model]: "Get all registered models" - return [model for model in get_model_aliases().values()] + models_with_aliases = get_models_with_aliases() + return [mwa.model for mwa in models_with_aliases if mwa.model] def get_async_models() -> List[AsyncModel]: "Get all registered async models" - return [model for model in get_async_model_aliases().values()] + models_with_aliases = get_models_with_aliases() + return [mwa.async_model for mwa in models_with_aliases if mwa.async_model] def get_async_model(name: Optional[str] = None) -> AsyncModel: diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index ab33d1b4..b5ffd9d5 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -62,6 +62,10 @@ def register_models(register): aliases=("gpt-4-turbo-preview", "4-turbo", "4t"), ) # o1 + # register( + # Chat("o1", can_stream=False, allows_system_prompt=False, vision=True), + # AsyncChat("o1", can_stream=False, allows_system_prompt=False, vision=True), + # ) register( Chat("o1-preview", can_stream=False, allows_system_prompt=False), AsyncChat("o1-preview", can_stream=False, allows_system_prompt=False), diff --git a/tests/test_llm.py b/tests/test_llm.py index 3166e4d3..f23bf71d 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -604,6 +604,9 @@ def test_get_models(): assert all(isinstance(model, llm.Model) for model in models) model_ids = [model.model_id for model in models] assert "gpt-4o-mini" in model_ids + # Ensure no model_ids are duplicated + # https://github.com/simonw/llm/issues/667 + assert len(model_ids) == len(set(model_ids)) def test_get_async_models(): From 491dd9b4371f4cdcb05f5403a5e1f3291e576421 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 5 Dec 2024 13:45:50 -0800 Subject: [PATCH 108/149] Removed accidental comment --- llm/default_plugins/openai_models.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index b5ffd9d5..ab33d1b4 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -62,10 +62,6 @@ def register_models(register): aliases=("gpt-4-turbo-preview", "4-turbo", "4t"), ) # o1 - # register( - # Chat("o1", can_stream=False, allows_system_prompt=False, vision=True), - # AsyncChat("o1", can_stream=False, allows_system_prompt=False, vision=True), - # ) register( Chat("o1-preview", can_stream=False, allows_system_prompt=False), AsyncChat("o1-preview", can_stream=False, allows_system_prompt=False), From b8e80522291f14c210fad6930041a4cb0232286f Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 5 Dec 2024 13:47:28 -0800 Subject: [PATCH 109/149] Release 0.19.1 Refs #667 --- docs/changelog.md | 5 +++++ setup.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index 
8396e576..dede4116 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,10 @@ # Changelog +(v0_19_1)= +## 0.19.1 (2024-12-05) + +- FIxed bug where `llm.get_models()` and `llm.get_async_models()` returned the same model multiple times. [#667](https://github.com/simonw/llm/issues/667) + (v0_19)= ## 0.19 (2024-12-01) diff --git a/setup.py b/setup.py index a4aaf901..ca9bf8ac 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages import os -VERSION = "0.19" +VERSION = "0.19.1" def get_long_description(): From 571f4b2a4da52ad127061b7fa953562f6ba6aeb0 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 12 Dec 2024 14:57:23 -0800 Subject: [PATCH 110/149] Fix for UTC warnings Closes #672 --- llm/default_plugins/openai_models.py | 4 ++-- llm/migrations.py | 5 ++++- llm/models.py | 4 ++-- tests/test_cli_openai_models.py | 6 +++--- tests/test_llm.py | 2 +- 5 files changed, 12 insertions(+), 9 deletions(-) diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index ab33d1b4..3d418582 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -182,8 +182,8 @@ def models(json_, key): to_print = [] for model in models: # Print id, owned_by, root, created as ISO 8601 - created_str = datetime.datetime.utcfromtimestamp( - model["created"] + created_str = datetime.datetime.fromtimestamp( + model["created"], datetime.timezone.utc ).isoformat() to_print.append( { diff --git a/llm/migrations.py b/llm/migrations.py index b8ac8b13..3575b9e5 100644 --- a/llm/migrations.py +++ b/llm/migrations.py @@ -13,7 +13,10 @@ def migrate(db): if name not in already_applied: fn(db) db["_llm_migrations"].insert( - {"name": name, "applied_at": str(datetime.datetime.utcnow())} + { + "name": name, + "applied_at": str(datetime.datetime.now(datetime.timezone.utc)), + } ) already_applied.add(name) diff --git a/llm/models.py b/llm/models.py index 8f7849e0..8a75dc4e 100644 --- a/llm/models.py +++ b/llm/models.py @@ -385,7 +385,7 @@ def usage(self) -> Usage: def __iter__(self) -> Iterator[str]: self._start = time.monotonic() - self._start_utcnow = datetime.datetime.utcnow() + self._start_utcnow = datetime.datetime.now(datetime.timezone.utc) if self._done: yield from self._chunks return @@ -434,7 +434,7 @@ async def _on_done(self): def __aiter__(self): self._start = time.monotonic() - self._start_utcnow = datetime.datetime.utcnow() + self._start_utcnow = datetime.datetime.now(datetime.timezone.utc) return self async def __anext__(self) -> str: diff --git a/tests/test_cli_openai_models.py b/tests/test_cli_openai_models.py index 3d0a7c16..fbb382de 100644 --- a/tests/test_cli_openai_models.py +++ b/tests/test_cli_openai_models.py @@ -35,9 +35,9 @@ def test_openai_models(mocked_models): result = runner.invoke(cli, ["openai", "models", "--key", "x"]) assert result.exit_code == 0 assert result.output == ( - "id owned_by created \n" - "ada:2020-05-03 openai 2020-05-03T20:26:40\n" - "babbage:2020-05-03 openai 2020-05-03T20:26:40\n" + "id owned_by created \n" + "ada:2020-05-03 openai 2020-05-03T20:26:40+00:00\n" + "babbage:2020-05-03 openai 2020-05-03T20:26:40+00:00\n" ) diff --git a/tests/test_llm.py b/tests/test_llm.py index f23bf71d..79aa147c 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -28,7 +28,7 @@ def log_path(user_path): log_path = str(user_path / "logs.db") db = sqlite_utils.Database(log_path) migrate(db) - start = datetime.datetime.utcnow() + start = 
datetime.datetime.now(datetime.timezone.utc) db["responses"].insert_all( { "id": str(ULID()).lower(), From aa25ad1d54a1b6a507ac34043c7a53668520cd53 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Tue, 17 Dec 2024 10:52:28 -0800 Subject: [PATCH 111/149] o1-preview and o1-mini can stream now Refs https://github.com/simonw/llm/issues/676#issuecomment-2549328154 --- llm/default_plugins/openai_models.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index 3d418582..57ec615e 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -63,12 +63,12 @@ def register_models(register): ) # o1 register( - Chat("o1-preview", can_stream=False, allows_system_prompt=False), - AsyncChat("o1-preview", can_stream=False, allows_system_prompt=False), + Chat("o1-preview", allows_system_prompt=False), + AsyncChat("o1-preview", allows_system_prompt=False), ) register( - Chat("o1-mini", can_stream=False, allows_system_prompt=False), - AsyncChat("o1-mini", can_stream=False, allows_system_prompt=False), + Chat("o1-mini", allows_system_prompt=False), + AsyncChat("o1-mini", allows_system_prompt=False), ) # The -instruct completion model register( From 8898584ba6a04ac0014402b2c784d03b8da20ec4 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Tue, 17 Dec 2024 11:14:42 -0800 Subject: [PATCH 112/149] New OpenAI audio models, closes #677 --- docs/openai-models.md | 2 ++ docs/usage.md | 26 ++++++++++++++++++++++++++ llm/default_plugins/openai_models.py | 13 +++++++++---- 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/docs/openai-models.md b/docs/openai-models.md index eda613ad..95e4167b 100644 --- a/docs/openai-models.md +++ b/docs/openai-models.md @@ -34,6 +34,8 @@ cog.out("```\n{}\n```".format("\n".join(models))) OpenAI Chat: gpt-4o (aliases: 4o) OpenAI Chat: gpt-4o-mini (aliases: 4o-mini) OpenAI Chat: gpt-4o-audio-preview +OpenAI Chat: gpt-4o-audio-preview-2024-12-17 +OpenAI Chat: gpt-4o-audio-preview-2024-10-01 OpenAI Chat: gpt-3.5-turbo (aliases: 3.5, chatgpt) OpenAI Chat: gpt-3.5-turbo-16k (aliases: chatgpt-16k, 3.5-16k) OpenAI Chat: gpt-4 (aliases: 4, gpt4) diff --git a/docs/usage.md b/docs/usage.md index dd44ff10..dfd5e244 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -306,6 +306,32 @@ OpenAI Chat: gpt-4o-audio-preview json_object: boolean Attachment types: audio/mpeg, audio/wav +OpenAI Chat: gpt-4o-audio-preview-2024-12-17 + Options: + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean + Attachment types: + audio/mpeg, audio/wav +OpenAI Chat: gpt-4o-audio-preview-2024-10-01 + Options: + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean + Attachment types: + audio/mpeg, audio/wav OpenAI Chat: gpt-3.5-turbo (aliases: 3.5, chatgpt) Options: temperature: float diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index 57ec615e..affd38ae 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -37,10 +37,15 @@ def register_models(register): AsyncChat("gpt-4o-mini", vision=True), aliases=("4o-mini",), ) - register( - Chat("gpt-4o-audio-preview", audio=True), - AsyncChat("gpt-4o-audio-preview", audio=True), - ) + for audio_model_id 
in ( + "gpt-4o-audio-preview", + "gpt-4o-audio-preview-2024-12-17", + "gpt-4o-audio-preview-2024-10-01", + ): + register( + Chat(audio_model_id, audio=True), + AsyncChat(audio_model_id, audio=True), + ) # 3.5 and 4 register( Chat("gpt-3.5-turbo"), AsyncChat("gpt-3.5-turbo"), aliases=("3.5", "chatgpt") From 6305b86026d953d3ebd36894500edce0de5feaab Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Tue, 17 Dec 2024 20:28:57 -0800 Subject: [PATCH 113/149] gpt-4o-mini-audio-preview, closes #677 --- docs/openai-models.md | 2 ++ docs/usage.md | 26 ++++++++++++++++++++++++++ llm/default_plugins/openai_models.py | 2 ++ 3 files changed, 30 insertions(+) diff --git a/docs/openai-models.md b/docs/openai-models.md index 95e4167b..982e3179 100644 --- a/docs/openai-models.md +++ b/docs/openai-models.md @@ -36,6 +36,8 @@ OpenAI Chat: gpt-4o-mini (aliases: 4o-mini) OpenAI Chat: gpt-4o-audio-preview OpenAI Chat: gpt-4o-audio-preview-2024-12-17 OpenAI Chat: gpt-4o-audio-preview-2024-10-01 +OpenAI Chat: gpt-4o-mini-audio-preview +OpenAI Chat: gpt-4o-mini-audio-preview-2024-12-17 OpenAI Chat: gpt-3.5-turbo (aliases: 3.5, chatgpt) OpenAI Chat: gpt-3.5-turbo-16k (aliases: chatgpt-16k, 3.5-16k) OpenAI Chat: gpt-4 (aliases: 4, gpt4) diff --git a/docs/usage.md b/docs/usage.md index dfd5e244..f76bf83a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -332,6 +332,32 @@ OpenAI Chat: gpt-4o-audio-preview-2024-10-01 json_object: boolean Attachment types: audio/mpeg, audio/wav +OpenAI Chat: gpt-4o-mini-audio-preview + Options: + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean + Attachment types: + audio/mpeg, audio/wav +OpenAI Chat: gpt-4o-mini-audio-preview-2024-12-17 + Options: + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean + Attachment types: + audio/mpeg, audio/wav OpenAI Chat: gpt-3.5-turbo (aliases: 3.5, chatgpt) Options: temperature: float diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index affd38ae..706b61c5 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -41,6 +41,8 @@ def register_models(register): "gpt-4o-audio-preview", "gpt-4o-audio-preview-2024-12-17", "gpt-4o-audio-preview-2024-10-01", + "gpt-4o-mini-audio-preview", + "gpt-4o-mini-audio-preview-2024-12-17", ): register( Chat(audio_model_id, audio=True), From 67d4a9964501071b3d810a76598ea27cc741ec6f Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 19 Dec 2024 06:40:05 -0800 Subject: [PATCH 114/149] llm prompt -x/--extract option, closes #681 --- docs/help.md | 6 ++++ docs/usage.md | 18 ++++++++++++ llm/cli.py | 28 +++++++++++++++++-- llm/default_plugins/openai_models.py | 9 +++--- llm/utils.py | 36 ++++++++++++++++++++++++ tests/conftest.py | 21 ++++++++++++++ tests/test_llm.py | 27 ++++++++++++++++++ tests/test_utils.py | 41 +++++++++++++++++++++++++++- 8 files changed, 178 insertions(+), 8 deletions(-) diff --git a/docs/help.md b/docs/help.md index ba5d3f0d..4e592e89 100644 --- a/docs/help.md +++ b/docs/help.md @@ -105,6 +105,11 @@ Usage: llm prompt [OPTIONS] [PROMPT] # With an explicit mimetype: cat image | llm 'describe image' --at - image/jpeg + The -x/--extract option returns just the content of the first ``` fenced code + block, if one is present. If none are present it returns the full response. 
+ + llm 'JavaScript function for reversing a string' -x + Options: -s, --system TEXT System prompt to use -m, --model TEXT Model to use @@ -123,6 +128,7 @@ Options: --save TEXT Save prompt with this template name --async Run prompt asynchronously -u, --usage Show token usage + -x, --extract Extract first fenced code block --help Show this message and exit. ``` diff --git a/docs/usage.md b/docs/usage.md index f76bf83a..4ac07e85 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -45,6 +45,24 @@ Some models support options. You can pass these using `-o/--option name value` - ```bash llm 'Ten names for cheesecakes' -o temperature 1.5 ``` + +(usage-extract-fenced-code)= +### Extracting fenced code blocks + +If you are using an LLM to generate code it can be useful to retrieve just the code it produces without any of the surrounding explanatory text. + +The `-x/--extract` option will scan the response for the first instance of a Markdown fenced code block - something that looks like this: + +```` +```python +def my_function(): + # ... +``` +```` +It will extract and returns just the content of that block, excluding the fenced coded delimiters. If there are no fenced code blocks it will return the full response. + +The entire response including explanatory text is still logged to the database, and can be viewed using `llm logs -c`. + (usage-attachments)= ### Attachments diff --git a/llm/cli.py b/llm/cli.py index de828bb8..40ffe556 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -33,7 +33,12 @@ from .migrations import migrate from .plugins import pm, load_plugins -from .utils import mimetype_from_path, mimetype_from_string, token_usage_string +from .utils import ( + mimetype_from_path, + mimetype_from_string, + token_usage_string, + extract_first_fenced_code_block, +) import base64 import httpx import pathlib @@ -204,6 +209,7 @@ def cli(): @click.option("--save", help="Save prompt with this template name") @click.option("async_", "--async", is_flag=True, help="Run prompt asynchronously") @click.option("-u", "--usage", is_flag=True, help="Show token usage") +@click.option("-x", "--extract", is_flag=True, help="Extract first fenced code block") def prompt( prompt, system, @@ -222,6 +228,7 @@ def prompt( save, async_, usage, + extract, ): """ Execute a prompt @@ -243,12 +250,21 @@ def prompt( cat image | llm 'describe image' -a - # With an explicit mimetype: cat image | llm 'describe image' --at - image/jpeg + + The -x/--extract option returns just the content of the first ``` fenced code + block, if one is present. If none are present it returns the full response. 
+ + \b + llm 'JavaScript function for reversing a string' -x """ if log and no_log: raise click.ClickException("--log and --no-log are mutually exclusive") model_aliases = get_model_aliases() + if extract: + no_stream = True + def read_prompt(): nonlocal prompt @@ -407,7 +423,10 @@ async def inner(): system=system, **validated_options, ) - print(await response.text()) + text = await response.text() + if extract: + text = extract_first_fenced_code_block(text) or text + print(text) return response response = asyncio.run(inner()) @@ -424,7 +443,10 @@ async def inner(): sys.stdout.flush() print("") else: - print(response.text()) + text = response.text() + if extract: + text = extract_first_fenced_code_block(text) or text + print(text) except Exception as ex: raise click.ClickException(str(ex)) diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index 706b61c5..41b6866a 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -656,10 +656,11 @@ def combine_chunks(chunks: List) -> dict: } if logprobs: combined["logprobs"] = logprobs - for key in ("id", "object", "model", "created", "index"): - value = getattr(chunks[0], key, None) - if value is not None: - combined[key] = value + if chunks: + for key in ("id", "object", "model", "created", "index"): + value = getattr(chunks[0], key, None) + if value is not None: + combined[key] = value return combined diff --git a/llm/utils.py b/llm/utils.py index e9853185..a4f57a0e 100644 --- a/llm/utils.py +++ b/llm/utils.py @@ -2,6 +2,7 @@ import httpx import json import puremagic +import re import textwrap from typing import List, Dict, Optional @@ -153,3 +154,38 @@ def token_usage_string(input_tokens, output_tokens, token_details) -> str: if token_details: bits.append(json.dumps(token_details)) return ", ".join(bits) + + +def extract_first_fenced_code_block(text: str) -> Optional[str]: + """ + Extracts and returns the first Markdown fenced code block found in the given text. + + The function handles fenced code blocks that: + - Use at least three backticks (`). + - May include a language tag immediately after the opening backticks. + - Use more than three backticks as long as the closing fence has the same number. + + If no fenced code block is found, the function returns None. + + Args: + text (str): The input text to search for a fenced code block. + + Returns: + Optional[str]: The content of the first fenced code block, or None if not found. + """ + # Regex pattern to match fenced code blocks + # - ^ or \n ensures that the fence is at the start of a line + # - (`{3,}) captures the opening backticks (at least three) + # - (\w+)? optionally captures the language tag + # - \n matches the newline after the opening fence + # - (.*?) 
non-greedy match for the code block content + # - \1 ensures that the closing fence has the same number of backticks + # - (?=\n|$) ensures that the closing fence is followed by a newline or end of string + pattern = re.compile( + r"""(?m)^(?P`{3,})(?P\w+)?\n(?P.*?)^(?P=fence)(?=\n|$)""", + re.DOTALL, + ) + match = pattern.search(text) + if match: + return match.group("code") + return None diff --git a/tests/conftest.py b/tests/conftest.py index bfcbae88..243c9887 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -190,6 +190,27 @@ def mocked_openai_chat(httpx_mock): return httpx_mock +@pytest.fixture +def mocked_openai_chat_returning_fenced_code(httpx_mock): + httpx_mock.add_response( + method="POST", + url="https://api.openai.com/v1/chat/completions", + json={ + "model": "gpt-4o-mini", + "usage": {}, + "choices": [ + { + "message": { + "content": "Code:\n\n````javascript\nfunction foo() {\n return 'bar';\n}\n````\nDone.", + } + } + ], + }, + headers={"Content-Type": "application/json"}, + ) + return httpx_mock + + def stream_events(): for delta, finish_reason in ( ({"role": "assistant", "content": ""}, None), diff --git a/tests/test_llm.py b/tests/test_llm.py index 79aa147c..bc9649d3 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -322,6 +322,33 @@ def test_llm_default_prompt( ) +@pytest.mark.parametrize( + "args,expect_just_code", + ( + (["-x"], True), + (["--extract"], True), + (["-x", "--async"], True), + (["--extract", "--async"], True), + # Use --no-stream here to ensure it passes test same as -x/--extract cases + (["--no-stream"], False), + ), +) +def test_extract_fenced_code( + mocked_openai_chat_returning_fenced_code, args, expect_just_code +): + runner = CliRunner() + result = runner.invoke( + cli, + ["-m", "gpt-4o-mini", "--key", "x", "Write code"] + args, + catch_exceptions=False, + ) + output = result.output + if expect_just_code: + assert "```" not in output + else: + assert "```" in output + + def test_openai_chat_stream(mocked_openai_chat_stream, user_path): runner = CliRunner() result = runner.invoke(cli, ["-m", "gpt-3.5-turbo", "--key", "x", "Say hi"]) diff --git a/tests/test_utils.py b/tests/test_utils.py index 85ed54ae..11d3dc70 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,5 +1,5 @@ import pytest -from llm.utils import simplify_usage_dict +from llm.utils import simplify_usage_dict, extract_first_fenced_code_block @pytest.mark.parametrize( @@ -40,3 +40,42 @@ ) def test_simplify_usage_dict(input_data, expected_output): assert simplify_usage_dict(input_data) == expected_output + + +@pytest.mark.parametrize( + "input,expected", + [ + ["This is a sample text without any code blocks.", None], + [ + "Here is some text.\n\n```\ndef foo():\n return 'bar'\n```\n\nMore text.", + "def foo():\n return 'bar'\n", + ], + [ + "Here is some text.\n\n```python\ndef foo():\n return 'bar'\n```\n\nMore text.", + "def foo():\n return 'bar'\n", + ], + [ + "Here is some text.\n\n````\ndef foo():\n return 'bar'\n````\n\nMore text.", + "def foo():\n return 'bar'\n", + ], + [ + "Here is some text.\n\n````javascript\nfunction foo() {\n return 'bar';\n}\n````\n\nMore text.", + "function foo() {\n return 'bar';\n}\n", + ], + [ + "Here is some text.\n\n```python\ndef foo():\n return 'bar'\n````\n\nMore text.", + None, + ], + [ + "First code block:\n\n```python\ndef foo():\n return 'bar'\n```\n\nSecond code block:\n\n```javascript\nfunction foo() {\n return 'bar';\n}\n```", + "def foo():\n return 'bar'\n", + ], + [ + "Here is some text.\n\n```python\ndef foo():\n 
return `bar`\n```\n\nMore text.", + "def foo():\n return `bar`\n", + ], + ], +) +def test_extract_first_fenced_code_block(input, expected): + actual = extract_first_fenced_code_block(input) + assert actual == expected From 000e984def983aa36384a24df42d4dbb558b5bb1 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 19 Dec 2024 07:16:48 -0800 Subject: [PATCH 115/149] --extract support for templates, closes #681 --- docs/templates.md | 21 +++++++++++++++++---- llm/cli.py | 9 ++++++--- llm/templates.py | 2 ++ tests/test_templates.py | 6 ++++++ 4 files changed, 31 insertions(+), 7 deletions(-) diff --git a/docs/templates.md b/docs/templates.md index 95707b80..69121597 100644 --- a/docs/templates.md +++ b/docs/templates.md @@ -26,6 +26,11 @@ You can also save default parameters: llm --system 'Summarize this text in the voice of $voice' \ --model gpt-4 -p voice GlaDOS --save summarize ``` +If you add `--extract` the setting to {ref}`extract the first fenced code block ` will be persisted in the template. +```bash +llm --system 'write a Python function' --extract --save python-function +llm -t python-function 'reverse a string' +``` ## Using a template You can execute a named template using the `-t/--template` option: @@ -100,7 +105,7 @@ curl -s 'https://til.simonwillison.net/macos/imovie-slides-and-audio' | \ Output: > In a fantastical steampunk world, Simon Willison decided to merge an old MP3 recording with slides from the talk using iMovie. After exporting the slides as images and importing them into iMovie, he had to disable the default Ken Burns effect using the "Crop" tool. Then, Simon manually synchronized the audio by adjusting the duration of each image. Finally, he published the masterpiece to YouTube, with the whimsical magic of steampunk-infused illustrations leaving his viewers in awe. -## System templates +### System templates When working with models that support system prompts (such as `gpt-3.5-turbo` and `gpt-4`) you can set a system prompt using a `system:` key like so: @@ -116,7 +121,7 @@ system: You speak like an excitable Victorian adventurer prompt: 'Summarize this: $input' ``` -## Additional template variables +### Additional template variables Templates that work against the user's normal input (content that is either piped to the tool via standard input or passed as a command-line argument) use just the `$input` variable. @@ -157,7 +162,7 @@ I got this: > My previous test subject seemed to have learned something new about iMovie. They exported keynote slides as individual images [...] Quite impressive for a human. (prompt-default-parameters)= -## Specifying default parameters +### Specifying default parameters You can also specify default values for parameters, using a `defaults:` key. @@ -185,7 +190,15 @@ I got this: > Text, summarize in Yoda's voice, I will: "Hmm, young padawan. Summary of this text, you seek. Hmmm. ... -## Setting a default model for a template +### Configuring code extraction + +To configure the {ref}`extract first fenced code block ` setting for the template, add this: + +```yaml +extract: true +``` + +### Setting a default model for a template Templates executed using `llm -t template-name` will execute using the default model that the user has configured for the tool - or `gpt-3.5-turbo` if they have not configured their own default. 
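To make the new `extract:` template key concrete, the `--save python-function` example in the templates documentation above would produce a template file along these lines. The filename and exact wording are illustrative:

```yaml
# Hypothetical templates/python-function.yaml
system: write a Python function
extract: true
```

Running `llm -t python-function 'reverse a string'` then returns only the contents of the first fenced code block in the model's response, while the full response is still logged to the database.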
diff --git a/llm/cli.py b/llm/cli.py index 40ffe556..f4a9d32c 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -262,9 +262,6 @@ def prompt( model_aliases = get_model_aliases() - if extract: - no_stream = True - def read_prompt(): nonlocal prompt @@ -319,6 +316,8 @@ def read_prompt(): to_save["system"] = system if param: to_save["defaults"] = dict(param) + if extract: + to_save["extract"] = True path.write_text( yaml.dump( to_save, @@ -335,6 +334,7 @@ def read_prompt(): if system: raise click.ClickException("Cannot use -t/--template and --system together") template_obj = load_template(template) + extract = template_obj.extract prompt = read_prompt() try: prompt, system = template_obj.evaluate(prompt, params) @@ -343,6 +343,9 @@ def read_prompt(): if model_id is None and template_obj.model: model_id = template_obj.model + if extract: + no_stream = True + conversation = None if conversation_id or _continue: # Load the conversation - loads most recent if no ID provided diff --git a/llm/templates.py b/llm/templates.py index 38e00d7e..b540fad1 100644 --- a/llm/templates.py +++ b/llm/templates.py @@ -9,6 +9,8 @@ class Template(BaseModel): system: Optional[str] = None model: Optional[str] = None defaults: Optional[Dict[str, Any]] = None + # Should first fenced code block be extracted? + extract: Optional[bool] = None class Config: extra = "forbid" diff --git a/tests/test_templates.py b/tests/test_templates.py index 488a8f55..e66005c4 100644 --- a/tests/test_templates.py +++ b/tests/test_templates.py @@ -91,6 +91,12 @@ def test_templates_list(templates_path, args): {"prompt": "Say hello as $name", "defaults": {"name": "default-name"}}, None, ), + # -x/--extract should be persisted: + ( + ["--system", "write python", "--extract"], + {"system": "write python", "extract": True}, + None, + ), ), ) def test_templates_prompt_save(templates_path, args, expected_prompt, expected_error): From b452effa0966c1962e73cc761ffa3ba8268b75a0 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 9 Jan 2025 11:37:33 -0800 Subject: [PATCH 116/149] llm models -q/--query option, closes #700 --- docs/help.md | 7 ++++--- docs/usage.md | 17 ++++++++++++----- llm/cli.py | 5 ++++- llm/models.py | 10 ++++++++++ tests/test_llm.py | 8 ++++++++ 5 files changed, 38 insertions(+), 9 deletions(-) diff --git a/docs/help.md b/docs/help.md index 4e592e89..a5784e80 100644 --- a/docs/help.md +++ b/docs/help.md @@ -330,9 +330,10 @@ Usage: llm models list [OPTIONS] List available models Options: - --options Show options for each model, if available - --async List async models - --help Show this message and exit. + --options Show options for each model, if available + --async List async models + -q, --query TEXT Search for models matching this string + --help Show this message and exit. ``` (help-models-default)= diff --git a/docs/usage.md b/docs/usage.md index 4ac07e85..01a2840e 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -246,11 +246,18 @@ llm models ``` Example output: ``` -OpenAI Chat: gpt-3.5-turbo (aliases: 3.5, chatgpt) -OpenAI Chat: gpt-3.5-turbo-16k (aliases: chatgpt-16k, 3.5-16k) -OpenAI Chat: gpt-4 (aliases: 4, gpt4) -OpenAI Chat: gpt-4-32k (aliases: 4-32k) -PaLM 2: chat-bison-001 (aliases: palm, palm2) +OpenAI Chat: gpt-4o (aliases: 4o) +OpenAI Chat: gpt-4o-mini (aliases: 4o-mini) +OpenAI Chat: o1-preview +OpenAI Chat: o1-mini +GeminiPro: gemini-1.5-pro-002 +GeminiPro: gemini-1.5-flash-002 +... 
+``` + +Add `-q term` to search for models matching a specific search term: +```bash +llm models -q gpt-4o ``` Add `--options` to also see documentation for the options supported by each model: diff --git a/llm/cli.py b/llm/cli.py index f4a9d32c..4c7c8eef 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -1076,12 +1076,15 @@ def models(): "--options", is_flag=True, help="Show options for each model, if available" ) @click.option("async_", "--async", is_flag=True, help="List async models") -def models_list(options, async_): +@click.option("-q", "--query", help="Search for models matching this string") +def models_list(options, async_, query): "List available models" models_that_have_shown_options = set() for model_with_aliases in get_models_with_aliases(): if async_ and not model_with_aliases.async_model: continue + if query and not model_with_aliases.matches(query): + continue extra = "" if model_with_aliases.aliases: extra = " (aliases: {})".format(", ".join(model_with_aliases.aliases)) diff --git a/llm/models.py b/llm/models.py index 8a75dc4e..fe0faed1 100644 --- a/llm/models.py +++ b/llm/models.py @@ -757,6 +757,16 @@ class ModelWithAliases: async_model: AsyncModel aliases: Set[str] + def matches(self, query: str) -> bool: + query = query.lower() + all_strings = [] + all_strings.extend(self.aliases) + if self.model: + all_strings.append(str(self.model)) + if self.async_model: + all_strings.append(str(self.async_model.model_id)) + return any(query in alias.lower() for alias in all_strings) + @dataclass class EmbeddingModelWithAliases: diff --git a/tests/test_llm.py b/tests/test_llm.py index bc9649d3..4502c3c8 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -604,6 +604,14 @@ def test_llm_models_async(user_path): assert "AsyncMockModel: mock" in result.output +@pytest.mark.parametrize("option", ("-q", "--query")) +def test_llm_models_query(user_path, option): + runner = CliRunner() + result = runner.invoke(cli, ["models", option, "mockmodel"], catch_exceptions=False) + assert result.exit_code == 0 + assert result.output == "MockModel: mock\n" + + def test_llm_user_dir(tmpdir, monkeypatch): user_dir = str(tmpdir / "u") monkeypatch.setenv("LLM_USER_PATH", user_dir) From 88a8cfd9e41212e20290fc5ad35c540f9f670d62 Mon Sep 17 00:00:00 2001 From: Csaba Henk Date: Sat, 11 Jan 2025 00:53:04 +0100 Subject: [PATCH 117/149] llm logs -x/--extract option (#693) * llm logs -x/--extract option * Update docs/help.md for llm logs -x * Added test for llm logs -x/--extract, refs #693 * llm logs -xr behaves same as llm logs -x * -x/--extract in llm logging docs --------- Co-authored-by: Simon Willison --- docs/help.md | 1 + docs/logging.md | 5 +++ llm/cli.py | 16 +++++++-- tests/test_llm.py | 84 +++++++++++++++++++++++++++++---------------- tests/test_utils.py | 3 +- 5 files changed, 76 insertions(+), 33 deletions(-) diff --git a/docs/help.md b/docs/help.md index a5784e80..8dcd88f3 100644 --- a/docs/help.md +++ b/docs/help.md @@ -301,6 +301,7 @@ Options: -t, --truncate Truncate long strings in output -u, --usage Include token usage -r, --response Just output the last response + -x, --extract Extract first fenced code block -c, --current Show logs from the current conversation --cid, --conversation TEXT Show logs for this conversation ID --json Output logs as JSON diff --git a/docs/logging.md b/docs/logging.md index 56c0379d..508f22b5 100644 --- a/docs/logging.md +++ b/docs/logging.md @@ -61,6 +61,11 @@ To get back just the most recent prompt response as plain text, add `-r/--respon ```bash llm logs -r ``` 
+Use `-x/--extract` to extract and return the first fenced code block from the selected log entries: + +```bash +llm logs -x +``` Add `--json` to get the log messages in JSON instead: diff --git a/llm/cli.py b/llm/cli.py index 4c7c8eef..86954abb 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -855,6 +855,7 @@ def logs_turn_off(): @click.option("-t", "--truncate", is_flag=True, help="Truncate long strings in output") @click.option("-u", "--usage", is_flag=True, help="Include token usage") @click.option("-r", "--response", is_flag=True, help="Just output the last response") +@click.option("-x", "--extract", is_flag=True, help="Extract first fenced code block") @click.option( "current_conversation", "-c", @@ -883,6 +884,7 @@ def logs_list( truncate, usage, response, + extract, current_conversation, conversation_id, json_output, @@ -979,6 +981,7 @@ def logs_list( else: row[key] = json.loads(row[key]) + output = None if json_output: # Output as JSON if requested for row in rows: @@ -986,11 +989,20 @@ def logs_list( {k: v for k, v in attachment.items() if k != "response_id"} for attachment in attachments_by_id.get(row["id"], []) ] - click.echo(json.dumps(list(rows), indent=2)) + output = json.dumps(list(rows), indent=2) + elif extract: + # Extract and return first code block + for row in rows: + output = extract_first_fenced_code_block(row["response"]) + if output is not None: + break elif response: # Just output the last response if rows: - click.echo(rows[-1]["response"]) + output = rows[-1]["response"] + + if output is not None: + click.echo(output) else: # Output neatly formatted human-readable logs current_system = None diff --git a/tests/test_llm.py b/tests/test_llm.py index 4502c3c8..c1224309 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -34,7 +34,7 @@ def log_path(user_path): "id": str(ULID()).lower(), "system": "system", "prompt": "prompt", - "response": "response", + "response": 'response\n```python\nprint("hello word")\n```', "model": "davinci", "datetime_utc": (start + datetime.timedelta(seconds=i)).isoformat(), "conversation_id": "abc123", @@ -60,35 +60,38 @@ def test_logs_text(log_path, usage): output = result.output # Replace 2023-08-17T20:53:58 with YYYY-MM-DDTHH:MM:SS output = datetime_re.sub("YYYY-MM-DDTHH:MM:SS", output) - - assert output == ( - "# YYYY-MM-DDTHH:MM:SS conversation: abc123\n\n" - "Model: **davinci**\n\n" - "## Prompt:\n\n" - "prompt\n\n" - "## System:\n\n" - "system\n\n" - "## Response:\n\n" - "response\n\n" - ) + ("## Token usage:\n\n2 input, 5 output\n\n" if usage else "") + ( - "# YYYY-MM-DDTHH:MM:SS conversation: abc123\n\n" - "Model: **davinci**\n\n" - "## Prompt:\n\n" - "prompt\n\n" - "## Response:\n\n" - "response\n\n" - ) + ( - "## Token usage:\n\n2 input, 5 output\n\n" if usage else "" - ) + ( - "# YYYY-MM-DDTHH:MM:SS conversation: abc123\n\n" - "Model: **davinci**\n\n" - "## Prompt:\n\n" - "prompt\n\n" - "## Response:\n\n" - "response\n\n" - ) + ( - "## Token usage:\n\n2 input, 5 output\n\n" if usage else "" + expected = ( + ( + "# YYYY-MM-DDTHH:MM:SS conversation: abc123\n\n" + "Model: **davinci**\n\n" + "## Prompt:\n\n" + "prompt\n\n" + "## System:\n\n" + "system\n\n" + "## Response:\n\n" + 'response\n```python\nprint("hello word")\n```\n\n' + ) + + ("## Token usage:\n\n2 input, 5 output\n\n" if usage else "") + + ( + "# YYYY-MM-DDTHH:MM:SS conversation: abc123\n\n" + "Model: **davinci**\n\n" + "## Prompt:\n\n" + "prompt\n\n" + "## Response:\n\n" + 'response\n```python\nprint("hello word")\n```\n\n' + ) + + ("## Token usage:\n\n2 input, 5 
output\n\n" if usage else "") + + ( + "# YYYY-MM-DDTHH:MM:SS conversation: abc123\n\n" + "Model: **davinci**\n\n" + "## Prompt:\n\n" + "prompt\n\n" + "## Response:\n\n" + 'response\n```python\nprint("hello word")\n```\n\n' + ) + + ("## Token usage:\n\n2 input, 5 output\n\n" if usage else "") ) + assert output == expected @pytest.mark.parametrize("n", (None, 0, 2)) @@ -118,7 +121,28 @@ def test_logs_response_only(args, log_path): runner = CliRunner() result = runner.invoke(cli, ["logs"] + args, catch_exceptions=False) assert result.exit_code == 0 - assert result.output == "response\n" + assert result.output == 'response\n```python\nprint("hello word")\n```\n' + + +@pytest.mark.parametrize( + "args", + ( + ["-x"], + ["--extract"], + ["list", "-x"], + ["list", "--extract"], + # Using -xr together should have same effect as just -x + ["-xr"], + ["-x", "-r"], + ["--extract", "--response"], + ), +) +def test_logs_extract_first_code(args, log_path): + "Test that logs -x/--extract returns the first code block" + runner = CliRunner() + result = runner.invoke(cli, ["logs"] + args, catch_exceptions=False) + assert result.exit_code == 0 + assert result.output == 'print("hello word")\n\n' @pytest.mark.xfail(sys.platform == "win32", reason="Expected to fail on Windows") diff --git a/tests/test_utils.py b/tests/test_utils.py index 11d3dc70..783e5892 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -67,7 +67,8 @@ def test_simplify_usage_dict(input_data, expected_output): None, ], [ - "First code block:\n\n```python\ndef foo():\n return 'bar'\n```\n\nSecond code block:\n\n```javascript\nfunction foo() {\n return 'bar';\n}\n```", + "First code block:\n\n```python\ndef foo():\n return 'bar'\n```\n\n" + "Second code block:\n\n```javascript\nfunction foo() {\n return 'bar';\n}\n```", "def foo():\n return 'bar'\n", ], [ From 6baf1f7d8311bfdaf8f119a28b2553ed745c3b7c Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 22 Dec 2024 14:06:47 -0800 Subject: [PATCH 118/149] o1 Closes #676 --- docs/openai-models.md | 2 ++ docs/usage.md | 26 ++++++++++++++++++++++++++ llm/default_plugins/openai_models.py | 6 ++++++ 3 files changed, 34 insertions(+) diff --git a/docs/openai-models.md b/docs/openai-models.md index 982e3179..9d7f2a32 100644 --- a/docs/openai-models.md +++ b/docs/openai-models.md @@ -46,6 +46,8 @@ OpenAI Chat: gpt-4-1106-preview OpenAI Chat: gpt-4-0125-preview OpenAI Chat: gpt-4-turbo-2024-04-09 OpenAI Chat: gpt-4-turbo (aliases: gpt-4-turbo-preview, 4-turbo, 4t) +OpenAI Chat: o1 +OpenAI Chat: o1-2024-12-17 OpenAI Chat: o1-preview OpenAI Chat: o1-mini OpenAI Completion: gpt-3.5-turbo-instruct (aliases: 3.5-instruct, chatgpt-instruct) diff --git a/docs/usage.md b/docs/usage.md index 01a2840e..adfea886 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -471,6 +471,32 @@ OpenAI Chat: gpt-4-turbo (aliases: gpt-4-turbo-preview, 4-turbo, 4t) logit_bias: dict, str seed: int json_object: boolean +OpenAI Chat: o1 + Options: + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean + Attachment types: + image/gif, image/jpeg, image/png, image/webp +OpenAI Chat: o1-2024-12-17 + Options: + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean + Attachment types: + image/gif, image/jpeg, image/png, image/webp OpenAI Chat: o1-preview Options: temperature: float diff 
--git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index 41b6866a..27ee61dc 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -69,6 +69,12 @@ def register_models(register): aliases=("gpt-4-turbo-preview", "4-turbo", "4t"), ) # o1 + for model_id in ("o1", "o1-2024-12-17"): + register( + Chat(model_id, vision=True), + AsyncChat(model_id, vision=True), + ) + register( Chat("o1-preview", allows_system_prompt=False), AsyncChat("o1-preview", allows_system_prompt=False), From 38a7366d8e2f5f10492e5acf56e6a71f97a9a24c Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Fri, 10 Jan 2025 16:03:09 -0800 Subject: [PATCH 119/149] o1 cannot stream https://github.com/simonw/llm/issues/676#issuecomment-2584932453 --- llm/default_plugins/openai_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index 27ee61dc..0ec9dae2 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -71,8 +71,8 @@ def register_models(register): # o1 for model_id in ("o1", "o1-2024-12-17"): register( - Chat(model_id, vision=True), - AsyncChat(model_id, vision=True), + Chat(model_id, vision=True, can_stream=False), + AsyncChat(model_id, vision=True, can_stream=False), ) register( From 73043ec406797295ab947b63264a42228dd1cee2 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Fri, 10 Jan 2025 16:05:29 -0800 Subject: [PATCH 120/149] Fixed mypy complaint --- llm/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/models.py b/llm/models.py index fe0faed1..2bed85e7 100644 --- a/llm/models.py +++ b/llm/models.py @@ -759,7 +759,7 @@ class ModelWithAliases: def matches(self, query: str) -> bool: query = query.lower() - all_strings = [] + all_strings: List[str] = [] all_strings.extend(self.aliases) if self.model: all_strings.append(str(self.model)) From 4f4f9bc07dc3de941e483c76dd5f4706c02df843 Mon Sep 17 00:00:00 2001 From: Arjan Mossel Date: Sat, 11 Jan 2025 01:41:21 +0100 Subject: [PATCH 121/149] Add llm-venice to plugin directory (#699) --- docs/plugins/directory.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/plugins/directory.md b/docs/plugins/directory.md index 4da2bbf5..01ae3709 100644 --- a/docs/plugins/directory.md +++ b/docs/plugins/directory.md @@ -38,6 +38,7 @@ These plugins can be used to interact with remotely hosted models via their API: - **[llm-bedrock-meta](https://github.com/flabat/llm-bedrock-meta)** by Fabian Labat adds support for Llama 2 and Llama 3 by Meta via Amazon Bedrock. - **[llm-together](https://github.com/wearedevx/llm-together)** adds support for the [Together AI](https://www.together.ai/) extensive family of hosted openly licensed models. - **[llm-lambda-labs](https://github.com/simonw/llm-lambda-labs)** provides access to models hosted by [Lambda Labs](https://docs.lambdalabs.com/public-cloud/lambda-chat-api/), including the Nous Hermes 3 series. +- **[llm-venice](https://github.com/ar-jan/llm-venice)** provides access to uncensored models hosted by privacy-focused [Venice AI](https://docs.venice.ai/), including Llama 3.1 405B. If an API model host provides an OpenAI-compatible API you can also [configure LLM to talk to it](https://llm.datasette.io/en/stable/other-models.html#openai-compatible-models) without needing an extra plugin. 
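The o1 patches above register the model with `vision=True` and `can_stream=False`. A minimal sketch (not part of the patch series) of how that combination surfaces through the Python API, assuming an OpenAI key is already configured and using a placeholder image path:

```python
import llm

# o1 is registered above with vision=True and can_stream=False
model = llm.get_model("o1")
print(model.attachment_types)  # image/gif, image/jpeg, image/png, image/webp

# "photo.jpg" is an illustrative placeholder; any supported image type works
response = model.prompt(
    "Describe this image",
    attachments=[llm.Attachment(path="photo.jpg")],
)
# With can_stream=False the text arrives in a single chunk rather than streaming
print(response.text())
```
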
From 1c61b5adddaf2a529d3ad682c23e1762b12786a9 Mon Sep 17 00:00:00 2001 From: watany <76135106+watany-dev@users.noreply.github.com> Date: Sat, 11 Jan 2025 09:42:39 +0900 Subject: [PATCH 122/149] doc(plugin): adding AmazonBedrock (#698) --- docs/plugins/directory.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/plugins/directory.md b/docs/plugins/directory.md index 01ae3709..c97a0383 100644 --- a/docs/plugins/directory.md +++ b/docs/plugins/directory.md @@ -34,6 +34,7 @@ These plugins can be used to interact with remotely hosted models via their API: - **[llm-palm](https://github.com/simonw/llm-palm)** adds support for Google's [PaLM 2 model](https://developers.generativeai.google/). - **[llm-openrouter](https://github.com/simonw/llm-openrouter)** provides access to models hosted on [OpenRouter](https://openrouter.ai/). - **[llm-cohere](https://github.com/Accudio/llm-cohere)** by Alistair Shepherd provides `cohere-generate` and `cohere-summarize` API models, powered by [Cohere](https://cohere.com/). +- **[llm-bedrock](https://github.com/simonw/llm-bedrock)** adds support for Nova by Amazon via Amazon Bedrock. - **[llm-bedrock-anthropic](https://github.com/sblakey/llm-bedrock-anthropic)** by Sean Blakey adds support for Claude and Claude Instant by Anthropic via Amazon Bedrock. - **[llm-bedrock-meta](https://github.com/flabat/llm-bedrock-meta)** by Fabian Labat adds support for Llama 2 and Llama 3 by Meta via Amazon Bedrock. - **[llm-together](https://github.com/wearedevx/llm-together)** adds support for the [Together AI](https://www.together.ai/) extensive family of hosted openly licensed models. From d964d02e906e392de8ffb27e5cbfa03958a6676b Mon Sep 17 00:00:00 2001 From: Ariel Marcus Date: Sat, 11 Jan 2025 12:57:10 -0500 Subject: [PATCH 123/149] Add installation docs with uv (#690) --- docs/index.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/index.md b/docs/index.md index 9b0ad47e..90993a48 100644 --- a/docs/index.md +++ b/docs/index.md @@ -41,6 +41,10 @@ Or with [pipx](https://pypa.github.io/pipx/): ```bash pipx install llm ``` +Or with [uv](https://docs.astral.sh/uv/guides/tools/) +```bash +uv tool install llm +``` If you have an [OpenAI API key](https://platform.openai.com/api-keys) key you can run this: ```bash # Paste your OpenAI API key into this From 1d75792f9b74eb75be8e4cd7717d6795ee6ef06a Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sat, 11 Jan 2025 10:06:32 -0800 Subject: [PATCH 124/149] More uv/uvx tips, closes #702 Refs #690 --- docs/setup.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/docs/setup.md b/docs/setup.md index 3fb20c87..90bcdc3d 100644 --- a/docs/setup.md +++ b/docs/setup.md @@ -10,6 +10,10 @@ Or using [pipx](https://pypa.github.io/pipx/): ```bash pipx install llm ``` +Or using [uv](https://docs.astral.sh/uv/guides/tools/) ({ref}`more tips below `): +```bash +uv tool install llm +``` Or using [Homebrew](https://brew.sh/) (see {ref}`warning note `): ```bash brew install llm @@ -34,6 +38,28 @@ If the latest version is not yet available on Homebrew you can upgrade like this llm install -U llm ``` +(setup-uvx)= +## Using uvx + +If you have [uv](https://docs.astral.sh/uv/) installed you can also use the `uvx` command to try LLM without first installing it like this: + +```bash +export OPENAI_API_KEY='sx-...' +uvx llm 'fun facts about skunks' +``` +This will install and run LLM using a temporary virtual environment. 
+ +You can use the `--with` option to add extra plugins. To use Anthropic's models, for example: +```bash +export ANTHROPIC_API_KEY='...' +uvx --with llm-claude-3 llm -m claude-3.5-haiku 'fun facts about skunks' +``` +All of the usual LLM commands will work with `uvx llm`. Here's how to set your OpenAI key without needing an environment variable for example: +```bash +uvx llm keys set openai +# Paste key here +``` + (homebrew-warning)= ## A note about Homebrew and PyTorch From e3c104b13627fc41831ba8e85dbcec0fe70e3316 Mon Sep 17 00:00:00 2001 From: Amjith Ramanujam Date: Sat, 11 Jan 2025 12:04:39 -0800 Subject: [PATCH 125/149] Show the default model when listing all available models. (#688) --- docs/usage.md | 1 + llm/cli.py | 1 + 2 files changed, 2 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index adfea886..3420c274 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -550,6 +550,7 @@ OpenAI Completion: gpt-3.5-turbo-instruct (aliases: 3.5-instruct, chatgpt-instru Integer seed to attempt to sample deterministically logprobs: int Include the log probabilities of most likely N per token +Default: gpt-4o-mini ``` diff --git a/llm/cli.py b/llm/cli.py index 86954abb..4656638a 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -1136,6 +1136,7 @@ def models_list(options, async_, query): ) output += "\n Attachment types:\n{}".format(wrapper.fill(attachment_types)) click.echo(output) + click.echo(f"Default: {get_default_model()}") @models.command(name="default") From 2b6b00641c2a42da53097589dd06705583b78d6d Mon Sep 17 00:00:00 2001 From: Steven Weaver Date: Sat, 11 Jan 2025 13:05:05 -0700 Subject: [PATCH 126/149] Update tutorial-model-plugin.md (#685) pydantic.org -> pydantic.dev --- docs/plugins/tutorial-model-plugin.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/plugins/tutorial-model-plugin.md b/docs/plugins/tutorial-model-plugin.md index 6f1bcbbc..916437dd 100644 --- a/docs/plugins/tutorial-model-plugin.md +++ b/docs/plugins/tutorial-model-plugin.md @@ -344,7 +344,7 @@ class Markov(Model): ``` Let's add extra validation rules to our options. Length must be at least 2. Duration must be between 0 and 10. -The `Options` class uses [Pydantic 2](https://pydantic.org/), which can support all sorts of advanced validation rules. +The `Options` class uses [Pydantic 2](https://pydantic.dev/), which can support all sorts of advanced validation rules. We can also add inline documentation, which can then be displayed by the `llm models --options` command. From e1388b27fe2cad04a407806c98548ce32ccb94b2 Mon Sep 17 00:00:00 2001 From: abrasumente Date: Sun, 12 Jan 2025 10:56:34 +0800 Subject: [PATCH 127/149] Add `llm-deepseek` plugin (#517) --- docs/plugins/directory.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/plugins/directory.md b/docs/plugins/directory.md index c97a0383..9974e466 100644 --- a/docs/plugins/directory.md +++ b/docs/plugins/directory.md @@ -38,6 +38,7 @@ These plugins can be used to interact with remotely hosted models via their API: - **[llm-bedrock-anthropic](https://github.com/sblakey/llm-bedrock-anthropic)** by Sean Blakey adds support for Claude and Claude Instant by Anthropic via Amazon Bedrock. - **[llm-bedrock-meta](https://github.com/flabat/llm-bedrock-meta)** by Fabian Labat adds support for Llama 2 and Llama 3 by Meta via Amazon Bedrock. - **[llm-together](https://github.com/wearedevx/llm-together)** adds support for the [Together AI](https://www.together.ai/) extensive family of hosted openly licensed models. 
+- **[llm-deepseek](https://github.com/abrasumente233/llm-deepseek)** adds support for the [DeepSeek](https://deepseek.com)'s DeepSeek-Chat and DeepSeek-Coder models. - **[llm-lambda-labs](https://github.com/simonw/llm-lambda-labs)** provides access to models hosted by [Lambda Labs](https://docs.lambdalabs.com/public-cloud/lambda-chat-api/), including the Nous Hermes 3 series. - **[llm-venice](https://github.com/ar-jan/llm-venice)** provides access to uncensored models hosted by privacy-focused [Venice AI](https://docs.venice.ai/), including Llama 3.1 405B. From 64179fa9e0c683848f4496af50549a50dd77e55d Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sat, 18 Jan 2025 14:10:55 -0800 Subject: [PATCH 128/149] Use openai>=1.55.3 for issue #709 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ca9bf8ac..e2a1aa64 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ def get_long_description(): """, install_requires=[ "click", - "openai>=1.0", + "openai>=1.55.3", "click-default-group>=1.2.3", "sqlite-utils>=3.37", "sqlite-migrate>=0.1a2", From f95dd55cda798cdf7ef64d5ad360ec27e2208ac9 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sat, 18 Jan 2025 14:21:43 -0800 Subject: [PATCH 129/149] Make it easier to debug CLI errors in pytest Found this pattern while working on #709 --- llm/cli.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llm/cli.py b/llm/cli.py index 4656638a..a62dcbaf 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -450,7 +450,13 @@ async def inner(): if extract: text = extract_first_fenced_code_block(text) or text print(text) + # List of exceptions that should never be raised in pytest: + except (ValueError, NotImplementedError) as ex: + raise click.ClickException(str(ex)) except Exception as ex: + # All other exceptions should raise in pytest, show to user otherwise + if getattr(sys, "_called_from_test", False): + raise raise click.ClickException(str(ex)) if isinstance(response, AsyncResponse): From 02e59a201e977b8230f623c79106bad918302b78 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sat, 18 Jan 2025 14:24:18 -0800 Subject: [PATCH 130/149] Don't show default model for llm models -q, closes #710 --- llm/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llm/cli.py b/llm/cli.py index a62dcbaf..1bde74bd 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -1142,7 +1142,8 @@ def models_list(options, async_, query): ) output += "\n Attachment types:\n{}".format(wrapper.fill(attachment_types)) click.echo(output) - click.echo(f"Default: {get_default_model()}") + if not query: + click.echo(f"Default: {get_default_model()}") @models.command(name="default") From 59983740e673e31e5aee0508409547a0fb43dd59 Mon Sep 17 00:00:00 2001 From: Ryan Patterson Date: Sun, 19 Jan 2025 06:52:51 +0800 Subject: [PATCH 131/149] Update directory.md (#666) --- docs/plugins/directory.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/plugins/directory.md b/docs/plugins/directory.md index 9974e466..a80b1b0f 100644 --- a/docs/plugins/directory.md +++ b/docs/plugins/directory.md @@ -56,6 +56,7 @@ If an API model host provides an OpenAI-compatible API you can also [configure L ## Extra commands - **[llm-cmd](https://github.com/simonw/llm-cmd)** accepts a prompt for a shell command, runs that prompt and populates the result in your shell so you can review it, edit it and then hit `` to execute or `ctrl+c` to cancel. 
+- **[llm-cmd-comp](https://github.com/CGamesPlay/llm-cmd-comp)** provides a key binding for your shell that will launch a chat to build the command. When ready, hit `` and it will go right back into your shell command line, so you can run it. - **[llm-python](https://github.com/simonw/llm-python)** adds a `llm python` command for running a Python interpreter in the same virtual environment as LLM. This is useful for debugging, and also provides a convenient way to interact with the LLM {ref}`python-api` if you installed LLM using Homebrew or `pipx`. - **[llm-cluster](https://github.com/simonw/llm-cluster)** adds a `llm cluster` command for calculating clusters for a collection of embeddings. Calculated clusters can then be passed to a Large Language Model to generate a summary description. - **[llm-jq](https://github.com/simonw/llm-jq)** lets you pipe in JSON data and a prompt describing a `jq` program, then executes the generated program against the JSON. From 6f7ea406bf7de9186979a0a2a1c13578f6d158aa Mon Sep 17 00:00:00 2001 From: web-sst <36205453+web-sst@users.noreply.github.com> Date: Wed, 22 Jan 2025 22:14:03 -0600 Subject: [PATCH 132/149] Register full embedding model names (#654) Provide backward compatible aliases. This makes available the same model names that ttok uses. --- llm/default_plugins/openai_models.py | 12 ++++++------ tests/test_aliases.py | 4 ++-- tests/test_embed_cli.py | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index 0ec9dae2..a59690f1 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -134,14 +134,14 @@ def register_models(register): @hookimpl def register_embedding_models(register): register( - OpenAIEmbeddingModel("ada-002", "text-embedding-ada-002"), aliases=("ada",) + OpenAIEmbeddingModel("text-embedding-ada-002", "text-embedding-ada-002"), aliases=("ada","ada-002",) ) - register(OpenAIEmbeddingModel("3-small", "text-embedding-3-small")) - register(OpenAIEmbeddingModel("3-large", "text-embedding-3-large")) + register(OpenAIEmbeddingModel("text-embedding-3-small", "text-embedding-3-small"), aliases=("3-small",)) + register(OpenAIEmbeddingModel("text-embedding-3-large", "text-embedding-3-large"), aliases=("3-large",)) # With varying dimensions - register(OpenAIEmbeddingModel("3-small-512", "text-embedding-3-small", 512)) - register(OpenAIEmbeddingModel("3-large-256", "text-embedding-3-large", 256)) - register(OpenAIEmbeddingModel("3-large-1024", "text-embedding-3-large", 1024)) + register(OpenAIEmbeddingModel("text-embedding-3-small-512", "text-embedding-3-small", 512), aliases=("3-small-512",)) + register(OpenAIEmbeddingModel("text-embedding-3-large-256", "text-embedding-3-large", 256), aliases=("3-large-256",)) + register(OpenAIEmbeddingModel("text-embedding-3-large-1024", "text-embedding-3-large", 1024), aliases=("3-large-1024",)) class OpenAIEmbeddingModel(EmbeddingModel): diff --git a/tests/test_aliases.py b/tests/test_aliases.py index dc1483b8..b08871a9 100644 --- a/tests/test_aliases.py +++ b/tests/test_aliases.py @@ -39,7 +39,7 @@ def test_cli_aliases_list(args): "gpt4 : gpt-4\n" "4-32k : gpt-4-32k\n" "e-demo : embed-demo (embedding)\n" - "ada : ada-002 (embedding)\n" + "ada : text-embedding-ada-002 (embedding)\n" ).split("\n"): line = line.strip() if not line: @@ -65,7 +65,7 @@ def test_cli_aliases_list_json(args): "4": "gpt-4", "gpt4": "gpt-4", "4-32k": "gpt-4-32k", - "ada": 
"ada-002", + "ada": "text-embedding-ada-002", "e-demo": "embed-demo", }.items() ) diff --git a/tests/test_embed_cli.py b/tests/test_embed_cli.py index 007aac5d..57462316 100644 --- a/tests/test_embed_cli.py +++ b/tests/test_embed_cli.py @@ -554,7 +554,7 @@ def test_default_embedding_model(): assert result2.exit_code == 0 result3 = runner.invoke(cli, ["embed-models", "default"]) assert result3.exit_code == 0 - assert result3.output == "ada-002\n" + assert result3.output == "text-embedding-ada-002\n" result4 = runner.invoke(cli, ["embed-models", "default", "--remove-default"]) assert result4.exit_code == 0 result5 = runner.invoke(cli, ["embed-models", "default"]) From 57d3baac42cbc25f2e701d85deaa29cc43af4e42 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Wed, 22 Jan 2025 20:34:59 -0800 Subject: [PATCH 133/149] Update embedding model names in docs, refs #654 Also ran Black. --- docs/aliases.md | 8 +++++- llm/default_plugins/openai_models.py | 37 +++++++++++++++++++++++----- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/docs/aliases.md b/docs/aliases.md index 787a3034..3fa5c530 100644 --- a/docs/aliases.md +++ b/docs/aliases.md @@ -33,7 +33,13 @@ gpt-4-turbo-preview : gpt-4-turbo 4t : gpt-4-turbo 3.5-instruct : gpt-3.5-turbo-instruct chatgpt-instruct : gpt-3.5-turbo-instruct -ada : ada-002 (embedding) +ada : text-embedding-ada-002 (embedding) +ada-002 : text-embedding-ada-002 (embedding) +3-small : text-embedding-3-small (embedding) +3-large : text-embedding-3-large (embedding) +3-small-512 : text-embedding-3-small-512 (embedding) +3-large-256 : text-embedding-3-large-256 (embedding) +3-large-1024 : text-embedding-3-large-1024 (embedding) ``` diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index a59690f1..33d0566f 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -134,14 +134,39 @@ def register_models(register): @hookimpl def register_embedding_models(register): register( - OpenAIEmbeddingModel("text-embedding-ada-002", "text-embedding-ada-002"), aliases=("ada","ada-002",) + OpenAIEmbeddingModel("text-embedding-ada-002", "text-embedding-ada-002"), + aliases=( + "ada", + "ada-002", + ), + ) + register( + OpenAIEmbeddingModel("text-embedding-3-small", "text-embedding-3-small"), + aliases=("3-small",), + ) + register( + OpenAIEmbeddingModel("text-embedding-3-large", "text-embedding-3-large"), + aliases=("3-large",), ) - register(OpenAIEmbeddingModel("text-embedding-3-small", "text-embedding-3-small"), aliases=("3-small",)) - register(OpenAIEmbeddingModel("text-embedding-3-large", "text-embedding-3-large"), aliases=("3-large",)) # With varying dimensions - register(OpenAIEmbeddingModel("text-embedding-3-small-512", "text-embedding-3-small", 512), aliases=("3-small-512",)) - register(OpenAIEmbeddingModel("text-embedding-3-large-256", "text-embedding-3-large", 256), aliases=("3-large-256",)) - register(OpenAIEmbeddingModel("text-embedding-3-large-1024", "text-embedding-3-large", 1024), aliases=("3-large-1024",)) + register( + OpenAIEmbeddingModel( + "text-embedding-3-small-512", "text-embedding-3-small", 512 + ), + aliases=("3-small-512",), + ) + register( + OpenAIEmbeddingModel( + "text-embedding-3-large-256", "text-embedding-3-large", 256 + ), + aliases=("3-large-256",), + ) + register( + OpenAIEmbeddingModel( + "text-embedding-3-large-1024", "text-embedding-3-large", 1024 + ), + aliases=("3-large-1024",), + ) class OpenAIEmbeddingModel(EmbeddingModel): From 
dc127d2a87942a15b87f9f2567182c039d16f3b8 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Wed, 22 Jan 2025 20:36:10 -0800 Subject: [PATCH 134/149] Release 0.20 Refs #654, #676, #677, #681, #688, #690, #700, #702, #709 --- docs/changelog.md | 16 ++++++++++++++++ setup.py | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index dede4116..5295d8ad 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,21 @@ # Changelog +(v0_20)= +## 0.20 (2025-01-22) + +- New model, `o1`. This model does not yet support streaming. [#676](https://github.com/simonw/lm/issues/676) +- `o1-preview` and `o1-mini` models now support streaming. +- New model, `gpt-4o-audio-preview`. [#677](https://github.com/simonw/llm/issues/677) +- `llm prompt -x/--extract` option, which returns just the content of the first fenced code block in the response. Try `llm prompt -x 'Python function to reverse a string'`. [#681](https://github.com/simonw/llm/issues/681) + - Creating a template using `llm ... --save x` now supports the `-x/--extract` option, which is saved to the template. YAML templates can set this option using `extract: true`. + - New `llm logs -x/--extract` option extracts the first fenced code block from matching logged responses. +- New `llm models -q 'search'` option returning models that case-insensitively match the search query. [#700](https://github.com/simonw/llm/issues/700) +- Installation documentation now also includes `uv`. Thanks, [Ariel Marcus](https://github.com/ajmarcus). [#690](https://github.com/simonw/llm/pull/690) and [#702](https://github.com/simonw/llm/issues/702) +- `llm models` command now shows the current default model at the bottom of the listing. Thanks, [Amjith Ramanujam](https://github.com/amjith). [#688](https://github.com/simonw/llm/pull/688) +- {ref}`Plugin directory ` now includes `llm-venice`, `llm-bedrock`, `llm-deepseek` and `llm-cmd-comp`. +- Fixed bug where some dependency version combinations could cause a `Client.__init__() got an unexpected keyword argument 'proxies'` error. [#709](https://github.com/simonw/llm/issues/709) +- OpenAI embedding models are now available using their full names of `text-embedding-ada-002`, `text-embedding-3-small` and `text-embedding-3-large` - the previous names are still supported as aliases. Thanks, [web-sst](https://github.com/web-sst). [#654](https://github.com/simonw/llm/pull/654) + (v0_19_1)= ## 0.19.1 (2024-12-05) diff --git a/setup.py b/setup.py index e2a1aa64..26bbec6d 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages import os -VERSION = "0.19.1" +VERSION = "0.20" def get_long_description(): From 2b9a1bbc50ce28da70ae17ec0bead76a19a7fd9b Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Wed, 22 Jan 2025 20:39:01 -0800 Subject: [PATCH 135/149] Fixed broken link --- docs/changelog.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index 5295d8ad..6a2ecd66 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -3,7 +3,7 @@ (v0_20)= ## 0.20 (2025-01-22) -- New model, `o1`. This model does not yet support streaming. [#676](https://github.com/simonw/lm/issues/676) +- New model, `o1`. This model does not yet support streaming. [#676](https://github.com/simonw/llm/issues/676) - `o1-preview` and `o1-mini` models now support streaming. - New model, `gpt-4o-audio-preview`. 
[#677](https://github.com/simonw/llm/issues/677) - `llm prompt -x/--extract` option, which returns just the content of the first fenced code block in the response. Try `llm prompt -x 'Python function to reverse a string'`. [#681](https://github.com/simonw/llm/issues/681) From eb996baeab3c6a848ea48db0597c8e980660eaa7 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Wed, 22 Jan 2025 20:46:28 -0800 Subject: [PATCH 136/149] Documentation for model.attachment_types, closes #705 --- docs/python-api.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/python-api.md b/docs/python-api.md index d261baf8..49aff02d 100644 --- a/docs/python-api.md +++ b/docs/python-api.md @@ -71,6 +71,18 @@ response = model.prompt( ``` Use `llm.Attachment(content=b"binary image content here")` to pass binary content directly. +You can check which attachment types (if any) a model supports using the `model.attachment_types` set: + +```python +model = llm.get_model("gpt-4o-mini") +print(model.attachment_types) +# {'image/gif', 'image/png', 'image/jpeg', 'image/webp'} + +if "image/jpeg" in model.attachment_types: + # Use a JPEG attachment here + ... +``` + ### Model options For models that support options (view those with `llm models --options`) you can pass options as keyword arguments to the `.prompt()` method: From bf10f63d3d1f1053debe28d83d11a54f994342a7 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Wed, 22 Jan 2025 21:06:12 -0800 Subject: [PATCH 137/149] Mention gpt-4o-mini-audio-preview too #677 !stable-docs --- docs/changelog.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index 6a2ecd66..298b2f03 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -5,7 +5,7 @@ - New model, `o1`. This model does not yet support streaming. [#676](https://github.com/simonw/llm/issues/676) - `o1-preview` and `o1-mini` models now support streaming. -- New model, `gpt-4o-audio-preview`. [#677](https://github.com/simonw/llm/issues/677) +- New models, `gpt-4o-audio-preview` and `gpt-4o-mini-audio-preview`. [#677](https://github.com/simonw/llm/issues/677) - `llm prompt -x/--extract` option, which returns just the content of the first fenced code block in the response. Try `llm prompt -x 'Python function to reverse a string'`. [#681](https://github.com/simonw/llm/issues/681) - Creating a template using `llm ... --save x` now supports the `-x/--extract` option, which is saved to the template. YAML templates can set this option using `extract: true`. - New `llm logs -x/--extract` option extracts the first fenced code block from matching logged responses. 
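Combined with the `model.attachment_types` documentation added above, the new audio-preview models can be guarded the same way. A hedged sketch — the file name is a placeholder, and the advertised MIME types are whatever the model registers at runtime:

```python
import llm

model = llm.get_model("gpt-4o-audio-preview")

# MP3 content is detected as audio/mpeg rather than audio/mp3
if "audio/mpeg" in model.attachment_types:
    response = model.prompt(
        "Transcribe this recording",
        attachments=[llm.Attachment(path="meeting.mp3")],
    )
    print(response.text())
else:
    print("This model does not advertise MP3 attachment support")
```
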
From 3e88628602b8eaf724c67df6b3ae743804ce9047 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Wed, 22 Jan 2025 21:08:09 -0800 Subject: [PATCH 138/149] uv tool upgrade llm, refs #702 !stable-docs --- docs/setup.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/setup.md b/docs/setup.md index 90bcdc3d..9218c503 100644 --- a/docs/setup.md +++ b/docs/setup.md @@ -29,6 +29,10 @@ For `pipx`: ```bash pipx upgrade llm ``` +For `uv`: +```bash +uv tool upgrade llm +``` For Homebrew: ```bash brew upgrade llm From e449fd4f4654b169e00511ebcf6de924a03c1786 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Wed, 22 Jan 2025 22:17:07 -0800 Subject: [PATCH 139/149] Typo fix !stable-docs --- docs/plugins/advanced-model-plugins.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/plugins/advanced-model-plugins.md b/docs/plugins/advanced-model-plugins.md index 9342d355..f00ffa95 100644 --- a/docs/plugins/advanced-model-plugins.md +++ b/docs/plugins/advanced-model-plugins.md @@ -79,7 +79,7 @@ class NewModel(llm.Model): ``` These content types are detected when an attachment is passed to LLM using `llm -a filename`, or can be specified by the user using the `--attachment-type filename image/png` option. -**Note:** *MP3 files will have their attachment type detected as `audio/mpeg`, not `audio/mp3`. +**Note:** MP3 files will have their attachment type detected as `audio/mpeg`, not `audio/mp3`. LLM will use the `attachment_types` attribute to validate that provided attachments should be accepted before passing them to the model. From 656d8fa3c46f5babf8ee02e57fb0a4060e2da817 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Fri, 24 Jan 2025 10:52:46 -0800 Subject: [PATCH 140/149] --xl/--extract-last flag for prompt and log list commands (#718) Closes #717 --- docs/help.md | 2 ++ docs/logging.md | 6 +++++- docs/usage.md | 2 ++ llm/cli.py | 37 +++++++++++++++++++++++++++++-------- llm/templates.py | 3 ++- llm/utils.py | 17 ++++++++++------- tests/test_llm.py | 19 +++++++++++++++++++ tests/test_utils.py | 31 ++++++++++++++++++++++++++----- 8 files changed, 95 insertions(+), 22 deletions(-) diff --git a/docs/help.md b/docs/help.md index 8dcd88f3..75e9371b 100644 --- a/docs/help.md +++ b/docs/help.md @@ -129,6 +129,7 @@ Options: --async Run prompt asynchronously -u, --usage Show token usage -x, --extract Extract first fenced code block + --xl, --extract-last Extract last fenced code block --help Show this message and exit. ``` @@ -302,6 +303,7 @@ Options: -u, --usage Include token usage -r, --response Just output the last response -x, --extract Extract first fenced code block + --xl, --extract-last Extract last fenced code block -c, --current Show logs from the current conversation --cid, --conversation TEXT Show logs for this conversation ID --json Output logs as JSON diff --git a/docs/logging.md b/docs/logging.md index 508f22b5..3f0926a5 100644 --- a/docs/logging.md +++ b/docs/logging.md @@ -64,7 +64,11 @@ llm logs -r Use `-x/--extract` to extract and return the first fenced code block from the selected log entries: ```bash -llm logs -x +llm logs --extract +``` +Or `--xl/--extract-last` for the last fenced code block: +```bash +llm logs --extract-last ``` Add `--json` to get the log messages in JSON instead: diff --git a/docs/usage.md b/docs/usage.md index 3420c274..b2847ae8 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -61,6 +61,8 @@ def my_function(): ```` It will extract and returns just the content of that block, excluding the fenced coded delimiters. 
If there are no fenced code blocks it will return the full response. +Use `--xl/--extract-last` to return the last fenced code block instead of the first. + The entire response including explanatory text is still logged to the database, and can be viewed using `llm logs -c`. (usage-attachments)= diff --git a/llm/cli.py b/llm/cli.py index 1bde74bd..3f5fffb1 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -37,7 +37,7 @@ mimetype_from_path, mimetype_from_string, token_usage_string, - extract_first_fenced_code_block, + extract_fenced_code_block, ) import base64 import httpx @@ -210,6 +210,13 @@ def cli(): @click.option("async_", "--async", is_flag=True, help="Run prompt asynchronously") @click.option("-u", "--usage", is_flag=True, help="Show token usage") @click.option("-x", "--extract", is_flag=True, help="Extract first fenced code block") +@click.option( + "extract_last", + "--xl", + "--extract-last", + is_flag=True, + help="Extract last fenced code block", +) def prompt( prompt, system, @@ -229,6 +236,7 @@ def prompt( async_, usage, extract, + extract_last, ): """ Execute a prompt @@ -318,6 +326,8 @@ def read_prompt(): to_save["defaults"] = dict(param) if extract: to_save["extract"] = True + if extract_last: + to_save["extract_last"] = True path.write_text( yaml.dump( to_save, @@ -335,6 +345,7 @@ def read_prompt(): raise click.ClickException("Cannot use -t/--template and --system together") template_obj = load_template(template) extract = template_obj.extract + extract_last = template_obj.extract_last prompt = read_prompt() try: prompt, system = template_obj.evaluate(prompt, params) @@ -343,7 +354,7 @@ def read_prompt(): if model_id is None and template_obj.model: model_id = template_obj.model - if extract: + if extract or extract_last: no_stream = True conversation = None @@ -427,8 +438,10 @@ async def inner(): **validated_options, ) text = await response.text() - if extract: - text = extract_first_fenced_code_block(text) or text + if extract or extract_last: + text = ( + extract_fenced_code_block(text, last=extract_last) or text + ) print(text) return response @@ -447,8 +460,8 @@ async def inner(): print("") else: text = response.text() - if extract: - text = extract_first_fenced_code_block(text) or text + if extract or extract_last: + text = extract_fenced_code_block(text, last=extract_last) or text print(text) # List of exceptions that should never be raised in pytest: except (ValueError, NotImplementedError) as ex: @@ -862,6 +875,13 @@ def logs_turn_off(): @click.option("-u", "--usage", is_flag=True, help="Include token usage") @click.option("-r", "--response", is_flag=True, help="Just output the last response") @click.option("-x", "--extract", is_flag=True, help="Extract first fenced code block") +@click.option( + "extract_last", + "--xl", + "--extract-last", + is_flag=True, + help="Extract last fenced code block", +) @click.option( "current_conversation", "-c", @@ -891,6 +911,7 @@ def logs_list( usage, response, extract, + extract_last, current_conversation, conversation_id, json_output, @@ -996,10 +1017,10 @@ def logs_list( for attachment in attachments_by_id.get(row["id"], []) ] output = json.dumps(list(rows), indent=2) - elif extract: + elif extract or extract_last: # Extract and return first code block for row in rows: - output = extract_first_fenced_code_block(row["response"]) + output = extract_fenced_code_block(row["response"], last=extract_last) if output is not None: break elif response: diff --git a/llm/templates.py b/llm/templates.py index b540fad1..0cf16162 100644 --- 
a/llm/templates.py +++ b/llm/templates.py @@ -9,8 +9,9 @@ class Template(BaseModel): system: Optional[str] = None model: Optional[str] = None defaults: Optional[Dict[str, Any]] = None - # Should first fenced code block be extracted? + # Should a fenced code block be extracted? extract: Optional[bool] = None + extract_last: Optional[bool] = None class Config: extra = "forbid" diff --git a/llm/utils.py b/llm/utils.py index a4f57a0e..a7170dd5 100644 --- a/llm/utils.py +++ b/llm/utils.py @@ -156,9 +156,9 @@ def token_usage_string(input_tokens, output_tokens, token_details) -> str: return ", ".join(bits) -def extract_first_fenced_code_block(text: str) -> Optional[str]: +def extract_fenced_code_block(text: str, last: bool = False) -> Optional[str]: """ - Extracts and returns the first Markdown fenced code block found in the given text. + Extracts and returns Markdown fenced code block found in the given text. The function handles fenced code blocks that: - Use at least three backticks (`). @@ -169,9 +169,10 @@ def extract_first_fenced_code_block(text: str) -> Optional[str]: Args: text (str): The input text to search for a fenced code block. + last (bool): Extract the last code block if True, otherwise the first. Returns: - Optional[str]: The content of the first fenced code block, or None if not found. + Optional[str]: The content of the fenced code block, or None if not found. """ # Regex pattern to match fenced code blocks # - ^ or \n ensures that the fence is at the start of a line @@ -179,13 +180,15 @@ def extract_first_fenced_code_block(text: str) -> Optional[str]: # - (\w+)? optionally captures the language tag # - \n matches the newline after the opening fence # - (.*?) non-greedy match for the code block content - # - \1 ensures that the closing fence has the same number of backticks + # - (?P=fence) ensures that the closing fence has the same number of backticks + # - [ ]* allows for optional spaces between the closing fence and newline # - (?=\n|$) ensures that the closing fence is followed by a newline or end of string pattern = re.compile( - r"""(?m)^(?P`{3,})(?P\w+)?\n(?P.*?)^(?P=fence)(?=\n|$)""", + r"""(?m)^(?P`{3,})(?P\w+)?\n(?P.*?)^(?P=fence)[ ]*(?=\n|$)""", re.DOTALL, ) - match = pattern.search(text) - if match: + matches = list(pattern.finditer(text)) + if matches: + match = matches[-1] if last else matches[0] return match.group("code") return None diff --git a/tests/test_llm.py b/tests/test_llm.py index c1224309..53ec90dc 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -145,6 +145,25 @@ def test_logs_extract_first_code(args, log_path): assert result.output == 'print("hello word")\n\n' +@pytest.mark.parametrize( + "args", + ( + ["--xl"], + ["--extract-last"], + ["list", "--xl"], + ["list", "--extract-last"], + ["--xl", "-r"], + ["-x", "--xl"], + ), +) +def test_logs_extract_last_code(args, log_path): + "Test that logs --xl/--extract-last returns the last code block" + runner = CliRunner() + result = runner.invoke(cli, ["logs"] + args, catch_exceptions=False) + assert result.exit_code == 0 + assert result.output == 'print("hello word")\n\n' + + @pytest.mark.xfail(sys.platform == "win32", reason="Expected to fail on Windows") @pytest.mark.parametrize("env", ({}, {"LLM_USER_PATH": "/tmp/llm-user-path"})) def test_logs_path(monkeypatch, env, user_path): diff --git a/tests/test_utils.py b/tests/test_utils.py index 783e5892..ef4b8058 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,5 +1,5 @@ import pytest -from llm.utils import simplify_usage_dict, 
extract_first_fenced_code_block +from llm.utils import simplify_usage_dict, extract_fenced_code_block @pytest.mark.parametrize( @@ -43,40 +43,61 @@ def test_simplify_usage_dict(input_data, expected_output): @pytest.mark.parametrize( - "input,expected", + "input,last,expected", [ - ["This is a sample text without any code blocks.", None], + ["This is a sample text without any code blocks.", False, None], [ "Here is some text.\n\n```\ndef foo():\n return 'bar'\n```\n\nMore text.", + False, "def foo():\n return 'bar'\n", ], [ "Here is some text.\n\n```python\ndef foo():\n return 'bar'\n```\n\nMore text.", + False, "def foo():\n return 'bar'\n", ], [ "Here is some text.\n\n````\ndef foo():\n return 'bar'\n````\n\nMore text.", + False, "def foo():\n return 'bar'\n", ], [ "Here is some text.\n\n````javascript\nfunction foo() {\n return 'bar';\n}\n````\n\nMore text.", + False, "function foo() {\n return 'bar';\n}\n", ], [ "Here is some text.\n\n```python\ndef foo():\n return 'bar'\n````\n\nMore text.", + False, None, ], [ "First code block:\n\n```python\ndef foo():\n return 'bar'\n```\n\n" "Second code block:\n\n```javascript\nfunction foo() {\n return 'bar';\n}\n```", + False, "def foo():\n return 'bar'\n", ], + [ + "First code block:\n\n```python\ndef foo():\n return 'bar'\n```\n\n" + "Second code block:\n\n```javascript\nfunction foo() {\n return 'bar';\n}\n```", + True, + "function foo() {\n return 'bar';\n}\n", + ], + [ + "First code block:\n\n```python\ndef foo():\n return 'bar'\n```\n\n" + # This one has trailing whitespace after the second code block: + # https://github.com/simonw/llm/pull/718#issuecomment-2613177036 + "Second code block:\n\n```javascript\nfunction foo() {\n return 'bar';\n}\n``` ", + True, + "function foo() {\n return 'bar';\n}\n", + ], [ "Here is some text.\n\n```python\ndef foo():\n return `bar`\n```\n\nMore text.", + False, "def foo():\n return `bar`\n", ], ], ) -def test_extract_first_fenced_code_block(input, expected): - actual = extract_first_fenced_code_block(input) +def test_extract_fenced_code_block(input, last, expected): + actual = extract_fenced_code_block(input, last=last) assert actual == expected From eb0e1e761bfd5ae78a665fa98de736075ce49032 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Fri, 31 Jan 2025 12:14:02 -0800 Subject: [PATCH 141/149] o3-mini and reasoning_effort option, refs #728 --- docs/openai-models.md | 1 + docs/usage.md | 14 ++++++++++++ llm/cli.py | 4 ++-- llm/default_plugins/openai_models.py | 34 ++++++++++++++++++++++++++-- 4 files changed, 49 insertions(+), 4 deletions(-) diff --git a/docs/openai-models.md b/docs/openai-models.md index 9d7f2a32..7201bffa 100644 --- a/docs/openai-models.md +++ b/docs/openai-models.md @@ -50,6 +50,7 @@ OpenAI Chat: o1 OpenAI Chat: o1-2024-12-17 OpenAI Chat: o1-preview OpenAI Chat: o1-mini +OpenAI Chat: o3-mini OpenAI Completion: gpt-3.5-turbo-instruct (aliases: 3.5-instruct, chatgpt-instruct) ``` diff --git a/docs/usage.md b/docs/usage.md index b2847ae8..2d6c55a2 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -484,6 +484,7 @@ OpenAI Chat: o1 logit_bias: dict, str seed: int json_object: boolean + reasoning_effort: str Attachment types: image/gif, image/jpeg, image/png, image/webp OpenAI Chat: o1-2024-12-17 @@ -497,6 +498,7 @@ OpenAI Chat: o1-2024-12-17 logit_bias: dict, str seed: int json_object: boolean + reasoning_effort: str Attachment types: image/gif, image/jpeg, image/png, image/webp OpenAI Chat: o1-preview @@ -521,6 +523,18 @@ OpenAI Chat: o1-mini logit_bias: dict, str seed: int 
json_object: boolean +OpenAI Chat: o3-mini + Options: + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + stop: str + logit_bias: dict, str + seed: int + json_object: boolean + reasoning_effort: str OpenAI Completion: gpt-3.5-turbo-instruct (aliases: 3.5-instruct, chatgpt-instruct) Options: temperature: float diff --git a/llm/cli.py b/llm/cli.py index 3f5fffb1..bd607796 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -1139,9 +1139,9 @@ def models_list(options, async_, query): any_of = [{"type": field["type"]}] types = ", ".join( [ - _type_lookup.get(item["type"], item["type"]) + _type_lookup.get(item.get("type"), item.get("type", "str")) for item in any_of - if item["type"] != "null" + if item.get("type") != "null" ] ) bits = ["\n ", name, ": ", types] diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py index 33d0566f..0a9dab26 100644 --- a/llm/default_plugins/openai_models.py +++ b/llm/default_plugins/openai_models.py @@ -8,6 +8,7 @@ ) import click import datetime +from enum import Enum import httpx import openai import os @@ -71,8 +72,8 @@ def register_models(register): # o1 for model_id in ("o1", "o1-2024-12-17"): register( - Chat(model_id, vision=True, can_stream=False), - AsyncChat(model_id, vision=True, can_stream=False), + Chat(model_id, vision=True, can_stream=False, reasoning=True), + AsyncChat(model_id, vision=True, can_stream=False, reasoning=True), ) register( @@ -83,6 +84,10 @@ def register_models(register): Chat("o1-mini", allows_system_prompt=False), AsyncChat("o1-mini", allows_system_prompt=False), ) + register( + Chat("o3-mini", reasoning=True), + AsyncChat("o3-mini", reasoning=True), + ) # The -instruct completion model register( Completion("gpt-3.5-turbo-instruct", default_max_tokens=256), @@ -322,6 +327,27 @@ def validate_logit_bias(cls, logit_bias): return validated_logit_bias +class ReasoningEffortEnum(str, Enum): + low = "low" + medium = "medium" + high = "high" + + +class OptionsForReasoning(SharedOptions): + json_object: Optional[bool] = Field( + description="Output a valid JSON object {...}. Prompt must mention JSON.", + default=None, + ) + reasoning_effort: Optional[ReasoningEffortEnum] = Field( + description=( + "Constraints effort on reasoning for reasoning models. Currently supported " + "values are low, medium, and high. Reducing reasoning effort can result in " + "faster responses and fewer tokens used on reasoning in a response." 
+ ), + default=None, + ) + + def _attachment(attachment): url = attachment.url base64_content = "" @@ -355,6 +381,7 @@ def __init__( can_stream=True, vision=False, audio=False, + reasoning=False, allows_system_prompt=True, ): self.model_id = model_id @@ -371,6 +398,9 @@ def __init__( self.attachment_types = set() + if reasoning: + self.Options = OptionsForReasoning + if vision: self.attachment_types.update( { From 965ad819f97c1c96918c9e699b13587b4ce726ae Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Fri, 31 Jan 2025 12:18:33 -0800 Subject: [PATCH 142/149] Fix for tests with pydantic<2, refs #728 --- llm/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/cli.py b/llm/cli.py index bd607796..949b5896 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -1136,7 +1136,7 @@ def models_list(options, async_, query): for name, field in model.Options.schema()["properties"].items(): any_of = field.get("anyOf") if any_of is None: - any_of = [{"type": field["type"]}] + any_of = [{"type": field.get("type", "str")}] types = ", ".join( [ _type_lookup.get(item.get("type"), item.get("type", "str")) From 4c153ce67500cfa46d9ecb130ea1ee92a692c309 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Fri, 31 Jan 2025 12:32:08 -0800 Subject: [PATCH 143/149] Pin Black to get tests to pass, refs #728 See https://github.com/psf/black/issues/4571 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 26bbec6d..c063be6d 100644 --- a/setup.py +++ b/setup.py @@ -58,7 +58,7 @@ def get_long_description(): "pytest-asyncio", "cogapp", "mypy>=1.10.0", - "black>=24.1.0", + "black==24.10.0", "ruff", "types-click", "types-PyYAML", From f8dcc674555d6783e4fd347c44ae6e5881f255ca Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Fri, 31 Jan 2025 12:35:10 -0800 Subject: [PATCH 144/149] Release 0.21 Refs #717, #728 --- docs/changelog.md | 7 +++++++ setup.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index 298b2f03..94124fc5 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,12 @@ # Changelog +(v0_21)= +## 0.21 (2025-01-31) + +- New model: `o3-mini`. [#728](https://github.com/simonw/llm/issues/728) +- The `o3-mini` and `o1` models now support a `reasoning_effort` option which can be set to `low`, `medium` or `high`. +- `llm prompt` and `llm logs` now have a `--xl/--extract-last` option for extracting the last fenced code block in the response - a complement to the existing `--x/--extract` option. 
[#717](https://github.com/simonw/llm/issues/717) + (v0_20)= ## 0.20 (2025-01-22) diff --git a/setup.py b/setup.py index c063be6d..1b052381 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages import os -VERSION = "0.20" +VERSION = "0.21" def get_long_description(): From deb8bc3b4f5219583009eeb2c600d0b14c852c78 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Fri, 31 Jan 2025 13:18:41 -0800 Subject: [PATCH 145/149] Upgrade to black>=25.1.0 Refs https://github.com/simonw/llm/issues/728#issuecomment-2628348988 Refs https://github.com/psf/black/issues/4571#issuecomment-2628355450 --- llm/errors.py | 2 -- setup.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/llm/errors.py b/llm/errors.py index 4515ab82..10f50bb5 100644 --- a/llm/errors.py +++ b/llm/errors.py @@ -1,8 +1,6 @@ class ModelError(Exception): "Models can raise this error, which will be displayed to the user" - pass class NeedsKeyException(ModelError): "Model needs an API key which has not been provided" - pass diff --git a/setup.py b/setup.py index 1b052381..8eae1bbf 100644 --- a/setup.py +++ b/setup.py @@ -58,7 +58,7 @@ def get_long_description(): "pytest-asyncio", "cogapp", "mypy>=1.10.0", - "black==24.10.0", + "black>=25.1.0", "ruff", "types-click", "types-PyYAML", From 21df2414436dcf298603d01a2bb9e79af69e69c2 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sat, 1 Feb 2025 22:08:19 -0800 Subject: [PATCH 146/149] llm-claude-3 is now called llm-anthropic Refs https://github.com/simonw/llm-claude-3/issues/31 !stable-docs --- docs/plugins/directory.md | 3 +-- docs/python-api.md | 4 ++-- docs/setup.md | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/plugins/directory.md b/docs/plugins/directory.md index a80b1b0f..e1e10a6f 100644 --- a/docs/plugins/directory.md +++ b/docs/plugins/directory.md @@ -21,8 +21,7 @@ These plugins can be used to interact with remotely hosted models via their API: - **[llm-mistral](https://github.com/simonw/llm-mistral)** adds support for [Mistral AI](https://mistral.ai/)'s language and embedding models. - **[llm-gemini](https://github.com/simonw/llm-gemini)** adds support for Google's [Gemini](https://ai.google.dev/docs) models. -- **[llm-claude](https://github.com/tomviner/llm-claude)** by Tom Viner adds support for Claude 2.1 and Claude Instant 2.1 by Anthropic. -- **[llm-claude-3](https://github.com/simonw/llm-claude-3)** supports Anthropic's [Claude 3 family](https://www.anthropic.com/news/claude-3-family) of models. +- **[llm-anthropic](https://github.com/simonw/llm-anthropic)** supports Anthropic's [Claude 3 family](https://www.anthropic.com/news/claude-3-family), [3.5 Sonnet](https://www.anthropic.com/news/claude-3-5-sonnet) and beyond. - **[llm-command-r](https://github.com/simonw/llm-command-r)** supports Cohere's Command R and [Command R Plus](https://txt.cohere.com/command-r-plus-microsoft-azure/) API models. - **[llm-reka](https://github.com/simonw/llm-reka)** supports the [Reka](https://www.reka.ai/) family of models via their API. - **[llm-perplexity](https://github.com/hex/llm-perplexity)** by Alexandru Geana supports the [Perplexity Labs](https://docs.perplexity.ai/) API models, including `llama-3-sonar-large-32k-online` which can search for things online and `llama-3-70b-instruct`. 
diff --git a/docs/python-api.md b/docs/python-api.md index 49aff02d..6a7bcfa4 100644 --- a/docs/python-api.md +++ b/docs/python-api.md @@ -94,10 +94,10 @@ print(model.prompt("Names for otters", temperature=0.2)) ### Models from plugins -Any models you have installed as plugins will also be available through this mechanism, for example to use Anthropic's Claude 3.5 Sonnet model with [llm-claude-3](https://github.com/simonw/llm-claude-3): +Any models you have installed as plugins will also be available through this mechanism, for example to use Anthropic's Claude 3.5 Sonnet model with [llm-anthropic](https://github.com/simonw/llm-anthropic): ```bash -pip install llm-claude-3 +pip install llm-anthropic ``` Then in your Python code: ```python diff --git a/docs/setup.md b/docs/setup.md index 9218c503..72801ba6 100644 --- a/docs/setup.md +++ b/docs/setup.md @@ -56,7 +56,7 @@ This will install and run LLM using a temporary virtual environment. You can use the `--with` option to add extra plugins. To use Anthropic's models, for example: ```bash export ANTHROPIC_API_KEY='...' -uvx --with llm-claude-3 llm -m claude-3.5-haiku 'fun facts about skunks' +uvx --with llm-anthropic llm -m claude-3.5-haiku 'fun facts about skunks' ``` All of the usual LLM commands will work with `uvx llm`. Here's how to set your OpenAI key without needing an environment variable for example: ```bash From 41d64a8f1239322e12aa11c17450054f0c654ed7 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 2 Feb 2025 12:03:01 -0800 Subject: [PATCH 147/149] llm logs --prompts option (#737) Closes #736 --- docs/help.md | 1 + docs/logging.md | 16 +++++++++++++++ llm/cli.py | 51 ++++++++++++++++++++++++++++++++++++++++++++--- tests/test_llm.py | 25 +++++++++++++++++++++++ 4 files changed, 90 insertions(+), 3 deletions(-) diff --git a/docs/help.md b/docs/help.md index 75e9371b..eafe670d 100644 --- a/docs/help.md +++ b/docs/help.md @@ -302,6 +302,7 @@ Options: -t, --truncate Truncate long strings in output -u, --usage Include token usage -r, --response Just output the last response + --prompts Output prompts, end-truncated if necessary -x, --extract Extract first fenced code block --xl, --extract-last Extract last fenced code block -c, --current Show logs from the current conversation diff --git a/docs/logging.md b/docs/logging.md index 3f0926a5..99a9a3cb 100644 --- a/docs/logging.md +++ b/docs/logging.md @@ -89,6 +89,22 @@ You can truncate the display of the prompts and responses using the `-t/--trunca ```bash llm logs -n 5 -t --json ``` +Or use `--prompts` to see just the truncated prompts: +```bash +llm logs -n 2 --prompts +``` +Example output: +``` +- model: deepseek-reasoner + datetime: 2025-02-02T06:39:53 + conversation: 01jk2pk05xq3d0vgk0202zrsg1 + prompt: H01 There are five huts. H02 The Scotsman lives in the purple hut. H03 The Welshman owns the parrot. H04 Kombucha is... +- model: o3-mini + datetime: 2025-02-02T19:03:05 + conversation: 01jk40qkxetedzpf1zd8k9bgww + system: Formatting re-enabled. Write a detailed README with extensive usage examples. + prompt: ./Cargo.toml [package] name = "py-limbo" version... 
+``` (logs-conversation)= ### Logs for a conversation diff --git a/llm/cli.py b/llm/cli.py index 949b5896..ac675860 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -4,6 +4,7 @@ from dataclasses import asdict import io import json +import re from llm import ( Attachment, AsyncResponse, @@ -874,6 +875,9 @@ def logs_turn_off(): @click.option("-t", "--truncate", is_flag=True, help="Truncate long strings in output") @click.option("-u", "--usage", is_flag=True, help="Include token usage") @click.option("-r", "--response", is_flag=True, help="Just output the last response") +@click.option( + "--prompts", is_flag=True, help="Output prompts, end-truncated if necessary" +) @click.option("-x", "--extract", is_flag=True, help="Extract first fenced code block") @click.option( "extract_last", @@ -910,6 +914,7 @@ def logs_list( truncate, usage, response, + prompts, extract, extract_last, current_conversation, @@ -923,6 +928,18 @@ def logs_list( db = sqlite_utils.Database(path) migrate(db) + if prompts and (json_output or response): + invalid = " or ".join( + [ + flag[0] + for flag in (("--json", json_output), ("--response", response)) + if flag[1] + ] + ) + raise click.ClickException( + "Cannot use --prompts and {} together".format(invalid) + ) + if response and not current_conversation and not conversation_id: current_conversation = True @@ -1035,6 +1052,27 @@ def logs_list( current_system = None should_show_conversation = True for row in rows: + if prompts: + system = _truncate_string(row["system"], 120, end=True) + prompt = _truncate_string(row["prompt"], 120, end=True) + cid = row["conversation_id"] + attachments = attachments_by_id.get(row["id"]) + lines = [ + "- model: {}".format(row["model"]), + " datetime: {}".format(row["datetime_utc"]).split(".")[0], + " conversation: {}".format(cid), + ] + if system: + lines.append(" system: {}".format(system)) + if prompt: + lines.append(" prompt: {}".format(prompt)) + if attachments: + lines.append(" attachments:") + for attachment in attachments: + path = attachment["path"] or attachment["url"] + lines.append(" - {}: {}".format(attachment["type"], path)) + click.echo("\n".join(lines)) + continue click.echo( "# {}{}\n{}".format( row["datetime_utc"].split(".")[0], @@ -1897,10 +1935,17 @@ def template_dir(): return path -def _truncate_string(s, max_length=100): - if len(s) > max_length: +def _truncate_string(s, max_length=100, end=False): + if not s: + return s + if end: + s = re.sub(r"\s+", " ", s) + if len(s) <= max_length: + return s return s[: max_length - 3] + "..." - return s + if len(s) <= max_length: + return s + return s[: max_length - 3] + "..." 
def logs_db_path(): diff --git a/tests/test_llm.py b/tests/test_llm.py index 53ec90dc..7467ea70 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -164,6 +164,31 @@ def test_logs_extract_last_code(args, log_path): assert result.output == 'print("hello word")\n\n' +def test_logs_prompts(log_path): + runner = CliRunner() + result = runner.invoke(cli, ["logs", "--prompts", "-p", str(log_path)]) + assert result.exit_code == 0 + output = datetime_re.sub("YYYY-MM-DDTHH:MM:SS", result.output) + expected = ( + "- model: davinci\n" + " datetime: YYYY-MM-DDTHH:MM:SS\n" + " conversation: abc123\n" + " system: system\n" + " prompt: prompt\n" + "- model: davinci\n" + " datetime: YYYY-MM-DDTHH:MM:SS\n" + " conversation: abc123\n" + " system: system\n" + " prompt: prompt\n" + "- model: davinci\n" + " datetime: YYYY-MM-DDTHH:MM:SS\n" + " conversation: abc123\n" + " system: system\n" + " prompt: prompt\n" + ) + assert output == expected + + @pytest.mark.xfail(sys.platform == "win32", reason="Expected to fail on Windows") @pytest.mark.parametrize("env", ({}, {"LLM_USER_PATH": "/tmp/llm-user-path"})) def test_logs_path(monkeypatch, env, user_path): From f67c21522bb405162a67273da33447dca576e2af Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Tue, 11 Feb 2025 08:35:27 -0800 Subject: [PATCH 148/149] Docs for response.json() and response.usage() !stable-docs --- docs/python-api.md | 86 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 77 insertions(+), 9 deletions(-) diff --git a/docs/python-api.md b/docs/python-api.md index 6a7bcfa4..caec03ad 100644 --- a/docs/python-api.md +++ b/docs/python-api.md @@ -83,6 +83,8 @@ if "image/jpeg" in model.attachment_types: ... ``` +(python-api-model-options)= + ### Model options For models that support options (view those with `llm models --options`) you can pass options as keyword arguments to the `.prompt()` method: @@ -92,6 +94,8 @@ model = llm.get_model() print(model.prompt("Names for otters", temperature=0.2)) ``` +(python-api-models-from-plugins)= + ### Models from plugins Any models you have installed as plugins will also be available through this mechanism, for example to use Anthropic's Claude 3.5 Sonnet model with [llm-anthropic](https://github.com/simonw/llm-anthropic): @@ -111,25 +115,69 @@ print(response.text()) ``` Some models do not use API keys at all. -(python-api-listing-models)= +(python-api-underlying-json)= -### Listing models +### Accessing the underlying JSON -The `llm.get_models()` list returns a list of all available models, including those from plugins. +Most model plugins also make a JSON version of the prompt response available. The structure of this will differ between model plugins, so building against this is likely to result in code that only works with that specific model provider. + +You can access this JSON data as a Python dictionary using the `response.json()` method: ```python import llm +from pprint import pprint -for model in llm.get_models(): - print(model.model_id) +model = llm.get_model("gpt-4o-mini") +response = model.prompt("3 names for an otter") +json_data = response.json() +pprint(json_data) +``` +Here's that example output from GPT-4o mini: +```python +{'content': 'Sure! Here are three fun names for an otter:\n' + '\n' + '1. **Splash**\n' + '2. **Bubbles**\n' + '3. 
**Otto** \n' + '\n' + 'Feel free to mix and match or use these as inspiration!', + 'created': 1739291215, + 'finish_reason': 'stop', + 'id': 'chatcmpl-AznO31yxgBjZ4zrzBOwJvHEWgdTaf', + 'model': 'gpt-4o-mini-2024-07-18', + 'object': 'chat.completion.chunk', + 'usage': {'completion_tokens': 43, + 'completion_tokens_details': {'accepted_prediction_tokens': 0, + 'audio_tokens': 0, + 'reasoning_tokens': 0, + 'rejected_prediction_tokens': 0}, + 'prompt_tokens': 13, + 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}, + 'total_tokens': 56}} ``` -Use `llm.get_async_models()` to list async models: +(python-api-token-usage)= + +### Token usage + +Many models can return a count of the number of tokens used while executing the prompt. + +The `response.usage()` method provides an abstraction over this: ```python -for model in llm.get_async_models(): - print(model.model_id) +pprint(response.usage()) +``` +Example output: +```python +Usage(input=5, + output=2, + details={'candidatesTokensDetails': [{'modality': 'TEXT', + 'tokenCount': 2}], + 'promptTokensDetails': [{'modality': 'TEXT', 'tokenCount': 5}]}) ``` +The `.input` and `.output` properties are integers representing the number of input and output tokens. The `.details` property may be a dictionary with additional custom values that vary by model. + +(python-api-streaming-responses)= ### Streaming responses @@ -144,6 +192,26 @@ The `response.text()` method described earlier does this for you - it runs throu If a response has been evaluated, `response.text()` will continue to return the same string. +(python-api-listing-models)= + +## Listing models + +The `llm.get_models()` list returns a list of all available models, including those from plugins. + +```python +import llm + +for model in llm.get_models(): + print(model.model_id) +``` + +Use `llm.get_async_models()` to list async models: + +```python +for model in llm.get_async_models(): + print(model.model_id) +``` + (python-api-async)= ## Async models @@ -311,4 +379,4 @@ Here the `default=` parameter specifies the value that should be returned if the ### set_default_embedding_model(alias) and get_default_embedding_model() -These two methods work the same as `set_default_model()` and `get_default_model()` but for the default {ref}`embedding model ` instead. \ No newline at end of file +These two methods work the same as `set_default_model()` and `get_default_model()` but for the default {ref}`embedding model ` instead. 
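
As a quick sketch of how the two methods documented above might be combined in practice — this is illustrative only and not part of the patch; it assumes an OpenAI-style plugin such as `gpt-4o-mini` is installed, and since `response.json()` structure varies by provider it guards lookups with `.get()` rather than assuming OpenAI's key names:

```python
import llm

# Illustrative sketch, assuming the gpt-4o-mini plugin is installed.
model = llm.get_model("gpt-4o-mini")
response = model.prompt("3 names for an otter")

usage = response.usage()
# .input and .output are documented as integers; .details varies by provider.
total_tokens = (usage.input or 0) + (usage.output or 0)
print(f"{usage.input} input + {usage.output} output = {total_tokens} tokens")

# The raw JSON is provider-specific, so avoid hard-coding its key names.
raw = response.json() or {}
print("finish_reason:", raw.get("finish_reason", "unknown"))
```
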
From 9a1374b447314a6cf316d0af9d4ec8bc3cf135e2 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Wed, 12 Feb 2025 15:19:18 -0800 Subject: [PATCH 149/149] llm embed-multi --prepend option (#746) * llm embed-multi --prepend option Closes #745 --- docs/embeddings/cli.md | 3 ++ docs/help.md | 2 ++ llm/cli.py | 15 ++++++++-- tests/test_embed_cli.py | 66 +++++++++++++++++++++++++++-------------- 4 files changed, 61 insertions(+), 25 deletions(-) diff --git a/docs/embeddings/cli.md b/docs/embeddings/cli.md index 4c958175..de723ca4 100644 --- a/docs/embeddings/cli.md +++ b/docs/embeddings/cli.md @@ -148,8 +148,11 @@ All three mechanisms support these options: - `-d database.db` to specify a different database file to store the embeddings in - `--store` to store the original content in the embeddings table in addition to the embedding vector - `--prefix` to prepend a prefix to the stored ID of each item +- `--prepend` to prepend a string to the content before embedding - `--batch-size SIZE` to process embeddings in batches of the specified size +The `--prepend` option is useful for embedding models that require you to prepend a special token to the content before embedding it. [nomic-embed-text-v2-moe](https://huggingface.co/nomic-ai/nomic-embed-text-v2-moe) for example requires documents to be prepended `'search_document: '` and search queries to be prepended `'search_query: '`. + (embeddings-cli-embed-multi-csv-etc)= ### Embedding data from a CSV, TSV or JSON file diff --git a/docs/help.md b/docs/help.md index eafe670d..690a109a 100644 --- a/docs/help.md +++ b/docs/help.md @@ -573,6 +573,8 @@ Options: --batch-size INTEGER Batch size to use when running embeddings --prefix TEXT Prefix to add to the IDs -m, --model TEXT Embedding model to use + --prepend TEXT Prepend this string to all content before + embedding --store Store the text itself in the database -d, --database FILE --help Show this message and exit. 
diff --git a/llm/cli.py b/llm/cli.py index ac675860..f5c760d9 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -1574,6 +1574,10 @@ def get_db(): ) @click.option("--prefix", help="Prefix to add to the IDs", default="") @click.option("-m", "--model", help="Embedding model to use") +@click.option( + "--prepend", + help="Prepend this string to all content before embedding", +) @click.option("--store", is_flag=True, help="Store the text itself in the database") @click.option( "-d", @@ -1593,6 +1597,7 @@ def embed_multi( batch_size, prefix, model, + prepend, store, database, ): @@ -1715,11 +1720,15 @@ def load_rows(fp): def tuples() -> Iterable[Tuple[str, Union[bytes, str]]]: for row in rows: values = list(row.values()) - id = prefix + str(values[0]) + id: str = prefix + str(values[0]) + content: Optional[Union[bytes, str]] = None if binary: - yield id, cast(bytes, values[1]) + content = cast(bytes, values[1]) else: - yield id, " ".join(v or "" for v in values[1:]) + content = " ".join(v or "" for v in values[1:]) + if prepend and isinstance(content, str): + content = prepend + content + yield id, content or "" embed_kwargs = {"store": store} if batch_size: diff --git a/tests/test_embed_cli.py b/tests/test_embed_cli.py index 57462316..ee1d3307 100644 --- a/tests/test_embed_cli.py +++ b/tests/test_embed_cli.py @@ -250,6 +250,7 @@ def test_similar_by_content_cli(tmpdir, user_path_with_embeddings, scenario): @pytest.mark.parametrize("use_stdin", (False, True)) @pytest.mark.parametrize("prefix", (None, "prefix")) +@pytest.mark.parametrize("prepend", (None, "search_document: ")) @pytest.mark.parametrize( "filename,content", ( @@ -265,7 +266,7 @@ def test_similar_by_content_cli(tmpdir, user_path_with_embeddings, scenario): ), ), ) -def test_embed_multi_file_input(tmpdir, use_stdin, prefix, filename, content): +def test_embed_multi_file_input(tmpdir, use_stdin, prefix, prepend, filename, content): db_path = tmpdir / "embeddings.db" args = ["embed-multi", "phrases", "-d", str(db_path), "-m", "embed-demo"] input = None @@ -278,6 +279,8 @@ def test_embed_multi_file_input(tmpdir, use_stdin, prefix, filename, content): args.append(str(path)) if prefix: args.extend(("--prefix", prefix)) + if prepend: + args.extend(("--prepend", prepend)) # Auto-detection can't detect JSON-nl, so make that explicit if filename.endswith(".jsonl"): args.extend(("--format", "nl")) @@ -325,7 +328,8 @@ def test_embed_multi_files_binary_store(tmpdir): @pytest.mark.parametrize("use_other_db", (True, False)) @pytest.mark.parametrize("prefix", (None, "prefix")) -def test_embed_multi_sql(tmpdir, use_other_db, prefix): +@pytest.mark.parametrize("prepend", (None, "search_document: ")) +def test_embed_multi_sql(tmpdir, use_other_db, prefix, prepend): db_path = str(tmpdir / "embeddings.db") db = sqlite_utils.Database(db_path) extra_args = [] @@ -336,6 +340,8 @@ def test_embed_multi_sql(tmpdir, use_other_db, prefix): if prefix: extra_args.extend(("--prefix", prefix)) + if prepend: + extra_args.extend(("--prepend", prepend)) db["content"].insert_all( [ @@ -365,8 +371,14 @@ def test_embed_multi_sql(tmpdir, use_other_db, prefix): assert embeddings_db["embeddings"].count == 2 rows = list(embeddings_db.query("select id, content from embeddings order by id")) assert rows == [ - {"id": (prefix or "") + "1", "content": "cli Command line interface"}, - {"id": (prefix or "") + "2", "content": "sql Structured query language"}, + { + "id": (prefix or "") + "1", + "content": (prepend or "") + "cli Command line interface", + }, + { + "id": (prefix or "") + 
"2", + "content": (prepend or "") + "sql Structured query language", + }, ] @@ -425,7 +437,8 @@ def multi_files(tmpdir): @pytest.mark.xfail(sys.platform == "win32", reason="Expected to fail on Windows") @pytest.mark.parametrize("scenario", ("single", "multi")) -def test_embed_multi_files(multi_files, scenario): +@pytest.mark.parametrize("prepend", (None, "search_document: ")) +def test_embed_multi_files(multi_files, scenario, prepend): db_path, files = multi_files for filename, content in ( ("file1.txt", b"hello world"), @@ -440,17 +453,23 @@ def test_embed_multi_files(multi_files, scenario): path.parent.mkdir(parents=True, exist_ok=True) path.write_bytes(content) + extra_args = [] + + if prepend: + extra_args.extend(("--prepend", prepend)) if scenario == "single": - extra_args = ["--files", str(files), "**/*.txt"] + extra_args.extend(["--files", str(files), "**/*.txt"]) else: - extra_args = [ - "--files", - str(files / "nested" / "more"), - "**/*.ini", - "--files", - str(files / "nested"), - "*.txt", - ] + extra_args.extend( + [ + "--files", + str(files / "nested" / "more"), + "**/*.ini", + "--files", + str(files / "nested"), + "*.txt", + ] + ) runner = CliRunner() result = runner.invoke( @@ -471,17 +490,20 @@ def test_embed_multi_files(multi_files, scenario): rows = list(embeddings_db.query("select id, content from embeddings order by id")) if scenario == "single": assert rows == [ - {"id": "file1.txt", "content": "hello world"}, - {"id": "file2.txt", "content": "goodbye world"}, - {"id": "nested/more/three.txt", "content": "three"}, - {"id": "nested/one.txt", "content": "one"}, - {"id": "nested/two.txt", "content": "two"}, + {"id": "file1.txt", "content": (prepend or "") + "hello world"}, + {"id": "file2.txt", "content": (prepend or "") + "goodbye world"}, + {"id": "nested/more/three.txt", "content": (prepend or "") + "three"}, + {"id": "nested/one.txt", "content": (prepend or "") + "one"}, + {"id": "nested/two.txt", "content": (prepend or "") + "two"}, ] else: assert rows == [ - {"id": "ignored.ini", "content": "Has weird \x96 character"}, - {"id": "one.txt", "content": "one"}, - {"id": "two.txt", "content": "two"}, + { + "id": "ignored.ini", + "content": (prepend or "") + "Has weird \x96 character", + }, + {"id": "one.txt", "content": (prepend or "") + "one"}, + {"id": "two.txt", "content": (prepend or "") + "two"}, ]