Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Support for new OpenAI model GPT-4o-mini + improve prompts #200

Merged
merged 2 commits into from
Jul 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/helpers/aiSdkUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ export enum SupportedModels {
Gpt4VisionPreview = "gpt-4-vision-preview",
Gpt4Turbo = "gpt-4-turbo",
Gpt4O = "gpt-4o",
Gpt4OMini = "gpt-4o-mini",
Claude3Sonnet = "claude-3-sonnet-20240229",
Claude3Opus = "claude-3-opus-20240229",
Claude35Sonnet = "claude-3-5-sonnet-20240620",
Expand All @@ -28,6 +29,7 @@ export const DisplayName = {
[SupportedModels.Gpt4VisionPreview]: "GPT-4 Vision (Preview)",
[SupportedModels.Gpt4Turbo]: "GPT-4 Turbo",
[SupportedModels.Gpt4O]: "GPT-4o",
[SupportedModels.Gpt4OMini]: "GPT-4o Mini",
[SupportedModels.Claude3Sonnet]: "Claude 3 Sonnet",
[SupportedModels.Claude3Opus]: "Claude 3 Opus",
[SupportedModels.Claude35Sonnet]: "Claude 3.5 Sonnet",
Expand All @@ -38,6 +40,7 @@ export function hasVisionSupport(model: SupportedModels) {
model === SupportedModels.Gpt4VisionPreview ||
model === SupportedModels.Gpt4Turbo ||
model === SupportedModels.Gpt4O ||
model === SupportedModels.Gpt4OMini ||
model === SupportedModels.Claude3Sonnet ||
model === SupportedModels.Claude3Opus ||
model === SupportedModels.Claude35Sonnet
Expand Down
12 changes: 9 additions & 3 deletions src/helpers/dom-agent/determineNextAction.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,15 @@ Example 1:
}

Example 2:
{
thought: "I am typing 'fish food' into the search bar",
action: "setValue(123, 'fish food')"
}

Example 3:
{
thought: "I continue to scroll down to find the section",
action: "scroll("down")"
action: "scroll('down')"
}

Your response must always be in JSON format and must include "thought" and "action".
Expand Down Expand Up @@ -137,9 +143,9 @@ ${pageContents}`;
// make action compatible with vision agent
// TODO: refactor dom agent so we don't need this
function visionActionAdapter(action: ParsedResponseSuccess): Action {
const args = { ...action.parsedAction.args, label: "" };
const args = { ...action.parsedAction.args, uid: "" };
if ("elementId" in args) {
args.label = args.elementId;
args.uid = args.elementId;
}
return {
thought: action.thought,
Expand Down
21 changes: 6 additions & 15 deletions src/helpers/rpc/performAction.ts
Original file line number Diff line number Diff line change
Expand Up @@ -156,40 +156,31 @@ function createOperateTool(
window.open(action.args.url, "_blank");
break;
case "click": {
const success = await click(domActions, action.args.label);
const success = await click(domActions, action.args.uid);
if (!success) {
console.error(
"Unable to find element with label: ",
action.args.label,
);
console.error("Unable to find element with uid: ", action.args.uid);
}
break;
}
case "setValue": {
const success = await setValue(
domActions,
action.args.label,
action.args.uid,
action.args.value || "",
);
if (!success) {
console.error(
"Unable to find element with label: ",
action.args.label,
);
console.error("Unable to find element with uid: ", action.args.uid);
}
break;
}
case "setValueAndEnter": {
const success = await setValue(
domActions,
action.args.label,
action.args.uid,
(action.args.value || "") + "\n",
);
if (!success) {
console.error(
"Unable to find element with label: ",
action.args.label,
);
console.error("Unable to find element with uid: ", action.args.uid);
}
break;
}
Expand Down
4 changes: 2 additions & 2 deletions src/helpers/vision-agent/determineNextAction.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ You can use the following tools:

${allToolsDescriptions}

You will be given a task to perform, and an image. The image will contain two parts: on the left is a clean screenshot of the current page, and on the right is the same screenshot with interactive elements annotated with corresponding label.
You will be given a task to perform, and an image. The image will contain two parts: on the left is a clean screenshot of the current page, and on the right is the same screenshot with interactive elements annotated with corresponding uid.
You will also be given previous actions that you have taken. If something does not work, try find an alternative solution. For example, instead of searching for a specific item that the user requested, perform a general search and apply filters, or simply browse the results page.
You will also be given additional information of annotations.

Expand All @@ -30,7 +30,7 @@ This is one example of expected response from you:
"action": {
"name": "click",
"args": {
"label": "123"
"uid": "123"
}
}
}
Expand Down
10 changes: 5 additions & 5 deletions src/helpers/vision-agent/tools.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,22 @@ import { z } from "zod";
export const clickSchema = z.object({
name: z.literal("click"),
description: z
.literal("Click on an element with the label on the annotation.")
.literal("Click on an element with the uid on the annotation.")
.optional(),
args: z.object({
label: z.string(),
uid: z.string(),
}),
});

export const setValueSchema = z.object({
name: z.literal("setValue"),
description: z
.literal(
"Focus on and set the value of an input element with the label on the annotation.",
"Focus on and set the value of an input element with the uid on the annotation.",
)
.optional(),
args: z.object({
label: z.string(),
uid: z.string(),
value: z.string(),
}),
});
Expand All @@ -31,7 +31,7 @@ export const setValueAndEnterSchema = z.object({
)
.optional(),
args: z.object({
label: z.string(),
uid: z.string(),
value: z.string(),
}),
});
Expand Down
8 changes: 4 additions & 4 deletions src/pages/content/drawLabels.ts
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ const baseZIndex = 10000;

type LabelDataWithElement = {
element: Element;
label: string;
uid: string;
name: string;
tagName: string;
role?: string;
Expand All @@ -229,7 +229,7 @@ function getLabelData(
const uidString = uid.toString();

const item: LabelDataWithElement = {
label: uidString,
uid: uidString,
name,
tagName: elem.tagName,
element: elem,
Expand Down Expand Up @@ -310,8 +310,8 @@ export function addLabelsToDom(data: LabelDataWithElement[]) {
const wrapper = document.createElement("div");
wrapper.classList.add("_label_overlay_wrapper");
wrapper.popover = "manual";
data.forEach(({ element, label }, index) => {
drawLabel(wrapper, element, label, baseZIndex + data.length - index);
data.forEach(({ element, uid }, index) => {
drawLabel(wrapper, element, uid, baseZIndex + data.length - index);
});
// set wrapper's width and height to match body
wrapper.style.width = `${document.documentElement.scrollWidth}px`;
Expand Down
Loading