diff --git a/src/helpers/aiSdkUtils.ts b/src/helpers/aiSdkUtils.ts index 7d32e23..2cfed85 100644 --- a/src/helpers/aiSdkUtils.ts +++ b/src/helpers/aiSdkUtils.ts @@ -10,6 +10,7 @@ export enum SupportedModels { Gpt4VisionPreview = "gpt-4-vision-preview", Gpt4Turbo = "gpt-4-turbo", Gpt4O = "gpt-4o", + Gpt4OMini = "gpt-4o-mini", Claude3Sonnet = "claude-3-sonnet-20240229", Claude3Opus = "claude-3-opus-20240229", Claude35Sonnet = "claude-3-5-sonnet-20240620", @@ -28,6 +29,7 @@ export const DisplayName = { [SupportedModels.Gpt4VisionPreview]: "GPT-4 Vision (Preview)", [SupportedModels.Gpt4Turbo]: "GPT-4 Turbo", [SupportedModels.Gpt4O]: "GPT-4o", + [SupportedModels.Gpt4OMini]: "GPT-4o Mini", [SupportedModels.Claude3Sonnet]: "Claude 3 Sonnet", [SupportedModels.Claude3Opus]: "Claude 3 Opus", [SupportedModels.Claude35Sonnet]: "Claude 3.5 Sonnet", @@ -38,6 +40,7 @@ export function hasVisionSupport(model: SupportedModels) { model === SupportedModels.Gpt4VisionPreview || model === SupportedModels.Gpt4Turbo || model === SupportedModels.Gpt4O || + model === SupportedModels.Gpt4OMini || model === SupportedModels.Claude3Sonnet || model === SupportedModels.Claude3Opus || model === SupportedModels.Claude35Sonnet diff --git a/src/helpers/dom-agent/determineNextAction.ts b/src/helpers/dom-agent/determineNextAction.ts index cff8a04..3e11716 100644 --- a/src/helpers/dom-agent/determineNextAction.ts +++ b/src/helpers/dom-agent/determineNextAction.ts @@ -35,9 +35,15 @@ Example 1: } Example 2: +{ + thought: "I am typing 'fish food' into the search bar", + action: "setValue(123, 'fish food')" +} + +Example 3: { thought: "I continue to scroll down to find the section", - action: "scroll("down")" + action: "scroll('down')" } Your response must always be in JSON format and must include "thought" and "action". @@ -137,9 +143,9 @@ ${pageContents}`; // make action compatible with vision agent // TODO: refactor dom agent so we don't need this function visionActionAdapter(action: ParsedResponseSuccess): Action { - const args = { ...action.parsedAction.args, label: "" }; + const args = { ...action.parsedAction.args, uid: "" }; if ("elementId" in args) { - args.label = args.elementId; + args.uid = args.elementId; } return { thought: action.thought, diff --git a/src/helpers/rpc/performAction.ts b/src/helpers/rpc/performAction.ts index 26437f6..a999323 100644 --- a/src/helpers/rpc/performAction.ts +++ b/src/helpers/rpc/performAction.ts @@ -156,40 +156,31 @@ function createOperateTool( window.open(action.args.url, "_blank"); break; case "click": { - const success = await click(domActions, action.args.label); + const success = await click(domActions, action.args.uid); if (!success) { - console.error( - "Unable to find element with label: ", - action.args.label, - ); + console.error("Unable to find element with uid: ", action.args.uid); } break; } case "setValue": { const success = await setValue( domActions, - action.args.label, + action.args.uid, action.args.value || "", ); if (!success) { - console.error( - "Unable to find element with label: ", - action.args.label, - ); + console.error("Unable to find element with uid: ", action.args.uid); } break; } case "setValueAndEnter": { const success = await setValue( domActions, - action.args.label, + action.args.uid, (action.args.value || "") + "\n", ); if (!success) { - console.error( - "Unable to find element with label: ", - action.args.label, - ); + console.error("Unable to find element with uid: ", action.args.uid); } break; } diff --git a/src/helpers/vision-agent/determineNextAction.ts b/src/helpers/vision-agent/determineNextAction.ts index cbdf547..497d05b 100644 --- a/src/helpers/vision-agent/determineNextAction.ts +++ b/src/helpers/vision-agent/determineNextAction.ts @@ -14,7 +14,7 @@ You can use the following tools: ${allToolsDescriptions} -You will be given a task to perform, and an image. The image will contain two parts: on the left is a clean screenshot of the current page, and on the right is the same screenshot with interactive elements annotated with corresponding label. +You will be given a task to perform, and an image. The image will contain two parts: on the left is a clean screenshot of the current page, and on the right is the same screenshot with interactive elements annotated with corresponding uid. You will also be given previous actions that you have taken. If something does not work, try find an alternative solution. For example, instead of searching for a specific item that the user requested, perform a general search and apply filters, or simply browse the results page. You will also be given additional information of annotations. @@ -30,7 +30,7 @@ This is one example of expected response from you: "action": { "name": "click", "args": { - "label": "123" + "uid": "123" } } } diff --git a/src/helpers/vision-agent/tools.ts b/src/helpers/vision-agent/tools.ts index 569afab..4d5306b 100644 --- a/src/helpers/vision-agent/tools.ts +++ b/src/helpers/vision-agent/tools.ts @@ -3,10 +3,10 @@ import { z } from "zod"; export const clickSchema = z.object({ name: z.literal("click"), description: z - .literal("Click on an element with the label on the annotation.") + .literal("Click on an element with the uid on the annotation.") .optional(), args: z.object({ - label: z.string(), + uid: z.string(), }), }); @@ -14,11 +14,11 @@ export const setValueSchema = z.object({ name: z.literal("setValue"), description: z .literal( - "Focus on and set the value of an input element with the label on the annotation.", + "Focus on and set the value of an input element with the uid on the annotation.", ) .optional(), args: z.object({ - label: z.string(), + uid: z.string(), value: z.string(), }), }); @@ -31,7 +31,7 @@ export const setValueAndEnterSchema = z.object({ ) .optional(), args: z.object({ - label: z.string(), + uid: z.string(), value: z.string(), }), }); diff --git a/src/pages/content/drawLabels.ts b/src/pages/content/drawLabels.ts index d6dc2d9..438c90f 100644 --- a/src/pages/content/drawLabels.ts +++ b/src/pages/content/drawLabels.ts @@ -202,7 +202,7 @@ const baseZIndex = 10000; type LabelDataWithElement = { element: Element; - label: string; + uid: string; name: string; tagName: string; role?: string; @@ -229,7 +229,7 @@ function getLabelData( const uidString = uid.toString(); const item: LabelDataWithElement = { - label: uidString, + uid: uidString, name, tagName: elem.tagName, element: elem, @@ -310,8 +310,8 @@ export function addLabelsToDom(data: LabelDataWithElement[]) { const wrapper = document.createElement("div"); wrapper.classList.add("_label_overlay_wrapper"); wrapper.popover = "manual"; - data.forEach(({ element, label }, index) => { - drawLabel(wrapper, element, label, baseZIndex + data.length - index); + data.forEach(({ element, uid }, index) => { + drawLabel(wrapper, element, uid, baseZIndex + data.length - index); }); // set wrapper's width and height to match body wrapper.style.width = `${document.documentElement.scrollWidth}px`;