Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance prompt with more and better elements information #105

Merged
merged 3 commits into from
Apr 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
"@emotion/react": "^11.11.1",
"@emotion/styled": "^11.11.0",
"@types/dom-speech-recognition": "^0.0.4",
"accname": "^1.1.0",
"construct-style-sheets-polyfill": "3.1.0",
"formik": "^2.4.5",
"immer": "^10.0.3",
Expand Down
7 changes: 7 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion src/helpers/rpc/performAction.ts
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ function createOperateTool(
}
break;
}
case "setValueAndSubmit": {
case "setValueAndEnter": {
const success = await setValue(
domActions,
action.args.label,
Expand Down
38 changes: 28 additions & 10 deletions src/helpers/vision-agent/determineNextAction.ts
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,13 @@ export function formatPrompt(
labelData: LabelData[],
viewportPercentage: number,
) {
let previousActionsString = "";
// 1. task instructions
let result = `The user requests the following task:
${taskInstructions}`;

// 2. previous actions
let previousActionsString = "";
if (previousActions.length > 0) {
const serializedActions = previousActions
.map(
Expand All @@ -161,38 +166,51 @@ export function formatPrompt(
.join("\n\n");
previousActionsString = `You have already taken the following actions: \n${serializedActions}\n\n`;
}
result += `\n${previousActionsString}\n`;

// 3. current time + current URL + current page scrolling position
let urlString = url.href;
// do not include search if it's too long
if (url.search.length > 100) {
urlString = url.origin + url.pathname;
}

let result = `The user requests the following task:
${taskInstructions}
${previousActionsString}
result += `
Current time: ${new Date().toLocaleString()}
Current URL: ${urlString}
Current page scrolling position: ${viewportPercentage.toFixed(1)}%
`;

// 4. knowledge
if (knowledge.notes != null && knowledge.notes?.length > 0) {
result += `
Notes regarding the current website:
${knowledge.notes.map((k) => ` - ${k}`).join("\n")}`;
}

// 5. label data from HTML
result += `
Use the following data as a reference of the annotated elements (using \`===\` as a delimiter between each annotation):
${labelData.map((item) => tomlLikeStringifyObject(item)).join("\n===\n")}`;
${labelData.map((item) => tomlLikeStringifyObject(item)).join("\n===\n")}
`;
// 6. active element
const currentActiveItem = labelData.find((item) => item.active);
if (currentActiveItem != null) {
result += `
This ${currentActiveItem.tagName.toLocaleLowerCase()} currently has focus:
${tomlLikeStringifyObject(currentActiveItem)}
`;
}
return result;
}

/**
 * Serializes an object into a TOML-like block of `key = value` lines.
 *
 * Only properties whose value is a string are emitted; every other value
 * (numbers, booleans, objects, `null`, `undefined`) is skipped. Values are
 * written verbatim — they are not quoted or escaped.
 *
 * @param obj - The object to serialize.
 * @returns One `key = value` line per string-valued property, joined by
 *   newlines, or an empty string when there is no such property.
 */
function tomlLikeStringifyObject(obj: Record<string, unknown>): string {
  const lines: string[] = [];
  for (const [key, value] of Object.entries(obj)) {
    // only include string values
    if (typeof value === "string") {
      lines.push(`${key} = ${value}`);
    }
  }
  return lines.join("\n");
}
9 changes: 5 additions & 4 deletions src/helpers/vision-agent/tools.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,12 @@ export const setValueSchema = z.object({
value: z.string(),
}),
});
export const setValueAndSubmitSchema = z.object({
name: z.literal("setValueAndSubmit"),

export const setValueAndEnterSchema = z.object({
name: z.literal("setValueAndEnter"),
description: z
.literal(
'Like "setValue", except then it presses ENTER to submit the form.',
'Like "setValue", except then it presses ENTER. Use this tool can submit the form when there\'s no "submit" button.',
)
.optional(),
args: z.object({
Expand Down Expand Up @@ -86,7 +87,7 @@ export const failSchema = z.object({
export const toolSchemaUnion = z.discriminatedUnion("name", [
clickSchema,
setValueSchema,
setValueAndSubmitSchema,
setValueAndEnterSchema,
navigateSchema,
scrollSchema,
waitSchema,
Expand Down
56 changes: 15 additions & 41 deletions src/pages/content/drawLabels.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { getAccessibleName } from "accname";
import {
VISIBLE_TEXT_ATTRIBUTE_NAME,
ARIA_LABEL_ATTRIBUTE_NAME,
Expand Down Expand Up @@ -45,6 +46,9 @@ function isElementNode(node: Node): node is Element {
/** Type guard: `node` is an element whose tag name is exactly "INPUT". */
function isInputElement(node: Node): node is HTMLInputElement {
  if (!isElementNode(node)) {
    return false;
  }
  return node.tagName === "INPUT";
}
/** Type guard: `node` is an element whose tag name is exactly "TEXTAREA". */
function isTextAreaElement(node: Node): node is HTMLTextAreaElement {
  if (!isElementNode(node)) {
    return false;
  }
  return node.tagName === "TEXTAREA";
}

function isTopElement(elem: Element, rect: DOMRect) {
let topEl = document.elementFromPoint(
Expand Down Expand Up @@ -117,37 +121,6 @@ function isTouchedElement(elem: Element) {
);
}

/**
 * Resolves a label for `elem` from its ARIA labelling attributes.
 *
 * `aria-labelledby` has higher priority than `aria-label`:
 * https://developer.mozilla.org/en-US/docs/Web/Accessibility/ARIA/Attributes/aria-labelledby
 *
 * @param elem - The element whose label should be resolved.
 * @returns The space-joined text of the elements referenced by
 *   `aria-labelledby` when that text is non-empty; otherwise the value of the
 *   `aria-label` attribute, or an empty string when it is absent.
 */
function getAriaLabel(elem: Element): string {
  const labelledBy = elem.getAttribute("aria-labelledby");
  if (labelledBy !== null) {
    // The attribute is a whitespace-separated ID list: split on any run of
    // whitespace and drop empty tokens so tabs, newlines or repeated spaces
    // never yield a bogus "" id. The Set dedupes repeated ids.
    const ids = new Set<string>(
      labelledBy.split(/\s+/).filter((id) => id.length > 0),
    );

    const parts: string[] = [];
    Array.from(ids).forEach((id) => {
      const labelElem = document.getElementById(id);
      if (labelElem === null) {
        // unresolved reference: contribute nothing (avoids stray spaces in the join)
        return;
      }
      if (isInputElement(labelElem)) {
        // for input elements, use the value as the label
        parts.push(labelElem.value);
        return;
      }
      // doesn't matter if the text is visible or not
      parts.push(labelElem.textContent ?? "");
    });

    const label = parts.join(" ").trim();
    if (label.length > 0) {
      return label;
    }
  }
  return elem.getAttribute("aria-label") ?? "";
}

// find the visible text and best-match aria-label of the element
// note that this function has a side effect of writing the attributes in the DOM
function traverseDom(node: Node, selector: string): DomAttrs {
Expand All @@ -163,7 +136,7 @@ function traverseDom(node: Node, selector: string): DomAttrs {
}

let visibleText = "";
let ariaLabel = getAriaLabel(node);
let ariaLabel = getAccessibleName(node);

// skip children of SVGs because they have their own visibility rules
if (node.tagName.toLocaleLowerCase() !== "svg") {
Expand Down Expand Up @@ -231,6 +204,10 @@ type LabelDataWithElement = {
name: string;
tagName: string;
role?: string;
// for input elements
currentValue?: string;
placeholder?: string;
active?: boolean;
};

export type LabelData = Omit<LabelDataWithElement, "element"> & {
Expand All @@ -255,6 +232,11 @@ function getLabelData(
tagName: elem.tagName,
element: elem,
};
if (isInputElement(elem) || isTextAreaElement(elem)) {
item.currentValue = elem.value;
item.placeholder = elem.placeholder;
item.active = document.activeElement === elem;
}
if (elem.hasAttribute("role")) {
item.role = elem.getAttribute("role") ?? "unknown";
}
Expand All @@ -281,7 +263,7 @@ function getLabelData(
}
// fallback to use aria-label
if (label.length === 0) {
label = getAriaLabel(elem);
label = getAccessibleName(elem);
}
// fallback to use text content
if (label.length === 0) {
Expand Down Expand Up @@ -309,14 +291,6 @@ function getLabelData(
if (elem.getAttribute("aria-hidden") === "true") return;
// if the element is not visible, skip it
if (!isVisible(elem, true, true)) return;
// if the element is an input, hopefully the value or the placeholder is visible
if (isInputElement(elem)) {
const visibleTextOnInput = removeEmojis(
elem.value || elem.placeholder || "",
);
addLabel(visibleTextOnInput, elem);
return;
}

const { visibleText, ariaLabel } = traverseDom(elem, selector);
// use aria-label as name, otherwise use visible text
Expand Down
Loading