Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Agent performance improvements #46

Merged
merged 9 commits into from
Mar 13, 2024
11 changes: 6 additions & 5 deletions src/helpers/availableActions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
export const availableActionsVision = [
{
name: "click",
description:
"Clicks on an element with the text label appears on or associated with it.",
description: "Click on an element with the label on the annotation.",
args: [
{
name: "label",
Expand All @@ -15,7 +14,7 @@ export const availableActionsVision = [
{
name: "setValue",
description:
"Focuses on and sets the value of an input element. Label can be the text label appears on or associated with it, or the value in it",
"Focus on and set the value of an input element with the label on the annotation.",
args: [
{
name: "label",
Expand All @@ -29,7 +28,8 @@ export const availableActionsVision = [
},
{
name: "scroll",
description: 'Scroll the page up or down. Value can be "up" or "down"',
description:
'Scroll the page to see the other parts. Use "up" or "down" to scroll half the height of the window. Use "top" or "bottom" to quickly scroll to the top or bottom of the page.',
args: [
{
name: "value",
Expand Down Expand Up @@ -76,7 +76,8 @@ export const availableActions = [
},
{
name: "scroll",
description: 'Scroll the page up or down. Value can be "up" or "down"',
description:
'Scroll the page to see the other parts. Use "up" or "down" to scroll half the height of the window. Use "top" or "bottom" to scroll to the top or bottom of the page.',
args: [
{
name: "value",
Expand Down
2 changes: 1 addition & 1 deletion src/helpers/browserUtils.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
export async function findActiveTab() {
export async function findActiveTab(): Promise<chrome.tabs.Tab | null> {
const currentWindow = await chrome.windows.getCurrent();
if (!currentWindow || !currentWindow.id) {
throw new Error("Could not find window");
Expand Down
25 changes: 25 additions & 0 deletions src/helpers/buildAnnotatedScreenshots.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import { sleep } from "./utils";
import { callRPCWithTab } from "./rpc/pageRPC";
import mergeImages from "@src/shared/images/mergeScreenshots";
import { type LabelData } from "../pages/content/drawLabels";

export default async function buildAnnotatedScreenshots(
tabId: number,
): Promise<[string, LabelData[]]> {
const imgDataRaw = await chrome.tabs.captureVisibleTab({
format: "png",
});
const labelData = await callRPCWithTab(tabId, "drawLabels", []);
await sleep(300);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[not blocker] could define it as a constant rather than using a magic number

const imgDataAnnotated = await chrome.tabs.captureVisibleTab({
format: "png",
});
const imgData = await mergeImages([
{ src: imgDataRaw, caption: "Clean Screenshot" },
{ src: imgDataAnnotated, caption: "Annotated Screenshot" },
]);
await sleep(300);
await callRPCWithTab(tabId, "removeLabels", []);

return [imgData, labelData];
}
45 changes: 27 additions & 18 deletions src/helpers/chromeDebugger.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,40 +6,49 @@
// return targets.some((target) => target.tabId === tabId && target.attached);
// }

// maintain a set of attached tabs
const attachedTabs = new Set<number>();

export async function attachDebugger(tabId: number) {
console.log('start attachDebugger');
console.log("start attachDebugger");
// const isAttached = await isDebuggerAttached(tabId);
// if (isAttached) {
// console.log('already attached to debugger', tabId);
// return;
// }
const isAttached = attachedTabs.has(tabId);
if (isAttached) {
console.log("already attached to debugger", tabId);
return;
}
return new Promise<void>((resolve, reject) => {
return chrome.debugger.attach({ tabId }, '1.3', async () => {
return chrome.debugger.attach({ tabId }, "1.3", async () => {
if (chrome.runtime.lastError) {
console.error(
'Failed to attach debugger:',
chrome.runtime.lastError.message
"Failed to attach debugger:",
chrome.runtime.lastError.message,
);
reject(
new Error(
`Failed to attach debugger: ${chrome.runtime.lastError.message}`
)
`Failed to attach debugger: ${chrome.runtime.lastError.message}`,
),
);
} else {
console.log('attached to debugger');
await chrome.debugger.sendCommand({ tabId }, 'DOM.enable');
console.log('DOM enabled');
await chrome.debugger.sendCommand({ tabId }, 'Runtime.enable');
console.log('Runtime enabled');
console.log("attached to debugger");
await chrome.debugger.sendCommand({ tabId }, "DOM.enable");
console.log("DOM enabled");
await chrome.debugger.sendCommand({ tabId }, "Runtime.enable");
console.log("Runtime enabled");
attachedTabs.add(tabId);
resolve();
}
});
});
}

/**
 * Detach the Chrome debugger from a tab and stop tracking it as attached.
 *
 * This is best-effort: a failed detach is logged instead of being thrown,
 * so callers never see a rejection from this function.
 *
 * @param tabId - id of the tab to detach the debugger from
 */
export async function detachDebugger(tabId: number) {
  // Forget the tab first: attachDebugger() skips tabs found in `attachedTabs`,
  // so the entry must go even when the detach call below fails.
  attachedTabs.delete(tabId);
  try {
    // Await the detach so that `await detachDebugger(...)` really waits for it
    // and a rejection is handled here rather than surfacing as an unhandled one.
    await chrome.debugger.detach({ tabId: tabId });
  } catch (error) {
    console.warn("Failed to detach debugger:", tabId, error);
  }
}

/**
 * Detach the debugger from every tab recorded in `attachedTabs`,
 * handling the tabs one after another.
 */
export async function detachAllDebuggers() {
  // Drive the set's live iterator by hand; detachDebugger() removes the
  // current entry from the set while we advance.
  const pending = attachedTabs.values();
  for (let step = pending.next(); !step.done; step = pending.next()) {
    await detachDebugger(step.value);
  }
}
40 changes: 33 additions & 7 deletions src/helpers/determineNextAction.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import { type LabelData } from "./../pages/content/drawLabels";
import OpenAI from "openai";
import { useAppState } from "../state/store";
import { availableActions, availableActionsVision } from "./availableActions";
import { ParsedResponseSuccess } from "./parseResponse";
import { fetchKnowledge } from "./knowledge/fetchKnowledge";

const formattedActions = availableActions
.map((action, i) => {
Expand Down Expand Up @@ -29,7 +31,8 @@ You can use the following tools:

${formattedActions}

You will be be given a task to perform and the current state of the DOM. You will also be given previous actions that you have taken. You may retry a failed action up to one time.
You will be given a task to perform and the current state of the DOM.
You will also be given previous actions that you have taken. You may retry a failed action up to one time.

There are two examples of actions:

Expand All @@ -56,19 +59,19 @@ You can use the following tools:

${formattedActionsVision}

Pleaes note for setValue, you can press "enter" by including a line break (\`\\n\`) in the parameter to trigger form submitting.

You will be be given a task to perform and a screenshot of the webpage. You will also be given previous actions that you have taken. You may retry a failed action up to one time.
You will be given a task to perform, and an image. The image will contain two parts: on the left is a clean screenshot of the current page, and on the right is the same screenshot with interactive elements annotated with corresponding label.
You will also be given previous actions that you have taken. You may retry a failed action up to one time.
You will also be given additional information of annotations.

This is an example of an action:

{
thought: "I should click the add to cart button",
action: "click('add to cart')"
action: "click('12')"
lynchee-owo marked this conversation as resolved.
Show resolved Hide resolved
}

Your response must always be in JSON format and must include "thought" and "action".
When finish, use "finish()" in "action" and include a brief summary of the task in "thought"; if user is seeking an anwser, also include the answer in "thought".
When finish, use "finish()" in "action" and include a brief summary of the task in "thought"; if user is seeking an answer, also include the answer in "thought".
`;

export type NextAction = {
Expand All @@ -79,8 +82,11 @@ export type NextAction = {

export async function determineNextActionWithVision(
taskInstructions: string,
url: string | undefined,
previousActions: ParsedResponseSuccess[],
screenshotData: string,
labelData: LabelData[],
viewportPercentage: number,
maxAttempts = 3,
notifyError?: (error: string) => void,
): Promise<NextAction> {
Expand All @@ -90,7 +96,21 @@ export async function determineNextActionWithVision(
return null;
}
const model = useAppState.getState().settings.selectedModel;
const prompt = formatPrompt(taskInstructions, previousActions);
const location = new URL(url ?? "");
const knowledge = fetchKnowledge(location);
let prompt =
formatPrompt(taskInstructions, previousActions) +
`Current page progress: ${viewportPercentage.toFixed(1)}%`;
if (knowledge.length > 0) {
prompt += `
Notes regarding the current website:
${knowledge.map((k) => ` - ${k}`).join("\n")}`;
}
prompt += `

Use the following data as a reference of the annotated elements (using \`===\` as a delimiter between each annotation):

${labelData.map((item) => tomlLikeStringifyObject(item)).join("\n===\n")}`;

const openai = new OpenAI({
apiKey: key,
Expand Down Expand Up @@ -249,3 +269,9 @@ ${pageContents}`;
}
return result;
}

/**
 * Render an object's own enumerable properties as `key = <JSON value>` lines,
 * one property per line, joined with newlines.
 *
 * @param obj - the object to render
 * @returns the rendered lines; an empty string when `obj` has no properties
 */
function tomlLikeStringifyObject(obj: Record<string, unknown>): string {
  const lines: string[] = [];
  for (const key of Object.keys(obj)) {
    const encoded = JSON.stringify(obj[key]);
    lines.push(`${key} = ${encoded}`);
  }
  return lines.join("\n");
}
22 changes: 22 additions & 0 deletions src/helpers/knowledge/db.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"x.com": {
"rules": [
{
"regexes": [".*"],
"knowledge": [
"The website X (formerly Twitter) is a social media platform. Many people still call it Twitter and use the term \"tweet\" to refer to a post.",
"Do not confuse \"post\" with \"message\". A post is a public message that can be seen by anyone, while a message is a private message that can only be seen by the recipient."
]
},
{
"regexes": ["/compose/post/?$"],
"knowledge": [
"The \"Add post\" button is used to compose a thread. Do not confuse with the \"Post\" button that sends the composed tweet."
]
}
]
},
"twitter.com": {
"redirect": "x.com"
}
}
41 changes: 41 additions & 0 deletions src/helpers/knowledge/fetchKnowledge.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import db from "./db.json";

/**
 * Shape that the imported `db.json` content is cast to: one entry per host name.
 */
type Data = {
[host: string]: {
// When present, fetchKnowledge looks up this other host's entry instead.
redirect?: string;
// Path-dependent notes for this host.
rules?: {
// Pattern sources; fetchKnowledge compiles each with the "i" flag and tests it against the pathname.
regexes: string[];
// Notes that fetchKnowledge returns when the rule matches.
knowledge: string[];
}[];
};
};

/**
 * The parts of a page location that fetchKnowledge reads.
 */
export type LocationInfo = {
// Key used to look up the host's entry in the knowledge data.
host: string;
// Path that the entry's rule regexes are tested against.
pathname: string;
};

/**
 * Collect the knowledge notes that apply to a page location.
 *
 * The host's entry is looked up in the bundled `db.json` data. If that entry
 * declares a `redirect`, the redirect target's entry is used instead. Every
 * rule with at least one regex matching `pathname` (case-insensitively)
 * contributes its notes, in rule order.
 *
 * @param location - host and pathname of the current page
 * @returns the matching notes; an empty array when the host is unknown,
 *          nothing matches, or the redirect chain loops back on itself
 */
export function fetchKnowledge(location: LocationInfo): string[] {
  // TODO: fetch from a server
  const data = db as unknown as Data;
  const { pathname } = location;

  // Follow redirects iteratively and remember visited hosts, so a cyclic
  // redirect chain ends with "no knowledge" instead of recursing forever.
  const visitedHosts = new Set<string>();
  let host = location.host;
  let hostData = data[host];
  while (hostData && hostData.redirect != null) {
    if (visitedHosts.has(host)) {
      return [];
    }
    visitedHosts.add(host);
    host = hostData.redirect;
    hostData = data[host];
  }

  const rules = hostData ? hostData.rules : undefined;
  if (rules == null) {
    return [];
  }

  const result: string[] = [];
  for (const rule of rules) {
    // A rule contributes its notes once, however many of its regexes match.
    const matches = rule.regexes.some((pattern) =>
      new RegExp(pattern, "i").test(pathname),
    );
    if (matches) {
      result.push(...rule.knowledge);
    }
  }
  return result;
}
14 changes: 14 additions & 0 deletions src/helpers/rpc/domActions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,20 @@ export class DomActions {
await sleep(300);
}

/**
 * Scroll the window to position (0, 0) by evaluating a script through
 * the `Runtime.evaluate` command, then pause briefly.
 */
public async scrollToTop() {
  const expression = "window.scroll({left: 0, top: 0})";
  await this.sendCommand("Runtime.evaluate", { expression });
  // Short pause after scrolling (presumably 300 ms, to let the page settle).
  await sleep(300);
}

/**
 * Scroll the window to the bottom of the document by evaluating a script
 * through the `Runtime.evaluate` command, then pause briefly.
 */
public async scrollToBottom() {
  await this.sendCommand("Runtime.evaluate", {
    // Use the larger of the two scroll heights: document.body.offsetHeight
    // can be smaller than the scrollable document (e.g. body margins or a
    // fixed-height body with overflowing content), which would stop short
    // of the real bottom.
    expression:
      "window.scroll({left: 0, top: Math.max(document.body.scrollHeight, document.documentElement.scrollHeight)})",
  });
  // Short pause after scrolling (presumably 300 ms, to let the page settle).
  await sleep(300);
}

public async setValueWithElementId(payload: {
elementId: number;
value: string;
Expand Down
19 changes: 15 additions & 4 deletions src/helpers/rpc/performAction.ts
Original file line number Diff line number Diff line change
Expand Up @@ -148,10 +148,21 @@ async function setValueWithLabel(
}

async function scroll(domActions: DomActions, action: Action) {
if (action.args.value === "up") {
await domActions.scrollUp();
} else {
await domActions.scrollDown();
switch (action.args.value) {
case "up":
await domActions.scrollUp();
break;
case "down":
await domActions.scrollDown();
break;
case "top":
await domActions.scrollToTop();
break;
case "bottom":
await domActions.scrollToBottom();
break;
default:
console.error("Invalid scroll value", action.args.value);
}
}

Expand Down
2 changes: 2 additions & 0 deletions src/pages/content/domOperations.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import attachFile from "./attachFile";
import { drawLabels, removeLabels } from "./drawLabels";
import ripple from "./ripple";
import { getDataFromRenderedMarkdown } from "./reverseMarkdown";
import getViewportPercentage from "./getViewportPercentage";

export const rpcMethods = {
getAnnotatedDOM,
Expand All @@ -17,6 +18,7 @@ export const rpcMethods = {
drawLabels,
removeLabels,
getDataFromRenderedMarkdown,
getViewportPercentage,
} as const;

export type RPCMethods = typeof rpcMethods;
Expand Down
Loading
Loading