diff --git a/resources/prompts/screen.txt b/resources/prompts/screen.txt
index abbc1f7..287117a 100644
--- a/resources/prompts/screen.txt
+++ b/resources/prompts/screen.txt
@@ -1,16 +1,4 @@
-You are a computer assistant actively observing the screen and listening to the speakers and user.
-
-===== Recent audio input =====
-%s
-
-===== Recent screen transcripts =====
-%s
-
-===== Your recent statements =====
-%s
-
-===== Your recent actions =====
-%s
+You are a reactive computer assistant actively observing the screen, listening to the speakers and user and your statements are put out on the speakers.
 
 ===== Instructions =====
 - Your goal is to assist the user by understanding the context and responding helpfully and concisely.
@@ -18,5 +6,12 @@ You are a computer assistant actively observing the screen and listening to the
 - Respond *only* to any questions asked by the user in the audio input! Consider the context from the screen and your recent statements.
 - Do not repeat yourself in terms of your recent statements! Continue from where you left off to maintain a natural conversation flow.
 - Do not explicitly tell the user to feel free to ask questions or follow up!
-- You might have heard your own earlier statements also on audio input as a feedback. Ignore those.
-- If there is nothing new to add answering the questions of the user, e.g. because you answered it and no further question has been asked, respond with "QUIET". Otherwise, provide the next one or two sentences in your ongoing conversation with the user.
\ No newline at end of file
+- You might receive your own earlier statements on audio input also on audio input as a feedback. Ignore those.
+- The timestamps are for your orientation and do not need to be mentioned in your responses.
+- If there is nothing new to add answering the questions of the user, e.g. because you answered it and no further question has been asked, respond with "QUIET". Otherwise, provide the next one or two sentences in your ongoing conversation with the user.
+
+Your output must follow this JSON format:
+
+```json
+{"statement": "Your statement ..."}
+```
\ No newline at end of file
diff --git a/src/is/simm/runtimes/openai.clj b/src/is/simm/runtimes/openai.clj
index 9e4b49d..725e95d 100644
--- a/src/is/simm/runtimes/openai.clj
+++ b/src/is/simm/runtimes/openai.clj
@@ -80,7 +80,7 @@
                        (get "content"))))))
         (.exceptionally (reify Function
                           (apply [_ e]
-                            (put! res (ex-info "Error in OpenAI chat." {:type :error-in-openai :error e}))))))
+                            (put! res (ex-info "Error in OpenAI chat." {:type :error-in-openai :error (ex-message e)}))))))
     res))
diff --git a/src/is/simm/runtimes/ubuntu.clj b/src/is/simm/runtimes/ubuntu.clj
index d5d1248..42fb4c7 100644
--- a/src/is/simm/runtimes/ubuntu.clj
+++ b/src/is/simm/runtimes/ubuntu.clj
@@ -11,10 +11,12 @@
             [datahike.api :as d]
             [clojure.edn :as edn]
+            [clojure.data.json :as json]
             [clojure.spec.alpha :as s]
             [clojure.string :as str]
             [clojure.java.io :as io]
-            [clojure.java.shell :as shell])
+            [clojure.java.shell :as shell]
+            [clojure.data.json :as json])
   (:import [java.util Base64]))
 
 
@@ -179,7 +181,7 @@
       (m/? (m/join vector
                    (m/sleep (* interval 1000))
                    (m/sp
-                     (let [text (time (m/? (vlm screenshot-prompt f)))]
+                     (let [text "" #_(time (m/? (vlm screenshot-prompt f)))]
                       #_(debug "screen text: " text)
                       (cb f text)))))
     nil))
@@ -328,11 +330,11 @@
     :db/cardinality :db.cardinality/one}
    {:db/ident :audio/in
     :db/valueType :db.type/string
-    :db/cardinality :db.cardinality/one} 
-   {:db/ident :audio/device
+    :db/cardinality :db.cardinality/one}
+   {:db/ident :audio/out
     :db/valueType :db.type/string
     :db/cardinality :db.cardinality/one}
-   {:db/ident :audio/out
+   {:db/ident :audio/device
     :db/valueType :db.type/string
     :db/cardinality :db.cardinality/one}
    {:db/ident :screen/file
@@ -341,7 +343,7 @@
    {:db/ident :screen/transcript
     :db/valueType :db.type/string
     :db/cardinality :db.cardinality/one}
-   {:db/ident :action
+   {:db/ident :assistant/output
     :db/valueType :db.type/string
     :db/cardinality :db.cardinality/one}])

@@ -390,7 +392,7 @@
    (map (fn [event]
          (let [role (get event :event/role)
                audio-in (get event :audio/in)
-               audio-out (get event :audio/out)
+               system-output (get event :system/output)
                screen-file (get event :screen/file)
                screen-transcript (get event :screen/transcript)
                action (get event :action)
@@ -400,9 +402,9 @@
             {:role role
              :content [{:type "text" :text (str created " audio-in: " audio-in)}]}

-            (and role audio-out)
+            (and role system-output)
             {:role role
-             :content [{:type "text" :text (str created " audio-out: " audio-out)}]}
+             :content [{:type "text" :text (str created " system-output: " system-output)}]}

             (and role screen-transcript)
             {:role role
@@ -413,23 +415,30 @@
              :content [{:type "image_url"
                         :image_url {:url (str "data:image/jpeg;base64," (encode-file screen-file))}}]}

-            (and role action)
-            {:role role
-             :content [{:type "text" :text (str created "action:" action)}]})))
+            )))
        events))

-(def system-prompt "You are a reactive computer assistant actively observing the screen, listening to the speakers and user and your statements are put out on the speakers.
-===== Instructions =====
-- Your goal is to assist the user by understanding the context and responding helpfully and concisely.
-- Maintain continuity by considering previous audio inputs, history of screen transcripts, your statements, and your actions.
-- Respond *only* to any questions asked by the user in the audio input! Consider the context from the screen and your recent statements.
-- Do not repeat yourself in terms of your recent statements! Continue from where you left off to maintain a natural conversation flow.
-- Do not explicitly tell the user to feel free to ask questions or follow up!
-- You might receive your own earlier statements on audio input also on audio input as a feedback. Ignore those.
-- The timestamps are for your orientation and do not need to be mentioned in your responses.
-- If there is nothing new to add answering the questions of the user, e.g. because you answered it and no further question has been asked, respond with \"QUIET\". Otherwise, provide the next one or two sentences in your ongoing conversation with the user.")
+(s/def ::actions (s/coll-of (s/or :key (s/keys :req-un [::key ::duration])
+                                  :mouse (s/keys :req-un [::mouse]))))
+
+(defn parse-spec [input spec default]
+  (try
+    (let [p (second (.split input "```clojure"))
+          p (first (.split p "```"))
+          p (edn/read-string p)]
+      (if (s/valid? spec p) p default))
+    (catch Exception _ default)))
+
+(defn parse-json [input default]
+  (try
+    (let [p (second (.split input "```json"))
+          p (first (.split p "```"))
+          p (json/read-str p)]
+      p)
+    (catch Exception _ default)))
+
 (defn baseline-0 [conn]
@@ -456,7 +465,8 @@
   (m/sp
    (loop []
      (debug "talk loop")
-     (let [last-screen (->> @conn
+     (let [system-prompt (slurp (io/resource "prompts/screen.txt"))
+           last-screen (->> @conn
                             (d/q '[:find ?s ?c :where [?e :screen/file ?s] [?e :event/created ?c]])
                             (sort-by second)
                             first ;; newest
@@ -470,21 +480,20 @@
        (debug "last-screen" last-screen)
        (debug "messages" messages)
-       (if (and (not (empty? messages)) last-screen) #_last-screen
-         (let [statement (m/? (> @conn
+                            (d/q '[:find ?s ?c :where [?e :screen/file ?s] [?e :event/created ?c]])
+                            (sort-by second)
+                            first ;; newest
+                            first)
+           messages (->> @conn
+                         (d/q '[:find (pull ?e [:*]) :where [?e :event/created ?c]])
+                         (map first)
+                         (sort-by :event/created)
+                         events->openai-messages)]
+       (debug "last-screen" last-screen)
+       (debug "messages" messages)
+
+       (if (and (seq messages) last-screen) #_last-screen
+         (let [statement (m/? (