diff --git a/resources/prompts/minecraft.txt b/resources/prompts/minecraft.txt index 2818b05..b61cdf9 100644 --- a/resources/prompts/minecraft.txt +++ b/resources/prompts/minecraft.txt @@ -1,30 +1,15 @@ You are a Minecraft player having fun in Minecraft. You get transcripts of the screen and are listening to the speakers. Go ahead and explore the world by pursuing meaningful goals. - - ===== Recent audio input ===== - - %s - ===== Recent screen descriptions ===== - - %s - ===== Your recent statements ===== - - %s - ===== Your recent actions ===== - - %s - ================ - diff --git a/resources/prompts/screen.txt b/resources/prompts/screen.txt index 8fe4d32..abbc1f7 100644 --- a/resources/prompts/screen.txt +++ b/resources/prompts/screen.txt @@ -1,31 +1,22 @@ -You are an computer assistant looking at the screen and listening to the speakers. You execute tasks on the screen. - - +You are a computer assistant actively observing the screen and listening to the speakers and user. ===== Recent audio input ===== - - %s - -===== Recent screen descriptions ===== - - -%s - +===== Recent screen transcripts ===== +%s ===== Your recent statements ===== - - %s - ===== Your recent actions ===== - - %s - -================ - - +===== Instructions ===== +- Your goal is to assist the user by understanding the context and responding helpfully and concisely. +- Maintain continuity by considering previous audio inputs, history of screen transcripts, your statements, and your actions. +- Respond *only* to any questions asked by the user in the audio input! Consider the context from the screen and your recent statements. +- Do not repeat yourself in terms of your recent statements! Continue from where you left off to maintain a natural conversation flow. +- Do not explicitly tell the user to feel free to ask questions or follow up! +- You might have heard your own earlier statements also on audio input as a feedback. Ignore those. +- If there is nothing new to add answering the questions of the user, e.g. because you answered it and no further question has been asked, respond with "QUIET". Otherwise, provide the next one or two sentences in your ongoing conversation with the user. \ No newline at end of file diff --git a/src/is/simm/runtimes/openai.clj b/src/is/simm/runtimes/openai.clj index ed36f3d..9e4b49d 100644 --- a/src/is/simm/runtimes/openai.clj +++ b/src/is/simm/runtimes/openai.clj @@ -14,7 +14,8 @@ [superv.async :refer [S go-try go-loop-try cf (.thenApply (reify Function @@ -70,6 +84,9 @@ res)) +(s/fdef text-chat + :args (s/cat :model string? :text string?) + :ret (s/cat :response string?)) (defn text-chat [model text] (let [res (chan)] (if (>= (count text) (* 4 (window-sizes model))) @@ -77,7 +94,8 @@ (put! res (ex-info "Sorry, the text is too long for this model. Please try a shorter text." {:type ::text-too-long :model model :text-start (subs text 0 100) :count (count text)})) res) - (chat model [{"type" "text" "text" text}])))) + (chat model [{"role" "user" + "content" [{"type" "text" "text" text}]}])))) (comment @@ -153,7 +171,8 @@ request (http/post "https://api.openai.com/v1/audio/transcriptions" {:headers headers :multipart [{:name "file" :content (io/file input-path) :file-name input-path :mimetype "audio/wav"} - {:name "model" :content model}] + {:name "model" :content model} + {:name "language" :content "en"}] :async true})] (-> request (.thenApply (reify Function diff --git a/src/is/simm/runtimes/ubuntu.clj b/src/is/simm/runtimes/ubuntu.clj index cda5e29..d5d1248 100644 --- a/src/is/simm/runtimes/ubuntu.clj +++ b/src/is/simm/runtimes/ubuntu.clj @@ -8,6 +8,7 @@ [libpython-clj2.require :refer [require-python]] [libpython-clj2.python :refer [py. py.. py.-] :as py] [hyperfiddle.rcf :refer [tests]] + [datahike.api :as d] [clojure.edn :as edn] [clojure.spec.alpha :as s] @@ -22,9 +23,9 @@ ;; throw on shell error (defn shell [cmd & args] (let [result (apply shell/sh cmd args)] - (if (zero? (:exit result)) - (:out result) - (throw (ex-info "Shell command failed" result))))) + (if (zero? (:exit result)) + (:out result) + (throw (ex-info "Shell command failed" result))))) (defn open [filename] (shell "xdg-open" filename)) @@ -48,36 +49,41 @@ (comment (shell "cvlc" "--no-loop" "/tmp/58a6708e-6fa3-41eb-b75b-e3cc4934e064.mp3")) - + ;; ===== Language ===== -(defn vlm +(defn vlm ([prompt filename] - (> @conn + (d/q '[:find ?s ?c :where [?e :screen/file ?s] [?e :event/created ?c]]) + (sort-by second) + ffirst + ) + + ;; pull full event history out of conn + (->> @conn + (d/q '[:find (pull ?e [:*]) :where [?e :event/created ?c]]) + (map first) + (sort-by :event/created) + events->openai-messages) + + + ) + +(defn events->openai-messages [events] + (map (fn [event] + (let [role (get event :event/role) + audio-in (get event :audio/in) + audio-out (get event :audio/out) + screen-file (get event :screen/file) + screen-transcript (get event :screen/transcript) + action (get event :action) + created (get event :event/created)] + (cond + (and role audio-in) + {:role role + :content [{:type "text" :text (str created " audio-in: " audio-in)}]} + + (and role audio-out) + {:role role + :content [{:type "text" :text (str created " audio-out: " audio-out)}]} + + (and role screen-transcript) + {:role role + :content [{:type "text" :text (str created "screen-transcript: " screen-transcript)}]} + + #_(and role screen-file) + #_{:role role + :content [{:type "image_url" + :image_url {:url (str "data:image/jpeg;base64," (encode-file screen-file))}}]} + + (and role action) + {:role role + :content [{:type "text" :text (str created "action:" action)}]}))) + events)) + + +(def system-prompt "You are a reactive computer assistant actively observing the screen, listening to the speakers and user and your statements are put out on the speakers. + +===== Instructions ===== +- Your goal is to assist the user by understanding the context and responding helpfully and concisely. +- Maintain continuity by considering previous audio inputs, history of screen transcripts, your statements, and your actions. +- Respond *only* to any questions asked by the user in the audio input! Consider the context from the screen and your recent statements. +- Do not repeat yourself in terms of your recent statements! Continue from where you left off to maintain a natural conversation flow. +- Do not explicitly tell the user to feel free to ask questions or follow up! +- You might receive your own earlier statements on audio input also on audio input as a feedback. Ignore those. +- The timestamps are for your orientation and do not need to be mentioned in your responses. +- If there is nothing new to add answering the questions of the user, e.g. because you answered it and no further question has been asked, respond with \"QUIET\". Otherwise, provide the next one or two sentences in your ongoing conversation with the user.") + + +(defn baseline-0 [conn] + (m/race (audio-listen (:microphone audio-devices) + 10 + #(d/transact! conn [{:audio/in % + :audio/device (:microphone audio-devices) + :event/created (java.util.Date.) + :event/role "user"}])) + + (audio-listen (:speakers audio-devices) + 10 + #(d/transact! conn [{:audio/out % + :audio/device (:speakers audio-devices) + :event/created (java.util.Date.) + :event/role "developer"}])) + + (screen-watch 10 + #(d/transact! conn [{:screen/file %1 + :screen/transcript %2 + :event/created (java.util.Date.) + :event/role "user"}])) + + (m/sp + (loop [] + (debug "talk loop") + (let [last-screen (->> @conn + (d/q '[:find ?s ?c :where [?e :screen/file ?s] [?e :event/created ?c]]) + (sort-by second) + first ;; newest + first + ) + messages (->> @conn + (d/q '[:find (pull ?e [:*]) :where [?e :event/created ?c]]) + (map first) + (sort-by :event/created) + events->openai-messages)] + (debug "last-screen" last-screen) + (debug "messages" messages) + + (if (and (not (empty? messages)) last-screen) #_last-screen + (let [statement (m/? (> @conn + (d/q '[:find (pull ?e [:*]) :where [?e :event/created ?c]]) + (map first) + (sort-by :event/created) + reverse) + + + (def baseline-0-test (baseline-0 conn)) (def baseline-0-dispose (baseline-0-test #(prn ::success %) #(prn ::error %))) - (baseline-0-dispose)) - + (baseline-0-dispose) + + ) + (s/def ::actions (s/coll-of (s/or :key (s/keys :req-un [::key ::duration]) :mouse (s/keys :req-un [::mouse])))) (defn parse-spec [input spec default] - (try - (let [p (second (.split input "```clojure")) - p (first (.split p "```")) - p (edn/read-string p)] - (if (s/valid? spec p) p default)) - (catch Exception _ []))) + (try + (let [p (second (.split input "```clojure")) + p (first (.split p "```")) + p (edn/read-string p)] + (if (s/valid? spec p) p default)) + (catch Exception _ []))) (defn baseline-1 [] (let [!audio-in (atom []) !audio-out (atom []) !screen-in (atom []) - !action-out (atom [])] + !action-out (atom []) ] ;; run audio and screen perception in - (m/race (audio-listen !audio-in 10) - (screen-watch !screen-in 30) + (m/race (audio-listen (:microphone audio-devices) 5 #(swap! !audio-in conj %)) + (audio-listen (:speakers audio-devices) 5 #(swap! !audio-out conj %)) + (screen-watch 10 #(swap! !screen-in conj [%1 %2])) (m/sp (loop [] - (debug "talk loop") - (when true #_(silence? (second (last @!audio-in))) - (let [statement (m/? (llm (str prompts/minecraft - "If there is nothing new to say then reply with QUIET. Otherwise say the next two sentences from the first person perspective in a fun and playful style:\n") - (map second @!audio-in) - @!audio-out - (map second @!screen-in) - @!action-out))] - #_(debug "statement" statement) - (when-not (.contains statement "QUIET") - (swap! !audio-out conj statement) - (m/? (play-audio (m/? (