diff --git a/resources/prompts/plaicraft-short.txt b/resources/prompts/plaicraft-short.txt index 761ccdc..aeabe34 100644 --- a/resources/prompts/plaicraft-short.txt +++ b/resources/prompts/plaicraft-short.txt @@ -3,8 +3,7 @@ Your name is Yachiusu. You should act like a 12 year old boy who happens to kno You are nice and humorus and make brief direct statements. You can act in this plaicraft environment by speaking and by directly generating mouse movements and key presses that will be acted upon in the Minecraft world. Unfortunately you only get to look at the screen every 10 seconds or so, and you only get to act given a textual description of what you see at that. -In some sense you are blind, but you see this as a challenge to be overcome. To do this you can ask questions about the people who are playing. Remember, that -on a Minecraft server you can press "e" to see who is playing. And you can press "t" to start a chat. In the chat you may type messages that can be read by everyone. +In some sense you are blind, but you see this as a challenge to be overcome. To do this you can ask questions about the people who are playing. In the chat you may type messages that can be read by everyone. Chatting is one of the safest actions you can take. You can also speak in the environment. You have a long history of playing and you say things like those in the following transcript. @@ -153,4 +152,17 @@ OK. Use this to base your personality. Below this point you will have a growin Using these you must produce words to be spoken and actions to be taken over a 10 to 15 second time horizon, knowing that you will only get to see the screen every 10 seconds. It would be wise to ask those around you, if there are other players around you, what it is safe to do and what you should not do. Or to otherwise be very slow and deliberate, checking after each action if the action you took had the intended consequence. Remember that "space" is jump. 
If someone says it is OK to run forward, press and hold "w" -and "space" every once in a while. The rest of Minecraft commands should be familiar to you. \ No newline at end of file +and "space" every once in a while. The rest of Minecraft commands should be familiar to you. Provide output for the next 20 seconds. + +Reply with statements and actions in the following JSON format only! Here is an example: + +```json +[{"action": "statement", "text": "What?"}, + {"action": "press-keys", "keys": ["w"], "duration": 0.3}, + {"action": "statement", "text": "Forward!"}, + {"action": "mouse-move", "relative": [20, -5]}, + {"action": "mouse-click", "button": "left"}, + {"action": "press-keys", "keys": ["space", "w"], "duration": 3.2}, + {"action": "statement", "text": "Upwards!"} + ] + ``` diff --git a/resources/prompts/screen.txt b/resources/prompts/screen.txt index 287117a..189bc6f 100644 --- a/resources/prompts/screen.txt +++ b/resources/prompts/screen.txt @@ -3,10 +3,10 @@ You are a reactive computer assistant actively observing the screen, listening t ===== Instructions ===== - Your goal is to assist the user by understanding the context and responding helpfully and concisely. - Maintain continuity by considering previous audio inputs, history of screen transcripts, your statements, and your actions. -- Respond *only* to any questions asked by the user in the audio input! Consider the context from the screen and your recent statements. +- Respond *only* to any questions asked by the user in the audio input and otherwise be silent! Consider the context from the screen and your recent statements. - Do not repeat yourself in terms of your recent statements! Continue from where you left off to maintain a natural conversation flow. - Do not explicitly tell the user to feel free to ask questions or follow up! -- You might receive your own earlier statements on audio input also on audio input as a feedback. Ignore those. 
+- You might receive your own earlier statements also on audio input as a feedback. Ignore those. - The timestamps are for your orientation and do not need to be mentioned in your responses. - If there is nothing new to add answering the questions of the user, e.g. because you answered it and no further question has been asked, respond with "QUIET". Otherwise, provide the next one or two sentences in your ongoing conversation with the user. diff --git a/src/is/simm/runtimes/ubuntu.clj b/src/is/simm/runtimes/ubuntu.clj index 42fb4c7..23529a1 100644 --- a/src/is/simm/runtimes/ubuntu.clj +++ b/src/is/simm/runtimes/ubuntu.clj @@ -3,7 +3,7 @@ (:require [is.simm.runtimes.openai :refer [text-chat chat whisper-1 tts-1]] [is.simm.prompts :as prompts] [clojure.core.async :refer [timeout put! chan pub sub close! take! poll! go-loop go] :as async] - [taoensso.timbre :refer [debug info warn] :as log] + [taoensso.timbre :refer [debug info warn error] :as log] [missionary.core :as m] [libpython-clj2.require :refer [require-python]] [libpython-clj2.python :refer [py. py.. py.-] :as py] @@ -112,7 +112,7 @@ (m/? (m/via m/blk (shell "ffmpeg" "-f" "pulse" "-i" device "-t" (str interval) filename))) filename))) -(def audio-devices +(def audio-devices {:microphone "alsa_input.pci-0000_00_1f.3-platform-skl_hda_dsp_generic.HiFi__hw_sofhdadsp_6__source" :speakers "alsa_output.pci-0000_00_1f.3-platform-skl_hda_dsp_generic.HiFi__hw_sofhdadsp__sink.monitor" #_"bluez_sink.F8_DF_15_4F_1D_F0.a2dp_sink.monitor"}) @@ -125,9 +125,7 @@ (def tts-test (m/? (ecode key) + (let [keys (map clj->ecode keys) ui (UInput)] (if key (do - (py. ui write (py.- ecodes EV_KEY) key 1) ;; Key press + (doseq [key keys + :when key] + (py. ui write (py.- ecodes EV_KEY) key 1)) ;; Key press (py. ui syn) (time/sleep duration) - (py. ui write (py.- ecodes EV_KEY) key 0) ;; Key release + (doseq [key keys + :when key] + (py. ui write (py.- ecodes EV_KEY) key 0)) ;; Key release (py. ui syn) (py. 
ui close)) (warn "Invalid key" key)) @@ -294,8 +298,8 @@ (defn mouse-click [button duration] (py/with-gil-stack-rc-context - (let [key ({:left-click (py.- ecodes BTN_LEFT) - :right-click (py.- ecodes BTN_RIGHT)} button) + (let [key ({:left (py.- ecodes BTN_LEFT) + :right (py.- ecodes BTN_RIGHT)} button) ui (UInput {(py.- ecodes EV_KEY) [(py.- ecodes BTN_LEFT) (py.- ecodes BTN_RIGHT)]})] (if key (do @@ -308,7 +312,7 @@ (py. ui close)))) (comment - (press-key :a 0.1) + (press-keys [:a] 0.1) (mouse-move [13 500]) @@ -330,7 +334,7 @@ :db/cardinality :db.cardinality/one} {:db/ident :audio/in :db/valueType :db.type/string - :db/cardinality :db.cardinality/one} + :db/cardinality :db.cardinality/one} {:db/ident :audio/out :db/valueType :db.type/string :db/cardinality :db.cardinality/one} @@ -371,28 +375,25 @@ :event/role "user" :screen/file "/tmp/test_screenshot_123.png"}]) - + (->> @conn (d/q '[:find ?s ?c :where [?e :screen/file ?s] [?e :event/created ?c]]) (sort-by second) - ffirst - ) + ffirst) ;; pull full event history out of conn (->> @conn (d/q '[:find (pull ?e [:*]) :where [?e :event/created ?c]]) (map first) (sort-by :event/created) - events->openai-messages) - - - ) + events->openai-messages)) (defn events->openai-messages [events] (map (fn [event] (let [role (get event :event/role) audio-in (get event :audio/in) - system-output (get event :system/output) + audio-out (get event :audio/out) + assistant-output (get event :assistant/output) screen-file (get event :screen/file) screen-transcript (get event :screen/transcript) action (get event :action) @@ -402,20 +403,26 @@ {:role role :content [{:type "text" :text (str created " audio-in: " audio-in)}]} - (and role system-output) + (and role audio-out) {:role role - :content [{:type "text" :text (str created " system-output: " system-output)}]} + :content [{:type "text" :text (str created " audio-out: " audio-out)}]} + + (and role assistant-output) + {:role role + :content [{:type "text" :text (str created " 
assistant-output: " assistant-output)}]} (and role screen-transcript) {:role role - :content [{:type "text" :text (str created "screen-transcript: " screen-transcript)}]} + :content [{:type "text" :text (str created " screen-transcript: " screen-transcript)}]} + + :else (do (error "missing event" event) + {:role role :content + [{:type "text" :text "Missing event."}]}) #_(and role screen-file) #_{:role role :content [{:type "image_url" - :image_url {:url (str "data:image/jpeg;base64," (encode-file screen-file))}}]} - - ))) + :image_url {:url (str "data:image/jpeg;base64," (encode-file screen-file))}}]}))) events)) @@ -469,9 +476,9 @@ last-screen (->> @conn (d/q '[:find ?s ?c :where [?e :screen/file ?s] [?e :event/created ?c]]) (sort-by second) + reverse first ;; newest - first - ) + first) messages (->> @conn (d/q '[:find (pull ?e [:*]) :where [?e :event/created ?c]]) (map first) @@ -502,7 +509,7 @@ (recur))))) (comment - + (log/set-min-level! :debug) @@ -532,66 +539,13 @@ #(prn ::error %))) (baseline-0-dispose) - + + ) -(defn baseline-1 [] - (let [!audio-in (atom []) - !audio-out (atom []) - !screen-in (atom []) - !action-out (atom [])] - ;; run audio and screen perception in - (m/race (audio-listen (:microphone audio-devices) 5 #(swap! !audio-in conj %)) - (audio-listen (:speakers audio-devices) 5 #(swap! !audio-out conj %)) - (screen-watch 10 #(swap! !screen-in conj [%1 %2])) - - (m/sp - (loop [] - (let [last-screen (first (last @!screen-in))] - (debug "talk loop") - (when last-screen - (let [statement (m/? (vlm (slurp (io/resource "prompts/plaicraft-short.txt")) - #_(str prompts/minecraft - "If there is nothing new to say then reply with QUIET. Otherwise say the next two sentences from the first person perspective in a fun and playful style:\n") - last-screen - (map second @!audio-in) - @!audio-out - (map second @!screen-in) - @!action-out))] - #_(debug "statement" statement) - (when-not (.contains statement "QUIET") - (swap! !audio-out conj statement) - (m/? 
(play-audio (m/? (> @conn (d/q '[:find ?s ?c :where [?e :screen/file ?s] [?e :event/created ?c]]) (sort-by second) + reverse first ;; newest first) messages (->> @conn (d/q '[:find (pull ?e [:*]) :where [?e :event/created ?c]]) (map first) (sort-by :event/created) + reverse events->openai-messages)] (debug "last-screen" last-screen) (debug "messages" messages) (if (and (seq messages) last-screen) #_last-screen - (let [statement (m/? (