Skip to content

Commit

Permalink
Factor text-extraction runtime.
Browse files Browse the repository at this point in the history
  • Loading branch information
whilo committed Apr 12, 2024
1 parent fca1d77 commit 65f3cb6
Show file tree
Hide file tree
Showing 3 changed files with 111 additions and 59 deletions.
60 changes: 2 additions & 58 deletions src/ie/simm/runtimes/relational_assistance.clj
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
conn)
(catch Exception _
(d/connect cfg)))]
(d/transact conn default-schema)
#_(d/transact conn default-schema)
(swap! peer assoc-in [:conn chat-id] conn)
conn)))

Expand Down Expand Up @@ -101,26 +101,6 @@
(when (seq tags)
{:message/tag tags}))]))))


;; TODO factor into youtube middleware
;; require libpython

(require '[libpython-clj2.require :refer [require-python]]
'[libpython-clj2.python :refer [py. py.. py.-] :as py])

(require-python '[youtube_transcript_api :refer [YouTubeTranscriptApi]])

(defn youtube-transcript [video-id]
;; " ".join([t['text'] for t in transcript])
(let [transcript (py. YouTubeTranscriptApi get_transcript video-id)]
(str/join " " (map :text transcript))))

(comment
(youtube-transcript "20TAkcy3aBY")

)


(def window-size 10)

(defn summarize [S conn conv chat]
Expand Down Expand Up @@ -195,33 +175,6 @@
(.close zip-out)
zip-file))

(defn extract-url [S text chat]
(go-try S
(if-let [;; if text matches http or https web URL extrect URL with regex
url (re-find #"https?://\S+" text)]
(if-let [;; extract youtube video id from URL
youtube-id (second (or (re-find #"youtube.com/watch\?v=([^&]+)" url)
(re-find #"youtu.be/([^\?]+)" url)))]
(try
(debug "summarizing youtube transcript" youtube-id)
(let [transcript (youtube-transcript youtube-id)
summary (<? S (cheap-llm (format pr/summarization transcript)))
summary (str "Youtube transcript summary:\n" summary "\n" url)]
(<? S (send-text! (:id chat) summary))
summary)
(catch Exception e
(warn "Could not extract transcript from youtube video" youtube-id e)
text))
(try
(let [body (<? S (extract-body url))
summary (<? S (cheap-llm (format pr/summarization body)))]
(<? S (send-text! (:id chat) summary))
(str "Website summary:\n" summary "\n" url))
(catch Exception e
(warn "Could not extract body from URL" url e)
text)))
text)))

(defn relational-assistance
"This interpreter can derive facts and effects through a relational database."
[[S peer [in out]]]
Expand Down Expand Up @@ -250,20 +203,11 @@
(when m
(binding [lb/*chans* [next-in pi out po]]
(let [{:keys [msg]
{:keys [text chat voice-path from]} :msg} m]
{:keys [chat from]} :msg} m]
(try
(let [_ (debug "received message" m)
firstname (:first_name from)
start-time-ms (System/currentTimeMillis)
text (if-not voice-path text
(let [transcript (<? S (stt-basic voice-path))
transcript (str "Voice transcript " (:username from) ":\n" transcript)]
(when text (warn "Ignoring text in favor of voice message"))
(debug "created transcript" transcript)
(<? S (send-text! (:id chat) transcript))
transcript))
text (<? S (extract-url S text chat))
msg (assoc msg :text text)

conn (ensure-conn peer (:id chat))
_ (debug "conn" conn)
Expand Down
103 changes: 103 additions & 0 deletions src/ie/simm/runtimes/text_extractor.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
(ns ie.simm.runtimes.text-extractor
"Extract texts from incoming message and augment the text.
Properties: stateless"
(:require [ie.simm.languages.bindings :as lb]
[ie.simm.languages.gen-ai :refer [cheap-llm stt-basic]]
[ie.simm.languages.chat :refer [send-text!]]
[ie.simm.languages.browser :refer [extract-body]]
[ie.simm.prompts :as pr]
[superv.async :refer [<?? go-try S go-loop-try <? >? put? go-for] :as sasync]
[clojure.core.async :refer [chan pub sub mult tap timeout] :as async]
[taoensso.timbre :refer [debug info warn error]]
[hasch.core :refer [uuid]]
[clojure.string :as str]
[libpython-clj2.require :refer [require-python]]
[libpython-clj2.python :refer [py. py.. py.-] :as py]))

;; Top-level form, so it is evaluated when this namespace is loaded: it requires
;; the Python module `youtube_transcript_api` through libpython-clj and refers
;; `YouTubeTranscriptApi` into this namespace for use by `youtube-transcript`.
;; NOTE(review): presumably this needs the Python package to be installed in the
;; interpreter libpython-clj binds to -- confirm in the deployment setup.
(require-python '[youtube_transcript_api :refer [YouTubeTranscriptApi]])

(defn youtube-transcript
  "Fetch the transcript of the YouTube video `video-id` by calling
  `YouTubeTranscriptApi.get_transcript` and return the `:text` value of every
  returned entry joined by single spaces.

  The call is made directly (it is not awaited through a channel)."
  [video-id]
  ;; Python equivalent: " ".join([t['text'] for t in transcript])
  ;; NOTE(review): assumes each transcript entry can be looked up with the
  ;; keyword :text through the libpython-clj bridge -- confirm.
  (->> (py. YouTubeTranscriptApi get_transcript video-id)
       (map :text)
       (str/join " ")))

;; REPL scratch form: `comment` does not evaluate its body, so nothing in here
;; runs when the namespace is loaded. Evaluate the inner form by hand to try
;; `youtube-transcript` on a sample video id.
(comment
(youtube-transcript "20TAkcy3aBY")

)

(defn extract-url
  "Look for the first http(s) URL in `text` and, if there is one, replace the
  text by a summary of the linked content.

  Arguments:
    S    - supervisor passed on to `go-try` and `<?`
    text - message text; may be nil (e.g. a message without text or voice)
    chat - map whose `:id` is used as the target of `send-text!`

  Returns the result of `go-try`, which callers take from with `<?`, yielding:
    - for a YouTube link (`youtube.com/watch?v=<id>` or `youtu.be/<id>`):
      \"Youtube transcript summary:\\n<summary>\\n<url>\"; the same string is
      also sent to the chat,
    - for any other link: \"Website summary:\\n<summary>\\n<url>\"; only the
      bare summary is sent to the chat,
    - `text` unchanged when it is nil, contains no URL, or when extraction or
      summarization throws (the failure is logged with `warn`)."
  [S text chat]
  (go-try S
    (if-let [;; if text contains an http or https web URL, extract it with a regex;
             ;; `some->>` keeps a nil text from reaching `re-find`, which would
             ;; otherwise throw a NullPointerException
             url (some->> text (re-find #"https?://\S+"))]
      (if-let [;; extract the youtube video id from the URL; the dots are escaped
               ;; so that only the literal host names match
               youtube-id (second (or (re-find #"youtube\.com/watch\?v=([^&]+)" url)
                                      (re-find #"youtu\.be/([^\?]+)" url)))]
        (try
          (debug "summarizing youtube transcript" youtube-id)
          (let [transcript (youtube-transcript youtube-id)
                summary (<? S (cheap-llm (format pr/summarization transcript)))
                summary (str "Youtube transcript summary:\n" summary "\n" url)]
            (<? S (send-text! (:id chat) summary))
            summary)
          (catch Exception e
            ;; best effort: fall back to the original text
            (warn "Could not extract transcript from youtube video" youtube-id e)
            text))
        (try
          (let [body (<? S (extract-body url))
                summary (<? S (cheap-llm (format pr/summarization body)))]
            (<? S (send-text! (:id chat) summary))
            (str "Website summary:\n" summary "\n" url))
          (catch Exception e
            ;; best effort: fall back to the original text
            (warn "Could not extract body from URL" url e)
            text)))
      text)))

(defn text-extractor
  "Middleware that augments the text of incoming telegram messages before they
  reach the next middleware.

  Takes the middleware triple `[S peer [in out]]` and returns
  `[S peer [next-in prev-out]]`.

  Messages on `in` whose `:type` is `:ie.simm.runtimes.telegram/message` are
  processed here:
    1. if `:voice-path` is set in `:msg`, the voice file is transcribed with
       `stt-basic`, the transcript is sent back to the chat and replaces any
       text of the message,
    2. the text is passed through `extract-url`,
    3. the message, with `[:msg :text]` replaced, is put on `next-in`.
  If processing throws, the error is logged together with a fresh error id,
  an apology containing that id is sent to the chat, and the message is not
  forwarded. All other message types are routed to `next-in` untouched.

  Everything put on `prev-out` is copied to `out`."
  [[S peer [in out]]]
  (let [;; A channel must only have one consumer that is supposed to see every
        ;; message. Creating two `pub`s directly on `in` makes them compete, so
        ;; each message would reach only one of them at random. Fan `in` out
        ;; through a mult instead; both publications are fully wired up before
        ;; the mult starts reading from `in`.
        next-in (chan 1000)

        ;; publication channel for the runtime context
        pub-in (chan)
        pi (pub pub-in :type)

        ;; subscriptions for this runtime context; everything that is not a
        ;; telegram message is passed on to next-in for the next middleware
        route-in (chan)
        p (pub route-in (fn [{:keys [type]}]
                          (or ({:ie.simm.runtimes.telegram/message ::message} type)
                              :unrelated)))
        msg-ch (chan 1000)
        _ (sub p ::message msg-ch)
        _ (sub p :unrelated next-in)

        mi (mult in)
        _ (tap mi pub-in)
        _ (tap mi route-in)

        ;; do the same in reverse for outputs from below
        prev-out (chan)
        mo (mult prev-out)
        _ (tap mo out)
        pub-out (chan)
        _ (tap mo pub-out)
        po (pub pub-out :type)]
    (go-loop-try S [m (<? S msg-ch)]
      (when m
        (binding [lb/*chans* [next-in pi out po]]
          (let [{:keys [msg]
                 {:keys [text chat voice-path from]} :msg} m]
            (try
              (let [_ (debug "received message" m)
                    ;; a voice message takes precedence over any text
                    text (if-not voice-path text
                                 (let [transcript (<? S (stt-basic voice-path))
                                       transcript (str "Voice transcript " (:username from) ":\n" transcript)]
                                   (when text (warn "Ignoring text in favor of voice message"))
                                   (debug "created transcript" transcript)
                                   (<? S (send-text! (:id chat) transcript))
                                   transcript))
                    text (<? S (extract-url S text chat))
                    msg (assoc msg :text text)
                    m (assoc m :msg msg)]
                (debug "forwarding augmented message" m)
                (>? S next-in m))
              (catch Exception e
                ;; the error id lets the user-facing reply be matched to the log
                (let [error-id (uuid)]
                  (error "Could not process message(" error-id "): " m e)
                  (<? S (send-text! (:id chat) (str "Sorry, I could not process your message. Error: " error-id))))))))
        (recur (<? S msg-ch))))
    [S peer [next-in prev-out]]))
7 changes: 6 additions & 1 deletion src/ie/simm/towers.clj
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
[ie.simm.runtimes.report :refer [report]]
[ie.simm.runtimes.etaoin :refer [etaoin]]
[ie.simm.runtimes.telegram :refer [telegram long-polling]]
[ie.simm.runtimes.text-extractor :refer [text-extractor]]
[ie.simm.runtimes.relational-assistance :refer [relational-assistance]]
[kabel.peer :refer [drain]]
[clojure.core.async :refer [close!]]
Expand All @@ -17,7 +18,7 @@
[S peer [in out]])

(defn default []
(comp drain brave etaoin openai relational-assistance telegram codrain))
(comp drain brave etaoin openai relational-assistance text-extractor telegram codrain))

(defn debug []
(comp drain
Expand All @@ -30,6 +31,8 @@
(partial report #(println "openai: " (:type %) (:request-id %)))
relational-assistance
(partial report #(println "relational-assistance: " (:type %) (:request-id %)))
text-extractor
(partial report #(println "text-extractor: " (:type %) (:request-id %)))
telegram
(partial report #(println "telegram: " (:type %) (:request-id %)))
codrain))
Expand All @@ -45,6 +48,8 @@
(partial report #(println "openai: " (:type %) (:request-id %)))
relational-assistance
(partial report #(println "relational-assistance: " (:type %) (:request-id %)))
text-extractor
(partial report #(println "text-extractor: " (:type %) (:request-id %)))
(partial telegram long-polling)
(partial report #(println "telegram: " (:type %) (:request-id %)))
codrain))

0 comments on commit 65f3cb6

Please sign in to comment.