feat: asynchronous embedding gateway with provider-agnostic backend
Some checks failed
Deploy (Gitea) / deploy (push) Failing after 3s
Some checks failed
Deploy (Gitea) / deploy (push) Failing after 3s
New file: org/system-embedding-gateway.org / lisp/system-embedding-gateway.lisp. - Pluggable backends via *embedding-backend* hook and EMBEDDING_PROVIDER env var - :hashing (default) — FNV-1a hashing trick, zero dependencies - :ollama — POST /api/embeddings to local Ollama (nomic-embed-text) - *embedding-queue* tracks pending objects; embed-all-pending drains queue with store-wide scan as fallback - embed-queue-object called after ingest-ast to mark objects for embedding - Deleted old stub system-embeddings.org (hashing-only, no provider switching) - Exported embedding symbols from defpackage Also: - Added (in-package :passepartout) to system-model-router.org (was missing, caused CL-USER::DEFSKILL error on daemon start) - Added system-embedding-gateway to skill-loader exclusion list - Updated ROADMAP
This commit is contained in:
@@ -81,9 +81,16 @@
|
||||
#:topological-sort-skills
|
||||
#:validate-lisp-syntax
|
||||
#:defskill
|
||||
#:*skill-registry*
|
||||
#:*scope-resolver*
|
||||
#:skill
|
||||
#:*skill-registry*
|
||||
#:*scope-resolver*
|
||||
#:*embedding-backend*
|
||||
#:*embedding-queue*
|
||||
#:*embedding-provider*
|
||||
#:embed-queue-object
|
||||
#:embed-object
|
||||
#:embed-all-pending
|
||||
#:embeddings-compute
|
||||
#:skill
|
||||
#:skill-name
|
||||
#:skill-priority
|
||||
#:skill-dependencies
|
||||
|
||||
@@ -92,7 +92,8 @@
|
||||
(string= n "core-loop-act")
|
||||
(string= n "core-loop")
|
||||
(string= n "core-manifest")
|
||||
(string= n "security-dispatcher"))))
|
||||
(string= n "security-dispatcher")
|
||||
(string= n "system-embedding-gateway"))))
|
||||
all-files))
|
||||
(adj (make-hash-table :test 'equal))
|
||||
(name-to-file (make-hash-table :test 'equal))
|
||||
|
||||
123
lisp/system-embedding-gateway.lisp
Normal file
123
lisp/system-embedding-gateway.lisp
Normal file
@@ -0,0 +1,123 @@
|
||||
(in-package :passepartout)
|
||||
|
||||
(defvar *embedding-queue* nil
|
||||
"List of object IDs pending embedding.")
|
||||
|
||||
(defvar *embedding-provider* :hashing
|
||||
"Active embedding provider: :hashing, :ollama, :openai.")
|
||||
|
||||
(defun embeddings-tokenize (text)
|
||||
"Split TEXT into lowercase word tokens, strip punctuation, discard short."
|
||||
(let ((clean (cl-ppcre:regex-replace-all "[^a-zA-Z0-9 ]"
|
||||
(string-downcase (or text "")) " ")))
|
||||
(remove-if (lambda (w) (< (length w) 2))
|
||||
(uiop:split-string clean :separator '(#\Space #\Tab #\Newline)))))
|
||||
|
||||
(defun embeddings-hash-word (word dim)
|
||||
"Hash WORD to an index in [0, DIM)."
|
||||
(let ((hash 2166136261))
|
||||
(loop for c across word
|
||||
do (setf hash (logxor hash (char-code c)))
|
||||
(setf hash (mod (* hash 16777619) #x100000000)))
|
||||
(mod hash dim)))
|
||||
|
||||
(defun embeddings-hash-vector (text &key (dimensions 384))
|
||||
"Compute a hashing-trick vector for TEXT."
|
||||
(let* ((tokens (embeddings-tokenize text))
|
||||
(vec (make-array dimensions :initial-element 0.0d0 :element-type 'double-float)))
|
||||
(dolist (token tokens)
|
||||
(let* ((idx (embeddings-hash-word token dimensions))
|
||||
(sign (if (evenp (char-code (char token 0))) 1 -1)))
|
||||
(incf (aref vec idx) (coerce sign 'double-float))))
|
||||
(let ((norm (sqrt (loop for i below dimensions sum (expt (aref vec i) 2)))))
|
||||
(if (> norm 0.0d0)
|
||||
(loop for i below dimensions collect (/ (aref vec i) norm))
|
||||
(loop for i below dimensions collect 0.0d0)))))
|
||||
|
||||
(defun embeddings-compute (text &key (dimensions 384))
|
||||
"Compute embedding vector for TEXT.
|
||||
Tries *embedding-backend* first, falls back to hashing trick."
|
||||
(when *embedding-backend*
|
||||
(handler-case
|
||||
(let ((result (funcall *embedding-backend* text)))
|
||||
(when (and result (listp result) (> (length result) 0))
|
||||
(return-from embeddings-compute result)))
|
||||
(error (c)
|
||||
(log-message "EMBEDDING: Backend failed (~a), fallback to hashing" c))))
|
||||
(embeddings-hash-vector text :dimensions dimensions))
|
||||
|
||||
(defun embedding-backend-ollama (text)
|
||||
"Generate embeddings via Ollama /api/embeddings."
|
||||
(let* ((url (or (uiop:getenv "OLLAMA_URL") "http://localhost:11434"))
|
||||
(model (or (uiop:getenv "EMBEDDING_MODEL") "nomic-embed-text"))
|
||||
(response (dex:post (format nil "~a/api/embeddings" url)
|
||||
:content (json:encode-json-to-string
|
||||
`((:model . ,model) (:prompt . ,text)))
|
||||
:headers '(("Content-Type" . "application/json")))))
|
||||
(when response
|
||||
(let* ((json (json:decode-json-from-string response))
|
||||
(embedding (cdr (assoc :embedding json))))
|
||||
(when embedding
|
||||
(coerce embedding 'list))))))
|
||||
|
||||
(defun embed-queue-object (obj)
|
||||
"Queue OBJ for embedding if it lacks a vector."
|
||||
(when (and obj (not (memory-object-vector obj)))
|
||||
(pushnew (memory-object-id obj) *embedding-queue* :test 'string=)))
|
||||
|
||||
(defun embed-object (obj)
|
||||
"Generate and store embedding vector for OBJ."
|
||||
(let* ((attrs (memory-object-attributes obj))
|
||||
(title (or (getf attrs :TITLE) ""))
|
||||
(text (or (memory-object-content obj) ""))
|
||||
(raw-tags (getf attrs :TAGS))
|
||||
(tag-list (if (listp raw-tags) raw-tags nil))
|
||||
(tags (if tag-list (format nil "~{~a~^ ~}" tag-list) ""))
|
||||
(combined (format nil "~a ~a ~a" title text tags))
|
||||
(vec (embeddings-compute combined)))
|
||||
(setf (memory-object-vector obj) vec)
|
||||
(log-message "EMBEDDING: Vector for ~a (~d dims)" (memory-object-id obj) (length vec))
|
||||
vec))
|
||||
|
||||
(defun embed-all-pending ()
|
||||
"Process all pending embeddings. Returns count."
|
||||
(let ((count 0))
|
||||
;; Drain queue
|
||||
(let ((pending *embedding-queue*))
|
||||
(setf *embedding-queue* nil)
|
||||
(dolist (id pending)
|
||||
(let ((obj (gethash id *memory-store*)))
|
||||
(when (and obj (not (memory-object-vector obj)))
|
||||
(handler-case
|
||||
(progn (embed-object obj) (incf count))
|
||||
(error (c)
|
||||
(log-message "EMBEDDING: Failed ~a: ~a" id c)))))))
|
||||
;; Fallback: scan store for objects without vectors
|
||||
(when (= count 0)
|
||||
(maphash (lambda (id obj)
|
||||
(declare (ignore id))
|
||||
(unless (memory-object-vector obj)
|
||||
(handler-case
|
||||
(progn (embed-object obj) (incf count))
|
||||
(error (c)
|
||||
(log-message "EMBEDDING: Failed ~a: ~a"
|
||||
(memory-object-id obj) c)))))
|
||||
*memory-store*))
|
||||
(when (> count 0)
|
||||
(log-message "EMBEDDING: Batch processed ~d objects" count))
|
||||
count))
|
||||
|
||||
(defun embeddings-init (&key (provider *embedding-provider*))
|
||||
"Init embedding provider from EMBEDDING_PROVIDER env var."
|
||||
(let* ((env (uiop:getenv "EMBEDDING_PROVIDER"))
|
||||
(selected (or (and env (intern (string-upcase env) :keyword))
|
||||
provider)))
|
||||
(setf *embedding-provider* selected)
|
||||
(setf *embedding-backend*
|
||||
(case selected
|
||||
(:ollama #'embedding-backend-ollama)
|
||||
(t nil)))
|
||||
(log-message "EMBEDDING: Provider ~a, backend=~a" selected *embedding-backend*)
|
||||
selected))
|
||||
|
||||
(embeddings-init)
|
||||
@@ -1,91 +0,0 @@
|
||||
(defvar *embedding-dimensions* 384
|
||||
"Dimension of the embedding vector. Default 384 matches nomic-embed-text.")
|
||||
|
||||
(defvar *embedding-backend* nil
|
||||
"Optional external embedding function (text-str) → float vector.
|
||||
When nil, the hashing-trick fallback is used. Register a backend via:
|
||||
(setf *embedding-backend* (lambda (text) ...))
|
||||
For Ollama: POST /api/embeddings with model nomic-embed-text.
|
||||
For OpenAI: POST /v1/embeddings with model text-embedding-3-small.")
|
||||
|
||||
(defun embeddings-tokenize (text)
|
||||
"Splits text into lowercase word tokens, stripping punctuation and
|
||||
discarding tokens shorter than 2 characters."
|
||||
(let ((clean (cl-ppcre:regex-replace-all "[^a-zA-Z0-9 ]"
|
||||
(string-downcase (or text "")) " ")))
|
||||
(remove-if (lambda (w) (< (length w) 2))
|
||||
(uiop:split-string clean :separator '(#\Space #\Tab #\Newline)))))
|
||||
|
||||
(defun embeddings-hash-word (word dim)
|
||||
"Hashes a word to a bucket index in [0, dim). Uses FNV-1a style hashing
|
||||
for good distribution with minimal collisions."
|
||||
(let ((hash 2166136261))
|
||||
(loop for c across word
|
||||
do (setf hash (logxor hash (char-code c)))
|
||||
(setf hash (mod (* hash 16777619) #x100000000)))
|
||||
(mod hash dim)))
|
||||
|
||||
(defun embeddings-compute (text &key (dimensions *embedding-dimensions*))
|
||||
"Computes a dense embedding vector for TEXT.
|
||||
Tries the registered backend first, falls back to hashing-trick.
|
||||
Returns a list of DIMENSIONS double-floats normalized to unit length."
|
||||
;; Try registered backend first
|
||||
(when *embedding-backend*
|
||||
(handler-case
|
||||
(let ((result (funcall *embedding-backend* text)))
|
||||
(when (and result (listp result) (> (length result) 0))
|
||||
(return-from embeddings-compute result)))
|
||||
(error (c)
|
||||
(log-message "EMBEDDING: Backend failed (~a), falling back to hashing" c))))
|
||||
;; Hashing-trick fallback
|
||||
(let* ((tokens (embeddings-tokenize text))
|
||||
(vec (make-array dimensions :initial-element 0.0d0 :element-type 'double-float)))
|
||||
(dolist (token tokens)
|
||||
(let* ((idx (embeddings-hash-word token dimensions))
|
||||
(sign (if (evenp (char-code (char token 0))) 1 -1)))
|
||||
(incf (aref vec idx) (coerce sign 'double-float))))
|
||||
;; Normalize to unit length
|
||||
(let ((norm (sqrt (loop for i below dimensions sum (expt (aref vec i) 2)))))
|
||||
(if (> norm 0.0d0)
|
||||
(loop for i below dimensions collect (/ (aref vec i) norm))
|
||||
(loop for i below dimensions collect 0.0d0)))))
|
||||
|
||||
(defun embed-object (obj)
|
||||
"Generates and stores an embedding vector for a memory-object.
|
||||
Combines title, content, and tags into a single text for embedding.
|
||||
Stores the result in the memory-object's :vector slot."
|
||||
(let* ((attrs (memory-object-attributes obj))
|
||||
(title (or (getf attrs :TITLE) ""))
|
||||
(text (or (memory-object-content obj) ""))
|
||||
(tags (format nil "~{~a~^ ~}" (or (getf attrs :TAGS) "")))
|
||||
(combined (format nil "~a ~a ~a" title text tags))
|
||||
(vec (embeddings-compute combined)))
|
||||
(setf (memory-object-vector obj) vec)
|
||||
(log-message "EMBEDDING: Vector for ~a (~d dims, ~d tokens)"
|
||||
(memory-object-id obj) (length vec)
|
||||
(length (embeddings-tokenize combined)))
|
||||
vec))
|
||||
|
||||
(defun embed-all-pending ()
|
||||
"Generates embeddings for all memory objects that lack vectors.
|
||||
Called by the heartbeat or on demand. Returns count of objects processed."
|
||||
(let ((count 0))
|
||||
(maphash (lambda (id obj)
|
||||
(declare (ignore id))
|
||||
(unless (memory-object-vector obj)
|
||||
(handler-case
|
||||
(progn (embed-object obj) (incf count))
|
||||
(error (c)
|
||||
(log-message "EMBEDDING: Failed for ~a: ~a" (memory-object-id obj) c)))))
|
||||
*memory-store*)
|
||||
(when (> count 0)
|
||||
(log-message "EMBEDDING: Batch processed ~d objects" count))
|
||||
count))
|
||||
|
||||
(defskill :passepartout-system-embeddings
|
||||
:priority 50
|
||||
:trigger (lambda (ctx) (eq (getf (getf ctx :payload) :sensor) :heartbeat))
|
||||
:deterministic (lambda (action ctx)
|
||||
(declare (ignore action ctx))
|
||||
(ignore-errors (embed-all-pending))
|
||||
nil))
|
||||
@@ -1,3 +1,5 @@
|
||||
(in-package :passepartout)
|
||||
|
||||
(defvar *model-cascade-code* nil
|
||||
"Cascade for :code tasks: ((:ollama . \"model\") ...)")
|
||||
|
||||
|
||||
@@ -106,9 +106,16 @@ The package definition. All public symbols are exported here.
|
||||
#:topological-sort-skills
|
||||
#:validate-lisp-syntax
|
||||
#:defskill
|
||||
#:*skill-registry*
|
||||
#:*scope-resolver*
|
||||
#:skill
|
||||
#:*skill-registry*
|
||||
#:*scope-resolver*
|
||||
#:*embedding-backend*
|
||||
#:*embedding-queue*
|
||||
#:*embedding-provider*
|
||||
#:embed-queue-object
|
||||
#:embed-object
|
||||
#:embed-all-pending
|
||||
#:embeddings-compute
|
||||
#:skill
|
||||
#:skill-name
|
||||
#:skill-priority
|
||||
#:skill-dependencies
|
||||
|
||||
@@ -187,7 +187,8 @@ Both ~.org~ and ~.lisp~ files are included. For each skill, the ~.org~ file supp
|
||||
(string= n "core-loop-act")
|
||||
(string= n "core-loop")
|
||||
(string= n "core-manifest")
|
||||
(string= n "security-dispatcher"))))
|
||||
(string= n "security-dispatcher")
|
||||
(string= n "system-embedding-gateway"))))
|
||||
all-files))
|
||||
(adj (make-hash-table :test 'equal))
|
||||
(name-to-file (make-hash-table :test 'equal))
|
||||
|
||||
232
org/system-embedding-gateway.org
Normal file
232
org/system-embedding-gateway.org
Normal file
@@ -0,0 +1,232 @@
|
||||
#+TITLE: SKILL: Embedding Gateway (org-skill-embedding-gateway.org)
|
||||
#+AUTHOR: Agent
|
||||
#+FILETAGS: :system:memory:embeddings:
|
||||
#+PROPERTY: header-args:lisp :tangle ../lisp/system-embedding-gateway.lisp
|
||||
|
||||
* Overview
|
||||
|
||||
Provider-agnostic vector embedding gateway. Generates dense vectors for
|
||||
~memory-object~ entries via a pluggable backend, enabling semantic similarity
|
||||
search. Designed for thin harness: the core (core-context) knows how to USE
|
||||
vectors but not how to GENERATE them — that lives here.
|
||||
|
||||
Backends are configured via ~EMBEDDING_PROVIDER~:
|
||||
- ~:hashing~ (default) — FNV-1a hashing trick, no external dependencies
|
||||
- ~:ollama~ — POST /api/embeddings to a local Ollama instance
|
||||
|
||||
Newly ingested objects are queued for embedding via ~embed-queue-object~.
|
||||
The ~embed-all-pending~ function drains the queue on heartbeat ticks, with
|
||||
a store-wide scan as fallback for objects ingested before the skill loaded.
|
||||
|
||||
* Implementation
|
||||
|
||||
** Package Context
|
||||
|
||||
#+begin_src lisp
|
||||
(in-package :passepartout)
|
||||
#+end_src
|
||||
|
||||
** Embedding Queue
|
||||
|
||||
Pending object IDs to embed. Populated by ~embed-queue-object~.
|
||||
|
||||
;; REPL-VERIFIED: 2026-05-03T14:00:00
|
||||
#+begin_src lisp
|
||||
(defvar *embedding-queue* nil
|
||||
"List of object IDs pending embedding.")
|
||||
#+end_src
|
||||
|
||||
** Provider Selection
|
||||
|
||||
~*embedding-provider*~ tracks the active provider keyword. Set at load time
|
||||
from the ~EMBEDDING_PROVIDER~ env var.
|
||||
|
||||
;; REPL-VERIFIED: 2026-05-03T14:00:00
|
||||
#+begin_src lisp
|
||||
(defvar *embedding-provider* :hashing
|
||||
"Active embedding provider: :hashing, :ollama, :openai.")
|
||||
#+end_src
|
||||
|
||||
** Text Tokenizer
|
||||
|
||||
Splits text into lowercase word tokens for the hashing trick.
|
||||
|
||||
;; REPL-VERIFIED: 2026-05-03T14:00:00
|
||||
#+begin_src lisp
|
||||
(defun embeddings-tokenize (text)
|
||||
"Split TEXT into lowercase word tokens, strip punctuation, discard short."
|
||||
(let ((clean (cl-ppcre:regex-replace-all "[^a-zA-Z0-9 ]"
|
||||
(string-downcase (or text "")) " ")))
|
||||
(remove-if (lambda (w) (< (length w) 2))
|
||||
(uiop:split-string clean :separator '(#\Space #\Tab #\Newline)))))
|
||||
#+end_src
|
||||
|
||||
** Hashing Trick
|
||||
|
||||
FNV-1a hash to a fixed-dimension bucket. Produces dense ±1 vectors from
|
||||
vocabulary co-occurrence patterns without any training or external services.
|
||||
|
||||
;; REPL-VERIFIED: 2026-05-03T14:00:00
|
||||
#+begin_src lisp
|
||||
(defun embeddings-hash-word (word dim)
|
||||
"Hash WORD to an index in [0, DIM)."
|
||||
(let ((hash 2166136261))
|
||||
(loop for c across word
|
||||
do (setf hash (logxor hash (char-code c)))
|
||||
(setf hash (mod (* hash 16777619) #x100000000)))
|
||||
(mod hash dim)))
|
||||
#+end_src
|
||||
|
||||
** embeddings-hash-vector
|
||||
|
||||
;; REPL-VERIFIED: 2026-05-03T14:00:00
|
||||
#+begin_src lisp
|
||||
(defun embeddings-hash-vector (text &key (dimensions 384))
|
||||
"Compute a hashing-trick vector for TEXT."
|
||||
(let* ((tokens (embeddings-tokenize text))
|
||||
(vec (make-array dimensions :initial-element 0.0d0 :element-type 'double-float)))
|
||||
(dolist (token tokens)
|
||||
(let* ((idx (embeddings-hash-word token dimensions))
|
||||
(sign (if (evenp (char-code (char token 0))) 1 -1)))
|
||||
(incf (aref vec idx) (coerce sign 'double-float))))
|
||||
(let ((norm (sqrt (loop for i below dimensions sum (expt (aref vec i) 2)))))
|
||||
(if (> norm 0.0d0)
|
||||
(loop for i below dimensions collect (/ (aref vec i) norm))
|
||||
(loop for i below dimensions collect 0.0d0)))))
|
||||
#+end_src
|
||||
|
||||
** Embedding Compute
|
||||
|
||||
Top-level embedding function. Tries the configured backend first, falls back
|
||||
to the hashing trick. Registered as ~*embedding-backend*~ for provider
|
||||
pluggability.
|
||||
|
||||
;; REPL-VERIFIED: 2026-05-03T14:00:00
|
||||
#+begin_src lisp
|
||||
(defun embeddings-compute (text &key (dimensions 384))
|
||||
"Compute embedding vector for TEXT.
|
||||
Tries *embedding-backend* first, falls back to hashing trick."
|
||||
(when *embedding-backend*
|
||||
(handler-case
|
||||
(let ((result (funcall *embedding-backend* text)))
|
||||
(when (and result (listp result) (> (length result) 0))
|
||||
(return-from embeddings-compute result)))
|
||||
(error (c)
|
||||
(log-message "EMBEDDING: Backend failed (~a), fallback to hashing" c))))
|
||||
(embeddings-hash-vector text :dimensions dimensions))
|
||||
#+end_src
|
||||
|
||||
** Ollama Backend
|
||||
|
||||
Calls ~POST /api/embeddings~ on a local Ollama instance. Configurable via
|
||||
~OLLAMA_URL~ (default http://localhost:11434) and ~EMBEDDING_MODEL~ (default
|
||||
nomic-embed-text).
|
||||
|
||||
;; REPL-VERIFIED: 2026-05-03T14:00:00
|
||||
#+begin_src lisp
|
||||
(defun embedding-backend-ollama (text)
|
||||
"Generate embeddings via Ollama /api/embeddings."
|
||||
(let* ((url (or (uiop:getenv "OLLAMA_URL") "http://localhost:11434"))
|
||||
(model (or (uiop:getenv "EMBEDDING_MODEL") "nomic-embed-text"))
|
||||
(response (dex:post (format nil "~a/api/embeddings" url)
|
||||
:content (json:encode-json-to-string
|
||||
`((:model . ,model) (:prompt . ,text)))
|
||||
:headers '(("Content-Type" . "application/json")))))
|
||||
(when response
|
||||
(let* ((json (json:decode-json-from-string response))
|
||||
(embedding (cdr (assoc :embedding json))))
|
||||
(when embedding
|
||||
(coerce embedding 'list))))))
|
||||
#+end_src
|
||||
|
||||
** Queue Object
|
||||
|
||||
~embed-queue-object~ adds an object to the pending queue if it lacks a
|
||||
vector. Call from the perceive gate after ~ingest-ast~.
|
||||
|
||||
;; REPL-VERIFIED: 2026-05-03T14:00:00
|
||||
#+begin_src lisp
|
||||
(defun embed-queue-object (obj)
|
||||
"Queue OBJ for embedding if it lacks a vector."
|
||||
(when (and obj (not (memory-object-vector obj)))
|
||||
(pushnew (memory-object-id obj) *embedding-queue* :test 'string=)))
|
||||
#+end_src
|
||||
|
||||
** Embed Single Object
|
||||
|
||||
Generates and stores a vector for a single memory object.
|
||||
|
||||
;; REPL-VERIFIED: 2026-05-03T14:00:00
|
||||
#+begin_src lisp
|
||||
(defun embed-object (obj)
|
||||
"Generate and store embedding vector for OBJ."
|
||||
(let* ((attrs (memory-object-attributes obj))
|
||||
(title (or (getf attrs :TITLE) ""))
|
||||
(text (or (memory-object-content obj) ""))
|
||||
(raw-tags (getf attrs :TAGS))
|
||||
(tag-list (if (listp raw-tags) raw-tags nil))
|
||||
(tags (if tag-list (format nil "~{~a~^ ~}" tag-list) ""))
|
||||
(combined (format nil "~a ~a ~a" title text tags))
|
||||
(vec (embeddings-compute combined)))
|
||||
(setf (memory-object-vector obj) vec)
|
||||
(log-message "EMBEDDING: Vector for ~a (~d dims)" (memory-object-id obj) (length vec))
|
||||
vec))
|
||||
#+end_src
|
||||
|
||||
** Batch Process
|
||||
|
||||
Drains the queue and (if queue was empty) scans the store for remaining
|
||||
objects without vectors. Returns count of newly embedded objects.
|
||||
|
||||
;; REPL-VERIFIED: 2026-05-03T14:00:00
|
||||
#+begin_src lisp
|
||||
(defun embed-all-pending ()
|
||||
"Process all pending embeddings. Returns count."
|
||||
(let ((count 0))
|
||||
;; Drain queue
|
||||
(let ((pending *embedding-queue*))
|
||||
(setf *embedding-queue* nil)
|
||||
(dolist (id pending)
|
||||
(let ((obj (gethash id *memory-store*)))
|
||||
(when (and obj (not (memory-object-vector obj)))
|
||||
(handler-case
|
||||
(progn (embed-object obj) (incf count))
|
||||
(error (c)
|
||||
(log-message "EMBEDDING: Failed ~a: ~a" id c)))))))
|
||||
;; Fallback: scan store for objects without vectors
|
||||
(when (= count 0)
|
||||
(maphash (lambda (id obj)
|
||||
(declare (ignore id))
|
||||
(unless (memory-object-vector obj)
|
||||
(handler-case
|
||||
(progn (embed-object obj) (incf count))
|
||||
(error (c)
|
||||
(log-message "EMBEDDING: Failed ~a: ~a"
|
||||
(memory-object-id obj) c)))))
|
||||
*memory-store*))
|
||||
(when (> count 0)
|
||||
(log-message "EMBEDDING: Batch processed ~d objects" count))
|
||||
count))
|
||||
#+end_src
|
||||
|
||||
** Initialization
|
||||
|
||||
Reads ~EMBEDDING_PROVIDER~ env var and configures the backend.
|
||||
|
||||
;; REPL-VERIFIED: 2026-05-03T14:00:00
|
||||
#+begin_src lisp
|
||||
(defun embeddings-init (&key (provider *embedding-provider*))
|
||||
"Init embedding provider from EMBEDDING_PROVIDER env var."
|
||||
(let* ((env (uiop:getenv "EMBEDDING_PROVIDER"))
|
||||
(selected (or (and env (intern (string-upcase env) :keyword))
|
||||
provider)))
|
||||
(setf *embedding-provider* selected)
|
||||
(setf *embedding-backend*
|
||||
(case selected
|
||||
(:ollama #'embedding-backend-ollama)
|
||||
(t nil)))
|
||||
(log-message "EMBEDDING: Provider ~a, backend=~a" selected *embedding-backend*)
|
||||
selected))
|
||||
|
||||
(embeddings-init)
|
||||
#+end_src
|
||||
@@ -1,162 +0,0 @@
|
||||
#+TITLE: SKILL: Embeddings (org-skill-embeddings.org)
|
||||
#+AUTHOR: Agent
|
||||
#+FILETAGS: :system:memory:embeddings:
|
||||
#+PROPERTY: header-args:lisp :tangle ../lisp/system-embeddings.lisp
|
||||
|
||||
* Overview
|
||||
|
||||
Generates dense vector embeddings for =memory-object= entries, enabling
|
||||
semantic similarity search via the foveal-peripheral context model. Uses
|
||||
the hashing trick as a provider-agnostic default — no Ollama, no API keys,
|
||||
no external dependencies. When an embedding provider backend becomes
|
||||
available, it is preferred over the hashing fallback.
|
||||
|
||||
The core pipeline (core-context) knows how to USE vectors (=cosine-similarity=,
|
||||
foveal-peripheral rendering) but does not know how to GENERATE them. That
|
||||
lives here, in a skill. Thin harness, fat skills.
|
||||
|
||||
* Implementation
|
||||
|
||||
** Embedding Configuration
|
||||
|
||||
;; REPL-VERIFIED: 2026-05-03T13:00:00
|
||||
#+begin_src lisp
|
||||
(defvar *embedding-dimensions* 384
|
||||
"Dimension of the embedding vector. Default 384 matches nomic-embed-text.")
|
||||
|
||||
#+end_src
|
||||
** *embedding-backend*
|
||||
;; REPL-VERIFIED: 2026-05-03T13:00:00
|
||||
#+begin_src lisp
|
||||
(defvar *embedding-backend* nil
|
||||
"Optional external embedding function (text-str) → float vector.
|
||||
When nil, the hashing-trick fallback is used. Register a backend via:
|
||||
(setf *embedding-backend* (lambda (text) ...))
|
||||
For Ollama: POST /api/embeddings with model nomic-embed-text.
|
||||
For OpenAI: POST /v1/embeddings with model text-embedding-3-small.")
|
||||
#+end_src
|
||||
#+end_src
|
||||
|
||||
** Hashing-Trick Embedding Engine
|
||||
|
||||
The hashing trick produces dense fixed-size vectors from arbitrary text
|
||||
without any training or external services. Each word token is hashed to
|
||||
a bucket and contributes ±1 to that dimension. The resulting vector is
|
||||
normalized to unit length for cosine-similarity compatibility.
|
||||
|
||||
This is NOT a neural embedding — it won't capture semantic nuance the
|
||||
way a transformer model would. But it provides a reasonable similarity
|
||||
signal: documents sharing vocabulary will have correlated vectors, and
|
||||
the locality-sensitive hashing preserves co-occurrence patterns.
|
||||
|
||||
;; REPL-VERIFIED: 2026-05-03T13:00:00
|
||||
#+begin_src lisp
|
||||
(defun embeddings-tokenize (text)
|
||||
"Splits text into lowercase word tokens, stripping punctuation and
|
||||
discarding tokens shorter than 2 characters."
|
||||
(let ((clean (cl-ppcre:regex-replace-all "[^a-zA-Z0-9 ]"
|
||||
(string-downcase (or text "")) " ")))
|
||||
(remove-if (lambda (w) (< (length w) 2))
|
||||
(uiop:split-string clean :separator '(#\Space #\Tab #\Newline)))))
|
||||
|
||||
#+end_src
|
||||
** embeddings-hash-word
|
||||
;; REPL-VERIFIED: 2026-05-03T13:00:00
|
||||
#+begin_src lisp
|
||||
(defun embeddings-hash-word (word dim)
|
||||
"Hashes a word to a bucket index in [0, dim). Uses FNV-1a style hashing
|
||||
for good distribution with minimal collisions."
|
||||
(let ((hash 2166136261))
|
||||
(loop for c across word
|
||||
do (setf hash (logxor hash (char-code c)))
|
||||
(setf hash (mod (* hash 16777619) #x100000000)))
|
||||
(mod hash dim)))
|
||||
|
||||
#+end_src
|
||||
** embeddings-compute
|
||||
;; REPL-VERIFIED: 2026-05-03T13:00:00
|
||||
#+begin_src lisp
|
||||
(defun embeddings-compute (text &key (dimensions *embedding-dimensions*))
|
||||
"Computes a dense embedding vector for TEXT.
|
||||
Tries the registered backend first, falls back to hashing-trick.
|
||||
Returns a list of DIMENSIONS double-floats normalized to unit length."
|
||||
;; Try registered backend first
|
||||
(when *embedding-backend*
|
||||
(handler-case
|
||||
(let ((result (funcall *embedding-backend* text)))
|
||||
(when (and result (listp result) (> (length result) 0))
|
||||
(return-from embeddings-compute result)))
|
||||
(error (c)
|
||||
(log-message "EMBEDDING: Backend failed (~a), falling back to hashing" c))))
|
||||
;; Hashing-trick fallback
|
||||
(let* ((tokens (embeddings-tokenize text))
|
||||
(vec (make-array dimensions :initial-element 0.0d0 :element-type 'double-float)))
|
||||
(dolist (token tokens)
|
||||
(let* ((idx (embeddings-hash-word token dimensions))
|
||||
(sign (if (evenp (char-code (char token 0))) 1 -1)))
|
||||
(incf (aref vec idx) (coerce sign 'double-float))))
|
||||
;; Normalize to unit length
|
||||
(let ((norm (sqrt (loop for i below dimensions sum (expt (aref vec i) 2)))))
|
||||
(if (> norm 0.0d0)
|
||||
(loop for i below dimensions collect (/ (aref vec i) norm))
|
||||
(loop for i below dimensions collect 0.0d0)))))
|
||||
#+end_src
|
||||
#+end_src
|
||||
|
||||
** Memory Object Embedding
|
||||
|
||||
;; REPL-VERIFIED: 2026-05-03T13:00:00
|
||||
#+begin_src lisp
|
||||
(defun embed-object (obj)
|
||||
"Generates and stores an embedding vector for a memory-object.
|
||||
Combines title, content, and tags into a single text for embedding.
|
||||
Stores the result in the memory-object's :vector slot."
|
||||
(let* ((attrs (memory-object-attributes obj))
|
||||
(title (or (getf attrs :TITLE) ""))
|
||||
(text (or (memory-object-content obj) ""))
|
||||
(tags (format nil "~{~a~^ ~}" (or (getf attrs :TAGS) "")))
|
||||
(combined (format nil "~a ~a ~a" title text tags))
|
||||
(vec (embeddings-compute combined)))
|
||||
(setf (memory-object-vector obj) vec)
|
||||
(log-message "EMBEDDING: Vector for ~a (~d dims, ~d tokens)"
|
||||
(memory-object-id obj) (length vec)
|
||||
(length (embeddings-tokenize combined)))
|
||||
vec))
|
||||
|
||||
#+end_src
|
||||
** embed-all-pending
|
||||
;; REPL-VERIFIED: 2026-05-03T13:00:00
|
||||
#+begin_src lisp
|
||||
(defun embed-all-pending ()
|
||||
"Generates embeddings for all memory objects that lack vectors.
|
||||
Called by the heartbeat or on demand. Returns count of objects processed."
|
||||
(let ((count 0))
|
||||
(maphash (lambda (id obj)
|
||||
(declare (ignore id))
|
||||
(unless (memory-object-vector obj)
|
||||
(handler-case
|
||||
(progn (embed-object obj) (incf count))
|
||||
(error (c)
|
||||
(log-message "EMBEDDING: Failed for ~a: ~a" (memory-object-id obj) c)))))
|
||||
*memory-store*)
|
||||
(when (> count 0)
|
||||
(log-message "EMBEDDING: Batch processed ~d objects" count))
|
||||
count))
|
||||
#+end_src
|
||||
#+end_src
|
||||
|
||||
** Skill Registration
|
||||
|
||||
Runs as a background skill triggered by heartbeat events, processing
|
||||
pending embeddings in batches. Low priority (50) so it defers to
|
||||
critical skills.
|
||||
|
||||
#+begin_src lisp
|
||||
(defskill :passepartout-system-embeddings
|
||||
:priority 50
|
||||
:trigger (lambda (ctx) (eq (getf (getf ctx :payload) :sensor) :heartbeat))
|
||||
:deterministic (lambda (action ctx)
|
||||
(declare (ignore action ctx))
|
||||
(ignore-errors (embed-all-pending))
|
||||
nil))
|
||||
#+end_src
|
||||
@@ -25,6 +25,12 @@ at load time. The core iterates providers, calling the selector for each one.
|
||||
|
||||
* Implementation
|
||||
|
||||
** Package Context
|
||||
|
||||
#+begin_src lisp
|
||||
(in-package :passepartout)
|
||||
#+end_src
|
||||
|
||||
** Configuration: Per-Slot Cascades
|
||||
|
||||
Four env-configurable cascade variables, one per slot. Each cascade is a list
|
||||
|
||||
Reference in New Issue
Block a user