feat: asynchronous embedding gateway with provider-agnostic backend
Some checks failed
Deploy (Gitea) / deploy (push) Failing after 3s

New file: org/system-embedding-gateway.org / lisp/system-embedding-gateway.lisp.

- Pluggable backends via *embedding-backend* hook and EMBEDDING_PROVIDER env var
- :hashing (default) — FNV-1a hashing trick, zero dependencies
- :ollama — POST /api/embeddings to local Ollama (nomic-embed-text)
- *embedding-queue* tracks pending objects; embed-all-pending drains queue
  with store-wide scan as fallback
- embed-queue-object called after ingest-ast to mark objects for embedding
- Deleted old stub system-embeddings.org (hashing-only, no provider switching)
- Exported embedding symbols from defpackage

Also:
- Added (in-package :passepartout) to system-model-router.org (was missing,
  caused CL-USER::DEFSKILL error on daemon start)
- Added system-embedding-gateway to skill-loader exclusion list
- Updated ROADMAP
This commit is contained in:
2026-05-03 19:54:34 -04:00
parent b4150a9771
commit 9799b9db74
10 changed files with 387 additions and 261 deletions

View File

@@ -81,9 +81,16 @@
#:topological-sort-skills
#:validate-lisp-syntax
#:defskill
#:*skill-registry*
#:*scope-resolver*
#:skill
#:*skill-registry*
#:*scope-resolver*
#:*embedding-backend*
#:*embedding-queue*
#:*embedding-provider*
#:embed-queue-object
#:embed-object
#:embed-all-pending
#:embeddings-compute
#:skill
#:skill-name
#:skill-priority
#:skill-dependencies

View File

@@ -92,7 +92,8 @@
(string= n "core-loop-act")
(string= n "core-loop")
(string= n "core-manifest")
(string= n "security-dispatcher"))))
(string= n "security-dispatcher")
(string= n "system-embedding-gateway"))))
all-files))
(adj (make-hash-table :test 'equal))
(name-to-file (make-hash-table :test 'equal))

View File

@@ -0,0 +1,123 @@
(in-package :passepartout)
(defvar *embedding-queue* nil
"List of object IDs pending embedding.")
(defvar *embedding-provider* :hashing
"Active embedding provider: :hashing, :ollama, :openai.")
(defun embeddings-tokenize (text)
"Split TEXT into lowercase word tokens, strip punctuation, discard short."
(let ((clean (cl-ppcre:regex-replace-all "[^a-zA-Z0-9 ]"
(string-downcase (or text "")) " ")))
(remove-if (lambda (w) (< (length w) 2))
(uiop:split-string clean :separator '(#\Space #\Tab #\Newline)))))
(defun embeddings-hash-word (word dim)
"Hash WORD to an index in [0, DIM)."
(let ((hash 2166136261))
(loop for c across word
do (setf hash (logxor hash (char-code c)))
(setf hash (mod (* hash 16777619) #x100000000)))
(mod hash dim)))
(defun embeddings-hash-vector (text &key (dimensions 384))
"Compute a hashing-trick vector for TEXT."
(let* ((tokens (embeddings-tokenize text))
(vec (make-array dimensions :initial-element 0.0d0 :element-type 'double-float)))
(dolist (token tokens)
(let* ((idx (embeddings-hash-word token dimensions))
(sign (if (evenp (char-code (char token 0))) 1 -1)))
(incf (aref vec idx) (coerce sign 'double-float))))
(let ((norm (sqrt (loop for i below dimensions sum (expt (aref vec i) 2)))))
(if (> norm 0.0d0)
(loop for i below dimensions collect (/ (aref vec i) norm))
(loop for i below dimensions collect 0.0d0)))))
(defun embeddings-compute (text &key (dimensions 384))
"Compute embedding vector for TEXT.
Tries *embedding-backend* first, falls back to hashing trick."
(when *embedding-backend*
(handler-case
(let ((result (funcall *embedding-backend* text)))
(when (and result (listp result) (> (length result) 0))
(return-from embeddings-compute result)))
(error (c)
(log-message "EMBEDDING: Backend failed (~a), fallback to hashing" c))))
(embeddings-hash-vector text :dimensions dimensions))
(defun embedding-backend-ollama (text)
"Generate embeddings via Ollama /api/embeddings."
(let* ((url (or (uiop:getenv "OLLAMA_URL") "http://localhost:11434"))
(model (or (uiop:getenv "EMBEDDING_MODEL") "nomic-embed-text"))
(response (dex:post (format nil "~a/api/embeddings" url)
:content (json:encode-json-to-string
`((:model . ,model) (:prompt . ,text)))
:headers '(("Content-Type" . "application/json")))))
(when response
(let* ((json (json:decode-json-from-string response))
(embedding (cdr (assoc :embedding json))))
(when embedding
(coerce embedding 'list))))))
(defun embed-queue-object (obj)
"Queue OBJ for embedding if it lacks a vector."
(when (and obj (not (memory-object-vector obj)))
(pushnew (memory-object-id obj) *embedding-queue* :test 'string=)))
(defun embed-object (obj)
"Generate and store embedding vector for OBJ."
(let* ((attrs (memory-object-attributes obj))
(title (or (getf attrs :TITLE) ""))
(text (or (memory-object-content obj) ""))
(raw-tags (getf attrs :TAGS))
(tag-list (if (listp raw-tags) raw-tags nil))
(tags (if tag-list (format nil "~{~a~^ ~}" tag-list) ""))
(combined (format nil "~a ~a ~a" title text tags))
(vec (embeddings-compute combined)))
(setf (memory-object-vector obj) vec)
(log-message "EMBEDDING: Vector for ~a (~d dims)" (memory-object-id obj) (length vec))
vec))
(defun embed-all-pending ()
"Process all pending embeddings. Returns count."
(let ((count 0))
;; Drain queue
(let ((pending *embedding-queue*))
(setf *embedding-queue* nil)
(dolist (id pending)
(let ((obj (gethash id *memory-store*)))
(when (and obj (not (memory-object-vector obj)))
(handler-case
(progn (embed-object obj) (incf count))
(error (c)
(log-message "EMBEDDING: Failed ~a: ~a" id c)))))))
;; Fallback: scan store for objects without vectors
(when (= count 0)
(maphash (lambda (id obj)
(declare (ignore id))
(unless (memory-object-vector obj)
(handler-case
(progn (embed-object obj) (incf count))
(error (c)
(log-message "EMBEDDING: Failed ~a: ~a"
(memory-object-id obj) c)))))
*memory-store*))
(when (> count 0)
(log-message "EMBEDDING: Batch processed ~d objects" count))
count))
(defun embeddings-init (&key (provider *embedding-provider*))
"Init embedding provider from EMBEDDING_PROVIDER env var."
(let* ((env (uiop:getenv "EMBEDDING_PROVIDER"))
(selected (or (and env (intern (string-upcase env) :keyword))
provider)))
(setf *embedding-provider* selected)
(setf *embedding-backend*
(case selected
(:ollama #'embedding-backend-ollama)
(t nil)))
(log-message "EMBEDDING: Provider ~a, backend=~a" selected *embedding-backend*)
selected))
(embeddings-init)

View File

@@ -1,91 +0,0 @@
(defvar *embedding-dimensions* 384
"Dimension of the embedding vector. Default 384 matches nomic-embed-text.")
(defvar *embedding-backend* nil
"Optional external embedding function (text-str) → float vector.
When nil, the hashing-trick fallback is used. Register a backend via:
(setf *embedding-backend* (lambda (text) ...))
For Ollama: POST /api/embeddings with model nomic-embed-text.
For OpenAI: POST /v1/embeddings with model text-embedding-3-small.")
(defun embeddings-tokenize (text)
"Splits text into lowercase word tokens, stripping punctuation and
discarding tokens shorter than 2 characters."
(let ((clean (cl-ppcre:regex-replace-all "[^a-zA-Z0-9 ]"
(string-downcase (or text "")) " ")))
(remove-if (lambda (w) (< (length w) 2))
(uiop:split-string clean :separator '(#\Space #\Tab #\Newline)))))
(defun embeddings-hash-word (word dim)
"Hashes a word to a bucket index in [0, dim). Uses FNV-1a style hashing
for good distribution with minimal collisions."
(let ((hash 2166136261))
(loop for c across word
do (setf hash (logxor hash (char-code c)))
(setf hash (mod (* hash 16777619) #x100000000)))
(mod hash dim)))
(defun embeddings-compute (text &key (dimensions *embedding-dimensions*))
"Computes a dense embedding vector for TEXT.
Tries the registered backend first, falls back to hashing-trick.
Returns a list of DIMENSIONS double-floats normalized to unit length."
;; Try registered backend first
(when *embedding-backend*
(handler-case
(let ((result (funcall *embedding-backend* text)))
(when (and result (listp result) (> (length result) 0))
(return-from embeddings-compute result)))
(error (c)
(log-message "EMBEDDING: Backend failed (~a), falling back to hashing" c))))
;; Hashing-trick fallback
(let* ((tokens (embeddings-tokenize text))
(vec (make-array dimensions :initial-element 0.0d0 :element-type 'double-float)))
(dolist (token tokens)
(let* ((idx (embeddings-hash-word token dimensions))
(sign (if (evenp (char-code (char token 0))) 1 -1)))
(incf (aref vec idx) (coerce sign 'double-float))))
;; Normalize to unit length
(let ((norm (sqrt (loop for i below dimensions sum (expt (aref vec i) 2)))))
(if (> norm 0.0d0)
(loop for i below dimensions collect (/ (aref vec i) norm))
(loop for i below dimensions collect 0.0d0)))))
(defun embed-object (obj)
"Generates and stores an embedding vector for a memory-object.
Combines title, content, and tags into a single text for embedding.
Stores the result in the memory-object's :vector slot."
(let* ((attrs (memory-object-attributes obj))
(title (or (getf attrs :TITLE) ""))
(text (or (memory-object-content obj) ""))
(tags (format nil "~{~a~^ ~}" (or (getf attrs :TAGS) "")))
(combined (format nil "~a ~a ~a" title text tags))
(vec (embeddings-compute combined)))
(setf (memory-object-vector obj) vec)
(log-message "EMBEDDING: Vector for ~a (~d dims, ~d tokens)"
(memory-object-id obj) (length vec)
(length (embeddings-tokenize combined)))
vec))
(defun embed-all-pending ()
"Generates embeddings for all memory objects that lack vectors.
Called by the heartbeat or on demand. Returns count of objects processed."
(let ((count 0))
(maphash (lambda (id obj)
(declare (ignore id))
(unless (memory-object-vector obj)
(handler-case
(progn (embed-object obj) (incf count))
(error (c)
(log-message "EMBEDDING: Failed for ~a: ~a" (memory-object-id obj) c)))))
*memory-store*)
(when (> count 0)
(log-message "EMBEDDING: Batch processed ~d objects" count))
count))
(defskill :passepartout-system-embeddings
:priority 50
:trigger (lambda (ctx) (eq (getf (getf ctx :payload) :sensor) :heartbeat))
:deterministic (lambda (action ctx)
(declare (ignore action ctx))
(ignore-errors (embed-all-pending))
nil))

View File

@@ -1,3 +1,5 @@
(in-package :passepartout)
(defvar *model-cascade-code* nil
"Cascade for :code tasks: ((:ollama . \"model\") ...)")