Some checks failed
Deploy (Gitea) / deploy (push) Failing after 3s
New file: org/system-embedding-gateway.org / lisp/system-embedding-gateway.lisp. - Pluggable backends via *embedding-backend* hook and EMBEDDING_PROVIDER env var - :hashing (default) — FNV-1a hashing trick, zero dependencies - :ollama — POST /api/embeddings to local Ollama (nomic-embed-text) - *embedding-queue* tracks pending objects; embed-all-pending drains queue with store-wide scan as fallback - embed-queue-object called after ingest-ast to mark objects for embedding - Deleted old stub system-embeddings.org (hashing-only, no provider switching) - Exported embedding symbols from defpackage Also: - Added (in-package :passepartout) to system-model-router.org (was missing, caused CL-USER::DEFSKILL error on daemon start) - Added system-embedding-gateway to skill-loader exclusion list - Updated ROADMAP
124 lines
5.1 KiB
Common Lisp
124 lines
5.1 KiB
Common Lisp
(in-package :passepartout)
|
|
|
|
(defvar *embedding-queue* nil
|
|
"List of object IDs pending embedding.")
|
|
|
|
(defvar *embedding-provider* :hashing
|
|
"Active embedding provider: :hashing, :ollama, :openai.")
|
|
|
|
(defun embeddings-tokenize (text)
|
|
"Split TEXT into lowercase word tokens, strip punctuation, discard short."
|
|
(let ((clean (cl-ppcre:regex-replace-all "[^a-zA-Z0-9 ]"
|
|
(string-downcase (or text "")) " ")))
|
|
(remove-if (lambda (w) (< (length w) 2))
|
|
(uiop:split-string clean :separator '(#\Space #\Tab #\Newline)))))
|
|
|
|
(defun embeddings-hash-word (word dim)
|
|
"Hash WORD to an index in [0, DIM)."
|
|
(let ((hash 2166136261))
|
|
(loop for c across word
|
|
do (setf hash (logxor hash (char-code c)))
|
|
(setf hash (mod (* hash 16777619) #x100000000)))
|
|
(mod hash dim)))
|
|
|
|
(defun embeddings-hash-vector (text &key (dimensions 384))
|
|
"Compute a hashing-trick vector for TEXT."
|
|
(let* ((tokens (embeddings-tokenize text))
|
|
(vec (make-array dimensions :initial-element 0.0d0 :element-type 'double-float)))
|
|
(dolist (token tokens)
|
|
(let* ((idx (embeddings-hash-word token dimensions))
|
|
(sign (if (evenp (char-code (char token 0))) 1 -1)))
|
|
(incf (aref vec idx) (coerce sign 'double-float))))
|
|
(let ((norm (sqrt (loop for i below dimensions sum (expt (aref vec i) 2)))))
|
|
(if (> norm 0.0d0)
|
|
(loop for i below dimensions collect (/ (aref vec i) norm))
|
|
(loop for i below dimensions collect 0.0d0)))))
|
|
|
|
(defun embeddings-compute (text &key (dimensions 384))
|
|
"Compute embedding vector for TEXT.
|
|
Tries *embedding-backend* first, falls back to hashing trick."
|
|
(when *embedding-backend*
|
|
(handler-case
|
|
(let ((result (funcall *embedding-backend* text)))
|
|
(when (and result (listp result) (> (length result) 0))
|
|
(return-from embeddings-compute result)))
|
|
(error (c)
|
|
(log-message "EMBEDDING: Backend failed (~a), fallback to hashing" c))))
|
|
(embeddings-hash-vector text :dimensions dimensions))
|
|
|
|
(defun embedding-backend-ollama (text)
|
|
"Generate embeddings via Ollama /api/embeddings."
|
|
(let* ((url (or (uiop:getenv "OLLAMA_URL") "http://localhost:11434"))
|
|
(model (or (uiop:getenv "EMBEDDING_MODEL") "nomic-embed-text"))
|
|
(response (dex:post (format nil "~a/api/embeddings" url)
|
|
:content (json:encode-json-to-string
|
|
`((:model . ,model) (:prompt . ,text)))
|
|
:headers '(("Content-Type" . "application/json")))))
|
|
(when response
|
|
(let* ((json (json:decode-json-from-string response))
|
|
(embedding (cdr (assoc :embedding json))))
|
|
(when embedding
|
|
(coerce embedding 'list))))))
|
|
|
|
(defun embed-queue-object (obj)
|
|
"Queue OBJ for embedding if it lacks a vector."
|
|
(when (and obj (not (memory-object-vector obj)))
|
|
(pushnew (memory-object-id obj) *embedding-queue* :test 'string=)))
|
|
|
|
(defun embed-object (obj)
|
|
"Generate and store embedding vector for OBJ."
|
|
(let* ((attrs (memory-object-attributes obj))
|
|
(title (or (getf attrs :TITLE) ""))
|
|
(text (or (memory-object-content obj) ""))
|
|
(raw-tags (getf attrs :TAGS))
|
|
(tag-list (if (listp raw-tags) raw-tags nil))
|
|
(tags (if tag-list (format nil "~{~a~^ ~}" tag-list) ""))
|
|
(combined (format nil "~a ~a ~a" title text tags))
|
|
(vec (embeddings-compute combined)))
|
|
(setf (memory-object-vector obj) vec)
|
|
(log-message "EMBEDDING: Vector for ~a (~d dims)" (memory-object-id obj) (length vec))
|
|
vec))
|
|
|
|
(defun embed-all-pending ()
|
|
"Process all pending embeddings. Returns count."
|
|
(let ((count 0))
|
|
;; Drain queue
|
|
(let ((pending *embedding-queue*))
|
|
(setf *embedding-queue* nil)
|
|
(dolist (id pending)
|
|
(let ((obj (gethash id *memory-store*)))
|
|
(when (and obj (not (memory-object-vector obj)))
|
|
(handler-case
|
|
(progn (embed-object obj) (incf count))
|
|
(error (c)
|
|
(log-message "EMBEDDING: Failed ~a: ~a" id c)))))))
|
|
;; Fallback: scan store for objects without vectors
|
|
(when (= count 0)
|
|
(maphash (lambda (id obj)
|
|
(declare (ignore id))
|
|
(unless (memory-object-vector obj)
|
|
(handler-case
|
|
(progn (embed-object obj) (incf count))
|
|
(error (c)
|
|
(log-message "EMBEDDING: Failed ~a: ~a"
|
|
(memory-object-id obj) c)))))
|
|
*memory-store*))
|
|
(when (> count 0)
|
|
(log-message "EMBEDDING: Batch processed ~d objects" count))
|
|
count))
|
|
|
|
(defun embeddings-init (&key (provider *embedding-provider*))
|
|
"Init embedding provider from EMBEDDING_PROVIDER env var."
|
|
(let* ((env (uiop:getenv "EMBEDDING_PROVIDER"))
|
|
(selected (or (and env (intern (string-upcase env) :keyword))
|
|
provider)))
|
|
(setf *embedding-provider* selected)
|
|
(setf *embedding-backend*
|
|
(case selected
|
|
(:ollama #'embedding-backend-ollama)
|
|
(t nil)))
|
|
(log-message "EMBEDDING: Provider ~a, backend=~a" selected *embedding-backend*)
|
|
selected))
|
|
|
|
(embeddings-init)
|