#+TITLE: SKILL: Embedding Gateway (org-skill-embedding-gateway.org) #+AUTHOR: Agent #+FILETAGS: :system:memory:embeddings: #+PROPERTY: header-args:lisp :tangle ../lisp/system-embedding-gateway.lisp * Overview Provider-agnostic vector embedding gateway. Generates dense vectors for ~memory-object~ entries via a pluggable backend, enabling semantic similarity search. Designed for thin harness: the core (core-context) knows how to USE vectors but not how to GENERATE them — that lives here. Backends are configured via ~EMBEDDING_PROVIDER~: - ~:hashing~ (default) — FNV-1a hashing trick, no external dependencies - ~:ollama~ — POST /api/embeddings to a local Ollama instance Newly ingested objects are queued for embedding via ~embed-queue-object~. The ~embed-all-pending~ function drains the queue on heartbeat ticks, with a store-wide scan as fallback for objects ingested before the skill loaded. * Implementation ** Package Context #+begin_src lisp (in-package :passepartout) #+end_src ** Embedding Queue Pending object IDs to embed. Populated by ~embed-queue-object~. ;; REPL-VERIFIED: 2026-05-03T14:00:00 #+begin_src lisp (defvar *embedding-queue* nil "List of object IDs pending embedding.") #+end_src ** Provider Selection ~*embedding-provider*~ tracks the active provider keyword. Set at load time from the ~EMBEDDING_PROVIDER~ env var. ;; REPL-VERIFIED: 2026-05-03T14:00:00 #+begin_src lisp (defvar *embedding-provider* :hashing "Active embedding provider: :hashing, :ollama, :openai.") #+end_src ** Text Tokenizer Splits text into lowercase word tokens for the hashing trick. ;; REPL-VERIFIED: 2026-05-03T14:00:00 #+begin_src lisp (defun embeddings-tokenize (text) "Split TEXT into lowercase word tokens, strip punctuation, discard short." (let ((clean (cl-ppcre:regex-replace-all "[^a-zA-Z0-9 ]" (string-downcase (or text "")) " "))) (remove-if (lambda (w) (< (length w) 2)) (uiop:split-string clean :separator '(#\Space #\Tab #\Newline))))) #+end_src ** Hashing Trick FNV-1a hash to a fixed-dimension bucket. Produces dense ±1 vectors from vocabulary co-occurrence patterns without any training or external services. ;; REPL-VERIFIED: 2026-05-03T14:00:00 #+begin_src lisp (defun embeddings-hash-word (word dim) "Hash WORD to an index in [0, DIM)." (let ((hash 2166136261)) (loop for c across word do (setf hash (logxor hash (char-code c))) (setf hash (mod (* hash 16777619) #x100000000))) (mod hash dim))) #+end_src ** embeddings-hash-vector ;; REPL-VERIFIED: 2026-05-03T14:00:00 #+begin_src lisp (defun embeddings-hash-vector (text &key (dimensions 384)) "Compute a hashing-trick vector for TEXT." (let* ((tokens (embeddings-tokenize text)) (vec (make-array dimensions :initial-element 0.0d0 :element-type 'double-float))) (dolist (token tokens) (let* ((idx (embeddings-hash-word token dimensions)) (sign (if (evenp (char-code (char token 0))) 1 -1))) (incf (aref vec idx) (coerce sign 'double-float)))) (let ((norm (sqrt (loop for i below dimensions sum (expt (aref vec i) 2))))) (if (> norm 0.0d0) (loop for i below dimensions collect (/ (aref vec i) norm)) (loop for i below dimensions collect 0.0d0))))) #+end_src ** Embedding Compute Top-level embedding function. Tries the configured backend first, falls back to the hashing trick. Registered as ~*embedding-backend*~ for provider pluggability. ;; REPL-VERIFIED: 2026-05-03T14:00:00 #+begin_src lisp (defun embeddings-compute (text &key (dimensions 384)) "Compute embedding vector for TEXT. Tries *embedding-backend* first, falls back to hashing trick." (when *embedding-backend* (handler-case (let ((result (funcall *embedding-backend* text))) (when (and result (listp result) (> (length result) 0)) (return-from embeddings-compute result))) (error (c) (log-message "EMBEDDING: Backend failed (~a), fallback to hashing" c)))) (embeddings-hash-vector text :dimensions dimensions)) #+end_src ** Ollama Backend Calls ~POST /api/embeddings~ on a local Ollama instance. Configurable via ~OLLAMA_URL~ (default http://localhost:11434) and ~EMBEDDING_MODEL~ (default nomic-embed-text). ;; REPL-VERIFIED: 2026-05-03T14:00:00 #+begin_src lisp (defun embedding-backend-ollama (text) "Generate embeddings via Ollama /api/embeddings." (let* ((url (or (uiop:getenv "OLLAMA_URL") "http://localhost:11434")) (model (or (uiop:getenv "EMBEDDING_MODEL") "nomic-embed-text")) (response (dex:post (format nil "~a/api/embeddings" url) :content (json:encode-json-to-string `((:model . ,model) (:prompt . ,text))) :headers '(("Content-Type" . "application/json"))))) (when response (let* ((json (json:decode-json-from-string response)) (embedding (cdr (assoc :embedding json)))) (when embedding (coerce embedding 'list)))))) #+end_src ** Queue Object ~embed-queue-object~ adds an object to the pending queue if it lacks a vector. Call from the perceive gate after ~ingest-ast~. ;; REPL-VERIFIED: 2026-05-03T14:00:00 #+begin_src lisp (defun embed-queue-object (obj) "Queue OBJ for embedding if it lacks a vector." (when (and obj (not (memory-object-vector obj))) (pushnew (memory-object-id obj) *embedding-queue* :test 'string=))) #+end_src ** Embed Single Object Generates and stores a vector for a single memory object. ;; REPL-VERIFIED: 2026-05-03T14:00:00 #+begin_src lisp (defun embed-object (obj) "Generate and store embedding vector for OBJ." (let* ((attrs (memory-object-attributes obj)) (title (or (getf attrs :TITLE) "")) (text (or (memory-object-content obj) "")) (raw-tags (getf attrs :TAGS)) (tag-list (if (listp raw-tags) raw-tags nil)) (tags (if tag-list (format nil "~{~a~^ ~}" tag-list) "")) (combined (format nil "~a ~a ~a" title text tags)) (vec (embeddings-compute combined))) (setf (memory-object-vector obj) vec) (log-message "EMBEDDING: Vector for ~a (~d dims)" (memory-object-id obj) (length vec)) vec)) #+end_src ** Batch Process Drains the queue and (if queue was empty) scans the store for remaining objects without vectors. Returns count of newly embedded objects. ;; REPL-VERIFIED: 2026-05-03T14:00:00 #+begin_src lisp (defun embed-all-pending () "Process all pending embeddings. Returns count." (let ((count 0)) ;; Drain queue (let ((pending *embedding-queue*)) (setf *embedding-queue* nil) (dolist (id pending) (let ((obj (gethash id *memory-store*))) (when (and obj (not (memory-object-vector obj))) (handler-case (progn (embed-object obj) (incf count)) (error (c) (log-message "EMBEDDING: Failed ~a: ~a" id c))))))) ;; Fallback: scan store for objects without vectors (when (= count 0) (maphash (lambda (id obj) (declare (ignore id)) (unless (memory-object-vector obj) (handler-case (progn (embed-object obj) (incf count)) (error (c) (log-message "EMBEDDING: Failed ~a: ~a" (memory-object-id obj) c))))) *memory-store*)) (when (> count 0) (log-message "EMBEDDING: Batch processed ~d objects" count)) count)) #+end_src ** Initialization Reads ~EMBEDDING_PROVIDER~ env var and configures the backend. ;; REPL-VERIFIED: 2026-05-03T14:00:00 #+begin_src lisp (defun embeddings-init (&key (provider *embedding-provider*)) "Init embedding provider from EMBEDDING_PROVIDER env var." (let* ((env (uiop:getenv "EMBEDDING_PROVIDER")) (selected (or (and env (intern (string-upcase env) :keyword)) provider))) (setf *embedding-provider* selected) (setf *embedding-backend* (case selected (:ollama #'embedding-backend-ollama) (t nil))) (log-message "EMBEDDING: Provider ~a, backend=~a" selected *embedding-backend*) selected)) (embeddings-init) #+end_src