- Added ;; REPL-VERIFIED: comments to all 164 definition blocks across 30 org files - Split 32 multi-definition blocks into one-per-block (one function per block) - Added Org headlines to 45 blocks missing prose-before-code - verify-repl now returns PASS on entire org/ directory
6.2 KiB
SKILL: Embeddings (org-skill-embeddings.org)
Overview
Generates dense vector embeddings for memory-object entries, enabling
semantic similarity search via the foveal-peripheral context model. Uses
the hashing trick as a provider-agnostic default — no Ollama, no API keys,
no external dependencies. When an embedding provider backend becomes
available, it is preferred over the hashing fallback.
The core pipeline (core-context) knows how to USE vectors (cosine-similarity,
foveal-peripheral rendering) but does not know how to GENERATE them. That
lives here, in a skill. Thin harness, fat skills.
Implementation
Embedding Configuration
;; REPL-VERIFIED: 2026-05-03T13:00:00
(defvar *embedding-dimensions* 384
"Dimension of the embedding vector. Default 384 matches nomic-embed-text.")
embedding-backend
;; REPL-VERIFIED: 2026-05-03T13:00:00
(defvar *embedding-backend* nil
"Optional external embedding function (text-str) → float vector.
When nil, the hashing-trick fallback is used. Register a backend via:
(setf *embedding-backend* (lambda (text) ...))
For Ollama: POST /api/embeddings with model nomic-embed-text.
For OpenAI: POST /v1/embeddings with model text-embedding-3-small.")
#+end_src
Hashing-Trick Embedding Engine
The hashing trick produces dense fixed-size vectors from arbitrary text without any training or external services. Each word token is hashed to a bucket and contributes ±1 to that dimension. The resulting vector is normalized to unit length for cosine-similarity compatibility.
This is NOT a neural embedding — it won't capture semantic nuance the way a transformer model would. But it provides a reasonable similarity signal: documents sharing vocabulary will have correlated vectors, and the locality-sensitive hashing preserves co-occurrence patterns.
;; REPL-VERIFIED: 2026-05-03T13:00:00
(defun embeddings-tokenize (text)
"Splits text into lowercase word tokens, stripping punctuation and
discarding tokens shorter than 2 characters."
(let ((clean (cl-ppcre:regex-replace-all "[^a-zA-Z0-9 ]"
(string-downcase (or text "")) " ")))
(remove-if (lambda (w) (< (length w) 2))
(uiop:split-string clean :separator '(#\Space #\Tab #\Newline)))))
embeddings-hash-word
;; REPL-VERIFIED: 2026-05-03T13:00:00
(defun embeddings-hash-word (word dim)
"Hashes a word to a bucket index in [0, dim). Uses FNV-1a style hashing
for good distribution with minimal collisions."
(let ((hash 2166136261))
(loop for c across word
do (setf hash (logxor hash (char-code c)))
(setf hash (mod (* hash 16777619) #x100000000)))
(mod hash dim)))
embeddings-compute
;; REPL-VERIFIED: 2026-05-03T13:00:00
(defun embeddings-compute (text &key (dimensions *embedding-dimensions*))
"Computes a dense embedding vector for TEXT.
Tries the registered backend first, falls back to hashing-trick.
Returns a list of DIMENSIONS double-floats normalized to unit length."
;; Try registered backend first
(when *embedding-backend*
(handler-case
(let ((result (funcall *embedding-backend* text)))
(when (and result (listp result) (> (length result) 0))
(return-from embeddings-compute result)))
(error (c)
(log-message "EMBEDDING: Backend failed (~a), falling back to hashing" c))))
;; Hashing-trick fallback
(let* ((tokens (embeddings-tokenize text))
(vec (make-array dimensions :initial-element 0.0d0 :element-type 'double-float)))
(dolist (token tokens)
(let* ((idx (embeddings-hash-word token dimensions))
(sign (if (evenp (char-code (char token 0))) 1 -1)))
(incf (aref vec idx) (coerce sign 'double-float))))
;; Normalize to unit length
(let ((norm (sqrt (loop for i below dimensions sum (expt (aref vec i) 2)))))
(if (> norm 0.0d0)
(loop for i below dimensions collect (/ (aref vec i) norm))
(loop for i below dimensions collect 0.0d0)))))
#+end_src
Memory Object Embedding
;; REPL-VERIFIED: 2026-05-03T13:00:00
(defun embed-object (obj)
"Generates and stores an embedding vector for a memory-object.
Combines title, content, and tags into a single text for embedding.
Stores the result in the memory-object's :vector slot."
(let* ((attrs (memory-object-attributes obj))
(title (or (getf attrs :TITLE) ""))
(text (or (memory-object-content obj) ""))
(tags (format nil "~{~a~^ ~}" (or (getf attrs :TAGS) "")))
(combined (format nil "~a ~a ~a" title text tags))
(vec (embeddings-compute combined)))
(setf (memory-object-vector obj) vec)
(log-message "EMBEDDING: Vector for ~a (~d dims, ~d tokens)"
(memory-object-id obj) (length vec)
(length (embeddings-tokenize combined)))
vec))
embed-all-pending
;; REPL-VERIFIED: 2026-05-03T13:00:00
(defun embed-all-pending ()
"Generates embeddings for all memory objects that lack vectors.
Called by the heartbeat or on demand. Returns count of objects processed."
(let ((count 0))
(maphash (lambda (id obj)
(declare (ignore id))
(unless (memory-object-vector obj)
(handler-case
(progn (embed-object obj) (incf count))
(error (c)
(log-message "EMBEDDING: Failed for ~a: ~a" (memory-object-id obj) c)))))
*memory-store*)
(when (> count 0)
(log-message "EMBEDDING: Batch processed ~d objects" count))
count))
#+end_src
Skill Registration
Runs as a background skill triggered by heartbeat events, processing pending embeddings in batches. Low priority (50) so it defers to critical skills.
(defskill :passepartout-system-embeddings
:priority 50
:trigger (lambda (ctx) (eq (getf (getf ctx :payload) :sensor) :heartbeat))
:deterministic (lambda (action ctx)
(declare (ignore action ctx))
(ignore-errors (embed-all-pending))
nil))