feat: P2 — provider-agnostic embeddings + subtree loading
Some checks failed
Deploy (Gitea) / deploy (push) Failing after 2s

- system-embeddings (new skill): hashing-trick embeddings for memory-object
  vectors. Works offline with no Ollama/API dependency. Falls back to
  hashing-trick when no embedding backend is registered. Extensible —
  set *embedding-backend* to use Ollama/OpenAI/any provider.
- programming-org: add org-subtree-extract and org-heading-list for
  targeting specific headlines in Org text without loading whole files.
- core-context: add context-skill-subtree thin wrapper delegating to
  org-subtree-extract. Core stays thin — parsing lives in the skill.
This commit is contained in:
2026-05-03 11:43:27 -04:00
parent 836c9ba7b8
commit b67cd12d88
6 changed files with 349 additions and 4 deletions

View File

@@ -37,7 +37,18 @@
(data-dir (uiop:ensure-directory-pathname (or (uiop:getenv "PASSEPARTOUT_DATA_DIR") (namestring (merge-pathnames ".local/share/passepartout/" (user-homedir-pathname))))))
(org-dir (merge-pathnames "org/" data-dir))
(full-path (merge-pathnames filename org-dir)))
(if (uiop:file-exists-p full-path) (uiop:read-file-string full-path) nil)))
(if (uiop:file-exists-p full-path) (uiop:read-file-string full-path) nil)))
(defun context-skill-subtree (skill-name heading-name)
"Reads a specific headline subtree from a skill's Org source file.
Returns the content under HEADING-NAME (including children) as a string,
or nil if the heading is not found."
(let ((full-source (context-skill-source skill-name)))
(unless full-source (return-from context-skill-subtree nil))
(if (fboundp 'org-subtree-extract)
(org-subtree-extract full-source heading-name)
;; Fallback: no org-subtree-extract available, return full source
full-source)))
(defun context-logs (&optional limit)
"Retrieves the most recent lines from the harness's internal log."

View File

@@ -140,7 +140,46 @@ Returns the filtered content as a string."
(when (listp child)
(let ((found (org-headline-find-by-title child title)))
(when found (return-from org-headline-find-by-title found)))))
nil))
nil))
(defun org-subtree-extract (org-content heading-name)
"Extracts a subtree by heading name from Org text. Returns the subtree
content as a string (headline + body + children), or nil if not found."
(let* ((lines (uiop:split-string org-content :separator '(#\Newline)))
(target-depth nil)
(in-target nil)
(result nil))
(loop for line in lines
for trimmed = (string-trim '(#\Space) line)
do (let ((depth (when (uiop:string-prefix-p "*" trimmed)
(length (subseq trimmed 0
(position-if (lambda (c) (not (char= c #\*)))
trimmed)))))
(headline-title (when (uiop:string-prefix-p "*" trimmed)
(string-trim '(#\* #\Space) trimmed))))
(when depth
(when (string-equal headline-title heading-name)
(setf target-depth depth in-target t))
(when (and in-target target-depth
(<= depth target-depth)
(not (string-equal headline-title heading-name)))
(return-from org-subtree-extract
(format nil "~{~a~^~%~}" (nreverse result)))))
(when in-target (push line result))))
(when result
(format nil "~{~a~^~%~}" (nreverse result)))))
(defun org-heading-list (org-content)
"Returns a list of all top-level heading names in Org text."
(let* ((lines (uiop:split-string org-content :separator '(#\Newline)))
(headings nil))
(dolist (line lines)
(let ((trimmed (string-trim '(#\Space) line)))
(when (uiop:string-prefix-p "* " trimmed)
(let ((title (string-trim '(#\* #\Space) trimmed)))
(unless (find title headings :test #'string-equal)
(push title headings))))))
(nreverse headings)))
(defun org-modify (filepath old-text new-text)
"Replaces all occurrences of OLD-TEXT with NEW-TEXT in filepath.

View File

@@ -0,0 +1,91 @@
(defvar *embedding-dimensions* 384
"Dimension of the embedding vector. Default 384 matches nomic-embed-text.")
(defvar *embedding-backend* nil
"Optional external embedding function (text-str) → float vector.
When nil, the hashing-trick fallback is used. Register a backend via:
(setf *embedding-backend* (lambda (text) ...))
For Ollama: POST /api/embeddings with model nomic-embed-text.
For OpenAI: POST /v1/embeddings with model text-embedding-3-small.")
(defun embeddings-tokenize (text)
"Splits text into lowercase word tokens, stripping punctuation and
discarding tokens shorter than 2 characters."
(let ((clean (cl-ppcre:regex-replace-all "[^a-zA-Z0-9 ]"
(string-downcase (or text "")) " ")))
(remove-if (lambda (w) (< (length w) 2))
(uiop:split-string clean :separator '(#\Space #\Tab #\Newline)))))
(defun embeddings-hash-word (word dim)
"Hashes a word to a bucket index in [0, dim). Uses FNV-1a style hashing
for good distribution with minimal collisions."
(let ((hash 2166136261))
(loop for c across word
do (setf hash (logxor hash (char-code c)))
(setf hash (mod (* hash 16777619) #x100000000)))
(mod hash dim)))
(defun embeddings-compute (text &key (dimensions *embedding-dimensions*))
"Computes a dense embedding vector for TEXT.
Tries the registered backend first, falls back to hashing-trick.
Returns a list of DIMENSIONS double-floats normalized to unit length."
;; Try registered backend first
(when *embedding-backend*
(handler-case
(let ((result (funcall *embedding-backend* text)))
(when (and result (listp result) (> (length result) 0))
(return-from embeddings-compute result)))
(error (c)
(log-message "EMBEDDING: Backend failed (~a), falling back to hashing" c))))
;; Hashing-trick fallback
(let* ((tokens (embeddings-tokenize text))
(vec (make-array dimensions :initial-element 0.0d0 :element-type 'double-float)))
(dolist (token tokens)
(let* ((idx (embeddings-hash-word token dimensions))
(sign (if (evenp (char-code (char token 0))) 1 -1)))
(incf (aref vec idx) (coerce sign 'double-float))))
;; Normalize to unit length
(let ((norm (sqrt (loop for i below dimensions sum (expt (aref vec i) 2)))))
(if (> norm 0.0d0)
(loop for i below dimensions collect (/ (aref vec i) norm))
(loop for i below dimensions collect 0.0d0)))))
(defun embed-object (obj)
"Generates and stores an embedding vector for a memory-object.
Combines title, content, and tags into a single text for embedding.
Stores the result in the memory-object's :vector slot."
(let* ((attrs (memory-object-attributes obj))
(title (or (getf attrs :TITLE) ""))
(text (or (memory-object-content obj) ""))
(tags (format nil "~{~a~^ ~}" (or (getf attrs :TAGS) "")))
(combined (format nil "~a ~a ~a" title text tags))
(vec (embeddings-compute combined)))
(setf (memory-object-vector obj) vec)
(log-message "EMBEDDING: Vector for ~a (~d dims, ~d tokens)"
(memory-object-id obj) (length vec)
(length (embeddings-tokenize combined)))
vec))
(defun embed-all-pending ()
"Generates embeddings for all memory objects that lack vectors.
Called by the heartbeat or on demand. Returns count of objects processed."
(let ((count 0))
(maphash (lambda (id obj)
(declare (ignore id))
(unless (memory-object-vector obj)
(handler-case
(progn (embed-object obj) (incf count))
(error (c)
(log-message "EMBEDDING: Failed for ~a: ~a" (memory-object-id obj) c)))))
*memory-store*)
(when (> count 0)
(log-message "EMBEDDING: Batch processed ~d objects" count))
count))
(defskill :passepartout-system-embeddings
:priority 50
:trigger (lambda (ctx) (eq (getf (getf ctx :payload) :sensor) :heartbeat))
:deterministic (lambda (action ctx)
(declare (ignore action ctx))
(ignore-errors (embed-all-pending))
nil))

View File

@@ -97,7 +97,25 @@ Reads the raw literate source of a specific skill for inspection. Used when the
(data-dir (uiop:ensure-directory-pathname (or (uiop:getenv "PASSEPARTOUT_DATA_DIR") (namestring (merge-pathnames ".local/share/passepartout/" (user-homedir-pathname))))))
(org-dir (merge-pathnames "org/" data-dir))
(full-path (merge-pathnames filename org-dir)))
(if (uiop:file-exists-p full-path) (uiop:read-file-string full-path) nil)))
(if (uiop:file-exists-p full-path) (uiop:read-file-string full-path) nil)))
#+end_src
** Subtree Skill Source (context-skill-subtree)
Returns a specific headline subtree from a skill's Org file. Delegates to
=org-subtree-extract= in the =programming-org= skill for actual parsing.
#+begin_src lisp
(defun context-skill-subtree (skill-name heading-name)
"Reads a specific headline subtree from a skill's Org source file.
Returns the content under HEADING-NAME (including children) as a string,
or nil if the heading is not found."
(let ((full-source (context-skill-source skill-name)))
(unless full-source (return-from context-skill-subtree nil))
(if (fboundp 'org-subtree-extract)
(org-subtree-extract full-source heading-name)
;; Fallback: no org-subtree-extract available, return full source
full-source)))
#+end_src
** Harness Logs (context-logs)

View File

@@ -176,7 +176,53 @@ Returns the filtered content as a string."
(when (listp child)
(let ((found (org-headline-find-by-title child title)))
(when found (return-from org-headline-find-by-title found)))))
nil))
nil))
#+end_src
** Subtree Extraction (from Org text)
Extracts a specific headline subtree from raw Org text by heading name.
Used by =context-skill-subtree= for targeted skill source loading.
#+begin_src lisp
(defun org-subtree-extract (org-content heading-name)
"Extracts a subtree by heading name from Org text. Returns the subtree
content as a string (headline + body + children), or nil if not found."
(let* ((lines (uiop:split-string org-content :separator '(#\Newline)))
(target-depth nil)
(in-target nil)
(result nil))
(loop for line in lines
for trimmed = (string-trim '(#\Space) line)
do (let ((depth (when (uiop:string-prefix-p "*" trimmed)
(length (subseq trimmed 0
(position-if (lambda (c) (not (char= c #\*)))
trimmed)))))
(headline-title (when (uiop:string-prefix-p "*" trimmed)
(string-trim '(#\* #\Space) trimmed))))
(when depth
(when (string-equal headline-title heading-name)
(setf target-depth depth in-target t))
(when (and in-target target-depth
(<= depth target-depth)
(not (string-equal headline-title heading-name)))
(return-from org-subtree-extract
(format nil "~{~a~^~%~}" (nreverse result)))))
(when in-target (push line result))))
(when result
(format nil "~{~a~^~%~}" (nreverse result)))))
(defun org-heading-list (org-content)
"Returns a list of all top-level heading names in Org text."
(let* ((lines (uiop:split-string org-content :separator '(#\Newline)))
(headings nil))
(dolist (line lines)
(let ((trimmed (string-trim '(#\Space) line)))
(when (uiop:string-prefix-p "* " trimmed)
(let ((title (string-trim '(#\* #\Space) trimmed)))
(unless (find title headings :test #'string-equal)
(push title headings))))))
(nreverse headings)))
#+end_src
** Text Modification in Org Files

140
org/system-embeddings.org Normal file
View File

@@ -0,0 +1,140 @@
#+TITLE: SKILL: Embeddings (org-skill-embeddings.org)
#+AUTHOR: Agent
#+FILETAGS: :system:memory:embeddings:
#+PROPERTY: header-args:lisp :tangle ../lisp/system-embeddings.lisp
* Overview
Generates dense vector embeddings for =memory-object= entries, enabling
semantic similarity search via the foveal-peripheral context model. Uses
the hashing trick as a provider-agnostic default — no Ollama, no API keys,
no external dependencies. When an embedding provider backend becomes
available, it is preferred over the hashing fallback.
The core pipeline (core-context) knows how to USE vectors (=cosine-similarity=,
foveal-peripheral rendering) but does not know how to GENERATE them. That
lives here, in a skill. Thin harness, fat skills.
* Implementation
** Embedding Configuration
#+begin_src lisp
(defvar *embedding-dimensions* 384
"Dimension of the embedding vector. Default 384 matches nomic-embed-text.")
(defvar *embedding-backend* nil
"Optional external embedding function (text-str) → float vector.
When nil, the hashing-trick fallback is used. Register a backend via:
(setf *embedding-backend* (lambda (text) ...))
For Ollama: POST /api/embeddings with model nomic-embed-text.
For OpenAI: POST /v1/embeddings with model text-embedding-3-small.")
#+end_src
** Hashing-Trick Embedding Engine
The hashing trick produces dense fixed-size vectors from arbitrary text
without any training or external services. Each word token is hashed to
a bucket and contributes ±1 to that dimension. The resulting vector is
normalized to unit length for cosine-similarity compatibility.
This is NOT a neural embedding — it won't capture semantic nuance the
way a transformer model would. But it provides a reasonable similarity
signal: documents sharing vocabulary will have correlated vectors, and
the locality-sensitive hashing preserves co-occurrence patterns.
#+begin_src lisp
(defun embeddings-tokenize (text)
"Splits text into lowercase word tokens, stripping punctuation and
discarding tokens shorter than 2 characters."
(let ((clean (cl-ppcre:regex-replace-all "[^a-zA-Z0-9 ]"
(string-downcase (or text "")) " ")))
(remove-if (lambda (w) (< (length w) 2))
(uiop:split-string clean :separator '(#\Space #\Tab #\Newline)))))
(defun embeddings-hash-word (word dim)
"Hashes a word to a bucket index in [0, dim). Uses FNV-1a style hashing
for good distribution with minimal collisions."
(let ((hash 2166136261))
(loop for c across word
do (setf hash (logxor hash (char-code c)))
(setf hash (mod (* hash 16777619) #x100000000)))
(mod hash dim)))
(defun embeddings-compute (text &key (dimensions *embedding-dimensions*))
"Computes a dense embedding vector for TEXT.
Tries the registered backend first, falls back to hashing-trick.
Returns a list of DIMENSIONS double-floats normalized to unit length."
;; Try registered backend first
(when *embedding-backend*
(handler-case
(let ((result (funcall *embedding-backend* text)))
(when (and result (listp result) (> (length result) 0))
(return-from embeddings-compute result)))
(error (c)
(log-message "EMBEDDING: Backend failed (~a), falling back to hashing" c))))
;; Hashing-trick fallback
(let* ((tokens (embeddings-tokenize text))
(vec (make-array dimensions :initial-element 0.0d0 :element-type 'double-float)))
(dolist (token tokens)
(let* ((idx (embeddings-hash-word token dimensions))
(sign (if (evenp (char-code (char token 0))) 1 -1)))
(incf (aref vec idx) (coerce sign 'double-float))))
;; Normalize to unit length
(let ((norm (sqrt (loop for i below dimensions sum (expt (aref vec i) 2)))))
(if (> norm 0.0d0)
(loop for i below dimensions collect (/ (aref vec i) norm))
(loop for i below dimensions collect 0.0d0)))))
#+end_src
** Memory Object Embedding
#+begin_src lisp
(defun embed-object (obj)
"Generates and stores an embedding vector for a memory-object.
Combines title, content, and tags into a single text for embedding.
Stores the result in the memory-object's :vector slot."
(let* ((attrs (memory-object-attributes obj))
(title (or (getf attrs :TITLE) ""))
(text (or (memory-object-content obj) ""))
(tags (format nil "~{~a~^ ~}" (or (getf attrs :TAGS) "")))
(combined (format nil "~a ~a ~a" title text tags))
(vec (embeddings-compute combined)))
(setf (memory-object-vector obj) vec)
(log-message "EMBEDDING: Vector for ~a (~d dims, ~d tokens)"
(memory-object-id obj) (length vec)
(length (embeddings-tokenize combined)))
vec))
(defun embed-all-pending ()
"Generates embeddings for all memory objects that lack vectors.
Called by the heartbeat or on demand. Returns count of objects processed."
(let ((count 0))
(maphash (lambda (id obj)
(declare (ignore id))
(unless (memory-object-vector obj)
(handler-case
(progn (embed-object obj) (incf count))
(error (c)
(log-message "EMBEDDING: Failed for ~a: ~a" (memory-object-id obj) c)))))
*memory-store*)
(when (> count 0)
(log-message "EMBEDDING: Batch processed ~d objects" count))
count))
#+end_src
** Skill Registration
Runs as a background skill triggered by heartbeat events, processing
pending embeddings in batches. Low priority (50) so it defers to
critical skills.
#+begin_src lisp
(defskill :passepartout-system-embeddings
:priority 50
:trigger (lambda (ctx) (eq (getf (getf ctx :payload) :sensor) :heartbeat))
:deterministic (lambda (action ctx)
(declare (ignore action ctx))
(ignore-errors (embed-all-pending))
nil))
#+end_src