feat: P2 — provider-agnostic embeddings + subtree loading
Some checks failed
Deploy (Gitea) / deploy (push) Failing after 2s
Some checks failed
Deploy (Gitea) / deploy (push) Failing after 2s
- system-embeddings (new skill): hashing-trick embeddings for memory-object vectors. Works offline with no Ollama/API dependency. Falls back to hashing-trick when no embedding backend is registered. Extensible — set *embedding-backend* to use Ollama/OpenAI/any provider. - programming-org: add org-subtree-extract and org-heading-list for targeting specific headlines in Org text without loading whole files. - core-context: add context-skill-subtree thin wrapper delegating to org-subtree-extract. Core stays thin — parsing lives in the skill.
This commit is contained in:
@@ -37,7 +37,18 @@
|
||||
(data-dir (uiop:ensure-directory-pathname (or (uiop:getenv "PASSEPARTOUT_DATA_DIR") (namestring (merge-pathnames ".local/share/passepartout/" (user-homedir-pathname))))))
|
||||
(org-dir (merge-pathnames "org/" data-dir))
|
||||
(full-path (merge-pathnames filename org-dir)))
|
||||
(if (uiop:file-exists-p full-path) (uiop:read-file-string full-path) nil)))
|
||||
(if (uiop:file-exists-p full-path) (uiop:read-file-string full-path) nil)))
|
||||
|
||||
(defun context-skill-subtree (skill-name heading-name)
|
||||
"Reads a specific headline subtree from a skill's Org source file.
|
||||
Returns the content under HEADING-NAME (including children) as a string,
|
||||
or nil if the heading is not found."
|
||||
(let ((full-source (context-skill-source skill-name)))
|
||||
(unless full-source (return-from context-skill-subtree nil))
|
||||
(if (fboundp 'org-subtree-extract)
|
||||
(org-subtree-extract full-source heading-name)
|
||||
;; Fallback: no org-subtree-extract available, return full source
|
||||
full-source)))
|
||||
|
||||
(defun context-logs (&optional limit)
|
||||
"Retrieves the most recent lines from the harness's internal log."
|
||||
|
||||
@@ -140,7 +140,46 @@ Returns the filtered content as a string."
|
||||
(when (listp child)
|
||||
(let ((found (org-headline-find-by-title child title)))
|
||||
(when found (return-from org-headline-find-by-title found)))))
|
||||
nil))
|
||||
nil))
|
||||
|
||||
(defun org-subtree-extract (org-content heading-name)
|
||||
"Extracts a subtree by heading name from Org text. Returns the subtree
|
||||
content as a string (headline + body + children), or nil if not found."
|
||||
(let* ((lines (uiop:split-string org-content :separator '(#\Newline)))
|
||||
(target-depth nil)
|
||||
(in-target nil)
|
||||
(result nil))
|
||||
(loop for line in lines
|
||||
for trimmed = (string-trim '(#\Space) line)
|
||||
do (let ((depth (when (uiop:string-prefix-p "*" trimmed)
|
||||
(length (subseq trimmed 0
|
||||
(position-if (lambda (c) (not (char= c #\*)))
|
||||
trimmed)))))
|
||||
(headline-title (when (uiop:string-prefix-p "*" trimmed)
|
||||
(string-trim '(#\* #\Space) trimmed))))
|
||||
(when depth
|
||||
(when (string-equal headline-title heading-name)
|
||||
(setf target-depth depth in-target t))
|
||||
(when (and in-target target-depth
|
||||
(<= depth target-depth)
|
||||
(not (string-equal headline-title heading-name)))
|
||||
(return-from org-subtree-extract
|
||||
(format nil "~{~a~^~%~}" (nreverse result)))))
|
||||
(when in-target (push line result))))
|
||||
(when result
|
||||
(format nil "~{~a~^~%~}" (nreverse result)))))
|
||||
|
||||
(defun org-heading-list (org-content)
|
||||
"Returns a list of all top-level heading names in Org text."
|
||||
(let* ((lines (uiop:split-string org-content :separator '(#\Newline)))
|
||||
(headings nil))
|
||||
(dolist (line lines)
|
||||
(let ((trimmed (string-trim '(#\Space) line)))
|
||||
(when (uiop:string-prefix-p "* " trimmed)
|
||||
(let ((title (string-trim '(#\* #\Space) trimmed)))
|
||||
(unless (find title headings :test #'string-equal)
|
||||
(push title headings))))))
|
||||
(nreverse headings)))
|
||||
|
||||
(defun org-modify (filepath old-text new-text)
|
||||
"Replaces all occurrences of OLD-TEXT with NEW-TEXT in filepath.
|
||||
|
||||
91
lisp/system-embeddings.lisp
Normal file
91
lisp/system-embeddings.lisp
Normal file
@@ -0,0 +1,91 @@
|
||||
(defvar *embedding-dimensions* 384
|
||||
"Dimension of the embedding vector. Default 384 matches nomic-embed-text.")
|
||||
|
||||
(defvar *embedding-backend* nil
|
||||
"Optional external embedding function (text-str) → float vector.
|
||||
When nil, the hashing-trick fallback is used. Register a backend via:
|
||||
(setf *embedding-backend* (lambda (text) ...))
|
||||
For Ollama: POST /api/embeddings with model nomic-embed-text.
|
||||
For OpenAI: POST /v1/embeddings with model text-embedding-3-small.")
|
||||
|
||||
(defun embeddings-tokenize (text)
|
||||
"Splits text into lowercase word tokens, stripping punctuation and
|
||||
discarding tokens shorter than 2 characters."
|
||||
(let ((clean (cl-ppcre:regex-replace-all "[^a-zA-Z0-9 ]"
|
||||
(string-downcase (or text "")) " ")))
|
||||
(remove-if (lambda (w) (< (length w) 2))
|
||||
(uiop:split-string clean :separator '(#\Space #\Tab #\Newline)))))
|
||||
|
||||
(defun embeddings-hash-word (word dim)
|
||||
"Hashes a word to a bucket index in [0, dim). Uses FNV-1a style hashing
|
||||
for good distribution with minimal collisions."
|
||||
(let ((hash 2166136261))
|
||||
(loop for c across word
|
||||
do (setf hash (logxor hash (char-code c)))
|
||||
(setf hash (mod (* hash 16777619) #x100000000)))
|
||||
(mod hash dim)))
|
||||
|
||||
(defun embeddings-compute (text &key (dimensions *embedding-dimensions*))
|
||||
"Computes a dense embedding vector for TEXT.
|
||||
Tries the registered backend first, falls back to hashing-trick.
|
||||
Returns a list of DIMENSIONS double-floats normalized to unit length."
|
||||
;; Try registered backend first
|
||||
(when *embedding-backend*
|
||||
(handler-case
|
||||
(let ((result (funcall *embedding-backend* text)))
|
||||
(when (and result (listp result) (> (length result) 0))
|
||||
(return-from embeddings-compute result)))
|
||||
(error (c)
|
||||
(log-message "EMBEDDING: Backend failed (~a), falling back to hashing" c))))
|
||||
;; Hashing-trick fallback
|
||||
(let* ((tokens (embeddings-tokenize text))
|
||||
(vec (make-array dimensions :initial-element 0.0d0 :element-type 'double-float)))
|
||||
(dolist (token tokens)
|
||||
(let* ((idx (embeddings-hash-word token dimensions))
|
||||
(sign (if (evenp (char-code (char token 0))) 1 -1)))
|
||||
(incf (aref vec idx) (coerce sign 'double-float))))
|
||||
;; Normalize to unit length
|
||||
(let ((norm (sqrt (loop for i below dimensions sum (expt (aref vec i) 2)))))
|
||||
(if (> norm 0.0d0)
|
||||
(loop for i below dimensions collect (/ (aref vec i) norm))
|
||||
(loop for i below dimensions collect 0.0d0)))))
|
||||
|
||||
(defun embed-object (obj)
|
||||
"Generates and stores an embedding vector for a memory-object.
|
||||
Combines title, content, and tags into a single text for embedding.
|
||||
Stores the result in the memory-object's :vector slot."
|
||||
(let* ((attrs (memory-object-attributes obj))
|
||||
(title (or (getf attrs :TITLE) ""))
|
||||
(text (or (memory-object-content obj) ""))
|
||||
(tags (format nil "~{~a~^ ~}" (or (getf attrs :TAGS) "")))
|
||||
(combined (format nil "~a ~a ~a" title text tags))
|
||||
(vec (embeddings-compute combined)))
|
||||
(setf (memory-object-vector obj) vec)
|
||||
(log-message "EMBEDDING: Vector for ~a (~d dims, ~d tokens)"
|
||||
(memory-object-id obj) (length vec)
|
||||
(length (embeddings-tokenize combined)))
|
||||
vec))
|
||||
|
||||
(defun embed-all-pending ()
|
||||
"Generates embeddings for all memory objects that lack vectors.
|
||||
Called by the heartbeat or on demand. Returns count of objects processed."
|
||||
(let ((count 0))
|
||||
(maphash (lambda (id obj)
|
||||
(declare (ignore id))
|
||||
(unless (memory-object-vector obj)
|
||||
(handler-case
|
||||
(progn (embed-object obj) (incf count))
|
||||
(error (c)
|
||||
(log-message "EMBEDDING: Failed for ~a: ~a" (memory-object-id obj) c)))))
|
||||
*memory-store*)
|
||||
(when (> count 0)
|
||||
(log-message "EMBEDDING: Batch processed ~d objects" count))
|
||||
count))
|
||||
|
||||
(defskill :passepartout-system-embeddings
|
||||
:priority 50
|
||||
:trigger (lambda (ctx) (eq (getf (getf ctx :payload) :sensor) :heartbeat))
|
||||
:deterministic (lambda (action ctx)
|
||||
(declare (ignore action ctx))
|
||||
(ignore-errors (embed-all-pending))
|
||||
nil))
|
||||
@@ -97,7 +97,25 @@ Reads the raw literate source of a specific skill for inspection. Used when the
|
||||
(data-dir (uiop:ensure-directory-pathname (or (uiop:getenv "PASSEPARTOUT_DATA_DIR") (namestring (merge-pathnames ".local/share/passepartout/" (user-homedir-pathname))))))
|
||||
(org-dir (merge-pathnames "org/" data-dir))
|
||||
(full-path (merge-pathnames filename org-dir)))
|
||||
(if (uiop:file-exists-p full-path) (uiop:read-file-string full-path) nil)))
|
||||
(if (uiop:file-exists-p full-path) (uiop:read-file-string full-path) nil)))
|
||||
#+end_src
|
||||
|
||||
** Subtree Skill Source (context-skill-subtree)
|
||||
|
||||
Returns a specific headline subtree from a skill's Org file. Delegates to
|
||||
=org-subtree-extract= in the =programming-org= skill for actual parsing.
|
||||
|
||||
#+begin_src lisp
|
||||
(defun context-skill-subtree (skill-name heading-name)
|
||||
"Reads a specific headline subtree from a skill's Org source file.
|
||||
Returns the content under HEADING-NAME (including children) as a string,
|
||||
or nil if the heading is not found."
|
||||
(let ((full-source (context-skill-source skill-name)))
|
||||
(unless full-source (return-from context-skill-subtree nil))
|
||||
(if (fboundp 'org-subtree-extract)
|
||||
(org-subtree-extract full-source heading-name)
|
||||
;; Fallback: no org-subtree-extract available, return full source
|
||||
full-source)))
|
||||
#+end_src
|
||||
|
||||
** Harness Logs (context-logs)
|
||||
|
||||
@@ -176,7 +176,53 @@ Returns the filtered content as a string."
|
||||
(when (listp child)
|
||||
(let ((found (org-headline-find-by-title child title)))
|
||||
(when found (return-from org-headline-find-by-title found)))))
|
||||
nil))
|
||||
nil))
|
||||
#+end_src
|
||||
|
||||
** Subtree Extraction (from Org text)
|
||||
|
||||
Extracts a specific headline subtree from raw Org text by heading name.
|
||||
Used by =context-skill-subtree= for targeted skill source loading.
|
||||
|
||||
#+begin_src lisp
|
||||
(defun org-subtree-extract (org-content heading-name)
|
||||
"Extracts a subtree by heading name from Org text. Returns the subtree
|
||||
content as a string (headline + body + children), or nil if not found."
|
||||
(let* ((lines (uiop:split-string org-content :separator '(#\Newline)))
|
||||
(target-depth nil)
|
||||
(in-target nil)
|
||||
(result nil))
|
||||
(loop for line in lines
|
||||
for trimmed = (string-trim '(#\Space) line)
|
||||
do (let ((depth (when (uiop:string-prefix-p "*" trimmed)
|
||||
(length (subseq trimmed 0
|
||||
(position-if (lambda (c) (not (char= c #\*)))
|
||||
trimmed)))))
|
||||
(headline-title (when (uiop:string-prefix-p "*" trimmed)
|
||||
(string-trim '(#\* #\Space) trimmed))))
|
||||
(when depth
|
||||
(when (string-equal headline-title heading-name)
|
||||
(setf target-depth depth in-target t))
|
||||
(when (and in-target target-depth
|
||||
(<= depth target-depth)
|
||||
(not (string-equal headline-title heading-name)))
|
||||
(return-from org-subtree-extract
|
||||
(format nil "~{~a~^~%~}" (nreverse result)))))
|
||||
(when in-target (push line result))))
|
||||
(when result
|
||||
(format nil "~{~a~^~%~}" (nreverse result)))))
|
||||
|
||||
(defun org-heading-list (org-content)
|
||||
"Returns a list of all top-level heading names in Org text."
|
||||
(let* ((lines (uiop:split-string org-content :separator '(#\Newline)))
|
||||
(headings nil))
|
||||
(dolist (line lines)
|
||||
(let ((trimmed (string-trim '(#\Space) line)))
|
||||
(when (uiop:string-prefix-p "* " trimmed)
|
||||
(let ((title (string-trim '(#\* #\Space) trimmed)))
|
||||
(unless (find title headings :test #'string-equal)
|
||||
(push title headings))))))
|
||||
(nreverse headings)))
|
||||
#+end_src
|
||||
|
||||
** Text Modification in Org Files
|
||||
|
||||
140
org/system-embeddings.org
Normal file
140
org/system-embeddings.org
Normal file
@@ -0,0 +1,140 @@
|
||||
#+TITLE: SKILL: Embeddings (org-skill-embeddings.org)
|
||||
#+AUTHOR: Agent
|
||||
#+FILETAGS: :system:memory:embeddings:
|
||||
#+PROPERTY: header-args:lisp :tangle ../lisp/system-embeddings.lisp
|
||||
|
||||
* Overview
|
||||
|
||||
Generates dense vector embeddings for =memory-object= entries, enabling
|
||||
semantic similarity search via the foveal-peripheral context model. Uses
|
||||
the hashing trick as a provider-agnostic default — no Ollama, no API keys,
|
||||
no external dependencies. When an embedding provider backend becomes
|
||||
available, it is preferred over the hashing fallback.
|
||||
|
||||
The core pipeline (core-context) knows how to USE vectors (=cosine-similarity=,
|
||||
foveal-peripheral rendering) but does not know how to GENERATE them. That
|
||||
lives here, in a skill. Thin harness, fat skills.
|
||||
|
||||
* Implementation
|
||||
|
||||
** Embedding Configuration
|
||||
|
||||
#+begin_src lisp
|
||||
(defvar *embedding-dimensions* 384
|
||||
"Dimension of the embedding vector. Default 384 matches nomic-embed-text.")
|
||||
|
||||
(defvar *embedding-backend* nil
|
||||
"Optional external embedding function (text-str) → float vector.
|
||||
When nil, the hashing-trick fallback is used. Register a backend via:
|
||||
(setf *embedding-backend* (lambda (text) ...))
|
||||
For Ollama: POST /api/embeddings with model nomic-embed-text.
|
||||
For OpenAI: POST /v1/embeddings with model text-embedding-3-small.")
|
||||
#+end_src
|
||||
|
||||
** Hashing-Trick Embedding Engine
|
||||
|
||||
The hashing trick produces dense fixed-size vectors from arbitrary text
|
||||
without any training or external services. Each word token is hashed to
|
||||
a bucket and contributes ±1 to that dimension. The resulting vector is
|
||||
normalized to unit length for cosine-similarity compatibility.
|
||||
|
||||
This is NOT a neural embedding — it won't capture semantic nuance the
|
||||
way a transformer model would. But it provides a reasonable similarity
|
||||
signal: documents sharing vocabulary will have correlated vectors, and
|
||||
the locality-sensitive hashing preserves co-occurrence patterns.
|
||||
|
||||
#+begin_src lisp
|
||||
(defun embeddings-tokenize (text)
|
||||
"Splits text into lowercase word tokens, stripping punctuation and
|
||||
discarding tokens shorter than 2 characters."
|
||||
(let ((clean (cl-ppcre:regex-replace-all "[^a-zA-Z0-9 ]"
|
||||
(string-downcase (or text "")) " ")))
|
||||
(remove-if (lambda (w) (< (length w) 2))
|
||||
(uiop:split-string clean :separator '(#\Space #\Tab #\Newline)))))
|
||||
|
||||
(defun embeddings-hash-word (word dim)
|
||||
"Hashes a word to a bucket index in [0, dim). Uses FNV-1a style hashing
|
||||
for good distribution with minimal collisions."
|
||||
(let ((hash 2166136261))
|
||||
(loop for c across word
|
||||
do (setf hash (logxor hash (char-code c)))
|
||||
(setf hash (mod (* hash 16777619) #x100000000)))
|
||||
(mod hash dim)))
|
||||
|
||||
(defun embeddings-compute (text &key (dimensions *embedding-dimensions*))
|
||||
"Computes a dense embedding vector for TEXT.
|
||||
Tries the registered backend first, falls back to hashing-trick.
|
||||
Returns a list of DIMENSIONS double-floats normalized to unit length."
|
||||
;; Try registered backend first
|
||||
(when *embedding-backend*
|
||||
(handler-case
|
||||
(let ((result (funcall *embedding-backend* text)))
|
||||
(when (and result (listp result) (> (length result) 0))
|
||||
(return-from embeddings-compute result)))
|
||||
(error (c)
|
||||
(log-message "EMBEDDING: Backend failed (~a), falling back to hashing" c))))
|
||||
;; Hashing-trick fallback
|
||||
(let* ((tokens (embeddings-tokenize text))
|
||||
(vec (make-array dimensions :initial-element 0.0d0 :element-type 'double-float)))
|
||||
(dolist (token tokens)
|
||||
(let* ((idx (embeddings-hash-word token dimensions))
|
||||
(sign (if (evenp (char-code (char token 0))) 1 -1)))
|
||||
(incf (aref vec idx) (coerce sign 'double-float))))
|
||||
;; Normalize to unit length
|
||||
(let ((norm (sqrt (loop for i below dimensions sum (expt (aref vec i) 2)))))
|
||||
(if (> norm 0.0d0)
|
||||
(loop for i below dimensions collect (/ (aref vec i) norm))
|
||||
(loop for i below dimensions collect 0.0d0)))))
|
||||
#+end_src
|
||||
|
||||
** Memory Object Embedding
|
||||
|
||||
#+begin_src lisp
|
||||
(defun embed-object (obj)
|
||||
"Generates and stores an embedding vector for a memory-object.
|
||||
Combines title, content, and tags into a single text for embedding.
|
||||
Stores the result in the memory-object's :vector slot."
|
||||
(let* ((attrs (memory-object-attributes obj))
|
||||
(title (or (getf attrs :TITLE) ""))
|
||||
(text (or (memory-object-content obj) ""))
|
||||
(tags (format nil "~{~a~^ ~}" (or (getf attrs :TAGS) "")))
|
||||
(combined (format nil "~a ~a ~a" title text tags))
|
||||
(vec (embeddings-compute combined)))
|
||||
(setf (memory-object-vector obj) vec)
|
||||
(log-message "EMBEDDING: Vector for ~a (~d dims, ~d tokens)"
|
||||
(memory-object-id obj) (length vec)
|
||||
(length (embeddings-tokenize combined)))
|
||||
vec))
|
||||
|
||||
(defun embed-all-pending ()
|
||||
"Generates embeddings for all memory objects that lack vectors.
|
||||
Called by the heartbeat or on demand. Returns count of objects processed."
|
||||
(let ((count 0))
|
||||
(maphash (lambda (id obj)
|
||||
(declare (ignore id))
|
||||
(unless (memory-object-vector obj)
|
||||
(handler-case
|
||||
(progn (embed-object obj) (incf count))
|
||||
(error (c)
|
||||
(log-message "EMBEDDING: Failed for ~a: ~a" (memory-object-id obj) c)))))
|
||||
*memory-store*)
|
||||
(when (> count 0)
|
||||
(log-message "EMBEDDING: Batch processed ~d objects" count))
|
||||
count))
|
||||
#+end_src
|
||||
|
||||
** Skill Registration
|
||||
|
||||
Runs as a background skill triggered by heartbeat events, processing
|
||||
pending embeddings in batches. Low priority (50) so it defers to
|
||||
critical skills.
|
||||
|
||||
#+begin_src lisp
|
||||
(defskill :passepartout-system-embeddings
|
||||
:priority 50
|
||||
:trigger (lambda (ctx) (eq (getf (getf ctx :payload) :sensor) :heartbeat))
|
||||
:deterministic (lambda (action ctx)
|
||||
(declare (ignore action ctx))
|
||||
(ignore-errors (embed-all-pending))
|
||||
nil))
|
||||
#+end_src
|
||||
Reference in New Issue
Block a user