diff --git a/lisp/core-context.lisp b/lisp/core-context.lisp index 7357ec3..25d7a6a 100644 --- a/lisp/core-context.lisp +++ b/lisp/core-context.lisp @@ -37,7 +37,18 @@ (data-dir (uiop:ensure-directory-pathname (or (uiop:getenv "PASSEPARTOUT_DATA_DIR") (namestring (merge-pathnames ".local/share/passepartout/" (user-homedir-pathname)))))) (org-dir (merge-pathnames "org/" data-dir)) (full-path (merge-pathnames filename org-dir))) - (if (uiop:file-exists-p full-path) (uiop:read-file-string full-path) nil))) + (if (uiop:file-exists-p full-path) (uiop:read-file-string full-path) nil))) + +(defun context-skill-subtree (skill-name heading-name) + "Reads a specific headline subtree from a skill's Org source file. +Returns the content under HEADING-NAME (including children) as a string, +or nil if the heading is not found." + (let ((full-source (context-skill-source skill-name))) + (unless full-source (return-from context-skill-subtree nil)) + (if (fboundp 'org-subtree-extract) + (org-subtree-extract full-source heading-name) + ;; Fallback: no org-subtree-extract available, return full source + full-source))) (defun context-logs (&optional limit) "Retrieves the most recent lines from the harness's internal log." diff --git a/lisp/programming-org.lisp b/lisp/programming-org.lisp index a7d066d..93a7c02 100644 --- a/lisp/programming-org.lisp +++ b/lisp/programming-org.lisp @@ -140,7 +140,46 @@ Returns the filtered content as a string." (when (listp child) (let ((found (org-headline-find-by-title child title))) (when found (return-from org-headline-find-by-title found))))) - nil)) + nil)) + +(defun org-subtree-extract (org-content heading-name) + "Extracts a subtree by heading name from Org text. Returns the subtree +content as a string (headline + body + children), or nil if not found." + (let* ((lines (uiop:split-string org-content :separator '(#\Newline))) + (target-depth nil) + (in-target nil) + (result nil)) + (loop for line in lines + for trimmed = (string-trim '(#\Space) line) + do (let ((depth (when (uiop:string-prefix-p "*" trimmed) + (length (subseq trimmed 0 + (position-if (lambda (c) (not (char= c #\*))) + trimmed))))) + (headline-title (when (uiop:string-prefix-p "*" trimmed) + (string-trim '(#\* #\Space) trimmed)))) + (when depth + (when (string-equal headline-title heading-name) + (setf target-depth depth in-target t)) + (when (and in-target target-depth + (<= depth target-depth) + (not (string-equal headline-title heading-name))) + (return-from org-subtree-extract + (format nil "~{~a~^~%~}" (nreverse result))))) + (when in-target (push line result)))) + (when result + (format nil "~{~a~^~%~}" (nreverse result))))) + +(defun org-heading-list (org-content) + "Returns a list of all top-level heading names in Org text." + (let* ((lines (uiop:split-string org-content :separator '(#\Newline))) + (headings nil)) + (dolist (line lines) + (let ((trimmed (string-trim '(#\Space) line))) + (when (uiop:string-prefix-p "* " trimmed) + (let ((title (string-trim '(#\* #\Space) trimmed))) + (unless (find title headings :test #'string-equal) + (push title headings)))))) + (nreverse headings))) (defun org-modify (filepath old-text new-text) "Replaces all occurrences of OLD-TEXT with NEW-TEXT in filepath. diff --git a/lisp/system-embeddings.lisp b/lisp/system-embeddings.lisp new file mode 100644 index 0000000..d52d674 --- /dev/null +++ b/lisp/system-embeddings.lisp @@ -0,0 +1,91 @@ +(defvar *embedding-dimensions* 384 + "Dimension of the embedding vector. Default 384 matches nomic-embed-text.") + +(defvar *embedding-backend* nil + "Optional external embedding function (text-str) → float vector. +When nil, the hashing-trick fallback is used. Register a backend via: + (setf *embedding-backend* (lambda (text) ...)) +For Ollama: POST /api/embeddings with model nomic-embed-text. +For OpenAI: POST /v1/embeddings with model text-embedding-3-small.") + +(defun embeddings-tokenize (text) + "Splits text into lowercase word tokens, stripping punctuation and +discarding tokens shorter than 2 characters." + (let ((clean (cl-ppcre:regex-replace-all "[^a-zA-Z0-9 ]" + (string-downcase (or text "")) " "))) + (remove-if (lambda (w) (< (length w) 2)) + (uiop:split-string clean :separator '(#\Space #\Tab #\Newline))))) + +(defun embeddings-hash-word (word dim) + "Hashes a word to a bucket index in [0, dim). Uses FNV-1a style hashing +for good distribution with minimal collisions." + (let ((hash 2166136261)) + (loop for c across word + do (setf hash (logxor hash (char-code c))) + (setf hash (mod (* hash 16777619) #x100000000))) + (mod hash dim))) + +(defun embeddings-compute (text &key (dimensions *embedding-dimensions*)) + "Computes a dense embedding vector for TEXT. +Tries the registered backend first, falls back to hashing-trick. +Returns a list of DIMENSIONS double-floats normalized to unit length." + ;; Try registered backend first + (when *embedding-backend* + (handler-case + (let ((result (funcall *embedding-backend* text))) + (when (and result (listp result) (> (length result) 0)) + (return-from embeddings-compute result))) + (error (c) + (log-message "EMBEDDING: Backend failed (~a), falling back to hashing" c)))) + ;; Hashing-trick fallback + (let* ((tokens (embeddings-tokenize text)) + (vec (make-array dimensions :initial-element 0.0d0 :element-type 'double-float))) + (dolist (token tokens) + (let* ((idx (embeddings-hash-word token dimensions)) + (sign (if (evenp (char-code (char token 0))) 1 -1))) + (incf (aref vec idx) (coerce sign 'double-float)))) + ;; Normalize to unit length + (let ((norm (sqrt (loop for i below dimensions sum (expt (aref vec i) 2))))) + (if (> norm 0.0d0) + (loop for i below dimensions collect (/ (aref vec i) norm)) + (loop for i below dimensions collect 0.0d0))))) + +(defun embed-object (obj) + "Generates and stores an embedding vector for a memory-object. +Combines title, content, and tags into a single text for embedding. +Stores the result in the memory-object's :vector slot." + (let* ((attrs (memory-object-attributes obj)) + (title (or (getf attrs :TITLE) "")) + (text (or (memory-object-content obj) "")) + (tags (format nil "~{~a~^ ~}" (or (getf attrs :TAGS) ""))) + (combined (format nil "~a ~a ~a" title text tags)) + (vec (embeddings-compute combined))) + (setf (memory-object-vector obj) vec) + (log-message "EMBEDDING: Vector for ~a (~d dims, ~d tokens)" + (memory-object-id obj) (length vec) + (length (embeddings-tokenize combined))) + vec)) + +(defun embed-all-pending () + "Generates embeddings for all memory objects that lack vectors. +Called by the heartbeat or on demand. Returns count of objects processed." + (let ((count 0)) + (maphash (lambda (id obj) + (declare (ignore id)) + (unless (memory-object-vector obj) + (handler-case + (progn (embed-object obj) (incf count)) + (error (c) + (log-message "EMBEDDING: Failed for ~a: ~a" (memory-object-id obj) c))))) + *memory-store*) + (when (> count 0) + (log-message "EMBEDDING: Batch processed ~d objects" count)) + count)) + +(defskill :passepartout-system-embeddings + :priority 50 + :trigger (lambda (ctx) (eq (getf (getf ctx :payload) :sensor) :heartbeat)) + :deterministic (lambda (action ctx) + (declare (ignore action ctx)) + (ignore-errors (embed-all-pending)) + nil)) diff --git a/org/core-context.org b/org/core-context.org index 3eb6188..d969ffc 100644 --- a/org/core-context.org +++ b/org/core-context.org @@ -97,7 +97,25 @@ Reads the raw literate source of a specific skill for inspection. Used when the (data-dir (uiop:ensure-directory-pathname (or (uiop:getenv "PASSEPARTOUT_DATA_DIR") (namestring (merge-pathnames ".local/share/passepartout/" (user-homedir-pathname)))))) (org-dir (merge-pathnames "org/" data-dir)) (full-path (merge-pathnames filename org-dir))) - (if (uiop:file-exists-p full-path) (uiop:read-file-string full-path) nil))) + (if (uiop:file-exists-p full-path) (uiop:read-file-string full-path) nil))) +#+end_src + +** Subtree Skill Source (context-skill-subtree) + +Returns a specific headline subtree from a skill's Org file. Delegates to +=org-subtree-extract= in the =programming-org= skill for actual parsing. + +#+begin_src lisp +(defun context-skill-subtree (skill-name heading-name) + "Reads a specific headline subtree from a skill's Org source file. +Returns the content under HEADING-NAME (including children) as a string, +or nil if the heading is not found." + (let ((full-source (context-skill-source skill-name))) + (unless full-source (return-from context-skill-subtree nil)) + (if (fboundp 'org-subtree-extract) + (org-subtree-extract full-source heading-name) + ;; Fallback: no org-subtree-extract available, return full source + full-source))) #+end_src ** Harness Logs (context-logs) diff --git a/org/programming-org.org b/org/programming-org.org index ffa175b..b7f4c80 100644 --- a/org/programming-org.org +++ b/org/programming-org.org @@ -176,7 +176,53 @@ Returns the filtered content as a string." (when (listp child) (let ((found (org-headline-find-by-title child title))) (when found (return-from org-headline-find-by-title found))))) - nil)) + nil)) +#+end_src + +** Subtree Extraction (from Org text) + +Extracts a specific headline subtree from raw Org text by heading name. +Used by =context-skill-subtree= for targeted skill source loading. + +#+begin_src lisp +(defun org-subtree-extract (org-content heading-name) + "Extracts a subtree by heading name from Org text. Returns the subtree +content as a string (headline + body + children), or nil if not found." + (let* ((lines (uiop:split-string org-content :separator '(#\Newline))) + (target-depth nil) + (in-target nil) + (result nil)) + (loop for line in lines + for trimmed = (string-trim '(#\Space) line) + do (let ((depth (when (uiop:string-prefix-p "*" trimmed) + (length (subseq trimmed 0 + (position-if (lambda (c) (not (char= c #\*))) + trimmed))))) + (headline-title (when (uiop:string-prefix-p "*" trimmed) + (string-trim '(#\* #\Space) trimmed)))) + (when depth + (when (string-equal headline-title heading-name) + (setf target-depth depth in-target t)) + (when (and in-target target-depth + (<= depth target-depth) + (not (string-equal headline-title heading-name))) + (return-from org-subtree-extract + (format nil "~{~a~^~%~}" (nreverse result))))) + (when in-target (push line result)))) + (when result + (format nil "~{~a~^~%~}" (nreverse result))))) + +(defun org-heading-list (org-content) + "Returns a list of all top-level heading names in Org text." + (let* ((lines (uiop:split-string org-content :separator '(#\Newline))) + (headings nil)) + (dolist (line lines) + (let ((trimmed (string-trim '(#\Space) line))) + (when (uiop:string-prefix-p "* " trimmed) + (let ((title (string-trim '(#\* #\Space) trimmed))) + (unless (find title headings :test #'string-equal) + (push title headings)))))) + (nreverse headings))) #+end_src ** Text Modification in Org Files diff --git a/org/system-embeddings.org b/org/system-embeddings.org new file mode 100644 index 0000000..33c0604 --- /dev/null +++ b/org/system-embeddings.org @@ -0,0 +1,140 @@ +#+TITLE: SKILL: Embeddings (org-skill-embeddings.org) +#+AUTHOR: Agent +#+FILETAGS: :system:memory:embeddings: +#+PROPERTY: header-args:lisp :tangle ../lisp/system-embeddings.lisp + +* Overview + +Generates dense vector embeddings for =memory-object= entries, enabling +semantic similarity search via the foveal-peripheral context model. Uses +the hashing trick as a provider-agnostic default — no Ollama, no API keys, +no external dependencies. When an embedding provider backend becomes +available, it is preferred over the hashing fallback. + +The core pipeline (core-context) knows how to USE vectors (=cosine-similarity=, +foveal-peripheral rendering) but does not know how to GENERATE them. That +lives here, in a skill. Thin harness, fat skills. + +* Implementation + +** Embedding Configuration + +#+begin_src lisp +(defvar *embedding-dimensions* 384 + "Dimension of the embedding vector. Default 384 matches nomic-embed-text.") + +(defvar *embedding-backend* nil + "Optional external embedding function (text-str) → float vector. +When nil, the hashing-trick fallback is used. Register a backend via: + (setf *embedding-backend* (lambda (text) ...)) +For Ollama: POST /api/embeddings with model nomic-embed-text. +For OpenAI: POST /v1/embeddings with model text-embedding-3-small.") +#+end_src + +** Hashing-Trick Embedding Engine + +The hashing trick produces dense fixed-size vectors from arbitrary text +without any training or external services. Each word token is hashed to +a bucket and contributes ±1 to that dimension. The resulting vector is +normalized to unit length for cosine-similarity compatibility. + +This is NOT a neural embedding — it won't capture semantic nuance the +way a transformer model would. But it provides a reasonable similarity +signal: documents sharing vocabulary will have correlated vectors, and +the locality-sensitive hashing preserves co-occurrence patterns. + +#+begin_src lisp +(defun embeddings-tokenize (text) + "Splits text into lowercase word tokens, stripping punctuation and +discarding tokens shorter than 2 characters." + (let ((clean (cl-ppcre:regex-replace-all "[^a-zA-Z0-9 ]" + (string-downcase (or text "")) " "))) + (remove-if (lambda (w) (< (length w) 2)) + (uiop:split-string clean :separator '(#\Space #\Tab #\Newline))))) + +(defun embeddings-hash-word (word dim) + "Hashes a word to a bucket index in [0, dim). Uses FNV-1a style hashing +for good distribution with minimal collisions." + (let ((hash 2166136261)) + (loop for c across word + do (setf hash (logxor hash (char-code c))) + (setf hash (mod (* hash 16777619) #x100000000))) + (mod hash dim))) + +(defun embeddings-compute (text &key (dimensions *embedding-dimensions*)) + "Computes a dense embedding vector for TEXT. +Tries the registered backend first, falls back to hashing-trick. +Returns a list of DIMENSIONS double-floats normalized to unit length." + ;; Try registered backend first + (when *embedding-backend* + (handler-case + (let ((result (funcall *embedding-backend* text))) + (when (and result (listp result) (> (length result) 0)) + (return-from embeddings-compute result))) + (error (c) + (log-message "EMBEDDING: Backend failed (~a), falling back to hashing" c)))) + ;; Hashing-trick fallback + (let* ((tokens (embeddings-tokenize text)) + (vec (make-array dimensions :initial-element 0.0d0 :element-type 'double-float))) + (dolist (token tokens) + (let* ((idx (embeddings-hash-word token dimensions)) + (sign (if (evenp (char-code (char token 0))) 1 -1))) + (incf (aref vec idx) (coerce sign 'double-float)))) + ;; Normalize to unit length + (let ((norm (sqrt (loop for i below dimensions sum (expt (aref vec i) 2))))) + (if (> norm 0.0d0) + (loop for i below dimensions collect (/ (aref vec i) norm)) + (loop for i below dimensions collect 0.0d0))))) +#+end_src + +** Memory Object Embedding + +#+begin_src lisp +(defun embed-object (obj) + "Generates and stores an embedding vector for a memory-object. +Combines title, content, and tags into a single text for embedding. +Stores the result in the memory-object's :vector slot." + (let* ((attrs (memory-object-attributes obj)) + (title (or (getf attrs :TITLE) "")) + (text (or (memory-object-content obj) "")) + (tags (format nil "~{~a~^ ~}" (or (getf attrs :TAGS) ""))) + (combined (format nil "~a ~a ~a" title text tags)) + (vec (embeddings-compute combined))) + (setf (memory-object-vector obj) vec) + (log-message "EMBEDDING: Vector for ~a (~d dims, ~d tokens)" + (memory-object-id obj) (length vec) + (length (embeddings-tokenize combined))) + vec)) + +(defun embed-all-pending () + "Generates embeddings for all memory objects that lack vectors. +Called by the heartbeat or on demand. Returns count of objects processed." + (let ((count 0)) + (maphash (lambda (id obj) + (declare (ignore id)) + (unless (memory-object-vector obj) + (handler-case + (progn (embed-object obj) (incf count)) + (error (c) + (log-message "EMBEDDING: Failed for ~a: ~a" (memory-object-id obj) c))))) + *memory-store*) + (when (> count 0) + (log-message "EMBEDDING: Batch processed ~d objects" count)) + count)) +#+end_src + +** Skill Registration + +Runs as a background skill triggered by heartbeat events, processing +pending embeddings in batches. Low priority (50) so it defers to +critical skills. + +#+begin_src lisp +(defskill :passepartout-system-embeddings + :priority 50 + :trigger (lambda (ctx) (eq (getf (getf ctx :payload) :sensor) :heartbeat)) + :deterministic (lambda (action ctx) + (declare (ignore action ctx)) + (ignore-errors (embed-all-pending)) + nil)) +#+end_src