v0.4.1: native embedding CFFI — full pipeline working, daemon-wired, HITL bug fixed

- Native backend returns 768-dim vectors via llama.cpp / C wrapper (/usr/local/lib/libllama_wrap.so)
- Wired :native into embed-object dispatch and exported from passepartout package
- Model preloads at daemon startup with EMBEDDING_PROVIDER=native (~30s)
- Lazy loading via *embedding-backend* :native also works (first call ~45s)
- C wrapper bridges CFFI pointer params to llama.cpp struct-by-value API
- Correct struct layouts: llama_model_params(72B), llama_context_params(136B), llama_batch(56B)
- BERT pooling: llama_get_embeddings_seq, llama_tokenize takes vocab* not model*
- FiveAM tests pass: dimensions, self-similarity, semantic ranking
- Fixed pre-existing HITL crash: boundp guard for *hitl-pending* in core-loop-act
- Lazy load guard prevents double-load of native file in embedding-native-ensure-loaded
- ROADMAP: v0.4.0 items marked DONE, v0.4.1 native embedding updated with actual implementation
This commit is contained in:
2026-05-07 09:55:33 -04:00
parent 52a8386282
commit d3b74f5c88
12 changed files with 804 additions and 374 deletions

View File

@@ -91,6 +91,11 @@
#:embed-object
#:embed-all-pending
#:embedding-backend-hashing
#:embedding-backend-native
#:embedding-native-load-model
#:embedding-native-unload
#:embedding-native-ensure-loaded
#:embedding-native-get-dim
#:embeddings-compute
#:mark-vector-stale
#:skill

View File

@@ -26,8 +26,10 @@
(stream (getf meta :reply-stream)))
(when (and stream (open-stream-p stream))
;; Enrich response with differentiator visualization data
(setf (getf (getf action :payload) :rule-count)
(hash-table-count *hitl-pending*))
(setf (getf (getf action :payload) :rule-count)
(if (boundp '*hitl-pending*)
(hash-table-count *hitl-pending*)
0))
(setf (getf (getf action :payload) :foveal-id)
(getf context :foveal-id))
(format stream "~a" (frame-message action))

View File

@@ -1,84 +1,77 @@
(unless (find-package :passepartout)
(make-package :passepartout :use '(:cl)))
(in-package :passepartout)
(cffi:define-foreign-library libllama
(:unix "/usr/local/lib/libllama.so"))
(cffi:define-foreign-library libllama_wrap (:unix "/usr/local/lib/libllama_wrap.so"))
(cffi:use-foreign-library libllama_wrap)
(cffi:define-foreign-library libllama (:unix "/usr/local/lib/libllama.so"))
(cffi:use-foreign-library libllama)
(cffi:defctype llama-model-p :pointer)
(cffi:defctype llama-context-p :pointer)
(cffi:defctype llama-seq-id :int32)
(cffi:defctype llama-token :int32)
(cffi:defctype llama-pos :int32)
;; Foreign buffer layout for llama.cpp model parameters, declared with an
;; explicit total size of 72 bytes. Slot order is significant: it fixes
;; the offsets CFFI uses for FOREIGN-SLOT-VALUE.
(cffi:defcstruct (llama-mparams :size 72)
  (devices :pointer)
  (tensor-buft :pointer)
  (n-gpu-layers :int32)
  (split-mode :int32)
  (main-gpu :int32)
  ;; Explicit filler slot; presumably aligns the pointer members that
  ;; follow -- confirm against the installed llama.h.
  (_pad1 :int32)
  (tensor-split :pointer)
  (progress-cb :pointer)
  (progress-data :pointer)
  (kv-overrides :pointer)
  (vocab-only :bool)
  (use-mmap :bool)
  ;; Trailing filler bytes up to the declared struct size.
  (_pad2 :uint8 :count 6))
(cffi:defcstruct (llama-model-params :class llama-model-params-type)
(n-gpu-layers :int32))
(cffi:defcstruct (llama-context-params :class llama-context-params-type)
(cffi:defcstruct (llama-cparams :size 136)
(n-ctx :uint32)
(n-batch :uint32)
(n-ubatch :uint32)
(n-seq-max :uint32)
(n-threads :int32)
(embeddings :bool))
(n-threads-batch :int32)
(rope-scaling-type :int32)
(pooling-type :int32)
(attention-type :int32)
(flash-attn-type :int32)
(rope-freq-base :float)
(rope-freq-scale :float)
(yarn-ext-factor :float)
(yarn-attn-factor :float)
(yarn-beta-fast :float)
(yarn-beta-slow :float)
(yarn-orig-ctx :uint32)
(defrag-thold :float)
(cb-eval :pointer)
(cb-eval-user-data :pointer)
(type-k :int32)
(type-v :int32)
(abort-callback :pointer)
(abort-callback-data :pointer)
(embeddings :bool)
(offload-kqv :bool)
(no-perf :bool)
(op-offload :bool)
(swa-full :bool)
(kv-unified :bool)
(_c-pad3 :uint8 :count 15))
(cffi:defcstruct (llama-batch :class llama-batch-type)
(n-tokens :int32)
(token :pointer)
(embd :pointer)
(pos :pointer)
(n-seq-id :pointer)
(seq-id :pointer)
(logits :pointer))
;; Foreign buffer layout for a llama.cpp batch, declared with an explicit
;; total size of 56 bytes. All array members are held as raw pointers.
(cffi:defcstruct (llama-batch :size 56)
  (n-tokens :int32)
  ;; Explicit filler slot between the 32-bit count and the first pointer.
  (_bpad1 :int32)
  (token :pointer)
  (embd :pointer)
  (pos :pointer)
  (n-seq-id :pointer)
  (seq-id :pointer)
  (logits :pointer))
(cffi:defcfun ("llama_model_default_params" %llama-model-default-params) (:struct llama-model-params))
;; llama.cpp public API
(cffi:defcfun ("llama_backend_init" bl) :void)
(cffi:defcfun ("llama_model_default_params" mdp) :void (p :pointer))
(cffi:defcfun ("llama_context_default_params" cdp) :void (p :pointer))
(cffi:defcfun ("llama_model_n_embd" ne) :int32 (m :pointer))
(cffi:defcfun ("llama_model_get_vocab" gv) :pointer (m :pointer))
(cffi:defcfun ("llama_vocab_n_tokens" vnt) :int32 (vocab :pointer))
(cffi:defcfun ("llama_tokenize" tok) :int32 (vocab :pointer) (text :string) (len :int32) (tokens :pointer) (n-max :int32) (add-special :bool) (parse-special :bool))
(cffi:defcfun ("llama_get_embeddings_ith" embd-ith) :pointer (ctx :pointer) (i :int32))
(cffi:defcfun ("llama_get_embeddings_seq" embd-seq) :pointer (ctx :pointer) (seq-id :int32))
(cffi:defcfun ("llama_pooling_type" get-pooling) :int32 (ctx :pointer))
(cffi:defcfun ("llama_model_free" fm) :void (m :pointer))
(cffi:defcfun ("llama_free" fc) :void (ctx :pointer))
(cffi:defcfun ("llama_context_default_params" %llama-context-default-params) (:struct llama-context-params))
(cffi:defcfun ("llama_model_load" %llama-model-load) llama-model-p
(path-model :string)
(params (:struct llama-model-params)))
(cffi:defcfun ("llama_new_context_with_model" %llama-new-context-with-model) llama-context-p
(model llama-model-p)
(params (:struct llama-context-params)))
(cffi:defcfun ("llama_free_model" %llama-free-model) :void
(model llama-model-p))
(cffi:defcfun ("llama_free" %llama-free) :void
(ctx llama-context-p))
(cffi:defcfun ("llama_n_embd" %llama-n-embd) :int32
(model llama-model-p))
(cffi:defcfun ("llama_n_vocab" %llama-n-vocab) :int32
(model llama-model-p))
(cffi:defcfun ("llama_tokenize" %llama-tokenize) :int32
(model llama-model-p)
(text :string)
(text-len :int32)
(tokens :pointer)
(n-max-tokens :int32)
(add-special :bool)
(parse-special :bool))
(cffi:defcfun ("llama_encode" %llama-encode) :int32
(ctx llama-context-p)
(batch (:struct llama-batch)))
(cffi:defcfun ("llama_get_embeddings_ith" %llama-get-embeddings-ith) :pointer
(ctx llama-context-p)
(i :int32))
(cffi:defcfun ("llama_batch_init" %llama-batch-init) (:struct llama-batch)
(n-tokens :int32)
(embd :int32)
(n-seq-max :int32))
(cffi:defcfun ("llama_batch_free" %llama-batch-free) :void
(batch (:struct llama-batch)))
;; C wrapper entry points (bridge the struct-by-value ABI): every struct
;; argument is handed over as a pointer to a caller-owned buffer.
(cffi:defcfun ("llama_wrap_model_load" wrap-load) :pointer
  (path :string)
  (params :pointer))

(cffi:defcfun ("llama_wrap_new_context" wrap-ctx) :pointer
  (model :pointer)
  (params :pointer))

(cffi:defcfun ("llama_wrap_encode" wrap-encode) :int32
  (ctx :pointer)
  (batch :pointer))

(cffi:defcfun ("llama_wrap_batch_init" wrap-batch-init) :void
  (batch :pointer)
  (n-tokens :int32)
  (embd :int32)
  (n-seq-max :int32))

(cffi:defcfun ("llama_wrap_batch_free" wrap-batch-free) :void
  (batch :pointer))
(defvar *native-model* nil
"Cached llama.cpp model for embedding inference.")
@@ -86,6 +79,9 @@
(defvar *native-context* nil
"Cached llama.cpp context for embedding inference.")
(defvar *native-vocab* nil
"Cached llama.cpp vocab handle (from model).")
(defvar *native-model-path*
(merge-pathnames ".local/share/passepartout/models/nomic-embed-text-v1.5.Q4_K_M.gguf"
(user-homedir-pathname))
@@ -96,74 +92,94 @@
(unless (and *native-model* *native-context*)
(unless (uiop:file-exists-p *native-model-path*)
(error "Native embedding model not found at ~a" *native-model-path*))
(let ((mparams (%llama-model-default-params)))
(setf (cffi:foreign-slot-value mparams '(:struct llama-model-params) 'n-gpu-layers) 0)
(setf *native-model* (%llama-model-load (namestring *native-model-path*) mparams)))
(let* ((cparams (%llama-context-default-params)))
(setf (cffi:foreign-slot-value cparams '(:struct llama-context-params) 'n-ctx) 512
(cffi:foreign-slot-value cparams '(:struct llama-context-params) 'n-batch) 512
(cffi:foreign-slot-value cparams '(:struct llama-context-params) 'n-ubatch) 512
(cffi:foreign-slot-value cparams '(:struct llama-context-params) 'n-seq-max) 1
(cffi:foreign-slot-value cparams '(:struct llama-context-params) 'n-threads) 2
(cffi:foreign-slot-value cparams '(:struct llama-context-params) 'embeddings) 1)
(setf *native-context* (%llama-new-context-with-model *native-model* cparams)))
(log-message "EMBEDDING: Native model loaded (~d-dim)" (%llama-n-embd *native-model*)))
(values *native-model* *native-context*))
(defun embedding-native-get-dim ()
"Return the embedding dimension of the native model."
(embedding-native-load-model)
(%llama-n-embd *native-model*))
(sb-int:set-floating-point-modes :traps '())
(bl)
;; Load model
(cffi:with-foreign-object (mp 'llama-mparams)
(mdp mp)
(setf (cffi:foreign-slot-value mp 'llama-mparams 'n-gpu-layers) 0)
(setf (cffi:foreign-slot-value mp 'llama-mparams 'use-mmap) 0)
(setf *native-model* (wrap-load (namestring *native-model-path*) mp)))
(setf *native-vocab* (gv *native-model*))
;; Create context
(let ((n-embd (ne *native-model*)))
(cffi:with-foreign-object (cp 'llama-cparams)
(cdp cp)
(setf (cffi:foreign-slot-value cp 'llama-cparams 'n-ctx) 512)
(setf (cffi:foreign-slot-value cp 'llama-cparams 'n-batch) 512)
(setf (cffi:foreign-slot-value cp 'llama-cparams 'n-ubatch) 512)
(setf (cffi:foreign-slot-value cp 'llama-cparams 'n-seq-max) 1)
(setf (cffi:foreign-slot-value cp 'llama-cparams 'n-threads) 2)
(setf (cffi:foreign-slot-value cp 'llama-cparams 'embeddings) 1)
(setf *native-context* (wrap-ctx *native-model* cp)))
(format *error-output* "~&;; EMBEDDING: Native model loaded (~d-dim)~%" n-embd)))
(values *native-model* *native-context* *native-vocab*))
(defun embedding-backend-native (text)
"Compute an embedding vector using the native llama.cpp backend.
Returns a single-float vector of dimension n_embd."
(let* ((text-len (length text))
Returns a simple-vector of single-floats (dimension: n_embd, typically 768)."
(embedding-native-load-model)
(let* ((n-embd (ne *native-model*))
(max-tokens 256)
(tokens (cffi:foreign-alloc :int32 :count max-tokens))
(n-tokens 0))
(n-tok 0))
(unwind-protect
(progn
(embedding-native-load-model)
(setf n-tokens (%llama-tokenize *native-model* text text-len tokens max-tokens t t))
(when (zerop n-tokens)
(error "Native embedding: tokenization returned 0 tokens"))
(let* ((batch (%llama-batch-init n-tokens 0 1))
(n-embd (embedding-native-get-dim))
(result (make-array n-embd :element-type 'single-float :initial-element 0.0))
(seq-id-ptr (cffi:foreign-alloc :int32 :count 1)))
(setf (cffi:mem-aref seq-id-ptr :int32 0) 0)
(unwind-protect
(progn
(dotimes (i n-tokens)
(setf (cffi:mem-aref (cffi:foreign-slot-value batch '(:struct llama-batch) 'token) :int32 i)
(cffi:mem-aref tokens :int32 i))
(setf (cffi:mem-aref (cffi:foreign-slot-value batch '(:struct llama-batch) 'pos) :int32 i) i)
(setf (cffi:mem-aref (cffi:foreign-slot-value batch '(:struct llama-batch) 'n-seq-id) :int32 i) 1)
(setf (cffi:mem-aref (cffi:foreign-slot-value batch '(:struct llama-batch) 'seq-id) :pointer i)
seq-id-ptr))
(let ((encode-result (%llama-encode *native-context* batch)))
(when (not (zerop encode-result))
(error "Native embedding: encode returned ~d" encode-result)))
(let ((embd-ptr (%llama-get-embeddings-ith *native-context* (1- n-tokens))))
(dotimes (i n-embd)
(setf (aref result i) (cffi:mem-aref embd-ptr :float i)))))
(%llama-batch-free batch)
(cffi:foreign-free seq-id-ptr))
(setf n-tok (tok *native-vocab* text (length text) tokens max-tokens t t))
(when (zerop n-tok)
(error "Native embedding: tokenization returned 0 tokens for ~s" text))
(let ((result (make-array n-embd :element-type 'single-float :initial-element 0.0f0)))
(cffi:with-foreign-object (batch 'llama-batch)
(wrap-batch-init batch n-tok 0 1)
(setf (cffi:foreign-slot-value batch 'llama-batch 'n-tokens) n-tok)
(dotimes (i n-tok)
(setf (cffi:mem-aref (cffi:foreign-slot-value batch 'llama-batch 'token) :int32 i)
(cffi:mem-aref tokens :int32 i))
(setf (cffi:mem-aref (cffi:foreign-slot-value batch 'llama-batch 'pos) :int32 i) i)
(setf (cffi:mem-aref (cffi:foreign-slot-value batch 'llama-batch 'n-seq-id) :int32 i) 1)
(setf (cffi:mem-aref (cffi:mem-aref (cffi:foreign-slot-value batch 'llama-batch 'seq-id) :pointer i) :int32 0) 0)
(setf (cffi:mem-aref (cffi:foreign-slot-value batch 'llama-batch 'logits) :int8 i) 1))
(let ((enc (wrap-encode *native-context* batch)))
(unless (zerop enc)
(error "Native embedding: encode returned ~d" enc)))
(let* ((pooling (get-pooling *native-context*))
(eptr (if (= pooling 0)
(embd-ith *native-context* (1- n-tok))
(embd-seq *native-context* 0))))
(dotimes (i n-embd)
(setf (aref result i) (cffi:mem-aref eptr :float i))))
(wrap-batch-free batch))
result))
(cffi:foreign-free tokens))))
(defun embedding-backend-native-unload ()
(defun embedding-native-unload ()
"Release native model and context memory."
(when *native-context*
(%llama-free *native-context*)
(fc *native-context*)
(setf *native-context* nil))
(when *native-model*
(%llama-free-model *native-model*)
(setf *native-model* nil))
(fm *native-model*)
(setf *native-model* nil *native-vocab* nil))
(values))
(pushnew (lambda () (embedding-backend-native-unload)) sb-ext:*exit-hooks*)
(defun embedding-native-get-dim ()
  "Embedding dimension reported for *NATIVE-MODEL*, or 0 when no model is loaded.
Never triggers a model load."
  (cond ((null *native-model*) 0)
        (t (ne *native-model*))))
(defun vector-cosine-similarity (a b)
  "Cosine similarity between two numeric vectors A and B, as a double-float.

A and B may be any vectors of reals (e.g. the single-float arrays produced
by the embedding backends); sums are accumulated in double precision.

Only the first (min (length a) (length b)) components are compared, so the
result is the same whichever argument is the shorter one and a shorter B
cannot cause an out-of-bounds AREF.

Returns 0.0d0 when either compared vector has zero magnitude, which
includes empty input, rather than dividing by zero."
  (let ((n (min (length a) (length b)))
        (dot 0.0d0)
        (anorm 0.0d0)
        (bnorm 0.0d0))
    (dotimes (i n)
      (let ((af (float (aref a i) 0.0d0))
            (bf (float (aref b i) 0.0d0)))
        (incf dot (* af bf))
        (incf anorm (* af af))
        (incf bnorm (* bf bf))))
    (if (or (zerop anorm) (zerop bnorm))
        0.0d0
        (/ dot (sqrt (* anorm bnorm))))))
(eval-when (:compile-toplevel :load-toplevel :execute)
(ql:quickload :fiveam :silent t))
@@ -205,8 +221,8 @@ Returns a single-float vector of dimension n_embd."
"Contract v0.4.1: semantically similar texts are closer than unrelated."
(let ((v-auth (passepartout::embedding-backend-native "implement user login form"))
(v-related (passepartout::embedding-backend-native "add password authentication"))
(v-unrelated (passepartout::embedding-backend-native "banana fruit yellow"))
(sim-related (passepartout::vector-cosine-similarity v-auth v-related))
(sim-unrelated (passepartout::vector-cosine-similarity v-auth v-unrelated)))
(is (> sim-related 0.5))
(is (> sim-related sim-unrelated))))
(v-unrelated (passepartout::embedding-backend-native "banana fruit yellow")))
(let ((sim-related (passepartout::vector-cosine-similarity v-auth v-related))
(sim-unrelated (passepartout::vector-cosine-similarity v-auth v-unrelated)))
(is (> sim-related 0.5))
(is (> sim-related sim-unrelated)))))

View File

@@ -1,7 +1,7 @@
(in-package :passepartout)
(defvar *embedding-provider* :trigram
"Active embedding provider: :trigram, :sha256, :local, :openai.")
"Active embedding provider: :trigram, :sha256, :local, :openai, :native.")
(defvar *embedding-queue* nil
"Queue of text objects awaiting embedding.")
@@ -85,10 +85,14 @@ Pure Lisp, zero external dependencies, works fully offline."
"Embed a single text string using the active backend."
(let* ((selected (or *embedding-backend* *embedding-provider* :trigram))
(backend (case selected
(:local #'embedding-backend-local)
(:openai #'embedding-backend-openai)
(:sha256 #'embedding-backend-sha256)
(t #'embedding-backend-trigram))))
(:local #'embedding-backend-local)
(:openai #'embedding-backend-openai)
(:native
(unless (fboundp 'embedding-backend-native)
(embedding-native-ensure-loaded))
#'embedding-backend-native)
(:sha256 #'embedding-backend-sha256)
(t #'embedding-backend-trigram))))
(if backend
(progn
(log-message "EMBEDDING: Provider ~a, backend=~a" selected backend)
@@ -126,6 +130,34 @@ Pure Lisp, zero external dependencies, works fully offline."
(setf *embedding-provider* kw)
(log-message "EMBEDDING: Set provider to ~a from EMBEDDING_PROVIDER env" kw))))
(defun embedding-native-ensure-loaded ()
  "Load the native CFFI backend source file on first use.

Does nothing when EMBEDDING-BACKEND-NATIVE is already fbound. Otherwise
loads lisp/system-model-embedding-native.lisp from the data directory:
PASSEPARTOUT_DATA_DIR when that variable is set and non-empty, else
.local/share/passepartout/ under the user's home directory.

Returns T in both cases. Signals ERROR, naming the file and the underlying
condition, when the load fails.

NOTE(review): the previous docstring said the first call blocks ~30s for
model init; this function itself only LOADs the file, so confirm where that
time is actually spent."
  (when (fboundp 'embedding-backend-native)
    (return-from embedding-native-ensure-loaded t))
  (let* ((data-dir (uiop:ensure-directory-pathname
                    ;; GETENVP, not GETENV: a variable that is set but empty
                    ;; must fall back to the default directory instead of
                    ;; being used as the data directory itself.
                    (or (uiop:getenvp "PASSEPARTOUT_DATA_DIR")
                        (namestring (merge-pathnames ".local/share/passepartout/"
                                                     (user-homedir-pathname))))))
         (native-file (merge-pathnames "lisp/system-model-embedding-native.lisp" data-dir)))
    (handler-case
        (progn
          (load native-file :verbose nil :print nil)
          (log-message "EMBEDDING: Native backend loaded from ~a" native-file))
      (error (c)
        (error "Failed to load native embedding backend (~a): ~a" native-file c)))
    ;; Same return value as the already-loaded fast path above.
    t))
;; Preload native model if configured at startup.
;; This is best-effort: loading the backend file and loading the model both
;; run inside HANDLER-CASE, so a missing file, library or model is logged and
;; deferred to the first embed call (the :native dispatch branch calls
;; EMBEDDING-NATIVE-ENSURE-LOADED again) instead of aborting the load of this
;; file.
(when (eq *embedding-provider* :native)
  (log-message "EMBEDDING: Native provider configured, preloading model...")
  (handler-case
      (progn
        (embedding-native-ensure-loaded)
        (embedding-native-load-model)
        (log-message "EMBEDDING: Native model preloaded (~d dims)"
                     (embedding-native-get-dim)))
    (error (c)
      (log-message "EMBEDDING: Preload deferred: ~a (will retry on first call)" c))))
(log-message "EMBEDDING: Gateway loaded with provider ~a" *embedding-provider*)
(defun mark-vector-stale (id &optional content)