#+TITLE: SKILL: Native Embedding Inference (org-skill-embedding-native.org)
#+AUTHOR: Agent
#+FILETAGS: :skill:system:embedding:cffi:
#+PROPERTY: header-args:lisp :tangle /home/user/.local/share/passepartout/lisp/embedding-native.lisp

* Architectural Intent

=system-model-embedding-native= provides in-process embedding inference via CFFI binding to llama.cpp. Unlike =:local= (Ollama REST API) and =:openai= (paid API), =:native= runs the embedding model directly in the SBCL process — zero network calls, zero external servers.

The bundled model is =nomic-embed-text-v1.5= (nomic-bert, 768-dim, 12 layers, Q4_K_M quantization, ~80MB) at =~/.local/share/passepartout/models/nomic-embed-text-v1.5.Q4_K_M.gguf=. It is a BERT-family encoder-only model — single forward pass, no autoregressive decoding.

*Key architectural decisions*:
- C wrapper library (=/usr/local/lib/libllama_wrap.so=) bridges CFFI pointer params to llama.cpp's struct-by-value API (CFFI cannot pass/return structs by value)
- Struct sizes verified via C ~sizeof~ / ~offsetof~: =llama_model_params= (72B), =llama_context_params= (136B), =llama_batch= (56B)
- Model and context cached globally in =*native-model*= / =*native-context*= to avoid reloading
- BERT pooling: =llama_get_embeddings_seq= for sequence-level embedding (not =llama_get_embeddings_ith=)
- =sb-int:set-floating-point-modes= :traps nil required before any llama.cpp call (FPU state conflict)

* Implementation
** Package guard

#+begin_src lisp
;; Create the package at compile time as well as at load time: IN-PACKAGE is
;; processed at compile time, so the package must already exist when
;; COMPILE-FILE reaches the form below in a fresh image.
(eval-when (:compile-toplevel :load-toplevel :execute)
  (unless (find-package :passepartout)
    (make-package :passepartout :use '(:cl))))
(in-package :passepartout)
#+end_src

** CFFI: Load C wrapper + llama libraries

The C wrapper (=libllama_wrap.so=) bridges struct-by-value: all wrapper functions take pure pointers and dereference internally.
#+begin_src lisp
;; Load libllama first.  libllama_wrap.so is linked against it (-lllama), so
;; having libllama already loaded by absolute path lets the wrapper's
;; dependency resolve even when /usr/local/lib is not on the dynamic loader's
;; search path.
(cffi:define-foreign-library libllama
  (:unix "/usr/local/lib/libllama.so"))
(cffi:use-foreign-library libllama)

(cffi:define-foreign-library libllama_wrap
  (:unix "/usr/local/lib/libllama_wrap.so"))
(cffi:use-foreign-library libllama_wrap)
#+end_src

** CFFI: Struct definitions

Sizes verified via C =sizeof= / =offsetof= at build time.

#+begin_src lisp
;; Each struct declares its total size explicitly with :size; the underscore-
;; prefixed slots are padding placeholders.
;; NOTE: the :bool slots take Lisp booleans (T / NIL).  A numeric 0 is non-NIL
;; in Lisp and is therefore stored as true.

;; Mirror of llama_model_params (72 bytes).
(cffi:defcstruct (llama-mparams :size 72)
  (devices :pointer)
  (tensor-buft :pointer)
  (n-gpu-layers :int32)
  (split-mode :int32)
  (main-gpu :int32)
  (_pad1 :int32)
  (tensor-split :pointer)
  (progress-cb :pointer)
  (progress-data :pointer)
  (kv-overrides :pointer)
  (vocab-only :bool)
  (use-mmap :bool)
  (_pad2 :uint8 :count 6))

;; Mirror of llama_context_params (136 bytes).
(cffi:defcstruct (llama-cparams :size 136)
  (n-ctx :uint32)
  (n-batch :uint32)
  (n-ubatch :uint32)
  (n-seq-max :uint32)
  (n-threads :int32)
  (n-threads-batch :int32)
  (rope-scaling-type :int32)
  (pooling-type :int32)
  (attention-type :int32)
  (flash-attn-type :int32)
  (rope-freq-base :float)
  (rope-freq-scale :float)
  (yarn-ext-factor :float)
  (yarn-attn-factor :float)
  (yarn-beta-fast :float)
  (yarn-beta-slow :float)
  (yarn-orig-ctx :uint32)
  (defrag-thold :float)
  (cb-eval :pointer)
  (cb-eval-user-data :pointer)
  (type-k :int32)
  (type-v :int32)
  (abort-callback :pointer)
  (abort-callback-data :pointer)
  (embeddings :bool)
  (offload-kqv :bool)
  (no-perf :bool)
  (op-offload :bool)
  (swa-full :bool)
  (kv-unified :bool)
  (_c-pad3 :uint8 :count 15))

;; Mirror of llama_batch (56 bytes).
(cffi:defcstruct (llama-batch :size 56)
  (n-tokens :int32)
  (_bpad1 :int32)
  (token :pointer)
  (embd :pointer)
  (pos :pointer)
  (n-seq-id :pointer)
  (seq-id :pointer)
  (logits :pointer))
#+end_src

** CFFI: llama.cpp API (current, non-deprecated)

llama.cpp has undergone API changes.
We target the current stable API:
- =llama_model_load_from_file= → C wrapper (=llama_wrap_model_load=)
- =llama_init_from_model= → C wrapper (=llama_wrap_new_context=)
- =llama_encode= → C wrapper (=llama_wrap_encode=) — takes struct-by-value batch
- =llama_batch_init/free= → C wrapper — returns/consumes struct-by-value
- =llama_backend_init= REQUIRED before any model load
- =llama_model_n_embd= (NOT deprecated =llama_n_embd=)
- =llama_model_get_vocab= + =llama_vocab_n_tokens= (NOT deprecated =llama_n_vocab= with model pointer)
- =llama_tokenize= now takes =vocab*= not =model*=
- =llama_get_embeddings_seq= for BERT pooled embeddings (=llama_get_embeddings_ith= for token embeddings)
- =llama_pooling_type= to query context pooling strategy

#+begin_src lisp
;; llama.cpp public API
(cffi:defcfun ("llama_backend_init" bl) :void)
(cffi:defcfun ("llama_model_default_params" mdp) :void
  (p :pointer))
(cffi:defcfun ("llama_context_default_params" cdp) :void
  (p :pointer))
(cffi:defcfun ("llama_model_n_embd" ne) :int32
  (m :pointer))
(cffi:defcfun ("llama_model_get_vocab" gv) :pointer
  (m :pointer))
(cffi:defcfun ("llama_vocab_n_tokens" vnt) :int32
  (vocab :pointer))
(cffi:defcfun ("llama_tokenize" tok) :int32
  (vocab :pointer)
  (text :string)
  (len :int32)
  (tokens :pointer)
  (n-max :int32)
  (add-special :bool)
  (parse-special :bool))
(cffi:defcfun ("llama_get_embeddings_ith" embd-ith) :pointer
  (ctx :pointer)
  (i :int32))
(cffi:defcfun ("llama_get_embeddings_seq" embd-seq) :pointer
  (ctx :pointer)
  (seq-id :int32))
(cffi:defcfun ("llama_pooling_type" get-pooling) :int32
  (ctx :pointer))
(cffi:defcfun ("llama_model_free" fm) :void
  (m :pointer))
(cffi:defcfun ("llama_free" fc) :void
  (ctx :pointer))

;; C wrapper (bridges struct-by-value ABI)
(cffi:defcfun ("llama_wrap_model_load" wrap-load) :pointer
  (path :string)
  (params :pointer))
(cffi:defcfun ("llama_wrap_new_context" wrap-ctx) :pointer
  (model :pointer)
  (params :pointer))
(cffi:defcfun ("llama_wrap_encode" wrap-encode) :int32
  (ctx :pointer)
  (batch :pointer))
(cffi:defcfun ("llama_wrap_batch_init" wrap-batch-init) :void
  (batch :pointer)
  (n-tokens :int32)
  (embd :int32)
  (n-seq-max :int32))
(cffi:defcfun ("llama_wrap_batch_free" wrap-batch-free) :void
  (batch :pointer))
#+end_src

** Global state

#+begin_src lisp
(defvar *native-model* nil
  "Cached llama.cpp model for embedding inference.")

(defvar *native-context* nil
  "Cached llama.cpp context for embedding inference.")

(defvar *native-vocab* nil
  "Cached llama.cpp vocab handle (from model).")

(defvar *native-model-path*
  (merge-pathnames ".local/share/passepartout/models/nomic-embed-text-v1.5.Q4_K_M.gguf"
                   (user-homedir-pathname))
  "Path to the bundled embedding model GGUF file.")
#+end_src

** Model loading

Loads the GGUF model file and creates an inference context. Caches globally — subsequent calls are no-ops.

Key initialization:
- =sb-int:set-floating-point-modes= :traps nil — required or llama.cpp FPU ops SIGFPE
- =llama_backend_init= — must run before any model operation
- Model params: GPU off (=n-gpu-layers= = 0), no mmap (avoids double-free with SBCL's malloc)
- Context params: embeddings=1, 512-token context, 2 threads, =pooling_type= unset (let model decide)

#+begin_src lisp
(defun embedding-native-load-model ()
  "Load the embedding model and create an inference context, caching both.

Does nothing when a model and a context are already cached.  Otherwise
checks that *NATIVE-MODEL-PATH* exists, clears the floating-point trap set,
initializes the llama.cpp backend, loads the model with GPU layers set to 0
and mmap disabled, and creates a context with a 512-token window, 2 threads
and embeddings enabled.

Returns three values: the model, the context and the vocab handle.
Signals an ERROR when the model file is missing, or when llama.cpp returns a
null model or a null context; in the latter cases nothing is cached."
  (unless (and *native-model* *native-context*)
    (unless (uiop:file-exists-p *native-model-path*)
      (error "Native embedding model not found at ~a" *native-model-path*))
    (sb-int:set-floating-point-modes :traps '())
    (bl)
    (let ((model (cffi:with-foreign-object (mp '(:struct llama-mparams))
                   ;; Start from llama.cpp's defaults, then override.
                   (mdp mp)
                   (cffi:with-foreign-slots ((n-gpu-layers use-mmap)
                                             mp (:struct llama-mparams))
                     ;; USE-MMAP is a :bool slot: it must be NIL to disable
                     ;; mmap.  A numeric 0 is non-NIL and would enable it.
                     (setf n-gpu-layers 0
                           use-mmap nil))
                   (wrap-load (namestring *native-model-path*) mp))))
      ;; A failed load comes back as a null pointer, which is a true value in
      ;; Lisp; caching it would make later calls dereference NULL.
      (when (cffi:null-pointer-p model)
        (error "Native embedding: failed to load model from ~a" *native-model-path*))
      (let ((context (cffi:with-foreign-object (cp '(:struct llama-cparams))
                       (cdp cp)
                       (cffi:with-foreign-slots ((n-ctx n-batch n-ubatch n-seq-max
                                                  n-threads embeddings)
                                                 cp (:struct llama-cparams))
                         (setf n-ctx 512
                               n-batch 512
                               n-ubatch 512
                               n-seq-max 1
                               n-threads 2
                               embeddings t))
                       (wrap-ctx model cp))))
        (when (cffi:null-pointer-p context)
          ;; Do not leak the model when the context cannot be created.
          (fm model)
          (error "Native embedding: failed to create context for ~a" *native-model-path*))
        ;; Publish the handles only once everything succeeded.
        (setf *native-model* model
              *native-vocab* (gv model)
              *native-context* context)
        (format *error-output* "~&;; EMBEDDING: Native model loaded (~d-dim)~%" (ne model)))))
  (values *native-model* *native-context* *native-vocab*))
#+end_src

** Embedding inference

Computes a 768-dim single-float vector for the given text via llama.cpp.

Pipeline:
1. Load/cache model + context
2. Tokenize text via =llama_tokenize= (takes =vocab*= not =model*= since v0.4.1)
3. Initialize batch via C wrapper (=llama_batch_init= returns struct-by-value)
4. Fill batch: set =tokens=, =pos=, =n_seq_id=, =seq_id[0]=, =logits= for each position
5. CRITICAL: set =batch.n_tokens= explicitly — =llama_batch_init= initializes it to 0
6. Encode via C wrapper (=llama_encode= takes struct-by-value batch)
7.
Extract pooled embedding via =llama_get_embeddings_seq= (BERT CLS pooling) — falls back to =llama_get_embeddings_ith= if =pooling_type == NONE=
8. Free batch memory via wrapper (=llama_batch_free= takes struct-by-value)

NOTE: we write =seq_id= values directly into the arrays allocated by =llama_batch_init= (not foreign-alloc'd separately) to avoid double-free.

#+begin_src lisp
(defun embedding-backend-native (text)
  "Compute an embedding vector for TEXT using the native llama.cpp backend.

TEXT must be a string.  Returns a (SIMPLE-ARRAY SINGLE-FLOAT (*)) whose
length is the model's n_embd (typically 768).

Signals an ERROR when tokenization yields no tokens, when TEXT needs more
tokens than the 256-token buffer holds, when encoding returns a non-zero
status, or when llama.cpp hands back a null embedding pointer.  The token
buffer and the batch are released on every exit path."
  (check-type text string)
  (embedding-native-load-model)
  (let* ((n-embd (ne *native-model*))
         (max-tokens 256)
         ;; llama_tokenize takes the text length in bytes, not characters;
         ;; the two differ as soon as TEXT contains non-ASCII characters.
         (n-bytes (length (sb-ext:string-to-octets text :external-format :utf-8)))
         (tokens (cffi:foreign-alloc :int32 :count max-tokens)))
    (unwind-protect
         (let ((n-tok (tok *native-vocab* text n-bytes tokens max-tokens t t))
               (result (make-array n-embd :element-type 'single-float
                                          :initial-element 0.0f0)))
           (cond ((zerop n-tok)
                  (error "Native embedding: tokenization returned 0 tokens for ~s" text))
                 ;; A negative result means the buffer was too small; its
                 ;; magnitude is the number of tokens that would be needed.
                 ((minusp n-tok)
                  (error "Native embedding: text needs ~d tokens but at most ~d are supported"
                         (- n-tok) max-tokens)))
           (cffi:with-foreign-object (batch '(:struct llama-batch))
             (wrap-batch-init batch n-tok 0 1)
             (unwind-protect
                  (progn
                    (cffi:with-foreign-slots ((n-tokens token pos n-seq-id seq-id logits)
                                              batch (:struct llama-batch))
                      ;; llama_batch_init leaves n_tokens at 0; set it here.
                      (setf n-tokens n-tok)
                      (dotimes (i n-tok)
                        (setf (cffi:mem-aref token :int32 i) (cffi:mem-aref tokens :int32 i)
                              (cffi:mem-aref pos :int32 i) i
                              (cffi:mem-aref n-seq-id :int32 i) 1
                              ;; Write into the seq_id array that
                              ;; llama_batch_init allocated for position I.
                              (cffi:mem-aref (cffi:mem-aref seq-id :pointer i) :int32 0) 0
                              (cffi:mem-aref logits :int8 i) 1)))
                    (let ((enc (wrap-encode *native-context* batch)))
                      (unless (zerop enc)
                        (error "Native embedding: encode returned ~d" enc)))
                    ;; Pooling type 0 means no pooling: read the last token's
                    ;; embedding.  Otherwise read the pooled embedding of
                    ;; sequence 0.
                    (let ((eptr (if (= (get-pooling *native-context*) 0)
                                    (embd-ith *native-context* (1- n-tok))
                                    (embd-seq *native-context* 0))))
                      (when (cffi:null-pointer-p eptr)
                        (error "Native embedding: llama.cpp returned no embedding data"))
                      (dotimes (i n-embd)
                        (setf (aref result i) (cffi:mem-aref eptr :float i)))))
               ;; Runs on normal return and on error alike.
               (wrap-batch-free batch)))
           result)
      (cffi:foreign-free tokens))))
#+end_src

** Cleanup and unload

#+begin_src lisp
(defun embedding-native-unload ()
  "Release native model and context memory.
Frees the cached context and then the cached model, resets the cached
globals to NIL, and returns no values."
  (when *native-context*
    (fc *native-context*)
    (setf *native-context* nil))
  (when *native-model*
    (fm *native-model*)
    (setf *native-model* nil
          *native-vocab* nil))
  (values))

(defun embedding-native-get-dim ()
  "Return embedding dimension of loaded native model (0 if not loaded)."
  (if *native-model*
      (ne *native-model*)
      0))
#+end_src

** Cosine similarity helper

Used in tests and embedding comparisons.

#+begin_src lisp
(defun vector-cosine-similarity (a b)
  "Cosine similarity between two vectors of reals, as a double-float.
Iterates over the length of A, so B must be at least as long as A.
Returns 0.0d0 when either vector has zero norm."
  (let ((dot 0.0d0)
        (anorm 0.0d0)
        (bnorm 0.0d0))
    (dotimes (i (length a))
      (let ((af (float (aref a i) 0.0d0))
            (bf (float (aref b i) 0.0d0)))
        (incf dot (* af bf))
        (incf anorm (* af af))
        (incf bnorm (* bf bf))))
    (if (or (zerop anorm) (zerop bnorm))
        0.0d0
        (/ dot (sqrt (* anorm bnorm))))))
#+end_src

* Contract

1. (embedding-backend-native text): computes a 768-dim single-float embedding vector via llama.cpp. Returns a specialized single-float vector (=(simple-array single-float (*))=). Requires the model file at ~*native-model-path*~ and the C wrapper library at ~/usr/local/lib/libllama_wrap.so~.
2. (embedding-native-load-model): loads the GGUF model file and creates an inference context. Caches globally in ~*native-model*~ / ~*native-context*~ — subsequent calls are no-ops. Calls ~sb-int:set-floating-point-modes~ and ~llama_backend_init~.
3. (embedding-native-unload): releases native model and context memory. Sets cached globals to nil.
4. (embedding-native-get-dim): returns the embedding dimension of the loaded model (768 for nomic-embed-text-v1.5), or 0 if not loaded.
* Test Suite

#+begin_src lisp
(eval-when (:compile-toplevel :load-toplevel :execute)
  (ql:quickload :fiveam :silent t))

(defpackage :passepartout-embedding-native-tests
  (:use :cl :fiveam :passepartout)
  (:export #:embedding-native-suite))

(in-package :passepartout-embedding-native-tests)

(def-suite embedding-native-suite
  :description "Verification of Native Embedding Inference")

(in-suite embedding-native-suite)

(test test-native-embedding-available
  "Contract v0.4.1: backend function exists and model file is present."
  (is (fboundp 'passepartout::embedding-backend-native))
  (is (uiop:file-exists-p passepartout::*native-model-path*)))

(test test-native-embedding-loads
  "Contract v0.4.1: model loads and produces a valid context."
  (finishes (passepartout::embedding-native-load-model)))

(test test-native-embedding-dimensions
  "Contract v0.4.1: embedding produces correct-dimensional vector."
  (let ((vec (passepartout::embedding-backend-native "test sentence")))
    (is (vectorp vec))
    (is (= (length vec) 768))
    (is (typep (aref vec 0) 'single-float))))

(test test-native-embedding-identical
  "Contract v0.4.1: identical texts produce identical embeddings."
  (let ((v1 (passepartout::embedding-backend-native "hello world"))
        (v2 (passepartout::embedding-backend-native "hello world")))
    (is (= (length v1) (length v2)))
    (let ((sim (passepartout::vector-cosine-similarity v1 v2)))
      (is (> sim 0.9999)))))

(test test-native-embedding-similar
  "Contract v0.4.1: semantically similar texts are closer than unrelated."
  (let ((v-auth (passepartout::embedding-backend-native "implement user login form"))
        (v-related (passepartout::embedding-backend-native "add password authentication"))
        (v-unrelated (passepartout::embedding-backend-native "banana fruit yellow")))
    (let ((sim-related (passepartout::vector-cosine-similarity v-auth v-related))
          (sim-unrelated (passepartout::vector-cosine-similarity v-auth v-unrelated)))
      (is (> sim-related 0.5))
      (is (> sim-related sim-unrelated)))))
#+end_src

* C Wrapper Source

The C wrapper bridges CFFI's pointer-only interface to llama.cpp's struct-by-value API.

Compile with: =gcc -shared -fPIC -I/tmp/llama.cpp/include -o libllama_wrap.so llama_wrap.c -L/usr/local/lib -lllama=

#+begin_src c :tangle ../scripts/llama_wrap.c
// C wrapper for llama.cpp — bridges CFFI pointer params to struct-by-value
// Compile: gcc -shared -fPIC -I/tmp/llama.cpp/include -o libllama_wrap.so llama_wrap.c -L/usr/local/lib -lllama

#include <stdint.h>

// Declares the llama_* types and functions used below; found through the
// -I include path given on the compile line.
#include "llama.h"

// Load a model from `path`, passing the pointed-to params by value.
struct llama_model * llama_wrap_model_load(const char * path, struct llama_model_params * params) {
    return llama_model_load_from_file(path, *params);
}

// Create a context for `model`, passing the pointed-to params by value.
struct llama_context * llama_wrap_new_context(struct llama_model * model, struct llama_context_params * params) {
    return llama_init_from_model(model, *params);
}

// Run llama_encode on the pointed-to batch and return its status code.
int32_t llama_wrap_encode(struct llama_context * ctx, struct llama_batch * batch) {
    return llama_encode(ctx, *batch);
}

// Initialize a batch and store the returned struct into caller-owned `*batch`.
void llama_wrap_batch_init(struct llama_batch * batch, int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
    *batch = llama_batch_init(n_tokens, embd, n_seq_max);
}

// Free the arrays owned by the pointed-to batch.
void llama_wrap_batch_free(struct llama_batch * batch) {
    llama_batch_free(*batch);
}
#+end_src