#+TITLE: SKILL: Native Embedding Inference (org-skill-embedding-native.org) #+AUTHOR: Agent #+FILETAGS: :skill:system:embedding:cffi: #+PROPERTY: header-args:lisp :tangle ../lisp/system-model-embedding-native.lisp * Architectural Intent ~system-model-embedding-native~ provides in-process embedding inference via CFFI binding to llama.cpp. Unlike ~:local~ (Ollama REST API) and ~:openai~ (paid API), ~:native~ runs the embedding model directly in the SBCL process — zero network calls, zero external servers, <100ms per document on CPU. The bundled model is ~nomic-embed-text-v1.5~ (nomic-bert, 768-dim, 12 layers) at ~~/.local/share/passepartout/models/nomic-embed-text-v1.5.Q4_K_M.gguf~. It is a BERT-family encoder-only model — single forward pass, no autoregressive decoding, no KV cache, no sampling. **Why this matters**: The trigram Jaccard fallback (v0.4.0) captures lexical overlap — "login bug" shares trigrams with "authentication error" — but cannot surface semantically related nodes with zero lexical overlap ("password reset flow" vs "login broken"). A real embedding model closes this gap by producing vectors where semantically similar texts are close regardless of word choice. The CFFI binding targets llama.cpp's public API: - ~llama_model_load~ / ~llama_free_model~ — model lifecycle - ~llama_new_context_with_model~ / ~llama_free~ — context lifecycle - ~llama_encode~ — single forward pass (encoder-only, no generation) - ~llama_get_embeddings_ith(ctx, i)~ — extract float vector at position i - ~llama_n_embd(model)~ — embedding dimension Memory: model and context are cached globally in ~*native-model*~ / ~*native-context*~ to avoid reloading on every embedding call. * Implementation ** Package #+begin_src lisp (in-package :passepartout) #+end_src ** CFFI: Load shared library #+begin_src lisp (cffi:define-foreign-library libllama (:unix "/usr/local/lib/libllama.so")) (cffi:use-foreign-library libllama) #+end_src ** CFFI: Types #+begin_src lisp (cffi:defctype llama-model-p :pointer) (cffi:defctype llama-context-p :pointer) (cffi:defctype llama-seq-id :int32) (cffi:defctype llama-token :int32) (cffi:defctype llama-pos :int32) (cffi:defcstruct (llama-model-params :class llama-model-params-type) (n-gpu-layers :int32)) (cffi:defcstruct (llama-context-params :class llama-context-params-type) (n-ctx :uint32) (n-batch :uint32) (n-ubatch :uint32) (n-seq-max :uint32) (n-threads :int32) (embeddings :bool)) (cffi:defcstruct (llama-batch :class llama-batch-type) (n-tokens :int32) (token :pointer) (embd :pointer) (pos :pointer) (n-seq-id :pointer) (seq-id :pointer) (logits :pointer)) #+end_src ** CFFI: Functions #+begin_src lisp (cffi:defcfun ("llama_model_default_params" %llama-model-default-params) :void (params :pointer)) (cffi:defcfun ("llama_context_default_params" %llama-context-default-params) :void (params :pointer)) (cffi:defcfun ("llama_model_load" %llama-model-load) llama-model-p (path-model :string) (params :pointer)) (cffi:defcfun ("llama_new_context_with_model" %llama-new-context-with-model) llama-context-p (model llama-model-p) (params :pointer)) (cffi:defcfun ("llama_free_model" %llama-free-model) :void (model llama-model-p)) (cffi:defcfun ("llama_free" %llama-free) :void (ctx llama-context-p)) (cffi:defcfun ("llama_n_embd" %llama-n-embd) :int32 (model llama-model-p)) (cffi:defcfun ("llama_n_vocab" %llama-n-vocab) :int32 (model llama-model-p)) (cffi:defcfun ("llama_tokenize" %llama-tokenize) :int32 (model llama-model-p) (text :string) (text-len :int32) (tokens :pointer) (n-max-tokens :int32) (add-special :bool) (parse-special :bool)) (cffi:defcfun ("llama_encode" %llama-encode) :int32 (ctx llama-context-p) (batch :pointer)) (cffi:defcfun ("llama_get_embeddings_ith" %llama-get-embeddings-ith) :pointer (ctx llama-context-p) (i :int32)) (cffi:defcfun ("llama_batch_init" %llama-batch-init) :void (batch :pointer) (n-tokens :int32) (embd :int32) (n-seq-max :int32)) (cffi:defcfun ("llama_batch_free" %llama-batch-free) :void (batch :pointer)) #+end_src ** Global state #+begin_src lisp (defvar *native-model* nil "Cached llama.cpp model for embedding inference.") (defvar *native-context* nil "Cached llama.cpp context for embedding inference.") (defvar *native-model-path* (merge-pathnames ".local/share/passepartout/models/nomic-embed-text-v1.5.Q4_K_M.gguf" (user-homedir-pathname)) "Path to the bundled embedding model GGUF file.") #+end_src ** Embedding Backend #+begin_src lisp (defun embedding-native-load-model () "Load the embedding model and create a context. Caches globally." (unless (and *native-model* *native-context*) (unless (uiop:file-exists-p *native-model-path*) (error "Native embedding model not found at ~a" *native-model-path*)) (cffi:with-foreign-object (mparams '(:struct llama-model-params)) (%llama-model-default-params mparams) (setf (cffi:foreign-slot-value mparams '(:struct llama-model-params) 'n-gpu-layers) 0) (setf *native-model* (%llama-model-load (namestring *native-model-path*) mparams))) (cffi:with-foreign-object (cparams '(:struct llama-context-params)) (%llama-context-default-params cparams) (setf (cffi:foreign-slot-value cparams '(:struct llama-context-params) 'n-ctx) 512 (cffi:foreign-slot-value cparams '(:struct llama-context-params) 'n-batch) 512 (cffi:foreign-slot-value cparams '(:struct llama-context-params) 'n-ubatch) 512 (cffi:foreign-slot-value cparams '(:struct llama-context-params) 'n-seq-max) 1 (cffi:foreign-slot-value cparams '(:struct llama-context-params) 'n-threads) 2 (cffi:foreign-slot-value cparams '(:struct llama-context-params) 'embeddings) 1) (setf *native-context* (%llama-new-context-with-model *native-model* cparams))) (log-message "EMBEDDING: Native model loaded (~d-dim)" (%llama-n-embd *native-model*))) (values *native-model* *native-context*)) (defun embedding-native-get-dim () "Return the embedding dimension of the native model." (embedding-native-load-model) (%llama-n-embd *native-model*)) (defun embedding-backend-native (text) "Compute an embedding vector using the native llama.cpp backend. Returns a single-float vector of dimension n_embd." (let* ((text-len (length text)) (max-tokens 256) (tokens (cffi:foreign-alloc :int32 :count max-tokens)) (n-tokens 0)) (unwind-protect (progn (embedding-native-load-model) (setf n-tokens (%llama-tokenize *native-model* text text-len tokens max-tokens t t)) (when (zerop n-tokens) (error "Native embedding: tokenization returned 0 tokens")) (let* ((batch (%llama-batch-init n-tokens 0 1)) (n-embd (embedding-native-get-dim)) (result (make-array n-embd :element-type 'single-float :initial-element 0.0)) (seq-id-ptr (cffi:foreign-alloc :int32 :count 1))) (setf (cffi:mem-aref seq-id-ptr :int32 0) 0) (unwind-protect (progn (dotimes (i n-tokens) (setf (cffi:mem-aref (cffi:foreign-slot-value batch '(:struct llama-batch) 'token) :int32 i) (cffi:mem-aref tokens :int32 i)) (setf (cffi:mem-aref (cffi:foreign-slot-value batch '(:struct llama-batch) 'pos) :int32 i) i) (setf (cffi:mem-aref (cffi:foreign-slot-value batch '(:struct llama-batch) 'n-seq-id) :int32 i) 1) (setf (cffi:mem-aref (cffi:foreign-slot-value batch '(:struct llama-batch) 'seq-id) :pointer i) seq-id-ptr)) (let ((encode-result (%llama-encode *native-context* batch))) (when (not (zerop encode-result)) (error "Native embedding: encode returned ~d" encode-result))) (let ((embd-ptr (%llama-get-embeddings-ith *native-context* (1- n-tokens)))) (dotimes (i n-embd) (setf (aref result i) (cffi:mem-aref embd-ptr :float i))))) (%llama-batch-free batch) (cffi:foreign-free seq-id-ptr)) result)) (cffi:foreign-free tokens)))) (defun embedding-backend-native-unload () "Release native model and context memory." (when *native-context* (%llama-free *native-context*) (setf *native-context* nil)) (when *native-model* (%llama-free-model *native-model*) (setf *native-model* nil)) (values)) (pushnew (lambda () (embedding-backend-native-unload)) sb-ext:*exit-hooks*) #+end_src * Test Suite #+begin_src lisp (eval-when (:compile-toplevel :load-toplevel :execute) (ql:quickload :fiveam :silent t)) (defpackage :passepartout-embedding-native-tests (:use :cl :fiveam :passepartout) (:export #:embedding-native-suite)) (in-package :passepartout-embedding-native-tests) (def-suite embedding-native-suite :description "Verification of Native Embedding Inference") (in-suite embedding-native-suite) (test test-native-embedding-available "Contract v0.4.1: backend function exists and model file is present." (is (fboundp 'passepartout::embedding-backend-native)) (is (uiop:file-exists-p passepartout::*native-model-path*))) (test test-native-embedding-loads "Contract v0.4.1: model loads and produces a valid context." (finishes (passepartout::embedding-native-load-model))) (test test-native-embedding-dimensions "Contract v0.4.1: embedding produces correct-dimensional vector." (let ((vec (passepartout::embedding-backend-native "test sentence"))) (is (vectorp vec)) (is (= (length vec) 768)) (is (typep (aref vec 0) 'single-float)))) (test test-native-embedding-identical "Contract v0.4.1: identical texts produce identical embeddings." (let ((v1 (passepartout::embedding-backend-native "hello world")) (v2 (passepartout::embedding-backend-native "hello world"))) (is (= (length v1) (length v2))) (let ((sim (passepartout::vector-cosine-similarity v1 v2))) (is (> sim 0.9999))))) (test test-native-embedding-similar "Contract v0.4.1: semantically similar texts are closer than unrelated." (let ((v-auth (passepartout::embedding-backend-native "implement user login form")) (v-related (passepartout::embedding-backend-native "add password authentication")) (v-unrelated (passepartout::embedding-backend-native "banana fruit yellow")) (sim-related (passepartout::vector-cosine-similarity v-auth v-related)) (sim-unrelated (passepartout::vector-cosine-similarity v-auth v-unrelated))) (is (> sim-related 0.5)) (is (> sim-related sim-unrelated)))) #+end_src