;;;; Native embedding backend (llama.cpp via CFFI) -- change summary
;;;;
;;;; - Native backend returns 768-dim vectors via llama.cpp / C wrapper
;;;;   (/usr/local/lib/libllama_wrap.so).
;;;; - Wired :native into embed-object dispatch and exported from the
;;;;   passepartout package.
;;;; - Model preloads at daemon startup with EMBEDDING_PROVIDER=native (~30s).
;;;; - Lazy loading via *embedding-backend* :native also works
;;;;   (first call ~45s).
;;;; - C wrapper bridges CFFI pointer params to the llama.cpp
;;;;   struct-by-value API.
;;;; - Correct struct layouts: llama_model_params (72B),
;;;;   llama_context_params (136B), llama_batch (56B).
;;;; - BERT pooling: llama_get_embeddings_seq; llama_tokenize takes a
;;;;   vocab*, not a model*.
;;;; - FiveAM tests pass: dimensions, self-similarity, semantic ranking.
;;;; - Fixed pre-existing HITL crash: boundp guard for *hitl-pending* in
;;;;   core-loop-act.
;;;; - Lazy-load guard prevents double-load of the native file in
;;;;   embedding-native-ensure-loaded.
;;;; - ROADMAP: v0.4.0 items marked DONE; v0.4.1 native embedding updated
;;;;   with the actual implementation.
;;;;
;;;; File: 229 lines, 9.9 KiB, Common Lisp.
;; Make sure the PASSEPARTOUT package exists before switching into it.
;; The EVAL-WHEN matters: without it the package is only created at load
;; time, so COMPILE-FILE in an image where the package does not exist yet
;; would fail on the IN-PACKAGE form below.
(eval-when (:compile-toplevel :load-toplevel :execute)
  (unless (find-package :passepartout)
    (make-package :passepartout :use '(:cl))))

(in-package :passepartout)
;; Shared objects used by this file.  The wrapper library is defined and
;; loaded first, then llama.cpp itself; both are looked up at fixed
;; absolute paths on Unix.
(cffi:define-foreign-library libllama_wrap
  (:unix "/usr/local/lib/libllama_wrap.so"))

(cffi:use-foreign-library libllama_wrap)

(cffi:define-foreign-library libllama
  (:unix "/usr/local/lib/libllama.so"))

(cffi:use-foreign-library libllama)
;; Lisp-side mirror of the model-parameters struct handed to the loader.
;; The total size is pinned to 72 bytes; the _PAD slots are explicit
;; padding and are never read or written by this file.
;; NOTE(review): the layout must match the installed llama.cpp build --
;; confirm against its llama.h whenever the library is upgraded.
(cffi:defcstruct (llama-mparams :size 72)
  (devices       :pointer)
  (tensor-buft   :pointer)
  (n-gpu-layers  :int32)
  (split-mode    :int32)
  (main-gpu      :int32)
  (_pad1         :int32)
  (tensor-split  :pointer)
  (progress-cb   :pointer)
  (progress-data :pointer)
  (kv-overrides  :pointer)
  (vocab-only    :bool)
  (use-mmap      :bool)
  (_pad2         :uint8 :count 6))
;; Lisp-side mirror of the context-parameters struct, pinned to 136 bytes.
;; Only N-CTX, N-BATCH, N-UBATCH, N-SEQ-MAX, N-THREADS and EMBEDDINGS are
;; written by this file; every other slot keeps whatever the defaults
;; call stored there.
;; NOTE(review): the layout must match the installed llama.cpp build --
;; confirm against its llama.h whenever the library is upgraded.
(cffi:defcstruct (llama-cparams :size 136)
  ;; Sizes and threading
  (n-ctx               :uint32)
  (n-batch             :uint32)
  (n-ubatch            :uint32)
  (n-seq-max           :uint32)
  (n-threads           :int32)
  (n-threads-batch     :int32)
  ;; Enum-valued settings
  (rope-scaling-type   :int32)
  (pooling-type        :int32)
  (attention-type      :int32)
  (flash-attn-type     :int32)
  ;; RoPE / YaRN tuning
  (rope-freq-base      :float)
  (rope-freq-scale     :float)
  (yarn-ext-factor     :float)
  (yarn-attn-factor    :float)
  (yarn-beta-fast      :float)
  (yarn-beta-slow      :float)
  (yarn-orig-ctx       :uint32)
  (defrag-thold        :float)
  ;; Callbacks and cache types
  (cb-eval             :pointer)
  (cb-eval-user-data   :pointer)
  (type-k              :int32)
  (type-v              :int32)
  (abort-callback      :pointer)
  (abort-callback-data :pointer)
  ;; Boolean switches
  (embeddings          :bool)
  (offload-kqv         :bool)
  (no-perf             :bool)
  (op-offload          :bool)
  (swa-full            :bool)
  (kv-unified          :bool)
  ;; Explicit tail padding
  (_c-pad3             :uint8 :count 15))
;; Lisp-side mirror of the batch struct, pinned to 56 bytes.  The pointer
;; slots are treated by this file as per-token arrays that the wrapper's
;; batch-init call allocates and its batch-free call releases.
(cffi:defcstruct (llama-batch :size 56)
  (n-tokens :int32)
  (_bpad1   :int32)
  (token    :pointer)
  (embd     :pointer)
  (pos      :pointer)
  (n-seq-id :pointer)
  (seq-id   :pointer)
  (logits   :pointer))
;; llama.cpp public API
;;
;; Short Lisp names are used throughout this file; the foreign symbol each
;; one binds to is the string in the name spec.

;; Backend initialisation.
(cffi:defcfun ("llama_backend_init" bl) :void)

;; Default-parameter fillers.  Both are declared as void functions that
;; write into a caller-supplied struct pointer.
;; NOTE(review): presumably this relies on how the platform ABI returns
;; large structs -- confirm before porting to another platform.
(cffi:defcfun ("llama_model_default_params" mdp) :void
  (p :pointer))

(cffi:defcfun ("llama_context_default_params" cdp) :void
  (p :pointer))

;; Model and vocabulary queries.
(cffi:defcfun ("llama_model_n_embd" ne) :int32
  (m :pointer))

(cffi:defcfun ("llama_model_get_vocab" gv) :pointer
  (m :pointer))

(cffi:defcfun ("llama_vocab_n_tokens" vnt) :int32
  (vocab :pointer))

;; Tokenizer.  Takes the vocab handle, not the model handle.
(cffi:defcfun ("llama_tokenize" tok) :int32
  (vocab :pointer)
  (text :string)
  (len :int32)
  (tokens :pointer)
  (n-max :int32)
  (add-special :bool)
  (parse-special :bool))

;; Embedding readout and pooling query.
(cffi:defcfun ("llama_get_embeddings_ith" embd-ith) :pointer
  (ctx :pointer)
  (i :int32))

(cffi:defcfun ("llama_get_embeddings_seq" embd-seq) :pointer
  (ctx :pointer)
  (seq-id :int32))

(cffi:defcfun ("llama_pooling_type" get-pooling) :int32
  (ctx :pointer))

;; Teardown.
(cffi:defcfun ("llama_model_free" fm) :void
  (m :pointer))

(cffi:defcfun ("llama_free" fc) :void
  (ctx :pointer))
;; C wrapper (bridges struct-by-value ABI)
;;
;; Every entry point here takes its struct argument through a pointer.

(cffi:defcfun ("llama_wrap_model_load" wrap-load) :pointer
  (path :string)
  (params :pointer))

(cffi:defcfun ("llama_wrap_new_context" wrap-ctx) :pointer
  (model :pointer)
  (params :pointer))

(cffi:defcfun ("llama_wrap_encode" wrap-encode) :int32
  (ctx :pointer)
  (batch :pointer))

(cffi:defcfun ("llama_wrap_batch_init" wrap-batch-init) :void
  (batch :pointer)
  (n-tokens :int32)
  (embd :int32)
  (n-seq-max :int32))

(cffi:defcfun ("llama_wrap_batch_free" wrap-batch-free) :void
  (batch :pointer))
;; Process-wide state for the native backend.  The three handles start as
;; NIL and are filled in by EMBEDDING-NATIVE-LOAD-MODEL.

(defvar *native-model* nil
  "Foreign handle of the loaded llama.cpp embedding model, or NIL.")

(defvar *native-context* nil
  "Foreign handle of the llama.cpp inference context, or NIL.")

(defvar *native-vocab* nil
  "Foreign handle of the vocabulary obtained from the loaded model, or NIL.")

(defvar *native-model-path*
  (merge-pathnames
   ".local/share/passepartout/models/nomic-embed-text-v1.5.Q4_K_M.gguf"
   (user-homedir-pathname))
  "Location of the embedding model GGUF file, relative to the user's home directory.")
(defun embedding-native-load-model ()
  "Load the embedding model and create an inference context, caching both.

Does nothing when a model and context are already cached.  Otherwise the
GGUF file at *NATIVE-MODEL-PATH* is loaded CPU-only (no GPU layers, no
mmap) and a 512-token, single-sequence embedding context is created.

Returns three values: the model handle, the context handle and the vocab
handle.  Signals an ERROR when the model file is missing, when llama.cpp
fails to load the model, or when the context cannot be created.  The
global handles are only assigned once both steps have succeeded, so a
failed attempt never leaves NULL pointers cached."
  (unless (and *native-model* *native-context*)
    (unless (uiop:file-exists-p *native-model-path*)
      (error "Native embedding model not found at ~a" *native-model-path*))
    ;; Disable floating-point traps before calling into foreign code.
    (sb-int:set-floating-point-modes :traps '())
    (bl)
    (let ((model
            (cffi:with-foreign-object (mp 'llama-mparams)
              (mdp mp)
              (setf (cffi:foreign-slot-value mp 'llama-mparams 'n-gpu-layers) 0)
              ;; :BOOL slots take Lisp booleans.  The integer 0 is a true
              ;; value in Lisp and would be stored as 1, so NIL is required
              ;; to actually turn mmap off.
              (setf (cffi:foreign-slot-value mp 'llama-mparams 'use-mmap) nil)
              (wrap-load (namestring *native-model-path*) mp))))
      ;; A failed load comes back as a NULL pointer, which is a non-NIL
      ;; Lisp object; caching it would make every later call skip loading.
      (when (cffi:null-pointer-p model)
        (error "Native embedding: failed to load model from ~a"
               *native-model-path*))
      (let ((context
              (cffi:with-foreign-object (cp 'llama-cparams)
                (cdp cp)
                (setf (cffi:foreign-slot-value cp 'llama-cparams 'n-ctx) 512)
                (setf (cffi:foreign-slot-value cp 'llama-cparams 'n-batch) 512)
                (setf (cffi:foreign-slot-value cp 'llama-cparams 'n-ubatch) 512)
                (setf (cffi:foreign-slot-value cp 'llama-cparams 'n-seq-max) 1)
                (setf (cffi:foreign-slot-value cp 'llama-cparams 'n-threads) 2)
                (setf (cffi:foreign-slot-value cp 'llama-cparams 'embeddings) t)
                (wrap-ctx model cp))))
        (when (cffi:null-pointer-p context)
          ;; Do not leak the model when the context cannot be created.
          (fm model)
          (error "Native embedding: failed to create context for ~a"
                 *native-model-path*))
        (setf *native-model* model
              *native-context* context
              *native-vocab* (gv model))
        (format *error-output* "~&;; EMBEDDING: Native model loaded (~d-dim)~%"
                (ne model)))))
  (values *native-model* *native-context* *native-vocab*))
(defun embedding-backend-native (text)
  "Compute an embedding vector for TEXT using the native llama.cpp backend.

TEXT must be a string.  The model is loaded on first use through
EMBEDDING-NATIVE-LOAD-MODEL.  Returns a freshly allocated
(SIMPLE-ARRAY SINGLE-FLOAT (*)) whose length is the model's n_embd
(typically 768).

Signals an ERROR when TEXT tokenizes to zero tokens, when it needs more
tokens than the fixed 256-token buffer can hold, when the encode call
returns non-zero, or when no embedding pointer is returned."
  (check-type text string)
  (embedding-native-load-model)
  (let* ((n-embd (ne *native-model*))
         (max-tokens 256)
         (tokens (cffi:foreign-alloc :int32 :count max-tokens)))
    (unwind-protect
         (let ((n-tok
                 ;; The tokenizer takes the text length in BYTES.  Passing
                 ;; the character count would truncate any text containing
                 ;; multi-byte UTF-8 characters.  N-BYTES includes the NUL
                 ;; terminator, hence the 1-.
                 (cffi:with-foreign-string ((utf8 n-bytes) text :encoding :utf-8)
                   (tok *native-vocab* utf8 (1- n-bytes) tokens max-tokens t t))))
           (cond
             ;; A negative result means the buffer was too small; its
             ;; magnitude is the number of tokens that were needed.
             ((minusp n-tok)
              (error "Native embedding: text needs ~d tokens but at most ~d are supported"
                     (- n-tok) max-tokens))
             ((zerop n-tok)
              (error "Native embedding: tokenization returned 0 tokens for ~s" text)))
           (let ((result (make-array n-embd :element-type 'single-float
                                            :initial-element 0.0f0)))
             (cffi:with-foreign-object (batch 'llama-batch)
               (wrap-batch-init batch n-tok 0 1)
               ;; Release the batch buffers even when encoding fails.
               (unwind-protect
                    (let ((token-ptr  (cffi:foreign-slot-value batch 'llama-batch 'token))
                          (pos-ptr    (cffi:foreign-slot-value batch 'llama-batch 'pos))
                          (nseq-ptr   (cffi:foreign-slot-value batch 'llama-batch 'n-seq-id))
                          (seqid-ptr  (cffi:foreign-slot-value batch 'llama-batch 'seq-id))
                          (logits-ptr (cffi:foreign-slot-value batch 'llama-batch 'logits)))
                      (setf (cffi:foreign-slot-value batch 'llama-batch 'n-tokens) n-tok)
                      ;; Every token sits at position I, belongs to the
                      ;; single sequence 0, and is flagged for output.
                      (dotimes (i n-tok)
                        (setf (cffi:mem-aref token-ptr :int32 i)
                              (cffi:mem-aref tokens :int32 i))
                        (setf (cffi:mem-aref pos-ptr :int32 i) i)
                        (setf (cffi:mem-aref nseq-ptr :int32 i) 1)
                        (setf (cffi:mem-aref (cffi:mem-aref seqid-ptr :pointer i) :int32 0) 0)
                        (setf (cffi:mem-aref logits-ptr :int8 i) 1))
                      (let ((enc (wrap-encode *native-context* batch)))
                        (unless (zerop enc)
                          (error "Native embedding: encode returned ~d" enc)))
                      ;; Pooling type 0: read the last token's embedding;
                      ;; otherwise read the pooled embedding of sequence 0.
                      (let* ((pooling (get-pooling *native-context*))
                             (eptr (if (= pooling 0)
                                       (embd-ith *native-context* (1- n-tok))
                                       (embd-seq *native-context* 0))))
                        ;; Guard against dereferencing a NULL result.
                        (when (cffi:null-pointer-p eptr)
                          (error "Native embedding: no embedding returned (pooling type ~d)"
                                 pooling))
                        (dotimes (i n-embd)
                          (setf (aref result i) (cffi:mem-aref eptr :float i)))))
                 (wrap-batch-free batch)))
             result))
      (cffi:foreign-free tokens))))
(defun embedding-native-unload ()
  "Free the cached native context and model, if any, and clear the caches.

The context is freed before the model.  Clearing the model also clears
the cached vocab handle.  Returns no values."
  (let ((ctx *native-context*))
    (when ctx
      (fc ctx)
      (setq *native-context* nil)))
  (let ((model *native-model*))
    (when model
      (fm model)
      (setq *native-model* nil
            *native-vocab* nil)))
  (values))
(defun embedding-native-get-dim ()
  "Return the embedding dimension reported by the cached native model.
Returns 0 when no model is cached."
  (let ((model *native-model*))
    (cond (model (ne model))
          (t 0))))
(defun vector-cosine-similarity (a b)
  "Return the cosine similarity of the numeric vectors A and B as a double-float.

Elements are widened to double-float before accumulating.  The index
range is taken from A.  Returns 0.0d0 when either vector has zero norm."
  (let ((dot-product 0.0d0)
        (norm-a-squared 0.0d0)
        (norm-b-squared 0.0d0))
    (loop for index from 0 below (length a)
          do (let ((x (float (aref a index) 0.0d0))
                   (y (float (aref b index) 0.0d0)))
               (setf dot-product (+ dot-product (* x y))
                     norm-a-squared (+ norm-a-squared (* x x))
                     norm-b-squared (+ norm-b-squared (* y y)))))
    (cond ((zerop norm-a-squared) 0.0d0)
          ((zerop norm-b-squared) 0.0d0)
          (t (/ dot-product (sqrt (* norm-a-squared norm-b-squared)))))))
;; FiveAM has to be present when the test package below is read, so it is
;; loaded at compile, load and execute time.
(eval-when (:compile-toplevel :load-toplevel :execute)
  (ql:quickload :fiveam :silent t))

;; Test package: uses FiveAM and PASSEPARTOUT and exports only the suite.
(defpackage :passepartout-embedding-native-tests
  (:use :cl :fiveam :passepartout)
  (:export #:embedding-native-suite))

(in-package :passepartout-embedding-native-tests)

(def-suite embedding-native-suite
  :description "Verification of Native Embedding Inference")

(in-suite embedding-native-suite)
(test test-native-embedding-available
  "Contract v0.4.1: the backend entry point is defined and the model file exists on disk."
  ;; Entry point is defined.
  (is (fboundp 'passepartout::embedding-backend-native))
  ;; Model file is present at the configured path.
  (is (uiop:file-exists-p passepartout::*native-model-path*)))
(test test-native-embedding-loads
  "Contract v0.4.1: loading the model and creating its context completes without signalling."
  (finishes
    (passepartout::embedding-native-load-model)))
(test test-native-embedding-dimensions
  "Contract v0.4.1: an embedding is a 768-element vector of single-floats."
  (let ((embedding (passepartout::embedding-backend-native "test sentence")))
    (is (vectorp embedding))
    (is (= (length embedding) 768))
    (is (typep (aref embedding 0) 'single-float))))
(test test-native-embedding-identical
  "Contract v0.4.1: embedding the same text twice yields matching vectors."
  (let* ((first-run (passepartout::embedding-backend-native "hello world"))
         (second-run (passepartout::embedding-backend-native "hello world")))
    (is (= (length first-run) (length second-run)))
    ;; Cosine similarity of the two runs must be effectively 1.
    (let ((sim (passepartout::vector-cosine-similarity first-run second-run)))
      (is (> sim 0.9999)))))
(test test-native-embedding-similar
  "Contract v0.4.1: related texts score higher than unrelated ones."
  (let* ((v-auth (passepartout::embedding-backend-native "implement user login form"))
         (v-related (passepartout::embedding-backend-native "add password authentication"))
         (v-unrelated (passepartout::embedding-backend-native "banana fruit yellow"))
         (sim-related (passepartout::vector-cosine-similarity v-auth v-related))
         (sim-unrelated (passepartout::vector-cosine-similarity v-auth v-unrelated)))
    ;; The related pair must clear an absolute floor ...
    (is (> sim-related 0.5))
    ;; ... and must also outrank the unrelated pair.
    (is (> sim-related sim-unrelated))))