(in-package :passepartout) (cffi:define-foreign-library libllama (:unix "/usr/local/lib/libllama.so")) (cffi:use-foreign-library libllama) (cffi:defctype llama-model-p :pointer) (cffi:defctype llama-context-p :pointer) (cffi:defctype llama-seq-id :int32) (cffi:defctype llama-token :int32) (cffi:defctype llama-pos :int32) (cffi:defcstruct (llama-model-params :class llama-model-params-type) (n-gpu-layers :int32)) (cffi:defcstruct (llama-context-params :class llama-context-params-type) (n-ctx :uint32) (n-batch :uint32) (n-ubatch :uint32) (n-seq-max :uint32) (n-threads :int32) (embeddings :bool)) (cffi:defcstruct (llama-batch :class llama-batch-type) (n-tokens :int32) (token :pointer) (embd :pointer) (pos :pointer) (n-seq-id :pointer) (seq-id :pointer) (logits :pointer)) (cffi:defcfun ("llama_model_default_params" %llama-model-default-params) (:struct llama-model-params)) (cffi:defcfun ("llama_context_default_params" %llama-context-default-params) (:struct llama-context-params)) (cffi:defcfun ("llama_model_load" %llama-model-load) llama-model-p (path-model :string) (params (:struct llama-model-params))) (cffi:defcfun ("llama_new_context_with_model" %llama-new-context-with-model) llama-context-p (model llama-model-p) (params (:struct llama-context-params))) (cffi:defcfun ("llama_free_model" %llama-free-model) :void (model llama-model-p)) (cffi:defcfun ("llama_free" %llama-free) :void (ctx llama-context-p)) (cffi:defcfun ("llama_n_embd" %llama-n-embd) :int32 (model llama-model-p)) (cffi:defcfun ("llama_n_vocab" %llama-n-vocab) :int32 (model llama-model-p)) (cffi:defcfun ("llama_tokenize" %llama-tokenize) :int32 (model llama-model-p) (text :string) (text-len :int32) (tokens :pointer) (n-max-tokens :int32) (add-special :bool) (parse-special :bool)) (cffi:defcfun ("llama_encode" %llama-encode) :int32 (ctx llama-context-p) (batch (:struct llama-batch))) (cffi:defcfun ("llama_get_embeddings_ith" %llama-get-embeddings-ith) :pointer (ctx llama-context-p) (i :int32)) (cffi:defcfun ("llama_batch_init" %llama-batch-init) (:struct llama-batch) (n-tokens :int32) (embd :int32) (n-seq-max :int32)) (cffi:defcfun ("llama_batch_free" %llama-batch-free) :void (batch (:struct llama-batch))) (defvar *native-model* nil "Cached llama.cpp model for embedding inference.") (defvar *native-context* nil "Cached llama.cpp context for embedding inference.") (defvar *native-model-path* (merge-pathnames ".local/share/passepartout/models/nomic-embed-text-v1.5.Q4_K_M.gguf" (user-homedir-pathname)) "Path to the bundled embedding model GGUF file.") (defun embedding-native-load-model () "Load the embedding model and create a context. Caches globally." (unless (and *native-model* *native-context*) (unless (uiop:file-exists-p *native-model-path*) (error "Native embedding model not found at ~a" *native-model-path*)) (let ((mparams (%llama-model-default-params))) (setf (cffi:foreign-slot-value mparams '(:struct llama-model-params) 'n-gpu-layers) 0) (setf *native-model* (%llama-model-load (namestring *native-model-path*) mparams))) (let* ((cparams (%llama-context-default-params))) (setf (cffi:foreign-slot-value cparams '(:struct llama-context-params) 'n-ctx) 512 (cffi:foreign-slot-value cparams '(:struct llama-context-params) 'n-batch) 512 (cffi:foreign-slot-value cparams '(:struct llama-context-params) 'n-ubatch) 512 (cffi:foreign-slot-value cparams '(:struct llama-context-params) 'n-seq-max) 1 (cffi:foreign-slot-value cparams '(:struct llama-context-params) 'n-threads) 2 (cffi:foreign-slot-value cparams '(:struct llama-context-params) 'embeddings) 1) (setf *native-context* (%llama-new-context-with-model *native-model* cparams))) (log-message "EMBEDDING: Native model loaded (~d-dim)" (%llama-n-embd *native-model*))) (values *native-model* *native-context*)) (defun embedding-native-get-dim () "Return the embedding dimension of the native model." (embedding-native-load-model) (%llama-n-embd *native-model*)) (defun embedding-backend-native (text) "Compute an embedding vector using the native llama.cpp backend. Returns a single-float vector of dimension n_embd." (let* ((text-len (length text)) (max-tokens 256) (tokens (cffi:foreign-alloc :int32 :count max-tokens)) (n-tokens 0)) (unwind-protect (progn (embedding-native-load-model) (setf n-tokens (%llama-tokenize *native-model* text text-len tokens max-tokens t t)) (when (zerop n-tokens) (error "Native embedding: tokenization returned 0 tokens")) (let* ((batch (%llama-batch-init n-tokens 0 1)) (n-embd (embedding-native-get-dim)) (result (make-array n-embd :element-type 'single-float :initial-element 0.0)) (seq-id-ptr (cffi:foreign-alloc :int32 :count 1))) (setf (cffi:mem-aref seq-id-ptr :int32 0) 0) (unwind-protect (progn (dotimes (i n-tokens) (setf (cffi:mem-aref (cffi:foreign-slot-value batch '(:struct llama-batch) 'token) :int32 i) (cffi:mem-aref tokens :int32 i)) (setf (cffi:mem-aref (cffi:foreign-slot-value batch '(:struct llama-batch) 'pos) :int32 i) i) (setf (cffi:mem-aref (cffi:foreign-slot-value batch '(:struct llama-batch) 'n-seq-id) :int32 i) 1) (setf (cffi:mem-aref (cffi:foreign-slot-value batch '(:struct llama-batch) 'seq-id) :pointer i) seq-id-ptr)) (let ((encode-result (%llama-encode *native-context* batch))) (when (not (zerop encode-result)) (error "Native embedding: encode returned ~d" encode-result))) (let ((embd-ptr (%llama-get-embeddings-ith *native-context* (1- n-tokens)))) (dotimes (i n-embd) (setf (aref result i) (cffi:mem-aref embd-ptr :float i))))) (%llama-batch-free batch) (cffi:foreign-free seq-id-ptr)) result)) (cffi:foreign-free tokens)))) (defun embedding-backend-native-unload () "Release native model and context memory." (when *native-context* (%llama-free *native-context*) (setf *native-context* nil)) (when *native-model* (%llama-free-model *native-model*) (setf *native-model* nil)) (values)) (pushnew (lambda () (embedding-backend-native-unload)) sb-ext:*exit-hooks*) (eval-when (:compile-toplevel :load-toplevel :execute) (ql:quickload :fiveam :silent t)) (defpackage :passepartout-embedding-native-tests (:use :cl :fiveam :passepartout) (:export #:embedding-native-suite)) (in-package :passepartout-embedding-native-tests) (def-suite embedding-native-suite :description "Verification of Native Embedding Inference") (in-suite embedding-native-suite) (test test-native-embedding-available "Contract v0.4.1: backend function exists and model file is present." (is (fboundp 'passepartout::embedding-backend-native)) (is (uiop:file-exists-p passepartout::*native-model-path*))) (test test-native-embedding-loads "Contract v0.4.1: model loads and produces a valid context." (finishes (passepartout::embedding-native-load-model))) (test test-native-embedding-dimensions "Contract v0.4.1: embedding produces correct-dimensional vector." (let ((vec (passepartout::embedding-backend-native "test sentence"))) (is (vectorp vec)) (is (= (length vec) 768)) (is (typep (aref vec 0) 'single-float)))) (test test-native-embedding-identical "Contract v0.4.1: identical texts produce identical embeddings." (let ((v1 (passepartout::embedding-backend-native "hello world")) (v2 (passepartout::embedding-backend-native "hello world"))) (is (= (length v1) (length v2))) (let ((sim (passepartout::vector-cosine-similarity v1 v2))) (is (> sim 0.9999))))) (test test-native-embedding-similar "Contract v0.4.1: semantically similar texts are closer than unrelated." (let ((v-auth (passepartout::embedding-backend-native "implement user login form")) (v-related (passepartout::embedding-backend-native "add password authentication")) (v-unrelated (passepartout::embedding-backend-native "banana fruit yellow")) (sim-related (passepartout::vector-cosine-similarity v-auth v-related)) (sim-unrelated (passepartout::vector-cosine-similarity v-auth v-unrelated))) (is (> sim-related 0.5)) (is (> sim-related sim-unrelated))))