llama-swap config: replace the Gemma-3-12B model entry with Gemma-3-4B,
add a Qwen3-Embedding-0.6B model entry, and add an `embedding` group whose
only member is that embedding model.

NOTE(review): this patch was recovered from a copy whose line breaks had been
collapsed. The YAML indentation and the grouping of llama-server flags per
line are reconstructed to match the hunk header counts (-6,18 +6,34) and
should be confirmed against the real file before applying.
NOTE(review): `-ub 8912` is kept as found; it may be a transposition of
8192 -- confirm the intended micro-batch size.

diff --git a/kustomize/bases/llama-swap/configurations/config.yaml b/kustomize/bases/llama-swap/configurations/config.yaml
index a309c63..d361065 100644
--- a/kustomize/bases/llama-swap/configurations/config.yaml
+++ b/kustomize/bases/llama-swap/configurations/config.yaml
@@ -6,18 +6,34 @@ models:
     checkEndpoint: /v1/audio/speech
     unlisted: true
 
-  Gemma-3-12B:
+  Gemma-3-4B:
     cmd: >
       /app/llama-server
-      -hf unsloth/gemma-3-12b-it-qat-GGUF
-      -hff gemma-3-12b-it-qat-UD-Q4_K_XL.gguf
+      -hf unsloth/gemma-3-4b-it-qat-GGUF
+      -hff gemma-3-4b-it-qat-UD-Q6_K_XL.gguf
       --port ${PORT}
       --flash-attn
       --cache-type-k q8_0
       --cache-type-v q8_0
       --temp 1.0 --top-k 64 --min-p 0.00 --top-p 0.95
       --repeat-penalty 1.0
       --gpu-layers 49
-
+  Qwen3-Embedding-0.6B:
+    cmd: >
+      /app/llama-server
+      -hf Qwen/Qwen3-Embedding-0.6B-GGUF:Q8_0
+      --port ${PORT}
+      --embedding
+      --pooling last
+      -ub 8912
+      --gpu-layers 29
+
+groups:
+  embedding:
+    persistent: true
+    swap: true
+    exclusive: false
+    members:
+      - Qwen3-Embedding-0.6B
 # --no-kv-offload
-# --ctx-size 16384
\ No newline at end of file
+# --ctx-size 16384