---
# NOTE(review): the keys below look like a llama-swap configuration —
# confirm against the consuming tool's schema.

# Health-check timeout; presumably in seconds — TODO confirm.
healthCheckTimeout: 60

models:
  # Only a proxy target is given for this entry; no cmd is defined.
  kokoro-tts:
    proxy: http://kokoro-tts-server.llm.svc:8880
    checkEndpoint: /v1/audio/speech
    unlisted: true

  # cmd uses a folded scalar (>): the lines below are joined with single
  # spaces into one command line. Keep every line at the same indent and do
  # not put comments inside the block — they would become part of the value.
  Gemma-3-4B:
    cmd: >
      /app/llama-server
      -hf unsloth/gemma-3-4b-it-qat-GGUF
      -hff gemma-3-4b-it-qat-UD-Q6_K_XL.gguf
      --port ${PORT}
      --flash-attn
      --cache-type-k q8_0
      --cache-type-v q8_0
      --temp 1.0
      --top-k 64
      --min-p 0.00
      --top-p 0.95
      --repeat-penalty 1.0
      --gpu-layers 49

  Qwen3-Embedding-0.6B:
    # NOTE(review): "-ub 8912" may be a typo for 8192 — confirm before
    # changing; the value is left as found.
    cmd: >
      /app/llama-server
      -hf Qwen/Qwen3-Embedding-0.6B-GGUF:Q8_0
      --port ${PORT}
      --embedding
      --pooling last
      -ub 8912
      --gpu-layers 29

groups:
  embedding:
    persistent: true
    swap: true
    exclusive: false
    members:
      - Qwen3-Embedding-0.6B

# Commented-out llama-server flags, kept for reference:
# --no-kv-offload
# --ctx-size 16384