1
0
Fork 0

switch from ollama to llama-swap for better performance

This commit is contained in:
Massaki Archambault 2025-05-12 20:35:14 -04:00
parent cc777b02ca
commit d8fea232c0
10 changed files with 161 additions and 30 deletions

View File

@ -0,0 +1,24 @@
# llama-swap proxy configuration: routes OpenAI-style requests to model backends.
healthCheckTimeout: 60
models:
  # External TTS backend; proxied as-is and hidden from the model list.
  kokoro-tts:
    proxy: http://kokoro-tts-server.llm.svc:8880
    checkEndpoint: /v1/audio/speech
    unlisted: true
  # Local llama.cpp server spawned on demand by llama-swap.
  gemma-3-12b-it-qat:
    cmd: >
      /app/llama-server
      -hf unsloth/gemma-3-12b-it-qat-GGUF
      -hff gemma-3-12b-it-qat-UD-Q4_K_XL.gguf
      --port ${PORT}
      --no-mmap
      --flash-attn
      --cache-type-k q8_0
      --cache-type-v q8_0
      --temp 1.0 --top-k 64 --min-p 0.00 --top-p 0.95 --repeat-penalty 1.0
      --gpu-layers 49
    # Kept for reference; must stay outside the folded scalar above or they
    # would be folded into the command line as literal arguments.
    # --no-kv-offload
    # --ctx-size 16384

View File

@ -0,0 +1,29 @@
resources:
  - llama-swap-deployment.yaml
  - llama-swap-ingress.yaml
namePrefix: llama-swap-
commonLabels:
  app.kubernetes.io/name: llama-swap
configMapGenerator:
  # Holds overlay-level settings consumed by the replacements below.
  - name: kustomize-generated-config
    literals:
      - LLAMA_SWAP_EXTERNAL_HOST=openai.badjware.dev
      - LLAMA_SWAP_EXTERNAL_URL=https://openai.badjware.dev
  # Mounts the llama-swap model configuration into the pod.
  - name: llama-swap-config
    files:
      - config.yaml=configurations/config.yaml
replacements:
  # Injects the external hostname into the Ingress rule.
  - source:
      kind: ConfigMap
      name: kustomize-generated-config
      fieldPath: data.LLAMA_SWAP_EXTERNAL_HOST
    targets:
      - select:
          kind: Ingress
          name: server
        fieldPaths:
          - spec.rules.0.host

View File

@ -0,0 +1,78 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: server
spec:
  replicas: 1
  # Recreate: the model server pins a GPU and host port, so the old pod
  # must be gone before a replacement starts.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/component: server
  template:
    metadata:
      labels:
        app.kubernetes.io/component: server
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  # Image only supports amd64
                  - key: kubernetes.io/arch
                    operator: In
                    values:
                      - amd64
      priorityClassName: high-priority
      containers:
        - name: server
          image: ghcr.io/mostlygeek/llama-swap:vulkan
          imagePullPolicy: Always
          args: ["--config", "/config/config.yaml"]
          resources:
            requests:
              memory: 4Gi
              cpu: 1000m
            limits:
              memory: 4Gi
              # nvidia.com/gpu: "1"
              amd.com/gpu: "1"
          ports:
            - name: http
              hostPort: 8080
              containerPort: 8080
          volumeMounts:
            # Cache downloaded GGUF files across restarts.
            - name: server-data
              mountPath: /root/.cache/llama.cpp
            - name: llama-swap-config
              mountPath: /config
      volumes:
        - name: server-data
          hostPath:
            path: /var/lib/llama.cpp
            type: DirectoryOrCreate
        - name: llama-swap-config
          configMap:
            name: llama-swap-config
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: high-priority
value: 1000000
globalDefault: false
---
apiVersion: v1
kind: Service
metadata:
  name: server
  labels:
    app.kubernetes.io/component: server
spec:
  selector:
    app.kubernetes.io/component: server
  ports:
    - name: http
      port: 8080
      targetPort: http

View File

@ -0,0 +1,19 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: server
  labels:
    # NOTE(review): labels appear intended for Prometheus blackbox-probe
    # discovery rather than app identity — confirm this is deliberate.
    app.kubernetes.io/name: prometheus
    probe: blackbox-http
spec:
  rules:
    # Host is substituted by the kustomize replacement from
    # LLAMA_SWAP_EXTERNAL_HOST; the literal value below is a placeholder.
    - host: ${LLAMA_SWAP_EXTERNAL_HOST}
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: server
                port:
                  name: http

View File

@ -23,8 +23,10 @@ spec:
valueFrom: valueFrom:
fieldRef: fieldRef:
fieldPath: metadata.namespace fieldPath: metadata.namespace
- name: OLLAMA_BASE_URL - name: OPENAI_BASE_URL
value: http://ollama-server.$(NAMESPACE).svc:11434 value: http://llama-swap-server.$(NAMESPACE).svc:8080/v1
# - name: OLLAMA_BASE_URL
# value: http://ollama-server.$(NAMESPACE).svc:11434
- name: AUDIO_TTS_OPENAI_API_BASE_URL - name: AUDIO_TTS_OPENAI_API_BASE_URL
value: http://kokoro-tts-server.$(NAMESPACE).svc:8880/v1 value: http://kokoro-tts-server.$(NAMESPACE).svc:8880/v1
- name: AUDIO_TTS_OPENAI_API_KEY - name: AUDIO_TTS_OPENAI_API_KEY

View File

@ -43,6 +43,7 @@ spec:
cpu: 1000m cpu: 1000m
memory: 1Gi memory: 1Gi
limits: limits:
cpu: 4000m
memory: 2Gi memory: 2Gi
storage: storage:
volumeClaimTemplate: volumeClaimTemplate:

View File

@ -44,11 +44,13 @@ images:
newTag: "1.31" newTag: "1.31"
- name: ollama/ollama - name: ollama/ollama
newTag: 0.6.6 newTag: 0.6.6
# - name: ghcr.io/mostlygeek/llama-swap
# newTag: v110-vulkan-b5343
# newTag: 0.3.6-rocm # newTag: 0.3.6-rocm
# - name: ghcr.io/berriai/litellm # - name: ghcr.io/berriai/litellm
# newTag: main-v1.43.1 # newTag: main-v1.43.1
- name: ghcr.io/open-webui/open-webui - name: ghcr.io/open-webui/open-webui
newTag: v0.6.7 newTag: v0.6.9
- name: ghcr.io/sillytavern/sillytavern - name: ghcr.io/sillytavern/sillytavern
newTag: 1.12.13 newTag: 1.12.13

View File

@ -1,24 +0,0 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: api
spec:
rules:
- host: openai.badjware.dev
http:
paths:
- path: /v1/audio/speech
pathType: Prefix
backend:
service:
name: kokoro-tts-server
port:
name: http
- path: /v1
pathType: Prefix
backend:
service:
name: ollama-server
port:
name: http

View File

@ -1,12 +1,12 @@
resources: resources:
- namespace.yaml - namespace.yaml
- api-ingress.yaml # - ../../bases/ollama
- ../../bases/ollama
- ../../bases/kokoro-tts-gpu - ../../bases/kokoro-tts-gpu
# - ../../bases/openedai-speech # - ../../bases/openedai-speech
# - ../../bases/litellm # - ../../bases/litellm
- ../../bases/openwebui - ../../bases/openwebui
- ../../bases/sillytavern - ../../bases/sillytavern
- ../../bases/mikupad - ../../bases/mikupad
- ../../bases/llama-swap
namespace: llm namespace: llm

View File

@ -2,6 +2,6 @@ resources:
- ../../bases/longhorn - ../../bases/longhorn
- ../../bases/traefik - ../../bases/traefik
- ../../bases/external-secrets - ../../bases/external-secrets
# - ../../bases/k8s-device-plugin-amd - ../../bases/k8s-device-plugin-amd
- ../../bases/k8s-device-plugin-nvidia - ../../bases/k8s-device-plugin-nvidia
- clustersecretstore.yaml - clustersecretstore.yaml