From d8fea232c0b3d88f9c847e8ecf713db4387b7184 Mon Sep 17 00:00:00 2001 From: Massaki Archambault Date: Mon, 12 May 2025 20:35:14 -0400 Subject: [PATCH] switch from ollama to llama-swap for better performance --- .../llama-swap/configurations/config.yaml | 24 ++++++ kustomize/bases/llama-swap/kustomization.yaml | 29 +++++++ .../llama-swap/llama-swap-deployment.yaml | 78 +++++++++++++++++++ .../bases/llama-swap/llama-swap-ingress.yaml | 19 +++++ .../bases/openwebui/openwebui-deployment.yaml | 6 +- kustomize/bases/prometheus/prometheus.yaml | 1 + kustomize/env/prod/kustomization.yaml | 4 +- kustomize/overlays/llm/api-ingress.yaml | 24 ------ kustomize/overlays/llm/kustomization.yaml | 4 +- kustomize/overlays/system/kustomization.yaml | 2 +- 10 files changed, 161 insertions(+), 30 deletions(-) create mode 100644 kustomize/bases/llama-swap/configurations/config.yaml create mode 100644 kustomize/bases/llama-swap/kustomization.yaml create mode 100644 kustomize/bases/llama-swap/llama-swap-deployment.yaml create mode 100644 kustomize/bases/llama-swap/llama-swap-ingress.yaml delete mode 100644 kustomize/overlays/llm/api-ingress.yaml diff --git a/kustomize/bases/llama-swap/configurations/config.yaml b/kustomize/bases/llama-swap/configurations/config.yaml new file mode 100644 index 0000000..3fa5b2d --- /dev/null +++ b/kustomize/bases/llama-swap/configurations/config.yaml @@ -0,0 +1,24 @@ +healthCheckTimeout: 60 + +models: + kokoro-tts: + proxy: http://kokoro-tts-server.llm.svc:8880 + checkEndpoint: /v1/audio/speech + unlisted: true + + gemma-3-12b-it-qat: + cmd: > + /app/llama-server + -hf unsloth/gemma-3-12b-it-qat-GGUF + -hff gemma-3-12b-it-qat-UD-Q4_K_XL.gguf + --port ${PORT} + --no-mmap + --flash-attn + --cache-type-k q8_0 + --cache-type-v q8_0 + --temp 1.0 --top-k 64 --min-p 0.00 --top-p 0.95 --repeat-penalty 1.0 + --gpu-layers 49 + + +# --no-kv-offload +# --ctx-size 16384 \ No newline at end of file diff --git a/kustomize/bases/llama-swap/kustomization.yaml b/kustomize/bases/llama-swap/kustomization.yaml new file mode 100644 index 0000000..ae46b5d --- /dev/null +++ b/kustomize/bases/llama-swap/kustomization.yaml @@ -0,0 +1,29 @@ +resources: + - llama-swap-deployment.yaml + - llama-swap-ingress.yaml + +namePrefix: llama-swap- + +commonLabels: + app.kubernetes.io/name: llama-swap + +configMapGenerator: + - name: kustomize-generated-config + literals: + - LLAMA_SWAP_EXTERNAL_HOST=openai.badjware.dev + - LLAMA_SWAP_EXTERNAL_URL=https://openai.badjware.dev + - name: llama-swap-config + files: + - config.yaml=configurations/config.yaml + +replacements: + - source: + kind: ConfigMap + name: kustomize-generated-config + fieldPath: data.LLAMA_SWAP_EXTERNAL_HOST + targets: + - select: + kind: Ingress + name: server + fieldPaths: + - spec.rules.0.host \ No newline at end of file diff --git a/kustomize/bases/llama-swap/llama-swap-deployment.yaml b/kustomize/bases/llama-swap/llama-swap-deployment.yaml new file mode 100644 index 0000000..9a0f9c9 --- /dev/null +++ b/kustomize/bases/llama-swap/llama-swap-deployment.yaml @@ -0,0 +1,78 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: server +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app.kubernetes.io/component: server + template: + metadata: + labels: + app.kubernetes.io/component: server + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + # Image only supports amd64 + - key: kubernetes.io/arch + operator: In + values: + - amd64 + priorityClassName: high-priority + containers: + - name: server + image: ghcr.io/mostlygeek/llama-swap:vulkan + imagePullPolicy: Always + args: ["--config", "/config/config.yaml"] + resources: + requests: + memory: 4Gi + cpu: 1000m + limits: + memory: 4Gi + # nvidia.com/gpu: "1" + amd.com/gpu: "1" + ports: + - name: http + hostPort: 8080 + containerPort: 8080 + volumeMounts: + - name: server-data + mountPath: /root/.cache/llama.cpp + - name: llama-swap-config + mountPath: /config + volumes: + - name: server-data + hostPath: + path: /var/lib/llama.cpp + type: DirectoryOrCreate + - name: llama-swap-config + configMap: + name: llama-swap-config +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: high-priority +value: 1000000 +globalDefault: false +--- +apiVersion: v1 +kind: Service +metadata: + name: server + labels: + app.kubernetes.io/component: server +spec: + selector: + app.kubernetes.io/component: server + ports: + - name: http + port: 8080 + targetPort: http \ No newline at end of file diff --git a/kustomize/bases/llama-swap/llama-swap-ingress.yaml b/kustomize/bases/llama-swap/llama-swap-ingress.yaml new file mode 100644 index 0000000..fc75db9 --- /dev/null +++ b/kustomize/bases/llama-swap/llama-swap-ingress.yaml @@ -0,0 +1,19 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: server + labels: + app.kubernetes.io/name: prometheus + probe: blackbox-http +spec: + rules: + - host: ${LLAMA_SWAP_EXTERNAL_HOST} + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: server + port: + name: http diff --git a/kustomize/bases/openwebui/openwebui-deployment.yaml b/kustomize/bases/openwebui/openwebui-deployment.yaml index f3aa4b7..fd8d2df 100644 --- a/kustomize/bases/openwebui/openwebui-deployment.yaml +++ b/kustomize/bases/openwebui/openwebui-deployment.yaml @@ -23,8 +23,10 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace - - name: OLLAMA_BASE_URL - value: http://ollama-server.$(NAMESPACE).svc:11434 + - name: OPENAI_BASE_URL + value: http://llama-swap-server.$(NAMESPACE).svc:8080/v1 + # - name: OLLAMA_BASE_URL + # value: http://ollama-server.$(NAMESPACE).svc:11434 - name: AUDIO_TTS_OPENAI_API_BASE_URL value: http://kokoro-tts-server.$(NAMESPACE).svc:8880/v1 - name: AUDIO_TTS_OPENAI_API_KEY diff --git a/kustomize/bases/prometheus/prometheus.yaml b/kustomize/bases/prometheus/prometheus.yaml index 95b8efd..c15153a 100644 --- a/kustomize/bases/prometheus/prometheus.yaml +++ b/kustomize/bases/prometheus/prometheus.yaml @@ -43,6 +43,7 @@ spec: cpu: 1000m memory: 1Gi limits: + cpu: 4000m memory: 2Gi storage: volumeClaimTemplate: diff --git a/kustomize/env/prod/kustomization.yaml b/kustomize/env/prod/kustomization.yaml index 98c5da4..0b37a5e 100644 --- a/kustomize/env/prod/kustomization.yaml +++ b/kustomize/env/prod/kustomization.yaml @@ -44,11 +44,13 @@ images: newTag: "1.31" - name: ollama/ollama newTag: 0.6.6 + # - name: ghcr.io/mostlygeek/llama-swap + # newTag: v110-vulkan-b5343 # newTag: 0.3.6-rocm # - name: ghcr.io/berriai/litellm # newTag: main-v1.43.1 - name: ghcr.io/open-webui/open-webui - newTag: v0.6.7 + newTag: v0.6.9 - name: ghcr.io/sillytavern/sillytavern newTag: 1.12.13 diff --git a/kustomize/overlays/llm/api-ingress.yaml b/kustomize/overlays/llm/api-ingress.yaml deleted file mode 100644 index 650670c..0000000 --- a/kustomize/overlays/llm/api-ingress.yaml +++ /dev/null @@ -1,24 +0,0 @@ - -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: api -spec: - rules: - - host: openai.badjware.dev - http: - paths: - - path: /v1/audio/speech - pathType: Prefix - backend: - service: - name: kokoro-tts-server - port: - name: http - - path: /v1 - pathType: Prefix - backend: - service: - name: ollama-server - port: - name: http diff --git a/kustomize/overlays/llm/kustomization.yaml b/kustomize/overlays/llm/kustomization.yaml index 563e94e..1501637 100644 --- a/kustomize/overlays/llm/kustomization.yaml +++ b/kustomize/overlays/llm/kustomization.yaml @@ -1,12 +1,12 @@ resources: - namespace.yaml - - api-ingress.yaml - - ../../bases/ollama + # - ../../bases/ollama - ../../bases/kokoro-tts-gpu # - ../../bases/openedai-speech # - ../../bases/litellm - ../../bases/openwebui - ../../bases/sillytavern - ../../bases/mikupad + - ../../bases/llama-swap namespace: llm diff --git a/kustomize/overlays/system/kustomization.yaml b/kustomize/overlays/system/kustomization.yaml index 4dcca22..3a6e832 100644 --- a/kustomize/overlays/system/kustomization.yaml +++ b/kustomize/overlays/system/kustomization.yaml @@ -2,6 +2,6 @@ resources: - ../../bases/longhorn - ../../bases/traefik - ../../bases/external-secrets - # - ../../bases/k8s-device-plugin-amd + - ../../bases/k8s-device-plugin-amd - ../../bases/k8s-device-plugin-nvidia - clustersecretstore.yaml \ No newline at end of file