From d8fea232c0b3d88f9c847e8ecf713db4387b7184 Mon Sep 17 00:00:00 2001
From: Massaki Archambault <marchambault@badjware.dev>
Date: Mon, 12 May 2025 20:35:14 -0400
Subject: [PATCH] switch from ollama to llama-swap for better performance

---
 .../llama-swap/configurations/config.yaml     | 24 ++++++
 kustomize/bases/llama-swap/kustomization.yaml | 29 +++++++
 .../llama-swap/llama-swap-deployment.yaml     | 78 +++++++++++++++++++
 .../bases/llama-swap/llama-swap-ingress.yaml  | 19 +++++
 .../bases/openwebui/openwebui-deployment.yaml |  6 +-
 kustomize/bases/prometheus/prometheus.yaml    |  1 +
 kustomize/env/prod/kustomization.yaml         |  4 +-
 kustomize/overlays/llm/api-ingress.yaml       | 24 ------
 kustomize/overlays/llm/kustomization.yaml     |  4 +-
 kustomize/overlays/system/kustomization.yaml  |  2 +-
 10 files changed, 161 insertions(+), 30 deletions(-)
 create mode 100644 kustomize/bases/llama-swap/configurations/config.yaml
 create mode 100644 kustomize/bases/llama-swap/kustomization.yaml
 create mode 100644 kustomize/bases/llama-swap/llama-swap-deployment.yaml
 create mode 100644 kustomize/bases/llama-swap/llama-swap-ingress.yaml
 delete mode 100644 kustomize/overlays/llm/api-ingress.yaml

diff --git a/kustomize/bases/llama-swap/configurations/config.yaml b/kustomize/bases/llama-swap/configurations/config.yaml
new file mode 100644
index 0000000..3fa5b2d
--- /dev/null
+++ b/kustomize/bases/llama-swap/configurations/config.yaml
@@ -0,0 +1,24 @@
+healthCheckTimeout: 60  # presumably seconds to wait for a model to become ready; confirm in the llama-swap docs
+
+models:
+  kokoro-tts:  # no cmd here: presumably llama-swap only forwards to the separately deployed TTS service
+    proxy: http://kokoro-tts-server.llm.svc:8880  # namespace "llm" is hard-coded; kustomize does not rewrite this file's contents
+    checkEndpoint: /v1/audio/speech  # NOTE(review): confirm the readiness check succeeds against this route
+    unlisted: true
+
+  gemma-3-12b-it-qat:  # model served by the llama-server command below
+    cmd: >  # folded scalar: the lines below are joined with spaces into one command line
+      /app/llama-server
+      -hf unsloth/gemma-3-12b-it-qat-GGUF
+      -hff gemma-3-12b-it-qat-UD-Q4_K_XL.gguf
+      --port ${PORT}
+      --no-mmap
+      --flash-attn
+      --cache-type-k q8_0
+      --cache-type-v q8_0
+      --temp 1.0 --top-k 64 --min-p 0.00 --top-p 0.95 --repeat-penalty 1.0
+      --gpu-layers 49
+      
+
+# Flags not currently passed to llama-server, kept for reference:
+# --no-kv-offload --ctx-size 16384
\ No newline at end of file
diff --git a/kustomize/bases/llama-swap/kustomization.yaml b/kustomize/bases/llama-swap/kustomization.yaml
new file mode 100644
index 0000000..ae46b5d
--- /dev/null
+++ b/kustomize/bases/llama-swap/kustomization.yaml
@@ -0,0 +1,29 @@
+resources:
+  - llama-swap-deployment.yaml
+  - llama-swap-ingress.yaml
+
+namePrefix: llama-swap-  # e.g. the Service "server" is addressed as "llama-swap-server"
+
+commonLabels:
+  app.kubernetes.io/name: llama-swap
+
+configMapGenerator:
+  - name: kustomize-generated-config  # build-time values consumed by the replacements below
+    literals:
+      - LLAMA_SWAP_EXTERNAL_HOST=openai.badjware.dev
+      - LLAMA_SWAP_EXTERNAL_URL=https://openai.badjware.dev  # NOTE(review): not referenced by any replacement in this file
+  - name: llama-swap-config  # mounted at /config by the server Deployment
+    files:
+      - config.yaml=configurations/config.yaml
+
+replacements:
+  - source:
+      kind: ConfigMap
+      name: kustomize-generated-config
+      fieldPath: data.LLAMA_SWAP_EXTERNAL_HOST
+    targets:
+      - select:
+          kind: Ingress
+          name: server
+        fieldPaths:
+          - spec.rules.0.host  # overwrites the placeholder host in llama-swap-ingress.yaml
\ No newline at end of file
diff --git a/kustomize/bases/llama-swap/llama-swap-deployment.yaml b/kustomize/bases/llama-swap/llama-swap-deployment.yaml
new file mode 100644
index 0000000..9a0f9c9
--- /dev/null
+++ b/kustomize/bases/llama-swap/llama-swap-deployment.yaml
@@ -0,0 +1,78 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: server
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate  # old pod is stopped before its replacement starts; presumably needed because hostPort 8080 and the single GPU cannot be shared
+  selector:
+    matchLabels:
+      app.kubernetes.io/component: server
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/component: server
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  # Image only supports amd64
+                  - key: kubernetes.io/arch
+                    operator: In
+                    values:
+                      - amd64
+      priorityClassName: high-priority  # PriorityClass defined further down in this file
+      containers:
+        - name: server
+          image: ghcr.io/mostlygeek/llama-swap:vulkan  # floating tag (no version pinned)
+          imagePullPolicy: Always
+          args: ["--config", "/config/config.yaml"]  # file provided by the llama-swap-config volume below
+          resources:
+            requests:
+              memory: 4Gi
+              cpu: 1000m
+            limits:
+              memory: 4Gi
+              # nvidia.com/gpu: "1"
+              amd.com/gpu: "1"
+          ports:
+            - name: http
+              hostPort: 8080  # also binds port 8080 on the node itself
+              containerPort: 8080
+          volumeMounts:
+            - name: server-data
+              mountPath: /root/.cache/llama.cpp  # presumably where llama-server stores models fetched with -hf; confirm
+            - name: llama-swap-config
+              mountPath: /config
+      volumes:
+        - name: server-data
+          hostPath:
+            path: /var/lib/llama.cpp  # node-local directory, so its contents outlive the pod
+            type: DirectoryOrCreate
+        - name: llama-swap-config
+          configMap:
+            name: llama-swap-config
+---
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: high-priority
+value: 1000000
+globalDefault: false  # only pods that name this class get it
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: server
+  labels:
+    app.kubernetes.io/component: server
+spec:
+  selector:
+    app.kubernetes.io/component: server
+  ports:
+    - name: http
+      port: 8080
+      targetPort: http  # the container port named "http" in the Deployment above
\ No newline at end of file
diff --git a/kustomize/bases/llama-swap/llama-swap-ingress.yaml b/kustomize/bases/llama-swap/llama-swap-ingress.yaml
new file mode 100644
index 0000000..fc75db9
--- /dev/null
+++ b/kustomize/bases/llama-swap/llama-swap-ingress.yaml
@@ -0,0 +1,19 @@
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: server
+  labels:
+    app.kubernetes.io/name: prometheus  # NOTE(review): kustomization.yaml sets commonLabels app.kubernetes.io/name: llama-swap; confirm which value is intended
+    probe: blackbox-http
+spec:
+  rules:
+  - host: ${LLAMA_SWAP_EXTERNAL_HOST}  # placeholder; replaced at build time by the kustomization's replacements entry
+    http:
+      paths:
+      - path: /
+        pathType: Prefix
+        backend:
+          service:
+            name: server
+            port:
+              name: http
diff --git a/kustomize/bases/openwebui/openwebui-deployment.yaml b/kustomize/bases/openwebui/openwebui-deployment.yaml
index f3aa4b7..fd8d2df 100644
--- a/kustomize/bases/openwebui/openwebui-deployment.yaml
+++ b/kustomize/bases/openwebui/openwebui-deployment.yaml
@@ -23,8 +23,10 @@ spec:
               valueFrom:
                 fieldRef:
                   fieldPath: metadata.namespace
-            - name: OLLAMA_BASE_URL
-              value: http://ollama-server.$(NAMESPACE).svc:11434
+            - name: OPENAI_API_BASE_URL  # Open WebUI reads OPENAI_API_BASE_URL; OPENAI_BASE_URL is not one of its settings
+              value: http://llama-swap-server.$(NAMESPACE).svc:8080/v1  # Service "server" carrying the llama-swap- namePrefix
+            # - name: OLLAMA_BASE_URL
+            #   value: http://ollama-server.$(NAMESPACE).svc:11434
             - name: AUDIO_TTS_OPENAI_API_BASE_URL
               value: http://kokoro-tts-server.$(NAMESPACE).svc:8880/v1
             - name: AUDIO_TTS_OPENAI_API_KEY
diff --git a/kustomize/bases/prometheus/prometheus.yaml b/kustomize/bases/prometheus/prometheus.yaml
index 95b8efd..c15153a 100644
--- a/kustomize/bases/prometheus/prometheus.yaml
+++ b/kustomize/bases/prometheus/prometheus.yaml
@@ -43,6 +43,7 @@ spec:
       cpu: 1000m
       memory: 1Gi
     limits:
+      cpu: 4000m
       memory: 2Gi
   storage:
     volumeClaimTemplate:
diff --git a/kustomize/env/prod/kustomization.yaml b/kustomize/env/prod/kustomization.yaml
index 98c5da4..0b37a5e 100644
--- a/kustomize/env/prod/kustomization.yaml
+++ b/kustomize/env/prod/kustomization.yaml
@@ -44,11 +44,13 @@ images:
     newTag: "1.31"
   - name: ollama/ollama
     newTag: 0.6.6
+  # - name: ghcr.io/mostlygeek/llama-swap
+  #   newTag: v110-vulkan-b5343
     # newTag: 0.3.6-rocm
   # - name: ghcr.io/berriai/litellm
   #   newTag: main-v1.43.1
   - name: ghcr.io/open-webui/open-webui
-    newTag: v0.6.7
+    newTag: v0.6.9
   - name: ghcr.io/sillytavern/sillytavern
     newTag: 1.12.13
 
diff --git a/kustomize/overlays/llm/api-ingress.yaml b/kustomize/overlays/llm/api-ingress.yaml
deleted file mode 100644
index 650670c..0000000
--- a/kustomize/overlays/llm/api-ingress.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
-  name: api
-spec:
-  rules:
-  - host: openai.badjware.dev
-    http:
-      paths:
-      - path: /v1/audio/speech
-        pathType: Prefix
-        backend:
-          service:
-            name: kokoro-tts-server
-            port:
-              name: http
-      - path: /v1
-        pathType: Prefix
-        backend:
-          service:
-            name: ollama-server
-            port:
-              name: http
diff --git a/kustomize/overlays/llm/kustomization.yaml b/kustomize/overlays/llm/kustomization.yaml
index 563e94e..1501637 100644
--- a/kustomize/overlays/llm/kustomization.yaml
+++ b/kustomize/overlays/llm/kustomization.yaml
@@ -1,12 +1,12 @@
 resources:
   - namespace.yaml
-  - api-ingress.yaml
-  - ../../bases/ollama
+  # - ../../bases/ollama
   - ../../bases/kokoro-tts-gpu
   # - ../../bases/openedai-speech
   # - ../../bases/litellm
   - ../../bases/openwebui
   - ../../bases/sillytavern
   - ../../bases/mikupad
+  - ../../bases/llama-swap  # takes over from the ollama base, which is commented out above
 
 namespace: llm
diff --git a/kustomize/overlays/system/kustomization.yaml b/kustomize/overlays/system/kustomization.yaml
index 4dcca22..3a6e832 100644
--- a/kustomize/overlays/system/kustomization.yaml
+++ b/kustomize/overlays/system/kustomization.yaml
@@ -2,6 +2,6 @@ resources:
   - ../../bases/longhorn
   - ../../bases/traefik
   - ../../bases/external-secrets
-  # - ../../bases/k8s-device-plugin-amd
+  - ../../bases/k8s-device-plugin-amd  # presumably what exposes the amd.com/gpu resource requested by llama-swap; confirm
   - ../../bases/k8s-device-plugin-nvidia
   - clustersecretstore.yaml
\ No newline at end of file