switch from ollama to llama-swap for better performance

2025-05-12 20:35:14 -04:00 · 2025-05-12 20:35:14 -04:00 · d8fea232c0
parent cc777b02ca
commit d8fea232c0
10 changed files with 161 additions and 30 deletions
--- a/kustomize/bases/llama-swap/configurations/config.yaml
+++ b/kustomize/bases/llama-swap/configurations/config.yaml
@ -0,0 +1,24 @@
+healthCheckTimeout: 60  # NOTE(review): presumably seconds to wait for a model to become ready; confirm against llama-swap docs
+
+models:
+  kokoro-tts:  # no cmd here, only a proxy target
+    proxy: http://kokoro-tts-server.llm.svc:8880  # hostname hard-codes the "llm" namespace
+    checkEndpoint: /v1/audio/speech
+    unlisted: true
+
+  gemma-3-12b-it-qat:
+    cmd: >  # folded scalar: the lines below are joined by spaces into one command line
+      /app/llama-server
+      -hf unsloth/gemma-3-12b-it-qat-GGUF
+      -hff gemma-3-12b-it-qat-UD-Q4_K_XL.gguf
+      --port ${PORT}
+      --no-mmap
+      --flash-attn
+      --cache-type-k q8_0
+      --cache-type-v q8_0
+      --temp 1.0 --top-k 64 --min-p 0.00 --top-p 0.95 --repeat-penalty 1.0
+      --gpu-layers 49
+      
+
+# --no-kv-offload
+# --ctx-size 16384
--- a/kustomize/bases/llama-swap/kustomization.yaml
+++ b/kustomize/bases/llama-swap/kustomization.yaml
@ -0,0 +1,29 @@
+resources:
+  - llama-swap-deployment.yaml
+  - llama-swap-ingress.yaml
+
+namePrefix: llama-swap-  # "server" resources become "llama-swap-server", the name openwebui's URL uses
+
+commonLabels:
+  app.kubernetes.io/name: llama-swap
+
+configMapGenerator:
+  - name: kustomize-generated-config  # value source for the replacements section below
+    literals:
+      - LLAMA_SWAP_EXTERNAL_HOST=openai.badjware.dev
+      - LLAMA_SWAP_EXTERNAL_URL=https://openai.badjware.dev  # not referenced by any replacement in this file
+  - name: llama-swap-config  # mounted at /config by the Deployment
+    files:
+      - config.yaml=configurations/config.yaml
+
+replacements:
+  - source:  # take the external host literal from the generated ConfigMap ...
+      kind: ConfigMap
+      name: kustomize-generated-config
+      fieldPath: data.LLAMA_SWAP_EXTERNAL_HOST
+    targets:
+      - select:  # ... and write it into the first rule of the "server" Ingress
+          kind: Ingress
+          name: server
+        fieldPaths:
+          - spec.rules.0.host
--- a/kustomize/bases/llama-swap/llama-swap-deployment.yaml
+++ b/kustomize/bases/llama-swap/llama-swap-deployment.yaml
@ -0,0 +1,78 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: server
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate  # the old pod is terminated before its replacement is created
+  selector:
+    matchLabels:
+      app.kubernetes.io/component: server
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/component: server
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  # Image only supports amd64
+                  - key: kubernetes.io/arch
+                    operator: In
+                    values:
+                      - amd64
+      priorityClassName: high-priority  # PriorityClass declared further down in this file
+      containers:
+        - name: server
+          image: ghcr.io/mostlygeek/llama-swap:vulkan  # floating tag; env/prod only carries a commented-out pin for this image
+          imagePullPolicy: Always
+          args: ["--config", "/config/config.yaml"]  # file supplied by the llama-swap-config volume mounted at /config
+          resources:
+            requests:
+              memory: 4Gi
+              cpu: 1000m
+            limits:
+              memory: 4Gi  # equal to the request; no cpu limit is set
+              # nvidia.com/gpu: "1"
+              amd.com/gpu: "1"
+          ports:
+            - name: http  # the Service's targetPort refers to this port name
+              hostPort: 8080  # also exposes the port on the node the pod lands on
+              containerPort: 8080
+          volumeMounts:
+            - name: server-data
+              mountPath: /root/.cache/llama.cpp  # NOTE(review): presumably where llama-server caches -hf downloads; confirm
+            - name: llama-swap-config
+              mountPath: /config
+      volumes:
+        - name: server-data
+          hostPath:
+            path: /var/lib/llama.cpp  # node-local directory, so contents are per node
+            type: DirectoryOrCreate
+        - name: llama-swap-config
+          configMap:
+            name: llama-swap-config  # generated by configMapGenerator in kustomization.yaml
+---
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: high-priority  # referenced by the Deployment's priorityClassName
+value: 1000000
+globalDefault: false  # only pods that name this class receive it
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: server
+  labels:
+    app.kubernetes.io/component: server
+spec:
+  selector:
+    app.kubernetes.io/component: server  # same label the Deployment's pod template carries
+  ports:
+    - name: http
+      port: 8080
+      targetPort: http  # the container port named "http" (8080) in the Deployment
--- a/kustomize/bases/llama-swap/llama-swap-ingress.yaml
+++ b/kustomize/bases/llama-swap/llama-swap-ingress.yaml
@ -0,0 +1,19 @@
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: server
+  labels:
+    app.kubernetes.io/name: prometheus  # NOTE(review): the base's commonLabels sets this same key to llama-swap; confirm which value is intended
+    probe: blackbox-http
+spec:
+  rules:
+  - host: ${LLAMA_SWAP_EXTERNAL_HOST}  # placeholder; the replacement in kustomization.yaml targets spec.rules.0.host
+    http:
+      paths:
+      - path: /
+        pathType: Prefix
+        backend:
+          service:
+            name: server
+            port:
+              name: http  # the Service port named "http"
--- a/kustomize/bases/openwebui/openwebui-deployment.yaml
+++ b/kustomize/bases/openwebui/openwebui-deployment.yaml
@ -23,8 +23,10 @@ spec:
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
-            - name: OLLAMA_BASE_URL
-              value: http://ollama-server.$(NAMESPACE).svc:11434
+            - name: OPENAI_API_BASE_URL  # Open WebUI's setting name; plain OPENAI_BASE_URL is not read by it
+              value: http://llama-swap-server.$(NAMESPACE).svc:8080/v1
+            # - name: OLLAMA_BASE_URL
+            #   value: http://ollama-server.$(NAMESPACE).svc:11434
            - name: AUDIO_TTS_OPENAI_API_BASE_URL
              value: http://kokoro-tts-server.$(NAMESPACE).svc:8880/v1
            - name: AUDIO_TTS_OPENAI_API_KEY
--- a/kustomize/bases/prometheus/prometheus.yaml
+++ b/kustomize/bases/prometheus/prometheus.yaml
@ -43,6 +43,7 @@ spec:
      cpu: 1000m
      memory: 1Gi
    limits:
+      cpu: 4000m
      memory: 2Gi
  storage:
    volumeClaimTemplate:
--- a/kustomize/env/prod/kustomization.yaml
+++ b/kustomize/env/prod/kustomization.yaml
@ -44,11 +44,13 @@ images:
    newTag: "1.31"
  - name: ollama/ollama
    newTag: 0.6.6
+  # - name: ghcr.io/mostlygeek/llama-swap
+  #   newTag: v110-vulkan-b5343
    # newTag: 0.3.6-rocm
  # - name: ghcr.io/berriai/litellm
  #   newTag: main-v1.43.1
  - name: ghcr.io/open-webui/open-webui
-    newTag: v0.6.7
+    newTag: v0.6.9
  - name: ghcr.io/sillytavern/sillytavern
    newTag: 1.12.13

--- a/kustomize/overlays/llm/api-ingress.yaml
+++ b/kustomize/overlays/llm/api-ingress.yaml
@ -1,24 +0,0 @@
-
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
-  name: api
-spec:
-  rules:
-  - host: openai.badjware.dev
-    http:
-      paths:
-      - path: /v1/audio/speech
-        pathType: Prefix
-        backend:
-          service:
-            name: kokoro-tts-server
-            port:
-              name: http
-      - path: /v1
-        pathType: Prefix
-        backend:
-          service:
-            name: ollama-server
-            port:
-              name: http
--- a/kustomize/overlays/llm/kustomization.yaml
+++ b/kustomize/overlays/llm/kustomization.yaml
@ -1,12 +1,12 @@
 resources:
  - namespace.yaml
-  - api-ingress.yaml
-  - ../../bases/ollama
+  # - ../../bases/ollama
  - ../../bases/kokoro-tts-gpu
  # - ../../bases/openedai-speech
  # - ../../bases/litellm
  - ../../bases/openwebui
  - ../../bases/sillytavern
  - ../../bases/mikupad
+  - ../../bases/llama-swap  # takes over from the ollama base, which is commented out above

 namespace: llm
--- a/kustomize/overlays/system/kustomization.yaml
+++ b/kustomize/overlays/system/kustomization.yaml
@ -2,6 +2,6 @@ resources:
  - ../../bases/longhorn
  - ../../bases/traefik
  - ../../bases/external-secrets
-  # - ../../bases/k8s-device-plugin-amd
+  - ../../bases/k8s-device-plugin-amd  # NOTE(review): presumably what advertises the amd.com/gpu resource the llama-swap Deployment requests; confirm
  - ../../bases/k8s-device-plugin-nvidia
  - clustersecretstore.yaml