1
0
Fork 0

switch from ollama to llama-swap for better performance

This commit is contained in:
Massaki Archambault 2025-05-12 20:35:14 -04:00
parent cc777b02ca
commit d8fea232c0
10 changed files with 161 additions and 30 deletions

View File

@ -0,0 +1,24 @@
# llama-swap proxy configuration: routes OpenAI-style requests to model backends.
healthCheckTimeout: 60
models:
  # External TTS backend; proxied as-is and hidden from the model list.
  kokoro-tts:
    proxy: http://kokoro-tts-server.llm.svc:8880
    checkEndpoint: /v1/audio/speech
    unlisted: true
  # Local llama.cpp server spawned on demand by llama-swap.
  gemma-3-12b-it-qat:
    cmd: >
      /app/llama-server
      -hf unsloth/gemma-3-12b-it-qat-GGUF
      -hff gemma-3-12b-it-qat-UD-Q4_K_XL.gguf
      --port ${PORT}
      --no-mmap
      --flash-attn
      --cache-type-k q8_0
      --cache-type-v q8_0
      --temp 1.0 --top-k 64 --min-p 0.00 --top-p 0.95 --repeat-penalty 1.0
      --gpu-layers 49
    # Kept for reference; must stay outside the folded scalar above or they
    # would be folded into the command line as literal arguments.
    # --no-kv-offload
    # --ctx-size 16384

View File

@ -0,0 +1,29 @@
resources:
  - llama-swap-deployment.yaml
  - llama-swap-ingress.yaml
namePrefix: llama-swap-
commonLabels:
  app.kubernetes.io/name: llama-swap
configMapGenerator:
  # Holds overlay-level settings consumed by the replacements below.
  - name: kustomize-generated-config
    literals:
      - LLAMA_SWAP_EXTERNAL_HOST=openai.badjware.dev
      - LLAMA_SWAP_EXTERNAL_URL=https://openai.badjware.dev
  # Mounts the llama-swap model configuration into the pod.
  - name: llama-swap-config
    files:
      - config.yaml=configurations/config.yaml
replacements:
  # Injects the external hostname into the Ingress rule.
  - source:
      kind: ConfigMap
      name: kustomize-generated-config
      fieldPath: data.LLAMA_SWAP_EXTERNAL_HOST
    targets:
      - select:
          kind: Ingress
          name: server
        fieldPaths:
          - spec.rules.0.host

View File

@ -0,0 +1,78 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: server
spec:
  replicas: 1
  # Recreate: the model server pins a GPU and host port, so the old pod
  # must be gone before a replacement starts.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/component: server
  template:
    metadata:
      labels:
        app.kubernetes.io/component: server
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  # Image only supports amd64
                  - key: kubernetes.io/arch
                    operator: In
                    values:
                      - amd64
      priorityClassName: high-priority
      containers:
        - name: server
          image: ghcr.io/mostlygeek/llama-swap:vulkan
          imagePullPolicy: Always
          args: ["--config", "/config/config.yaml"]
          resources:
            requests:
              memory: 4Gi
              cpu: 1000m
            limits:
              memory: 4Gi
              # nvidia.com/gpu: "1"
              amd.com/gpu: "1"
          ports:
            - name: http
              hostPort: 8080
              containerPort: 8080
          volumeMounts:
            # Cache downloaded GGUF files across restarts.
            - name: server-data
              mountPath: /root/.cache/llama.cpp
            - name: llama-swap-config
              mountPath: /config
      volumes:
        - name: server-data
          hostPath:
            path: /var/lib/llama.cpp
            type: DirectoryOrCreate
        - name: llama-swap-config
          configMap:
            name: llama-swap-config
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: high-priority
value: 1000000
globalDefault: false
---
apiVersion: v1
kind: Service
metadata:
  name: server
  labels:
    app.kubernetes.io/component: server
spec:
  selector:
    app.kubernetes.io/component: server
  ports:
    - name: http
      port: 8080
      targetPort: http

View File

@ -0,0 +1,19 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: server
  labels:
    # NOTE(review): labels appear intended for Prometheus blackbox-probe
    # discovery rather than app identity — confirm this is deliberate.
    app.kubernetes.io/name: prometheus
    probe: blackbox-http
spec:
  rules:
    # Host is substituted by the kustomize replacement from
    # LLAMA_SWAP_EXTERNAL_HOST; the literal value below is a placeholder.
    - host: ${LLAMA_SWAP_EXTERNAL_HOST}
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: server
                port:
                  name: http

View File

@ -23,8 +23,10 @@ spec:
valueFrom: valueFrom:
fieldRef: fieldRef:
fieldPath: metadata.namespace fieldPath: metadata.namespace
- name: OLLAMA_BASE_URL - name: OPENAI_BASE_URL
value: http://ollama-server.$(NAMESPACE).svc:11434 value: http://llama-swap-server.$(NAMESPACE).svc:8080/v1
# - name: OLLAMA_BASE_URL
# value: http://ollama-server.$(NAMESPACE).svc:11434
- name: AUDIO_TTS_OPENAI_API_BASE_URL - name: AUDIO_TTS_OPENAI_API_BASE_URL
value: http://kokoro-tts-server.$(NAMESPACE).svc:8880/v1 value: http://kokoro-tts-server.$(NAMESPACE).svc:8880/v1
- name: AUDIO_TTS_OPENAI_API_KEY - name: AUDIO_TTS_OPENAI_API_KEY

View File

@ -43,6 +43,7 @@ spec:
cpu: 1000m cpu: 1000m
memory: 1Gi memory: 1Gi
limits: limits:
cpu: 4000m
memory: 2Gi memory: 2Gi
storage: storage:
volumeClaimTemplate: volumeClaimTemplate:

View File

@ -44,11 +44,13 @@ images:
newTag: "1.31" newTag: "1.31"
- name: ollama/ollama - name: ollama/ollama
newTag: 0.6.6 newTag: 0.6.6
# - name: ghcr.io/mostlygeek/llama-swap
# newTag: v110-vulkan-b5343
# newTag: 0.3.6-rocm # newTag: 0.3.6-rocm
# - name: ghcr.io/berriai/litellm # - name: ghcr.io/berriai/litellm
# newTag: main-v1.43.1 # newTag: main-v1.43.1
- name: ghcr.io/open-webui/open-webui - name: ghcr.io/open-webui/open-webui
newTag: v0.6.7 newTag: v0.6.9
- name: ghcr.io/sillytavern/sillytavern - name: ghcr.io/sillytavern/sillytavern
newTag: 1.12.13 newTag: 1.12.13

View File

@ -1,24 +0,0 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: api
spec:
rules:
- host: openai.badjware.dev
http:
paths:
- path: /v1/audio/speech
pathType: Prefix
backend:
service:
name: kokoro-tts-server
port:
name: http
- path: /v1
pathType: Prefix
backend:
service:
name: ollama-server
port:
name: http

View File

@ -1,12 +1,12 @@
resources: resources:
- namespace.yaml - namespace.yaml
- api-ingress.yaml # - ../../bases/ollama
- ../../bases/ollama
- ../../bases/kokoro-tts-gpu - ../../bases/kokoro-tts-gpu
# - ../../bases/openedai-speech # - ../../bases/openedai-speech
# - ../../bases/litellm # - ../../bases/litellm
- ../../bases/openwebui - ../../bases/openwebui
- ../../bases/sillytavern - ../../bases/sillytavern
- ../../bases/mikupad - ../../bases/mikupad
- ../../bases/llama-swap
namespace: llm namespace: llm

View File

@ -2,6 +2,6 @@ resources:
- ../../bases/longhorn - ../../bases/longhorn
- ../../bases/traefik - ../../bases/traefik
- ../../bases/external-secrets - ../../bases/external-secrets
# - ../../bases/k8s-device-plugin-amd - ../../bases/k8s-device-plugin-amd
- ../../bases/k8s-device-plugin-nvidia - ../../bases/k8s-device-plugin-nvidia
- clustersecretstore.yaml - clustersecretstore.yaml