switch from ollama to llama-swap for better performance
This commit is contained in:
parent
cc777b02ca
commit
d8fea232c0
|
@ -0,0 +1,24 @@
|
|||
# llama-swap configuration: maps each model name to the upstream that serves it.

# How long to wait for a model's health check to pass.
# NOTE(review): unit presumed to be seconds — confirm against llama-swap docs.
healthCheckTimeout: 60

models:
  # Text-to-speech entry: it only declares a proxy target and no `cmd`, so the
  # backing service is expected to be running already.
  kokoro-tts:
    proxy: http://kokoro-tts-server.llm.svc:8880
    checkEndpoint: /v1/audio/speech
    unlisted: true

  # gemma-3-12b-it-qat, UD-Q4_K_XL GGUF, launched through llama-server.
  gemma-3-12b-it-qat:
    # Currently disabled flags, kept for reference:
    #   --no-kv-offload
    #   --ctx-size 16384
    # They are listed here instead of at the tail of `cmd` because a `#`
    # inside a block scalar is literal text, not a YAML comment.
    #
    # `>` folds the lines below into a single space-separated command line.
    # ${PORT} is a placeholder; presumably substituted by llama-swap at launch.
    cmd: >
      /app/llama-server
      -hf unsloth/gemma-3-12b-it-qat-GGUF
      -hff gemma-3-12b-it-qat-UD-Q4_K_XL.gguf
      --port ${PORT}
      --no-mmap
      --flash-attn
      --cache-type-k q8_0
      --cache-type-v q8_0
      --temp 1.0 --top-k 64 --min-p 0.00 --top-p 0.95 --repeat-penalty 1.0
      --gpu-layers 49
|
|
@ -0,0 +1,29 @@
|
|||
# Kustomize base for llama-swap: pulls in the deployment and ingress manifests,
# prefixes every resource name with `llama-swap-`, and generates the ConfigMaps
# they rely on.
resources:
  - llama-swap-deployment.yaml
  - llama-swap-ingress.yaml

namePrefix: llama-swap-

# NOTE(review): recent Kustomize releases report `commonLabels` as deprecated in
# favour of `labels` with `includeSelectors: true` — consider migrating.
commonLabels:
  app.kubernetes.io/name: llama-swap

configMapGenerator:
  # Externally visible host and URL. Only the host is consumed by the
  # replacement declared below.
  - name: kustomize-generated-config
    literals:
      - LLAMA_SWAP_EXTERNAL_HOST=openai.badjware.dev
      - LLAMA_SWAP_EXTERNAL_URL=https://openai.badjware.dev
  # The llama-swap configuration file, stored under the key `config.yaml`.
  - name: llama-swap-config
    files:
      - config.yaml=configurations/config.yaml

replacements:
  # Copy the external host into the first rule of the Ingress named `server`.
  - source:
      kind: ConfigMap
      name: kustomize-generated-config
      fieldPath: data.LLAMA_SWAP_EXTERNAL_HOST
    targets:
      - select:
          kind: Ingress
          name: server
        fieldPaths:
          - spec.rules.0.host
|
|
@ -0,0 +1,78 @@
|
|||
# llama-swap server: one replica, scheduled on amd64 nodes only, with one AMD
# GPU in its resource limits.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: server
spec:
  replicas: 1
  # Recreate terminates the existing pod before the replacement is started.
  # NOTE(review): presumably needed because the pod claims hostPort 8080 and
  # the GPU, which two pods could not hold at once — confirm.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/component: server
  template:
    metadata:
      labels:
        app.kubernetes.io/component: server
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  # Image only supports amd64
                  - key: kubernetes.io/arch
                    operator: In
                    values:
                      - amd64
      priorityClassName: high-priority
      containers:
        - name: server
          image: ghcr.io/mostlygeek/llama-swap:vulkan
          imagePullPolicy: Always
          # Points at the file mounted from the llama-swap-config volume.
          args:
            - "--config"
            - "/config/config.yaml"
          resources:
            requests:
              memory: 4Gi
              cpu: 1000m
            limits:
              memory: 4Gi
              # nvidia.com/gpu: "1"
              amd.com/gpu: "1"
          ports:
            - name: http
              hostPort: 8080
              containerPort: 8080
          volumeMounts:
            # Host-backed llama.cpp cache directory, so its contents outlive
            # the pod.
            - name: server-data
              mountPath: /root/.cache/llama.cpp
            - name: llama-swap-config
              mountPath: /config
      volumes:
        - name: server-data
          hostPath:
            path: /var/lib/llama.cpp
            type: DirectoryOrCreate
        - name: llama-swap-config
          configMap:
            name: llama-swap-config
|
||||
---
|
||||
# Priority class with value 1000000. It is not the cluster-wide default, so
# only pods that set `priorityClassName` to it receive this priority.
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: high-priority
globalDefault: false
value: 1000000
|
||||
---
|
||||
# Service in front of the pods labelled `app.kubernetes.io/component: server`.
apiVersion: v1
kind: Service
metadata:
  name: server
  labels:
    app.kubernetes.io/component: server
spec:
  selector:
    app.kubernetes.io/component: server
  ports:
    # Service port 8080 forwards to the container port named `http`.
    - name: http
      port: 8080
      targetPort: http
|
|
@ -0,0 +1,19 @@
|
|||
# Ingress that sends every path on the external host to the `server` Service.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: server
  labels:
    # NOTE(review): if this Ingress is built by a kustomization whose
    # commonLabels set `app.kubernetes.io/name`, that value replaces
    # `prometheus` here — confirm which value the probe selector expects.
    app.kubernetes.io/name: prometheus
    probe: blackbox-http
spec:
  rules:
    # Placeholder host; meant to be overwritten at build time (a Kustomize
    # replacement targeting spec.rules.0.host of the Ingress named `server`).
    - host: "${LLAMA_SWAP_EXTERNAL_HOST}"
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: server
                port:
                  name: http
|
|
@ -23,8 +23,10 @@ spec:
|
|||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.namespace
|
||||
- name: OLLAMA_BASE_URL
|
||||
value: http://ollama-server.$(NAMESPACE).svc:11434
|
||||
- name: OPENAI_BASE_URL
|
||||
value: http://llama-swap-server.$(NAMESPACE).svc:8080/v1
|
||||
# - name: OLLAMA_BASE_URL
|
||||
# value: http://ollama-server.$(NAMESPACE).svc:11434
|
||||
- name: AUDIO_TTS_OPENAI_API_BASE_URL
|
||||
value: http://kokoro-tts-server.$(NAMESPACE).svc:8880/v1
|
||||
- name: AUDIO_TTS_OPENAI_API_KEY
|
||||
|
|
|
@ -43,6 +43,7 @@ spec:
|
|||
cpu: 1000m
|
||||
memory: 1Gi
|
||||
limits:
|
||||
cpu: 4000m
|
||||
memory: 2Gi
|
||||
storage:
|
||||
volumeClaimTemplate:
|
||||
|
|
|
@ -44,11 +44,13 @@ images:
|
|||
newTag: "1.31"
|
||||
- name: ollama/ollama
|
||||
newTag: 0.6.6
|
||||
# - name: ghcr.io/mostlygeek/llama-swap
|
||||
# newTag: v110-vulkan-b5343
|
||||
# newTag: 0.3.6-rocm
|
||||
# - name: ghcr.io/berriai/litellm
|
||||
# newTag: main-v1.43.1
|
||||
- name: ghcr.io/open-webui/open-webui
|
||||
newTag: v0.6.7
|
||||
newTag: v0.6.9
|
||||
- name: ghcr.io/sillytavern/sillytavern
|
||||
newTag: 1.12.13
|
||||
|
||||
|
|
|
@ -1,24 +0,0 @@
|
|||
|
||||
# Ingress for openai.badjware.dev that splits the /v1 API between two services:
# /v1/audio/speech goes to kokoro-tts-server, everything else under /v1 goes to
# ollama-server.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: api
spec:
  rules:
    - host: openai.badjware.dev
      http:
        paths:
          # Text-to-speech requests.
          - path: /v1/audio/speech
            pathType: Prefix
            backend:
              service:
                name: kokoro-tts-server
                port:
                  name: http
          # Remaining /v1 requests.
          - path: /v1
            pathType: Prefix
            backend:
              service:
                name: ollama-server
                port:
                  name: http
|
|
@ -1,12 +1,12 @@
|
|||
resources:
|
||||
- namespace.yaml
|
||||
- api-ingress.yaml
|
||||
- ../../bases/ollama
|
||||
# - ../../bases/ollama
|
||||
- ../../bases/kokoro-tts-gpu
|
||||
# - ../../bases/openedai-speech
|
||||
# - ../../bases/litellm
|
||||
- ../../bases/openwebui
|
||||
- ../../bases/sillytavern
|
||||
- ../../bases/mikupad
|
||||
- ../../bases/llama-swap
|
||||
|
||||
namespace: llm
|
||||
|
|
|
@ -2,6 +2,6 @@ resources:
|
|||
- ../../bases/longhorn
|
||||
- ../../bases/traefik
|
||||
- ../../bases/external-secrets
|
||||
# - ../../bases/k8s-device-plugin-amd
|
||||
- ../../bases/k8s-device-plugin-amd
|
||||
- ../../bases/k8s-device-plugin-nvidia
|
||||
- clustersecretstore.yaml
|
Loading…
Reference in New Issue