---
# Deployment "server": runs a single llama-swap pod (vulkan image tag) on an
# amd64 node, exposing port 8080 on both the container and the host.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: server
spec:
  replicas: 1
  # Recreate stops the existing pod before starting its replacement.
  # NOTE(review): presumably required because the pod claims hostPort 8080
  # and a GPU, which two overlapping pods could not share - confirm.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/component: server
  template:
    metadata:
      labels:
        # Must match spec.selector.matchLabels above and the Service selector
        # in the last document of this file.
        app.kubernetes.io/component: server
    spec:
      affinity:
        nodeAffinity:
          # Hard scheduling requirement: only nodes matching these terms are
          # eligible.
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  # Image only supports amd64
                  - key: kubernetes.io/arch
                    operator: In
                    values:
                      - amd64
      # References the PriorityClass declared in the second document of this
      # file.
      priorityClassName: high-priority
      containers:
        - name: server
          image: ghcr.io/mostlygeek/llama-swap:vulkan
          # The tag is not a pinned version, so pull on every pod start.
          imagePullPolicy: Always
          # Config file is supplied by the llama-swap-config volume mounted at
          # /config (see volumeMounts below).
          args: ["--config", "/config/config.yaml"]
          resources:
            # Memory request equals the memory limit; CPU has a request only.
            requests:
              memory: 4Gi
              cpu: 1000m
            limits:
              memory: 4Gi
              # Exactly one GPU resource is active. The NVIDIA line is kept
              # commented out as the alternative to the AMD one.
              # nvidia.com/gpu: "1"
              amd.com/gpu: "1"
          ports:
            # "http" is the port name the Service's targetPort refers to.
            - name: http
              hostPort: 8080
              containerPort: 8080
          volumeMounts:
            # NOTE(review): presumably the llama.cpp model/download cache -
            # confirm against the image.
            - name: server-data
              mountPath: /root/.cache/llama.cpp
            - name: llama-swap-config
              mountPath: /config
      volumes:
        # Node-local directory, created on the node if it does not exist.
        # Contents stay on that node and do not follow the pod elsewhere.
        - name: server-data
          hostPath:
            path: /var/lib/llama.cpp
            type: DirectoryOrCreate
        # ConfigMap is not defined in this file; it must already exist in the
        # pod's namespace.
        - name: llama-swap-config
          configMap:
            name: llama-swap-config
---
# PriorityClass used by the Deployment above via priorityClassName.
# It is cluster-scoped, and globalDefault: false means it applies only to pods
# that name it explicitly.
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: high-priority
value: 1000000
globalDefault: false
---
# Service "server": forwards port 8080 to the named "http" container port of
# the pods labelled app.kubernetes.io/component=server.
apiVersion: v1
kind: Service
metadata:
  name: server
  labels:
    app.kubernetes.io/component: server
spec:
  selector:
    app.kubernetes.io/component: server
  ports:
    - name: http
      port: 8080
      # Resolved by name against the container's "http" port (8080).
      targetPort: http