---
# PriorityClass referenced by the Deployment below via `priorityClassName`.
# Declared first so it already exists when the Deployment's pods are created
# during an in-order apply.
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: high-priority
value: 1000000
# Pods that do not name a priority class are not affected by this one.
globalDefault: false
---
# Single-replica Ollama server pinned to amd64 nodes, with one NVIDIA GPU.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: server
spec:
  replicas: 1
  # Recreate stops the old pod before starting the new one.
  # NOTE(review): presumably chosen because the pod holds the GPU and a
  # hostPath data directory — confirm before switching to RollingUpdate.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/component: server
  template:
    metadata:
      labels:
        app.kubernetes.io/component: server
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  # Image only supports amd64
                  - key: kubernetes.io/arch
                    operator: In
                    values:
                      - amd64
      priorityClassName: high-priority
      # Requires a RuntimeClass named `nvidia` to exist in the cluster.
      runtimeClassName: nvidia
      containers:
        - name: server
          # NOTE(review): no tag or digest, so this resolves to `latest`;
          # consider pinning a version for reproducible rollouts.
          image: ollama/ollama
          # image: badjware/ollama-tweak
          env:
            # Quoted so the value is always a string, as env values must be.
            - name: OLLAMA_KEEP_ALIVE
              value: "36h"
            # Disabled overrides, kept for reference.
            # NOTE(review): these look AMD/ROCm-specific — confirm.
            # - name: HSA_OVERRIDE_GFX_VERSION
            #   value: 10.1.0
            # - name: HSA_ENABLE_SDMA
            #   value: "0"
          # NOTE(review): no readiness/liveness probes are defined — confirm
          # whether that is intentional.
          resources:
            requests:
              memory: 4Gi
              cpu: 2000m
            limits:
              memory: 4Gi
              cpu: 3000m
              nvidia.com/gpu: 1
          ports:
            # Named port; the Service below targets it by name.
            - containerPort: 11434
              name: http
          volumeMounts:
            - name: server-data
              mountPath: /root/.ollama
      volumes:
        # Node-local storage: data lives on whichever node runs the pod and
        # does not follow the pod to another node.
        - name: server-data
          hostPath:
            path: /var/lib/ollama
            type: DirectoryOrCreate
---
# Service exposing the server pods on port 11434.
apiVersion: v1
kind: Service
metadata:
  name: server
  labels:
    app.kubernetes.io/component: server
spec:
  selector:
    app.kubernetes.io/component: server
  ports:
    - name: http
      port: 11434
      # Resolves to the container port named `http` in the Deployment.
      targetPort: http