---
# Ollama server Deployment.
# Runs a single pod of the `ollama/ollama` image on an amd64 node using the
# `nvidia` RuntimeClass, exposes port 11434 on the node via hostPort, and
# stores Ollama data on the node under /var/lib/ollama.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: server
spec:
  replicas: 1
  # Recreate terminates the existing pod before starting its replacement.
  # NOTE(review): presumably required because the hostPort and the GPUs
  # cannot be held by an old and a new pod at the same time - confirm.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/component: server
  template:
    metadata:
      labels:
        app.kubernetes.io/component: server
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  # Image only supports amd64
                  - key: kubernetes.io/arch
                    operator: In
                    values:
                      - amd64
      # References the PriorityClass declared in the next document.
      priorityClassName: high-priority
      # A RuntimeClass named `nvidia` must already exist in the cluster;
      # it is not defined in this manifest.
      runtimeClassName: nvidia
      containers:
        - name: server
          # NOTE(review): no tag is given, so this resolves to `:latest`.
          # Consider pinning a version for reproducible rollouts.
          image: ollama/ollama
          env:
            # Quoted so the value is always read as a string.
            # NOTE(review): presumably how long models stay loaded -
            # confirm against the Ollama documentation.
            - name: OLLAMA_KEEP_ALIVE
              value: "12h"
            # Disabled AMD GPU settings, kept for reference:
            # - name: HSA_OVERRIDE_GFX_VERSION
            #   value: 10.1.0
            # - name: HSA_ENABLE_SDMA
            #   value: "0"
          resources:
            requests:
              memory: 4Gi
              cpu: 2000m
            limits:
              memory: 4Gi
              cpu: 3000m
              # Extended resources such as GPUs are declared under limits.
              nvidia.com/gpu: "2"
              # amd.com/gpu: "1"
          ports:
            # hostPort publishes the container port on the node itself.
            - name: http
              hostPort: 11434
              containerPort: 11434
          volumeMounts:
            - name: server-data
              mountPath: /root/.ollama
      volumes:
        # Node-local storage; the directory is created if it is missing.
        - name: server-data
          hostPath:
            path: /var/lib/ollama
            type: DirectoryOrCreate
---
# PriorityClass referenced by the server Deployment's priorityClassName.
# globalDefault is false, so pods must opt in by name.
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: high-priority
value: 1000000
globalDefault: false
---
# Service in front of the server pods (no type set, so the API default
# applies); forwards port 11434 to the container port named `http`.
apiVersion: v1
kind: Service
metadata:
  name: server
  labels:
    app.kubernetes.io/component: server
spec:
  selector:
    app.kubernetes.io/component: server
  ports:
    - name: http
      port: 11434
      targetPort: http
---
# NodePort Service selecting the same server pods; publishes the container
# port named `http` on node port 31002.
apiVersion: v1
kind: Service
metadata:
  name: nodeport
  labels:
    app.kubernetes.io/component: nodeport
spec:
  type: NodePort
  selector:
    app.kubernetes.io/component: server
  ports:
    - targetPort: http
      port: 11434
      nodePort: 31002