CoreWeave SDK Patterns
Overview
CoreWeave is Kubernetes-native -- use kubectl, Kubernetes Python client, or Helm for programmatic management. These patterns cover GPU-aware deployment templates, inference client wrappers, and node affinity configurations.
Instructions
GPU Affinity Helper
# coreweave_helpers.py
from dataclasses import dataclass
@dataclass
class GPUConfig:
gpu_class: str # A100_PCIE_80GB, H100_SXM5, L40, etc.
gpu_count: int = 1
memory_gb: int = 32
cpu_cores: int = 4
GPU_CATALOG = {
"a100-80gb": GPUConfig("A100_PCIE_80GB", memory_gb=48, cpu_cores=8),
"h100-80gb": GPUConfig("H100_SXM5", memory_gb=64, cpu_cores=12),
"l40": GPUConfig("L40", memory_gb=24, cpu_cores=4),
"a100-8x": GPUConfig("A100_NVLINK_A100_SXM4_80GB", gpu_count=8, memory_gb=256, cpu_cores=64),
}
def gpu_affinity_block(gpu_class: str) -> dict:
return {
"nodeAffinity": {
"requiredDuringSchedulingIgnoredDuringExecution": {
"nodeSelectorTerms": [{
"matchExpressions": [{
"key": "gpu.nvidia.com/class",
"operator": "In",
"values": [gpu_class],
}]
}]
}
}
}
def gpu_resources(config: GPUConfig) -> dict:
return {
"limits": {
"nvidia.com/gpu": str(config.gpu_count),
"memory": f"{config.memory_gb}Gi",
"cpu": str(config.cpu_cores),
},
"requests": {
"nvidia.com/gpu": str(config.gpu_count),
"memory": f"{config.memory_gb // 2}Gi",
"cpu": str(config.cpu_cores // 2),
},
}
Inference Client Wrapper
# inference_client.py
import requests
from typing import Optional
class CoreWeaveInferenceClient:
def __init__(self, endpoint: str, timeout: int = 30):
self.endpoint = endpoint.rstrip("/")
self.timeout = timeout
self.session = requests.Session()
def generate(self, prompt: str, max_tokens: int = 256, **kwargs) -> str:
resp = self.session.post(
f"{self.endpoint}/v1/completions",
json={"prompt": prompt, "max_tokens": max_tokens, **kwargs},
timeout=self.timeout,
)
resp.raise_for_status()
return resp.json()["choices"][0]["text"]
def chat(self, messages: list[dict], **kwargs) -> str:
resp = self.session.post(
f"{self.endpoint}/v1/chat/completions",
json={"messages": messages, **kwargs},
timeout=self.timeout,
)
resp.raise_for_status()
return resp.json()["choices"][0]["message"]["content"]
def health(self) -> bool:
try:
resp = self.session.get(f"{self.endpoint}/health", timeout=5)
return resp.status_code == 200
except Exception:
return False
Deployment Template Generator
import yaml
def generate_inference_deployment(
name: str,
image: str,
gpu_type: str = "a100-80gb",
replicas: int = 1,
port: int = 8000,
) -> str:
config = GPU_CATALOG[gpu_type]
return yaml.dump({
"apiVersion": "apps/v1",
"kind": "Deployment",
"metadata": {"name": name},
"spec": {
"replicas": replicas,
"selector": {"matchLabels": {"app": name}},
"template": {
"metadata": {"labels": {"app": name}},
"spec": {
"containers": [{
"name": name,
"image": image,
"ports": [{"containerPort": port}],
"resources": gpu_resources(config),
}],
"affinity": gpu_affinity_block(config.gpu_class),
},
},
},
})
Error Handling
| Error | Cause | Solution |
|-------|-------|----------|
| GPU class not found | Typo in node label | Use exact values from gpu.nvidia.com/class |
| OOM on inference | Model too large for GPU | Use larger GPU or quantized model |
| Connection refused | Service not ready | Check pod readiness probe |
Resources
Next Steps
Apply patterns in coreweave-core-workflow-a for KServe inference deployments.