From 66c4d8f118412124d7729ce2f7303339965b59d9 Mon Sep 17 00:00:00 2001 From: vishalkadam47 Date: Fri, 8 Nov 2024 03:32:33 +0530 Subject: [PATCH] refactor: gpu setup and status checks, extract functions, and improve error handling --- .../settings/servers/gpu-support.tsx | 105 ++-- apps/dokploy/server/api/routers/settings.ts | 6 +- packages/server/src/utils/gpu-setup.ts | 473 ++++++++++-------- 3 files changed, 329 insertions(+), 255 deletions(-) diff --git a/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx b/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx index d0c178c4f..e89a9b66f 100644 --- a/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx +++ b/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx @@ -10,7 +10,7 @@ import { } from "@/components/ui/card"; import { api } from "@/utils/api"; import { TRPCClientError } from "@trpc/client"; -import { CheckCircle2, Cpu, Loader2, XCircle } from "lucide-react"; +import { CheckCircle2, Cpu, Loader2, RefreshCw, XCircle } from "lucide-react"; import { useState } from "react"; import { toast } from "sonner"; @@ -20,16 +20,19 @@ interface GPUSupportProps { export function GPUSupport({ serverId }: GPUSupportProps) { const [isLoading, setIsLoading] = useState(false); + const [isRefreshing, setIsRefreshing] = useState(false); const utils = api.useContext(); - const { data: gpuStatus, isLoading: isChecking } = - api.settings.checkGPUStatus.useQuery( - { serverId }, - { - enabled: serverId !== undefined, - refetchInterval: 5000, - }, - ); + const { + data: gpuStatus, + isLoading: isChecking, + refetch, + } = api.settings.checkGPUStatus.useQuery( + { serverId }, + { + enabled: serverId !== undefined, + }, + ); const setupGPU = api.settings.setupGPU.useMutation({ onMutate: () => { @@ -41,31 +44,20 @@ export function GPUSupport({ serverId }: GPUSupportProps) { await utils.settings.checkGPUStatus.invalidate({ serverId }); }, onError: (error) => { - if (error instanceof TRPCClientError) { - const errorMessage = error.message; - if ( - errorMessage.includes( - "Permission denied. Please ensure proper sudo access.", - ) || - errorMessage.includes("sudo access required") - ) { - toast.error( - "Administrator privileges required. Please enter your password when prompted.", - ); - } else if (errorMessage.includes("Failed to configure GPU")) { - toast.error( - "GPU configuration failed. Please check system requirements.", - ); - } else { - toast.error(errorMessage); - } - } else { - toast.error("Failed to enable GPU support. Please check server logs."); - } + toast.error( + error.message || + "Failed to enable GPU support. Please check server logs.", + ); setIsLoading(false); }, }); + const handleRefresh = async () => { + setIsRefreshing(true); + await refetch(); + setIsRefreshing(false); + }; + const handleEnableGPU = async () => { if (serverId === undefined) { toast.error("No server selected"); @@ -94,22 +86,33 @@ export function GPUSupport({ serverId }: GPUSupportProps) { Configure and monitor GPU support - - + + - + @@ -117,9 +120,17 @@ export function GPUSupport({ serverId }: GPUSupportProps) {
System Requirements:
diff --git a/apps/dokploy/server/api/routers/settings.ts b/apps/dokploy/server/api/routers/settings.ts index a700c82ad..56b6431ce 100644 --- a/apps/dokploy/server/api/routers/settings.ts +++ b/apps/dokploy/server/api/routers/settings.ts @@ -665,8 +665,8 @@ export const settingsRouter = createTRPCRouter({ }), ) .mutation(async ({ input }) => { - if (IS_CLOUD) { - throw new Error("GPU setup is not available in cloud mode"); + if (IS_CLOUD && !input.serverId) { + throw new Error("Select a server to enable the GPU Setup"); } try { @@ -684,7 +684,7 @@ export const settingsRouter = createTRPCRouter({ }), ) .query(async ({ input }) => { - if (IS_CLOUD) { + if (IS_CLOUD && !input.serverId) { return { driverInstalled: false, driverVersion: undefined, diff --git a/packages/server/src/utils/gpu-setup.ts b/packages/server/src/utils/gpu-setup.ts index ecdb3e2b9..bb366762a 100644 --- a/packages/server/src/utils/gpu-setup.ts +++ b/packages/server/src/utils/gpu-setup.ts @@ -18,117 +18,21 @@ interface GPUInfo { export async function checkGPUStatus(serverId?: string): Promise { try { - // Check NVIDIA Driver - let driverInstalled = false; - let driverVersion: string | undefined; - let availableGPUs = 0; - - try { - const driverCommand = - "nvidia-smi --query-gpu=driver_version --format=csv,noheader"; - const { stdout: nvidiaSmi } = serverId - ? await execAsyncRemote(serverId, driverCommand) - : await execAsync(driverCommand); - - driverVersion = nvidiaSmi.trim(); - if (driverVersion) { - driverInstalled = true; - const countCommand = - "nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l"; - const { stdout: gpuCount } = serverId - ? await execAsyncRemote(serverId, countCommand) - : await execAsync(countCommand); - - availableGPUs = Number.parseInt(gpuCount.trim(), 10); - } - } catch (error) { - console.debug("GPU driver check:", error); - } - - // Check Runtime Configuration - let runtimeInstalled = false; - let runtimeConfigured = false; - try { - const runtimeCommand = 'docker info --format "{{json .Runtimes}}"'; - const { stdout: runtimeInfo } = serverId - ? await execAsyncRemote(serverId, runtimeCommand) - : await execAsync(runtimeCommand); - - const runtimes = JSON.parse(runtimeInfo); - runtimeInstalled = "nvidia" in runtimes; - - // Check if it's the default runtime - const defaultCommand = 'docker info --format "{{.DefaultRuntime}}"'; - const { stdout: defaultRuntime } = serverId - ? await execAsyncRemote(serverId, defaultCommand) - : await execAsync(defaultCommand); - - runtimeConfigured = defaultRuntime.trim() === "nvidia"; - } catch (error) { - console.debug("Runtime check:", error); - } - - // Check Swarm GPU Resources - let swarmEnabled = false; - let gpuResources = 0; - - try { - // Check node resources directly from inspect - const nodeCommand = - "docker node inspect self --format '{{json .Description.Resources.GenericResources}}'"; - const { stdout: resources } = serverId - ? await execAsyncRemote(serverId, nodeCommand) - : await execAsync(nodeCommand); - - if (resources && resources !== "null") { - const genericResources = JSON.parse(resources); - for (const resource of genericResources) { - if ( - resource.DiscreteResourceSpec && - (resource.DiscreteResourceSpec.Kind === "GPU" || - resource.DiscreteResourceSpec.Kind === "gpu") - ) { - gpuResources = resource.DiscreteResourceSpec.Value; - swarmEnabled = true; - break; - } - } - } - } catch (error) { - console.debug("Swarm resource check:", error); - } - - // Get GPU Model and Memory Info - const gpuInfoCommand = - "nvidia-smi --query-gpu=gpu_name,memory.total --format=csv,noheader"; - const { stdout: gpuInfo } = serverId - ? await execAsyncRemote(serverId, gpuInfoCommand) - : await execAsync(gpuInfoCommand); - - const [gpuModel, memoryTotal] = gpuInfo.split(",").map((s) => s.trim()); - - // Check CUDA Support - const cudaCommand = 'nvidia-smi -q | grep "CUDA Version"'; - const { stdout: cudaInfo } = serverId - ? await execAsyncRemote(serverId, cudaCommand) - : await execAsync(cudaCommand); - - const cudaMatch = cudaInfo.match(/CUDA Version\s*:\s*([\d\.]+)/); - const cudaVersion = cudaMatch ? cudaMatch[1] : undefined; - const cudaSupport = !!cudaVersion; + const [driverInfo, runtimeInfo, swarmInfo, gpuInfo, cudaInfo] = + await Promise.all([ + checkGpuDriver(serverId), + checkRuntime(serverId), + checkSwarmResources(serverId), + checkGpuInfo(serverId), + checkCudaSupport(serverId), + ]); return { - driverInstalled, - driverVersion, - runtimeInstalled, - runtimeConfigured, - availableGPUs, - swarmEnabled, - gpuResources, - gpuModel, - memoryInfo: memoryTotal, - cudaSupport, - cudaVersion, + ...driverInfo, + ...runtimeInfo, + ...swarmInfo, + ...gpuInfo, + ...cudaInfo, }; } catch (error) { console.error("Error in checkGPUStatus:", error); @@ -148,118 +52,167 @@ export async function checkGPUStatus(serverId?: string): Promise { } } +const checkGpuDriver = async (serverId?: string) => { + let driverVersion: string | undefined; + let driverInstalled = false; + let availableGPUs = 0; + + try { + const driverCommand = + "nvidia-smi --query-gpu=driver_version --format=csv,noheader"; + const { stdout: nvidiaSmi } = serverId + ? await execAsyncRemote(serverId, driverCommand) + : await execAsync(driverCommand); + + driverVersion = nvidiaSmi.trim(); + if (driverVersion) { + driverInstalled = true; + const countCommand = + "nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l"; + const { stdout: gpuCount } = serverId + ? await execAsyncRemote(serverId, countCommand) + : await execAsync(countCommand); + + availableGPUs = Number.parseInt(gpuCount.trim(), 10); + } + } catch (error) { + console.debug("GPU driver check:", error); + } + + return { driverVersion, driverInstalled, availableGPUs }; +}; + +const checkRuntime = async (serverId?: string) => { + let runtimeInstalled = false; + let runtimeConfigured = false; + + try { + const runtimeCommand = 'docker info --format "{{json .Runtimes}}"'; + const { stdout: runtimeInfo } = serverId + ? await execAsyncRemote(serverId, runtimeCommand) + : await execAsync(runtimeCommand); + + const runtimes = JSON.parse(runtimeInfo); + runtimeInstalled = "nvidia" in runtimes; + + const defaultCommand = 'docker info --format "{{.DefaultRuntime}}"'; + const { stdout: defaultRuntime } = serverId + ? await execAsyncRemote(serverId, defaultCommand) + : await execAsync(defaultCommand); + + runtimeConfigured = defaultRuntime.trim() === "nvidia"; + } catch (error) { + console.debug("Runtime check:", error); + } + + return { runtimeInstalled, runtimeConfigured }; +}; + +const checkSwarmResources = async (serverId?: string) => { + let swarmEnabled = false; + let gpuResources = 0; + + try { + const nodeCommand = + "docker node inspect self --format '{{json .Description.Resources.GenericResources}}'"; + const { stdout: resources } = serverId + ? await execAsyncRemote(serverId, nodeCommand) + : await execAsync(nodeCommand); + + if (resources && resources !== "null") { + const genericResources = JSON.parse(resources); + for (const resource of genericResources) { + if ( + resource.DiscreteResourceSpec && + (resource.DiscreteResourceSpec.Kind === "GPU" || + resource.DiscreteResourceSpec.Kind === "gpu") + ) { + gpuResources = resource.DiscreteResourceSpec.Value; + swarmEnabled = true; + break; + } + } + } + } catch (error) { + console.debug("Swarm resource check:", error); + } + + return { swarmEnabled, gpuResources }; +}; + +const checkGpuInfo = async (serverId?: string) => { + let gpuModel: string | undefined; + let memoryInfo: string | undefined; + + try { + const gpuInfoCommand = + "nvidia-smi --query-gpu=gpu_name,memory.total --format=csv,noheader"; + const { stdout: gpuInfo } = serverId + ? await execAsyncRemote(serverId, gpuInfoCommand) + : await execAsync(gpuInfoCommand); + + [gpuModel, memoryInfo] = gpuInfo.split(",").map((s) => s.trim()); + } catch (error) { + console.debug("GPU info check:", error); + } + + return { gpuModel, memoryInfo }; +}; + +const checkCudaSupport = async (serverId?: string) => { + let cudaVersion: string | undefined; + let cudaSupport = false; + + try { + const cudaCommand = 'nvidia-smi -q | grep "CUDA Version"'; + const { stdout: cudaInfo } = serverId + ? await execAsyncRemote(serverId, cudaCommand) + : await execAsync(cudaCommand); + + const cudaMatch = cudaInfo.match(/CUDA Version\s*:\s*([\d\.]+)/); + cudaVersion = cudaMatch ? cudaMatch[1] : undefined; + cudaSupport = !!cudaVersion; + } catch (error) { + console.debug("CUDA support check:", error); + } + + return { cudaVersion, cudaSupport }; +}; + export async function setupGPUSupport(serverId?: string): Promise { try { - // 1. Check current GPU status first + // 1. Initial status check and validation const initialStatus = await checkGPUStatus(serverId); + const shouldContinue = await validatePrerequisites(initialStatus); + if (!shouldContinue) return; - // If GPU is already configured, just verify and return quickly - if ( - initialStatus.swarmEnabled && - initialStatus.runtimeConfigured && - initialStatus.driverInstalled - ) { - console.log("GPU already configured, skipping setup"); - return; - } + // 2. Get node ID + const nodeId = await getNodeId(serverId); - // 2. Verify GPU prerequisites - if (!initialStatus.driverInstalled || !initialStatus.runtimeInstalled) { - throw new Error( - "NVIDIA drivers or runtime not installed. Please install them first.", - ); - } + // 3. Create daemon configuration + const daemonConfig = createDaemonConfig(initialStatus.availableGPUs); - // Get the node ID - const nodeIdCommand = 'docker info --format "{{.Swarm.NodeID}}"'; - const { stdout: nodeId } = serverId - ? await execAsyncRemote(serverId, nodeIdCommand) - : await execAsync(nodeIdCommand); - - if (!nodeId.trim()) { - throw new Error("Setup Server before enabling GPU support"); - } - - // 3. Configure NVIDIA runtime in daemon.json - const daemonConfig = { - runtimes: { - nvidia: { - path: "nvidia-container-runtime", - runtimeArgs: [], - }, - }, - "default-runtime": "nvidia", - "node-generic-resources": [`GPU=${initialStatus.availableGPUs}`], - }; - - // Different commands for local and remote setup + // 4. Setup server based on environment if (serverId) { - // Remote server setup (using sudo) - const setupCommands = [ - "sudo -n true", - `echo '${JSON.stringify(daemonConfig, null, 2)}' | sudo tee /etc/docker/daemon.json`, - "sudo mkdir -p /etc/nvidia-container-runtime", - 'echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" | sudo tee -a /etc/nvidia-container-runtime/config.toml', - "sudo systemctl daemon-reload", - "sudo systemctl restart docker", - ].join(" && "); - - await execAsyncRemote(serverId, setupCommands); + await setupRemoteServer(serverId, daemonConfig); } else { - // Local server setup (using pkexec for GUI password prompt) - const configFile = `/tmp/docker-daemon-${Date.now()}.json`; - await fs.writeFile(configFile, JSON.stringify(daemonConfig, null, 2)); - - const setupCommands = [ - // Use pkexec for GUI password prompt - `pkexec sh -c ' - cp ${configFile} /etc/docker/daemon.json && - mkdir -p /etc/nvidia-container-runtime && - echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" >> /etc/nvidia-container-runtime/config.toml && - systemctl daemon-reload && - systemctl restart docker - '`, - `rm ${configFile}`, // Clean up temp file - ].join(" && "); - - await execAsync(setupCommands); + await setupLocalServer(daemonConfig); } - // 4. Reduced wait time for Docker restart - await new Promise((resolve) => setTimeout(resolve, 10000)); + // 5. Wait for Docker restart + await sleep(10000); - // 5. Add GPU label to the node - const labelCommand = `docker node update --label-add gpu=true ${nodeId.trim()}`; - if (serverId) { - await execAsyncRemote(serverId, labelCommand); - } else { - await execAsync(labelCommand); - } + // 6. Add GPU label + await addGpuLabel(nodeId, serverId); - // 6. Quick final verification - await new Promise((resolve) => setTimeout(resolve, 5000)); - const finalStatus = await checkGPUStatus(serverId); - - if (!finalStatus.swarmEnabled) { - const diagnosticCommands = [ - `docker node inspect ${nodeId.trim()}`, - 'nvidia-smi -a | grep "GPU UUID"', - "cat /etc/docker/daemon.json", - "cat /etc/nvidia-container-runtime/config.toml", - ].join(" && "); - - const { stdout: diagnostics } = serverId - ? await execAsyncRemote(serverId, diagnosticCommands) - : await execAsync(diagnosticCommands); - - console.error("Diagnostic Information:", diagnostics); - throw new Error("GPU support not detected in swarm after setup"); - } + // 7. Final verification + await sleep(5000); + const finalStatus = await verifySetup(nodeId, serverId); console.log("GPU setup completed successfully:", { availableGPUs: initialStatus.availableGPUs, driverVersion: initialStatus.driverVersion, - nodeId: nodeId.trim(), + nodeId, }); } catch (error) { console.error("GPU Setup Error:", error); @@ -274,3 +227,113 @@ export async function setupGPUSupport(serverId?: string): Promise { throw error; } } + +const validatePrerequisites = async (initialStatus: GPUInfo) => { + if (!initialStatus.driverInstalled) { + throw new Error( + "NVIDIA drivers not installed. Please install appropriate NVIDIA drivers first.", + ); + } + + if (!initialStatus.runtimeInstalled) { + throw new Error( + "NVIDIA Container Runtime not installed. Please install nvidia-container-runtime first.", + ); + } + + if (initialStatus.swarmEnabled && initialStatus.runtimeConfigured) { + console.log("GPU already configured, skipping setup"); + return false; + } + + return true; +}; + +const getNodeId = async (serverId?: string) => { + const nodeIdCommand = 'docker info --format "{{.Swarm.NodeID}}"'; + const { stdout: nodeId } = serverId + ? await execAsyncRemote(serverId, nodeIdCommand) + : await execAsync(nodeIdCommand); + + const trimmedNodeId = nodeId.trim(); + if (!trimmedNodeId) { + throw new Error("Setup Server before enabling GPU support"); + } + + return trimmedNodeId; +}; + +const createDaemonConfig = (availableGPUs: number) => ({ + runtimes: { + nvidia: { + path: "nvidia-container-runtime", + runtimeArgs: [], + }, + }, + "default-runtime": "nvidia", + "node-generic-resources": [`GPU=${availableGPUs}`], +}); + +const setupRemoteServer = async (serverId: string, daemonConfig: any) => { + const setupCommands = [ + "sudo -n true", + `echo '${JSON.stringify(daemonConfig, null, 2)}' | sudo tee /etc/docker/daemon.json`, + "sudo mkdir -p /etc/nvidia-container-runtime", + 'echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" | sudo tee -a /etc/nvidia-container-runtime/config.toml', + "sudo systemctl daemon-reload", + "sudo systemctl restart docker", + ].join(" && "); + + await execAsyncRemote(serverId, setupCommands); +}; + +const setupLocalServer = async (daemonConfig: any) => { + const configFile = `/tmp/docker-daemon-${Date.now()}.json`; + await fs.writeFile(configFile, JSON.stringify(daemonConfig, null, 2)); + + const setupCommands = [ + `pkexec sh -c ' + cp ${configFile} /etc/docker/daemon.json && + mkdir -p /etc/nvidia-container-runtime && + echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" >> /etc/nvidia-container-runtime/config.toml && + systemctl daemon-reload && + systemctl restart docker + '`, + `rm ${configFile}`, + ].join(" && "); + + await execAsync(setupCommands); +}; + +const addGpuLabel = async (nodeId: string, serverId?: string) => { + const labelCommand = `docker node update --label-add gpu=true ${nodeId}`; + if (serverId) { + await execAsyncRemote(serverId, labelCommand); + } else { + await execAsync(labelCommand); + } +}; + +const verifySetup = async (nodeId: string, serverId?: string) => { + const finalStatus = await checkGPUStatus(serverId); + + if (!finalStatus.swarmEnabled) { + const diagnosticCommands = [ + `docker node inspect ${nodeId}`, + 'nvidia-smi -a | grep "GPU UUID"', + "cat /etc/docker/daemon.json", + "cat /etc/nvidia-container-runtime/config.toml", + ].join(" && "); + + const { stdout: diagnostics } = serverId + ? await execAsyncRemote(serverId, diagnosticCommands) + : await execAsync(diagnosticCommands); + + console.error("Diagnostic Information:", diagnostics); + throw new Error("GPU support not detected in swarm after setup"); + } + + return finalStatus; +}; + +const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));