From 66c4d8f118412124d7729ce2f7303339965b59d9 Mon Sep 17 00:00:00 2001
From: vishalkadam47 <vishal@jeevops.com>
Date: Fri, 8 Nov 2024 03:32:33 +0530
Subject: [PATCH] refactor: gpu setup and status checks, extract functions, and
 improve error handling

---
 .../settings/servers/gpu-support.tsx          | 105 ++--
 apps/dokploy/server/api/routers/settings.ts   |   6 +-
 packages/server/src/utils/gpu-setup.ts        | 473 ++++++++++--------
 3 files changed, 329 insertions(+), 255 deletions(-)
diff --git a/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx b/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx
index d0c178c4f..e89a9b66f 100644
--- a/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx
+++ b/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx
@@ -10,7 +10,7 @@ import {
 } from "@/components/ui/card";
 import { api } from "@/utils/api";
 import { TRPCClientError } from "@trpc/client";
-import { CheckCircle2, Cpu, Loader2, XCircle } from "lucide-react";
+import { CheckCircle2, Cpu, Loader2, RefreshCw, XCircle } from "lucide-react";
 import { useState } from "react";
 import { toast } from "sonner";
 
@@ -20,16 +20,19 @@ interface GPUSupportProps {
 
 export function GPUSupport({ serverId }: GPUSupportProps) {
 	const [isLoading, setIsLoading] = useState(false);
+	const [isRefreshing, setIsRefreshing] = useState(false);
 	const utils = api.useContext();
 
-	const { data: gpuStatus, isLoading: isChecking } =
-		api.settings.checkGPUStatus.useQuery(
-			{ serverId },
-			{
-				enabled: serverId !== undefined,
-				refetchInterval: 5000,
-			},
-		);
+	const {
+		data: gpuStatus,
+		isLoading: isChecking,
+		refetch,
+	} = api.settings.checkGPUStatus.useQuery(
+		{ serverId },
+		{
+			enabled: serverId !== undefined,
+		},
+	);
 
 	const setupGPU = api.settings.setupGPU.useMutation({
 		onMutate: () => {
@@ -41,31 +44,20 @@ export function GPUSupport({ serverId }: GPUSupportProps) {
 			await utils.settings.checkGPUStatus.invalidate({ serverId });
 		},
 		onError: (error) => {
-			if (error instanceof TRPCClientError) {
-				const errorMessage = error.message;
-				if (
-					errorMessage.includes(
-						"Permission denied. Please ensure proper sudo access.",
-					) ||
-					errorMessage.includes("sudo access required")
-				) {
-					toast.error(
-						"Administrator privileges required. Please enter your password when prompted.",
-					);
-				} else if (errorMessage.includes("Failed to configure GPU")) {
-					toast.error(
-						"GPU configuration failed. Please check system requirements.",
-					);
-				} else {
-					toast.error(errorMessage);
-				}
-			} else {
-				toast.error("Failed to enable GPU support. Please check server logs.");
-			}
+			toast.error(
+				error.message ||
+					"Failed to enable GPU support. Please check server logs.",
+			);
 			setIsLoading(false);
 		},
 	});
 
+	const handleRefresh = async () => {
+		setIsRefreshing(true);
+		await refetch();
+		setIsRefreshing(false);
+	};
+
 	const handleEnableGPU = async () => {
 		if (serverId === undefined) {
 			toast.error("No server selected");
@@ -94,22 +86,33 @@ export function GPUSupport({ serverId }: GPUSupportProps) {
 									Configure and monitor GPU support
 								</CardDescription>
 							</div>
-							<DialogAction
-								title="Enable GPU Support?"
-								description="This will enable GPU support for Docker Swarm on this server. Make sure you have the required hardware and drivers installed."
-								onClick={handleEnableGPU}
-							>
-								<Button
-									isLoading={isLoading}
-									disabled={isLoading || serverId === undefined || isChecking}
+							<div className="flex items-center gap-2">
+								<DialogAction
+									title="Enable GPU Support?"
+									description="This will enable GPU support for Docker Swarm on this server. Make sure you have the required hardware and drivers installed."
+									onClick={handleEnableGPU}
 								>
-									{isLoading
-										? "Enabling GPU..."
-										: gpuStatus?.swarmEnabled
-											? "Reconfigure GPU"
-											: "Enable GPU"}
+									<Button
+										isLoading={isLoading}
+										disabled={isLoading || serverId === undefined || isChecking}
+									>
+										{isLoading
+											? "Enabling GPU..."
+											: gpuStatus?.swarmEnabled
+												? "Reconfigure GPU"
+												: "Enable GPU"}
+									</Button>
+								</DialogAction>
+								<Button
+									size="icon"
+									onClick={handleRefresh}
+									disabled={isChecking || isRefreshing}
+								>
+									<RefreshCw
+										className={`h-5 w-5 ${isChecking || isRefreshing ? "animate-spin" : ""}`}
+									/>
 								</Button>
-							</DialogAction>
+							</div>
 						</div>
 					</CardHeader>
 
@@ -117,9 +120,17 @@ export function GPUSupport({ serverId }: GPUSupportProps) {
 						<AlertBlock type="info">
 							<div className="font-medium mb-2">System Requirements:</div>
 							<ul className="list-disc list-inside text-sm space-y-1">
-								<li>NVIDIA drivers must be installed on the host system</li>
-								<li>NVIDIA Container Runtime is required for GPU support</li>
-								<li>Compatible GPU hardware must be present</li>
+								<li>NVIDIA GPU hardware must be physically installed</li>
+								<li>
+									NVIDIA drivers must be installed and running (check with
+									nvidia-smi)
+								</li>
+								<li>
+									NVIDIA Container Runtime must be installed
+									(nvidia-container-runtime)
+								</li>
+								<li>User must have sudo/administrative privileges</li>
+								<li>System must support CUDA for GPU acceleration</li>
 							</ul>
 						</AlertBlock>
 
diff --git a/apps/dokploy/server/api/routers/settings.ts b/apps/dokploy/server/api/routers/settings.ts
index a700c82ad..56b6431ce 100644
--- a/apps/dokploy/server/api/routers/settings.ts
+++ b/apps/dokploy/server/api/routers/settings.ts
@@ -665,8 +665,8 @@ export const settingsRouter = createTRPCRouter({
 			}),
 		)
 		.mutation(async ({ input }) => {
-			if (IS_CLOUD) {
-				throw new Error("GPU setup is not available in cloud mode");
+			if (IS_CLOUD && !input.serverId) {
+				throw new Error("Select a server to enable the GPU Setup");
 			}
 
 			try {
@@ -684,7 +684,7 @@ export const settingsRouter = createTRPCRouter({
 			}),
 		)
 		.query(async ({ input }) => {
-			if (IS_CLOUD) {
+			if (IS_CLOUD && !input.serverId) {
 				return {
 					driverInstalled: false,
 					driverVersion: undefined,
diff --git a/packages/server/src/utils/gpu-setup.ts b/packages/server/src/utils/gpu-setup.ts
index ecdb3e2b9..bb366762a 100644
--- a/packages/server/src/utils/gpu-setup.ts
+++ b/packages/server/src/utils/gpu-setup.ts
@@ -18,117 +18,21 @@ interface GPUInfo {
 
 export async function checkGPUStatus(serverId?: string): Promise<GPUInfo> {
 	try {
-		// Check NVIDIA Driver
-		let driverInstalled = false;
-		let driverVersion: string | undefined;
-		let availableGPUs = 0;
-
-		try {
-			const driverCommand =
-				"nvidia-smi --query-gpu=driver_version --format=csv,noheader";
-			const { stdout: nvidiaSmi } = serverId
-				? await execAsyncRemote(serverId, driverCommand)
-				: await execAsync(driverCommand);
-
-			driverVersion = nvidiaSmi.trim();
-			if (driverVersion) {
-				driverInstalled = true;
-				const countCommand =
-					"nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l";
-				const { stdout: gpuCount } = serverId
-					? await execAsyncRemote(serverId, countCommand)
-					: await execAsync(countCommand);
-
-				availableGPUs = Number.parseInt(gpuCount.trim(), 10);
-			}
-		} catch (error) {
-			console.debug("GPU driver check:", error);
-		}
-
-		// Check Runtime Configuration
-		let runtimeInstalled = false;
-		let runtimeConfigured = false;
-		try {
-			const runtimeCommand = 'docker info --format "{{json .Runtimes}}"';
-			const { stdout: runtimeInfo } = serverId
-				? await execAsyncRemote(serverId, runtimeCommand)
-				: await execAsync(runtimeCommand);
-
-			const runtimes = JSON.parse(runtimeInfo);
-			runtimeInstalled = "nvidia" in runtimes;
-
-			// Check if it's the default runtime
-			const defaultCommand = 'docker info --format "{{.DefaultRuntime}}"';
-			const { stdout: defaultRuntime } = serverId
-				? await execAsyncRemote(serverId, defaultCommand)
-				: await execAsync(defaultCommand);
-
-			runtimeConfigured = defaultRuntime.trim() === "nvidia";
-		} catch (error) {
-			console.debug("Runtime check:", error);
-		}
-
-		// Check Swarm GPU Resources
-		let swarmEnabled = false;
-		let gpuResources = 0;
-
-		try {
-			// Check node resources directly from inspect
-			const nodeCommand =
-				"docker node inspect self --format '{{json .Description.Resources.GenericResources}}'";
-			const { stdout: resources } = serverId
-				? await execAsyncRemote(serverId, nodeCommand)
-				: await execAsync(nodeCommand);
-
-			if (resources && resources !== "null") {
-				const genericResources = JSON.parse(resources);
-				for (const resource of genericResources) {
-					if (
-						resource.DiscreteResourceSpec &&
-						(resource.DiscreteResourceSpec.Kind === "GPU" ||
-							resource.DiscreteResourceSpec.Kind === "gpu")
-					) {
-						gpuResources = resource.DiscreteResourceSpec.Value;
-						swarmEnabled = true;
-						break;
-					}
-				}
-			}
-		} catch (error) {
-			console.debug("Swarm resource check:", error);
-		}
-
-		// Get GPU Model and Memory Info
-		const gpuInfoCommand =
-			"nvidia-smi --query-gpu=gpu_name,memory.total --format=csv,noheader";
-		const { stdout: gpuInfo } = serverId
-			? await execAsyncRemote(serverId, gpuInfoCommand)
-			: await execAsync(gpuInfoCommand);
-
-		const [gpuModel, memoryTotal] = gpuInfo.split(",").map((s) => s.trim());
-
-		// Check CUDA Support
-		const cudaCommand = 'nvidia-smi -q | grep "CUDA Version"';
-		const { stdout: cudaInfo } = serverId
-			? await execAsyncRemote(serverId, cudaCommand)
-			: await execAsync(cudaCommand);
-
-		const cudaMatch = cudaInfo.match(/CUDA Version\s*:\s*([\d\.]+)/);
-		const cudaVersion = cudaMatch ? cudaMatch[1] : undefined;
-		const cudaSupport = !!cudaVersion;
+		const [driverInfo, runtimeInfo, swarmInfo, gpuInfo, cudaInfo] =
+			await Promise.all([
+				checkGpuDriver(serverId),
+				checkRuntime(serverId),
+				checkSwarmResources(serverId),
+				checkGpuInfo(serverId),
+				checkCudaSupport(serverId),
+			]);
 
 		return {
-			driverInstalled,
-			driverVersion,
-			runtimeInstalled,
-			runtimeConfigured,
-			availableGPUs,
-			swarmEnabled,
-			gpuResources,
-			gpuModel,
-			memoryInfo: memoryTotal,
-			cudaSupport,
-			cudaVersion,
+			...driverInfo,
+			...runtimeInfo,
+			...swarmInfo,
+			...gpuInfo,
+			...cudaInfo,
 		};
 	} catch (error) {
 		console.error("Error in checkGPUStatus:", error);
@@ -148,118 +52,167 @@ export async function checkGPUStatus(serverId?: string): Promise<GPUInfo> {
 	}
 }
 
+const checkGpuDriver = async (serverId?: string) => {
+	let driverVersion: string | undefined;
+	let driverInstalled = false;
+	let availableGPUs = 0;
+
+	try {
+		const driverCommand =
+			"nvidia-smi --query-gpu=driver_version --format=csv,noheader";
+		const { stdout: nvidiaSmi } = serverId
+			? await execAsyncRemote(serverId, driverCommand)
+			: await execAsync(driverCommand);
+
+		driverVersion = nvidiaSmi.trim();
+		if (driverVersion) {
+			driverInstalled = true;
+			const countCommand =
+				"nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l";
+			const { stdout: gpuCount } = serverId
+				? await execAsyncRemote(serverId, countCommand)
+				: await execAsync(countCommand);
+
+			availableGPUs = Number.parseInt(gpuCount.trim(), 10);
+		}
+	} catch (error) {
+		console.debug("GPU driver check:", error);
+	}
+
+	return { driverVersion, driverInstalled, availableGPUs };
+};
+
+const checkRuntime = async (serverId?: string) => {
+	let runtimeInstalled = false;
+	let runtimeConfigured = false;
+
+	try {
+		const runtimeCommand = 'docker info --format "{{json .Runtimes}}"';
+		const { stdout: runtimeInfo } = serverId
+			? await execAsyncRemote(serverId, runtimeCommand)
+			: await execAsync(runtimeCommand);
+
+		const runtimes = JSON.parse(runtimeInfo);
+		runtimeInstalled = "nvidia" in runtimes;
+
+		const defaultCommand = 'docker info --format "{{.DefaultRuntime}}"';
+		const { stdout: defaultRuntime } = serverId
+			? await execAsyncRemote(serverId, defaultCommand)
+			: await execAsync(defaultCommand);
+
+		runtimeConfigured = defaultRuntime.trim() === "nvidia";
+	} catch (error) {
+		console.debug("Runtime check:", error);
+	}
+
+	return { runtimeInstalled, runtimeConfigured };
+};
+
+const checkSwarmResources = async (serverId?: string) => {
+	let swarmEnabled = false;
+	let gpuResources = 0;
+
+	try {
+		const nodeCommand =
+			"docker node inspect self --format '{{json .Description.Resources.GenericResources}}'";
+		const { stdout: resources } = serverId
+			? await execAsyncRemote(serverId, nodeCommand)
+			: await execAsync(nodeCommand);
+
+		if (resources && resources !== "null") {
+			const genericResources = JSON.parse(resources);
+			for (const resource of genericResources) {
+				if (
+					resource.DiscreteResourceSpec &&
+					(resource.DiscreteResourceSpec.Kind === "GPU" ||
+						resource.DiscreteResourceSpec.Kind === "gpu")
+				) {
+					gpuResources = resource.DiscreteResourceSpec.Value;
+					swarmEnabled = true;
+					break;
+				}
+			}
+		}
+	} catch (error) {
+		console.debug("Swarm resource check:", error);
+	}
+
+	return { swarmEnabled, gpuResources };
+};
+
+const checkGpuInfo = async (serverId?: string) => {
+	let gpuModel: string | undefined;
+	let memoryInfo: string | undefined;
+
+	try {
+		const gpuInfoCommand =
+			"nvidia-smi --query-gpu=gpu_name,memory.total --format=csv,noheader";
+		const { stdout: gpuInfo } = serverId
+			? await execAsyncRemote(serverId, gpuInfoCommand)
+			: await execAsync(gpuInfoCommand);
+
+		[gpuModel, memoryInfo] = gpuInfo.split(",").map((s) => s.trim());
+	} catch (error) {
+		console.debug("GPU info check:", error);
+	}
+
+	return { gpuModel, memoryInfo };
+};
+
+const checkCudaSupport = async (serverId?: string) => {
+	let cudaVersion: string | undefined;
+	let cudaSupport = false;
+
+	try {
+		const cudaCommand = 'nvidia-smi -q | grep "CUDA Version"';
+		const { stdout: cudaInfo } = serverId
+			? await execAsyncRemote(serverId, cudaCommand)
+			: await execAsync(cudaCommand);
+
+		const cudaMatch = cudaInfo.match(/CUDA Version\s*:\s*([\d\.]+)/);
+		cudaVersion = cudaMatch ? cudaMatch[1] : undefined;
+		cudaSupport = !!cudaVersion;
+	} catch (error) {
+		console.debug("CUDA support check:", error);
+	}
+
+	return { cudaVersion, cudaSupport };
+};
+
 export async function setupGPUSupport(serverId?: string): Promise<void> {
 	try {
-		// 1. Check current GPU status first
+		// 1. Initial status check and validation
 		const initialStatus = await checkGPUStatus(serverId);
+		const shouldContinue = await validatePrerequisites(initialStatus);
+		if (!shouldContinue) return;
 
-		// If GPU is already configured, just verify and return quickly
-		if (
-			initialStatus.swarmEnabled &&
-			initialStatus.runtimeConfigured &&
-			initialStatus.driverInstalled
-		) {
-			console.log("GPU already configured, skipping setup");
-			return;
-		}
+		// 2. Get node ID
+		const nodeId = await getNodeId(serverId);
 
-		// 2. Verify GPU prerequisites
-		if (!initialStatus.driverInstalled || !initialStatus.runtimeInstalled) {
-			throw new Error(
-				"NVIDIA drivers or runtime not installed. Please install them first.",
-			);
-		}
+		// 3. Create daemon configuration
+		const daemonConfig = createDaemonConfig(initialStatus.availableGPUs);
 
-		// Get the node ID
-		const nodeIdCommand = 'docker info --format "{{.Swarm.NodeID}}"';
-		const { stdout: nodeId } = serverId
-			? await execAsyncRemote(serverId, nodeIdCommand)
-			: await execAsync(nodeIdCommand);
-
-		if (!nodeId.trim()) {
-			throw new Error("Setup Server before enabling GPU support");
-		}
-
-		// 3. Configure NVIDIA runtime in daemon.json
-		const daemonConfig = {
-			runtimes: {
-				nvidia: {
-					path: "nvidia-container-runtime",
-					runtimeArgs: [],
-				},
-			},
-			"default-runtime": "nvidia",
-			"node-generic-resources": [`GPU=${initialStatus.availableGPUs}`],
-		};
-
-		// Different commands for local and remote setup
+		// 4. Setup server based on environment
 		if (serverId) {
-			// Remote server setup (using sudo)
-			const setupCommands = [
-				"sudo -n true",
-				`echo '${JSON.stringify(daemonConfig, null, 2)}' | sudo tee /etc/docker/daemon.json`,
-				"sudo mkdir -p /etc/nvidia-container-runtime",
-				'echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" | sudo tee -a /etc/nvidia-container-runtime/config.toml',
-				"sudo systemctl daemon-reload",
-				"sudo systemctl restart docker",
-			].join(" && ");
-
-			await execAsyncRemote(serverId, setupCommands);
+			await setupRemoteServer(serverId, daemonConfig);
 		} else {
-			// Local server setup (using pkexec for GUI password prompt)
-			const configFile = `/tmp/docker-daemon-${Date.now()}.json`;
-			await fs.writeFile(configFile, JSON.stringify(daemonConfig, null, 2));
-
-			const setupCommands = [
-				// Use pkexec for GUI password prompt
-				`pkexec sh -c '
-					cp ${configFile} /etc/docker/daemon.json && 
-					mkdir -p /etc/nvidia-container-runtime && 
-					echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" >> /etc/nvidia-container-runtime/config.toml && 
-					systemctl daemon-reload && 
-					systemctl restart docker
-				'`,
-				`rm ${configFile}`, // Clean up temp file
-			].join(" && ");
-
-			await execAsync(setupCommands);
+			await setupLocalServer(daemonConfig);
 		}
 
-		// 4. Reduced wait time for Docker restart
-		await new Promise((resolve) => setTimeout(resolve, 10000));
+		// 5. Wait for Docker restart
+		await sleep(10000);
 
-		// 5. Add GPU label to the node
-		const labelCommand = `docker node update --label-add gpu=true ${nodeId.trim()}`;
-		if (serverId) {
-			await execAsyncRemote(serverId, labelCommand);
-		} else {
-			await execAsync(labelCommand);
-		}
+		// 6. Add GPU label
+		await addGpuLabel(nodeId, serverId);
 
-		// 6. Quick final verification
-		await new Promise((resolve) => setTimeout(resolve, 5000));
-		const finalStatus = await checkGPUStatus(serverId);
-
-		if (!finalStatus.swarmEnabled) {
-			const diagnosticCommands = [
-				`docker node inspect ${nodeId.trim()}`,
-				'nvidia-smi -a | grep "GPU UUID"',
-				"cat /etc/docker/daemon.json",
-				"cat /etc/nvidia-container-runtime/config.toml",
-			].join(" && ");
-
-			const { stdout: diagnostics } = serverId
-				? await execAsyncRemote(serverId, diagnosticCommands)
-				: await execAsync(diagnosticCommands);
-
-			console.error("Diagnostic Information:", diagnostics);
-			throw new Error("GPU support not detected in swarm after setup");
-		}
+		// 7. Final verification
+		await sleep(5000);
+		const finalStatus = await verifySetup(nodeId, serverId);
 
 		console.log("GPU setup completed successfully:", {
 			availableGPUs: initialStatus.availableGPUs,
 			driverVersion: initialStatus.driverVersion,
-			nodeId: nodeId.trim(),
+			nodeId,
 		});
 	} catch (error) {
 		console.error("GPU Setup Error:", error);
@@ -274,3 +227,113 @@ export async function setupGPUSupport(serverId?: string): Promise<void> {
 		throw error;
 	}
 }
+
+const validatePrerequisites = async (initialStatus: GPUInfo) => {
+	if (!initialStatus.driverInstalled) {
+		throw new Error(
+			"NVIDIA drivers not installed. Please install appropriate NVIDIA drivers first.",
+		);
+	}
+
+	if (!initialStatus.runtimeInstalled) {
+		throw new Error(
+			"NVIDIA Container Runtime not installed. Please install nvidia-container-runtime first.",
+		);
+	}
+
+	if (initialStatus.swarmEnabled && initialStatus.runtimeConfigured) {
+		console.log("GPU already configured, skipping setup");
+		return false;
+	}
+
+	return true;
+};
+
+const getNodeId = async (serverId?: string) => {
+	const nodeIdCommand = 'docker info --format "{{.Swarm.NodeID}}"';
+	const { stdout: nodeId } = serverId
+		? await execAsyncRemote(serverId, nodeIdCommand)
+		: await execAsync(nodeIdCommand);
+
+	const trimmedNodeId = nodeId.trim();
+	if (!trimmedNodeId) {
+		throw new Error("Setup Server before enabling GPU support");
+	}
+
+	return trimmedNodeId;
+};
+
+const createDaemonConfig = (availableGPUs: number) => ({
+	runtimes: {
+		nvidia: {
+			path: "nvidia-container-runtime",
+			runtimeArgs: [],
+		},
+	},
+	"default-runtime": "nvidia",
+	"node-generic-resources": [`GPU=${availableGPUs}`],
+});
+
+const setupRemoteServer = async (serverId: string, daemonConfig: any) => {
+	const setupCommands = [
+		"sudo -n true",
+		`echo '${JSON.stringify(daemonConfig, null, 2)}' | sudo tee /etc/docker/daemon.json`,
+		"sudo mkdir -p /etc/nvidia-container-runtime",
+		'echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" | sudo tee -a /etc/nvidia-container-runtime/config.toml',
+		"sudo systemctl daemon-reload",
+		"sudo systemctl restart docker",
+	].join(" && ");
+
+	await execAsyncRemote(serverId, setupCommands);
+};
+
+const setupLocalServer = async (daemonConfig: any) => {
+	const configFile = `/tmp/docker-daemon-${Date.now()}.json`;
+	await fs.writeFile(configFile, JSON.stringify(daemonConfig, null, 2));
+
+	const setupCommands = [
+		`pkexec sh -c '
+			cp ${configFile} /etc/docker/daemon.json && 
+			mkdir -p /etc/nvidia-container-runtime && 
+			echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" >> /etc/nvidia-container-runtime/config.toml && 
+			systemctl daemon-reload && 
+			systemctl restart docker
+		'`,
+		`rm ${configFile}`,
+	].join(" && ");
+
+	await execAsync(setupCommands);
+};
+
+const addGpuLabel = async (nodeId: string, serverId?: string) => {
+	const labelCommand = `docker node update --label-add gpu=true ${nodeId}`;
+	if (serverId) {
+		await execAsyncRemote(serverId, labelCommand);
+	} else {
+		await execAsync(labelCommand);
+	}
+};
+
+const verifySetup = async (nodeId: string, serverId?: string) => {
+	const finalStatus = await checkGPUStatus(serverId);
+
+	if (!finalStatus.swarmEnabled) {
+		const diagnosticCommands = [
+			`docker node inspect ${nodeId}`,
+			'nvidia-smi -a | grep "GPU UUID"',
+			"cat /etc/docker/daemon.json",
+			"cat /etc/nvidia-container-runtime/config.toml",
+		].join(" && ");
+
+		const { stdout: diagnostics } = serverId
+			? await execAsyncRemote(serverId, diagnosticCommands)
+			: await execAsync(diagnosticCommands);
+
+		console.error("Diagnostic Information:", diagnostics);
+		throw new Error("GPU support not detected in swarm after setup");
+	}
+
+	return finalStatus;
+};
+
+const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));