diff --git a/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx b/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx
index d0c178c4f..e89a9b66f 100644
--- a/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx
+++ b/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx
@@ -10,7 +10,7 @@ import {
} from "@/components/ui/card";
import { api } from "@/utils/api";
import { TRPCClientError } from "@trpc/client";
-import { CheckCircle2, Cpu, Loader2, XCircle } from "lucide-react";
+import { CheckCircle2, Cpu, Loader2, RefreshCw, XCircle } from "lucide-react";
import { useState } from "react";
import { toast } from "sonner";
@@ -20,16 +20,19 @@ interface GPUSupportProps {
export function GPUSupport({ serverId }: GPUSupportProps) {
const [isLoading, setIsLoading] = useState(false);
+ const [isRefreshing, setIsRefreshing] = useState(false);
const utils = api.useContext();
- const { data: gpuStatus, isLoading: isChecking } =
- api.settings.checkGPUStatus.useQuery(
- { serverId },
- {
- enabled: serverId !== undefined,
- refetchInterval: 5000,
- },
- );
+ const {
+ data: gpuStatus,
+ isLoading: isChecking,
+ refetch,
+ } = api.settings.checkGPUStatus.useQuery(
+ { serverId },
+ {
+ enabled: serverId !== undefined,
+ },
+ );
const setupGPU = api.settings.setupGPU.useMutation({
onMutate: () => {
@@ -41,31 +44,20 @@ export function GPUSupport({ serverId }: GPUSupportProps) {
await utils.settings.checkGPUStatus.invalidate({ serverId });
},
onError: (error) => {
- if (error instanceof TRPCClientError) {
- const errorMessage = error.message;
- if (
- errorMessage.includes(
- "Permission denied. Please ensure proper sudo access.",
- ) ||
- errorMessage.includes("sudo access required")
- ) {
- toast.error(
- "Administrator privileges required. Please enter your password when prompted.",
- );
- } else if (errorMessage.includes("Failed to configure GPU")) {
- toast.error(
- "GPU configuration failed. Please check system requirements.",
- );
- } else {
- toast.error(errorMessage);
- }
- } else {
- toast.error("Failed to enable GPU support. Please check server logs.");
- }
+ toast.error(
+ error.message ||
+ "Failed to enable GPU support. Please check server logs.",
+ );
setIsLoading(false);
},
});
+ /**
+  * Manually re-runs the GPU status query, keeping `isRefreshing` set to
+  * `true` for the duration of the request.
+  */
+ const handleRefresh = async () => {
+   setIsRefreshing(true);
+   try {
+     await refetch();
+   } finally {
+     // Reset the flag even when the refetch rejects, so it cannot get
+     // stuck at `true`.
+     setIsRefreshing(false);
+   }
+ };
+
const handleEnableGPU = async () => {
if (serverId === undefined) {
toast.error("No server selected");
@@ -94,22 +86,33 @@ export function GPUSupport({ serverId }: GPUSupportProps) {
Configure and monitor GPU support
-
-
+
@@ -117,9 +120,17 @@ export function GPUSupport({ serverId }: GPUSupportProps) {
System Requirements:
- - NVIDIA drivers must be installed on the host system
- - NVIDIA Container Runtime is required for GPU support
- - Compatible GPU hardware must be present
+ - NVIDIA GPU hardware must be physically installed
+ -
+ NVIDIA drivers must be installed and running (check with
+ nvidia-smi)
+
+ -
+ NVIDIA Container Runtime must be installed
+ (nvidia-container-runtime)
+
+ - User must have sudo/administrative privileges
+ - System must support CUDA for GPU acceleration
diff --git a/apps/dokploy/server/api/routers/settings.ts b/apps/dokploy/server/api/routers/settings.ts
index a700c82ad..56b6431ce 100644
--- a/apps/dokploy/server/api/routers/settings.ts
+++ b/apps/dokploy/server/api/routers/settings.ts
@@ -665,8 +665,8 @@ export const settingsRouter = createTRPCRouter({
}),
)
.mutation(async ({ input }) => {
- if (IS_CLOUD) {
- throw new Error("GPU setup is not available in cloud mode");
+ if (IS_CLOUD && !input.serverId) {
+ throw new Error("Select a server to enable the GPU Setup");
}
try {
@@ -684,7 +684,7 @@ export const settingsRouter = createTRPCRouter({
}),
)
.query(async ({ input }) => {
- if (IS_CLOUD) {
+ if (IS_CLOUD && !input.serverId) {
return {
driverInstalled: false,
driverVersion: undefined,
diff --git a/packages/server/src/utils/gpu-setup.ts b/packages/server/src/utils/gpu-setup.ts
index ecdb3e2b9..bb366762a 100644
--- a/packages/server/src/utils/gpu-setup.ts
+++ b/packages/server/src/utils/gpu-setup.ts
@@ -18,117 +18,21 @@ interface GPUInfo {
export async function checkGPUStatus(serverId?: string): Promise {
try {
- // Check NVIDIA Driver
- let driverInstalled = false;
- let driverVersion: string | undefined;
- let availableGPUs = 0;
-
- try {
- const driverCommand =
- "nvidia-smi --query-gpu=driver_version --format=csv,noheader";
- const { stdout: nvidiaSmi } = serverId
- ? await execAsyncRemote(serverId, driverCommand)
- : await execAsync(driverCommand);
-
- driverVersion = nvidiaSmi.trim();
- if (driverVersion) {
- driverInstalled = true;
- const countCommand =
- "nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l";
- const { stdout: gpuCount } = serverId
- ? await execAsyncRemote(serverId, countCommand)
- : await execAsync(countCommand);
-
- availableGPUs = Number.parseInt(gpuCount.trim(), 10);
- }
- } catch (error) {
- console.debug("GPU driver check:", error);
- }
-
- // Check Runtime Configuration
- let runtimeInstalled = false;
- let runtimeConfigured = false;
- try {
- const runtimeCommand = 'docker info --format "{{json .Runtimes}}"';
- const { stdout: runtimeInfo } = serverId
- ? await execAsyncRemote(serverId, runtimeCommand)
- : await execAsync(runtimeCommand);
-
- const runtimes = JSON.parse(runtimeInfo);
- runtimeInstalled = "nvidia" in runtimes;
-
- // Check if it's the default runtime
- const defaultCommand = 'docker info --format "{{.DefaultRuntime}}"';
- const { stdout: defaultRuntime } = serverId
- ? await execAsyncRemote(serverId, defaultCommand)
- : await execAsync(defaultCommand);
-
- runtimeConfigured = defaultRuntime.trim() === "nvidia";
- } catch (error) {
- console.debug("Runtime check:", error);
- }
-
- // Check Swarm GPU Resources
- let swarmEnabled = false;
- let gpuResources = 0;
-
- try {
- // Check node resources directly from inspect
- const nodeCommand =
- "docker node inspect self --format '{{json .Description.Resources.GenericResources}}'";
- const { stdout: resources } = serverId
- ? await execAsyncRemote(serverId, nodeCommand)
- : await execAsync(nodeCommand);
-
- if (resources && resources !== "null") {
- const genericResources = JSON.parse(resources);
- for (const resource of genericResources) {
- if (
- resource.DiscreteResourceSpec &&
- (resource.DiscreteResourceSpec.Kind === "GPU" ||
- resource.DiscreteResourceSpec.Kind === "gpu")
- ) {
- gpuResources = resource.DiscreteResourceSpec.Value;
- swarmEnabled = true;
- break;
- }
- }
- }
- } catch (error) {
- console.debug("Swarm resource check:", error);
- }
-
- // Get GPU Model and Memory Info
- const gpuInfoCommand =
- "nvidia-smi --query-gpu=gpu_name,memory.total --format=csv,noheader";
- const { stdout: gpuInfo } = serverId
- ? await execAsyncRemote(serverId, gpuInfoCommand)
- : await execAsync(gpuInfoCommand);
-
- const [gpuModel, memoryTotal] = gpuInfo.split(",").map((s) => s.trim());
-
- // Check CUDA Support
- const cudaCommand = 'nvidia-smi -q | grep "CUDA Version"';
- const { stdout: cudaInfo } = serverId
- ? await execAsyncRemote(serverId, cudaCommand)
- : await execAsync(cudaCommand);
-
- const cudaMatch = cudaInfo.match(/CUDA Version\s*:\s*([\d\.]+)/);
- const cudaVersion = cudaMatch ? cudaMatch[1] : undefined;
- const cudaSupport = !!cudaVersion;
+ const [driverInfo, runtimeInfo, swarmInfo, gpuInfo, cudaInfo] =
+ await Promise.all([
+ checkGpuDriver(serverId),
+ checkRuntime(serverId),
+ checkSwarmResources(serverId),
+ checkGpuInfo(serverId),
+ checkCudaSupport(serverId),
+ ]);
return {
- driverInstalled,
- driverVersion,
- runtimeInstalled,
- runtimeConfigured,
- availableGPUs,
- swarmEnabled,
- gpuResources,
- gpuModel,
- memoryInfo: memoryTotal,
- cudaSupport,
- cudaVersion,
+ ...driverInfo,
+ ...runtimeInfo,
+ ...swarmInfo,
+ ...gpuInfo,
+ ...cudaInfo,
};
} catch (error) {
console.error("Error in checkGPUStatus:", error);
@@ -148,118 +52,167 @@ export async function checkGPUStatus(serverId?: string): Promise {
}
}
+/**
+ * Detects the NVIDIA driver by querying `nvidia-smi`, on the remote server
+ * when `serverId` is given and on the local host otherwise.
+ *
+ * The query prints one row per GPU, so a single invocation provides both the
+ * driver version (first row) and the GPU count (number of non-empty rows).
+ *
+ * @param serverId - Optional remote server on which to run the command.
+ * @returns `driverVersion`, `driverInstalled` and `availableGPUs`. When the
+ * command fails or prints nothing, the defaults (`undefined`, `false`, `0`)
+ * are returned and the error is only logged at debug level.
+ */
+const checkGpuDriver = async (serverId?: string) => {
+  let driverVersion: string | undefined;
+  let driverInstalled = false;
+  let availableGPUs = 0;
+
+  try {
+    const driverCommand =
+      "nvidia-smi --query-gpu=driver_version --format=csv,noheader";
+    const { stdout } = serverId
+      ? await execAsyncRemote(serverId, driverCommand)
+      : await execAsync(driverCommand);
+
+    // Multi-GPU hosts repeat the driver version on every row; keep the rows
+    // separate so the version stays a single line and the count is exact.
+    const versionRows = stdout
+      .split("\n")
+      .map((row) => row.trim())
+      .filter((row) => row.length > 0);
+
+    if (versionRows.length > 0) {
+      driverVersion = versionRows[0];
+      driverInstalled = true;
+      availableGPUs = versionRows.length;
+    }
+  } catch (error) {
+    console.debug("GPU driver check:", error);
+  }
+
+  return { driverVersion, driverInstalled, availableGPUs };
+};
+
+/**
+ * Inspects Docker's runtime configuration for the `nvidia` runtime.
+ *
+ * @param serverId - Optional remote server; commands run locally when omitted.
+ * @returns `runtimeInstalled` (an `nvidia` entry exists in Docker's runtime
+ * list) and `runtimeConfigured` (`nvidia` is the default runtime). Failures
+ * are logged at debug level and leave the remaining flags at `false`.
+ */
+const checkRuntime = async (serverId?: string) => {
+  // Route a command to the remote server when one was given, else run locally.
+  const run = (command: string) =>
+    serverId ? execAsyncRemote(serverId, command) : execAsync(command);
+
+  const status = { runtimeInstalled: false, runtimeConfigured: false };
+
+  try {
+    const registered = await run('docker info --format "{{json .Runtimes}}"');
+    status.runtimeInstalled = "nvidia" in JSON.parse(registered.stdout);
+
+    const active = await run('docker info --format "{{.DefaultRuntime}}"');
+    status.runtimeConfigured = active.stdout.trim() === "nvidia";
+  } catch (error) {
+    console.debug("Runtime check:", error);
+  }
+
+  return status;
+};
+
+/**
+ * Reads the generic resources advertised by the current swarm node and looks
+ * for a discrete `GPU`/`gpu` resource.
+ *
+ * @param serverId - Optional remote server; the command runs locally when
+ * omitted.
+ * @returns `swarmEnabled` (a GPU resource was found) and `gpuResources` (its
+ * advertised value). Failures are logged at debug level and yield the defaults
+ * (`false`, `0`).
+ */
+const checkSwarmResources = async (serverId?: string) => {
+  let swarmEnabled = false;
+  let gpuResources = 0;
+
+  try {
+    const nodeCommand =
+      "docker node inspect self --format '{{json .Description.Resources.GenericResources}}'";
+    const { stdout } = serverId
+      ? await execAsyncRemote(serverId, nodeCommand)
+      : await execAsync(nodeCommand);
+
+    // Trim first: command output ends with a newline, so an untrimmed
+    // comparison against "null" would never match.
+    const resources = stdout.trim();
+
+    if (resources && resources !== "null") {
+      const genericResources: unknown = JSON.parse(resources);
+      // Only iterate when the payload really is a list of resources.
+      if (Array.isArray(genericResources)) {
+        for (const resource of genericResources) {
+          if (
+            resource.DiscreteResourceSpec &&
+            (resource.DiscreteResourceSpec.Kind === "GPU" ||
+              resource.DiscreteResourceSpec.Kind === "gpu")
+          ) {
+            gpuResources = resource.DiscreteResourceSpec.Value;
+            swarmEnabled = true;
+            break;
+          }
+        }
+      }
+    }
+  } catch (error) {
+    console.debug("Swarm resource check:", error);
+  }
+
+  return { swarmEnabled, gpuResources };
+};
+
+/**
+ * Reports the model name and total memory of the first GPU listed by
+ * `nvidia-smi`.
+ *
+ * @param serverId - Optional remote server; the command runs locally when
+ * omitted.
+ * @returns `gpuModel` and `memoryInfo`, both `undefined` when the command
+ * fails (logged at debug level) or prints no rows.
+ */
+const checkGpuInfo = async (serverId?: string) => {
+  let gpuModel: string | undefined;
+  let memoryInfo: string | undefined;
+
+  try {
+    const gpuInfoCommand =
+      "nvidia-smi --query-gpu=gpu_name,memory.total --format=csv,noheader";
+    const { stdout: gpuInfo } = serverId
+      ? await execAsyncRemote(serverId, gpuInfoCommand)
+      : await execAsync(gpuInfoCommand);
+
+    // The output has one CSV row per GPU. Parse only the first row so that
+    // on multi-GPU hosts the next row does not leak into `memoryInfo`.
+    const firstRow = gpuInfo
+      .split("\n")
+      .map((row) => row.trim())
+      .find((row) => row.length > 0);
+
+    if (firstRow) {
+      [gpuModel, memoryInfo] = firstRow.split(",").map((s) => s.trim());
+    }
+  } catch (error) {
+    console.debug("GPU info check:", error);
+  }
+
+  return { gpuModel, memoryInfo };
+};
+
+/**
+ * Extracts the CUDA version reported by `nvidia-smi -q`.
+ *
+ * @param serverId - Optional remote server; the command runs locally when
+ * omitted.
+ * @returns `cudaVersion` (the matched version string, if any) and
+ * `cudaSupport` (whether a version was found). Command failures are logged at
+ * debug level and yield `undefined` / `false`.
+ */
+const checkCudaSupport = async (serverId?: string) => {
+  const run = (command: string) =>
+    serverId ? execAsyncRemote(serverId, command) : execAsync(command);
+
+  let cudaVersion: string | undefined;
+  let cudaSupport = false;
+
+  try {
+    const { stdout } = await run('nvidia-smi -q | grep "CUDA Version"');
+
+    const found = stdout.match(/CUDA Version\s*:\s*([\d\.]+)/);
+    if (found) {
+      cudaVersion = found[1];
+    }
+    cudaSupport = Boolean(cudaVersion);
+  } catch (error) {
+    console.debug("CUDA support check:", error);
+  }
+
+  return { cudaVersion, cudaSupport };
+};
+
export async function setupGPUSupport(serverId?: string): Promise {
try {
- // 1. Check current GPU status first
+ // 1. Initial status check and validation
const initialStatus = await checkGPUStatus(serverId);
+ const shouldContinue = await validatePrerequisites(initialStatus);
+ if (!shouldContinue) return;
- // If GPU is already configured, just verify and return quickly
- if (
- initialStatus.swarmEnabled &&
- initialStatus.runtimeConfigured &&
- initialStatus.driverInstalled
- ) {
- console.log("GPU already configured, skipping setup");
- return;
- }
+ // 2. Get node ID
+ const nodeId = await getNodeId(serverId);
- // 2. Verify GPU prerequisites
- if (!initialStatus.driverInstalled || !initialStatus.runtimeInstalled) {
- throw new Error(
- "NVIDIA drivers or runtime not installed. Please install them first.",
- );
- }
+ // 3. Create daemon configuration
+ const daemonConfig = createDaemonConfig(initialStatus.availableGPUs);
- // Get the node ID
- const nodeIdCommand = 'docker info --format "{{.Swarm.NodeID}}"';
- const { stdout: nodeId } = serverId
- ? await execAsyncRemote(serverId, nodeIdCommand)
- : await execAsync(nodeIdCommand);
-
- if (!nodeId.trim()) {
- throw new Error("Setup Server before enabling GPU support");
- }
-
- // 3. Configure NVIDIA runtime in daemon.json
- const daemonConfig = {
- runtimes: {
- nvidia: {
- path: "nvidia-container-runtime",
- runtimeArgs: [],
- },
- },
- "default-runtime": "nvidia",
- "node-generic-resources": [`GPU=${initialStatus.availableGPUs}`],
- };
-
- // Different commands for local and remote setup
+ // 4. Setup server based on environment
if (serverId) {
- // Remote server setup (using sudo)
- const setupCommands = [
- "sudo -n true",
- `echo '${JSON.stringify(daemonConfig, null, 2)}' | sudo tee /etc/docker/daemon.json`,
- "sudo mkdir -p /etc/nvidia-container-runtime",
- 'echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" | sudo tee -a /etc/nvidia-container-runtime/config.toml',
- "sudo systemctl daemon-reload",
- "sudo systemctl restart docker",
- ].join(" && ");
-
- await execAsyncRemote(serverId, setupCommands);
+ await setupRemoteServer(serverId, daemonConfig);
} else {
- // Local server setup (using pkexec for GUI password prompt)
- const configFile = `/tmp/docker-daemon-${Date.now()}.json`;
- await fs.writeFile(configFile, JSON.stringify(daemonConfig, null, 2));
-
- const setupCommands = [
- // Use pkexec for GUI password prompt
- `pkexec sh -c '
- cp ${configFile} /etc/docker/daemon.json &&
- mkdir -p /etc/nvidia-container-runtime &&
- echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" >> /etc/nvidia-container-runtime/config.toml &&
- systemctl daemon-reload &&
- systemctl restart docker
- '`,
- `rm ${configFile}`, // Clean up temp file
- ].join(" && ");
-
- await execAsync(setupCommands);
+ await setupLocalServer(daemonConfig);
}
- // 4. Reduced wait time for Docker restart
- await new Promise((resolve) => setTimeout(resolve, 10000));
+ // 5. Wait for Docker restart
+ await sleep(10000);
- // 5. Add GPU label to the node
- const labelCommand = `docker node update --label-add gpu=true ${nodeId.trim()}`;
- if (serverId) {
- await execAsyncRemote(serverId, labelCommand);
- } else {
- await execAsync(labelCommand);
- }
+ // 6. Add GPU label
+ await addGpuLabel(nodeId, serverId);
- // 6. Quick final verification
- await new Promise((resolve) => setTimeout(resolve, 5000));
- const finalStatus = await checkGPUStatus(serverId);
-
- if (!finalStatus.swarmEnabled) {
- const diagnosticCommands = [
- `docker node inspect ${nodeId.trim()}`,
- 'nvidia-smi -a | grep "GPU UUID"',
- "cat /etc/docker/daemon.json",
- "cat /etc/nvidia-container-runtime/config.toml",
- ].join(" && ");
-
- const { stdout: diagnostics } = serverId
- ? await execAsyncRemote(serverId, diagnosticCommands)
- : await execAsync(diagnosticCommands);
-
- console.error("Diagnostic Information:", diagnostics);
- throw new Error("GPU support not detected in swarm after setup");
- }
+ // 7. Final verification
+ await sleep(5000);
+ const finalStatus = await verifySetup(nodeId, serverId);
console.log("GPU setup completed successfully:", {
availableGPUs: initialStatus.availableGPUs,
driverVersion: initialStatus.driverVersion,
- nodeId: nodeId.trim(),
+ nodeId,
});
} catch (error) {
console.error("GPU Setup Error:", error);
@@ -274,3 +227,113 @@ export async function setupGPUSupport(serverId?: string): Promise {
throw error;
}
}
+
+/**
+ * Decides whether GPU setup should proceed for the given status snapshot.
+ *
+ * @param initialStatus - Result of the initial GPU status check.
+ * @returns `false` when the swarm resource and the default runtime are both
+ * already in place (nothing to do), `true` otherwise.
+ * @throws Error when the NVIDIA driver or the NVIDIA Container Runtime is
+ * missing; the driver is checked first.
+ */
+const validatePrerequisites = async (initialStatus: GPUInfo) => {
+  const { driverInstalled, runtimeInstalled, swarmEnabled, runtimeConfigured } =
+    initialStatus;
+
+  if (!driverInstalled) {
+    throw new Error(
+      "NVIDIA drivers not installed. Please install appropriate NVIDIA drivers first.",
+    );
+  }
+
+  if (!runtimeInstalled) {
+    throw new Error(
+      "NVIDIA Container Runtime not installed. Please install nvidia-container-runtime first.",
+    );
+  }
+
+  const alreadyConfigured = swarmEnabled && runtimeConfigured;
+  if (alreadyConfigured) {
+    console.log("GPU already configured, skipping setup");
+  }
+
+  return !alreadyConfigured;
+};
+
+/**
+ * Looks up the swarm node ID reported by `docker info`.
+ *
+ * @param serverId - Optional remote server; the command runs locally when
+ * omitted.
+ * @returns The node ID with surrounding whitespace removed.
+ * @throws Error when Docker reports an empty node ID.
+ */
+const getNodeId = async (serverId?: string) => {
+  const nodeIdCommand = 'docker info --format "{{.Swarm.NodeID}}"';
+  const result = await (serverId
+    ? execAsyncRemote(serverId, nodeIdCommand)
+    : execAsync(nodeIdCommand));
+
+  const swarmNodeId = result.stdout.trim();
+  if (swarmNodeId.length === 0) {
+    throw new Error("Setup Server before enabling GPU support");
+  }
+
+  return swarmNodeId;
+};
+
+/**
+ * Builds the Docker daemon configuration object that registers the NVIDIA
+ * runtime, makes it the default runtime and advertises the GPUs as a generic
+ * node resource.
+ *
+ * @param availableGPUs - Number placed in the `GPU=<n>` generic resource entry.
+ * @returns A plain object ready to be serialised into `daemon.json`.
+ */
+const createDaemonConfig = (availableGPUs: number) => {
+  const nvidiaRuntime = {
+    path: "nvidia-container-runtime",
+    runtimeArgs: [],
+  };
+
+  return {
+    runtimes: { nvidia: nvidiaRuntime },
+    "default-runtime": "nvidia",
+    "node-generic-resources": [`GPU=${availableGPUs}`],
+  };
+};
+
+/**
+ * Applies the GPU configuration on a remote server through passwordless sudo:
+ * writes `/etc/docker/daemon.json`, makes sure the NVIDIA container runtime
+ * config declares the swarm resource, then reloads systemd and restarts Docker.
+ *
+ * NOTE(review): the existing `daemon.json` is overwritten, not merged, so any
+ * unrelated daemon settings on the server are lost. Confirm this is intended.
+ *
+ * @param serverId - Remote server on which to run the commands.
+ * @param daemonConfig - Object serialised as the new `daemon.json`.
+ * @throws Whatever `execAsyncRemote` rejects with; the chain stops at the
+ * first failing command.
+ */
+const setupRemoteServer = async (serverId: string, daemonConfig: object) => {
+  const runtimeConfigPath = "/etc/nvidia-container-runtime/config.toml";
+  // The JSON is embedded in a single-quoted shell string, so escape any
+  // single quote it may contain.
+  const daemonJson = JSON.stringify(daemonConfig, null, 2).replace(
+    /'/g,
+    "'\\''",
+  );
+
+  const setupCommands = [
+    // Fail fast when sudo would prompt for a password.
+    "sudo -n true",
+    `echo '${daemonJson}' | sudo tee /etc/docker/daemon.json`,
+    "sudo mkdir -p /etc/nvidia-container-runtime",
+    // Append the key only when no active (uncommented) entry exists, so that
+    // repeated setup attempts do not pile up duplicate keys in the TOML file.
+    `(sudo grep -qsE '^[[:space:]]*swarm-resource[[:space:]]*=' ${runtimeConfigPath} || echo 'swarm-resource = "DOCKER_RESOURCE_GPU"' | sudo tee -a ${runtimeConfigPath})`,
+    "sudo systemctl daemon-reload",
+    "sudo systemctl restart docker",
+  ].join(" && ");
+
+  await execAsyncRemote(serverId, setupCommands);
+};
+
+/**
+ * Applies the GPU configuration on the local host. The daemon configuration is
+ * staged in a temporary file, then a single `pkexec` shell copies it to
+ * `/etc/docker/daemon.json`, makes sure the NVIDIA container runtime config
+ * declares the swarm resource, reloads systemd and restarts Docker.
+ *
+ * NOTE(review): the existing `daemon.json` is overwritten, not merged, so any
+ * unrelated daemon settings on the host are lost. Confirm this is intended.
+ *
+ * @param daemonConfig - Object serialised as the new `daemon.json`.
+ * @throws Whatever `fs.writeFile` or `execAsync` rejects with.
+ */
+const setupLocalServer = async (daemonConfig: object) => {
+  const runtimeConfigPath = "/etc/nvidia-container-runtime/config.toml";
+  const configFile = `/tmp/docker-daemon-${Date.now()}.json`;
+  await fs.writeFile(configFile, JSON.stringify(daemonConfig, null, 2));
+
+  // The whole script sits inside single quotes for `sh -c`, so only double
+  // quotes are used within it. The swarm-resource key is appended only when
+  // no active (uncommented) entry exists, keeping repeated runs idempotent.
+  const setupCommand = `pkexec sh -c '
+    cp ${configFile} /etc/docker/daemon.json &&
+    mkdir -p /etc/nvidia-container-runtime &&
+    (grep -qsE "^[[:space:]]*swarm-resource[[:space:]]*=" ${runtimeConfigPath} ||
+      echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" >> ${runtimeConfigPath}) &&
+    systemctl daemon-reload &&
+    systemctl restart docker
+  '`;
+
+  try {
+    await execAsync(setupCommand);
+  } finally {
+    // Remove the staged file even when the privileged step fails or is
+    // cancelled. Cleanup is best effort and must not mask the original error.
+    try {
+      await fs.unlink(configFile);
+    } catch (cleanupError) {
+      console.debug("Temporary daemon config cleanup:", cleanupError);
+    }
+  }
+};
+
+/**
+ * Adds the `gpu=true` label to a swarm node.
+ *
+ * @param nodeId - ID of the node to label.
+ * @param serverId - Optional remote server; the command runs locally when
+ * omitted.
+ */
+const addGpuLabel = async (nodeId: string, serverId?: string) => {
+  const labelCommand = `docker node update --label-add gpu=true ${nodeId}`;
+
+  await (serverId
+    ? execAsyncRemote(serverId, labelCommand)
+    : execAsync(labelCommand));
+};
+
+/**
+ * Re-checks the GPU status after setup and fails when the swarm still does not
+ * advertise a GPU resource. Before failing, it makes a best-effort attempt to
+ * log diagnostic output (node inspection, GPU UUIDs, both config files).
+ *
+ * @param nodeId - Swarm node ID used for the `docker node inspect` diagnostic.
+ * @param serverId - Optional remote server; commands run locally when omitted.
+ * @returns The fresh GPU status when the swarm reports GPU resources.
+ * @throws Error("GPU support not detected in swarm after setup") when it does
+ * not, regardless of whether the diagnostics could be collected.
+ */
+const verifySetup = async (nodeId: string, serverId?: string) => {
+  const finalStatus = await checkGPUStatus(serverId);
+
+  if (!finalStatus.swarmEnabled) {
+    // Run every diagnostic independently: `|| true` plus `;` keeps one
+    // failing command (e.g. a missing config file) from aborting the rest,
+    // and `2>&1` keeps its error text in the collected output.
+    const diagnosticCommands = [
+      `docker node inspect ${nodeId}`,
+      'nvidia-smi -a | grep "GPU UUID"',
+      "cat /etc/docker/daemon.json",
+      "cat /etc/nvidia-container-runtime/config.toml",
+    ]
+      .map((command) => `${command} 2>&1 || true`)
+      .join("; ");
+
+    try {
+      const { stdout: diagnostics } = serverId
+        ? await execAsyncRemote(serverId, diagnosticCommands)
+        : await execAsync(diagnosticCommands);
+
+      console.error("Diagnostic Information:", diagnostics);
+    } catch (diagnosticError) {
+      // Diagnostics are a courtesy; never let them replace the real error.
+      console.error("Failed to collect GPU diagnostics:", diagnosticError);
+    }
+
+    throw new Error("GPU support not detected in swarm after setup");
+  }
+
+  return finalStatus;
+};
+
+/** Returns a promise that resolves (with no value) after `ms` milliseconds. */
+const sleep = (ms: number) =>
+  new Promise((wake) => {
+    setTimeout(wake, ms);
+  });