Merge pull request #4033 from Dokploy/feat/improve-update-process-to-validate-dokploy-services

feat: enhance web server update process with health checks
This commit is contained in:
Mauricio Siu
2026-03-18 22:28:09 -06:00
committed by GitHub
3 changed files with 369 additions and 30 deletions

View File

@@ -1,4 +1,11 @@
import { HardDriveDownload, Loader2 } from "lucide-react"; import {
AlertTriangle,
CheckCircle2,
HardDriveDownload,
Loader2,
RefreshCw,
XCircle,
} from "lucide-react";
import { useState } from "react"; import { useState } from "react";
import { toast } from "sonner"; import { toast } from "sonner";
import { import {
@@ -15,11 +22,70 @@ import {
import { Button } from "@/components/ui/button"; import { Button } from "@/components/ui/button";
import { api } from "@/utils/api"; import { api } from "@/utils/api";
type ServiceStatus = {
status: "healthy" | "unhealthy";
message?: string;
};
type HealthResult = {
postgres: ServiceStatus;
redis: ServiceStatus;
traefik: ServiceStatus;
};
type ModalState = "idle" | "checking" | "results" | "updating";
const ServiceStatusItem = ({
name,
service,
}: {
name: string;
service: ServiceStatus;
}) => (
<div className="flex items-center gap-2">
{service.status === "healthy" ? (
<CheckCircle2 className="h-4 w-4 text-green-500" />
) : (
<XCircle className="h-4 w-4 text-red-500" />
)}
<span className="text-sm font-medium">{name}</span>
{service.status === "unhealthy" && service.message && (
<span className="text-xs text-muted-foreground"> {service.message}</span>
)}
</div>
);
export const UpdateWebServer = () => { export const UpdateWebServer = () => {
const [updating, setUpdating] = useState(false); const [modalState, setModalState] = useState<ModalState>("idle");
const [open, setOpen] = useState(false); const [open, setOpen] = useState(false);
const [healthResult, setHealthResult] = useState<HealthResult | null>(null);
const { mutateAsync: updateServer } = api.settings.updateServer.useMutation(); const { mutateAsync: updateServer } = api.settings.updateServer.useMutation();
const { refetch: checkHealth } =
api.settings.checkInfrastructureHealth.useQuery(undefined, {
enabled: false,
});
const handleVerify = async () => {
setModalState("checking");
setHealthResult(null);
try {
const result = await checkHealth();
if (result.data) {
setHealthResult(result.data);
}
} catch {
// checkHealth failed entirely
}
setModalState("results");
};
const allHealthy =
healthResult &&
healthResult.postgres.status === "healthy" &&
healthResult.redis.status === "healthy" &&
healthResult.traefik.status === "healthy";
const checkIsUpdateFinished = async () => { const checkIsUpdateFinished = async () => {
try { try {
@@ -33,28 +99,24 @@ export const UpdateWebServer = () => {
); );
setTimeout(() => { setTimeout(() => {
// Allow seeing the toast before reloading
window.location.reload(); window.location.reload();
}, 2000); }, 2000);
} catch { } catch {
// Delay each request
await new Promise((resolve) => setTimeout(resolve, 2000)); await new Promise((resolve) => setTimeout(resolve, 2000));
// Keep running until it returns 200
void checkIsUpdateFinished(); void checkIsUpdateFinished();
} }
}; };
const handleConfirm = async () => { const handleConfirm = async () => {
try { try {
setUpdating(true); setModalState("updating");
await updateServer(); await updateServer();
// Give some time for docker service restart before starting to check status
await new Promise((resolve) => setTimeout(resolve, 8000)); await new Promise((resolve) => setTimeout(resolve, 8000));
await checkIsUpdateFinished(); await checkIsUpdateFinished();
} catch (error) { } catch (error) {
setUpdating(false); setModalState("results");
console.error("Error updating server:", error); console.error("Error updating server:", error);
toast.error( toast.error(
"An error occurred while updating the server, please try again.", "An error occurred while updating the server, please try again.",
@@ -62,6 +124,14 @@ export const UpdateWebServer = () => {
} }
}; };
const handleClose = () => {
if (modalState !== "updating") {
setOpen(false);
setModalState("idle");
setHealthResult(null);
}
};
return ( return (
<AlertDialog open={open}> <AlertDialog open={open}>
<AlertDialogTrigger asChild> <AlertDialogTrigger asChild>
@@ -81,36 +151,111 @@ export const UpdateWebServer = () => {
<AlertDialogContent> <AlertDialogContent>
<AlertDialogHeader> <AlertDialogHeader>
<AlertDialogTitle> <AlertDialogTitle>
{updating {modalState === "idle" && "Are you absolutely sure?"}
? "Server update in progress" {modalState === "checking" && "Verifying Services..."}
: "Are you absolutely sure?"} {modalState === "results" &&
(allHealthy ? "Ready to Update" : "Service Issues Detected")}
{modalState === "updating" && "Server update in progress"}
</AlertDialogTitle> </AlertDialogTitle>
<AlertDialogDescription> <AlertDialogDescription asChild>
{updating ? ( <div>
<span className="flex items-center gap-1"> {modalState === "idle" && (
<Loader2 className="animate-spin" /> <span>
The server is being updated, please wait... This will update the web server to the new version. You will
</span> not be able to use the panel during the update process. The
) : ( page will be reloaded once the update is finished.
<> <br />
This action cannot be undone. This will update the web server to <br />
the new version. You will not be able to use the panel during We recommend verifying that all services are running before
the update process. The page will be reloaded once the update is updating.
finished. </span>
</> )}
)}
{modalState === "checking" && (
<span className="flex items-center gap-2">
<Loader2 className="animate-spin h-4 w-4" />
Checking PostgreSQL, Redis and Traefik...
</span>
)}
{modalState === "results" && healthResult && (
<div className="flex flex-col gap-3">
<div className="flex flex-col gap-2">
<ServiceStatusItem
name="PostgreSQL"
service={healthResult.postgres}
/>
<ServiceStatusItem
name="Redis"
service={healthResult.redis}
/>
<ServiceStatusItem
name="Traefik"
service={healthResult.traefik}
/>
</div>
{!allHealthy && (
<div className="flex items-start gap-2 rounded-md border border-yellow-500/30 bg-yellow-500/10 p-3">
<AlertTriangle className="h-4 w-4 text-yellow-500 mt-0.5 shrink-0" />
<span className="text-sm text-yellow-600 dark:text-yellow-400">
Some services are not healthy. You can still proceed
with the update.
</span>
</div>
)}
{allHealthy && (
<span className="text-sm text-muted-foreground">
All services are running. You can proceed with the update.
</span>
)}
</div>
)}
{modalState === "results" && !healthResult && (
<div className="flex items-start gap-2 rounded-md border border-yellow-500/30 bg-yellow-500/10 p-3">
<AlertTriangle className="h-4 w-4 text-yellow-500 mt-0.5 shrink-0" />
<span className="text-sm text-yellow-600 dark:text-yellow-400">
Could not verify services. You can still proceed with the
update.
</span>
</div>
)}
{modalState === "updating" && (
<span className="flex items-center gap-2">
<Loader2 className="animate-spin h-4 w-4" />
The server is being updated, please wait...
</span>
)}
</div>
</AlertDialogDescription> </AlertDialogDescription>
</AlertDialogHeader> </AlertDialogHeader>
{!updating && ( {modalState === "idle" && (
<AlertDialogFooter> <AlertDialogFooter>
<AlertDialogCancel onClick={() => setOpen(false)}> <AlertDialogCancel onClick={handleClose}>Cancel</AlertDialogCancel>
Cancel <Button variant="secondary" onClick={handleVerify}>
</AlertDialogCancel> <RefreshCw className="h-4 w-4" />
Verify Status
</Button>
<AlertDialogAction onClick={handleConfirm}> <AlertDialogAction onClick={handleConfirm}>
Confirm Confirm
</AlertDialogAction> </AlertDialogAction>
</AlertDialogFooter> </AlertDialogFooter>
)} )}
{modalState === "results" && (
<AlertDialogFooter>
<AlertDialogCancel onClick={handleClose}>Cancel</AlertDialogCancel>
<Button variant="secondary" onClick={handleVerify}>
<RefreshCw className="h-4 w-4" />
Re-check
</Button>
<AlertDialogAction onClick={handleConfirm}>
{allHealthy ? "Confirm" : "Confirm Anyway"}
</AlertDialogAction>
</AlertDialogFooter>
)}
</AlertDialogContent> </AlertDialogContent>
</AlertDialog> </AlertDialog>
); );

View File

@@ -2,6 +2,9 @@ import {
CLEANUP_CRON_JOB, CLEANUP_CRON_JOB,
checkGPUStatus, checkGPUStatus,
checkPortInUse, checkPortInUse,
checkPostgresHealth,
checkRedisHealth,
checkTraefikHealth,
cleanupAll, cleanupAll,
cleanupAllBackground, cleanupAllBackground,
cleanupBuilders, cleanupBuilders,
@@ -44,8 +47,8 @@ import {
writeTraefikConfigInPath, writeTraefikConfigInPath,
writeTraefikSetup, writeTraefikSetup,
} from "@dokploy/server"; } from "@dokploy/server";
import { checkPermission } from "@dokploy/server/services/permission";
import { db } from "@dokploy/server/db"; import { db } from "@dokploy/server/db";
import { checkPermission } from "@dokploy/server/services/permission";
import { generateOpenApiDocument } from "@dokploy/trpc-openapi"; import { generateOpenApiDocument } from "@dokploy/trpc-openapi";
import { TRPCError } from "@trpc/server"; import { TRPCError } from "@trpc/server";
import { eq, sql } from "drizzle-orm"; import { eq, sql } from "drizzle-orm";
@@ -864,6 +867,23 @@ export const settingsRouter = createTRPCRouter({
throw error; throw error;
} }
}), }),
checkInfrastructureHealth: adminProcedure.query(async () => {
if (IS_CLOUD) {
return {
postgres: { status: "healthy" as const },
redis: { status: "healthy" as const },
traefik: { status: "healthy" as const },
};
}
const [postgres, redis, traefik] = await Promise.all([
checkPostgresHealth(),
checkRedisHealth(),
checkTraefikHealth(),
]);
return { postgres, redis, traefik };
}),
setupGPU: adminProcedure setupGPU: adminProcedure
.input( .input(
z.object({ z.object({

View File

@@ -741,3 +741,177 @@ export const getComposeContainer = async (
throw error; throw error;
} }
}; };
type ServiceHealthStatus = {
status: "healthy" | "unhealthy";
message?: string;
};
const checkSwarmServiceRunning = async (
serviceName: string,
): Promise<ServiceHealthStatus> => {
try {
const service = docker.getService(serviceName);
const info = await service.inspect();
const replicas = info.Spec?.Mode?.Replicated?.Replicas ?? 0;
if (replicas === 0) {
return {
status: "unhealthy",
message: "Service has 0 replicas configured",
};
}
// Check that at least one task is actually running
const tasks = await docker.listTasks({
filters: JSON.stringify({
service: [serviceName],
"desired-state": ["running"],
}),
});
const runningTask = tasks.find((t) => t.Status?.State === "running");
if (!runningTask) {
const latestTask = tasks[0];
const taskState = latestTask?.Status?.State ?? "unknown";
return {
status: "unhealthy",
message: `No running tasks (current state: ${taskState})`,
};
}
return { status: "healthy" };
} catch (error) {
return {
status: "unhealthy",
message: error instanceof Error ? error.message : "Service not found",
};
}
};
const getSwarmServiceContainerId = async (
serviceName: string,
): Promise<string | null> => {
try {
const tasks = await docker.listTasks({
filters: JSON.stringify({
service: [serviceName],
"desired-state": ["running"],
}),
});
const runningTask = tasks.find((t) => t.Status?.State === "running");
return runningTask?.Status?.ContainerStatus?.ContainerID ?? null;
} catch {
return null;
}
};
export const checkPostgresHealth = async (): Promise<ServiceHealthStatus> => {
const serviceCheck = await checkSwarmServiceRunning("dokploy-postgres");
if (serviceCheck.status === "unhealthy") {
return serviceCheck;
}
// Verify PostgreSQL actually accepts connections
const containerId = await getSwarmServiceContainerId("dokploy-postgres");
if (!containerId) {
return { status: "unhealthy", message: "Could not find running container" };
}
try {
const exec = await docker.getContainer(containerId).exec({
Cmd: ["pg_isready", "-U", "dokploy"],
AttachStdout: true,
AttachStderr: true,
});
const stream = await exec.start({});
const output = await new Promise<string>((resolve) => {
let data = "";
stream.on("data", (chunk: Buffer) => {
data += chunk.toString();
});
stream.on("end", () => resolve(data));
});
const inspectResult = await exec.inspect();
if (inspectResult.ExitCode !== 0) {
return {
status: "unhealthy",
message: `PostgreSQL not ready: ${output.trim()}`,
};
}
return { status: "healthy" };
} catch (error) {
return {
status: "unhealthy",
message:
error instanceof Error ? error.message : "Failed to check PostgreSQL",
};
}
};
export const checkRedisHealth = async (): Promise<ServiceHealthStatus> => {
const serviceCheck = await checkSwarmServiceRunning("dokploy-redis");
if (serviceCheck.status === "unhealthy") {
return serviceCheck;
}
// Verify Redis actually responds to PING
const containerId = await getSwarmServiceContainerId("dokploy-redis");
if (!containerId) {
return { status: "unhealthy", message: "Could not find running container" };
}
try {
const exec = await docker.getContainer(containerId).exec({
Cmd: ["redis-cli", "ping"],
AttachStdout: true,
AttachStderr: true,
});
const stream = await exec.start({});
const output = await new Promise<string>((resolve) => {
let data = "";
stream.on("data", (chunk: Buffer) => {
data += chunk.toString();
});
stream.on("end", () => resolve(data));
});
if (!output.includes("PONG")) {
return {
status: "unhealthy",
message: `Redis did not respond with PONG: ${output.trim()}`,
};
}
return { status: "healthy" };
} catch (error) {
return {
status: "unhealthy",
message: error instanceof Error ? error.message : "Failed to check Redis",
};
}
};
export const checkTraefikHealth = async (): Promise<ServiceHealthStatus> => {
// Traefik can run as a standalone container or a swarm service
try {
const container = docker.getContainer("dokploy-traefik");
const info = await container.inspect();
if (!info.State.Running) {
return {
status: "unhealthy",
message: "Container is not running",
};
}
return { status: "healthy" };
} catch {
// Not a standalone container, check as swarm service
return checkSwarmServiceRunning("dokploy-traefik");
}
};