feat(soak): artifacts, graceful shutdown, health probes, smoke script, v3.3.4

Batched remaining harness tasks (27-30, 33):

Task 27 — Artifact capture on failure: screenshots, HTML snapshots,
game state JSON, and console error tails are captured into
tests/soak/artifacts/<run-id>/ when a scenario throws. Successful
runs get a summary.json. Old runs (>7d) are pruned on startup.

Task 28 — Graceful shutdown: first SIGINT/SIGTERM flips the abort
signal (scenarios finish current turn then unwind). 10s after, a
hard-kill fires if cleanup hangs. Double Ctrl-C = immediate exit.
Exit codes: 0 success, 1 errors, 2 interrupted.

Task 29 — Periodic health probes: every 30s GET /health against the
target server. Three consecutive failures abort the run with
health_fatal, preventing staging outages from being misattributed
to harness bugs. Corrected endpoint from /api/health to /health
per server/routers/health.py.

Task 30 — Smoke test script: tests/soak/scripts/smoke.sh, a 60s
end-to-end canary that health-probes the target, seeds if needed,
and runs one minimal populate game.

Task 33 — Version bump to v3.3.4: both index.html footers (were
v3.1.6), new footer added to admin.html (had none), pyproject.toml.

Also fixes discovered during stress testing:
- SessionPool sets baseURL on all contexts so relative goto('/')
  resolves correctly between games (was "invalid URL" error)
- RoomCoordinator key is now unique per game-start (Date.now
  suffix) so Deferred promises don't carry stale room codes from
  previous games

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
adlee-was-taken
2026-04-11 22:57:15 -04:00
parent d3b468575b
commit b8bc432175
8 changed files with 242 additions and 5 deletions

View File

@@ -18,6 +18,7 @@ import { RoomCoordinator } from './core/room-coordinator';
import { DashboardServer } from './dashboard/server';
import { Screencaster } from './core/screencaster';
import { Watchdog } from './core/watchdog';
import { Artifacts, pruneOldRuns } from './core/artifacts';
import { getScenario, listScenarios } from './scenarios';
import type { DashboardReporter, ScenarioContext, Session } from './core/types';
@@ -72,6 +73,13 @@ async function main(): Promise<void> {
cli,
});
// Artifacts: instantiate now so both failure path + success summary
// can reach it. Prune old runs (>7d) on startup so the directory
// doesn't grow unbounded.
const artifactsRoot = path.resolve(__dirname, 'artifacts');
const artifacts = new Artifacts({ runId, rootDir: artifactsRoot, logger });
pruneOldRuns(artifactsRoot, 7 * 24 * 3600 * 1000, logger);
// Resolve final config: scenarioDefaults → env → CLI (later wins)
const config = mergeConfig(
cli as Record<string, unknown>,
@@ -115,13 +123,47 @@ async function main(): Promise<void> {
const abortController = new AbortController();
// Graceful shutdown: first signal flips abort, scenarios finish the
// current turn then unwind. 10 seconds later, if cleanup is still
// hanging, the runner force-exits. A second Ctrl-C skips the wait.
let forceExitTimer: NodeJS.Timeout | null = null;
const onSignal = (sig: string) => {
if (abortController.signal.aborted) {
logger.warn('force_exit', { signal: sig });
process.exit(130);
}
logger.warn('signal_received', { signal: sig });
abortController.abort();
forceExitTimer = setTimeout(() => {
logger.error('graceful_shutdown_timeout');
process.exit(130);
}, 10_000);
};
process.on('SIGINT', () => onSignal('SIGINT'));
process.on('SIGTERM', () => onSignal('SIGTERM'));
// Health probes: every 30s GET /health. Three consecutive failures
// abort the run with a fatal error so staging outages don't get
// misattributed to harness bugs.
let healthFailures = 0;
const healthTimer = setInterval(async () => {
try {
const res = await fetch(`${targetUrl}/health`);
if (!res.ok) throw new Error(`status ${res.status}`);
healthFailures = 0;
} catch (err) {
healthFailures++;
logger.warn('health_probe_failed', {
consecutive: healthFailures,
error: err instanceof Error ? err.message : String(err),
});
if (healthFailures >= 3) {
logger.error('health_fatal', { consecutive: healthFailures });
abortController.abort();
}
}
}, 30_000);
let dashboardServer: DashboardServer | null = null;
let dashboard: DashboardReporter = noopDashboard();
const watchdogs = new Map<string, Watchdog>();
@@ -217,6 +259,15 @@ async function main(): Promise<void> {
console.log(`Games completed: ${result.gamesCompleted}`);
console.log(`Errors: ${result.errors.length}`);
console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);
artifacts.writeSummary({
runId,
scenario: scenario.name,
targetUrl,
gamesCompleted: result.gamesCompleted,
errors: result.errors,
durationMs: result.durationMs,
customMetrics: result.customMetrics,
});
if (result.errors.length > 0) {
console.log('Errors:');
for (const e of result.errors) {
@@ -229,8 +280,23 @@ async function main(): Promise<void> {
error: err instanceof Error ? err.message : String(err),
stack: err instanceof Error ? err.stack : undefined,
});
// Best-effort artifact capture from still-live sessions. The pool's
// activeSessions field is private but accessible for this error path —
// we want every frame we can grab before release() tears them down.
try {
const liveSessions = (pool as unknown as { activeSessions: Session[] }).activeSessions;
if (liveSessions && liveSessions.length > 0) {
await artifacts.captureAll(liveSessions);
}
} catch (captureErr) {
logger.warn('artifact_capture_failed', {
error: captureErr instanceof Error ? captureErr.message : String(captureErr),
});
}
exitCode = 1;
} finally {
clearInterval(healthTimer);
if (forceExitTimer) clearTimeout(forceExitTimer);
for (const w of watchdogs.values()) w.stop();
await screencaster.stopAll();
await pool.release();