Files
golfgame/tests/soak/runner.ts
adlee-was-taken b8bc432175 feat(soak): artifacts, graceful shutdown, health probes, smoke script, v3.3.4
Batched remaining harness tasks (27-30, 33):

Task 27 — Artifact capture on failure: screenshots, HTML snapshots,
game state JSON, and console error tails are captured into
tests/soak/artifacts/<run-id>/ when a scenario throws. Successful
runs get a summary.json. Old runs (>7d) are pruned on startup.

Task 28 — Graceful shutdown: first SIGINT/SIGTERM flips the abort
signal (scenarios finish current turn then unwind). 10s after, a
hard-kill fires if cleanup hangs. Double Ctrl-C = immediate exit.
Exit codes: 0 success, 1 errors, 2 interrupted.

Task 29 — Periodic health probes: every 30s GET /health against the
target server. Three consecutive failures abort the run with
health_fatal, preventing staging outages from being misattributed
to harness bugs. Corrected endpoint from /api/health to /health
per server/routers/health.py.

Task 30 — Smoke test script: tests/soak/scripts/smoke.sh, a 60s
end-to-end canary that health-probes the target, seeds if needed,
and runs one minimal populate game.

Task 33 — Version bump to v3.3.4: both index.html footers (was
v3.1.6), new footer added to admin.html (had none), pyproject.toml.

Also fixes discovered during stress testing:
- SessionPool sets baseURL on all contexts so relative goto('/')
  resolves correctly between games (was "invalid URL" error)
- RoomCoordinator key is now unique per game-start (Date.now
  suffix) so Deferred promises don't carry stale room codes from
  previous games

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 22:57:15 -04:00

316 lines
10 KiB
TypeScript

#!/usr/bin/env tsx
/**
* Golf Soak Harness — entry point.
*
* Usage:
* TEST_URL=http://localhost:8000 \
* SOAK_INVITE_CODE=SOAKTEST \
* bun run soak -- --scenario=populate --rooms=1 --accounts=2 \
* --cpus-per-room=0 --games-per-room=1 --holes=1 --watch=none
*/
import * as path from 'path';
import { spawn } from 'child_process';
import { parseArgs, mergeConfig, CliArgs } from './config';
import { createLogger } from './core/logger';
import { SessionPool } from './core/session-pool';
import { RoomCoordinator } from './core/room-coordinator';
import { DashboardServer } from './dashboard/server';
import { Screencaster } from './core/screencaster';
import { Watchdog } from './core/watchdog';
import { Artifacts, pruneOldRuns } from './core/artifacts';
import { getScenario, listScenarios } from './scenarios';
import type { DashboardReporter, ScenarioContext, Session } from './core/types';
/**
 * Build a DashboardReporter whose methods do nothing — used whenever the
 * run is not being watched through the live dashboard (watch !== 'dashboard').
 *
 * Each method is its own no-op closure so identity comparisons between the
 * three methods behave exactly as with the inline-literal form.
 */
function noopDashboard(): DashboardReporter {
  const update = () => {};
  const log = () => {};
  const incrementMetric = () => {};
  return { update, log, incrementMetric };
}
/**
 * Print every registered scenario with its description and resource needs
 * (accounts, rooms, CPUs per room) to stdout. Backs the --list flag.
 */
function printScenarioList(): void {
  console.log('Available scenarios:');
  listScenarios().forEach((scenario) => {
    const needs = scenario.needs;
    console.log(` ${scenario.name.padEnd(12)} ${scenario.description}`);
    console.log(
      ` needs: accounts=${needs.accounts}, rooms=${needs.rooms ?? 1}, cpus=${needs.cpusPerRoom ?? 0}`,
    );
  });
}
/**
 * Entry point for a single soak run.
 *
 * Flow: parse CLI → resolve scenario + config → (optionally) start the
 * live dashboard → run the scenario under per-room watchdogs, periodic
 * health probes, and graceful-shutdown signal handling → write
 * artifacts → exit.
 *
 * Exit codes: 0 success, 1 scenario errors or crash, 2 bad usage or
 * interrupted before completion.
 */
async function main(): Promise<void> {
  const cli: CliArgs = parseArgs(process.argv.slice(2));
  // --list short-circuits: print the scenario table and exit 0.
  if (cli.listOnly) {
    printScenarioList();
    return;
  }
  if (!cli.scenario) {
    console.error('Error: --scenario=<name> is required. Use --list to see scenarios.');
    process.exit(2);
  }
  const scenario = getScenario(cli.scenario);
  if (!scenario) {
    console.error(`Error: unknown scenario "${cli.scenario}". Use --list to see scenarios.`);
    process.exit(2);
  }
  // Run id defaults to "<scenario>-<ISO timestamp>", with ':' and '.'
  // replaced by '-' so it is safe to use as a directory name.
  const runId =
    cli.runId ?? `${cli.scenario}-${new Date().toISOString().replace(/[:.]/g, '-')}`;
  const targetUrl = cli.target ?? process.env.TEST_URL ?? 'http://localhost:8000';
  const inviteCode = process.env.SOAK_INVITE_CODE ?? 'SOAKTEST';
  const watch = cli.watch ?? 'dashboard';
  const logger = createLogger({ runId });
  logger.info('run_start', {
    scenario: scenario.name,
    targetUrl,
    watch,
    cli,
  });
  // Artifacts: instantiate now so both failure path + success summary
  // can reach it. Prune old runs (>7d) on startup so the directory
  // doesn't grow unbounded.
  const artifactsRoot = path.resolve(__dirname, 'artifacts');
  const artifacts = new Artifacts({ runId, rootDir: artifactsRoot, logger });
  pruneOldRuns(artifactsRoot, 7 * 24 * 3600 * 1000, logger);
  // Resolve final config: scenarioDefaults → env → CLI (later wins)
  const config = mergeConfig(
    cli as Record<string, unknown>,
    process.env,
    scenario.defaultConfig,
  );
  // Ensure core knobs exist, falling back to scenario.needs
  const accounts = Number(config.accounts ?? scenario.needs.accounts);
  const rooms = Number(config.rooms ?? scenario.needs.rooms ?? 1);
  const cpusPerRoom = Number(config.cpusPerRoom ?? scenario.needs.cpusPerRoom ?? 0);
  // Sessions are distributed across rooms, so the counts must divide
  // evenly — presumably accounts/rooms players per room; verify against
  // the scenario implementations.
  if (accounts % rooms !== 0) {
    console.error(
      `Error: --accounts=${accounts} does not divide evenly into --rooms=${rooms}`,
    );
    process.exit(2);
  }
  config.accounts = accounts;
  config.rooms = rooms;
  config.cpusPerRoom = cpusPerRoom;
  // --dry-run stops after config resolution: print it and exit 0.
  if (cli.dryRun) {
    logger.info('dry_run', { config });
    console.log('Dry run OK. Resolved config:');
    console.log(JSON.stringify(config, null, 2));
    return;
  }
  // Build dependencies
  const credFile = path.resolve(__dirname, '.env.stresstest');
  // 'tiled' watch mode runs one headed browser per room; every other
  // mode runs fully headless.
  const headedHostCount = watch === 'tiled' ? rooms : 0;
  const pool = new SessionPool({
    targetUrl,
    inviteCode,
    credFile,
    logger,
    headedHostCount,
  });
  const coordinator = new RoomCoordinator();
  const screencaster = new Screencaster(logger);
  const abortController = new AbortController();
  // Graceful shutdown: first signal flips abort, scenarios finish the
  // current turn then unwind. 10 seconds later, if cleanup is still
  // hanging, the runner force-exits. A second Ctrl-C skips the wait.
  let forceExitTimer: NodeJS.Timeout | null = null;
  const onSignal = (sig: string) => {
    // Signal arriving after abort (second Ctrl-C, or abort already
    // triggered by health_fatal/watchdog) exits immediately with the
    // conventional 128+SIGINT code.
    if (abortController.signal.aborted) {
      logger.warn('force_exit', { signal: sig });
      process.exit(130);
    }
    logger.warn('signal_received', { signal: sig });
    abortController.abort();
    forceExitTimer = setTimeout(() => {
      logger.error('graceful_shutdown_timeout');
      process.exit(130);
    }, 10_000);
  };
  process.on('SIGINT', () => onSignal('SIGINT'));
  process.on('SIGTERM', () => onSignal('SIGTERM'));
  // Health probes: every 30s GET /health. Three consecutive failures
  // abort the run with a fatal error so staging outages don't get
  // misattributed to harness bugs.
  let healthFailures = 0;
  const healthTimer = setInterval(async () => {
    try {
      const res = await fetch(`${targetUrl}/health`);
      if (!res.ok) throw new Error(`status ${res.status}`);
      // Any success resets the consecutive-failure counter.
      healthFailures = 0;
    } catch (err) {
      healthFailures++;
      logger.warn('health_probe_failed', {
        consecutive: healthFailures,
        error: err instanceof Error ? err.message : String(err),
      });
      if (healthFailures >= 3) {
        logger.error('health_fatal', { consecutive: healthFailures });
        abortController.abort();
      }
    }
  }, 30_000);
  let dashboardServer: DashboardServer | null = null;
  // Starts as a no-op reporter; replaced with the real one only when
  // the dashboard server comes up.
  let dashboard: DashboardReporter = noopDashboard();
  const watchdogs = new Map<string, Watchdog>();
  let exitCode = 0;
  try {
    const sessions = await pool.acquire(accounts);
    logger.info('sessions_acquired', { count: sessions.length });
    // Build a session lookup for click-to-watch
    const sessionsByKey = new Map<string, Session>();
    for (const s of sessions) sessionsByKey.set(s.key, s);
    // Dashboard with screencaster handlers now that sessions exist
    if (watch === 'dashboard') {
      const port = Number(config.dashboardPort ?? 7777);
      dashboardServer = new DashboardServer(port, logger, {
        onStartStream: (key) => {
          const session = sessionsByKey.get(key);
          if (!session) {
            logger.warn('stream_start_unknown_session', { sessionKey: key });
            return;
          }
          // Stream JPEG frames from the session's page back over the
          // dashboard websocket. Start is async; failures are logged
          // rather than crashing the run.
          screencaster
            .start(key, session.page, (jpegBase64) => {
              dashboardServer!.broadcast({ type: 'frame', sessionKey: key, jpegBase64 });
            })
            .catch((err) =>
              logger.error('screencast_start_failed', {
                sessionKey: key,
                error: err instanceof Error ? err.message : String(err),
              }),
            );
        },
        onStopStream: (key) => {
          screencaster.stop(key).catch(() => {
            // best-effort — errors already logged inside Screencaster
          });
        },
        onDisconnect: () => {
          screencaster.stopAll().catch(() => {});
        },
      });
      await dashboardServer.start();
      dashboard = dashboardServer.reporter();
      const url = `http://localhost:${port}`;
      console.log(`Dashboard: ${url}`);
      // Best-effort: open the dashboard in the platform browser.
      try {
        const opener =
          process.platform === 'darwin'
            ? 'open'
            : process.platform === 'win32'
              ? 'start'
              : 'xdg-open';
        spawn(opener, [url], { stdio: 'ignore', detached: true }).unref();
      } catch {
        // If auto-open fails, the URL is already printed.
      }
    }
    // Per-room watchdogs — fire if no heartbeat arrives within 60s.
    // Declared at outer scope so the finally block can stop them and
    // drain any pending timers before the process exits.
    for (let i = 0; i < rooms; i++) {
      const roomId = `room-${i}`;
      const w = new Watchdog(60_000, () => {
        logger.error('watchdog_fired', { room: roomId });
        dashboard.update(roomId, { phase: 'error' });
        abortController.abort();
      });
      w.start();
      watchdogs.set(roomId, w);
    }
    const ctx: ScenarioContext = {
      config,
      sessions,
      coordinator,
      dashboard,
      logger,
      signal: abortController.signal,
      // Scenarios call this per room to keep that room's watchdog fed.
      heartbeat: (roomId: string) => {
        const w = watchdogs.get(roomId);
        if (w) w.heartbeat();
      },
    };
    const result = await scenario.run(ctx);
    logger.info('run_complete', {
      gamesCompleted: result.gamesCompleted,
      errors: result.errors.length,
      durationMs: result.durationMs,
    });
    console.log(`Games completed: ${result.gamesCompleted}`);
    console.log(`Errors: ${result.errors.length}`);
    console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);
    // Success path still writes a summary artifact (Task 27).
    artifacts.writeSummary({
      runId,
      scenario: scenario.name,
      targetUrl,
      gamesCompleted: result.gamesCompleted,
      errors: result.errors,
      durationMs: result.durationMs,
      customMetrics: result.customMetrics,
    });
    if (result.errors.length > 0) {
      console.log('Errors:');
      for (const e of result.errors) {
        console.log(` ${e.room}: ${e.reason}${e.detail ? ' — ' + e.detail : ''}`);
      }
      exitCode = 1;
    }
  } catch (err) {
    logger.error('run_failed', {
      error: err instanceof Error ? err.message : String(err),
      stack: err instanceof Error ? err.stack : undefined,
    });
    // Best-effort artifact capture from still-live sessions. The pool's
    // activeSessions field is private but accessible for this error path —
    // we want every frame we can grab before release() tears them down.
    try {
      const liveSessions = (pool as unknown as { activeSessions: Session[] }).activeSessions;
      if (liveSessions && liveSessions.length > 0) {
        await artifacts.captureAll(liveSessions);
      }
    } catch (captureErr) {
      logger.warn('artifact_capture_failed', {
        error: captureErr instanceof Error ? captureErr.message : String(captureErr),
      });
    }
    exitCode = 1;
  } finally {
    // Teardown order: stop timers first so nothing re-arms, then stop
    // streams, release browser sessions, and finally shut the dashboard.
    clearInterval(healthTimer);
    if (forceExitTimer) clearTimeout(forceExitTimer);
    for (const w of watchdogs.values()) w.stop();
    await screencaster.stopAll();
    await pool.release();
    if (dashboardServer) {
      await dashboardServer.stop();
    }
  }
  // An abort with no recorded error means the run was interrupted.
  if (abortController.signal.aborted && exitCode === 0) exitCode = 2;
  process.exit(exitCode);
}
// Kick off the runner; any error that escapes main() is printed and
// mapped to exit code 1.
void main().catch((err: unknown) => {
  console.error(err);
  process.exit(1);
});