Batched remaining harness tasks (27-30, 33):
Task 27 — Artifact capture on failure: screenshots, HTML snapshots,
game state JSON, and console error tails are captured into
tests/soak/artifacts/<run-id>/ when a scenario throws. Successful
runs get a summary.json. Old runs (>7d) are pruned on startup.
Task 28 — Graceful shutdown: first SIGINT/SIGTERM flips the abort
signal (scenarios finish current turn then unwind). 10s after, a
hard-kill fires if cleanup hangs. Double Ctrl-C = immediate exit.
Exit codes: 0 success, 1 errors, 2 interrupted.
Task 29 — Periodic health probes: every 30s GET /health against the
target server. Three consecutive failures abort the run with
health_fatal, preventing staging outages from being misattributed
to harness bugs. Corrected endpoint from /api/health to /health
per server/routers/health.py.
Task 30 — Smoke test script: tests/soak/scripts/smoke.sh, a 60s
end-to-end canary that health-probes the target, seeds if needed,
and runs one minimal populate game.
Task 33 — Version bump to v3.3.4: both index.html footers (was
v3.1.6), new footer added to admin.html (had none), pyproject.toml.
Also fixes discovered during stress testing:
- SessionPool sets baseURL on all contexts so relative goto('/')
resolves correctly between games (was "invalid URL" error)
- RoomCoordinator key is now unique per game-start (Date.now
suffix) so Deferred promises don't carry stale room codes from
previous games
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
316 lines
10 KiB
TypeScript
316 lines
10 KiB
TypeScript
#!/usr/bin/env tsx
|
|
/**
 * Golf Soak Harness — entry point.
 *
 * Usage:
 *   TEST_URL=http://localhost:8000 \
 *   SOAK_INVITE_CODE=SOAKTEST \
 *   bun run soak -- --scenario=populate --rooms=1 --accounts=2 \
 *     --cpus-per-room=0 --games-per-room=1 --holes=1 --watch=none
 */
|
|
|
|
import * as path from 'path';
|
|
import { spawn } from 'child_process';
|
|
import { parseArgs, mergeConfig, CliArgs } from './config';
|
|
import { createLogger } from './core/logger';
|
|
import { SessionPool } from './core/session-pool';
|
|
import { RoomCoordinator } from './core/room-coordinator';
|
|
import { DashboardServer } from './dashboard/server';
|
|
import { Screencaster } from './core/screencaster';
|
|
import { Watchdog } from './core/watchdog';
|
|
import { Artifacts, pruneOldRuns } from './core/artifacts';
|
|
import { getScenario, listScenarios } from './scenarios';
|
|
import type { DashboardReporter, ScenarioContext, Session } from './core/types';
|
|
|
|
/**
 * Builds a reporter whose methods all do nothing.
 *
 * Used as the default `dashboard` so the rest of the runner can call the
 * reporter unconditionally, whether or not a real dashboard server is running.
 *
 * @returns A reporter with no-op `update`, `log` and `incrementMetric`.
 */
function noopDashboard(): DashboardReporter {
  return {
    update: () => {},
    log: () => {},
    incrementMetric: () => {},
  };
}
|
|
|
|
/**
 * Prints every scenario returned by `listScenarios()` to stdout: one line
 * with the name (padded to 12 columns) and description, followed by one line
 * with its resource needs. Missing `rooms` / `cpusPerRoom` needs are shown
 * as 1 and 0 respectively.
 */
function printScenarioList(): void {
  console.log('Available scenarios:');
  for (const s of listScenarios()) {
    console.log(` ${s.name.padEnd(12)} ${s.description}`);
    console.log(
      ` needs: accounts=${s.needs.accounts}, rooms=${s.needs.rooms ?? 1}, cpus=${s.needs.cpusPerRoom ?? 0}`,
    );
  }
}
|
|
|
|
/**
 * Best-effort: opens `url` with the platform's default opener.
 *
 * Failures never abort the run; the caller has already printed the URL.
 */
function openInBrowser(url: string, logger: ReturnType<typeof createLogger>): void {
  let command: string;
  let args: string[];
  if (process.platform === 'darwin') {
    command = 'open';
    args = [url];
  } else if (process.platform === 'win32') {
    // `start` is a cmd.exe builtin, not an executable, so it has to be run
    // through cmd. The empty string is the window-title argument of `start`.
    command = 'cmd';
    args = ['/c', 'start', '', url];
  } else {
    command = 'xdg-open';
    args = [url];
  }

  try {
    const child = spawn(command, args, { stdio: 'ignore', detached: true });
    // A missing opener binary is reported asynchronously through the child's
    // 'error' event, which try/catch cannot see. Without a listener, Node
    // raises it as an uncaught exception and the whole run would die.
    child.on('error', (err) => {
      logger.warn('dashboard_open_failed', { error: err.message });
    });
    child.unref();
  } catch {
    // If auto-open fails, the URL is already printed.
  }
}

/**
 * Runs one soak scenario end to end.
 *
 * Parses CLI arguments, resolves the scenario and its config, acquires
 * sessions, optionally starts the dashboard, runs the scenario under per-room
 * watchdogs and periodic health probes, writes the run summary, and finally
 * releases every resource before exiting the process.
 *
 * Exit codes set here: 0 when the scenario reports no errors, 1 when it
 * reports errors, throws, or a cleanup step fails, 2 for usage errors or when
 * the run was aborted without recorded errors, 130 on a forced exit (second
 * signal or graceful-shutdown timeout).
 */
async function main(): Promise<void> {
  const cli: CliArgs = parseArgs(process.argv.slice(2));

  if (cli.listOnly) {
    printScenarioList();
    return;
  }

  if (!cli.scenario) {
    console.error('Error: --scenario=<name> is required. Use --list to see scenarios.');
    process.exit(2);
  }

  const scenario = getScenario(cli.scenario);
  if (!scenario) {
    console.error(`Error: unknown scenario "${cli.scenario}". Use --list to see scenarios.`);
    process.exit(2);
  }

  const runId =
    cli.runId ?? `${cli.scenario}-${new Date().toISOString().replace(/[:.]/g, '-')}`;
  const targetUrl = cli.target ?? process.env.TEST_URL ?? 'http://localhost:8000';
  const inviteCode = process.env.SOAK_INVITE_CODE ?? 'SOAKTEST';
  const watch = cli.watch ?? 'dashboard';

  const logger = createLogger({ runId });
  logger.info('run_start', {
    scenario: scenario.name,
    targetUrl,
    watch,
    cli,
  });

  // Artifacts: instantiate now so both failure path + success summary
  // can reach it. Prune old runs (>7d) on startup so the directory
  // doesn't grow unbounded.
  const artifactsRoot = path.resolve(__dirname, 'artifacts');
  const artifacts = new Artifacts({ runId, rootDir: artifactsRoot, logger });
  pruneOldRuns(artifactsRoot, 7 * 24 * 3600 * 1000, logger);

  // Resolve final config: scenarioDefaults → env → CLI (later wins)
  const config = mergeConfig(
    cli as Record<string, unknown>,
    process.env,
    scenario.defaultConfig,
  );

  // Ensure core knobs exist, falling back to scenario.needs
  const accounts = Number(config.accounts ?? scenario.needs.accounts);
  const rooms = Number(config.rooms ?? scenario.needs.rooms ?? 1);
  const cpusPerRoom = Number(config.cpusPerRoom ?? scenario.needs.cpusPerRoom ?? 0);
  if (accounts % rooms !== 0) {
    console.error(
      `Error: --accounts=${accounts} does not divide evenly into --rooms=${rooms}`,
    );
    process.exit(2);
  }
  config.accounts = accounts;
  config.rooms = rooms;
  config.cpusPerRoom = cpusPerRoom;

  if (cli.dryRun) {
    logger.info('dry_run', { config });
    console.log('Dry run OK. Resolved config:');
    console.log(JSON.stringify(config, null, 2));
    return;
  }

  // Build dependencies
  const credFile = path.resolve(__dirname, '.env.stresstest');
  const headedHostCount = watch === 'tiled' ? rooms : 0;
  const pool = new SessionPool({
    targetUrl,
    inviteCode,
    credFile,
    logger,
    headedHostCount,
  });
  const coordinator = new RoomCoordinator();
  const screencaster = new Screencaster(logger);

  const abortController = new AbortController();

  // Graceful shutdown: first signal flips abort, scenarios finish the
  // current turn then unwind. 10 seconds later, if cleanup is still
  // hanging, the runner force-exits. A second Ctrl-C skips the wait.
  let forceExitTimer: NodeJS.Timeout | null = null;
  const onSignal = (sig: string) => {
    if (abortController.signal.aborted) {
      logger.warn('force_exit', { signal: sig });
      process.exit(130);
    }
    logger.warn('signal_received', { signal: sig });
    abortController.abort();
    forceExitTimer = setTimeout(() => {
      logger.error('graceful_shutdown_timeout');
      process.exit(130);
    }, 10_000);
  };
  process.on('SIGINT', () => onSignal('SIGINT'));
  process.on('SIGTERM', () => onSignal('SIGTERM'));

  // Health probes: every 30s GET /health. Three consecutive failures
  // abort the run with a fatal error so staging outages don't get
  // misattributed to harness bugs.
  let healthFailures = 0;
  const healthTimer = setInterval(async () => {
    try {
      const res = await fetch(`${targetUrl}/health`);
      if (!res.ok) throw new Error(`status ${res.status}`);
      healthFailures = 0;
    } catch (err) {
      healthFailures++;
      logger.warn('health_probe_failed', {
        consecutive: healthFailures,
        error: err instanceof Error ? err.message : String(err),
      });
      if (healthFailures >= 3) {
        logger.error('health_fatal', { consecutive: healthFailures });
        abortController.abort();
      }
    }
  }, 30_000);

  let dashboardServer: DashboardServer | null = null;
  let dashboard: DashboardReporter = noopDashboard();
  const watchdogs = new Map<string, Watchdog>();
  let exitCode = 0;

  // Runs one teardown step in isolation so that a failure in one step cannot
  // prevent the remaining resources from being released. A failed step still
  // turns an otherwise clean run into exit code 1.
  const cleanupStep = async (step: string, fn: () => unknown): Promise<void> => {
    try {
      await fn();
    } catch (err) {
      logger.error('cleanup_failed', {
        step,
        error: err instanceof Error ? err.message : String(err),
      });
      if (exitCode === 0) exitCode = 1;
    }
  };

  try {
    const sessions = await pool.acquire(accounts);
    logger.info('sessions_acquired', { count: sessions.length });

    // Build a session lookup for click-to-watch
    const sessionsByKey = new Map<string, Session>();
    for (const s of sessions) sessionsByKey.set(s.key, s);

    // Dashboard with screencaster handlers now that sessions exist
    if (watch === 'dashboard') {
      const port = Number(config.dashboardPort ?? 7777);
      // Local const so the stream callbacks below can reference the server
      // without a non-null assertion on the outer nullable variable.
      const server: DashboardServer = new DashboardServer(port, logger, {
        onStartStream: (key) => {
          const session = sessionsByKey.get(key);
          if (!session) {
            logger.warn('stream_start_unknown_session', { sessionKey: key });
            return;
          }
          screencaster
            .start(key, session.page, (jpegBase64) => {
              server.broadcast({ type: 'frame', sessionKey: key, jpegBase64 });
            })
            .catch((err) =>
              logger.error('screencast_start_failed', {
                sessionKey: key,
                error: err instanceof Error ? err.message : String(err),
              }),
            );
        },
        onStopStream: (key) => {
          screencaster.stop(key).catch(() => {
            // best-effort — errors already logged inside Screencaster
          });
        },
        onDisconnect: () => {
          screencaster.stopAll().catch(() => {});
        },
      });
      dashboardServer = server;
      await server.start();
      dashboard = server.reporter();
      const url = `http://localhost:${port}`;
      console.log(`Dashboard: ${url}`);
      openInBrowser(url, logger);
    }

    // Per-room watchdogs — fire if no heartbeat arrives within 60s.
    // The map is declared at outer scope so the finally block can stop
    // them and drain any pending timers before the process exits.
    for (let i = 0; i < rooms; i++) {
      const roomId = `room-${i}`;
      const w = new Watchdog(60_000, () => {
        logger.error('watchdog_fired', { room: roomId });
        dashboard.update(roomId, { phase: 'error' });
        abortController.abort();
      });
      w.start();
      watchdogs.set(roomId, w);
    }

    const ctx: ScenarioContext = {
      config,
      sessions,
      coordinator,
      dashboard,
      logger,
      signal: abortController.signal,
      heartbeat: (roomId: string) => {
        const w = watchdogs.get(roomId);
        if (w) w.heartbeat();
      },
    };

    const result = await scenario.run(ctx);
    logger.info('run_complete', {
      gamesCompleted: result.gamesCompleted,
      errors: result.errors.length,
      durationMs: result.durationMs,
    });
    console.log(`Games completed: ${result.gamesCompleted}`);
    console.log(`Errors: ${result.errors.length}`);
    console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);
    artifacts.writeSummary({
      runId,
      scenario: scenario.name,
      targetUrl,
      gamesCompleted: result.gamesCompleted,
      errors: result.errors,
      durationMs: result.durationMs,
      customMetrics: result.customMetrics,
    });
    if (result.errors.length > 0) {
      console.log('Errors:');
      for (const e of result.errors) {
        console.log(` ${e.room}: ${e.reason}${e.detail ? ' — ' + e.detail : ''}`);
      }
      exitCode = 1;
    }
  } catch (err) {
    logger.error('run_failed', {
      error: err instanceof Error ? err.message : String(err),
      stack: err instanceof Error ? err.stack : undefined,
    });
    // Best-effort artifact capture from still-live sessions. The pool's
    // activeSessions field is private but accessible for this error path —
    // we want every frame we can grab before release() tears them down.
    try {
      const liveSessions = (pool as unknown as { activeSessions: Session[] }).activeSessions;
      if (liveSessions && liveSessions.length > 0) {
        await artifacts.captureAll(liveSessions);
      }
    } catch (captureErr) {
      logger.warn('artifact_capture_failed', {
        error: captureErr instanceof Error ? captureErr.message : String(captureErr),
      });
    }
    exitCode = 1;
  } finally {
    clearInterval(healthTimer);
    for (const w of watchdogs.values()) w.stop();
    await cleanupStep('screencaster', () => screencaster.stopAll());
    await cleanupStep('session_pool', () => pool.release());
    if (dashboardServer) {
      const server = dashboardServer;
      await cleanupStep('dashboard', () => server.stop());
    }
    // Disarm the force-exit timer only after cleanup has finished, so that a
    // teardown step that hangs after a signal is still hard-killed.
    if (forceExitTimer) clearTimeout(forceExitTimer);
  }

  if (abortController.signal.aborted && exitCode === 0) exitCode = 2;
  process.exit(exitCode);
}
|
|
|
|
// Last-resort handler: anything that escapes main() is printed and turned
// into a non-zero exit code.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});
|