feat(soak): artifacts, graceful shutdown, health probes, smoke script, v3.3.4
Batched remaining harness tasks (27-30, 33):
Task 27 — Artifact capture on failure: screenshots, HTML snapshots,
game state JSON, and console error tails are captured into
tests/soak/artifacts/<run-id>/ when a scenario throws. Successful
runs get a summary.json. Old runs (>7d) are pruned on startup.
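A failed run's directory looks roughly like this (run id and session
key are illustrative; the file names match what captureSession writes):

    tests/soak/artifacts/<run-id>/
      room-unknown/
        host-0.png           full-page screenshot
        host-0.html          DOM snapshot
        host-0.state.json    bot's view of the game state
        host-0.console.txt   console error tail
      summary.json           successful runs only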
Task 28 — Graceful shutdown: the first SIGINT/SIGTERM flips the
abort signal (scenarios finish the current turn, then unwind). If
cleanup hangs, a hard kill fires 10s later; a second Ctrl-C exits
immediately. Exit codes: 0 success, 1 errors, 2 interrupted.
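A CI wrapper might branch on that exit-code contract like so (a
sketch; scenario flags elided):

    bun run soak -- --scenario=populate || rc=$?
    case "${rc:-0}" in
      0) echo "soak passed" ;;
      1) echo "soak completed with errors" ;;
      2) echo "soak interrupted" ;;
    esac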
Task 29 — Periodic health probes: every 30s GET /health against the
target server. Three consecutive failures abort the run with
health_fatal, preventing staging outages from being misattributed
to harness bugs. Corrected endpoint from /api/health to /health
per server/routers/health.py.
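The probe can be reproduced by hand (assuming the smoke script's
default local target):

    curl -fsS http://localhost:8000/health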
Task 30 — Smoke test script: tests/soak/scripts/smoke.sh, a 60s
end-to-end canary that health-probes the target, seeds if needed,
and runs one minimal populate game.
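Typical invocation, with both variables optional (these are the
script's own defaults):

    TEST_URL=http://localhost:8000 SOAK_INVITE_CODE=SOAKTEST \
      bash tests/soak/scripts/smoke.sh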
Task 33 — Version bump to v3.3.4: both index.html footers (was
v3.1.6), new footer added to admin.html (had none), pyproject.toml.
Also: two fixes discovered during stress testing:
- SessionPool sets baseURL on all contexts so a relative goto('/')
  resolves correctly between games (previously failed with an
  "invalid URL" error)
- RoomCoordinator keys are now unique per game start (Date.now
  suffix), so Deferred promises no longer carry stale room codes
  over from previous games; see the sketch below
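The failure mode in miniature (illustrative TypeScript, not the
actual RoomCoordinator):

    // A resolve-once deferred: a second resolve() for the same key is a no-op.
    class Deferred<T> {
      promise: Promise<T>;
      resolve!: (v: T) => void;
      constructor() {
        this.promise = new Promise<T>((res) => (this.resolve = res));
      }
    }
    const byKey = new Map<string, Deferred<string>>();
    const get = (k: string): Deferred<string> => {
      let d = byKey.get(k);
      if (!d) { d = new Deferred<string>(); byKey.set(k, d); }
      return d;
    };
    get('room-1').resolve('CODE-A'); // game 1 announces its code
    get('room-1').resolve('CODE-B'); // game 2: no-op, promise already settled
    // Joiners awaiting get('room-1').promise always see 'CODE-A';
    // hence the unique key: `${opts.roomId}-${Date.now()}`.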
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
tests/soak/core/artifacts.ts (new file, 121 lines)
@@ -0,0 +1,121 @@
/**
 * Artifacts — capture session debugging info on scenario failure.
 *
 * When runner.ts hits an unrecoverable error during a scenario, it
 * calls `artifacts.captureAll(liveSessions)` which dumps one
 * screenshot + HTML snapshot + game state JSON + console tail per
 * session into `tests/soak/artifacts/<run-id>/`.
 *
 * Successful runs get a lightweight `summary.json` written at the
 * same path so post-run inspection has something to grep.
 *
 * `pruneOldRuns` sweeps run dirs older than maxAgeMs on startup so
 * the artifacts directory doesn't grow unbounded.
 */

import * as fs from 'fs';
import * as path from 'path';
import type { Session, Logger } from './types';

export interface ArtifactsOptions {
  runId: string;
  /** Absolute path to the artifacts root, e.g. /path/to/tests/soak/artifacts */
  rootDir: string;
  logger: Logger;
}

export class Artifacts {
  readonly runDir: string;

  constructor(private opts: ArtifactsOptions) {
    this.runDir = path.join(opts.rootDir, opts.runId);
    fs.mkdirSync(this.runDir, { recursive: true });
  }

  /** Capture screenshot + HTML + state + console tail for one session. */
  async captureSession(session: Session, roomId: string): Promise<void> {
    const dir = path.join(this.runDir, roomId);
    fs.mkdirSync(dir, { recursive: true });
    const prefix = session.key;

    try {
      const png = await session.page.screenshot({ fullPage: true });
      fs.writeFileSync(path.join(dir, `${prefix}.png`), png);
    } catch (err) {
      this.opts.logger.warn('artifact_screenshot_failed', {
        session: session.key,
        error: err instanceof Error ? err.message : String(err),
      });
    }

    try {
      const html = await session.page.content();
      fs.writeFileSync(path.join(dir, `${prefix}.html`), html);
    } catch (err) {
      this.opts.logger.warn('artifact_html_failed', {
        session: session.key,
        error: err instanceof Error ? err.message : String(err),
      });
    }

    try {
      const state = await session.bot.getGameState();
      fs.writeFileSync(
        path.join(dir, `${prefix}.state.json`),
        JSON.stringify(state, null, 2),
      );
    } catch (err) {
      this.opts.logger.warn('artifact_state_failed', {
        session: session.key,
        error: err instanceof Error ? err.message : String(err),
      });
    }

    try {
      const errors = session.bot.getConsoleErrors?.() ?? [];
      fs.writeFileSync(path.join(dir, `${prefix}.console.txt`), errors.join('\n'));
    } catch {
      // ignore — not all bot flavors expose console errors
    }
  }

  /**
   * Best-effort capture for every live session. We don't know which
   * room each session belongs to at this level, so everything lands
   * under `room-unknown/` unless callers partition sessions first.
   */
  async captureAll(sessions: Session[]): Promise<void> {
    await Promise.all(
      sessions.map((s) => this.captureSession(s, 'room-unknown')),
    );
  }

  writeSummary(summary: object): void {
    fs.writeFileSync(
      path.join(this.runDir, 'summary.json'),
      JSON.stringify(summary, null, 2),
    );
  }
}

/** Prune run directories older than `maxAgeMs`. Called on runner startup. */
export function pruneOldRuns(
  rootDir: string,
  maxAgeMs: number,
  logger: Logger,
): void {
  if (!fs.existsSync(rootDir)) return;
  const now = Date.now();
  for (const entry of fs.readdirSync(rootDir)) {
    const full = path.join(rootDir, entry);
    try {
      const stat = fs.statSync(full);
      if (stat.isDirectory() && now - stat.mtimeMs > maxAgeMs) {
        fs.rmSync(full, { recursive: true, force: true });
        logger.info('artifact_pruned', { runId: entry });
      }
    } catch {
      // ignore — best effort
    }
  }
}
@@ -259,8 +259,13 @@ export class SessionPool {
    // a typical 1920×1080 display. Two windows side-by-side still fit
    // horizontally; if the user runs more than 2 rooms in tiled mode
    // the extra windows will overlap and need to be arranged manually.
    //
    // baseURL is set on every context so relative goto('/') calls
    // (used between games to bounce back to the lobby) resolve to
    // the target server instead of failing with "invalid URL".
    const context = await targetBrowser.newContext({
      ...this.opts.contextOptions,
      baseURL: this.opts.targetUrl,
      ...(useHeaded ? { viewport: { width: 960, height: 900 } } : {}),
    });
    await this.injectAuth(context, account);

@@ -18,6 +18,7 @@ import { RoomCoordinator } from './core/room-coordinator';
import { DashboardServer } from './dashboard/server';
import { Screencaster } from './core/screencaster';
import { Watchdog } from './core/watchdog';
import { Artifacts, pruneOldRuns } from './core/artifacts';
import { getScenario, listScenarios } from './scenarios';
import type { DashboardReporter, ScenarioContext, Session } from './core/types';

@@ -72,6 +73,13 @@ async function main(): Promise<void> {
    cli,
  });

  // Artifacts: instantiate now so both failure path + success summary
  // can reach it. Prune old runs (>7d) on startup so the directory
  // doesn't grow unbounded.
  const artifactsRoot = path.resolve(__dirname, 'artifacts');
  const artifacts = new Artifacts({ runId, rootDir: artifactsRoot, logger });
  pruneOldRuns(artifactsRoot, 7 * 24 * 3600 * 1000, logger);

  // Resolve final config: scenarioDefaults → env → CLI (later wins)
  const config = mergeConfig(
    cli as Record<string, unknown>,

@@ -115,13 +123,47 @@ async function main(): Promise<void> {

  const abortController = new AbortController();

  // Graceful shutdown: first signal flips abort, scenarios finish the
  // current turn then unwind. 10 seconds later, if cleanup is still
  // hanging, the runner force-exits. A second Ctrl-C skips the wait.
  let forceExitTimer: NodeJS.Timeout | null = null;
  const onSignal = (sig: string) => {
    if (abortController.signal.aborted) {
      logger.warn('force_exit', { signal: sig });
      process.exit(130);
    }
    logger.warn('signal_received', { signal: sig });
    abortController.abort();
    forceExitTimer = setTimeout(() => {
      logger.error('graceful_shutdown_timeout');
      process.exit(130);
    }, 10_000);
  };
  process.on('SIGINT', () => onSignal('SIGINT'));
  process.on('SIGTERM', () => onSignal('SIGTERM'));

  // Health probes: every 30s GET /health. Three consecutive failures
  // abort the run with a fatal error so staging outages don't get
  // misattributed to harness bugs.
  let healthFailures = 0;
  const healthTimer = setInterval(async () => {
    try {
      const res = await fetch(`${targetUrl}/health`);
      if (!res.ok) throw new Error(`status ${res.status}`);
      healthFailures = 0;
    } catch (err) {
      healthFailures++;
      logger.warn('health_probe_failed', {
        consecutive: healthFailures,
        error: err instanceof Error ? err.message : String(err),
      });
      if (healthFailures >= 3) {
        logger.error('health_fatal', { consecutive: healthFailures });
        abortController.abort();
      }
    }
  }, 30_000);

  let dashboardServer: DashboardServer | null = null;
  let dashboard: DashboardReporter = noopDashboard();
  const watchdogs = new Map<string, Watchdog>();

@@ -217,6 +259,15 @@ async function main(): Promise<void> {
    console.log(`Games completed: ${result.gamesCompleted}`);
    console.log(`Errors: ${result.errors.length}`);
    console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);
    artifacts.writeSummary({
      runId,
      scenario: scenario.name,
      targetUrl,
      gamesCompleted: result.gamesCompleted,
      errors: result.errors,
      durationMs: result.durationMs,
      customMetrics: result.customMetrics,
    });
    if (result.errors.length > 0) {
      console.log('Errors:');
      for (const e of result.errors) {

@@ -229,8 +280,23 @@ async function main(): Promise<void> {
      error: err instanceof Error ? err.message : String(err),
      stack: err instanceof Error ? err.stack : undefined,
    });
    // Best-effort artifact capture from still-live sessions. The pool's
    // activeSessions field is private but accessible for this error path —
    // we want every frame we can grab before release() tears them down.
    try {
      const liveSessions = (pool as unknown as { activeSessions: Session[] }).activeSessions;
      if (liveSessions && liveSessions.length > 0) {
        await artifacts.captureAll(liveSessions);
      }
    } catch (captureErr) {
      logger.warn('artifact_capture_failed', {
        error: captureErr instanceof Error ? captureErr.message : String(captureErr),
      });
    }
    exitCode = 1;
  } finally {
    clearInterval(healthTimer);
    if (forceExitTimer) clearTimeout(forceExitTimer);
    for (const w of watchdogs.values()) w.stop();
    await screencaster.stopAll();
    await pool.release();

@@ -55,9 +55,15 @@ export async function runOneMultiplayerGame(
  // goto('/') bounces them back; localStorage-cached auth persists.
  await Promise.all(sessions.map((s) => s.bot.goto('/')));

  // Use a unique coordinator key per game-start so Deferreds don't
  // carry stale room codes from previous games. The coordinator's
  // Promises only resolve once — reusing `opts.roomId` across games
  // would make joiners receive the first game's code on every game.
  const coordKey = `${opts.roomId}-${Date.now()}`;

  // Host creates game and announces the code
  const code = await host.bot.createGame(host.account.username);
- ctx.coordinator.announce(opts.roomId, code);
+ ctx.coordinator.announce(coordKey, code);
  ctx.heartbeat(opts.roomId);
  ctx.dashboard.update(opts.roomId, { phase: 'lobby' });
  ctx.logger.info('room_created', { room: opts.roomId, code });
@@ -65,7 +71,7 @@ export async function runOneMultiplayerGame(
  // Joiners join concurrently
  await Promise.all(
    joiners.map(async (joiner) => {
-     const awaited = await ctx.coordinator.await(opts.roomId);
+     const awaited = await ctx.coordinator.await(coordKey);
      await joiner.bot.joinGame(awaited, joiner.account.username);
    }),
  );

tests/soak/scripts/smoke.sh (new file, executable, 37 lines)
@@ -0,0 +1,37 @@
#!/usr/bin/env bash
# Soak harness smoke test — end-to-end canary against local dev.
# Expected runtime: ~60 seconds.
set -euo pipefail

cd "$(dirname "$0")/.."

: "${TEST_URL:=http://localhost:8000}"
: "${SOAK_INVITE_CODE:=SOAKTEST}"

echo "Smoke target: $TEST_URL"
echo "Invite code: $SOAK_INVITE_CODE"

# 1. Health probe (endpoint corrected to /health, matching the runner)
curl -fsS "$TEST_URL/health" > /dev/null || {
  echo "FAIL: target server unreachable at $TEST_URL"
  exit 1
}

# 2. Ensure minimum accounts
if [ ! -f .env.stresstest ]; then
  echo "Seeding accounts..."
  bun run seed -- --count=4
fi

# 3. Run minimum viable scenario
TEST_URL="$TEST_URL" SOAK_INVITE_CODE="$SOAK_INVITE_CODE" \
  bun run soak -- \
    --scenario=populate \
    --accounts=2 \
    --rooms=1 \
    --cpus-per-room=0 \
    --games-per-room=1 \
    --holes=1 \
    --watch=none

echo "Smoke PASSED"