Features:
- Multiplayer WebSocket game server (FastAPI)
- 8 AI personalities with distinct play styles
- 15+ house rule variants
- SQLite game logging for AI analysis
- Comprehensive test suite (80+ tests)

AI improvements:
- Fixed Maya bug (taking bad cards, discarding good ones)
- Personality traits influence style without overriding competence
- Zero blunders detected in 1000+ game simulations

Testing infrastructure:
- Game rules verification (test_game.py)
- AI decision analysis (game_analyzer.py)
- Score distribution analysis (score_analysis.py)
- House rules testing (test_house_rules.py)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
572 lines
17 KiB
Python
572 lines
17 KiB
Python
"""
House Rules Testing Suite

Tests all house rule combinations to:
1. Find edge cases and bugs
2. Establish baseline performance metrics
3. Verify rules affect gameplay as expected
"""
|
|
|
|
import random
|
|
import sys
|
|
from collections import defaultdict
|
|
from dataclasses import dataclass
|
|
from typing import Optional
|
|
|
|
from game import Game, Player, GamePhase, GameOptions
|
|
from ai import GolfAI, CPUProfile, CPU_PROFILES, get_ai_card_value
|
|
|
|
|
|
@dataclass
class RuleTestResult:
    """Aggregated outcome of simulating one house-rule configuration.

    The list fields hold the raw samples; the properties derive summary
    statistics from them. Every statistic falls back to zero when no samples
    were collected (for example when every game of the configuration failed),
    so reporting code can format a result without special-casing empty data.
    """

    name: str  # Label of the configuration, e.g. "BASELINE"
    options: GameOptions  # Options the games were run with
    games_played: int  # Number of games attempted for this configuration
    scores: list[int]  # Individual player scores pooled across games
    turn_counts: list[int]  # One turn count per game that produced scores
    negative_scores: int  # Count of scores < 0
    zero_scores: int  # Count of exactly 0
    high_scores: int  # Count of scores > 25
    errors: list[str]  # Messages from games that failed

    @property
    def mean_score(self) -> float:
        """Arithmetic mean of ``scores``, or 0.0 when there are none."""
        if not self.scores:
            return 0.0
        return sum(self.scores) / len(self.scores)

    @property
    def median_score(self) -> float:
        """Median of ``scores`` as a float, or 0.0 when there are none.

        For an even number of samples the two middle values are averaged.
        """
        if not self.scores:
            return 0.0
        ordered = sorted(self.scores)
        mid, is_odd = divmod(len(ordered), 2)
        if is_odd:
            # Cast so the declared float return type holds for odd counts too.
            return float(ordered[mid])
        return (ordered[mid - 1] + ordered[mid]) / 2

    @property
    def mean_turns(self) -> float:
        """Arithmetic mean of ``turn_counts``, or 0.0 when there are none."""
        if not self.turn_counts:
            return 0.0
        return sum(self.turn_counts) / len(self.turn_counts)

    @property
    def min_score(self) -> int:
        """Lowest score seen, or 0 when there are none."""
        return min(self.scores, default=0)

    @property
    def max_score(self) -> int:
        """Highest score seen, or 0 when there are none."""
        return max(self.scores, default=0)
|
|
|
|
|
|
def _forced_swap_position(player: Player, game: Game) -> int:
    """Choose a swap slot when the AI declined to swap a card taken from the discard pile."""
    face_down = [i for i, card in enumerate(player.cards) if not card.face_up]
    if face_down:
        # Any hidden card will do; pick one at random.
        return random.choice(face_down)
    # Everything is face up: replace the card the AI values worst (highest
    # value). max() returns the first maximum, and 0 for an empty hand.
    return max(
        range(len(player.cards)),
        key=lambda i: get_ai_card_value(player.cards[i], game.options),
        default=0,
    )


def run_game_with_options(
    options: GameOptions,
    num_players: int = 4,
    max_turns: int = 300,
) -> tuple[list[int], int, Optional[str]]:
    """Simulate one single-deck, single-round game between CPU players.

    Args:
        options: House-rule options passed to ``Game.start_game``.
        num_players: Number of CPU players to seat. Capped at the number of
            available ``CPU_PROFILES`` because profiles are sampled without
            replacement.
        max_turns: Safety limit on individual player turns, so a game that
            never finishes is reported instead of hanging the suite.

    Returns:
        ``(scores, turn_count, error_message)``. On success ``scores`` holds
        each player's ``total_score`` and ``error_message`` is ``None``. On
        failure ``scores`` is empty and ``error_message`` describes the
        problem; ``turn_count`` is the turns played when the turn limit was
        hit and 0 when an exception was raised.
    """
    active_phases = (GamePhase.PLAYING, GamePhase.FINAL_TURN)
    profiles = random.sample(CPU_PROFILES, min(num_players, len(CPU_PROFILES)))

    game = Game()
    player_profiles: dict[str, CPUProfile] = {}

    for i, profile in enumerate(profiles):
        player = Player(id=f"cpu_{i}", name=profile.name)
        game.add_player(player)
        player_profiles[player.id] = profile

    try:
        game.start_game(num_decks=1, num_rounds=1, options=options)

        # Initial flips
        for player in game.players:
            positions = GolfAI.choose_initial_flips(options.initial_flips)
            game.flip_initial_cards(player.id, positions)

        # Play game
        turn = 0
        while game.phase in active_phases and turn < max_turns:
            current = game.current_player()
            if not current:
                break

            profile = player_profiles[current.id]

            # Draw
            discard_top = game.discard_top()
            take_discard = GolfAI.should_take_discard(discard_top, current, profile, game)
            drawn = game.draw_card(current.id, "discard" if take_discard else "deck")

            if not drawn:
                # Deck exhausted - this is an edge case; score what we have.
                break

            # Swap or discard
            swap_pos = GolfAI.choose_swap_or_discard(drawn, current, profile, game)

            if swap_pos is None and game.drawn_from_discard:
                # The harness always swaps a card that came from the discard
                # pile - presumably the game rejects discarding it straight
                # back (verify against Game.discard_drawn).
                swap_pos = _forced_swap_position(current, game)

            if swap_pos is not None:
                game.swap_card(current.id, swap_pos)
            else:
                game.discard_drawn(current.id)
                if game.flip_on_discard:
                    flip_pos = GolfAI.choose_flip_after_discard(current, profile)
                    game.flip_and_end_turn(current.id, flip_pos)

            turn += 1

        # Only a game that is STILL in progress has overrun the limit; one
        # that ended on exactly the last allowed turn finished normally.
        if turn >= max_turns and game.phase in active_phases:
            return [], turn, f"Game exceeded {max_turns} turns - possible infinite loop"

        scores = [p.total_score for p in game.players]
        return scores, turn, None

    except Exception as e:
        # Deliberately broad: any failure inside the engine or AI is reported
        # as a test error rather than aborting the whole run. The type name is
        # included because many exceptions carry an unhelpful bare message.
        return [], 0, f"Exception: {type(e).__name__}: {e}"
|
|
|
|
|
|
def test_rule_config(name: str, options: GameOptions, num_games: int = 50) -> RuleTestResult:
    """Simulate ``num_games`` games under one configuration and tally the results.

    Games that report an error contribute only their message; their scores
    and turn counts are left out of the statistics. ``games_played`` records
    the number of games attempted.
    """
    pooled_scores = []
    turns_per_game = []
    failures = []
    tally = {"negative": 0, "zero": 0, "high": 0}

    for _ in range(num_games):
        game_scores, game_turns, failure = run_game_with_options(options)

        if failure:
            failures.append(failure)
            continue

        pooled_scores += game_scores
        turns_per_game.append(game_turns)

        # The three buckets are mutually exclusive; scores from 1 to 25 fall
        # into none of them.
        tally["negative"] += sum(1 for score in game_scores if score < 0)
        tally["zero"] += sum(1 for score in game_scores if score == 0)
        tally["high"] += sum(1 for score in game_scores if score > 25)

    return RuleTestResult(
        name=name,
        options=options,
        games_played=num_games,
        scores=pooled_scores,
        turn_counts=turns_per_game,
        negative_scores=tally["negative"],
        zero_scores=tally["zero"],
        high_scores=tally["high"],
        errors=failures,
    )
|
|
|
|
|
|
# =============================================================================
# House Rule Configurations to Test
# =============================================================================
|
|
|
|
def get_test_configs() -> list[tuple[str, GameOptions]]:
    """Return every labelled ``GameOptions`` variant the suite exercises.

    The list starts with the "BASELINE" entry that the reports compare
    against, followed by single-rule variants grouped by theme and finally a
    handful of multi-rule combinations.
    """
    # Baseline (no house rules)
    baseline = [
        ("BASELINE", GameOptions(
            initial_flips=2,
            flip_on_discard=False,
            use_jokers=False,
        )),
    ]

    # === Standard Options ===
    standard_options = [
        ("flip_on_discard", GameOptions(initial_flips=2, flip_on_discard=True)),
        ("initial_flips=0", GameOptions(initial_flips=0, flip_on_discard=False)),
        ("initial_flips=1", GameOptions(initial_flips=1, flip_on_discard=False)),
        ("knock_penalty", GameOptions(initial_flips=2, knock_penalty=True)),
        ("use_jokers", GameOptions(initial_flips=2, use_jokers=True)),
    ]

    # === Point Modifiers ===
    point_modifiers = [
        ("lucky_swing", GameOptions(initial_flips=2, use_jokers=True, lucky_swing=True)),
        ("super_kings", GameOptions(initial_flips=2, super_kings=True)),
        ("lucky_sevens", GameOptions(initial_flips=2, lucky_sevens=True)),
        ("ten_penny", GameOptions(initial_flips=2, ten_penny=True)),
    ]

    # === Bonuses/Penalties ===
    bonuses_and_penalties = [
        ("knock_bonus", GameOptions(initial_flips=2, knock_bonus=True)),
        ("underdog_bonus", GameOptions(initial_flips=2, underdog_bonus=True)),
        ("tied_shame", GameOptions(initial_flips=2, tied_shame=True)),
        ("blackjack", GameOptions(initial_flips=2, blackjack=True)),
    ]

    # === Gameplay Twists ===
    gameplay_twists = [
        ("queens_wild", GameOptions(initial_flips=2, queens_wild=True)),
        ("four_of_a_kind", GameOptions(initial_flips=2, four_of_a_kind=True)),
        ("eagle_eye", GameOptions(initial_flips=2, use_jokers=True, eagle_eye=True)),
    ]

    # === Interesting Combinations ===
    combinations = [
        ("CHAOS (all point mods)", GameOptions(
            initial_flips=2,
            use_jokers=True,
            lucky_swing=True,
            super_kings=True,
            lucky_sevens=True,
            ten_penny=True,
        )),
        ("COMPETITIVE (penalties)", GameOptions(
            initial_flips=2,
            knock_penalty=True,
            tied_shame=True,
        )),
        ("GENEROUS (bonuses)", GameOptions(
            initial_flips=2,
            knock_bonus=True,
            underdog_bonus=True,
        )),
        ("WILD CARDS", GameOptions(
            initial_flips=2,
            use_jokers=True,
            queens_wild=True,
            four_of_a_kind=True,
            eagle_eye=True,
        )),
        ("CLASSIC+ (jokers + flip)", GameOptions(
            initial_flips=2,
            flip_on_discard=True,
            use_jokers=True,
        )),
        ("EVERYTHING", GameOptions(
            initial_flips=2,
            flip_on_discard=True,
            knock_penalty=True,
            use_jokers=True,
            lucky_swing=True,
            super_kings=True,
            lucky_sevens=True,
            ten_penny=True,
            knock_bonus=True,
            underdog_bonus=True,
            tied_shame=True,
            blackjack=True,
            queens_wild=True,
            four_of_a_kind=True,
            eagle_eye=True,
        )),
    ]

    return [
        *baseline,
        *standard_options,
        *point_modifiers,
        *bonuses_and_penalties,
        *gameplay_twists,
        *combinations,
    ]
|
|
|
|
|
|
# =============================================================================
# Reporting
# =============================================================================
|
|
|
|
def print_results_table(results: list[RuleTestResult]):
    """Print a fixed-width summary table with one row per configuration.

    Each row shows the games played, score statistics, mean turn count, the
    share of negative scores, the number of errors and the mean-score
    difference from the baseline. A configuration that collected no scores
    gets an "ERROR" row instead. An empty ``results`` list prints only the
    table frame.
    """
    print("\n" + "=" * 100)
    print("HOUSE RULES TEST RESULTS")
    print("=" * 100)

    # Find baseline for comparison: the config named "BASELINE", else the
    # first result. The fallback is guarded because next() evaluates its
    # default eagerly, so a bare results[0] would raise on an empty list.
    baseline = next(
        (r for r in results if r.name == "BASELINE"),
        results[0] if results else None,
    )
    baseline_mean = baseline.mean_score if baseline is not None else 0.0

    print(f"\n{'Rule Config':<25} {'Games':>6} {'Mean':>7} {'Med':>6} {'Min':>5} {'Max':>5} {'Turns':>6} {'Neg%':>6} {'Err':>4} {'vs Base':>8}")
    print("-" * 100)

    for r in results:
        if not r.scores:
            print(f"{r.name:<25} {'ERROR':>6} - no scores collected")
            continue

        # r.scores is non-empty here, so the division is safe.
        neg_pct = r.negative_scores / len(r.scores) * 100
        diff = r.mean_score - baseline_mean
        diff_str = f"{diff:+.1f}" if r.name != "BASELINE" else "---"

        # Leave the error column blank when there is nothing to report.
        err_str = str(len(r.errors)) if r.errors else ""

        print(f"{r.name:<25} {r.games_played:>6} {r.mean_score:>7.1f} {r.median_score:>6.1f} "
              f"{r.min_score:>5} {r.max_score:>5} {r.mean_turns:>6.0f} {neg_pct:>5.1f}% {err_str:>4} {diff_str:>8}")

    print("-" * 100)
|
|
|
|
|
|
def print_anomalies(results: list[RuleTestResult]):
    """Print suspicious findings per configuration, or an all-clear message.

    A configuration is flagged for recorded errors, extreme minimum or
    maximum scores, unusually high or low mean turn counts, a mean score far
    from the baseline, or a high share of negative scores. Score and turn
    checks run only when the configuration actually collected that data, so
    a configuration whose games all failed is reported for its errors alone.
    """
    print("\n" + "=" * 100)
    print("ANOMALY DETECTION")
    print("=" * 100)

    # The config named "BASELINE", else the first result. The fallback is
    # guarded because next() evaluates its default eagerly.
    baseline = next(
        (r for r in results if r.name == "BASELINE"),
        results[0] if results else None,
    )
    # Without baseline scores its mean is a meaningless 0, so skip comparisons.
    baseline_usable = baseline is not None and bool(baseline.scores)
    issues_found = False

    for r in results:
        issues = []

        # Check for errors
        if r.errors:
            issues.append(f" ERRORS: {r.errors[:3]}")  # Show first 3

        # Check for extreme scores
        if r.scores:
            if r.min_score < -15:
                issues.append(f" Very low min score: {r.min_score} (possible scoring bug)")

            if r.max_score > 60:
                issues.append(f" Very high max score: {r.max_score} (possible stuck game)")

        # Check for unusual turn counts
        if r.turn_counts:
            if r.mean_turns > 150:
                issues.append(f" High turn count: {r.mean_turns:.0f} avg (games taking too long)")

            if r.mean_turns < 20:
                issues.append(f" Low turn count: {r.mean_turns:.0f} avg (games ending too fast)")

        if r.scores:
            # Check for dramatic score shifts from baseline
            if baseline_usable and r.name != "BASELINE":
                diff = r.mean_score - baseline.mean_score
                if abs(diff) > 10:
                    issues.append(f" Large score shift from baseline: {diff:+.1f} points")

            # Check for too many negative scores (unless expected)
            neg_pct = r.negative_scores / len(r.scores) * 100
            if neg_pct > 20 and "super_kings" not in r.name.lower() and "lucky" not in r.name.lower():
                issues.append(f" High negative score rate: {neg_pct:.1f}%")

        if issues:
            issues_found = True
            print(f"\n{r.name}:")
            for issue in issues:
                print(issue)

    if not issues_found:
        print("\nNo anomalies detected - all configurations behaving as expected.")
|
|
|
|
|
|
def _describe_shift(diff: float, much_lower_below: Optional[float] = None) -> str:
    """Label a mean-score difference from the baseline; +/-1 point counts as similar."""
    if much_lower_below is not None and diff < much_lower_below:
        return "much lower"
    if diff < -1:
        return "lower"
    if diff > 1:
        return "higher"
    return "similar"


def print_expected_effects(results: list[RuleTestResult]):
    """Print whether selected house rules moved the mean score as expected.

    Each checked configuration is compared with the "BASELINE" result and
    printed with its expected direction, the observed direction and a status
    mark: "✓" when the expectation holds, otherwise "✗" for rules whose
    effect should be clear or "?" for rules whose effect may be small.
    Configurations missing from ``results`` or without scores are skipped.
    Nothing is compared when the baseline is missing or has no scores.
    """
    print("\n" + "=" * 100)
    print("EXPECTED EFFECTS VERIFICATION")
    print("=" * 100)

    # First result per name wins, matching a linear search.
    by_name = {}
    for r in results:
        by_name.setdefault(r.name, r)

    baseline = by_name.get("BASELINE")
    if baseline is None:
        print("No baseline found!")
        return
    if not baseline.scores:
        # A baseline mean of 0 from no data would make every diff meaningless.
        print("Baseline collected no scores - cannot verify effects!")
        return

    # (config name, expected effect, predicate on diff that confirms it,
    #  mark printed when unconfirmed, "much lower" threshold or None)
    expectations = (
        # Kings worth -2 instead of 0
        ("super_kings", "LOWER scores", lambda d: d < 0, "✗", None),
        # 7s worth 0 instead of 7
        ("lucky_sevens", "LOWER scores", lambda d: d < 0, "✗", None),
        # 10s worth 1 instead of 10
        ("ten_penny", "LOWER scores", lambda d: d < 0, "✗", None),
        # Jokers are -2; might be a small effect
        ("use_jokers", "LOWER scores", lambda d: d < 0, "?", None),
        # -5 for going out
        ("knock_bonus", "LOWER scores", lambda d: d < 0, "?", None),
        # +5 penalty for ties
        ("tied_shame", "HIGHER scores", lambda d: d > 0, "?", None),
        # More information might slightly lower scores
        ("flip_on_discard", "SIMILAR or lower", lambda d: d <= 1, "?", None),
        # All point modifiers together should push scores well down
        ("CHAOS (all point mods)", "MUCH LOWER scores", lambda d: d < -3, "✗", -5),
    )

    checks = []
    for name, expected, confirmed, miss_mark, much_lower_below in expectations:
        r = by_name.get(name)
        if not (r and r.scores):
            continue
        diff = r.mean_score - baseline.mean_score
        actual = _describe_shift(diff, much_lower_below)
        status = "✓" if confirmed(diff) else miss_mark
        checks.append((r.name, expected, f"{actual} ({diff:+.1f})", status))

    print(f"\n{'Rule':<30} {'Expected':<20} {'Actual':<20} {'Status'}")
    print("-" * 80)
    for name, expected, actual, status in checks:
        print(f"{name:<30} {expected:<20} {actual:<20} {status}")
|
|
|
|
|
|
# =============================================================================
# Main
# =============================================================================
|
|
|
|
def main():
    """Run every configuration, then print all reports and a summary.

    The number of games per configuration comes from the first command-line
    argument and defaults to 30.
    """
    num_games = int(sys.argv[1]) if len(sys.argv) > 1 else 30

    print(f"Testing house rules with {num_games} games each...")
    print("This may take a few minutes...\n")

    configs = get_test_configs()
    config_total = len(configs)
    results = []

    for position, (label, opts) in enumerate(configs, start=1):
        print(f"[{position}/{config_total}] Testing: {label}...")
        outcome = test_rule_config(label, opts, num_games)
        results.append(outcome)

        # One-line progress status for the configuration just finished
        if not outcome.errors:
            print(f" Mean: {outcome.mean_score:.1f}, Turns: {outcome.mean_turns:.0f}")
        else:
            print(f" WARNING: {len(outcome.errors)} errors")

    # Reports, in the order they should appear
    for report in (print_results_table, print_expected_effects, print_anomalies):
        report(results)

    banner = "=" * 100
    print("\n" + banner)
    print("SUMMARY")
    print(banner)

    games_run = sum(r.games_played for r in results)
    error_total = sum(len(r.errors) for r in results)
    print(f"Total games run: {games_run}")
    print(f"Total errors: {error_total}")

    if not error_total:
        print("All house rule configurations working correctly!")
|
|
|
|
|
|
# Run the suite only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|