golfgame/server/test_house_rules.py
Aaron D. Lee d18cea2104 Initial commit: 6-Card Golf with AI opponents
Features:
- Multiplayer WebSocket game server (FastAPI)
- 8 AI personalities with distinct play styles
- 15+ house rule variants
- SQLite game logging for AI analysis
- Comprehensive test suite (80+ tests)

AI improvements:
- Fixed Maya bug (taking bad cards, discarding good ones)
- Personality traits influence style without overriding competence
- Zero blunders detected in 1000+ game simulations

Testing infrastructure:
- Game rules verification (test_game.py)
- AI decision analysis (game_analyzer.py)
- Score distribution analysis (score_analysis.py)
- House rules testing (test_house_rules.py)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 19:30:13 -05:00

572 lines
17 KiB
Python

"""
House Rules Testing Suite
Simulates games across a curated list of house rule configurations to:
1. Find edge cases and bugs
2. Establish baseline performance metrics
3. Verify rules affect gameplay as expected
"""
import random
import statistics
import sys
from collections import defaultdict
from dataclasses import dataclass
from typing import Optional

from game import Game, Player, GamePhase, GameOptions
from ai import GolfAI, CPUProfile, CPU_PROFILES, get_ai_card_value
@dataclass
class RuleTestResult:
    """Results from testing a house rule configuration.

    As populated by ``test_rule_config``, ``scores`` holds one entry per
    player for every game that finished without an error and
    ``turn_counts`` holds one entry per such game, while ``games_played``
    is the number of games attempted (failed ones included).

    Every summary property falls back to zero when no data was collected,
    so report code can format them without guarding against empty lists.
    The float-valued properties always return ``float``.
    """

    name: str
    options: GameOptions
    games_played: int
    scores: list[int]
    turn_counts: list[int]
    negative_scores: int  # Count of scores < 0
    zero_scores: int  # Count of exactly 0
    high_scores: int  # Count of scores > 25
    errors: list[str]

    @property
    def mean_score(self) -> float:
        """Arithmetic mean of all collected scores, or 0.0 if there are none."""
        return statistics.fmean(self.scores) if self.scores else 0.0

    @property
    def median_score(self) -> float:
        """Median of all collected scores, or 0.0 if there are none."""
        if not self.scores:
            return 0.0
        # statistics.median returns the middle element unchanged for an odd
        # count, which would be an int here; coerce to honour the annotation.
        return float(statistics.median(self.scores))

    @property
    def mean_turns(self) -> float:
        """Mean number of turns per recorded game, or 0.0 if there are none."""
        return statistics.fmean(self.turn_counts) if self.turn_counts else 0.0

    @property
    def min_score(self) -> int:
        """Lowest collected score, or 0 if there are none."""
        return min(self.scores, default=0)

    @property
    def max_score(self) -> int:
        """Highest collected score, or 0 if there are none."""
        return max(self.scores, default=0)
def _forced_swap_position(player: Player, game: Game) -> int:
    """Choose a swap slot when the AI declined to place a discard-pile card.

    Prefers a random face-down slot; if every card is face up, returns the
    index of the card with the highest ``get_ai_card_value`` (first one on
    ties, 0 for an empty hand).
    """
    face_down = [i for i, card in enumerate(player.cards) if not card.face_up]
    if face_down:
        return random.choice(face_down)
    return max(
        range(len(player.cards)),
        key=lambda i: get_ai_card_value(player.cards[i], game.options),
        default=0,
    )


def run_game_with_options(options: GameOptions, num_players: int = 4) -> tuple[list[int], int, Optional[str]]:
    """Run a single one-round, one-deck game between CPU players.

    Args:
        options: House rule configuration passed to ``Game.start_game``.
        num_players: Number of CPU players; capped at ``len(CPU_PROFILES)``
            because profiles are sampled without replacement.

    Returns:
        ``(scores, turn_count, error_message)``. On success ``scores`` has
        one ``total_score`` per player and ``error_message`` is ``None``.
        If the game is still in progress after the turn cap, ``scores`` is
        empty and ``error_message`` describes the overrun. If any exception
        is raised while playing, the result is ``([], 0, message)``; the
        exception is reported, not propagated.
    """
    profiles = random.sample(CPU_PROFILES, min(num_players, len(CPU_PROFILES)))
    game = Game()
    player_profiles: dict[str, CPUProfile] = {}
    for i, profile in enumerate(profiles):
        player = Player(id=f"cpu_{i}", name=profile.name)
        game.add_player(player)
        player_profiles[player.id] = profile

    try:
        game.start_game(num_decks=1, num_rounds=1, options=options)

        # Initial flips
        for player in game.players:
            positions = GolfAI.choose_initial_flips(options.initial_flips)
            game.flip_initial_cards(player.id, positions)

        # Play game
        active_phases = (GamePhase.PLAYING, GamePhase.FINAL_TURN)
        turn = 0
        max_turns = 300  # Higher limit for edge cases
        while game.phase in active_phases and turn < max_turns:
            current = game.current_player()
            if not current:
                break
            profile = player_profiles[current.id]

            # Draw
            discard_top = game.discard_top()
            take_discard = GolfAI.should_take_discard(discard_top, current, profile, game)
            source = "discard" if take_discard else "deck"
            drawn = game.draw_card(current.id, source)
            if not drawn:
                # Deck exhausted - this is an edge case.
                # NOTE(review): the unfinished game is still scored below and
                # reported without an error - confirm that is intended.
                break

            # Swap or discard
            swap_pos = GolfAI.choose_swap_or_discard(drawn, current, profile, game)
            if swap_pos is None and game.drawn_from_discard:
                # A card taken from the discard pile is always placed,
                # presumably because the rules forbid re-discarding it.
                swap_pos = _forced_swap_position(current, game)

            if swap_pos is not None:
                game.swap_card(current.id, swap_pos)
            else:
                game.discard_drawn(current.id)
                if game.flip_on_discard:
                    flip_pos = GolfAI.choose_flip_after_discard(current, profile)
                    game.flip_and_end_turn(current.id, flip_pos)
            turn += 1

        # Only report an overrun if the game is genuinely still running; a
        # game that finishes on exactly the last allowed turn is a success.
        if turn >= max_turns and game.phase in active_phases:
            return [], turn, f"Game exceeded {max_turns} turns - possible infinite loop"

        scores = [p.total_score for p in game.players]
        return scores, turn, None
    except Exception as e:
        # Include the exception type: str(e) alone can be empty or ambiguous
        # (e.g. a KeyError renders as just the quoted key).
        return [], 0, f"Exception: {type(e).__name__}: {e}"
def test_rule_config(name: str, options: GameOptions, num_games: int = 50) -> RuleTestResult:
    """Simulate ``num_games`` games under one configuration and summarise them.

    Games for which ``run_game_with_options`` reports an error contribute
    only their error message; their scores and turn counts are ignored.
    ``games_played`` in the result is the number of games attempted.

    NOTE(review): the ``test_`` prefix means pytest's default discovery
    rules will try to collect this helper as a test case - confirm whether
    that is intended.
    """
    collected_scores: list[int] = []
    turns_per_game: list[int] = []
    failures: list[str] = []

    for _ in range(num_games):
        game_scores, game_turns, failure = run_game_with_options(options)
        if failure:
            failures.append(failure)
        else:
            collected_scores.extend(game_scores)
            turns_per_game.append(game_turns)

    # The three buckets are mutually exclusive: below zero, exactly zero,
    # and above 25. Scores from 1 to 25 are not counted in any bucket.
    return RuleTestResult(
        name=name,
        options=options,
        games_played=num_games,
        scores=collected_scores,
        turn_counts=turns_per_game,
        negative_scores=sum(1 for s in collected_scores if s < 0),
        zero_scores=sum(1 for s in collected_scores if s == 0),
        high_scores=sum(1 for s in collected_scores if s > 25),
        errors=failures,
    )
# =============================================================================
# House Rule Configurations to Test
# =============================================================================
def get_test_configs() -> list[tuple[str, GameOptions]]:
    """Return the ``(label, options)`` pairs to simulate, in report order.

    The label "BASELINE" is the reference configuration that the reporting
    functions look up by name.
    """
    return [
        # Baseline (no house rules)
        ("BASELINE", GameOptions(initial_flips=2, flip_on_discard=False, use_jokers=False)),

        # === Standard Options ===
        ("flip_on_discard", GameOptions(initial_flips=2, flip_on_discard=True)),
        ("initial_flips=0", GameOptions(initial_flips=0, flip_on_discard=False)),
        ("initial_flips=1", GameOptions(initial_flips=1, flip_on_discard=False)),
        ("knock_penalty", GameOptions(initial_flips=2, knock_penalty=True)),
        ("use_jokers", GameOptions(initial_flips=2, use_jokers=True)),

        # === Point Modifiers ===
        ("lucky_swing", GameOptions(initial_flips=2, use_jokers=True, lucky_swing=True)),
        ("super_kings", GameOptions(initial_flips=2, super_kings=True)),
        ("lucky_sevens", GameOptions(initial_flips=2, lucky_sevens=True)),
        ("ten_penny", GameOptions(initial_flips=2, ten_penny=True)),

        # === Bonuses/Penalties ===
        ("knock_bonus", GameOptions(initial_flips=2, knock_bonus=True)),
        ("underdog_bonus", GameOptions(initial_flips=2, underdog_bonus=True)),
        ("tied_shame", GameOptions(initial_flips=2, tied_shame=True)),
        ("blackjack", GameOptions(initial_flips=2, blackjack=True)),

        # === Gameplay Twists ===
        ("queens_wild", GameOptions(initial_flips=2, queens_wild=True)),
        ("four_of_a_kind", GameOptions(initial_flips=2, four_of_a_kind=True)),
        ("eagle_eye", GameOptions(initial_flips=2, use_jokers=True, eagle_eye=True)),

        # === Interesting Combinations ===
        (
            "CHAOS (all point mods)",
            GameOptions(
                initial_flips=2,
                use_jokers=True,
                lucky_swing=True,
                super_kings=True,
                lucky_sevens=True,
                ten_penny=True,
            ),
        ),
        (
            "COMPETITIVE (penalties)",
            GameOptions(initial_flips=2, knock_penalty=True, tied_shame=True),
        ),
        (
            "GENEROUS (bonuses)",
            GameOptions(initial_flips=2, knock_bonus=True, underdog_bonus=True),
        ),
        (
            "WILD CARDS",
            GameOptions(
                initial_flips=2,
                use_jokers=True,
                queens_wild=True,
                four_of_a_kind=True,
                eagle_eye=True,
            ),
        ),
        (
            "CLASSIC+ (jokers + flip)",
            GameOptions(initial_flips=2, flip_on_discard=True, use_jokers=True),
        ),
        (
            "EVERYTHING",
            GameOptions(
                initial_flips=2,
                flip_on_discard=True,
                knock_penalty=True,
                use_jokers=True,
                lucky_swing=True,
                super_kings=True,
                lucky_sevens=True,
                ten_penny=True,
                knock_bonus=True,
                underdog_bonus=True,
                tied_shame=True,
                blackjack=True,
                queens_wild=True,
                four_of_a_kind=True,
                eagle_eye=True,
            ),
        ),
    ]
# =============================================================================
# Reporting
# =============================================================================
def print_results_table(results: list[RuleTestResult]):
    """Print a summary table of all results.

    One row is printed per configuration. The "vs Base" column is the
    difference between the row's mean score and the mean score of the
    reference row: the result named "BASELINE" if present, otherwise the
    first result. The reference row shows "---", and every row shows "n/a"
    when the reference collected no scores. Configurations with no scores
    get an "ERROR" row. An empty ``results`` list prints the banner and a
    short notice.
    """
    print("\n" + "=" * 100)
    print("HOUSE RULES TEST RESULTS")
    print("=" * 100)

    if not results:
        # Nothing to tabulate, and results[0] below would raise IndexError.
        print("\nNo results to report.")
        return

    # Find baseline for comparison
    baseline = next((r for r in results if r.name == "BASELINE"), results[0])
    baseline_mean = baseline.mean_score
    baseline_has_data = bool(baseline.scores)

    print(f"\n{'Rule Config':<25} {'Games':>6} {'Mean':>7} {'Med':>6} {'Min':>5} {'Max':>5} {'Turns':>6} {'Neg%':>6} {'Err':>4} {'vs Base':>8}")
    print("-" * 100)

    for r in results:
        if not r.scores:
            print(f"{r.name:<25} {'ERROR':>6} - no scores collected")
            continue

        neg_pct = r.negative_scores / len(r.scores) * 100
        if r is baseline:
            diff_str = "---"
        elif not baseline_has_data:
            # A baseline without scores reports a mean of 0, so a numeric
            # difference would be meaningless.
            diff_str = "n/a"
        else:
            diff_str = f"{r.mean_score - baseline_mean:+.1f}"
        err_str = str(len(r.errors)) if r.errors else ""

        print(f"{r.name:<25} {r.games_played:>6} {r.mean_score:>7.1f} {r.median_score:>6.1f} "
              f"{r.min_score:>5} {r.max_score:>5} {r.mean_turns:>6.0f} {neg_pct:>5.1f}% {err_str:>4} {diff_str:>8}")

    print("-" * 100)
def print_anomalies(results: list[RuleTestResult]):
    """Identify and print any anomalies or edge cases.

    Flags, per configuration: recorded errors, a missing data set, extreme
    minimum or maximum scores, unusually high or low average turn counts, a
    mean score more than 10 points away from the reference, and a negative
    score rate above 20% (unless the name contains "super_kings" or
    "lucky"). The reference is the result named "BASELINE" if present,
    otherwise the first result.

    Statistical checks are skipped for configurations with no data, because
    the summary properties report 0 in that case and would trigger false
    alarms.
    """
    print("\n" + "=" * 100)
    print("ANOMALY DETECTION")
    print("=" * 100)

    if not results:
        # results[0] below would raise IndexError on an empty list.
        print("\nNo results to analyze.")
        return

    baseline = next((r for r in results if r.name == "BASELINE"), results[0])
    issues_found = False

    for r in results:
        issues = []

        # Check for errors
        if r.errors:
            issues.append(f" ERRORS: {r.errors[:3]}")  # Show first 3

        if not r.scores:
            issues.append(" No scores collected (every game failed or none were run)")

        # Check for extreme scores
        if r.scores:
            if r.min_score < -15:
                issues.append(f" Very low min score: {r.min_score} (possible scoring bug)")
            if r.max_score > 60:
                issues.append(f" Very high max score: {r.max_score} (possible stuck game)")

        # Check for unusual turn counts
        if r.turn_counts:
            if r.mean_turns > 150:
                issues.append(f" High turn count: {r.mean_turns:.0f} avg (games taking too long)")
            if r.mean_turns < 20:
                issues.append(f" Low turn count: {r.mean_turns:.0f} avg (games ending too fast)")

        if r.scores:
            # Check for dramatic score shifts from baseline; only meaningful
            # when the baseline itself has data.
            if r.name != "BASELINE" and baseline.scores:
                diff = r.mean_score - baseline.mean_score
                if abs(diff) > 10:
                    issues.append(f" Large score shift from baseline: {diff:+.1f} points")

            # Check for too many negative scores (unless expected)
            neg_pct = r.negative_scores / len(r.scores) * 100
            lowered = r.name.lower()
            if neg_pct > 20 and "super_kings" not in lowered and "lucky" not in lowered:
                issues.append(f" High negative score rate: {neg_pct:.1f}%")

        if issues:
            issues_found = True
            print(f"\n{r.name}:")
            for issue in issues:
                print(issue)

    if not issues_found:
        print("\nNo anomalies detected - all configurations behaving as expected.")
def print_expected_effects(results: list[RuleTestResult]):
    """Verify house rules have expected effects.

    Compares the mean score of selected configurations with the mean score
    of the result named "BASELINE" and prints one row per configuration
    that is present and has scores: the expected direction, the observed
    direction with the signed difference, and a status marker ("PASS" when
    the expectation holds; otherwise "FAIL" for firm expectations or "?"
    for effects that may be too small to observe).

    Prints a notice and returns early when there is no baseline result or
    the baseline collected no scores.
    """
    print("\n" + "=" * 100)
    print("EXPECTED EFFECTS VERIFICATION")
    print("=" * 100)

    # First result wins for duplicate names.
    by_name = {}
    for r in results:
        by_name.setdefault(r.name, r)

    baseline = by_name.get("BASELINE")
    if baseline is None:
        print("No baseline found!")
        return
    if not baseline.scores:
        # With no baseline scores the baseline mean is reported as 0, so
        # every comparison would be meaningless.
        print("Baseline has no scores - cannot verify effects.")
        return

    def direction(diff):
        return "lower" if diff < -1 else "higher" if diff > 1 else "similar"

    def chaos_direction(diff):
        return "much lower" if diff < -5 else "lower" if diff < -1 else "similar"

    # (config name, expected text, expectation predicate on the mean-score
    #  difference, marker when the expectation is not met, label function)
    expectations = [
        # super_kings should lower scores (Kings worth -2 instead of 0)
        ("super_kings", "LOWER scores", lambda d: d < 0, "FAIL", direction),
        # lucky_sevens should lower scores (7s worth 0 instead of 7)
        ("lucky_sevens", "LOWER scores", lambda d: d < 0, "FAIL", direction),
        # ten_penny should lower scores (10s worth 1 instead of 10)
        ("ten_penny", "LOWER scores", lambda d: d < 0, "FAIL", direction),
        # use_jokers should lower scores (jokers are -2); might be small effect
        ("use_jokers", "LOWER scores", lambda d: d < 0, "?", direction),
        # knock_bonus should lower scores (-5 for going out)
        ("knock_bonus", "LOWER scores", lambda d: d < 0, "?", direction),
        # tied_shame should raise scores (+5 penalty for ties)
        ("tied_shame", "HIGHER scores", lambda d: d > 0, "?", direction),
        # flip_on_discard might slightly lower scores (more info)
        ("flip_on_discard", "SIMILAR or lower", lambda d: d <= 1, "?", direction),
        # CHAOS mode should have very low scores
        ("CHAOS (all point mods)", "MUCH LOWER scores", lambda d: d < -3, "FAIL", chaos_direction),
    ]

    checks = []
    for config_name, expected, is_met, unmet_marker, label in expectations:
        r = by_name.get(config_name)
        if r is None or not r.scores:
            continue
        diff = r.mean_score - baseline.mean_score
        status = "PASS" if is_met(diff) else unmet_marker
        checks.append((r.name, expected, f"{label(diff)} ({diff:+.1f})", status))

    print(f"\n{'Rule':<30} {'Expected':<20} {'Actual':<20} {'Status'}")
    print("-" * 80)
    for name, expected, actual, status in checks:
        print(f"{name:<30} {expected:<20} {actual:<20} {status}")
# =============================================================================
# Main
# =============================================================================
def main():
    """Run every configuration and print the full set of reports.

    The number of games per configuration is taken from the first
    command-line argument when one is given, otherwise 30.
    """
    cli_args = sys.argv[1:]
    num_games = int(cli_args[0]) if cli_args else 30
    banner = "=" * 100

    print(f"Testing house rules with {num_games} games each...")
    print("This may take a few minutes...\n")

    configs = get_test_configs()
    total_configs = len(configs)
    results = []
    for position, (name, options) in enumerate(configs, start=1):
        print(f"[{position}/{total_configs}] Testing: {name}...")
        outcome = test_rule_config(name, options, num_games)
        results.append(outcome)

        # Quick status
        if not outcome.errors:
            print(f" Mean: {outcome.mean_score:.1f}, Turns: {outcome.mean_turns:.0f}")
        else:
            print(f" WARNING: {len(outcome.errors)} errors")

    # Reports
    print_results_table(results)
    print_expected_effects(results)
    print_anomalies(results)

    print("\n" + banner)
    print("SUMMARY")
    print(banner)

    games_total = 0
    errors_total = 0
    for outcome in results:
        games_total += outcome.games_played
        errors_total += len(outcome.errors)

    print(f"Total games run: {games_total}")
    print(f"Total errors: {errors_total}")
    if errors_total == 0:
        print("All house rule configurations working correctly!")
# Run the full suite only when executed as a script, not when imported.
if __name__ == "__main__":
    main()