Fix V2 race conditions, auth gaps, serialization bugs, and async stats

Phase 1 - Critical Fixes: - Add game_lock (asyncio.Lock) to Room class for serializing mutations - Wrap all game action handlers in lock to prevent race conditions - Split Card.to_dict into to_dict (full data) and to_client_dict (hidden) - Fix CardState.from_dict to handle missing rank/suit gracefully - Fix GameOptions reconstruction in recovery_service (dict -> object) - Extend state cache TTL from 4h to 24h, add touch_game method Phase 2 - Security: - Add optional WebSocket authentication via token query param - Use authenticated user ID/name when available - Add auth support to spectator WebSocket endpoint Phase 3 - Performance: - Make stats processing async (fire-and-forget) to avoid blocking game completion notifications Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-27 16:27:30 -05:00
parent 1dbfb3f14b
commit f27020f21b
8 changed files with 446 additions and 231 deletions
--- a/server/game.py
+++ b/server/game.py
@@ -140,18 +140,37 @@ class Card:
        """
        Convert card to dictionary for JSON serialization.

+        Always includes full card data (rank/suit/face_up) for server-side
+        caching and event sourcing. Use to_client_dict() for client views
+        that should hide face-down cards.
+
        Args:
-            reveal: If True, show card details even if face-down.
+            reveal: Ignored for backwards compatibility. Use to_client_dict() instead.

        Returns:
-            Dict with card info, or just {face_up: False} if hidden.
+            Dict with full card info (suit, rank, face_up).
        """
-        if self.face_up or reveal:
        return {
            "suit": self.suit.value,
            "rank": self.rank.value,
            "face_up": self.face_up,
        }
+
+    def to_client_dict(self) -> dict:
+        """
+        Convert card to dictionary for client display.
+
+        Hides card details if face-down to prevent cheating.
+
+        Returns:
+            Dict with card info, or just {face_up: False} if hidden.
+        """
+        if self.face_up:
+            return {
+                "suit": self.suit.value,
+                "rank": self.rank.value,
+                "face_up": True,
+            }
        return {"face_up": False}

    def value(self) -> int:
@@ -390,7 +409,9 @@ class Player:
        Returns:
            List of card dictionaries.
        """
-        return [card.to_dict(reveal) for card in self.cards]
+        if reveal:
+            return [card.to_dict() for card in self.cards]
+        return [card.to_client_dict() for card in self.cards]


 class GamePhase(Enum):
--- a/server/main.py
+++ b/server/main.py
@@ -21,6 +21,26 @@ from game_log import get_logger
 # Import production components
 from logging_config import setup_logging

+# Initialize Sentry if configured
+if config.SENTRY_DSN:
+    try:
+        import sentry_sdk
+        from sentry_sdk.integrations.fastapi import FastApiIntegration
+        from sentry_sdk.integrations.starlette import StarletteIntegration
+
+        sentry_sdk.init(
+            dsn=config.SENTRY_DSN,
+            environment=config.ENVIRONMENT,
+            traces_sample_rate=0.1 if config.ENVIRONMENT == "production" else 1.0,
+            integrations=[
+                StarletteIntegration(transaction_style="endpoint"),
+                FastApiIntegration(transaction_style="endpoint"),
+            ],
+        )
+        logging.getLogger(__name__).info("Sentry error tracking initialized")
+    except ImportError:
+        logging.getLogger(__name__).warning("sentry-sdk not installed, error tracking disabled")
+
 # Configure logging based on environment
 setup_logging(
    level=config.LOG_LEVEL,
@@ -243,8 +263,75 @@ app.add_middleware(
    environment=config.ENVIRONMENT,
 )

-# Note: Rate limiting middleware is added after app startup when Redis is available
-# See _add_rate_limit_middleware() called from a startup event if needed
+# Rate limiting middleware (uses global _rate_limiter set in lifespan)
+# We create a wrapper that safely handles the case when rate limiter isn't ready
+from starlette.middleware.base import BaseHTTPMiddleware
+from starlette.responses import JSONResponse
+
+
+class LazyRateLimitMiddleware(BaseHTTPMiddleware):
+    """Rate limiting middleware that uses the global _rate_limiter when available."""
+
+    async def dispatch(self, request, call_next):
+        global _rate_limiter
+
+        # Skip if rate limiter not initialized or disabled
+        if not _rate_limiter or not config.RATE_LIMIT_ENABLED:
+            return await call_next(request)
+
+        # Import here to avoid circular imports
+        from services.ratelimit import RATE_LIMITS
+
+        path = request.url.path
+
+        # Skip health checks and static files
+        if path in ("/health", "/ready", "/metrics"):
+            return await call_next(request)
+        if path.endswith((".js", ".css", ".html", ".ico", ".png", ".jpg", ".svg")):
+            return await call_next(request)
+
+        # Determine rate limit tier
+        if path.startswith("/api/auth"):
+            limit, window = RATE_LIMITS["api_auth"]
+        elif path == "/api/rooms" and request.method == "POST":
+            limit, window = RATE_LIMITS["api_create_room"]
+        elif "email" in path or "verify" in path:
+            limit, window = RATE_LIMITS["email_send"]
+        elif path.startswith("/api"):
+            limit, window = RATE_LIMITS["api_general"]
+        else:
+            return await call_next(request)
+
+        # Get client key and check rate limit
+        client_key = _rate_limiter.get_client_key(request)
+        full_key = f"{path}:{client_key}"
+
+        allowed, info = await _rate_limiter.is_allowed(full_key, limit, window)
+
+        if allowed:
+            response = await call_next(request)
+        else:
+            response = JSONResponse(
+                status_code=429,
+                content={
+                    "error": "Rate limit exceeded",
+                    "message": f"Too many requests. Please wait {info['reset']} seconds.",
+                    "retry_after": info["reset"],
+                },
+            )
+
+        # Add rate limit headers
+        response.headers["X-RateLimit-Limit"] = str(info["limit"])
+        response.headers["X-RateLimit-Remaining"] = str(info["remaining"])
+        response.headers["X-RateLimit-Reset"] = str(info["reset"])
+
+        if not allowed:
+            response.headers["Retry-After"] = str(info["reset"])
+
+        return response
+
+
+app.add_middleware(LazyRateLimitMiddleware)

 room_manager = RoomManager()

@@ -309,7 +396,23 @@ async def require_admin(user: User = Depends(require_user)) -> User:
 async def websocket_endpoint(websocket: WebSocket):
    await websocket.accept()

+    # Extract token from query param for optional authentication
+    token = websocket.query_params.get("token")
+    authenticated_user = None
+    if token and _auth_service:
+        try:
+            authenticated_user = await _auth_service.get_user_from_token(token)
+        except Exception as e:
+            logger.debug(f"WebSocket auth failed: {e}")
+
+    # Use authenticated user ID if available, otherwise generate random UUID
+    if authenticated_user:
+        player_id = str(authenticated_user.id)
+        logger.debug(f"WebSocket authenticated as user {player_id}")
+    else:
        player_id = str(uuid.uuid4())
+        logger.debug(f"WebSocket connected anonymously as {player_id}")
+
    current_room: Room | None = None

    try:
@@ -319,6 +422,9 @@ async def websocket_endpoint(websocket: WebSocket):

            if msg_type == "create_room":
                player_name = data.get("player_name", "Player")
+                # Use authenticated user's name if available
+                if authenticated_user and authenticated_user.display_name:
+                    player_name = authenticated_user.display_name
                room = room_manager.create_room()
                room.add_player(player_id, player_name, websocket)
                current_room = room
@@ -327,6 +433,7 @@ async def websocket_endpoint(websocket: WebSocket):
                    "type": "room_created",
                    "room_code": room.code,
                    "player_id": player_id,
+                    "authenticated": authenticated_user is not None,
                })

                await room.broadcast({
@@ -360,6 +467,9 @@ async def websocket_endpoint(websocket: WebSocket):
                    })
                    continue

+                # Use authenticated user's name if available
+                if authenticated_user and authenticated_user.display_name:
+                    player_name = authenticated_user.display_name
                room.add_player(player_id, player_name, websocket)
                current_room = room

@@ -367,6 +477,7 @@ async def websocket_endpoint(websocket: WebSocket):
                    "type": "room_joined",
                    "room_code": room.code,
                    "player_id": player_id,
+                    "authenticated": authenticated_user is not None,
                })

                await room.broadcast({
@@ -487,11 +598,12 @@ async def websocket_endpoint(websocket: WebSocket):
                num_decks = max(1, min(3, num_decks))
                num_rounds = max(1, min(18, num_rounds))

+                async with current_room.game_lock:
                    current_room.game.start_game(num_decks, num_rounds, options)

                    # Log game start for AI analysis
-                logger = get_logger()
-                current_room.game_log_id = logger.log_game_start(
+                    game_logger = get_logger()
+                    current_room.game_log_id = game_logger.log_game_start(
                        room_code=current_room.code,
                        num_players=len(current_room.players),
                        options=options,
@@ -520,6 +632,7 @@ async def websocket_endpoint(websocket: WebSocket):
                    continue

                positions = data.get("positions", [])
+                async with current_room.game_lock:
                    if current_room.game.flip_initial_cards(player_id, positions):
                        await broadcast_game_state(current_room)

@@ -531,6 +644,7 @@ async def websocket_endpoint(websocket: WebSocket):
                    continue

                source = data.get("source", "deck")
+                async with current_room.game_lock:
                    # Capture discard top before draw (for logging decision context)
                    discard_before_draw = current_room.game.discard_top()
                    card = current_room.game.draw_card(player_id, source)
@@ -555,7 +669,7 @@ async def websocket_endpoint(websocket: WebSocket):
                        # Send drawn card only to the player who drew
                        await websocket.send_json({
                            "type": "card_drawn",
-                        "card": card.to_dict(reveal=True),
+                            "card": card.to_dict(),
                            "source": source,
                        })

@@ -566,6 +680,7 @@ async def websocket_endpoint(websocket: WebSocket):
                    continue

                position = data.get("position", 0)
+                async with current_room.game_lock:
                    # Capture drawn card before swap for logging
                    drawn_card = current_room.game.drawn_card
                    player = current_room.game.get_player(player_id)
@@ -596,6 +711,7 @@ async def websocket_endpoint(websocket: WebSocket):
                if not current_room:
                    continue

+                async with current_room.game_lock:
                    # Capture drawn card before discard for logging
                    drawn_card = current_room.game.drawn_card
                    player = current_room.game.get_player(player_id)
@@ -637,6 +753,7 @@ async def websocket_endpoint(websocket: WebSocket):
                    continue

                position = data.get("position", 0)
+                async with current_room.game_lock:
                    player = current_room.game.get_player(player_id)
                    current_room.game.flip_and_end_turn(player_id, position)

@@ -662,6 +779,7 @@ async def websocket_endpoint(websocket: WebSocket):
                if not current_room:
                    continue

+                async with current_room.game_lock:
                    player = current_room.game.get_player(player_id)
                    if current_room.game.skip_flip_and_end_turn(player_id):
                        # Log skip flip decision for human player
@@ -685,6 +803,7 @@ async def websocket_endpoint(websocket: WebSocket):
                    continue

                position = data.get("position", 0)
+                async with current_room.game_lock:
                    player = current_room.game.get_player(player_id)
                    if current_room.game.flip_card_as_action(player_id, position):
                        # Log flip-as-action for human player
@@ -709,6 +828,7 @@ async def websocket_endpoint(websocket: WebSocket):
                if not current_room:
                    continue

+                async with current_room.game_lock:
                    player = current_room.game.get_player(player_id)
                    if current_room.game.knock_early(player_id):
                        # Log knock early for human player
@@ -736,6 +856,7 @@ async def websocket_endpoint(websocket: WebSocket):
                if not room_player or not room_player.is_host:
                    continue

+                async with current_room.game_lock:
                    if current_room.game.start_next_round():
                        # CPU players do their initial flips
                        for cpu in current_room.get_cpu_players():
@@ -796,6 +917,38 @@ async def websocket_endpoint(websocket: WebSocket):
            await handle_player_leave(current_room, player_id)


+async def _process_stats_safe(room: Room):
+    """
+    Process game stats in a fire-and-forget manner.
+
+    This is called via asyncio.create_task to avoid blocking game completion
+    notifications while stats are being processed.
+    """
+    try:
+        # Build mapping - for non-CPU players, the player_id is their user_id
+        # (assigned during authentication or as a session UUID)
+        player_user_ids = {}
+        for player_id, room_player in room.players.items():
+            if not room_player.is_cpu:
+                player_user_ids[player_id] = player_id
+
+        # Find winner
+        winner_id = None
+        if room.game.players:
+            winner = min(room.game.players, key=lambda p: p.total_score)
+            winner_id = winner.id
+
+        await _stats_service.process_game_from_state(
+            players=room.game.players,
+            winner_id=winner_id,
+            num_rounds=room.game.num_rounds,
+            player_user_ids=player_user_ids,
+        )
+        logger.debug(f"Stats processed for room {room.code}")
+    except Exception as e:
+        logger.error(f"Failed to process game stats: {e}")
+
+
 async def broadcast_game_state(room: Room):
    """Broadcast game state to all human players in a room."""
    # Notify spectators if spectator manager is available
@@ -842,30 +995,9 @@ async def broadcast_game_state(room: Room):
                game_logger.log_game_end(room.game_log_id)
                room.game_log_id = None  # Clear to avoid duplicate logging

-            # Process stats for authenticated players
+            # Process stats asynchronously (fire-and-forget) to avoid delaying game over notifications
            if _stats_service and room.game.players:
-                try:
-                    # Build mapping - for non-CPU players, the player_id is their user_id
-                    # (assigned during authentication or as a session UUID)
-                    player_user_ids = {}
-                    for player_id, room_player in room.players.items():
-                        if not room_player.is_cpu:
-                            player_user_ids[player_id] = player_id
-
-                    # Find winner
-                    winner_id = None
-                    if room.game.players:
-                        winner = min(room.game.players, key=lambda p: p.total_score)
-                        winner_id = winner.id
-
-                    await _stats_service.process_game_from_state(
-                        players=room.game.players,
-                        winner_id=winner_id,
-                        num_rounds=room.game.num_rounds,
-                        player_user_ids=player_user_ids,
-                    )
-                except Exception as e:
-                    logger.error(f"Failed to process game stats: {e}")
+                asyncio.create_task(_process_stats_safe(room))

            scores = [
                {"name": p.name, "total": p.total_score, "rounds_won": p.rounds_won}
--- a/server/models/game_state.py
+++ b/server/models/game_state.py
@@ -52,12 +52,38 @@ class CardState:

    @classmethod
    def from_dict(cls, d: dict) -> "CardState":
-        """Create from dictionary."""
-        return cls(
-            rank=d["rank"],
-            suit=d["suit"],
-            face_up=d.get("face_up", False),
-        )
+        """
+        Create from dictionary.
+
+        Handles both full card data and minimal face-down data gracefully.
+
+        Args:
+            d: Dictionary with card data. May contain:
+               - Full data: {rank, suit, face_up}
+               - Minimal face-down: {face_up: False}
+
+        Returns:
+            CardState instance.
+
+        Raises:
+            ValueError: If face_up is True but rank/suit are missing.
+        """
+        face_up = d.get("face_up", False)
+        rank = d.get("rank")
+        suit = d.get("suit")
+
+        # If card is face-up, we must have rank and suit
+        if face_up and (rank is None or suit is None):
+            raise ValueError("Face-up card must have rank and suit")
+
+        # For face-down cards with missing data, use placeholder values
+        # This handles improperly serialized data from older versions
+        if rank is None:
+            rank = "?"  # Placeholder for unknown
+        if suit is None:
+            suit = "?"  # Placeholder for unknown
+
+        return cls(rank=rank, suit=suit, face_up=face_up)


@dataclass
--- a/server/room.py
+++ b/server/room.py
@@ -11,6 +11,7 @@ A Room contains:
    - Settings for number of decks, rounds, etc.
 """

+import asyncio
 import random
 import string
 from dataclasses import dataclass, field
@@ -57,6 +58,7 @@ class Room:
        game: The Game instance containing actual game state.
        settings: Room settings (decks, rounds, etc.).
        game_log_id: SQLite log ID for analytics (if logging enabled).
+        game_lock: asyncio.Lock for serializing game mutations to prevent race conditions.
    """

    code: str
@@ -64,6 +66,7 @@ class Room:
    game: Game = field(default_factory=Game)
    settings: dict = field(default_factory=lambda: {"decks": 1, "rounds": 1})
    game_log_id: Optional[str] = None
+    game_lock: asyncio.Lock = field(default_factory=asyncio.Lock)

    def add_player(
        self,
--- a/server/routers/replay.py
+++ b/server/routers/replay.py
@@ -420,9 +420,19 @@ async def spectate_game(websocket: WebSocket, room_code: str):
    WebSocket endpoint for spectating live games.

    Spectators receive real-time game state updates but cannot interact.
+    Supports optional authentication via token query parameter.
    """
    await websocket.accept()

+    # Optional authentication for spectators
+    token = websocket.query_params.get("token")
+    spectator_user = None
+    if token and _auth_service:
+        try:
+            spectator_user = await _auth_service.get_user_from_token(token)
+        except Exception:
+            pass  # Anonymous spectator
+
    if not _spectator_manager or not _room_manager:
        await websocket.close(code=4003, reason="Spectator service unavailable")
        return
@@ -449,6 +459,7 @@ async def spectate_game(websocket: WebSocket, room_code: str):
            "game_state": game_state,
            "spectator_count": _spectator_manager.get_spectator_count(game_id),
            "players": room.player_list(),
+            "authenticated": spectator_user is not None,
        })

        # Keep connection alive
--- a/server/services/recovery_service.py
+++ b/server/services/recovery_service.py
@@ -309,6 +309,7 @@ class RecoveryService:
            Reconstructed game state.
        """
        from models.game_state import GamePhase, PlayerState
+        from game import GameOptions

        state = RebuiltGameState(game_id=d["game_id"])
        state.room_code = d.get("room_code", "")
@@ -318,7 +319,12 @@ class RecoveryService:
        state.current_player_idx = d.get("current_player_idx", 0)
        state.player_order = d.get("player_order", [])
        state.deck_remaining = d.get("deck_remaining", 0)
-        state.options = d.get("options", {})
+        # Reconstruct GameOptions as proper object for attribute access
+        options_dict = d.get("options", {})
+        if isinstance(options_dict, dict):
+            state.options = GameOptions(**options_dict)
+        else:
+            state.options = options_dict
        state.sequence_num = d.get("sequence_num", 0)
        state.finisher_id = d.get("finisher_id")
        state.host_id = d.get("host_id")
--- a/server/stores/state_cache.py
+++ b/server/stores/state_cache.py
@@ -39,9 +39,9 @@ class StateCache:
    ACTIVE_ROOMS_KEY = "golf:rooms:active"
    PLAYER_ROOM_KEY = "golf:player:{player_id}:room"

-    # TTLs
-    ROOM_TTL = timedelta(hours=4)  # Inactive rooms expire
-    GAME_TTL = timedelta(hours=4)
+    # TTLs - extended to 24 hours to prevent active games from expiring
+    ROOM_TTL = timedelta(hours=24)  # Inactive rooms expire
+    GAME_TTL = timedelta(hours=24)

    def __init__(self, redis_client: redis.Redis):
        """
@@ -360,6 +360,18 @@ class StateCache:

        await pipe.execute()

+    async def touch_game(self, game_id: str) -> None:
+        """
+        Refresh game TTL on any activity.
+
+        Call this on game actions to prevent active games from expiring.
+
+        Args:
+            game_id: Game UUID to refresh.
+        """
+        key = self.GAME_KEY.format(game_id=game_id)
+        await self.redis.expire(key, int(self.GAME_TTL.total_seconds()))
+

 # Global state cache instance (initialized on first use)
 _state_cache: Optional[StateCache] = None
--- a/server/test_game.py
+++ b/server/test_game.py
@@ -251,9 +251,13 @@ class TestDrawDiscardMechanics:
        player = self.game.get_player("p1")
        # Card is face down
        assert player.cards[0].face_up is False
-        # to_dict doesn't reveal it
-        card_dict = player.cards[0].to_dict(reveal=False)
+        # to_client_dict hides face-down card details from clients
+        card_dict = player.cards[0].to_client_dict()
        assert "rank" not in card_dict
+        # But to_dict always includes full data for server-side caching
+        full_dict = player.cards[0].to_dict()
+        assert "rank" in full_dict
+        assert "suit" in full_dict


 # =============================================================================