Huge v2 uplift, now deployable with real user management and tooling!
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
999
docs/v2/V2_07_PRODUCTION.md
Normal file
999
docs/v2/V2_07_PRODUCTION.md
Normal file
@@ -0,0 +1,999 @@
|
||||
# V2_07: Production Deployment & Operations
|
||||
|
||||
> **Scope**: Docker, deployment, health checks, monitoring, security, rate limiting
|
||||
> **Dependencies**: All other V2 documents
|
||||
> **Complexity**: High (DevOps/Infrastructure)
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Production readiness requires:
|
||||
- **Containerization**: Docker images for consistent deployment
|
||||
- **Health Checks**: Liveness and readiness probes
|
||||
- **Monitoring**: Metrics, logging, error tracking
|
||||
- **Security**: HTTPS, headers, secrets management
|
||||
- **Rate Limiting**: API protection from abuse (Phase 1 priority)
|
||||
- **Graceful Operations**: Zero-downtime deploys, proper shutdown
|
||||
|
||||
---
|
||||
|
||||
## 1. Docker Configuration
|
||||
|
||||
### Application Dockerfile
|
||||
|
||||
```dockerfile
|
||||
# Dockerfile
|
||||
FROM python:3.11-slim as base
|
||||
|
||||
# Set environment
|
||||
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||
PYTHONUNBUFFERED=1 \
|
||||
PIP_NO_CACHE_DIR=1 \
|
||||
PIP_DISABLE_PIP_VERSION_CHECK=1
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Install system dependencies
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Python dependencies
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Copy application code
|
||||
COPY server/ ./server/
|
||||
COPY client/ ./client/
|
||||
|
||||
# Create non-root user
|
||||
RUN useradd --create-home --shell /bin/bash appuser \
|
||||
&& chown -R appuser:appuser /app
|
||||
USER appuser
|
||||
|
||||
# Health check
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||
CMD curl -f http://localhost:8000/health || exit 1
|
||||
|
||||
EXPOSE 8000
|
||||
|
||||
CMD ["python", "-m", "uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
```
|
||||
|
||||
### Production Docker Compose
|
||||
|
||||
```yaml
|
||||
# docker-compose.prod.yml
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
app:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
environment:
|
||||
- DATABASE_URL=postgresql://golf:${DB_PASSWORD}@postgres:5432/golfgame
|
||||
- REDIS_URL=redis://redis:6379/0
|
||||
- SECRET_KEY=${SECRET_KEY}
|
||||
- RESEND_API_KEY=${RESEND_API_KEY}
|
||||
- SENTRY_DSN=${SENTRY_DSN}
|
||||
- ENVIRONMENT=production
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
redis:
|
||||
condition: service_healthy
|
||||
deploy:
|
||||
replicas: 2
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
max_attempts: 3
|
||||
resources:
|
||||
limits:
|
||||
memory: 512M
|
||||
reservations:
|
||||
memory: 256M
|
||||
networks:
|
||||
- internal
|
||||
- web
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.golf.rule=Host(`golf.example.com`)"
|
||||
- "traefik.http.routers.golf.tls=true"
|
||||
- "traefik.http.routers.golf.tls.certresolver=letsencrypt"
|
||||
|
||||
worker:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
command: python -m arq server.worker.WorkerSettings
|
||||
environment:
|
||||
- DATABASE_URL=postgresql://golf:${DB_PASSWORD}@postgres:5432/golfgame
|
||||
- REDIS_URL=redis://redis:6379/0
|
||||
depends_on:
|
||||
- postgres
|
||||
- redis
|
||||
deploy:
|
||||
replicas: 1
|
||||
resources:
|
||||
limits:
|
||||
memory: 256M
|
||||
|
||||
postgres:
|
||||
image: postgres:15-alpine
|
||||
environment:
|
||||
POSTGRES_DB: golfgame
|
||||
POSTGRES_USER: golf
|
||||
POSTGRES_PASSWORD: ${DB_PASSWORD}
|
||||
volumes:
|
||||
- postgres_data:/var/lib/postgresql/data
|
||||
- ./init.sql:/docker-entrypoint-initdb.d/init.sql
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "pg_isready -U golf -d golfgame"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
networks:
|
||||
- internal
|
||||
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
command: redis-server --appendonly yes --maxmemory 128mb --maxmemory-policy allkeys-lru
|
||||
volumes:
|
||||
- redis_data:/data
|
||||
healthcheck:
|
||||
test: ["CMD", "redis-cli", "ping"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
networks:
|
||||
- internal
|
||||
|
||||
traefik:
|
||||
image: traefik:v2.10
|
||||
command:
|
||||
- "--api.dashboard=true"
|
||||
- "--providers.docker=true"
|
||||
- "--providers.docker.exposedbydefault=false"
|
||||
- "--entrypoints.web.address=:80"
|
||||
- "--entrypoints.websecure.address=:443"
|
||||
- "--certificatesresolvers.letsencrypt.acme.httpchallenge=true"
|
||||
- "--certificatesresolvers.letsencrypt.acme.email=${ACME_EMAIL}"
|
||||
- "--certificatesresolvers.letsencrypt.acme.storage=/letsencrypt/acme.json"
|
||||
ports:
|
||||
- "80:80"
|
||||
- "443:443"
|
||||
volumes:
|
||||
- /var/run/docker.sock:/var/run/docker.sock:ro
|
||||
- letsencrypt:/letsencrypt
|
||||
networks:
|
||||
- web
|
||||
|
||||
volumes:
|
||||
postgres_data:
|
||||
redis_data:
|
||||
letsencrypt:
|
||||
|
||||
networks:
|
||||
internal:
|
||||
web:
|
||||
external: true
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 2. Health Checks & Readiness
|
||||
|
||||
### Health Endpoint Implementation
|
||||
|
||||
```python
|
||||
# server/health.py
|
||||
from fastapi import APIRouter, Depends, Response
import json
|
||||
from datetime import datetime
|
||||
import asyncpg
|
||||
import redis.asyncio as redis
|
||||
|
||||
router = APIRouter(tags=["health"])
|
||||
|
||||
@router.get("/health")
|
||||
async def health_check():
|
||||
"""Basic liveness check - is the app running?"""
|
||||
return {"status": "ok", "timestamp": datetime.utcnow().isoformat()}
|
||||
|
||||
@router.get("/ready")
|
||||
async def readiness_check(
|
||||
db: asyncpg.Pool = Depends(get_db_pool),
|
||||
redis_client: redis.Redis = Depends(get_redis)
|
||||
):
|
||||
"""Readiness check - can the app handle requests?"""
|
||||
checks = {}
|
||||
overall_healthy = True
|
||||
|
||||
# Check database
|
||||
try:
|
||||
async with db.acquire() as conn:
|
||||
await conn.fetchval("SELECT 1")
|
||||
checks["database"] = {"status": "ok"}
|
||||
except Exception as e:
|
||||
checks["database"] = {"status": "error", "message": str(e)}
|
||||
overall_healthy = False
|
||||
|
||||
# Check Redis
|
||||
try:
|
||||
await redis_client.ping()
|
||||
checks["redis"] = {"status": "ok"}
|
||||
except Exception as e:
|
||||
checks["redis"] = {"status": "error", "message": str(e)}
|
||||
overall_healthy = False
|
||||
|
||||
status_code = 200 if overall_healthy else 503
|
||||
return Response(
|
||||
content=json.dumps({
|
||||
"status": "ok" if overall_healthy else "degraded",
|
||||
"checks": checks,
|
||||
"timestamp": datetime.utcnow().isoformat()
|
||||
}),
|
||||
status_code=status_code,
|
||||
media_type="application/json"
|
||||
)
|
||||
|
||||
@router.get("/metrics")
|
||||
async def metrics(
|
||||
db: asyncpg.Pool = Depends(get_db_pool),
|
||||
redis_client: redis.Redis = Depends(get_redis)
|
||||
):
|
||||
"""Expose application metrics for monitoring."""
|
||||
async with db.acquire() as conn:
|
||||
active_games = await conn.fetchval(
|
||||
"SELECT COUNT(*) FROM games WHERE completed_at IS NULL"
|
||||
)
|
||||
total_users = await conn.fetchval("SELECT COUNT(*) FROM users")
|
||||
games_today = await conn.fetchval(
|
||||
"SELECT COUNT(*) FROM games WHERE created_at > NOW() - INTERVAL '1 day'"
|
||||
)
|
||||
|
||||
connected_players = await redis_client.scard("connected_players")
|
||||
|
||||
return {
|
||||
"active_games": active_games,
|
||||
"total_users": total_users,
|
||||
"games_today": games_today,
|
||||
"connected_players": connected_players,
|
||||
"timestamp": datetime.utcnow().isoformat()
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. Rate Limiting (Phase 1 Priority)
|
||||
|
||||
Rate limiting is a Phase 1 priority for security. Implement early to prevent abuse.
|
||||
|
||||
### Rate Limiter Implementation
|
||||
|
||||
```python
|
||||
# server/ratelimit.py
|
||||
from fastapi import Request, HTTPException
|
||||
from typing import Optional
|
||||
import redis.asyncio as redis
|
||||
import time
|
||||
import hashlib
|
||||
|
||||
class RateLimiter:
|
||||
"""Token bucket rate limiter using Redis."""
|
||||
|
||||
def __init__(self, redis_client: redis.Redis):
|
||||
self.redis = redis_client
|
||||
|
||||
async def is_allowed(
|
||||
self,
|
||||
key: str,
|
||||
limit: int,
|
||||
window_seconds: int
|
||||
) -> tuple[bool, dict]:
|
||||
"""Check if request is allowed under rate limit.
|
||||
|
||||
Returns (allowed, info) where info contains:
|
||||
- remaining: requests remaining in window
|
||||
- reset: seconds until window resets
|
||||
- limit: the limit that was applied
|
||||
"""
|
||||
now = int(time.time())
|
||||
window_key = f"ratelimit:{key}:{now // window_seconds}"
|
||||
|
||||
async with self.redis.pipeline(transaction=True) as pipe:
|
||||
pipe.incr(window_key)
|
||||
pipe.expire(window_key, window_seconds)
|
||||
results = await pipe.execute()
|
||||
|
||||
current_count = results[0]
|
||||
remaining = max(0, limit - current_count)
|
||||
reset = window_seconds - (now % window_seconds)
|
||||
|
||||
info = {
|
||||
"remaining": remaining,
|
||||
"reset": reset,
|
||||
"limit": limit
|
||||
}
|
||||
|
||||
return current_count <= limit, info
|
||||
|
||||
def get_client_key(self, request: Request, user_id: Optional[str] = None) -> str:
|
||||
"""Generate rate limit key for client."""
|
||||
if user_id:
|
||||
return f"user:{user_id}"
|
||||
|
||||
# For anonymous users, use IP hash
|
||||
client_ip = request.client.host
|
||||
forwarded = request.headers.get("X-Forwarded-For")
|
||||
if forwarded:
|
||||
client_ip = forwarded.split(",")[0].strip()
|
||||
|
||||
# Hash IP for privacy
|
||||
return f"ip:{hashlib.sha256(client_ip.encode()).hexdigest()[:16]}"
|
||||
|
||||
|
||||
# Rate limit configurations per endpoint type
|
||||
RATE_LIMITS = {
|
||||
"api_general": (100, 60), # 100 requests per minute
|
||||
"api_auth": (10, 60), # 10 auth attempts per minute
|
||||
"api_create_room": (5, 60), # 5 room creations per minute
|
||||
"websocket_connect": (10, 60), # 10 WS connections per minute
|
||||
"email_send": (3, 300), # 3 emails per 5 minutes
|
||||
}
|
||||
```
|
||||
|
||||
### Rate Limit Middleware
|
||||
|
||||
```python
|
||||
# server/middleware.py
|
||||
from fastapi import Request
|
||||
from starlette.middleware.base import BaseHTTPMiddleware
|
||||
from starlette.responses import JSONResponse
|
||||
|
||||
class RateLimitMiddleware(BaseHTTPMiddleware):
|
||||
def __init__(self, app, rate_limiter: RateLimiter):
|
||||
super().__init__(app)
|
||||
self.limiter = rate_limiter
|
||||
|
||||
async def dispatch(self, request: Request, call_next):
|
||||
# Determine rate limit tier based on path
|
||||
path = request.url.path
|
||||
|
||||
if path.startswith("/api/auth"):
|
||||
limit, window = RATE_LIMITS["api_auth"]
|
||||
elif path == "/api/rooms":
|
||||
limit, window = RATE_LIMITS["api_create_room"]
|
||||
elif path.startswith("/api"):
|
||||
limit, window = RATE_LIMITS["api_general"]
|
||||
else:
|
||||
# No rate limiting for static files
|
||||
return await call_next(request)
|
||||
|
||||
# Get user ID if authenticated
|
||||
user_id = getattr(request.state, "user_id", None)
|
||||
client_key = self.limiter.get_client_key(request, user_id)
|
||||
|
||||
allowed, info = await self.limiter.is_allowed(
|
||||
f"{path}:{client_key}", limit, window
|
||||
)
|
||||
|
||||
# Add rate limit headers to response
|
||||
response = await call_next(request) if allowed else JSONResponse(
|
||||
status_code=429,
|
||||
content={
|
||||
"error": "Rate limit exceeded",
|
||||
"retry_after": info["reset"]
|
||||
}
|
||||
)
|
||||
|
||||
response.headers["X-RateLimit-Limit"] = str(info["limit"])
|
||||
response.headers["X-RateLimit-Remaining"] = str(info["remaining"])
|
||||
response.headers["X-RateLimit-Reset"] = str(info["reset"])
|
||||
|
||||
if not allowed:
|
||||
response.headers["Retry-After"] = str(info["reset"])
|
||||
|
||||
return response
|
||||
```
|
||||
|
||||
### WebSocket Rate Limiting
|
||||
|
||||
```python
|
||||
# In server/main.py
|
||||
async def websocket_endpoint(websocket: WebSocket):
|
||||
client_key = rate_limiter.get_client_key(websocket)
|
||||
|
||||
allowed, info = await rate_limiter.is_allowed(
|
||||
f"ws_connect:{client_key}",
|
||||
*RATE_LIMITS["websocket_connect"]
|
||||
)
|
||||
|
||||
if not allowed:
|
||||
await websocket.close(code=1008, reason="Rate limit exceeded")
|
||||
return
|
||||
|
||||
# Also rate limit messages within the connection
|
||||
message_limiter = ConnectionMessageLimiter(
|
||||
max_messages=30,
|
||||
window_seconds=10
|
||||
)
|
||||
|
||||
await websocket.accept()
|
||||
|
||||
try:
|
||||
while True:
|
||||
data = await websocket.receive_text()
|
||||
|
||||
if not message_limiter.check():
|
||||
await websocket.send_json({
|
||||
"type": "error",
|
||||
"message": "Slow down! Too many messages."
|
||||
})
|
||||
continue
|
||||
|
||||
await handle_message(websocket, data)
|
||||
except WebSocketDisconnect:
|
||||
pass
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4. Security Headers & HTTPS
|
||||
|
||||
### Security Middleware
|
||||
|
||||
```python
|
||||
# server/security.py
|
||||
from starlette.middleware.base import BaseHTTPMiddleware
|
||||
|
||||
class SecurityHeadersMiddleware(BaseHTTPMiddleware):
|
||||
async def dispatch(self, request, call_next):
|
||||
response = await call_next(request)
|
||||
|
||||
# Security headers
|
||||
response.headers["X-Content-Type-Options"] = "nosniff"
|
||||
response.headers["X-Frame-Options"] = "DENY"
|
||||
response.headers["X-XSS-Protection"] = "1; mode=block"
|
||||
response.headers["Referrer-Policy"] = "strict-origin-when-cross-origin"
|
||||
response.headers["Permissions-Policy"] = "geolocation=(), microphone=(), camera=()"
|
||||
|
||||
# Content Security Policy
|
||||
csp = "; ".join([
|
||||
"default-src 'self'",
|
||||
"script-src 'self'",
|
||||
"style-src 'self' 'unsafe-inline'", # For inline styles
|
||||
"img-src 'self' data:",
|
||||
"font-src 'self'",
|
||||
"connect-src 'self' wss://*.example.com",
|
||||
"frame-ancestors 'none'",
|
||||
"base-uri 'self'",
|
||||
"form-action 'self'"
|
||||
])
|
||||
response.headers["Content-Security-Policy"] = csp
|
||||
|
||||
# HSTS (only in production)
|
||||
if request.url.scheme == "https":
|
||||
response.headers["Strict-Transport-Security"] = (
|
||||
"max-age=31536000; includeSubDomains; preload"
|
||||
)
|
||||
|
||||
return response
|
||||
```
|
||||
|
||||
### CORS Configuration
|
||||
|
||||
```python
|
||||
# server/main.py
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=[
|
||||
"https://golf.example.com",
|
||||
"https://www.golf.example.com",
|
||||
],
|
||||
allow_credentials=True,
|
||||
allow_methods=["GET", "POST", "PUT", "DELETE"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5. Error Tracking with Sentry
|
||||
|
||||
### Sentry Integration
|
||||
|
||||
```python
|
||||
# server/main.py
|
||||
import sentry_sdk
|
||||
from sentry_sdk.integrations.fastapi import FastApiIntegration
|
||||
from sentry_sdk.integrations.redis import RedisIntegration
|
||||
from sentry_sdk.integrations.asyncpg import AsyncPGIntegration
|
||||
|
||||
if os.getenv("SENTRY_DSN"):
|
||||
sentry_sdk.init(
|
||||
dsn=os.getenv("SENTRY_DSN"),
|
||||
environment=os.getenv("ENVIRONMENT", "development"),
|
||||
traces_sample_rate=0.1, # 10% of transactions for performance
|
||||
profiles_sample_rate=0.1,
|
||||
integrations=[
|
||||
FastApiIntegration(transaction_style="endpoint"),
|
||||
RedisIntegration(),
|
||||
AsyncPGIntegration(),
|
||||
],
|
||||
# Filter out sensitive data
|
||||
before_send=filter_sensitive_data,
|
||||
)
|
||||
|
||||
def filter_sensitive_data(event, hint):
|
||||
"""Remove sensitive data before sending to Sentry."""
|
||||
if "request" in event:
|
||||
headers = event["request"].get("headers", {})
|
||||
# Remove auth headers
|
||||
headers.pop("authorization", None)
|
||||
headers.pop("cookie", None)
|
||||
|
||||
return event
|
||||
```
|
||||
|
||||
### Custom Error Handler
|
||||
|
||||
```python
|
||||
# server/errors.py
|
||||
from fastapi import Request
|
||||
from fastapi.responses import JSONResponse
|
||||
import sentry_sdk
|
||||
import traceback
|
||||
|
||||
async def global_exception_handler(request: Request, exc: Exception):
|
||||
"""Handle all unhandled exceptions."""
|
||||
|
||||
# Log to Sentry
|
||||
sentry_sdk.capture_exception(exc)
|
||||
|
||||
# Log locally
|
||||
logger.error(f"Unhandled exception: {exc}", exc_info=True)
|
||||
|
||||
# Return generic error to client
|
||||
return JSONResponse(
|
||||
status_code=500,
|
||||
content={
|
||||
"error": "Internal server error",
|
||||
"request_id": request.state.request_id
|
||||
}
|
||||
)
|
||||
|
||||
# Register handler
|
||||
app.add_exception_handler(Exception, global_exception_handler)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 6. Structured Logging
|
||||
|
||||
### Logging Configuration
|
||||
|
||||
```python
|
||||
# server/logging_config.py
|
||||
import logging
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
class JSONFormatter(logging.Formatter):
|
||||
"""Format logs as JSON for aggregation."""
|
||||
|
||||
def format(self, record):
|
||||
log_data = {
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"level": record.levelname,
|
||||
"logger": record.name,
|
||||
"message": record.getMessage(),
|
||||
}
|
||||
|
||||
# Add extra fields
|
||||
if hasattr(record, "request_id"):
|
||||
log_data["request_id"] = record.request_id
|
||||
if hasattr(record, "user_id"):
|
||||
log_data["user_id"] = record.user_id
|
||||
if hasattr(record, "game_id"):
|
||||
log_data["game_id"] = record.game_id
|
||||
|
||||
# Add exception info
|
||||
if record.exc_info:
|
||||
log_data["exception"] = self.formatException(record.exc_info)
|
||||
|
||||
return json.dumps(log_data)
|
||||
|
||||
def setup_logging():
|
||||
"""Configure application logging."""
|
||||
handler = logging.StreamHandler()
|
||||
|
||||
if os.getenv("ENVIRONMENT") == "production":
|
||||
handler.setFormatter(JSONFormatter())
|
||||
else:
|
||||
handler.setFormatter(logging.Formatter(
|
||||
"%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
))
|
||||
|
||||
logging.root.handlers = [handler]
|
||||
logging.root.setLevel(logging.INFO)
|
||||
|
||||
# Reduce noise from libraries
|
||||
logging.getLogger("uvicorn.access").setLevel(logging.WARNING)
|
||||
logging.getLogger("websockets").setLevel(logging.WARNING)
|
||||
```
|
||||
|
||||
### Request ID Middleware
|
||||
|
||||
```python
|
||||
# server/middleware.py
|
||||
import uuid
|
||||
|
||||
class RequestIDMiddleware(BaseHTTPMiddleware):
|
||||
async def dispatch(self, request, call_next):
|
||||
request_id = request.headers.get("X-Request-ID", str(uuid.uuid4()))
|
||||
request.state.request_id = request_id
|
||||
|
||||
response = await call_next(request)
|
||||
response.headers["X-Request-ID"] = request_id
|
||||
|
||||
return response
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 7. Graceful Shutdown
|
||||
|
||||
### Shutdown Handler
|
||||
|
||||
```python
|
||||
# server/main.py
|
||||
import signal
|
||||
import asyncio
|
||||
|
||||
shutdown_event = asyncio.Event()
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup():
|
||||
# Register signal handlers
|
||||
loop = asyncio.get_running_loop()
|
||||
for sig in (signal.SIGTERM, signal.SIGINT):
|
||||
loop.add_signal_handler(sig, lambda: asyncio.create_task(shutdown()))
|
||||
|
||||
@app.on_event("shutdown")
|
||||
async def shutdown():
|
||||
logger.info("Shutdown initiated...")
|
||||
|
||||
# Stop accepting new connections
|
||||
shutdown_event.set()
|
||||
|
||||
# Save all active games to Redis
|
||||
await save_all_active_games()
|
||||
|
||||
# Close WebSocket connections gracefully
|
||||
for ws in list(active_connections):
|
||||
try:
|
||||
await ws.close(code=1001, reason="Server shutting down")
|
||||
except:
|
||||
pass
|
||||
|
||||
# Wait for in-flight requests (max 30 seconds)
|
||||
await asyncio.sleep(5)
|
||||
|
||||
# Close database pool
|
||||
await db_pool.close()
|
||||
|
||||
# Close Redis connections
|
||||
await redis_client.close()
|
||||
|
||||
logger.info("Shutdown complete")
|
||||
|
||||
async def save_all_active_games():
|
||||
"""Persist all active games before shutdown."""
|
||||
for game_id, game in active_games.items():
|
||||
try:
|
||||
await state_cache.save_game(game)
|
||||
logger.info(f"Saved game {game_id}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to save game {game_id}: {e}")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 8. Secrets Management
|
||||
|
||||
### Environment Configuration
|
||||
|
||||
```python
|
||||
# server/config.py
|
||||
from pydantic import BaseSettings, PostgresDsn, RedisDsn
|
||||
|
||||
class Settings(BaseSettings):
|
||||
# Database
|
||||
database_url: PostgresDsn
|
||||
|
||||
# Redis
|
||||
redis_url: RedisDsn
|
||||
|
||||
# Security
|
||||
secret_key: str
|
||||
jwt_algorithm: str = "HS256"
|
||||
jwt_expiry_hours: int = 24
|
||||
|
||||
# Email
|
||||
resend_api_key: str
|
||||
email_from: str = "Golf Game <noreply@golf.example.com>"
|
||||
|
||||
# Monitoring
|
||||
sentry_dsn: str = ""
|
||||
environment: str = "development"
|
||||
|
||||
# Rate limiting
|
||||
rate_limit_enabled: bool = True
|
||||
|
||||
class Config:
|
||||
env_file = ".env"
|
||||
case_sensitive = False
|
||||
|
||||
settings = Settings()
|
||||
```
|
||||
|
||||
### Production Secrets (Example for Docker Swarm)
|
||||
|
||||
```yaml
|
||||
# docker-compose.prod.yml
|
||||
secrets:
|
||||
db_password:
|
||||
external: true
|
||||
secret_key:
|
||||
external: true
|
||||
resend_api_key:
|
||||
external: true
|
||||
|
||||
services:
|
||||
app:
|
||||
secrets:
|
||||
- db_password
|
||||
- secret_key
|
||||
- resend_api_key
|
||||
environment:
|
||||
      # Note: libpq/asyncpg URIs have no "password_file" parameter. Point the app
      # at the secret file and have it read the password at startup to build the DSN.
      - DATABASE_URL=postgresql://golf@postgres:5432/golfgame
      - DB_PASSWORD_FILE=/run/secrets/db_password
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 9. Database Migrations
|
||||
|
||||
### Alembic Configuration
|
||||
|
||||
```ini
|
||||
# alembic.ini
|
||||
[alembic]
|
||||
script_location = migrations
|
||||
# sqlalchemy.url is populated from the DATABASE_URL environment variable
# in migrations/env.py (alembic.ini does not support env:// substitution):
#   config.set_main_option("sqlalchemy.url", os.environ["DATABASE_URL"])
|
||||
|
||||
[logging]
|
||||
level = INFO
|
||||
```
|
||||
|
||||
### Migration Script Template
|
||||
|
||||
```python
|
||||
# migrations/versions/001_initial.py
|
||||
"""Initial schema
|
||||
|
||||
Revision ID: 001
|
||||
Create Date: 2024-01-01
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
revision = '001'
|
||||
down_revision = None
|
||||
|
||||
def upgrade():
|
||||
# Users table
|
||||
op.create_table(
|
||||
'users',
|
||||
sa.Column('id', sa.UUID(), primary_key=True),
|
||||
sa.Column('username', sa.String(50), unique=True, nullable=False),
|
||||
sa.Column('email', sa.String(255), unique=True, nullable=False),
|
||||
sa.Column('password_hash', sa.String(255), nullable=False),
|
||||
sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.func.now()),
|
||||
sa.Column('is_admin', sa.Boolean(), default=False),
|
||||
)
|
||||
|
||||
# Games table
|
||||
op.create_table(
|
||||
'games',
|
||||
sa.Column('id', sa.UUID(), primary_key=True),
|
||||
sa.Column('room_code', sa.String(10), nullable=False),
|
||||
sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.func.now()),
|
||||
sa.Column('completed_at', sa.DateTime(timezone=True)),
|
||||
)
|
||||
|
||||
# Events table
|
||||
op.create_table(
|
||||
'events',
|
||||
sa.Column('id', sa.BigInteger(), primary_key=True, autoincrement=True),
|
||||
sa.Column('game_id', sa.UUID(), sa.ForeignKey('games.id'), nullable=False),
|
||||
sa.Column('event_type', sa.String(50), nullable=False),
|
||||
sa.Column('data', sa.JSON(), nullable=False),
|
||||
sa.Column('timestamp', sa.DateTime(timezone=True), server_default=sa.func.now()),
|
||||
)
|
||||
|
||||
# Indexes
|
||||
op.create_index('idx_events_game_id', 'events', ['game_id'])
|
||||
op.create_index('idx_users_email', 'users', ['email'])
|
||||
op.create_index('idx_users_username', 'users', ['username'])
|
||||
|
||||
def downgrade():
|
||||
op.drop_table('events')
|
||||
op.drop_table('games')
|
||||
op.drop_table('users')
|
||||
```
|
||||
|
||||
### Migration Commands
|
||||
|
||||
```bash
|
||||
# Create new migration
|
||||
alembic revision --autogenerate -m "Add user sessions"
|
||||
|
||||
# Run migrations
|
||||
alembic upgrade head
|
||||
|
||||
# Rollback one version
|
||||
alembic downgrade -1
|
||||
|
||||
# Show current version
|
||||
alembic current
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 10. Deployment Checklist
|
||||
|
||||
### Pre-deployment
|
||||
|
||||
- [ ] All environment variables set
|
||||
- [ ] Database migrations applied
|
||||
- [ ] Secrets configured in secret manager
|
||||
- [ ] SSL certificates provisioned
|
||||
- [ ] Rate limiting configured and tested
|
||||
- [ ] Error tracking (Sentry) configured
|
||||
- [ ] Logging aggregation set up
|
||||
- [ ] Health check endpoints verified
|
||||
- [ ] Backup strategy implemented
|
||||
|
||||
### Deployment
|
||||
|
||||
- [ ] Run database migrations
|
||||
- [ ] Deploy new containers with rolling update
|
||||
- [ ] Verify health checks pass
|
||||
- [ ] Monitor error rates in Sentry
|
||||
- [ ] Check application logs
|
||||
- [ ] Verify WebSocket connections work
|
||||
- [ ] Test critical user flows
|
||||
|
||||
### Post-deployment
|
||||
|
||||
- [ ] Monitor performance metrics
|
||||
- [ ] Check database connection pool usage
|
||||
- [ ] Verify Redis memory usage
|
||||
- [ ] Review error logs
|
||||
- [ ] Test graceful shutdown/restart
|
||||
|
||||
---
|
||||
|
||||
## 11. Monitoring Dashboard (Grafana)
|
||||
|
||||
### Key Metrics to Track
|
||||
|
||||
```yaml
|
||||
# Example Prometheus metrics
|
||||
metrics:
|
||||
# Application
|
||||
- http_requests_total
|
||||
- http_request_duration_seconds
|
||||
- websocket_connections_active
|
||||
- games_active
|
||||
- games_completed_total
|
||||
|
||||
# Infrastructure
|
||||
- container_cpu_usage_seconds_total
|
||||
- container_memory_usage_bytes
|
||||
- pg_stat_activity_count
|
||||
- redis_connected_clients
|
||||
- redis_used_memory_bytes
|
||||
|
||||
# Business
|
||||
- users_registered_total
|
||||
- games_played_today
|
||||
- average_game_duration_seconds
|
||||
```
|
||||
|
||||
### Alert Rules
|
||||
|
||||
```yaml
|
||||
# alertmanager rules
|
||||
groups:
|
||||
- name: golf-alerts
|
||||
rules:
|
||||
- alert: HighErrorRate
|
||||
expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "High error rate detected"
|
||||
|
||||
- alert: DatabaseConnectionExhausted
|
||||
expr: pg_stat_activity_count > 90
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Database connections near limit"
|
||||
|
||||
- alert: HighMemoryUsage
|
||||
expr: container_memory_usage_bytes / container_spec_memory_limit_bytes > 0.9
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Container memory usage above 90%"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 12. Backup Strategy
|
||||
|
||||
### Database Backups
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# backup.sh - Daily database backup
|
||||
|
||||
BACKUP_DIR=/backups
|
||||
DATE=$(date +%Y%m%d_%H%M%S)
|
||||
BACKUP_FILE="${BACKUP_DIR}/golfgame_${DATE}.sql.gz"
|
||||
|
||||
# Backup with pg_dump
|
||||
pg_dump -h postgres -U golf golfgame | gzip > "$BACKUP_FILE"
|
||||
|
||||
# Upload to S3/B2/etc
|
||||
aws s3 cp "$BACKUP_FILE" s3://golf-backups/
|
||||
|
||||
# Cleanup old local backups (keep 7 days)
|
||||
find "$BACKUP_DIR" -name "*.sql.gz" -mtime +7 -delete
|
||||
|
||||
# Cleanup old S3 backups (keep 30 days) via lifecycle policy
|
||||
```
|
||||
|
||||
### Redis Persistence
|
||||
|
||||
```conf
|
||||
# redis.conf
|
||||
appendonly yes
|
||||
appendfsync everysec
|
||||
auto-aof-rewrite-percentage 100
|
||||
auto-aof-rewrite-min-size 64mb
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Summary
|
||||
|
||||
This document covers all production deployment concerns:
|
||||
|
||||
1. **Docker**: Multi-stage builds, health checks, resource limits
|
||||
2. **Rate Limiting**: Token bucket algorithm, per-endpoint limits (Phase 1 priority)
|
||||
3. **Security**: Headers, CORS, CSP, HSTS
|
||||
4. **Monitoring**: Sentry, structured logging, Prometheus metrics
|
||||
5. **Operations**: Graceful shutdown, migrations, backups
|
||||
6. **Deployment**: Checklist, rolling updates, health verification
|
||||
|
||||
Rate limiting is implemented in Phase 1 as a security priority to protect against abuse before public launch.
|
||||
Reference in New Issue
Block a user