diff --git a/rpi/kickoff-pi-test.sh b/rpi/kickoff-pi-test.sh
new file mode 100755
index 0000000..390ddee
--- /dev/null
+++ b/rpi/kickoff-pi-test.sh
@@ -0,0 +1,193 @@
+#!/bin/bash
+#
+# Stegasoo Pi Test Kickoff Script
+# Automates: flash -> wait for boot -> setup -> test
+#
+# Usage: ./kickoff-pi-test.sh <image.img.zst> </dev/sdX>
+#
+
+set -e  # exit on the first unhandled command failure
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"  # absolute dir of this script
+
+# Pi connection settings: SSH target and password login used by every helper below
+PI_HOST="stegasoo.local"
+PI_USER="admin"
+PI_PASS="stegasoo"  # NOTE(review): hardcoded and passed on argv via sshpass -p; only OK for a throwaway test image
+
+# ANSI color escape sequences (expanded by echo -e / printf format strings below)
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+CYAN='\033[0;36m'
+NC='\033[0m'
+
+# -----------------------------------------------------------------------------
+# Helper functions
+# -----------------------------------------------------------------------------
+
+# Block until the Pi accepts a password SSH login, retrying with a 2-second pause
+wait_for_pi() {
+    local attempt=1
+    # Best effort: a non-zero exit here (e.g. no known_hosts file) must not trip set -e
+    ssh-keygen -R "$PI_HOST" 2>/dev/null || true
+    echo "Waiting for $PI_USER@$PI_HOST..."
+    while ! sshpass -p "$PI_PASS" ssh -o ConnectTimeout=2 -o StrictHostKeyChecking=no -o BatchMode=no -o UserKnownHostsFile=/dev/null "$PI_USER@$PI_HOST" "exit" 2>/dev/null; do
+        printf "\rAttempt %d..." "$attempt"
+        ((attempt++))
+        sleep 2
+    done
+
+    printf "\r${GREEN}✓ Ready after %d attempts${NC}\n" "$attempt"
+    printf '\a'  # Terminal bell
+}
+
+# Execute a command on the Pi over SSH (no -t, so no PTY is requested)
+run_on_pi() {
+    sshpass -p "$PI_PASS" ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no "${PI_USER}@${PI_HOST}" "$@"
+}
+
+# Execute a command on the Pi over SSH with a forced PTY (-t), for prompts/TUI output
+run_on_pi_interactive() {
+    sshpass -p "$PI_PASS" ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -t "${PI_USER}@${PI_HOST}" "$@"
+}
+
+# Copy a local file to the Pi: scp_to_pi <local-path> <remote-path>
+scp_to_pi() {
+    local local_path="$1" remote_path="$2"
+    sshpass -p "$PI_PASS" scp -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no \
+        "$local_path" "${PI_USER}@${PI_HOST}:${remote_path}"
+}
+
+# Interactive SSH session (forgets any cached host key first; extra args go to ssh)
+ssh_pi() {
+    ssh-keygen -R "$PI_HOST" 2>/dev/null || true  # best effort; must not trip set -e
+    sshpass -p "$PI_PASS" ssh -t -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "$PI_USER@$PI_HOST" "$@"
+}
+
+# -----------------------------------------------------------------------------
+# Main
+# -----------------------------------------------------------------------------
+
+if [[ $# -lt 2 ]]; then
+    echo "Usage: $0 <image.img.zst> </dev/sdX>" >&2
+    echo "" >&2
+    echo "Example: $0 stegasoo-v4.1.img.zst /dev/sda" >&2
+    exit 1
+fi
+
+IMAGE="$1"
+DEVICE="$2"
+
+if [[ ! -f "$IMAGE" ]]; then
+    echo -e "${RED}Error: Image file not found: $IMAGE${NC}" >&2
+    exit 1
+fi
+
+if [[ ! -b "$DEVICE" ]]; then
+    echo -e "${RED}Error: Device not found: $DEVICE${NC}" >&2
+    exit 1
+fi
+command -v sshpass >/dev/null || { echo -e "${RED}Error: sshpass is required but not installed${NC}" >&2; exit 1; }  # else wait_for_pi spins forever
+echo -e "${CYAN}╔═══════════════════════════════════════════════════════════════╗${NC}"
+echo -e "${CYAN}║              Stegasoo Pi Test Kickoff                         ║${NC}"
+echo -e "${CYAN}╚═══════════════════════════════════════════════════════════════╝${NC}"
+echo ""
+echo -e "Image:  ${YELLOW}$IMAGE${NC}"
+echo -e "Device: ${YELLOW}$DEVICE${NC}"
+echo ""
+
+# -----------------------------------------------------------------------------
+# Step 1: Flash the image
+# -----------------------------------------------------------------------------
+echo -e "${GREEN}[1/8]${NC} Flashing image..."
+echo ""
+
+# Auto-answer: "yes" for confirm, "y" for wipe, "y" for resize
+printf 'yes\ny\ny\n' | "$SCRIPT_DIR/flash-stock-img.sh" "$IMAGE" "$DEVICE"
+
+echo ""
+echo -e "${GREEN}[2/8]${NC} Flash complete! Waiting for SD card insertion..."
+echo ""
+
+# -----------------------------------------------------------------------------
+# Step 2: Wait for user to insert SD card
+# -----------------------------------------------------------------------------
+echo -e "${YELLOW}════════════════════════════════════════════════════════════════${NC}"
+echo -e "${YELLOW}  Insert SD card into Pi and power on${NC}"
+echo -e "${YELLOW}════════════════════════════════════════════════════════════════${NC}"
+echo ""
+read -r -p "Press ENTER when Pi is booting..."
+
+echo ""
+
+# -----------------------------------------------------------------------------
+# Step 3: Wait for Pi to be ready
+# -----------------------------------------------------------------------------
+echo -e "${GREEN}[3/8]${NC} Waiting for Pi to boot..."
+echo ""
+
+wait_for_pi
+
+# -----------------------------------------------------------------------------
+# Step 4: Pre-setup (install dependencies)
+# -----------------------------------------------------------------------------
+echo ""
+echo -e "${GREEN}[4/8]${NC} Installing dependencies on Pi..."
+echo ""
+
+run_on_pi "sudo chown ${PI_USER}:${PI_USER} /opt && sudo apt-get update && sudo apt-get install -y git zstd jq"
+
+# -----------------------------------------------------------------------------
+# Step 5: Clone repo
+# -----------------------------------------------------------------------------
+echo ""
+echo -e "${GREEN}[5/8]${NC} Cloning Stegasoo repo..."
+echo ""
+
+run_on_pi "cd /opt && git clone -b 4.1 https://github.com/adlee-was-taken/stegasoo.git stegasoo"
+
+# -----------------------------------------------------------------------------
+# Step 6: Copy pre-built tarball
+# -----------------------------------------------------------------------------
+echo ""
+echo -e "${GREEN}[6/8]${NC} Copying pre-built tarball to Pi..."
+echo ""
+
+TARBALL="$SCRIPT_DIR/stegasoo-pi-arm64.tar.zst"
+if [[ -f "$TARBALL" ]]; then
+    scp_to_pi "$TARBALL" "/opt/stegasoo/rpi/"
+    echo -e "  ${GREEN}✓${NC} Tarball copied"
+else
+    echo -e "  ${YELLOW}⚠${NC} Tarball not found at $TARBALL"
+    echo -e "  ${YELLOW}⚠${NC} Setup will build from source (takes longer)"
+fi
+
+# -----------------------------------------------------------------------------
+# Step 7: Run setup
+# -----------------------------------------------------------------------------
+echo ""
+echo -e "${GREEN}[7/8]${NC} Running setup.sh on Pi..."
+echo ""
+
+run_on_pi_interactive "cd /opt/stegasoo && ./rpi/setup.sh"
+
+# -----------------------------------------------------------------------------
+# Step 8: Test it works
+# -----------------------------------------------------------------------------
+echo ""
+echo -e "${GREEN}[8/8]${NC} Testing Stegasoo..."
+echo ""
+# NOTE: the remote pipeline's status is head's, so a failed curl does not abort the run
+run_on_pi "sudo systemctl start stegasoo && sleep 2 && curl -sk https://localhost:5000 | head -5"
+
+echo ""
+echo -e "${GREEN}════════════════════════════════════════════════════════════════${NC}"
+echo -e "${GREEN}  Build complete! Pi is ready for testing.${NC}"
+echo -e "${GREEN}════════════════════════════════════════════════════════════════${NC}"
+echo ""
+echo -e "Access: ${YELLOW}https://${PI_HOST}:5000${NC}"
+echo ""
+read -r -p "Press ENTER to SSH into Pi for manual testing..."
+
+ssh_pi
diff --git a/src/stegasoo/crypto.py b/src/stegasoo/crypto.py
index 779d054..0e99dca 100644
--- a/src/stegasoo/crypto.py
+++ b/src/stegasoo/crypto.py
@@ -1,18 +1,26 @@
 """
 Stegasoo Cryptographic Functions (v4.0.0 - Channel Key Support)
 
-Key derivation, encryption, and decryption using AES-256-GCM.
-Supports both text messages and binary file payloads.
+This is the crypto layer - where we turn plaintext into indecipherable noise.
 
-BREAKING CHANGES in v4.0.0:
-- Added channel key support for deployment/group isolation
-- Messages encoded with a channel key require the same key to decode
-- Channel key can be configured via environment, config file, or explicit parameter
-- FORMAT_VERSION bumped to 5
+The security model is multi-factor:
+┌────────────────────────────────────────────────────────────────────┐
+│  SOMETHING YOU HAVE          SOMETHING YOU KNOW                    │
+│  ├─ Reference photo          ├─ Passphrase (4+ BIP-39 words)      │
+│  └─ RSA private key (opt)    └─ PIN (6-9 digits)                  │
+│                                                                    │
+│  DEPLOYMENT BINDING                                                │
+│  └─ Channel key (ties messages to a specific server/group)        │
+└────────────────────────────────────────────────────────────────────┘
 
-BREAKING CHANGES in v3.2.0:
-- Removed date dependency from key derivation
-- Renamed day_phrase → passphrase (no daily rotation needed)
+All factors get mixed together through Argon2id (memory-hard KDF) to derive
+the actual encryption key. Miss any factor = wrong key = garbage output.
+
+Encryption: AES-256-GCM (authenticated encryption - tamper = detection)
+KDF: Argon2id (256MB RAM, 4 iterations) or PBKDF2 fallback (600K iterations)
+
+v4.0.0: Added channel key for server/group isolation
+v3.2.0: Removed date dependency (was cute but annoying in practice)
 """
 
 import hashlib
@@ -98,25 +106,38 @@ def _resolve_channel_key(channel_key: str | bool | None) -> bytes | None:
 # =============================================================================
 # CORE CRYPTO FUNCTIONS
 # =============================================================================
+#
+# The "reference photo as a key" concept is one of Stegasoo's unique features.
+# Most steganography tools just use a password. We add the photo as a
+# "something you have" factor - like a hardware token, but it's a cat picture.
 
 
 def hash_photo(image_data: bytes) -> bytes:
     """
     Compute deterministic hash of photo pixel content.
 
-    This normalizes the image to RGB and hashes the raw pixel data,
-    making it resistant to metadata changes.
+    This is the magic sauce that turns your cat photo into a cryptographic key.
+
+    Why pixels and not the file hash?
+    - File metadata changes (EXIF stripped, resaved) = different file hash
+    - But pixel content stays the same
+    - We hash decoded RGB values, so lossless conversions match (lossy re-encodes won't)
+
+    The double-hash with prefix is belt-and-suspenders mixing. Probably
+    overkill, but hey, it's crypto - paranoia is a feature.
 
     Args:
-        image_data: Raw image file bytes
+        image_data: Raw image file bytes (any format PIL can read)
 
     Returns:
-        32-byte SHA-256 hash
+        32-byte SHA-256 hash of pixel content
     """
+    # Convert to RGB to normalize (RGBA, grayscale, etc. all become RGB)
     img: Image.Image = Image.open(io.BytesIO(image_data)).convert("RGB")
     pixels = img.tobytes()
 
-    # Double-hash with prefix for additional mixing
+    # Double-hash: SHA256(SHA256(pixels) + first 1KB of pixels)
+    # Mixing the leading pixel bytes back in is belt-and-suspenders, not a hard requirement
     h = hashlib.sha256(pixels).digest()
     h = hashlib.sha256(h + pixels[:1024]).digest()
     return h
@@ -133,20 +154,38 @@ def derive_hybrid_key(
     """
     Derive encryption key from multiple factors.
 
-    Combines:
-    - Photo hash (something you have)
-    - Passphrase (something you know)
-    - PIN (something you know, static)
-    - RSA key (something you have)
-    - Channel key (deployment/group binding)
-    - Salt (random per message)
+    This is the heart of Stegasoo's security model. We take all the things
+    you need to prove you're authorized (photo, passphrase, PIN, etc.) and
+    blend them together into one 32-byte key.
 
-    Uses Argon2id if available, falls back to PBKDF2.
+    The flow:
+    ┌─────────────┐   ┌─────────────┐   ┌─────────────┐
+    │ Photo hash  │ + │ passphrase  │ + │ PIN + RSA   │ + salt
+    └─────────────┘   └─────────────┘   └─────────────┘
+           │                 │                 │
+           └────────────────┴────────────────┘
+                             │
+                             ▼
+                    ┌─────────────────┐
+                    │    Argon2id     │  <- Memory-hard KDF
+                    │   256MB / 4 iter │  <- Makes brute force expensive
+                    └─────────────────┘
+                             │
+                             ▼
+                      32-byte AES key
+
+    Why Argon2id?
+    - Memory-hard: attackers can't just throw GPUs at it
+    - 256MB RAM per attempt = expensive at scale
+    - Winner of the Password Hashing Competition (2015)
+    - "id" variant resists both side-channel and GPU attacks
+
+    Fallback: PBKDF2-SHA512 with 600K iterations (for systems without argon2)
 
     Args:
         photo_data: Reference photo bytes
-        passphrase: Shared passphrase (recommend 4+ words)
-        salt: Random salt for this message
+        passphrase: Shared passphrase (recommend 4+ words from BIP-39)
+        salt: Random salt for this message (32 bytes)
         pin: Optional static PIN
         rsa_key_data: Optional RSA key bytes
         channel_key: Channel key parameter:
@@ -155,7 +194,7 @@ def derive_hybrid_key(
             - "" or False: No channel key (public mode)
 
     Returns:
-        32-byte derived key
+        32-byte derived key (ready for AES-256)
 
     Raises:
         KeyDerivationError: If key derivation fails
@@ -163,31 +202,36 @@ def derive_hybrid_key(
     try:
         photo_hash = hash_photo(photo_data)
 
-        # Resolve channel key
+        # Resolve channel key (server-specific binding)
         channel_hash = _resolve_channel_key(channel_key)
 
-        # Build key material
+        # Build key material by concatenating all factors
+        # Passphrase is lowercased to be forgiving of case differences
         key_material = photo_hash + passphrase.lower().encode() + pin.encode() + salt
 
-        # Add RSA key hash if provided
+        # Add RSA key hash if provided (another "something you have")
         if rsa_key_data:
             key_material += hashlib.sha256(rsa_key_data).digest()
 
-        # Add channel key hash if configured (v4.0.0)
+        # Add channel key hash if configured (v4.0.0 - deployment binding)
         if channel_hash:
             key_material += channel_hash
 
+        # Run it all through the KDF
         if HAS_ARGON2:
+            # Argon2id: the good stuff
             key = hash_secret_raw(
                 secret=key_material,
                 salt=salt[:32],
-                time_cost=ARGON2_TIME_COST,
-                memory_cost=ARGON2_MEMORY_COST,
-                parallelism=ARGON2_PARALLELISM,
+                time_cost=ARGON2_TIME_COST,      # 4 iterations
+                memory_cost=ARGON2_MEMORY_COST,  # 256 MB RAM
+                parallelism=ARGON2_PARALLELISM,  # 4 threads
                 hash_len=32,
-                type=Type.ID,
+                type=Type.ID,  # Hybrid mode: resists side-channel AND GPU attacks
             )
         else:
+            # PBKDF2 fallback for systems without argon2-cffi
+            # 600K iterations is slow but not memory-hard
             kdf = PBKDF2HMAC(
                 algorithm=hashes.SHA512(),
                 length=32,
@@ -347,9 +391,12 @@ def _unpack_payload(data: bytes) -> DecodeResult:
 # =============================================================================
 # HEADER FLAGS (v4.0.0)
 # =============================================================================
+#
+# The flags byte tells us about the message without decrypting it.
+# Currently just one flag, but the byte gives us room for 8.
 
-# Header flag bits
-FLAG_CHANNEL_KEY = 0x01  # Set if encoded with a channel key
+FLAG_CHANNEL_KEY = 0x01  # Bit 0: Message was encoded with a channel key
+# Future flags could include: compression, file attachment, etc.
 
 
 def encrypt_message(
@@ -361,33 +408,40 @@ def encrypt_message(
     channel_key: str | bool | None = None,
 ) -> bytes:
     """
-    Encrypt message or file using AES-256-GCM with hybrid key derivation.
+    Encrypt message or file using AES-256-GCM.
 
-    Message format (v4.0.0 - with channel key support):
-    - Magic header (4 bytes)
-    - Version (1 byte) = 5
-    - Flags (1 byte) - indicates if channel key was used
-    - Salt (32 bytes)
-    - IV (12 bytes)
-    - Auth tag (16 bytes)
-    - Ciphertext (variable, padded)
+    This is where plaintext becomes ciphertext. We use AES-256-GCM which is:
+    - AES: The standard, used by everyone from banks to governments
+    - 256-bit key: Enough entropy to survive until the heat death of the universe
+    - GCM mode: Authenticated encryption - if anyone tampers, decryption fails
+
+    The output format (v4.0.0):
+    ┌──────────────────────────────────────────────────────────────────────┐
+    │ \x89ST3 │ 05 │ flags │  salt (32B)  │  iv (12B)  │  tag (16B)  │ ··· │
+    │  magic  │ver │       │              │            │             │cipher│
+    └──────────────────────────────────────────────────────────────────────┘
+
+    Why the random padding at the end?
+    - Message length can reveal information (traffic analysis)
+    - We add 64-319 random bytes and round to 256-byte boundary
+    - All messages look roughly the same size
 
     Args:
         message: Message string, raw bytes, or FilePayload to encrypt
-        photo_data: Reference photo bytes
-        passphrase: Shared passphrase (recommend 4+ words for good entropy)
-        pin: Optional static PIN
-        rsa_key_data: Optional RSA key bytes
+        photo_data: Reference photo bytes (your "key photo")
+        passphrase: Shared passphrase (recommend 4+ words from BIP-39)
+        pin: Optional static PIN for additional security
+        rsa_key_data: Optional RSA key bytes (another "something you have")
         channel_key: Channel key parameter:
-            - None or "auto": Use configured key
+            - None or "auto": Use server's configured key
             - str: Use this specific key
             - "" or False: No channel key (public mode)
 
     Returns:
-        Encrypted message bytes
+        Encrypted message bytes ready for embedding
 
     Raises:
-        EncryptionError: If encryption fails
+        EncryptionError: If encryption fails (shouldn't happen with valid inputs)
     """
     try:
         salt = secrets.token_bytes(SALT_SIZE)
diff --git a/src/stegasoo/dct_steganography.py b/src/stegasoo/dct_steganography.py
index ff355c9..2da5703 100644
--- a/src/stegasoo/dct_steganography.py
+++ b/src/stegasoo/dct_steganography.py
@@ -1,22 +1,30 @@
 """
 DCT Domain Steganography Module (v4.1.0)
 
-Embeds data in DCT coefficients with two approaches:
-1. PNG output: Scipy-based DCT transform (grayscale or color)
-2. JPEG output: jpegio-based coefficient manipulation (if available)
+The fancy pants mode. Instead of hiding bits in pixel values (LSB mode),
+we hide them in the *frequency domain* - specifically in the Discrete Cosine
+Transform coefficients that JPEG compression uses internally.
 
-v4.1.0 Changes:
-- Reed-Solomon error correction protects against bit errors in problematic blocks
-- Majority voting on length headers (3 copies) for additional robustness
-- RS can correct up to 16 byte errors per 223-byte chunk
+Why is this cool?
+- Survives some image processing that would destroy LSB data
+- Works with JPEG without the usual "save destroys everything" problem
+- Uses the same math that JPEG itself uses - we're hiding in plain sight
 
-v3.2.0-patch2 Changes:
-- Chunked processing for large images to avoid heap corruption
-- Process image in vertical strips to limit memory per operation
-- Isolated DCT operations with fresh array allocations
-- Workaround for scipy.fftpack memory issues
+Two approaches depending on what you want:
+1. PNG output: We do our own DCT math via scipy (works on any image)
+2. JPEG output: We use jpegio to directly tweak the coefficients (chef's kiss)
 
-Requires: scipy (for PNG mode), optionally jpegio (for JPEG mode), reedsolo (for error correction)
+v4.1.0 - The "please stop corrupting my data" release:
+- Reed-Solomon error correction (can fix up to 16 byte errors per chunk)
+- Majority voting on headers (store 3 copies, take the winner)
+- Because some image regions are just... problematic
+
+v3.2.0-patch2 - The "scipy why are you like this" release:
+- Chunked processing because scipy's FFT was corrupting memory on big images
+- Process blocks one at a time with fresh arrays
+- Yes, it's slower. No, I don't care. Correctness > speed.
+
+Requires: scipy (PNG mode), optionally jpegio (JPEG mode), reedsolo (error correction)
 """
 
 import gc
@@ -87,11 +95,31 @@ def _write_progress(progress_file: str | None, current: int, total: int, phase:
 # CONSTANTS
 # ============================================================================
 
+# JPEG uses 8x8 blocks for DCT - this is baked into the standard
 BLOCK_SIZE = 8
+
+# The zig-zag order of DCT coefficients. JPEG stores them this way because
+# the human eye is more sensitive to low frequencies (top-left corner)
+# than high frequencies (bottom-right). After quantization, most high-freq
+# coefficients become zero, so zig-zag gives great compression.
+#
+# Visual of an 8x8 DCT block with zig-zag numbering:
+#
+#   DC  1   5   6  14  15  27  28     <- Low frequency (smooth gradients)
+#    2  4   7  13  16  26  29  42
+#    3  8  12  17  25  30  41  43
+#    9 11  18  24  31  40  44  53
+#   10 19  23  32  39  45  52  54
+#   20 22  33  38  46  51  55  60
+#   21 34  37  47  50  56  59  61
+#   35 36  48  49  57  58  62  63     <- High frequency (fine detail/noise)
+#
+# Position (0,0) is the DC coefficient - the average brightness of the block.
+# We NEVER touch DC because changing it causes visible brightness shifts.
 EMBED_POSITIONS = [
-    (0, 1),
-    (1, 0),
-    (2, 0),
+    (0, 1),   # 1st AC coefficient
+    (1, 0),   # 2nd AC coefficient
+    (2, 0),   # ... and so on in zig-zag order
     (1, 1),
     (0, 2),
     (0, 3),
@@ -124,32 +152,59 @@ EMBED_POSITIONS = [
     (6, 1),
     (7, 0),
 ]
+
+# We use indices 4-19 of EMBED_POSITIONS (mid-frequency range). Here's the reasoning:
+# - Indices 0-3: Too low frequency, changes are visible as color shifts
+# - Indices 4-19: Sweet spot - carries enough energy to survive, not visible
+# - Indices 20+: High frequency, often quantized to zero, unreliable
 DEFAULT_EMBED_POSITIONS = EMBED_POSITIONS[4:20]
+
+# Quantization step for QIM (Quantization Index Modulation).
+# This is how we actually embed bits: we round the coefficient to a grid
+# and then nudge it based on whether we want a 0 or 1.
+# Bigger step = more robust to noise, but more visible. 25 is a good balance.
 QUANT_STEP = 25
-DCT_MAGIC = b"DCTS"
-HEADER_SIZE = 10
+
+# Magic bytes so we can identify our own images
+DCT_MAGIC = b"DCTS"      # scipy DCT mode marker
+JPEGIO_MAGIC = b"JPGS"   # jpegio native JPEG mode marker
+HEADER_SIZE = 10         # Magic (4) + version (1) + flags (1) + length (4)
+
 OUTPUT_FORMAT_PNG = "png"
 OUTPUT_FORMAT_JPEG = "jpeg"
-JPEG_OUTPUT_QUALITY = 95
-JPEGIO_MAGIC = b"JPGS"
+JPEG_OUTPUT_QUALITY = 95  # High quality but not 100 (100 causes issues, see below)
+
+# For jpegio mode: we only embed in coefficients with magnitude >= 2
+# Coefficients of 0 or 1 are usually quantized noise - unreliable
 JPEGIO_MIN_COEF_MAGNITUDE = 2
+
+# We embed in the Y (luminance) channel only - it has the most capacity
+# Cb/Cr are often subsampled 4:2:0 anyway
 JPEGIO_EMBED_CHANNEL = 0
-FLAG_COLOR_MODE = 0x01
-FLAG_RS_PROTECTED = 0x02  # Reed-Solomon error correction enabled
 
-# Reed-Solomon settings - 32 symbols can correct up to 16 byte errors per 223-byte chunk
+# Header flags
+FLAG_COLOR_MODE = 0x01      # Set if we preserved color (YCbCr mode)
+FLAG_RS_PROTECTED = 0x02    # Set if Reed-Solomon protected (v4.1.0+)
+
+# Reed-Solomon settings - the "please don't lose my data" system
+# 32 parity symbols per chunk means we can correct up to 16 byte errors
+# Math: RS(255, 223) where 255-223=32 parity bytes, corrects floor(32/2)=16
 RS_NSYM = 32
-RS_LENGTH_HEADER_SIZE = 8  # 8 bytes: 4 for raw_payload_length + 4 for rs_payload_length
-RS_LENGTH_COPIES = 3  # Store length header 3 times for majority voting
-RS_LENGTH_PREFIX_SIZE = RS_LENGTH_HEADER_SIZE * RS_LENGTH_COPIES  # Total: 24 bytes
 
-# Chunking settings for large images
-MAX_CHUNK_HEIGHT = 512  # Process in 512-pixel tall strips
+# We store the payload length 3 times and take majority vote
+# Because if the length is wrong, everything is wrong
+RS_LENGTH_HEADER_SIZE = 8   # 4 bytes raw length + 4 bytes RS-encoded length
+RS_LENGTH_COPIES = 3        # Store 3 copies, need 2 to agree
+RS_LENGTH_PREFIX_SIZE = RS_LENGTH_HEADER_SIZE * RS_LENGTH_COPIES  # 24 bytes total
 
-# JPEG normalization settings
-# JPEGs with quality=100 have all quantization values = 1, which crashes jpegio
-JPEGIO_NORMALIZE_QUALITY = 95  # Re-save quality for problematic JPEGs
-JPEGIO_MAX_QUANT_VALUE_THRESHOLD = 1  # If all quant values <= this, normalize
+# Chunking for large images - scipy's FFT gets memory-corrupty on huge arrays
+MAX_CHUNK_HEIGHT = 512  # Process in strips to keep memory sane
+
+# Fun bug: JPEGs saved with quality=100 have quantization tables full of 1s
+# This makes the DCT coefficients HUGE and jpegio crashes spectacularly
+# Solution: detect and re-save at quality 95 first
+JPEGIO_NORMALIZE_QUALITY = 95
+JPEGIO_MAX_QUANT_VALUE_THRESHOLD = 1  # All 1s in quant table = bad news
 
 
 # ============================================================================
@@ -209,13 +264,26 @@ def has_jpegio_support() -> bool:
 
 # ============================================================================
 # REED-SOLOMON ERROR CORRECTION
-# Protects against bit errors in problematic image blocks
 # ============================================================================
+#
+# Why do we need this? DCT embedding isn't perfect. Some image regions are
+# problematic - flat areas, high compression, edge cases. Bits can flip.
+#
+# Reed-Solomon is the same error correction used in CDs, DVDs, QR codes, and
+# deep space communications. If it's good enough for Voyager, it's good enough
+# for hiding cat pictures in other cat pictures.
+#
+# How it works (simplified):
+# 1. Take your data bytes
+# 2. Add extra "parity" bytes calculated from the data
+# 3. If some bytes get corrupted, the math lets you reconstruct them
+# 4. RS(255, 223) means: 255 byte blocks, 223 data + 32 parity
+# 5. Can correct up to 16 corrupted bytes per block (floor(32/2))
+#
+# The tradeoff: ~14% overhead (32/223). Worth it for reliability.
 
-# Check for reedsolo availability
 try:
     from reedsolo import ReedSolomonError, RSCodec
-
     HAS_REEDSOLO = True
 except ImportError:
     HAS_REEDSOLO = False
@@ -224,48 +292,78 @@ except ImportError:
 
 
 def _rs_encode(data: bytes) -> bytes:
-    """Add Reed-Solomon error correction symbols to data."""
+    """
+    Wrap data in Reed-Solomon error correction.
+
+    Takes your precious payload and adds parity bytes so we can
+    recover from the inevitable bit-rot of DCT embedding.
+    """
     if not HAS_REEDSOLO:
-        return data  # No protection if reedsolo not available
+        return data  # YOLO mode - no protection, good luck
     rs = RSCodec(RS_NSYM)
     return bytes(rs.encode(data))
 
 
 def _rs_decode(data: bytes) -> bytes:
-    """Decode Reed-Solomon protected data, correcting errors if possible."""
+    """
+    Decode Reed-Solomon protected data, fixing errors along the way.
+
+    This is where the magic happens. If bits got flipped during
+    extraction, RS will quietly fix them. If too many flipped...
+    well, we tried.
+    """
     if not HAS_REEDSOLO:
-        return data  # No decoding if reedsolo not available
+        return data
     rs = RSCodec(RS_NSYM)
     try:
         decoded, _, errata_pos = rs.decode(data)
         if errata_pos:
-            pass  # Errors were corrected
+            # Errors were found and corrected - RS earned its keep today
+            pass
         return bytes(decoded)
     except ReedSolomonError as e:
+        # Too many errors - the image got mangled beyond repair
         raise StegasooRSError(f"Image corrupted beyond repair: {e}") from e
 
 
 # ============================================================================
 # SAFE DCT FUNCTIONS
-# These create fresh arrays to avoid scipy memory corruption issues
 # ============================================================================
+#
+# Story time: scipy's fftpack (the old DCT implementation) has memory issues
+# when you process large images. We'd get random garbage in our output, or
+# worse, segfaults. Turns out it was reusing internal buffers in unsafe ways.
+#
+# The fix? Be paranoid. Every single array operation creates a fresh copy.
+# Is it slower? Yes. Does it work? Also yes. I'll take correct over fast.
+#
+# The newer scipy.fft module is better, but we still play it safe because
+# not everyone has the latest scipy and I don't want debugging nightmares.
 
 
 def _safe_dct2(block: np.ndarray) -> np.ndarray:
     """
-    Apply 2D DCT with memory isolation.
-    Creates a completely fresh array to avoid heap corruption.
+    Apply 2D DCT (Discrete Cosine Transform) to an 8x8 block.
+
+    The DCT converts spatial data (pixel values) into frequency data
+    (how much of each frequency component is present). It's the heart
+    of JPEG compression.
+
+    We do it row-by-row and column-by-column with fresh arrays each time
+    because scipy's built-in dct2 can corrupt memory on large batches.
+    Paranoid? Yes. Necessary? Also yes.
     """
-    # Create a brand new array (not a view)
+    # Create a brand new array (not a view) - paranoia level: maximum
     safe_block = np.array(block, dtype=np.float64, copy=True, order="C")
 
-    # First DCT on columns (transpose -> DCT rows -> transpose back)
+    # 2D DCT = 1D DCT on columns, then 1D DCT on rows (separable transform)
+    # First pass: DCT each column
     temp = np.zeros_like(safe_block, dtype=np.float64, order="C")
     for i in range(BLOCK_SIZE):
         col = np.array(safe_block[:, i], dtype=np.float64, copy=True)
-        temp[:, i] = dct(col, norm="ortho")
+        temp[:, i] = dct(col, norm="ortho")  # ortho normalization for symmetry
 
-    # Second DCT on rows
+    # Second pass: DCT each row of the result
     result = np.zeros_like(temp, dtype=np.float64, order="C")
     for i in range(BLOCK_SIZE):
         row = np.array(temp[i, :], dtype=np.float64, copy=True)
@@ -276,19 +374,22 @@ def _safe_dct2(block: np.ndarray) -> np.ndarray:
 
 def _safe_idct2(block: np.ndarray) -> np.ndarray:
     """
-    Apply 2D inverse DCT with memory isolation.
-    Creates a completely fresh array to avoid heap corruption.
+    Apply 2D inverse DCT - convert frequency data back to pixels.
+
+    After we've embedded our secret bits in the DCT coefficients,
+    we need to convert back to pixel values. This is the reverse
+    of _safe_dct2.
+
+    Same paranoid memory handling because same paranoid developer.
     """
-    # Create a brand new array (not a view)
     safe_block = np.array(block, dtype=np.float64, copy=True, order="C")
 
-    # First IDCT on rows
+    # Inverse is the same idea: IDCT rows, then IDCT columns
     temp = np.zeros_like(safe_block, dtype=np.float64, order="C")
     for i in range(BLOCK_SIZE):
         row = np.array(safe_block[i, :], dtype=np.float64, copy=True)
         temp[i, :] = idct(row, norm="ortho")
 
-    # Second IDCT on columns
     result = np.zeros_like(temp, dtype=np.float64, order="C")
     for i in range(BLOCK_SIZE):
         col = np.array(temp[:, i], dtype=np.float64, copy=True)
@@ -348,8 +449,25 @@ def _unpad_image(image: np.ndarray, original_size: tuple[int, int]) -> np.ndarra
 
 
 def _embed_bit_in_coeff(coef: float, bit: int, quant_step: int = QUANT_STEP) -> float:
+    """
+    Embed a single bit into a DCT coefficient using QIM.
+
+    QIM (Quantization Index Modulation) is smarter than simple LSB flipping.
+    Instead of just changing the last bit, we round to a quantization grid
+    and use odd/even to encode 0/1.
+
+    Why is this better?
+    - More robust to noise (small changes don't flip the bit)
+    - Works naturally with JPEG's own quantization
+    - The change is spread across the coefficient's magnitude
+
+    Visual example (quant_step=25):
+    - Coef = 73, want bit=0 -> round to 75 (75/25=3, 3%2=1) -> nudge to 50 (50/25=2, 2%2=0)
+    - Coef = 73, want bit=1 -> round to 75 (75/25=3, 3%2=1) -> already odd, keep at 75
+    """
     quantized = round(coef / quant_step)
     if (quantized % 2) != bit:
+        # Need to flip even<->odd. Nudge in the direction that's closest.
         if quantized % 2 == 0 and bit == 1:
             quantized += 1 if coef >= quantized * quant_step else -1
         elif quantized % 2 == 1 and bit == 0:
@@ -358,13 +476,35 @@ def _embed_bit_in_coeff(coef: float, bit: int, quant_step: int = QUANT_STEP) ->
 
 
 def _extract_bit_from_coeff(coef: float, quant_step: int = QUANT_STEP) -> int:
+    """
+    Extract a bit from a DCT coefficient.
+
+    The inverse of _embed_bit_in_coeff. We round to the quantization grid
+    and check if it's odd (1) or even (0).
+
+    This is why QIM is robust: small noise in the coefficient usually
+    doesn't change which grid point we round to.
+    """
     quantized = round(coef / quant_step)
     return int(quantized % 2)
 
 
 def _generate_block_order(num_blocks: int, seed: bytes) -> list:
+    """
+    Generate a pseudo-random order for processing blocks.
+
+    This is crucial for security - if we just went left-to-right, top-to-bottom,
+    anyone could find the message by checking blocks in order. Instead, we
+    use a keyed shuffle so only someone with the same seed can find the data.
+
+    The seed comes from the crypto layer (derived from passphrase + photo + pin),
+    so the block order is effectively part of the encryption.
+    """
+    # Hash the seed with SHA-256; only the first 4 bytes of the digest seed the RNG below
     hash_bytes = hashlib.sha256(seed).digest()
+    # Seed numpy's RNG (we use RandomState for reproducibility across versions)
     rng = np.random.RandomState(int.from_bytes(hash_bytes[:4], "big"))
+    # Fisher-Yates shuffle
     order = list(range(num_blocks))
     rng.shuffle(order)
     return order
@@ -393,14 +533,28 @@ def _save_color_image(rgb_array: np.ndarray, output_format: str = OUTPUT_FORMAT_
 
 
 def _rgb_to_ycbcr(rgb: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Convert RGB to YCbCr color space.
+
+    YCbCr separates brightness (Y) from color (Cb=blue-ish, Cr=red-ish).
+    This is what JPEG uses internally, and it's great for us because:
+    - Human eyes are WAY more sensitive to brightness than color
+    - We can hide data in Y without it being as visible
+    - Cb/Cr are often subsampled (4:2:0) so Y has more capacity anyway
+
+    The coefficients here are from ITU-R BT.601 - the standard for video.
+    """
     R = rgb[:, :, 0].astype(np.float64)
     G = rgb[:, :, 1].astype(np.float64)
     B = rgb[:, :, 2].astype(np.float64)
 
+    # Y = luminance (brightness). Green contributes most because eyes are most sensitive to it.
     Y = np.array(0.299 * R + 0.587 * G + 0.114 * B, dtype=np.float64, copy=True, order="C")
+    # Cb = blue-difference chroma (centered at 128)
     Cb = np.array(
         128 - 0.168736 * R - 0.331264 * G + 0.5 * B, dtype=np.float64, copy=True, order="C"
     )
+    # Cr = red-difference chroma (centered at 128)
     Cr = np.array(
         128 + 0.5 * R - 0.418688 * G - 0.081312 * B, dtype=np.float64, copy=True, order="C"
     )
@@ -409,6 +563,12 @@ def _rgb_to_ycbcr(rgb: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
 
 
 def _ycbcr_to_rgb(Y: np.ndarray, Cb: np.ndarray, Cr: np.ndarray) -> np.ndarray:
+    """
+    Convert YCbCr back to RGB.
+
+    After embedding in the Y channel, we need to reconstruct RGB for display.
+    The Cb/Cr channels are unchanged - we only touched luminance.
+    """
     R = Y + 1.402 * (Cr - 128)
     G = Y - 0.344136 * (Cb - 128) - 0.714136 * (Cr - 128)
     B = Y + 1.772 * (Cb - 128)
diff --git a/src/stegasoo/steganography.py b/src/stegasoo/steganography.py
index 139e3b2..6eb6899 100644
--- a/src/stegasoo/steganography.py
+++ b/src/stegasoo/steganography.py
@@ -1,21 +1,27 @@
 """
 Stegasoo Steganography Functions (v3.2.0)
 
-LSB and DCT embedding modes with pseudo-random pixel/coefficient selection.
+This is the core embedding/extraction module. Two modes available:
 
-Changes in v3.0:
-- DCT domain embedding mode (requires scipy)
-- embed_mode parameter for encode/decode
-- Auto-detection of embedding mode
-- Comparison utilities
+LSB (Least Significant Bit) Mode:
+- Classic steganography technique - hide bits in the least significant bit of pixel values
+- Works on any image, outputs lossless PNG/BMP
+- Higher capacity than DCT, but destroyed by JPEG compression
+- Great for: high-capacity needs, lossless workflows
 
-Changes in v3.0.1:
-- dct_output_format parameter for DCT mode ('png' or 'jpeg')
-- dct_color_mode parameter for DCT mode ('grayscale' or 'color')
+DCT Mode (see dct_steganography.py):
+- Hides data in frequency-domain coefficients
+- Survives some image processing, works with JPEG
+- Lower capacity but more robust
+- Great for: JPEG images, robustness needs
 
-Changes in v3.2.0:
-- Fixed HEADER_OVERHEAD constant (65 bytes, not 104 - date field removed)
-- Updated ENCRYPTION_OVERHEAD calculation
+Both modes use pseudo-random pixel/coefficient selection based on a key.
+Without the key, you don't know where to look - security through obscurity
+PLUS actual encryption of the payload.
+
+v3.0: Added DCT mode with scipy
+v3.0.1: DCT output format options (PNG/JPEG, grayscale/color)
+v3.2.0: Fixed overhead calculations after removing date field
 """
 
 import io
@@ -83,24 +89,31 @@ EXT_TO_FORMAT = {
 }
 
 # =============================================================================
-# OVERHEAD CONSTANTS (v4.0.0 - Updated for channel key support)
+# OVERHEAD CONSTANTS
 # =============================================================================
-# v4.0.0 Header format (with flags byte for channel key indicator):
-#   Magic:   4 bytes  (\x89ST3)
-#   Version: 1 byte   (5 for v4.0.0)
-#   Flags:   1 byte   (bit 0 = has channel key)
-#   Salt:    32 bytes
-#   IV:      12 bytes
-#   Tag:     16 bytes
-#   -----------------
-#   Total:   66 bytes
 #
-# v3.2.0 had 65 bytes (no flags byte)
-# v3.1.0 had date field (10 bytes + 1 byte length) = 76 bytes header
+# Every stego image has some overhead before the actual payload:
+#
+# The encrypted message format (v4.0.0):
+# ┌─────────┬────┬───────┬─────────────┬───────────┬────────────┬─────┐
+# │ \x89ST3 │ v5 │ flags │  salt (32)  │  iv (12)  │  tag (16)  │ ... │
+# │ magic   │ ver│       │             │           │            │ data│
+# └─────────┴────┴───────┴─────────────┴───────────┴────────────┴─────┘
+#   4 bytes   1      1          32           12          16       var
+#
+# Plus LSB embedding adds a 4-byte length prefix so we know where to stop.
+#
+# History of overhead sizes (in case you're debugging old images):
+# - v3.1.0: 76 bytes (had date field - 10+1 bytes)
+# - v3.2.0: 65 bytes (removed date, simpler)
+# - v4.0.0: 66 bytes (added flags byte for channel key)
 
-HEADER_OVERHEAD = 66  # v4.0.0: Magic + version + flags + salt + iv + tag
-LENGTH_PREFIX = 4  # 4 bytes for payload length in LSB embedding
-ENCRYPTION_OVERHEAD = HEADER_OVERHEAD + LENGTH_PREFIX  # 70 bytes total
+HEADER_OVERHEAD = 66  # What the crypto layer adds to any message
+LENGTH_PREFIX = 4     # We prepend the payload length for LSB extraction
+ENCRYPTION_OVERHEAD = HEADER_OVERHEAD + LENGTH_PREFIX  # Total: 70 bytes
+
+# Those 70 bytes are the minimum capacity an image must provide.
+# A tiny 100x100 RGB image holds 30000 LSBs = 3750 bytes, minus 70 = 3680 usable.
 
 # DCT output format options (v3.0.1)
 DCT_OUTPUT_PNG = "png"
@@ -456,6 +469,20 @@ def compare_modes(image_data: bytes) -> dict:
 # =============================================================================
 # PIXEL INDEX GENERATION
 # =============================================================================
+#
+# The key insight: we don't hide data in sequential pixels (that's easy to find).
+# Instead, we scatter the data across pseudo-random pixel locations.
+#
+# The pixel selection key (derived from passphrase + photo + pin) determines
+# WHICH pixels get modified. Without the key, an attacker would have to:
+# 1. Know we're using LSB steganography
+# 2. Try every possible subset of pixels
+# 3. Decrypt the result (which they also can't do without the key)
+#
+# We use ChaCha20 as a CSPRNG (Cryptographically Secure PRNG). It's:
+# - Fast in software (outperforms AES-CTR on CPUs without AES hardware acceleration)
+# - Deterministic (same key = same sequence, needed for extraction)
+# - Secure (can't predict the sequence without the key)
 
 
 @debug.time
@@ -463,8 +490,13 @@ def generate_pixel_indices(key: bytes, num_pixels: int, num_needed: int) -> list
     """
     Generate pseudo-random pixel indices for embedding.
 
-    Uses ChaCha20 as a CSPRNG seeded by the key to deterministically
-    select which pixels will hold hidden data.
+    This is the "where do we hide the bits?" function. We use ChaCha20
+    to generate a deterministic sequence of pixel indices that only
+    someone with the same key can reproduce.
+
+    Two strategies based on how much of the image we're using:
+    - num_needed >= num_pixels // 2: full Fisher-Yates shuffle of all pixel indices
+    - otherwise: direct random sampling, skipping repeats (faster; indices are still unique)
     """
     debug.validate(len(key) == 32, f"Pixel key must be 32 bytes, got {len(key)}")
     debug.validate(num_pixels > 0, f"Number of pixels must be positive, got {num_pixels}")
@@ -475,6 +507,8 @@ def generate_pixel_indices(key: bytes, num_pixels: int, num_needed: int) -> list
 
     debug.print(f"Generating {num_needed} pixel indices from {num_pixels} total pixels")
 
+    # Strategy 1: Full shuffle when we need a lot of pixels
+    # Fisher-Yates shuffle is O(n) and samples without replacement (no duplicate indices)
     if num_needed >= num_pixels // 2:
         debug.print(f"Using full shuffle (needed {num_needed}/{num_pixels} pixels)")
         nonce = b"\x00" * 16
@@ -482,8 +516,10 @@ def generate_pixel_indices(key: bytes, num_pixels: int, num_needed: int) -> list
         encryptor = cipher.encryptor()
 
         indices = list(range(num_pixels))
+        # Get enough random bytes to do the shuffle
         random_bytes = encryptor.update(b"\x00" * (num_pixels * 4))
 
+        # Fisher-Yates shuffle - walk i from the end, swapping with a random index j in 0..i
         for i in range(num_pixels - 1, 0, -1):
             j_bytes = random_bytes[(num_pixels - 1 - i) * 4 : (num_pixels - i) * 4]
             j = int.from_bytes(j_bytes, "big") % (i + 1)
@@ -493,14 +529,17 @@ def generate_pixel_indices(key: bytes, num_pixels: int, num_needed: int) -> list
         debug.print(f"Generated {len(selected)} indices via shuffle")
         return selected
 
+    # Strategy 2: Direct sampling when we need fewer pixels
+    # Generate random indices until we have enough unique ones
     debug.print(f"Using optimized selection (needed {num_needed}/{num_pixels} pixels)")
     selected = []
-    used = set()
+    used = set()  # Track which pixels we've already picked
 
     nonce = b"\x00" * 16
     cipher = Cipher(algorithms.ChaCha20(key, nonce), mode=None, backend=default_backend())
     encryptor = cipher.encryptor()
 
+    # Pre-generate 2x the bytes we think we'll need (for collision handling)
     bytes_needed = (num_needed * 2) * 4
     random_bytes = encryptor.update(b"\x00" * bytes_needed)
 
@@ -514,8 +553,9 @@ def generate_pixel_indices(key: bytes, num_pixels: int, num_needed: int) -> list
             used.add(idx)
             selected.append(idx)
         else:
-            collisions += 1
+            collisions += 1  # Birthday paradox in action
 
+    # Edge case: ran out of pre-generated bytes (very high collision rate)
     if len(selected) < num_needed:
         debug.print(f"Need {num_needed - len(selected)} more indices, generating...")
         extra_needed = num_needed - len(selected)
@@ -539,6 +579,23 @@ def generate_pixel_indices(key: bytes, num_pixels: int, num_needed: int) -> list
 # =============================================================================
 # EMBEDDING FUNCTIONS
 # =============================================================================
+#
+# The actual bit-hiding magic happens here. LSB embedding is conceptually simple:
+#
+# Original pixel RGB: (142, 87, 201)
+# In binary:          (10001110, 01010111, 11001001)
+#                             ^         ^         ^
+#                      These are the LSBs (least significant bits)
+#
+# To hide the bits [1, 0, 1]:
+# Modified pixel RGB: (10001111, 01010110, 11001001) = (143, 86, 201)
+#                             ^         ^         ^
+#                          changed   changed   unchanged (already 1)
+#
+# The human eye can't see the difference between 142 and 143.
+# But we've hidden 3 bits of secret data in one pixel.
+#
+# With a 1000x1000 image: 1 million pixels * 3 channels = 3 million bits = 375 KB!
 
 
 @debug.time