Merge stegasoo (v4.3.0, steganography) and verisoo (v0.1.0, attestation) as subpackages under soosef.stegasoo and soosef.verisoo. This eliminates cross-repo coordination and enables atomic changes across the full stack. - Copy stegasoo (34 modules) and verisoo (15 modules) into src/soosef/ - Convert all verisoo absolute imports to relative imports - Rewire ~50 import sites across soosef code (cli, web, keystore, tests) - Replace stegasoo/verisoo pip deps with inlined code + pip extras (stego-dct, stego-audio, attest, web, api, cli, fieldkit, all, dev) - Add _availability.py for runtime feature detection - Add unified FastAPI mount point at soosef.api - Copy and adapt tests from both repos (155 pass, 1 skip) - Drop standalone CLI/web frontends; keep FastAPI as optional modules - Both source repos tagged pre-monorepo-consolidation on GitHub Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
282 lines
8.9 KiB
Python
282 lines
8.9 KiB
Python
"""
|
|
Steganalysis Self-Check Module (v4.4.0)
|
|
|
|
Statistical analysis to estimate detectability risk of stego images.
|
|
Runs chi-square and RS (Regular-Singular) analysis on pixel data
|
|
to assess how visible the embedding is to an attacker.
|
|
|
|
Currently LSB-only. DCT steganalysis (calibration attack) deferred.
|
|
|
|
Usage::
|
|
|
|
    from soosef.stegasoo.steganalysis import check_image
|
|
|
|
result = check_image(image_data)
|
|
print(result["risk"]) # "low", "medium", or "high"
|
|
print(result["chi_square"]) # per-channel chi-square p-values
|
|
print(result["rs"]) # per-channel RS embedding estimates
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import io
|
|
from dataclasses import dataclass, field
|
|
|
|
import numpy as np
|
|
from PIL import Image
|
|
|
|
from .constants import (
|
|
STEGANALYSIS_CHI_SUSPICIOUS_THRESHOLD,
|
|
STEGANALYSIS_RS_HIGH_THRESHOLD,
|
|
STEGANALYSIS_RS_MEDIUM_THRESHOLD,
|
|
)
|
|
|
|
|
|
@dataclass
class SteganalysisResult:
    """Outcome of statistical steganalysis for a single image.

    Attributes:
        risk: Overall detectability verdict — "low", "medium", or "high".
        chi_square: Chi-square p-value keyed by channel name.
        rs: RS embedding-rate estimate keyed by channel name.
        width: Image width in pixels.
        height: Image height in pixels.
        channels: Number of channels analyzed (alpha is excluded).
        mode: Analysis mode used (currently "lsb").
    """

    risk: str
    chi_square: dict = field(default_factory=dict)
    rs: dict = field(default_factory=dict)
    width: int = 0
    height: int = 0
    channels: int = 0
    mode: str = "lsb"
|
|
|
|
|
|
def chi_square_analysis(channel_data: np.ndarray) -> float:
    """Chi-square test on LSB distribution of a single channel.

    Groups pixel values into pairs (2i, 2i+1) — so-called "pairs of values"
    (PoVs). In a clean image, each pair has a natural frequency ratio.
    LSB embedding with random data forces each pair toward equal frequency.

    The test measures H0: "pairs are equalized" (consistent with embedding).

    Args:
        channel_data: Flattened 1-D array of pixel values (uint8).

    Returns:
        p-value from chi-square test.
        HIGH p-value (close to 1.0) → pairs are equalized → suspicious.
        LOW p-value (close to 0.0) → pairs are not equalized → less suspicious.
    """
    from scipy.stats import chi2

    # Count occurrences of each value 0-255
    histogram = np.bincount(channel_data.ravel(), minlength=256)

    # Group into 128 pairs: (0,1), (2,3), ..., (254,255). Computed
    # vectorized: compare each pair's counts against an equalized split.
    even = histogram[0::2].astype(np.float64)
    odd = histogram[1::2].astype(np.float64)
    totals = even + odd
    populated = totals > 0  # empty pairs carry no information

    degrees_of_freedom = int(np.count_nonzero(populated))
    if degrees_of_freedom == 0:
        return 1.0  # No data to analyze

    expected = totals[populated] / 2.0
    chi_sq = float(
        np.sum((even[populated] - expected) ** 2 / expected)
        + np.sum((odd[populated] - expected) ** 2 / expected)
    )

    # p-value: probability of observing a chi-square at least this large
    # under H0. A HIGH p-value (small chi-square) means the pair counts are
    # near-equal — consistent with LSB embedding — which matches the
    # docstring above and the high-threshold check in assess_risk().
    # (The previous trailing comment claimed the opposite polarity.)
    p_value = 1.0 - chi2.cdf(chi_sq, degrees_of_freedom)
    return float(p_value)
|
|
|
|
|
|
def rs_analysis(channel_data: np.ndarray, block_size: int = 8) -> float:
    """Regular-Singular groups analysis on a single channel.

    Divides the image channel into groups of `block_size` pixels and measures
    the "smoothness" (variation) of each group. Applying a flipping function
    F1 (flip LSB) and F-1 (flip LSB of value-1) produces Regular (smoother)
    and Singular (rougher) groups.

    In a clean image: R_m ≈ R_{-m} and S_m ≈ S_{-m}.
    LSB embedding causes R_m and S_{-m} to converge while S_m and R_{-m}
    diverge, allowing estimation of the embedding rate.

    Args:
        channel_data: Flattened 1-D array of pixel values (uint8).
        block_size: Number of pixels per group (default 8).

    Returns:
        Estimated embedding rate (0.0 = clean, 1.0 = fully embedded).
        Values > 0.5 strongly indicate LSB embedding.
    """
    data = channel_data.ravel().astype(np.int16)
    # Trim to a whole number of groups; bail out when too few for a
    # statistically meaningful count.
    n_blocks = len(data) // block_size
    if n_blocks < 10:
        return 0.0  # Not enough data

    blocks = data[: n_blocks * block_size].reshape(n_blocks, block_size)

    def variation(b: np.ndarray) -> np.ndarray:
        """Per-row smoothness: sum of |differences| between adjacent pixels."""
        return np.sum(np.abs(np.diff(b, axis=1)), axis=1, dtype=np.int64)

    # F1: flip LSB (0↔1, 2↔3, 4↔5, ...)
    flipped_pos = blocks ^ 1
    # F-1: flip LSB of (value - 1), i.e. -1↔0, 1↔2, 3↔4, ...
    # (int16 working dtype makes the transient -1 safe)
    flipped_neg = np.where(blocks % 2 == 0, blocks - 1, blocks + 1)

    v_orig = variation(blocks)
    v_f1 = variation(flipped_pos)
    v_fn1 = variation(flipped_neg)

    # Classify each group: Regular if flipping roughens it, Singular if
    # flipping smooths it; ties count as neither (same as the scalar loop).
    r_m = int(np.count_nonzero(v_f1 > v_orig))
    s_m = int(np.count_nonzero(v_f1 < v_orig))
    r_neg = int(np.count_nonzero(v_fn1 > v_orig))
    s_neg = int(np.count_nonzero(v_fn1 < v_orig))

    # Estimate embedding rate using the RS quadratic formula
    # d0 = R_m - S_m, d1 = R_{-m} - S_{-m}
    # The embedding rate p satisfies: d(p/2) = d0, d(1 - p/2) = d1
    # Simplified estimator: p ≈ (R_m - S_m) / (R_{-m} - S_{-m}) divergence
    d0 = r_m - s_m
    d1 = r_neg - s_neg
    # (a redundant n_blocks == 0 re-check lived here; it was unreachable
    # after the n_blocks < 10 guard above and has been removed)

    # Use the simplified dual-statistic estimator
    # In clean images: d0 ≈ d1 (both positive)
    # In embedded images: d0 → 0 while d1 stays positive
    if d1 == 0:
        # Can't estimate — likely very embedded or degenerate
        return 0.5 if d0 == 0 else 0.0

    # Ratio-based estimate: how much has d0 dropped relative to d1
    ratio = d0 / d1
    if ratio >= 1.0:
        return 0.0  # d0 ≥ d1 means no evidence of embedding
    if ratio <= 0.0:
        return 1.0  # d0 collapsed or inverted

    # Linear interpolation: ratio=1 → 0% embedded, ratio=0 → 100% embedded
    estimate = 1.0 - ratio
    return float(np.clip(estimate, 0.0, 1.0))
|
|
|
|
|
|
def assess_risk(chi_p_values: dict[str, float], rs_estimates: dict[str, float]) -> str:
    """Map analysis results to a risk level.

    RS analysis is the primary metric (reliable for both sequential and
    random-order embedding). Chi-square is supplementary — high p-values
    indicate equalized PoV pairs, which is suspicious for random LSB embedding.

    Args:
        chi_p_values: Per-channel chi-square p-values (high = suspicious).
        rs_estimates: Per-channel RS embedding rate estimates (high = suspicious).

    Returns:
        "low", "medium", or "high" detectability risk.
    """
    # Nothing analyzed at all → nothing to flag.
    if not chi_p_values and not rs_estimates:
        return "low"

    # Worst-case signal across channels; RS is the primary indicator.
    worst_rs = max(rs_estimates.values(), default=0.0)
    # Chi-square: a high p-value means pairs are equalized (suspicious).
    worst_chi_p = max(chi_p_values.values(), default=0.0)

    # Guard-clause ladder, highest severity first.
    if worst_rs > STEGANALYSIS_RS_HIGH_THRESHOLD:
        return "high"
    if worst_rs > STEGANALYSIS_RS_MEDIUM_THRESHOLD:
        return "medium"
    if worst_chi_p > STEGANALYSIS_CHI_SUSPICIOUS_THRESHOLD and worst_rs > 0.05:
        # Corroboration case: both detectors show at least a weak signal.
        return "medium"

    return "low"
|
|
|
|
|
|
def check_image(image_data: bytes, mode: str = "lsb") -> dict:
    """Run steganalysis on an image and return detectability assessment.

    Args:
        image_data: Raw image bytes (PNG, BMP, etc.).
        mode: Analysis mode — currently only "lsb" is supported.

    Returns:
        Dict with keys: risk, chi_square, rs, width, height, channels, mode.
    """
    # Validate mode up front ("auto" currently behaves the same as "lsb").
    if mode not in ("lsb", "auto"):
        raise ValueError(f"Unsupported steganalysis mode: {mode}. Use 'lsb' or 'auto'.")

    # Decode the image; normalize exotic modes (palette, CMYK, ...) to RGB.
    img = Image.open(io.BytesIO(image_data))
    if img.mode not in ("RGB", "RGBA", "L"):
        img = img.convert("RGB")

    width, height = img.size
    pixels = np.array(img)
    img.close()

    # Grayscale images get a dummy channel axis so indexing is uniform.
    if pixels.ndim == 2:
        channel_names = ["L"]
        pixels = pixels[:, :, np.newaxis]
    else:
        channel_names = ["R", "G", "B"]

    num_channels = min(pixels.shape[2], 3)  # Skip alpha

    # Run both detectors per channel.
    flat_channels = {
        channel_names[idx]: pixels[:, :, idx].ravel() for idx in range(num_channels)
    }
    chi_p_values = {name: chi_square_analysis(ch) for name, ch in flat_channels.items()}
    rs_estimates = {name: rs_analysis(ch) for name, ch in flat_channels.items()}

    verdict = SteganalysisResult(
        risk=assess_risk(chi_p_values, rs_estimates),
        chi_square=chi_p_values,
        rs=rs_estimates,
        width=width,
        height=height,
        channels=num_channels,
        mode=mode,
    )

    # Flatten the dataclass into the plain-dict public return shape.
    return {
        "risk": verdict.risk,
        "chi_square": verdict.chi_square,
        "rs": verdict.rs,
        "width": verdict.width,
        "height": verdict.height,
        "channels": verdict.channels,
        "mode": verdict.mode,
    }
|