""" Steganalysis Self-Check Module (v4.4.0) Statistical analysis to estimate detectability risk of stego images. Runs chi-square and RS (Regular-Singular) analysis on pixel data to assess how visible the embedding is to an attacker. Currently LSB-only. DCT steganalysis (calibration attack) deferred. Usage:: from stegasoo.steganalysis import check_image result = check_image(image_data) print(result["risk"]) # "low", "medium", or "high" print(result["chi_square"]) # per-channel chi-square p-values print(result["rs"]) # per-channel RS embedding estimates """ from __future__ import annotations import io from dataclasses import dataclass, field import numpy as np from PIL import Image from .constants import ( STEGANALYSIS_CHI_SUSPICIOUS_THRESHOLD, STEGANALYSIS_RS_HIGH_THRESHOLD, STEGANALYSIS_RS_MEDIUM_THRESHOLD, ) @dataclass class SteganalysisResult: """Result of steganalysis on an image.""" risk: str # "low", "medium", or "high" chi_square: dict = field(default_factory=dict) # per-channel p-values rs: dict = field(default_factory=dict) # per-channel embedding estimates width: int = 0 height: int = 0 channels: int = 0 mode: str = "lsb" def chi_square_analysis(channel_data: np.ndarray) -> float: """Chi-square test on LSB distribution of a single channel. Groups pixel values into pairs (2i, 2i+1) — so-called "pairs of values" (PoVs). In a clean image, each pair has a natural frequency ratio. LSB embedding with random data forces each pair toward equal frequency. The test measures H0: "pairs are equalized" (consistent with embedding). Args: channel_data: Flattened 1-D array of pixel values (uint8). Returns: p-value from chi-square test. HIGH p-value (close to 1.0) → pairs are equalized → suspicious. LOW p-value (close to 0.0) → pairs are not equalized → less suspicious. """ from scipy.stats import chi2 # Count occurrences of each value 0-255 histogram = np.bincount(channel_data.ravel(), minlength=256) # Group into 128 pairs: (0,1), (2,3), ..., (254,255) chi_sq = 0.0 degrees_of_freedom = 0 for i in range(0, 256, 2): observed_even = histogram[i] observed_odd = histogram[i + 1] total = observed_even + observed_odd if total == 0: continue expected = total / 2.0 chi_sq += (observed_even - expected) ** 2 / expected chi_sq += (observed_odd - expected) ** 2 / expected degrees_of_freedom += 1 if degrees_of_freedom == 0: return 1.0 # No data to analyze # p-value: probability of observing this chi-square value by chance # Low p-value = LSBs are suspiciously uniform = likely embedded p_value = 1.0 - chi2.cdf(chi_sq, degrees_of_freedom) return float(p_value) def rs_analysis(channel_data: np.ndarray, block_size: int = 8) -> float: """Regular-Singular groups analysis on a single channel. Divides the image channel into groups of `block_size` pixels and measures the "smoothness" (variation) of each group. Applying a flipping function F1 (flip LSB) and F-1 (flip LSB of value-1) produces Regular (smoother) and Singular (rougher) groups. In a clean image: R_m ≈ R_{-m} and S_m ≈ S_{-m}. LSB embedding causes R_m and S_{-m} to converge while S_m and R_{-m} diverge, allowing estimation of the embedding rate. Args: channel_data: Flattened 1-D array of pixel values (uint8). block_size: Number of pixels per group (default 8). Returns: Estimated embedding rate (0.0 = clean, 1.0 = fully embedded). Values > 0.5 strongly indicate LSB embedding. """ data = channel_data.ravel().astype(np.int16) n = len(data) # Trim to multiple of block_size n_blocks = n // block_size if n_blocks < 10: return 0.0 # Not enough data data = data[: n_blocks * block_size].reshape(n_blocks, block_size) def variation(block: np.ndarray) -> float: """Sum of absolute differences between adjacent pixels.""" return float(np.sum(np.abs(np.diff(block)))) def flip_positive(block: np.ndarray) -> np.ndarray: """F1: flip LSB (0↔1, 2↔3, 4↔5, ...).""" return block ^ 1 def flip_negative(block: np.ndarray) -> np.ndarray: """F-1: flip LSB of (value - 1), i.e. -1↔0, 1↔2, 3↔4, ...""" result = block.copy() even_mask = (block % 2) == 0 result[even_mask] -= 1 result[~even_mask] += 1 return result r_m = s_m = r_neg = s_neg = 0 for i in range(n_blocks): block = data[i] v_orig = variation(block) v_f1 = variation(flip_positive(block)) if v_f1 > v_orig: r_m += 1 elif v_f1 < v_orig: s_m += 1 v_fn1 = variation(flip_negative(block)) if v_fn1 > v_orig: r_neg += 1 elif v_fn1 < v_orig: s_neg += 1 # Estimate embedding rate using the RS quadratic formula # d0 = R_m - S_m, d1 = R_{-m} - S_{-m} # The embedding rate p satisfies: d(p/2) = d0, d(1 - p/2) = d1 # Simplified estimator: p ≈ (R_m - S_m) / (R_{-m} - S_{-m}) divergence d0 = r_m - s_m d1 = r_neg - s_neg if n_blocks == 0: return 0.0 # Use the simplified dual-statistic estimator # In clean images: d0 ≈ d1 (both positive) # In embedded images: d0 → 0 while d1 stays positive if d1 == 0: # Can't estimate — likely very embedded or degenerate return 0.5 if d0 == 0 else 0.0 # Ratio-based estimate: how much has d0 dropped relative to d1 ratio = d0 / d1 if ratio >= 1.0: return 0.0 # d0 ≥ d1 means no evidence of embedding if ratio <= 0.0: return 1.0 # d0 collapsed or inverted # Linear interpolation: ratio=1 → 0% embedded, ratio=0 → 100% embedded estimate = 1.0 - ratio return float(np.clip(estimate, 0.0, 1.0)) def assess_risk(chi_p_values: dict[str, float], rs_estimates: dict[str, float]) -> str: """Map analysis results to a risk level. RS analysis is the primary metric (reliable for both sequential and random-order embedding). Chi-square is supplementary — high p-values indicate equalized PoV pairs, which is suspicious for random LSB embedding. Args: chi_p_values: Per-channel chi-square p-values (high = suspicious). rs_estimates: Per-channel RS embedding rate estimates (high = suspicious). Returns: "low", "medium", or "high" detectability risk. """ if not chi_p_values and not rs_estimates: return "low" # RS is the primary indicator: any channel with high embedding estimate max_rs = max(rs_estimates.values()) if rs_estimates else 0.0 # Chi-square: high p-value means pairs are equalized (suspicious) max_chi_p = max(chi_p_values.values()) if chi_p_values else 0.0 chi_suspicious = max_chi_p > STEGANALYSIS_CHI_SUSPICIOUS_THRESHOLD # High risk: RS strongly indicates embedding if max_rs > STEGANALYSIS_RS_HIGH_THRESHOLD: return "high" # Medium risk: moderate RS signal, or RS + chi-square both flagging if max_rs > STEGANALYSIS_RS_MEDIUM_THRESHOLD: return "medium" if chi_suspicious and max_rs > 0.05: return "medium" return "low" def check_image(image_data: bytes, mode: str = "lsb") -> dict: """Run steganalysis on an image and return detectability assessment. Args: image_data: Raw image bytes (PNG, BMP, etc.). mode: Analysis mode — currently only "lsb" is supported. Returns: Dict with keys: risk, chi_square, rs, width, height, channels, mode. """ if mode not in ("lsb", "auto"): raise ValueError(f"Unsupported steganalysis mode: {mode}. Use 'lsb' or 'auto'.") img = Image.open(io.BytesIO(image_data)) if img.mode not in ("RGB", "RGBA", "L"): img = img.convert("RGB") width, height = img.size pixels = np.array(img) img.close() channel_names = ["R", "G", "B"] if pixels.ndim == 3 else ["L"] if pixels.ndim == 2: pixels = pixels[:, :, np.newaxis] num_channels = min(pixels.shape[2], 3) # Skip alpha chi_p_values = {} rs_estimates = {} for i in range(num_channels): name = channel_names[i] channel = pixels[:, :, i].ravel() chi_p_values[name] = chi_square_analysis(channel) rs_estimates[name] = rs_analysis(channel) risk = assess_risk(chi_p_values, rs_estimates) result = SteganalysisResult( risk=risk, chi_square=chi_p_values, rs=rs_estimates, width=width, height=height, channels=num_channels, mode=mode, ) return { "risk": result.risk, "chi_square": result.chi_square, "rs": result.rs, "width": result.width, "height": result.height, "channels": result.channels, "mode": result.mode, }