# stegasoo/agentstuff/sentiment_agent/clients/reddit.py

"""Reddit client using the public JSON API.

No authentication required for read-only search. Reddit requires a descriptive
User-Agent header — requests with generic UAs get 429'd.
"""

import httpx

REDDIT_BASE = "https://www.reddit.com"
USER_AGENT = "sentiment-agent/0.1.0 (research; sentiment analysis tool)"


async def search_posts(
    query: str,
    subreddit: str = "all",
    sort: str = "relevance",
    time_filter: str = "month",
    limit: int = 25,
) -> list[dict]:
    """Search Reddit for posts matching a query.

    Args:
        query: Search terms.
        subreddit: Subreddit to search within, or "all" for site-wide.
            Surrounding whitespace/slashes and a leading "r/" are tolerated
            (e.g. "r/python" and "/r/python/" both mean "python"); an empty
            value falls back to "all".
        sort: One of "relevance", "hot", "top", "new", "comments".
        time_filter: One of "hour", "day", "week", "month", "year", "all".
        limit: Max results (capped at 100 by Reddit). Zero or a negative
            value returns an empty list without contacting Reddit.

    Returns:
        List of post dicts with: title, selftext (truncated to 2000
        characters), author, score, upvote_ratio, num_comments, subreddit,
        url, permalink (absolute URL), created_utc, is_self.

    Raises:
        httpx.HTTPStatusError: If Reddit answers with a 4xx/5xx status.
        httpx.RequestError: If the request itself fails (network, timeout).
    """
    from urllib.parse import quote

    # "At most zero posts" needs no request at all.
    if limit <= 0:
        return []

    # Accept the common "r/name" spelling and stray slashes so the path does
    # not turn into "/r/r/name/...".
    name = subreddit.strip().strip("/")
    if name[:2].lower() == "r/":
        name = name[2:].strip("/")
    name = name or "all"

    # Percent-encode the name so characters such as "/", "?" or "#" cannot
    # alter the request target; "+" stays literal for multi-subreddit names.
    url = f"{REDDIT_BASE}/r/{quote(name, safe='+')}/search.json"
    async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client:
        resp = await client.get(
            url,
            params={
                "q": query,
                "sort": sort,
                "t": time_filter,
                "limit": min(limit, 100),
                "restrict_sr": "on" if name != "all" else "off",
            },
            headers={"User-Agent": USER_AGENT},
        )
        resp.raise_for_status()
        data = resp.json()

    # Anything other than a JSON object is treated as "no results" instead of
    # failing with AttributeError further down.
    if not isinstance(data, dict):
        return []

    children = data.get("data", {}).get("children", [])
    posts = (child.get("data", {}) for child in children)
    return [
        {
            "title": post.get("title", ""),
            "selftext": (post.get("selftext") or "")[:2000],
            # `or` also covers an explicit null author, not just a missing key.
            "author": post.get("author") or "[deleted]",
            "score": post.get("score", 0),
            "upvote_ratio": post.get("upvote_ratio", 0),
            "num_comments": post.get("num_comments", 0),
            "subreddit": post.get("subreddit", ""),
            "url": post.get("url", ""),
            "permalink": f"https://reddit.com{post.get('permalink', '')}",
            "created_utc": post.get("created_utc", 0),
            "is_self": post.get("is_self", False),
        }
        for post in posts
    ]


async def get_post_comments(
    permalink: str,
    sort: str = "top",
    limit: int = 25,
) -> list[dict]:
    """Fetch top-level comments for a Reddit post.

    Args:
        permalink: The post's permalink path (e.g.,
            "/r/python/comments/abc123/title/") or a full post URL. For a
            full URL only the path is used, so any scheme, Reddit host
            variant, query string or fragment is accepted.
        sort: Comment sort order: "top", "best", "new", "controversial".
        limit: Max comments to return. Zero or a negative value returns an
            empty list without contacting Reddit.

    Returns:
        List of comment dicts with: body (truncated to 2000 characters),
        author, score, created_utc. Empty if the permalink has no path or
        the response is not the expected two-element listing.

    Raises:
        httpx.HTTPStatusError: If Reddit answers with a 4xx/5xx status.
        httpx.RequestError: If the request itself fails (network, timeout).
    """
    from urllib.parse import urlsplit

    # "At most zero comments" needs no request at all.
    if limit <= 0:
        return []

    # Keep only the path component: this drops the scheme/host of a full URL
    # (http or https, www/old/np hosts alike) together with any query string
    # or fragment that share links carry.
    path = urlsplit(permalink.strip()).path.strip("/")
    if path.endswith(".json"):
        # Caller already appended the suffix; avoid ".json.json".
        path = path[: -len(".json")].rstrip("/")
    if not path:
        # e.g. "" or a bare "https://reddit.com" — there is no post to fetch.
        return []

    url = f"{REDDIT_BASE}/{path}.json"
    async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client:
        resp = await client.get(
            url,
            params={"sort": sort, "limit": limit},
            headers={"User-Agent": USER_AGENT},
        )
        resp.raise_for_status()
        data = resp.json()

    # Reddit returns [post_listing, comments_listing]
    if (
        not isinstance(data, list)
        or len(data) < 2
        or not isinstance(data[1], dict)
    ):
        return []

    children = data[1].get("data", {}).get("children", [])
    # Only "t1" children are comments; other kinds are skipped.
    comments = [
        child.get("data", {}) for child in children if child.get("kind") == "t1"
    ]
    # Slice so the documented maximum holds whatever the server sends back.
    return [
        {
            "body": (comment.get("body") or "")[:2000],
            # `or` also covers an explicit null author, not just a missing key.
            "author": comment.get("author") or "[deleted]",
            "score": comment.get("score", 0),
            "created_utc": comment.get("created_utc", 0),
        }
        for comment in comments[:limit]
    ]