Minor fixes

2026-04-04 16:29:20 -04:00
parent 05382c4081
commit 4607ff27dd
23 changed files with 1772 additions and 28 deletions
--- a/agentstuff/sentiment_agent/clients/init.py
+++ b/agentstuff/sentiment_agent/clients/init.py
@@ -0,0 +1 @@
+"""API clients for social media and forum data sources."""
--- a/agentstuff/sentiment_agent/clients/bluesky.py
+++ b/agentstuff/sentiment_agent/clients/bluesky.py
@@ -0,0 +1,166 @@
+"""Bluesky client using the AT Protocol API.
+
+Search requires authentication. Set BLUESKY_HANDLE and BLUESKY_APP_PASSWORD
+env vars. Create an app password at: https://bsky.app/settings/app-passwords
+
+Thread fetching works without auth via the public API.
+"""
+
+import os
+import httpx
+
+# Base URL used for thread fetches and handle resolution.
+BSKY_PUBLIC_API = "https://public.api.bsky.app"
+# Base URL used for login (createSession) and post search.
+BSKY_AUTH_API = "https://bsky.social"
+
+
+async def _get_session() -> dict | None:
+    """Log in to Bluesky with the configured app password.
+
+    Reads BLUESKY_HANDLE and BLUESKY_APP_PASSWORD from the environment and
+    posts them to the createSession endpoint.
+
+    Returns:
+        The decoded JSON session payload, or None when either environment
+        variable is unset or empty.
+
+    Raises:
+        httpx.HTTPStatusError: If the login request returns an error status.
+    """
+    identifier = os.environ.get("BLUESKY_HANDLE")
+    secret = os.environ.get("BLUESKY_APP_PASSWORD")
+    if not (identifier and secret):
+        return None
+
+    login_url = f"{BSKY_AUTH_API}/xrpc/com.atproto.server.createSession"
+    credentials = {"identifier": identifier, "password": secret}
+    async with httpx.AsyncClient(timeout=15) as http:
+        response = await http.post(login_url, json=credentials)
+        response.raise_for_status()
+        return response.json()
+
+
+def _format_post(post_view: dict) -> dict:
+    """Flatten an AT Protocol post view into the fields this package uses.
+
+    Accepts either a wrapper that holds the post under a "post" key (as the
+    thread nodes do) or the post object itself.
+
+    Returns:
+        Dict with: text, author_handle, author_display_name, created_at,
+        like_count, repost_count, reply_count, uri, cid, url. Missing string
+        fields default to "" and missing counters to 0.
+    """
+    post = post_view.get("post", post_view)
+    record = post.get("record", {})
+    author = post.get("author", {})
+
+    handle = author.get("handle", "")
+    at_uri = post.get("uri", "")
+
+    summary = {
+        "text": record.get("text", ""),
+        "author_handle": handle,
+        "author_display_name": author.get("displayName", ""),
+        "created_at": record.get("createdAt", ""),
+        "like_count": post.get("likeCount", 0),
+        "repost_count": post.get("repostCount", 0),
+        "reply_count": post.get("replyCount", 0),
+        "uri": at_uri,
+        "cid": post.get("cid", ""),
+        "url": _uri_to_url(at_uri, handle),
+    }
+    return summary
+
+
+def _uri_to_url(uri: str, handle: str) -> str:
+    """Convert an at:// post URI to a bsky.app web URL.
+
+    Example:
+        at://did:plc:xxx/app.bsky.feed.post/rkey
+        -> https://bsky.app/profile/{handle}/post/rkey
+
+    Args:
+        uri: AT URI of the post.
+        handle: Author handle for the profile segment. When empty, the URI's
+            own authority (normally the author's DID) is used instead so the
+            link is never built with an empty profile segment.
+
+    Returns:
+        The web URL, or "" when `uri` is not an at:// URI, is too short to
+        contain a collection and record key, or ends in an empty record key.
+    """
+    if not uri.startswith("at://"):
+        return ""
+
+    # Splits into ["at:", "", authority, collection, rkey].
+    parts = uri.split("/")
+    if len(parts) < 5:
+        return ""
+
+    rkey = parts[-1]
+    actor = handle or parts[2]
+    if not rkey or not actor:
+        return ""
+    return f"https://bsky.app/profile/{actor}/post/{rkey}"
+
+
+async def search_posts(query: str, limit: int = 25, sort: str = "top") -> list[dict]:
+    """Search Bluesky for posts matching a query.
+
+    Requires BLUESKY_HANDLE and BLUESKY_APP_PASSWORD env vars.
+
+    Args:
+        query: Search terms.
+        limit: Max results; clamped to the range 1..100.
+        sort: "top" (most liked) or "latest" (chronological).
+
+    Returns:
+        List of post dicts with: text, author_handle, author_display_name,
+        created_at, like_count, repost_count, reply_count, uri, cid, url.
+
+    Raises:
+        RuntimeError: If Bluesky credentials are not configured.
+        httpx.HTTPStatusError: If login or the search request returns an
+            error status.
+    """
+    session = await _get_session()
+    if not session:
+        raise RuntimeError(
+            "Bluesky search requires authentication. "
+            "Set BLUESKY_HANDLE and BLUESKY_APP_PASSWORD environment variables. "
+            "Create an app password at: https://bsky.app/settings/app-passwords"
+        )
+
+    # Clamp on both sides: a zero or negative limit would otherwise be sent
+    # to the API unchanged.
+    page_size = max(1, min(limit, 100))
+
+    async with httpx.AsyncClient(timeout=15) as client:
+        resp = await client.get(
+            f"{BSKY_AUTH_API}/xrpc/app.bsky.feed.searchPosts",
+            params={
+                "q": query,
+                "limit": page_size,
+                "sort": sort,
+            },
+            headers={"Authorization": f"Bearer {session['accessJwt']}"},
+        )
+        resp.raise_for_status()
+        data = resp.json()
+
+    return [_format_post(p) for p in data.get("posts", [])]
+
+
+async def get_thread(uri: str, depth: int = 6) -> dict:
+    """Fetch a Bluesky thread by AT URI or bsky.app URL.
+
+    Args:
+        uri: Either an at:// URI or a https://bsky.app/profile/.../post/... URL.
+        depth: Reply depth requested from the API (capped at 1000).
+
+    Returns:
+        Dict with "post" (the root post, or {} if the response has none) and
+        "replies", a flat list holding each direct reply followed by that
+        reply's own direct replies. Deeper levels are not included in the
+        list, whatever `depth` was requested.
+    """
+    # Web URLs have to be turned into at:// URIs before the API accepts them.
+    if uri.startswith("https://bsky.app/"):
+        uri = await _resolve_url_to_uri(uri)
+
+    # Attach a bearer token only when credentials are configured.
+    session = await _get_session()
+    auth_headers = (
+        {"Authorization": f"Bearer {session['accessJwt']}"} if session else {}
+    )
+
+    query = {"uri": uri, "depth": min(depth, 1000)}
+    async with httpx.AsyncClient(timeout=15) as http:
+        response = await http.get(
+            f"{BSKY_PUBLIC_API}/xrpc/app.bsky.feed.getPostThread",
+            params=query,
+            headers=auth_headers,
+        )
+        response.raise_for_status()
+        payload = response.json()
+
+    thread = payload.get("thread", {})
+    root_post = _format_post(thread) if "post" in thread else {}
+
+    replies = []
+    for child in thread.get("replies", []):
+        # Nodes without a "post" key are skipped together with their children.
+        if "post" not in child:
+            continue
+        replies.append(_format_post(child))
+        replies.extend(
+            _format_post(grandchild)
+            for grandchild in child.get("replies", [])
+            if "post" in grandchild
+        )
+
+    return {"post": root_post, "replies": replies}
+
+
+async def _resolve_url_to_uri(url: str) -> str:
+    """Convert a bsky.app post URL to an AT URI by resolving the handle.
+
+    Expected shape: https://bsky.app/profile/{handle-or-did}/post/{rkey}
+
+    Args:
+        url: Web URL of the post. A query string or fragment is ignored.
+
+    Returns:
+        The at://{did}/app.bsky.feed.post/{rkey} URI for the post.
+
+    Raises:
+        ValueError: If the URL does not have the expected shape.
+        httpx.HTTPStatusError: If the handle lookup returns an error status.
+    """
+    # Drop any fragment or query string so they cannot end up in the rkey.
+    without_suffix = url.split("#", 1)[0].split("?", 1)[0]
+    parts = without_suffix.rstrip("/").split("/")
+
+    # Splits into ["https:", "", "bsky.app", "profile", handle, "post", rkey],
+    # so seven segments are required before parts[6] may be read.
+    if (
+        len(parts) < 7
+        or parts[3] != "profile"
+        or parts[5] != "post"
+        or not parts[4]
+        or not parts[6]
+    ):
+        raise ValueError(f"Invalid Bluesky URL: {url}")
+
+    handle = parts[4]  # profile/{handle}
+    rkey = parts[6]  # post/{rkey}
+
+    # The profile segment may already be a DID; nothing to resolve then.
+    if handle.startswith("did:"):
+        return f"at://{handle}/app.bsky.feed.post/{rkey}"
+
+    # Resolve handle to DID
+    async with httpx.AsyncClient(timeout=10) as client:
+        resp = await client.get(
+            f"{BSKY_PUBLIC_API}/xrpc/com.atproto.identity.resolveHandle",
+            params={"handle": handle},
+        )
+        resp.raise_for_status()
+        did = resp.json()["did"]
+
+    return f"at://{did}/app.bsky.feed.post/{rkey}"
--- a/agentstuff/sentiment_agent/clients/hackernews.py
+++ b/agentstuff/sentiment_agent/clients/hackernews.py
@@ -0,0 +1,78 @@
+"""Hacker News client using the Algolia HN Search API.
+
+No authentication required. Docs: https://hn.algolia.com/api
+"""
+
+import httpx
+
+# Root of the Algolia HN Search API; endpoint paths are appended to this.
+HN_API_BASE = "https://hn.algolia.com/api/v1"
+
+
+async def search_stories(query: str, limit: int = 25) -> list[dict]:
+    """Search HN for stories matching a query.
+
+    Args:
+        query: Search terms.
+        limit: Max results (capped at 50).
+
+    Returns:
+        A list of story dicts with: title, url, author, points,
+        num_comments, created_at, object_id, story_text, hn_url. Fields that
+        are missing or null in the response become "" (text) or 0 (counts).
+
+    Raises:
+        httpx.HTTPStatusError: If the search request returns an error status.
+    """
+    async with httpx.AsyncClient(timeout=15) as client:
+        resp = await client.get(
+            f"{HN_API_BASE}/search",
+            params={
+                "query": query,
+                "tags": "story",
+                "hitsPerPage": min(limit, 50),
+            },
+        )
+        resp.raise_for_status()
+        data = resp.json()
+
+    results = []
+    for hit in data.get("hits", []):
+        # `.get(key, default)` only covers a missing key; `or` also covers an
+        # explicit JSON null, so every field is normalised the same way.
+        object_id = hit.get("objectID") or ""
+        results.append(
+            {
+                "title": hit.get("title") or "",
+                "url": hit.get("url") or "",
+                "author": hit.get("author") or "",
+                "points": hit.get("points") or 0,
+                "num_comments": hit.get("num_comments") or 0,
+                "created_at": hit.get("created_at") or "",
+                "object_id": object_id,
+                "story_text": hit.get("story_text") or "",
+                "hn_url": f"https://news.ycombinator.com/item?id={object_id}",
+            }
+        )
+    return results
+
+
+async def search_comments(query: str, limit: int = 25) -> list[dict]:
+    """Search HN for comments matching a query.
+
+    Args:
+        query: Search terms.
+        limit: Max results (capped at 50).
+
+    Returns:
+        A list of comment dicts with: comment_text, author, points,
+        created_at, story_title, story_url, hn_url. Fields that are missing
+        or null in the response become "" (text) or 0 (counts).
+
+    Raises:
+        httpx.HTTPStatusError: If the search request returns an error status.
+    """
+    async with httpx.AsyncClient(timeout=15) as client:
+        resp = await client.get(
+            f"{HN_API_BASE}/search",
+            params={
+                "query": query,
+                "tags": "comment",
+                "hitsPerPage": min(limit, 50),
+            },
+        )
+        resp.raise_for_status()
+        data = resp.json()
+
+    results = []
+    for hit in data.get("hits", []):
+        # `or` covers explicit JSON nulls as well as missing keys, so callers
+        # never receive None and the link never ends in "id=None".
+        object_id = hit.get("objectID") or ""
+        results.append(
+            {
+                "comment_text": hit.get("comment_text") or "",
+                "author": hit.get("author") or "",
+                "points": hit.get("points") or 0,
+                "created_at": hit.get("created_at") or "",
+                "story_title": hit.get("story_title") or "",
+                "story_url": hit.get("story_url") or "",
+                "hn_url": f"https://news.ycombinator.com/item?id={object_id}",
+            }
+        )
+    return results
--- a/agentstuff/sentiment_agent/clients/reddit.py
+++ b/agentstuff/sentiment_agent/clients/reddit.py
@@ -0,0 +1,117 @@
+"""Reddit client using the public JSON API.
+
+No authentication required for read-only search. Reddit requires a descriptive
+User-Agent header — requests with generic UAs get 429'd.
+"""
+
+import httpx
+
+# Site root that the search and comment paths are appended to.
+REDDIT_BASE = "https://www.reddit.com"
+# Sent as the User-Agent header on every request made by this module.
+USER_AGENT = "sentiment-agent/0.1.0 (research; sentiment analysis tool)"
+
+
+async def search_posts(
+    query: str,
+    subreddit: str = "all",
+    sort: str = "relevance",
+    time_filter: str = "month",
+    limit: int = 25,
+) -> list[dict]:
+    """Run a Reddit search and return the matching posts.
+
+    Args:
+        query: Search terms.
+        subreddit: Subreddit to search within, or "all" for site-wide.
+        sort: One of "relevance", "hot", "top", "new", "comments".
+        time_filter: One of "hour", "day", "week", "month", "year", "all".
+        limit: Max results (values above 100 are sent as 100).
+
+    Returns:
+        List of post dicts with: title, selftext (first 2000 characters),
+        author, score, upvote_ratio, num_comments, subreddit, url,
+        permalink (absolute URL), created_utc, is_self.
+
+    Raises:
+        httpx.HTTPStatusError: If the search request returns an error status.
+    """
+    endpoint = f"{REDDIT_BASE}/r/{subreddit}/search.json"
+    # Restrict to the subreddit only when a specific one was requested.
+    restrict = "off" if subreddit == "all" else "on"
+    search_params = {
+        "q": query,
+        "sort": sort,
+        "t": time_filter,
+        "limit": min(limit, 100),
+        "restrict_sr": restrict,
+    }
+
+    async with httpx.AsyncClient(timeout=15, follow_redirects=True) as http:
+        response = await http.get(
+            endpoint,
+            params=search_params,
+            headers={"User-Agent": USER_AGENT},
+        )
+        response.raise_for_status()
+        listing = response.json()
+
+    children = listing.get("data", {}).get("children", [])
+    posts = (child.get("data", {}) for child in children)
+    return [
+        {
+            "title": post.get("title", ""),
+            "selftext": (post.get("selftext") or "")[:2000],
+            "author": post.get("author", "[deleted]"),
+            "score": post.get("score", 0),
+            "upvote_ratio": post.get("upvote_ratio", 0),
+            "num_comments": post.get("num_comments", 0),
+            "subreddit": post.get("subreddit", ""),
+            "url": post.get("url", ""),
+            "permalink": f"https://reddit.com{post.get('permalink', '')}",
+            "created_utc": post.get("created_utc", 0),
+            "is_self": post.get("is_self", False),
+        }
+        for post in posts
+    ]
+
+
+async def get_post_comments(
+    permalink: str,
+    sort: str = "top",
+    limit: int = 25,
+) -> list[dict]:
+    """Fetch top-level comments for a Reddit post.
+
+    Args:
+        permalink: The post's permalink path (e.g., "/r/python/comments/abc123/title/")
+            or a full URL to the post. Only the path component is used; the
+            scheme, host, query string and fragment are discarded.
+        sort: Comment sort order: "top", "best", "new", "controversial".
+        limit: Value passed to Reddit as the `limit` query parameter.
+
+    Returns:
+        List of comment dicts with: body (first 2000 characters), author,
+        score, created_utc. Only entries of kind "t1" in the top-level
+        listing are included. Returns [] if the response is not the expected
+        two-element list.
+
+    Raises:
+        httpx.HTTPStatusError: If the request returns an error status.
+    """
+    from urllib.parse import urlsplit
+
+    # Keep only the path. This handles any scheme or Reddit host in a full
+    # URL and keeps a query string or fragment from landing before ".json".
+    path = urlsplit(permalink).path
+    if not path.startswith("/"):
+        path = f"/{path}"
+
+    url = f"{REDDIT_BASE}{path}.json"
+    async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client:
+        resp = await client.get(
+            url,
+            params={"sort": sort, "limit": limit},
+            headers={"User-Agent": USER_AGENT},
+        )
+        resp.raise_for_status()
+        data = resp.json()
+
+    # Reddit returns [post_listing, comments_listing]
+    if not isinstance(data, list) or len(data) < 2:
+        return []
+
+    results = []
+    for child in data[1].get("data", {}).get("children", []):
+        # Skip anything that is not a comment node.
+        if child.get("kind") != "t1":
+            continue
+        comment = child.get("data", {})
+        results.append(
+            {
+                "body": (comment.get("body") or "")[:2000],
+                "author": comment.get("author", "[deleted]"),
+                "score": comment.get("score", 0),
+                "created_utc": comment.get("created_utc", 0),
+            }
+        )
+    return results
				`@@ -0,0 +1 @@`
				`"""API clients for social media and forum data sources."""`