#!/usr/bin/env python3
"""
source.py — Government / Defense / Grants contract-opportunity sourcing pipeline.

Polls free federal opportunity APIs (SAM.gov contracts, Grants.gov grants,
SBIR.gov defense innovation), normalizes them, scores them for relevance to
your profile, dedupes against a local SQLite store so you only see NEW matches,
and writes a ranked Markdown digest.

Stdlib only — no pip install. Python 3.9+.

Usage:
    python3 source.py                    # run once, show new matches, write digest
    python3 source.py --all              # show all matches (ignore dedup)
    python3 source.py --source sam,grants
    python3 source.py --min-score 3 --since-days 7
    python3 source.py --help / --version

Exit codes: 0 ok · 1 runtime error · 2 usage error.

SLED (state/local) aggregators and private RFP boards have no clean public API
and are intentionally out of scope here — watch those via their portals.
"""
from __future__ import annotations

import argparse
import html
import json
import os
import re
import sqlite3
import sys
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime, timedelta, timezone

# Version string reported by the --version flag.
__version__ = "1.0.0"

# Directory containing this file (symlinks resolved). The default config, database and
# digest locations below are anchored to it rather than to the current working directory.
HERE = os.path.dirname(os.path.abspath(os.path.realpath(__file__)))
DEFAULT_CONFIG = os.path.join(HERE, "config.json")
DEFAULT_DB = os.path.join(HERE, "opportunities.db")
DEFAULT_DIGEST_DIR = os.path.join(HERE, "digests")
# Default User-Agent sent by http_json(); a caller-supplied "User-Agent" header overrides it.
UA = "gov-sourcing-pipeline/1.0 (+contracting research)"


def _load_env(path: str | None = None) -> None:
    """Populate ``os.environ`` from the KEY=VALUE lines of a (gitignored) .env file.

    ``path`` defaults to ``.env`` next to this module. Blank lines, ``#`` comments and
    lines without ``=`` are ignored. A variable that is already set to a non-empty value
    is left alone; one that is absent OR set to the empty string is filled from the file
    (some launchers export KEY='' and that must not hide the real key). Surrounding
    whitespace and quote characters are trimmed from the value. A missing file is not an
    error.
    """
    env_file = path if path else os.path.join(HERE, ".env")
    try:
        with open(env_file) as handle:
            for raw in handle:
                entry = raw.strip()
                if entry == "" or entry.startswith("#"):
                    continue
                name, sep, value = entry.partition("=")
                if not sep:
                    continue  # no '=' on the line
                name = name.strip()
                if os.environ.get(name):
                    continue  # a real, non-empty environment value wins
                os.environ[name] = value.strip().strip('"').strip("'")
    except FileNotFoundError:
        return


# Runs at import time, so the .env values are in os.environ before any connector reads them.
_load_env()  # intel.py and mcp_server.py import source, so this covers all three


# ----------------------------------------------------------------------------- helpers
def warn(msg: str) -> None:
    """Emit a ``[warn]``-prefixed diagnostic line on stderr."""
    print("[warn]", msg, file=sys.stderr)


def http_json(url: str, *, data: bytes | None = None, headers: dict | None = None, timeout: int = 30):
    """GET ``url`` (or POST when ``data`` is non-empty) and return the parsed JSON body.

    Args:
        url: Absolute request URL.
        data: Optional request body; a truthy value switches the method to POST.
        headers: Extra headers merged over the defaults (``User-Agent`` and ``Accept``).
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        The decoded JSON value, or ``None`` on any failure. Failures are reported through
        ``warn`` with the URL's query string removed (so keys passed as query parameters
        are not echoed) and never raised to the caller.
    """
    # Local import keeps this block self-contained; urllib already loads http.client.
    import http.client

    hdrs = {"User-Agent": UA, "Accept": "application/json"}
    if headers:
        hdrs.update(headers)
    endpoint = url.split("?")[0]
    try:
        req = urllib.request.Request(url, data=data, headers=hdrs, method="POST" if data else "GET")
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode("utf-8", "replace"))
    except urllib.error.HTTPError as e:
        warn(f"HTTP {e.code} from {endpoint}")
    except json.JSONDecodeError:
        warn(f"non-JSON response from {endpoint}")
    except (OSError, http.client.HTTPException) as e:
        # OSError covers URLError, TimeoutError, connection resets and — on Python 3.9,
        # where it is not yet an alias of TimeoutError — socket.timeout raised while
        # reading the body. HTTPException covers truncated/invalid HTTP responses.
        warn(f"network error for {endpoint}: {e}")
    except ValueError as e:
        # Malformed URL (e.g. missing scheme or un-encodable characters).
        warn(f"bad request for {endpoint}: {e}")
    return None


def _clean(s):
    """Return ``s`` as a whitespace-trimmed string with HTML entities decoded.

    Any falsy value (``None``, ``""``, ``0`` …) becomes the empty string.
    """
    text = str(s) if s else ""
    trimmed = text.strip()
    return html.unescape(trimmed)


def opp(uid, source, title, agency, otype, naics, posted, due, set_aside, url, summary=""):
    """Build the normalized opportunity record shared by every connector.

    ``uid`` and ``source`` are stored as given; ``url`` is only whitespace-trimmed; every
    other field is passed through ``_clean`` (stringified, trimmed, HTML-unescaped).
    """
    record = {"uid": uid, "source": source}
    text_fields = (
        ("title", title), ("agency", agency), ("otype", otype), ("naics", naics),
        ("posted", posted), ("due", due), ("set_aside", set_aside),
    )
    for field, raw in text_fields:
        record[field] = _clean(raw)
    record["url"] = (url or "").strip()
    record["summary"] = _clean(summary)
    return record


# ----------------------------------------------------------------------------- connectors
def fetch_sam(cfg: dict) -> list[dict]:
    """Fetch SAM.gov Contract Opportunities posted within the last ``since_days`` days.

    Needs a free api.data.gov key, taken from ``cfg["sam_api_key"]`` or else the
    ``SAM_API_KEY`` environment variable; without one the source is skipped with a
    warning. One search request (limit 100) is made per NAICS code in ``cfg["naics"]``,
    and notices are de-duplicated by ``noticeId`` across those requests.
    """
    api_key = cfg.get("sam_api_key") or os.environ.get("SAM_API_KEY", "")
    if not api_key:
        warn("SAM: no api key (set sam_api_key in config or SAM_API_KEY env) — skipping. "
             "Get a free key at https://open.gsa.gov/api/get-opportunities-public-api/")
        return []

    date_fmt = "%m/%d/%Y"
    window = int(cfg.get("since_days", 14))
    posted_from = (datetime.now() - timedelta(days=window)).strftime(date_fmt)
    posted_to = datetime.now().strftime(date_fmt)

    found = []
    seen_ids = set()
    for code in cfg.get("naics", []):
        params = {
            "api_key": api_key,
            "postedFrom": posted_from,
            "postedTo": posted_to,
            "ncode": code,
            "limit": 100,
        }
        payload = http_json(
            "https://api.sam.gov/opportunities/v2/search?" + urllib.parse.urlencode(params))
        if not payload:
            continue
        for rec in payload.get("opportunitiesData", []) or []:
            notice_id = rec.get("noticeId", "")
            if not notice_id or notice_id in seen_ids:
                continue
            seen_ids.add(notice_id)
            # Fall back to the public notice page when the API supplies no UI link.
            link = rec.get("uiLink") or f"https://sam.gov/opp/{notice_id}/view"
            found.append(opp(
                f"sam:{notice_id}", "SAM",
                rec.get("title"),
                rec.get("fullParentPathName"),
                rec.get("type"),
                rec.get("naicsCode"),
                rec.get("postedDate"),
                rec.get("responseDeadLine"),
                rec.get("typeOfSetAsideDescription"),
                link,
            ))
    return found


def fetch_grants(cfg: dict) -> list[dict]:
    """Search Grants.gov (search2 endpoint, keyless) once per configured keyword.

    Results are restricted by ``grants_statuses`` and ``grants_eligibilities`` (defaults:
    forecasted/posted; for-profit, small business, others, unrestricted). Any hit whose
    title or agency contains a ``grants_exclude`` term (case-insensitive) is dropped, which
    removes the academic/biomedical research noise a for-profit services shop can't win.
    Hits are de-duplicated by id across keywords.
    """
    statuses = cfg.get("grants_statuses", "forecasted|posted")
    eligibilities = cfg.get("grants_eligibilities", "06|21|25|99")
    blocked = [term.lower() for term in cfg.get("grants_exclude", [])]

    found = []
    seen_ids = set()
    for keyword in cfg.get("grants_keywords", ["artificial intelligence"]):
        request = {
            "keyword": keyword,
            "oppStatuses": statuses,
            "eligibilities": eligibilities,
            "rows": 50,
        }
        payload = http_json(
            "https://api.grants.gov/v1/api/search2",
            data=json.dumps(request).encode(),
            headers={"Content-Type": "application/json"},
        )
        if not payload or not isinstance(payload.get("data"), dict):
            continue
        for hit in payload["data"].get("oppHits", []) or []:
            gid = str(hit.get("id", ""))
            if not gid or gid in seen_ids:
                continue
            haystack = f"{hit.get('title','')} {hit.get('agency','')}".lower()
            if any(term in haystack for term in blocked):
                continue
            seen_ids.add(gid)
            found.append(opp(
                f"grants:{gid}", "Grants",
                hit.get("title"), hit.get("agency"), hit.get("docType"), "",
                hit.get("openDate"), hit.get("closeDate"), "",
                f"https://www.grants.gov/search-results-detail/{gid}",
            ))
    return found


def fetch_sbir(cfg: dict) -> list[dict]:
    """SBIR.gov open solicitations (keyless, needs UA, rate-limited — best effort).

    Returns one opportunity per solicitation. The identifier is the first non-empty of
    ``solicitation_id``, ``solicitation_number`` and ``solicitation_title``; records with
    none of these are skipped (they would all collapse onto the uid ``sbir:``) and
    repeated identifiers are emitted once, matching the other connectors. ``cfg`` is
    accepted for the common connector signature but not read.
    """
    data = http_json("https://api.www.sbir.gov/public/api/solicitations?open=1",
                     headers={"User-Agent": "Mozilla/5.0"})
    if not isinstance(data, list):
        warn("SBIR: API unavailable/rate-limited right now — skipping (retry later).")
        return []
    out, seen = [], set()
    for r in data:
        if not isinstance(r, dict):
            continue
        sid = str(r.get("solicitation_id") or r.get("solicitation_number")
                  or r.get("solicitation_title") or "")
        if not sid or sid in seen:
            continue
        seen.add(sid)
        out.append(opp(
            f"sbir:{sid}", "SBIR", r.get("solicitation_title"),
            r.get("agency") or r.get("branch"), "SBIR/STTR", "",
            r.get("release_date") or r.get("open_date"), r.get("close_date"),
            "", r.get("sbir_topic_link") or r.get("solicitation_agency_url") or "https://www.sbir.gov/solicitations",
        ))
    return out


def fetch_usajobs(cfg: dict) -> list[dict]:
    """USAJOBS federal employment search (free key from developer.usajobs.gov +
    the registered email as the User-Agent header). This is the JOBS track — federal
    W-2 positions, distinct from the contract/grant opportunities above; toggle it in
    config 'sources'. Doubles as a working reference for a search-and-apply aggregator.

    Note: USAJOBS has a clean public SEARCH API but no public APPLY API — ApplyURI
    deep-links into each agency's own system. Aggregate + stage, don't auto-submit.

    Reads ``USAJOBS_API_KEY`` / ``USAJOBS_EMAIL`` from the environment (skips with a
    warning when either is missing) and issues one search per ``usajobs_keywords`` entry,
    de-duplicating positions across keywords. The summary is "<location> · <salary>",
    with the separator trimmed when either part is missing.
    """
    key = os.environ.get("USAJOBS_API_KEY", "")
    email = os.environ.get("USAJOBS_EMAIL", "")
    if not key or not email:
        warn("USAJOBS: set USAJOBS_API_KEY + USAJOBS_EMAIL in .env "
             "(free key at https://developer.usajobs.gov) — skipping.")
        return []
    hdrs = {"Host": "data.usajobs.gov", "User-Agent": email, "Authorization-Key": key}
    out, seen = [], set()
    for kw in cfg.get("usajobs_keywords", ["artificial intelligence"]):
        q = urllib.parse.urlencode({"Keyword": kw, "ResultsPerPage": 50})
        data = http_json(f"https://data.usajobs.gov/api/search?{q}", headers=hdrs)
        if not data or not isinstance(data, dict):
            continue
        for it in (data.get("SearchResult") or {}).get("SearchResultItems", []) or []:
            d = it.get("MatchedObjectDescriptor") or {}
            pid = str(it.get("MatchedObjectId") or d.get("PositionID") or "")
            if not pid or pid in seen:
                continue
            seen.add(pid)
            # Prefer the first apply link; fall back to the posting page.
            apply_uri = d.get("ApplyURI") or []
            url = (apply_uri[0] if isinstance(apply_uri, list) and apply_uri
                   else d.get("PositionURI") or "")
            pay = d.get("PositionRemuneration") or []
            sal = ""
            if isinstance(pay, list) and pay and isinstance(pay[0], dict):
                p0 = pay[0]
                sal = f"${p0.get('MinimumRange','?')}-{p0.get('MaximumRange','?')}/{p0.get('RateIntervalCode','')}"
            # strip(" ·") drops the dangling separator when salary or location is absent,
            # consistent with the other job connectors.
            summary = f"{d.get('PositionLocationDisplay','')} · {sal}".strip(" ·")
            out.append(opp(
                f"usajobs:{pid}", "USAJOBS", d.get("PositionTitle"),
                d.get("OrganizationName"), "Federal Job", "",
                d.get("PublicationStartDate"), d.get("ApplicationCloseDate"),
                "", url, summary=summary,
            ))
    return out


def fetch_adzuna(cfg: dict) -> list[dict]:
    """Search the Adzuna job aggregator (commercial + public-sector postings).

    The private-sector complement to USAJOBS on the JOBS track; toggle in config
    'sources'. Requires ``ADZUNA_APP_ID`` and ``ADZUNA_APP_KEY`` in the environment (free
    at developer.adzuna.com; the trial plan is rate-limited) and is skipped with a warning
    otherwise. One request per ``adzuna_keywords`` entry, newest first, limited to
    ``since_days`` days; postings are de-duplicated by id.

    As with USAJOBS there is no public apply API: records deep-link via ``redirect_url``.
    """
    app_id = os.environ.get("ADZUNA_APP_ID", "")
    app_key = os.environ.get("ADZUNA_APP_KEY", "")
    if not (app_id and app_key):
        warn("ADZUNA: set ADZUNA_APP_ID + ADZUNA_APP_KEY in .env "
             "(free at https://developer.adzuna.com) — skipping.")
        return []

    country = cfg.get("adzuna_country", "us")
    found = []
    seen_ids = set()
    for keyword in cfg.get("adzuna_keywords", ["artificial intelligence"]):
        params = {
            "app_id": app_id,
            "app_key": app_key,
            "results_per_page": 50,
            "what": keyword,
            "sort_by": "date",
            "max_days_old": int(cfg.get("since_days", 14)),
        }
        query = urllib.parse.urlencode(params)
        payload = http_json(f"https://api.adzuna.com/v1/api/jobs/{country}/search/1?{query}")
        if not payload:
            continue
        for job in payload.get("results", []) or []:
            job_id = str(job.get("id") or "")
            if not job_id or job_id in seen_ids:
                continue
            seen_ids.add(job_id)
            employer = (job.get("company") or {}).get("display_name", "")
            place = (job.get("location") or {}).get("display_name", "")
            pay_lo, pay_hi = job.get("salary_min"), job.get("salary_max")
            if pay_lo and pay_hi:
                salary = f"${int(pay_lo):,}-${int(pay_hi):,}"
            else:
                salary = ""
            category = (job.get("category") or {}).get("label", "Job")
            found.append(opp(
                f"adzuna:{job_id}", "ADZUNA", job.get("title"), employer,
                category, "",
                job.get("created"), "", "", job.get("redirect_url", ""),
                summary=f"{place} · {salary}".strip(" ·"),
            ))
    return found


def _job_match(title: str, cfg: dict) -> bool:
    """Relevance gate for firehose job sources.

    Returns True when ``title`` contains none of the ``job_exclude`` terms AND (if
    ``job_filter`` is non-empty) at least one ``job_filter`` term. Matching is a
    case-insensitive substring test: both the title and the configured terms are
    lowercased, so a config entry such as "Sales" behaves like "sales". Empty/blank
    terms are ignored (an empty string is a substring of everything and would otherwise
    exclude every title), and a missing or null list is treated as empty.
    """
    t = str(title or "").lower()

    def _terms(key: str) -> list:
        # Lowercase for comparison but keep inner/outer spaces: users may pad a term
        # (" ai ") on purpose to approximate a whole-word match.
        return [str(x).lower() for x in (cfg.get(key) or []) if str(x).strip()]

    if any(x in t for x in _terms("job_exclude")):
        return False
    kws = _terms("job_filter")
    return (not kws) or any(k in t for k in kws)


def fetch_himalayas(cfg: dict) -> list[dict]:
    """Pull remote jobs from Himalayas (keyless, ~88k, structured salary).

    Reads four pages of 20 postings (offsets 0–60), keeps those passing ``_job_match`` and
    de-duplicates on a uid built from title and company name.
    """
    found = []
    seen_uids = set()
    for offset in range(0, 80, 20):
        page = http_json(f"https://himalayas.app/jobs/api?limit=20&offset={offset}")
        for job in (page or {}).get("jobs", []) or []:
            if not _job_match(job.get("title"), cfg):
                continue
            uid = f"himalayas:{job.get('title','')}-{job.get('companyName','')}"
            if uid in seen_uids:
                continue
            seen_uids.add(uid)
            pay_lo, pay_hi = job.get("minSalary"), job.get("maxSalary")
            # Only format a range when both bounds are integers.
            if isinstance(pay_lo, int) and isinstance(pay_hi, int):
                salary = f"${pay_lo:,}-${pay_hi:,}"
            else:
                salary = ""
            link = job.get("applicationLink") or job.get("url", "")
            found.append(opp(
                uid, "Himalayas", job.get("title"), job.get("companyName"),
                "Remote Job", "", job.get("pubDate", ""), "", "",
                link, summary=salary,
            ))
    return found


def fetch_jobicy(cfg: dict) -> list[dict]:
    """Pull up to 100 US remote jobs from Jobicy (keyless, salary).

    Postings must pass ``_job_match`` on ``jobTitle``; they are de-duplicated on a uid
    built from the posting id (or its URL when the id is missing).
    """
    found = []
    seen_uids = set()
    payload = http_json("https://jobicy.com/api/v2/remote-jobs?count=100&geo=usa")
    for job in (payload or {}).get("jobs", []) or []:
        title = job.get("jobTitle")
        if not _job_match(title, cfg):
            continue
        uid = f"jobicy:{job.get('id') or job.get('url', '')}"
        if uid in seen_uids:
            continue
        seen_uids.add(uid)
        pay_lo, pay_hi = job.get("salaryMin"), job.get("salaryMax")
        if pay_lo and pay_hi:
            salary = f"${int(pay_lo):,}-${int(pay_hi):,}"
        else:
            salary = ""
        found.append(opp(
            uid, "Jobicy", job.get("jobTitle"), job.get("companyName"),
            "Remote Job", "", job.get("pubDate", ""), "", "",
            job.get("url", ""), summary=salary,
        ))
    return found


def fetch_themuse(cfg: dict) -> list[dict]:
    """Pull page 1 of The Muse job listings (keyless) for each configured category.

    Categories come from ``themuse_categories`` (default: Software Engineering and Data
    Science). Postings must pass ``_job_match`` on their name and are de-duplicated by id
    (or name when the id is missing). The summary lists up to two location names.
    """
    found = []
    seen_uids = set()
    categories = cfg.get("themuse_categories", ["Software Engineering", "Data Science"])
    for category in categories:
        encoded = urllib.parse.quote(category)
        payload = http_json(
            f"https://www.themuse.com/api/public/jobs?page=1&category={encoded}")
        for job in (payload or {}).get("results", []) or []:
            if not _job_match(job.get("name"), cfg):
                continue
            uid = f"themuse:{job.get('id') or job.get('name', '')}"
            if uid in seen_uids:
                continue
            seen_uids.add(uid)
            first_two = (job.get("locations") or [])[:2]
            places = ", ".join(loc.get("name", "") for loc in first_two)
            employer = (job.get("company") or {}).get("name")
            landing = (job.get("refs") or {}).get("landing_page", "")
            found.append(opp(
                uid, "TheMuse", job.get("name"), employer, "Job", "",
                job.get("publication_date", ""), "", "",
                landing, summary=places,
            ))
    return found


def fetch_greenhouse(cfg: dict) -> list[dict]:
    """Read job boards of hand-picked employers from the Greenhouse ATS (keyless).

    ``greenhouse_companies`` in config lists the board tokens to watch — this is the
    highest-precision job source because YOU choose the companies. Postings must pass
    ``_job_match``; the agency column is the board token with dashes turned into spaces and
    title-cased.
    """
    found = []
    for token in cfg.get("greenhouse_companies", []):
        board = http_json(
            f"https://boards-api.greenhouse.io/v1/boards/{token}/jobs?content=true")
        employer = token.replace("-", " ").title()
        for job in (board or {}).get("jobs", []) or []:
            title = job.get("title")
            if not _job_match(title, cfg):
                continue
            place = (job.get("location") or {}).get("name", "")
            found.append(opp(
                f"gh:{token}:{job.get('id')}", "Greenhouse", job.get("title"),
                employer, "Job", "", job.get("updated_at", ""),
                "", "", job.get("absolute_url", ""),
                summary=place,
            ))
    return found


def fetch_lever(cfg: dict) -> list[dict]:
    """Read postings of hand-picked employers from the Lever ATS (keyless).

    ``lever_companies`` in config lists the company slugs to watch. A response that is not
    a JSON list is ignored. Postings must pass ``_job_match`` on their ``text`` (title).
    """
    found = []
    for company in cfg.get("lever_companies", []):
        postings = http_json(f"https://api.lever.co/v0/postings/{company}?mode=json")
        if not isinstance(postings, list):
            continue
        employer = company.replace("-", " ").title()
        for post in postings:
            if not _job_match(post.get("text"), cfg):
                continue
            categories = post.get("categories") or {}
            link = post.get("hostedUrl") or post.get("applyUrl", "")
            found.append(opp(
                f"lever:{company}:{post.get('id')}", "Lever", post.get("text"),
                employer, "Job", "", "", "", "",
                link,
                summary=categories.get("location", ""),
            ))
    return found


def fetch_federalregister(cfg: dict) -> list[dict]:
    """Search Federal Register NOTICE documents (keyless), newest first.

    A forward-demand signal: agencies post sources-sought / RFIs / notices here, often
    before or alongside the SAM solicitation. One query (50 results) per
    ``federalregister_terms`` entry; documents are de-duplicated by document number. The
    summary is the first 160 characters of the abstract.
    """
    base = ("https://www.federalregister.gov/api/v1/documents.json?per_page=50&order=newest"
            "&conditions[type][]=NOTICE&conditions[term]=")
    found = []
    seen_uids = set()
    for term in cfg.get("federalregister_terms", ["sources sought", "artificial intelligence"]):
        payload = http_json(base + urllib.parse.quote(term))
        for doc in (payload or {}).get("results", []) or []:
            uid = f"fedreg:{doc.get('document_number')}"
            if uid in seen_uids:
                continue
            seen_uids.add(uid)
            first_two = (doc.get("agencies") or [])[:2]
            agencies = ", ".join(a.get("name", "") for a in first_two)
            abstract = doc.get("abstract") or ""
            found.append(opp(
                uid, "FedRegister", doc.get("title"), agencies,
                doc.get("type", "Notice"), "", doc.get("publication_date", ""),
                "", "", doc.get("html_url", ""),
                summary=abstract[:160],
            ))
    return found


def fetch_hn_hiring(cfg: dict) -> list[dict]:
    """Collect hiring comments from the latest Hacker News 'Who is Hiring' thread.

    Keyless, via Algolia: first look up the newest matching story by the ``whoishiring``
    account, then load that thread and turn each top-level comment into one record. Comment
    HTML is reduced to plain text (tags removed, whitespace collapsed, entities decoded)
    and must pass ``_job_match``. Startup/tech-heavy, fresh.
    """
    search = http_json(
        "https://hn.algolia.com/api/v1/search_by_date"
        "?query=Ask%20HN%20Who%20is%20hiring&tags=story,author_whoishiring&hitsPerPage=1")
    hits = (search or {}).get("hits") or []
    if not hits:
        return []
    thread_id = hits[0].get("objectID")
    thread = http_json(f"https://hn.algolia.com/api/v1/items/{thread_id}")

    found = []
    for comment in (thread or {}).get("children", []) or []:
        raw = comment.get("text") or ""
        if not raw:
            continue
        without_tags = re.sub("<[^>]+>", " ", raw)
        collapsed = re.sub(r"\s+", " ", without_tags)
        text = html.unescape(collapsed).strip()
        if not _job_match(text, cfg):
            continue
        comment_id = comment.get("id")
        found.append(opp(
            f"hn:{comment_id}", "HN-Hiring", text[:110], "", "Hiring post",
            "", "", "", "", f"https://news.ycombinator.com/item?id={comment_id}",
            summary=text[:200],
        ))
    return found


def fetch_yc(cfg: dict) -> list[dict]:
    """List Y Combinator companies currently hiring (keyless community mirror).

    An employer-target feeder: relevant YC companies to add to your Greenhouse/Lever
    watchlists. A company is kept when its name plus one-liner passes ``_job_match``; a
    response that is not a JSON list yields no records.
    """
    companies = http_json("https://yc-oss.github.io/api/companies/hiring.json")
    if not isinstance(companies, list):
        return []
    found = []
    for company in companies:
        blurb = f"{company.get('name','')} {company.get('one_liner','')}"
        if not _job_match(blurb, cfg):
            continue
        site = company.get("website") or company.get("url", "")
        one_liner = company.get("one_liner") or ""
        found.append(opp(
            f"yc:{company.get('id') or company.get('name')}", "YC",
            f"{company.get('name')} (hiring)", company.get("name"), "Startup", "",
            "", "", "", site,
            summary=one_liner[:140],
        ))
    return found


def fetch_remoteok(cfg: dict) -> list[dict]:
    """Pull remote jobs from RemoteOK (keyless; needs a browser User-Agent).

    The first array element is a legal/metadata notice with no 'position'; the
    ``position`` guard below skips it along with any non-dict entry. Postings must pass
    ``_job_match`` and are de-duplicated by id (falling back to slug, then URL).
    """
    listing = http_json("https://remoteok.com/api", headers={"User-Agent": "Mozilla/5.0"})
    if not isinstance(listing, list):
        warn("RemoteOK: API unavailable right now — skipping.")
        return []

    found = []
    seen_ids = set()
    for job in listing:
        if not (isinstance(job, dict) and job.get("position")):
            continue
        if not _job_match(job.get("position"), cfg):
            continue
        job_id = str(job.get("id") or job.get("slug") or job.get("url", ""))
        if not job_id or job_id in seen_ids:
            continue
        seen_ids.add(job_id)
        pay_lo, pay_hi = job.get("salary_min"), job.get("salary_max")
        if pay_lo and pay_hi:
            salary = f"${int(pay_lo):,}-${int(pay_hi):,}"
        else:
            salary = ""
        place = job.get("location") or "Remote"
        link = job.get("url") or f"https://remoteok.com/remote-jobs/{job_id}"
        found.append(opp(
            f"remoteok:{job_id}", "RemoteOK", job.get("position"), job.get("company"),
            "Remote Job", "", job.get("date", ""), "", "",
            link,
            summary=f"{place} · {salary}".strip(" ·"),
        ))
    return found


def fetch_remotive(cfg: dict) -> list[dict]:
    """Pull remote jobs from Remotive (keyless), one request per search term.

    Terms come from ``remotive_search``, else ``job_filter``, else a single unfiltered
    request; only the first six terms are used. Postings must pass ``_job_match`` and are
    de-duplicated by id (or URL when the id is missing).
    """
    search_terms = cfg.get("remotive_search") or cfg.get("job_filter", []) or [""]
    found = []
    seen_ids = set()
    for term in search_terms[:6]:
        endpoint = "https://remotive.com/api/remote-jobs?limit=50"
        if term:
            endpoint = endpoint + "&search=" + urllib.parse.quote(term)
        payload = http_json(endpoint)
        for job in (payload or {}).get("jobs", []) or []:
            if not _job_match(job.get("title"), cfg):
                continue
            job_id = str(job.get("id") or job.get("url", ""))
            if not job_id or job_id in seen_ids:
                continue
            seen_ids.add(job_id)
            where_and_pay = f"{job.get('candidate_required_location','')} · {job.get('salary','')}"
            found.append(opp(
                f"remotive:{job_id}", "Remotive", job.get("title"), job.get("company_name"),
                "Remote Job", "", job.get("publication_date", ""), "", "",
                job.get("url", ""), summary=where_and_pay.strip(" ·"),
            ))
    return found


def fetch_findwork(cfg: dict) -> list[dict]:
    """Search Findwork.dev jobs; skipped with a warning when no key is configured.

    Needs a free token in ``FINDWORK_API_KEY`` (https://findwork.dev/developers/), sent as
    ``Authorization: Token <key>``. Search terms come from ``findwork_search``, else
    ``job_filter``, else "engineer"; only the first six are used. Postings must pass
    ``_job_match`` on ``role`` and are de-duplicated by id.
    """
    token = os.environ.get("FINDWORK_API_KEY", "")
    if not token:
        warn("FINDWORK: set FINDWORK_API_KEY in .env (free at https://findwork.dev/developers/) — skipping.")
        return []

    auth = {"Authorization": f"Token {token}"}
    search_terms = cfg.get("findwork_search") or cfg.get("job_filter") or ["engineer"]
    found = []
    seen_ids = set()
    for term in search_terms[:6]:
        query = urllib.parse.urlencode({"search": term, "sort_by": "date"})
        payload = http_json(f"https://findwork.dev/api/jobs/?{query}", headers=auth)
        for job in (payload or {}).get("results", []) or []:
            if not _job_match(job.get("role"), cfg):
                continue
            job_id = str(job.get("id", ""))
            if not job_id or job_id in seen_ids:
                continue
            seen_ids.add(job_id)
            # No explicit location: label remote roles "Remote", otherwise leave blank.
            place = job.get("location") or ("Remote" if job.get("remote") else "")
            found.append(opp(
                f"findwork:{job_id}", "Findwork", job.get("role"), job.get("company_name"),
                "Job", "", job.get("date_posted", ""), "", "",
                job.get("url", ""), summary=place,
            ))
    return found


def fetch_jooble(cfg: dict) -> list[dict]:
    """Search the Jooble aggregator; skipped with a warning when no key is configured.

    Needs a free key in ``JOOBLE_API_KEY`` (request at https://jooble.org/api/about); each
    search is a JSON POST to ``https://jooble.org/api/<key>``. Keywords come from
    ``jooble_keywords``, else ``job_filter``, else "program manager"; only the first six
    are used, all with the optional ``jooble_location``. Postings must pass ``_job_match``
    and are de-duplicated by id (or link).

    NOTE(review): the key is part of the URL *path*, and http_json's warnings print the
    URL with only the query string removed — a failed request would echo the key on stderr.
    """
    api_key = os.environ.get("JOOBLE_API_KEY", "")
    if not api_key:
        warn("JOOBLE: set JOOBLE_API_KEY in .env (free at https://jooble.org/api/about) — skipping.")
        return []

    location = cfg.get("jooble_location", "")
    keywords = cfg.get("jooble_keywords") or cfg.get("job_filter") or ["program manager"]
    found = []
    seen_ids = set()
    for keyword in keywords[:6]:
        request = {"keywords": keyword, "location": location}
        payload = http_json(
            f"https://jooble.org/api/{api_key}",
            data=json.dumps(request).encode(),
            headers={"Content-Type": "application/json"},
        )
        for job in (payload or {}).get("jobs", []) or []:
            if not _job_match(job.get("title"), cfg):
                continue
            job_id = str(job.get("id") or job.get("link", ""))
            if not job_id or job_id in seen_ids:
                continue
            seen_ids.add(job_id)
            where_and_pay = f"{job.get('location','')} · {job.get('salary','')}"
            found.append(opp(
                f"jooble:{job_id}", "Jooble", job.get("title"), job.get("company"),
                "Job", "", job.get("updated", ""), "", "",
                job.get("link", ""), summary=where_and_pay.strip(" ·"),
            ))
    return found


def fetch_usaspending(cfg: dict) -> list[dict]:
    """Fetch the largest recent contract awards from USAspending.gov (keyless, NO quota).

    Intel, not open opportunities: shows who already WON contracts in this lane and for how
    much, for competitor/teaming targeting. Filters by the profile's NAICS codes (first 25)
    over the last ``usaspending_lookback_days`` days (default 730), sorted by award amount
    descending, top 100. Returns ``[]`` when no NAICS codes are configured.

    Besides the standard fields, each record carries ``_recipient``, ``_amount`` and
    ``_subagency`` extras for intel views like make_incumbents.py.
    """
    naics_codes = [str(code) for code in cfg.get("naics", []) if code][:25]
    if not naics_codes:
        return []

    iso_day = "%Y-%m-%d"
    lookback = timedelta(days=int(cfg.get("usaspending_lookback_days", 730)))
    end = datetime.now().strftime(iso_day)
    start = (datetime.now() - lookback).strftime(iso_day)

    request = {
        "filters": {
            "award_type_codes": ["A", "B", "C", "D"],
            "naics_codes": naics_codes,
            "time_period": [{"start_date": start, "end_date": end}],
        },
        "fields": ["Award ID", "Recipient Name", "Award Amount", "Awarding Agency",
                   "Awarding Sub Agency", "Start Date", "End Date", "Description"],
        "sort": "Award Amount",
        "order": "desc",
        "limit": 100,
        "page": 1,
    }
    payload = http_json(
        "https://api.usaspending.gov/api/v2/search/spending_by_award/",
        data=json.dumps(request).encode(),
        headers={"Content-Type": "application/json"},
    )

    found = []
    seen_ids = set()
    for award in (payload or {}).get("results", []) or []:
        internal_id = award.get("generated_internal_id") or ""
        award_id = str(award.get("Award ID") or internal_id or "")
        if not award_id or award_id in seen_ids:
            continue
        seen_ids.add(award_id)

        amount = award.get("Award Amount") or 0
        recipient = award.get("Recipient Name") or "?"
        try:
            amount_text = f"${float(amount):,.0f}"
        except (TypeError, ValueError):
            amount_text = str(amount)  # non-numeric amount: show it verbatim

        if internal_id:
            link = f"https://www.usaspending.gov/award/{internal_id}"
        else:
            link = "https://www.usaspending.gov"

        record = opp(
            f"usaspending:{award_id}", "USAspending",
            award.get("Description") or award.get("Award ID") or "(award)",
            award.get("Awarding Agency"), "Award (won)", "",
            award.get("Start Date", ""), award.get("End Date", ""), "",
            link,
            summary=f"Won by {recipient} · {amount_text}",
        )
        record["_recipient"] = recipient
        record["_amount"] = amount or 0
        record["_subagency"] = award.get("Awarding Sub Agency", "")
        found.append(record)
    return found


# Registry mapping a source name (as used by --source and the config "sources" toggles)
# to its connector function. Every connector takes the config dict and returns a list of
# normalized opportunity records.
CONNECTORS = {"sam": fetch_sam, "grants": fetch_grants, "sbir": fetch_sbir,
              "usajobs": fetch_usajobs, "adzuna": fetch_adzuna,
              "himalayas": fetch_himalayas, "jobicy": fetch_jobicy, "themuse": fetch_themuse,
              "greenhouse": fetch_greenhouse, "lever": fetch_lever,
              "federalregister": fetch_federalregister,
              "hn": fetch_hn_hiring, "yc": fetch_yc,
              "remoteok": fetch_remoteok, "remotive": fetch_remotive,
              "findwork": fetch_findwork, "jooble": fetch_jooble,
              "usaspending": fetch_usaspending}


# ----------------------------------------------------------------------------- scoring
def score_opportunity(o: dict, cfg: dict) -> float:
    """Relevance score. Higher = better fit.

    THIS IS THE TUNING LEVER. Adjust the `scoring` weights in config.json to
    change what surfaces (precision vs recall). See README.

    The score is the sum of:
      * ``keyword_hit`` (default 1.0) per configured keyword found, case-insensitively,
        in the title, summary or agency; blank keywords are ignored;
      * ``naics_match`` (default 2.0) when the opportunity's NAICS code is in the profile;
      * ``setaside_match`` (default 1.5) when any configured set-aside appears in the
        opportunity's set-aside text;
      * ``recency`` (default 0.5) when it was posted within ``since_days`` days.

    Returns the total rounded to two decimals.
    """
    w = cfg.get("scoring", {})
    text = f"{o['title']} {o['summary']} {o['agency']}".lower()
    s = 0.0
    # keyword hits ("" is a substring of everything, so blank keywords must not count)
    hits = sum(1 for kw in cfg.get("keywords", []) if kw and str(kw).lower() in text)
    s += hits * float(w.get("keyword_hit", 1.0))
    # NAICS match — opportunity codes are always strings, while JSON configs commonly
    # hold them as integers, so compare on the string form.
    profile_naics = {str(n).strip() for n in cfg.get("naics", [])}
    if o["naics"] and o["naics"] in profile_naics:
        s += float(w.get("naics_match", 2.0))
    # set-aside match (small-business friendly)
    if o["set_aside"] and any(sa.lower() in o["set_aside"].lower() for sa in cfg.get("set_asides", [])):
        s += float(w.get("setaside_match", 1.5))
    # recency bonus (posted within window)
    if _is_recent(o["posted"], int(cfg.get("since_days", 14))):
        s += float(w.get("recency", 0.5))
    return round(s, 2)


def _is_recent(posted: str, days: int) -> bool:
    """Return True when ``posted`` parses as a date no more than ``days`` days old.

    Accepted layouts: ``YYYY-MM-DD``, ``MM/DD/YYYY`` and ``YYYY-MM-DDTHH:MM:SS`` (anything
    after the seconds, such as a UTC offset, is cut off before parsing). The age is
    measured against the local clock. Unparseable or non-string input yields False.
    """
    try:
        # Timestamps keep date+time (19 chars); everything else keeps the date part only.
        stamp = posted[:19] if "T" in posted else posted[:10]
    except TypeError:
        return False
    for layout in ("%Y-%m-%d", "%m/%d/%Y", "%Y-%m-%dT%H:%M:%S"):
        try:
            age = datetime.now() - datetime.strptime(stamp, layout)
            return age.days <= days
        except (ValueError, TypeError):
            pass
    return False


# ----------------------------------------------------------------------------- store
# DDL for the local dedup store: one row per opportunity keyed by uid. The columns mirror
# the normalized record fields plus the computed score and the UTC timestamps at which the
# opportunity was first and most recently seen (written by store_and_diff).
SCHEMA = """
CREATE TABLE IF NOT EXISTS opps (
  uid TEXT PRIMARY KEY, source TEXT, title TEXT, agency TEXT, otype TEXT,
  naics TEXT, posted TEXT, due TEXT, set_aside TEXT, url TEXT, summary TEXT,
  score REAL, first_seen TEXT, last_seen TEXT
);
"""


def store_and_diff(opps: list[dict], db_path: str) -> list[dict]:
    """Upsert all opps; return the ones that are NEW (first seen this run).

    Args:
        opps: Scored opportunity records (each must carry ``uid`` and ``score``).
        db_path: Path of the SQLite database; created, with its table, if missing.

    Returns:
        The records whose uid was not yet in the store, in input order. Known records only
        get their ``last_seen`` timestamp and ``score`` refreshed.

    Raises:
        sqlite3.Error: if the database cannot be opened or written. The whole run is one
        transaction, so a failure leaves the store unchanged.
    """
    con = sqlite3.connect(db_path)
    try:
        # `with con` commits on success and rolls back on error; it does NOT close the
        # connection, hence the surrounding try/finally.
        with con:
            con.execute(SCHEMA)
            now = datetime.now(timezone.utc).isoformat(timespec="seconds")
            new = []
            for o in opps:
                row = con.execute("SELECT uid FROM opps WHERE uid=?", (o["uid"],)).fetchone()
                if row is None:
                    # Columns are named explicitly so the insert keeps working if the
                    # table ever gains extra columns.
                    con.execute(
                        "INSERT INTO opps (uid, source, title, agency, otype, naics, posted, "
                        "due, set_aside, url, summary, score, first_seen, last_seen) "
                        "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                        (o["uid"], o["source"], o["title"], o["agency"], o["otype"], o["naics"],
                         o["posted"], o["due"], o["set_aside"], o["url"], o["summary"],
                         o["score"], now, now),
                    )
                    new.append(o)
                else:
                    con.execute("UPDATE opps SET last_seen=?, score=? WHERE uid=?",
                                (now, o["score"], o["uid"]))
        return new
    finally:
        con.close()


# ----------------------------------------------------------------------------- output
def render_digest(opps: list[dict], title: str) -> str:
    """Render opportunities as a Markdown digest: a heading, a count/timestamp line, then
    one table per source.

    Sources appear in order of their best-scoring record and rows within a source are
    sorted by descending score. Titles longer than 71 characters are cut to 70 plus an
    ellipsis; agency, set-aside and due date are cut to 30, 20 and 10 characters. Cell text
    comes from external APIs, so line breaks are flattened to spaces and ``|`` is escaped
    as ``\\|`` — either would otherwise break the table row.

    Args:
        opps: Scored opportunity records.
        title: Text of the top-level heading.

    Returns:
        The digest as a single newline-joined string.
    """
    def _flat(value) -> str:
        # A table row must stay on one physical line.
        return " ".join(str(value).splitlines())

    def _esc(text: str) -> str:
        # An unescaped pipe would start a new column.
        return text.replace("|", "\\|")

    noun = "opportunity" if len(opps) == 1 else "opportunities"
    lines = [f"# {title}", "", f"_{len(opps)} {noun} · generated {datetime.now():%Y-%m-%d %H:%M}_", ""]
    by_src: dict[str, list[dict]] = {}
    for o in sorted(opps, key=lambda x: -x["score"]):
        by_src.setdefault(o["source"], []).append(o)
    for src, items in by_src.items():
        lines.append(f"## {src} ({len(items)})")
        lines.append("")
        lines.append("| Score | Title | Agency | NAICS | Set-aside | Due | Link |")
        lines.append("|---|---|---|---|---|---|---|")
        for o in items:
            name = _flat(o["title"])
            if len(name) > 71:
                name = name[:70] + "…"
            cells = [
                str(o["score"]),
                _esc(name),
                _esc(_flat(o["agency"])[:30]),
                _esc(_flat(o["naics"])),
                _esc(_flat(o["set_aside"])[:20]),
                _esc(_flat(o["due"])[:10]),
                _esc(_flat(o["url"])),
            ]
            lines.append("| " + " | ".join(cells) + " |")
        lines.append("")
    return "\n".join(lines)


# ----------------------------------------------------------------------------- main
def main(argv=None) -> int:
    """Command-line entry point: fetch, score, store, and write the Markdown digest.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Process exit code — 0 on success, 1 when the database or digest cannot be written,
        2 when the config file is unreadable or is not a JSON object.
    """
    p = argparse.ArgumentParser(description="Source government/defense/grant contract opportunities.")
    p.add_argument("--config", default=DEFAULT_CONFIG)
    p.add_argument("--db", default=DEFAULT_DB)
    p.add_argument("--digest-dir", default=DEFAULT_DIGEST_DIR)
    p.add_argument("--source", help="comma list: sam,grants,sbir (default: config)")
    p.add_argument("--min-score", type=float, default=None)
    p.add_argument("--since-days", type=int, default=None)
    p.add_argument("--all", action="store_true", help="show all matches, not just new")
    p.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
    args = p.parse_args(argv)

    try:
        # utf-8-sig: JSON is UTF-8 regardless of the platform locale, and editors on
        # Windows may prepend a BOM that json would otherwise reject.
        with open(args.config, encoding="utf-8-sig") as f:
            cfg = json.load(f)
    except (OSError, ValueError) as e:  # ValueError covers JSON and decoding errors
        print(f"error: cannot read config {args.config}: {e}", file=sys.stderr)
        return 2
    if not isinstance(cfg, dict):
        print(f"error: cannot read config {args.config}: top level must be a JSON object",
              file=sys.stderr)
        return 2
    if args.since_days is not None:
        cfg["since_days"] = args.since_days
    min_score = args.min_score if args.min_score is not None else float(cfg.get("min_score", 1.0))
    enabled = (args.source.split(",") if args.source
               else [s for s, on in cfg.get("sources", {}).items() if on])

    all_opps: list[dict] = []
    for raw_name in enabled:
        name = raw_name.strip()
        if not name:
            continue  # tolerate stray commas in --source
        fn = CONNECTORS.get(name)
        if not fn:
            warn(f"unknown source '{name}'")
            continue
        try:
            got = fn(cfg)
        except Exception as e:  # boundary: every source is best-effort
            # An unexpected payload shape from one API must not discard the results
            # already gathered from the others; report it and move on.
            warn(f"{name}: connector failed ({type(e).__name__}: {e}) — skipping")
            continue
        print(f"  {name}: {len(got)} fetched", file=sys.stderr)
        all_opps.extend(got)

    for o in all_opps:
        o["score"] = score_opportunity(o, cfg)
    matches = [o for o in all_opps if o["score"] >= min_score]

    try:
        # The store is updated even with --all, so later runs still dedupe correctly.
        new = store_and_diff(matches, args.db)
        show = matches if args.all else new

        os.makedirs(args.digest_dir, exist_ok=True)
        label = "All matches" if args.all else "New opportunities"
        digest = render_digest(show, f"{label} — {datetime.now():%Y-%m-%d}")
        out_path = os.path.join(args.digest_dir, f"digest-{datetime.now():%Y%m%d-%H%M}.md")
        # Explicit UTF-8: titles are arbitrary Unicode and the locale default may not
        # be able to encode them.
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(digest)
    except (OSError, sqlite3.Error) as e:
        print(f"error: {e}", file=sys.stderr)
        return 1

    print(f"\n{len(all_opps)} fetched · {len(matches)} above score {min_score} · "
          f"{len(new)} new · digest → {out_path}")
    for o in sorted(show, key=lambda x: -x["score"])[:10]:
        print(f"  [{o['score']:>4}] {o['source']:<6} {o['title'][:64]}")
    return 0


# Script entry point: exit with main()'s return code; Ctrl-C exits with status 1
# instead of printing a traceback.
if __name__ == "__main__":
    try:
        sys.exit(main())
    except KeyboardInterrupt:
        sys.exit(1)
