# /var/www/html/bot/app/crawler/moef_org_pipeline.py
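"""Crawler pipeline for the MOEF (Ministry of Economy and Finance) staff directory.

Flow: fetch the org list page, parse each org detail table, store raw HTML and
staged rows in Supabase, validate row counts against the current snapshot, then
apply an SCD Type 2 merge into the history table.
"""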
import os, re, time, uuid, hashlib, logging
from typing import List, Dict, Tuple
from datetime import datetime, timezone
import requests
from bs4 import BeautifulSoup
from app.services.supabase_service import get_client

LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper()
logging.basicConfig(level=LOG_LEVEL, format="[%(levelname)s] %(message)s")
logger = logging.getLogger("moef_org_pipeline")

BASE = "https://www.moef.go.kr"
LIST_URL = (BASE + "/mi/orgnzt/org.do?bbsId=MOSFBBS_000000000097&menuNo=9040100")
SEL_LINKS = 'a[href*="orgId="]'
HEADERS = {"User-Agent": "GovBot/1.0 (+https://work.jjickjjicks.com)"}
PHONE_RE = re.compile(r"\d{2,4}-\d{3,4}-\d{4}")

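# Text normalization helpers: collapse whitespace runs and non-breaking spaces.
# _norm is currently an alias of _clean, used where hashing code normalizes fields.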
def _clean(s: str | None) -> str: return " ".join((s or "").replace("\u00A0"," ").split())
def _norm(s: str) -> str: return _clean(s)

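# key_hash identifies a person/post (name|position|department); row_hash also covers
# task and phone, so it changes whenever any displayed field changes.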
def _key_hash(name: str, position: str, department: str) -> str:
    base = f"{_norm(name)}|{_norm(position)}|{_norm(department)}"
    return hashlib.sha256(base.encode()).hexdigest()

def _row_hash(name: str, position: str, department: str, task: str, phone: str) -> str:
    base = f"{_norm(name)}|{_norm(position)}|{_norm(department)}|{_norm(task)}|{_norm(phone)}"
    return hashlib.sha256(base.encode()).hexdigest()

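# Name cells sometimes embed the position (e.g. "Name Title" or "Name(Title)");
# strip any trailing position token so the name column holds only the name.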
def strip_position_from_name(name_raw: str, position_raw: str) -> str:
    n = _clean(name_raw); p = _clean(position_raw)
    if not n or not p: return n
    original = n
    base = re.split(r"[ \t(\[/·\-]", p)[0].strip()
    candidates = [c for c in {p, base} if c]
    for cand in candidates:
        for suf in (cand, " " + cand):
            if n.endswith(suf): n = _clean(n[: -len(suf)]).rstrip(" -·/–—")
        n = re.sub(rf"\s*\(\s*{re.escape(cand)}\s*\)\s*$", "", n).rstrip(" -·/–—")
        n = re.sub(rf"\s*{re.escape(cand)}\s*\([^)]+\)\s*$", "", n).rstrip(" -·/–—")
        if n.endswith(cand): n = _clean(n[: -len(cand)]).rstrip(" -·/–—")
    n = _clean(n)
    return n if n else original

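# GET with retries and exponential backoff; re-raises the last error if all attempts fail.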
def _retry_get(sess: requests.Session, url: str, timeout=30, tries=3, backoff=0.7) -> requests.Response:
    last: Exception | None = None
    for i in range(tries):
        try:
            r = sess.get(url, headers=HEADERS, timeout=timeout)
            r.raise_for_status()
            return r
        except Exception as e:
            last = e
            sleep = backoff * (2 ** i)  # exponential backoff between attempts
            logger.warning("GET retry %d/%d in %.1fs: %s", i + 1, tries, sleep, url)
            time.sleep(sleep)
    raise last if last is not None else RuntimeError(f"no attempts made for {url}")

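# Collect unique organisation-detail links (href containing "orgId=") from the list page.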
def fetch_org_links(sess: requests.Session) -> List[Dict]:
    r = _retry_get(sess, LIST_URL)
    soup = BeautifulSoup(r.text, "html.parser")
    seen, out = set(), []
    for a in soup.select(SEL_LINKS):
        href = (a.get("href","") or "").strip()
        if not href or "orgId=" not in href: continue
        link = href if href.startswith("http") else (BASE + href if href.startswith("/") else BASE + "/" + href)
        if link in seen: continue
        seen.add(link)
        out.append({"link": link, "title": a.get("title","")})
    return out

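# Parse one detail page: each data row has five cells
# (department, name, position, phone, responsibility); prefer a tel: link for the phone.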
def parse_org_detail(sess: requests.Session, link: str) -> Tuple[List[Dict], str, str]:
    r = _retry_get(sess, link); html = r.text
    soup = BeautifulSoup(html, "html.parser")
    rows: List[Dict] = []
    for tr in soup.select("table tr"):
        if tr.find("th"): continue
        tds = tr.find_all("td")
        if len(tds)<5: continue
        department     = _clean(tds[0].get_text(strip=True))
        name_raw       = _clean(tds[1].get_text(strip=True))
        position_raw   = _clean(tds[2].get_text(strip=True))
        phone          = _clean(tds[3].get_text(strip=True))
        responsibility = _clean(tds[4].get_text(strip=True))
        tel_a = tds[3].select_one('a[href^="tel:"]')
        if tel_a and tel_a.has_attr("href"): phone = tel_a["href"].replace("tel:","").strip()
        name = strip_position_from_name(name_raw, position_raw)
        position = position_raw
        if not (name or position or department or responsibility or phone): continue
        rows.append({"department":department, "name":name, "position":position, "phone":phone, "task":responsibility})
    return rows, link, html

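# Row count of the current table; falls back to 0 on error, so only the absolute
# minimum check applies in _validate.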
def _count_cur(sb) -> int:
    try:
        res = sb.table("moef_org_cur").select("key_hash", count="exact").execute()
        return res.count or 0
    except Exception:
        return 0

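# Sanity checks against partial crawls: an absolute floor plus a ratio floor
# relative to the current row count.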
def _validate(stg_count:int, cur_count:int, *, min_abs:int, min_ratio:float) -> Tuple[bool,str]:
    if stg_count < min_abs: return False, f"too_few_rows: {stg_count} < {min_abs}"
    if cur_count>0 and stg_count < int(cur_count*min_ratio):
        return False, f"ratio_drop: {stg_count} < {int(cur_count*min_ratio)} (cur={cur_count})"
    return True, "ok"

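# Load current and staged rows keyed by key_hash for the SCD2 diff.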
def _load_current_map(sb) -> Dict[str, Dict]:
    rows = (sb.table("moef_org_cur").select("*").limit(50000).execute().data) or []
    return { r["key_hash"]: r for r in rows }

def _load_stg_map(sb, run_id:str) -> Dict[str, Dict]:
    rows = (sb.table("moef_org_stg").select("*").eq("run_id", run_id).limit(50000).execute().data) or []
    return { r["key_hash"]: r for r in rows }

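# SCD Type 2 merge: close history rows whose key vanished or whose row_hash changed,
# then insert the new versions as current.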
def _apply_scd2(sb, run_id:str):
    cur_map = _load_current_map(sb)
    stg_map = _load_stg_map(sb, run_id)
    to_close, to_add = [], []
    now_iso = datetime.now(timezone.utc).isoformat()
    # Keys missing from staging: the person/post disappeared, so close their current row.
    for k in cur_map.keys():
        if k not in stg_map: to_close.append(k)
    for k, v in stg_map.items():
        changed = (k in cur_map) and (cur_map[k]["row_hash"] != v["row_hash"])
        if (k not in cur_map) or changed:
            if changed:
                # Close the superseded version too, so each key has at most one current row.
                to_close.append(k)
            to_add.append({
                "key_hash": k, "department": v["department"], "name": v["name"], "position": v["position"],
                "task": v.get("task"), "phone": v.get("phone"),
                "row_hash": v["row_hash"], "valid_from": now_iso, "valid_to": None, "is_current": True
            })
    if to_close:
        sb.table("moef_org_hist").update({"valid_to": now_iso, "is_current": False}).in_("key_hash", to_close).eq("is_current", True).execute()
    if to_add:
        sb.table("moef_org_hist").insert(to_add).execute()

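# One full pipeline run: store raw HTML, stage parsed rows, validate counts,
# then apply the SCD2 merge. Returns a small status dict for the caller.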
def run_once(sleep_sec: float = 0.2, min_abs: int = 800, min_ratio: float = 0.7) -> Dict:
    sb = get_client()
    run_id = str(uuid.uuid4())
    sb.table("crawler_run").insert({"id": run_id, "target":"moef_org", "status":"running"}).execute()

    pages = 0; collected = 0
    try:
        with requests.Session() as sess:
            # Save the list page HTML
            list_r = _retry_get(sess, LIST_URL)
            sb.table("moef_org_raw").upsert({
                "run_id": run_id, "kind": "list", "page": 0, "url": LIST_URL, "html": list_r.text
            }).execute()

            links = fetch_org_links(sess)
            for i, item in enumerate(links, 1):
                rows, url, html = parse_org_detail(sess, item["link"])
                sb.table("moef_org_raw").upsert({
                    "run_id": run_id, "kind": "detail", "page": i, "url": url, "html": html
                }).execute()
                if rows:
                    payload = []
                    for r in rows:
                        k = _key_hash(r["name"], r["position"], r["department"])
                        h = _row_hash(r["name"], r["position"], r["department"], r.get("task",""), r.get("phone",""))
                        payload.append({"run_id": run_id, **r, "key_hash": k, "row_hash": h})
                    sb.table("moef_org_stg").upsert(payload).execute()
                collected += len(rows); pages += 1; time.sleep(sleep_sec)

        sb.table("moef_org_snapshot").insert({"run_id": run_id, "rows": collected, "status":"collected", "note": f"pages={pages}"}).execute()
        cur_count = _count_cur(sb)
        ok, reason = _validate(collected, cur_count, min_abs=min_abs, min_ratio=min_ratio)
        if not ok:
            sb.table("crawler_run").update({"status":"failed","finished_at":datetime.now(timezone.utc).isoformat(),"pages":pages,"rows":collected,"fail_reason":reason}).eq("id", run_id).execute()
            sb.table("moef_org_snapshot").update({"status":"failed","note":reason}).eq("run_id", run_id).execute()
            logger.error("[moef_org] validation failed: %s", reason)
            return {"run_id": run_id, "status":"failed", "reason": reason}

        _apply_scd2(sb, run_id)
        sb.table("moef_org_snapshot").update({"status":"passed"}).eq("run_id", run_id).execute()
        sb.table("crawler_run").update({"status":"passed","finished_at":datetime.now(timezone.utc).isoformat(),"pages":pages,"rows":collected}).eq("id", run_id).execute()
        logger.info("[moef_org] run passed: run_id=%s, rows=%d, pages=%d", run_id, collected, pages)
        return {"run_id": run_id, "status":"passed", "rows": collected, "pages": pages}

    except Exception as e:
        sb.table("crawler_run").update({"status":"aborted","finished_at":datetime.now(timezone.utc).isoformat(),"pages":pages,"rows":collected,"fail_reason":str(e)}).eq("id", run_id).execute()
        logger.exception("[moef_org] run aborted: %s", e)
        return {"run_id": run_id, "status":"aborted", "error": str(e)}

if __name__ == "__main__":
    print(run_once())
