# /var/www/html/bot/scripts/kepco_history_apply.py
from __future__ import annotations
import os, re, io, argparse, logging, requests
from datetime import datetime, timedelta
from typing import Any, Dict, List, Tuple, Optional
from pypdf import PdfReader

from app.services.supabase_service import get_client
from scripts.kepco_history_fill import parse_people  # ← 정적 임포트로 단순화

LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper()
logger = logging.getLogger("kepco_history_apply")
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter("[%(levelname)s] %(message)s"))
logger.addHandler(handler)
logger.setLevel(LOG_LEVEL)

# ---------- 공통 유틸 ----------
VACANCY_PAT = re.compile(r"^\s*\(?공석\)?\s*$")
def _is_vacant_name(name: Optional[str]) -> bool:
    if not name:
        return False
    s = str(name).strip()
    if VACANCY_PAT.match(s):
        return True
    return s in {"결원", "-"}

def _normalize_name(s: Optional[str]) -> str:
    return re.sub(r"\s+", "", (s or "")).strip()

def _key(dept: str, name: Optional[str], start: Optional[str]) -> Tuple[str,str,Optional[str]]:
    return (dept, _normalize_name(name), start or None)

def _to_iso(posted_at: str) -> str:
    s = posted_at.strip().replace(" ", "")
    if re.match(r"^\d{4}-\d{2}-\d{2}$", s):
        return s
    m = re.match(r"^(\d{4})\.(\d{1,2})\.(\d{1,2})$", s)
    if m:
        y, mo, d = int(m.group(1)), int(m.group(2)), int(m.group(3))
        return f"{y:04d}-{mo:02d}-{d:02d}"
    raise ValueError(f"posted_at 형식 불일치: {posted_at}")

def _iso_prev_day(iso_ymd: str) -> str:
    dt = datetime.strptime(iso_ymd, "%Y-%m-%d")
    prev = dt - timedelta(days=1)
    return prev.strftime("%Y-%m-%d")

def _to_kor_date(iso_ymd: str) -> str:
    dt = datetime.strptime(iso_ymd, "%Y-%m-%d")
    return f"{dt.year}년 {dt.month:02d}월 {dt.day:02d}일"

def _download_pdf(url: str) -> str:
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    with io.BytesIO(r.content) as bio:
        reader = PdfReader(bio)
        pages = []
        for p in reader.pages:
            try:
                pages.append(p.extract_text() or "")
            except Exception:
                pages.append("")
    text = "\n".join(pages).replace("\r", "")
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"[ ]*\n[ ]*", "\n", text)
    text = re.sub(r"\n{2,}", "\n", text)
    return text.strip()

# ---------- Supabase ----------
def sb():
    return get_client()

def fetch_sources_in_order(client) -> List[Dict[str, Any]]:
    res = client.table("kepco_id_test") \
        .select("department, posted_at, pdf_url") \
        .order("posted_at", desc=False) \
        .limit(100000) \
        .execute()
    return res.data or []

def fetch_prev_current_by_dept(client, dept: str) -> List[Dict[str, Any]]:
    res = client.table("kepco_history") \
        .select("id, department, name, position, task, start, end, actual_end, posted_at, pdf_url") \
        .eq("department", dept) \
        .eq("current", True) \
        .execute()
    return res.data or []

# ---------- 핵심 처리 ----------
def process_one_department(client, dept: str, posted_at_raw: str, pdf_url: str, merged_text: str) -> None:
    posted_iso = _to_iso(posted_at_raw)
    prev_iso   = _iso_prev_day(posted_iso)
    prev_kor   = _to_kor_date(prev_iso)

    # (A) 직전 current 스냅샷
    prev_rows = fetch_prev_current_by_dept(client, dept)
    prev_by_key: Dict[Tuple[str,str,Optional[str]], Dict[str,Any]] = {
        _key(dept, r.get("name"), r.get("start")): r for r in prev_rows
    }
    prev_keys = set(prev_by_key.keys())

    # (B) 신규 파싱
    people = parse_people(merged_text, department_hint=dept)
    new_by_key: Dict[Tuple[str,str,Optional[str]], Dict[str,Any]] = {}
    for p in people:
        k = _key(dept, p.get("name"), p.get("start"))
        # 같은 키가 2번 이상 나오면 첫 항목만 사용
        if k not in new_by_key:
            new_by_key[k] = p
    new_keys = set(new_by_key.keys())

    # (C) 집합 비교
    to_close_keys = prev_keys - new_keys
    keep_keys     = prev_keys & new_keys
    insert_keys   = new_keys - prev_keys

    # (D) 변경 탐지: end 변경(연장) + task 변경(직책 교체)
    extend_keys: List[Tuple[str,str,Optional[str]]] = []
    task_change_keys: List[Tuple[str,str,Optional[str]]] = []

    def _norm_task(x: Optional[str]) -> str:
        return re.sub(r"\s+", "", (x or "")).strip()

    for k in keep_keys:
        prev_end = (prev_by_key[k] or {}).get("end")
        new_end  = (new_by_key[k] or {}).get("end")
        if (prev_end or "") != (new_end or ""):
            extend_keys.append(k)
            continue
        prev_task = (prev_by_key[k] or {}).get("task") or ""
        new_task  = (new_by_key[k] or {}).get("task") or ""
        if _norm_task(prev_task) != _norm_task(new_task):
            task_change_keys.append(k)

    # (E) 처리 순서
    # 1) 직전 current 중 이번 공시에 사라진 인원 → actual_end = posted_at-1일(국문)
    if to_close_keys:
        ids_to_close = [prev_by_key[k]["id"] for k in to_close_keys if prev_by_key.get(k)]
        for i in range(0, len(ids_to_close), 1000):
            client.table("kepco_history").update({"actual_end": prev_kor}) \
                  .in_("id", ids_to_close[i:i+1000]).is_("actual_end", None).execute()

    # 2) 부서 내 current 전부 false
    client.table("kepco_history").update({"current": False}) \
          .eq("department", dept).eq("current", True).execute()

    # 3) 유지 중 변경 없는 것만 재활성화(current=true, 공시링크 최신화)
    keep_same = [k for k in keep_keys if (k not in extend_keys and k not in task_change_keys)]
    if keep_same:
        ids_to_keep = [prev_by_key[k]["id"] for k in keep_same if prev_by_key.get(k)]
        for i in range(0, len(ids_to_keep), 1000):
            client.table("kepco_history").update({
                "current": True,
                "posted_at": posted_at_raw,
                "pdf_url": pdf_url,
            }).in_("id", ids_to_keep[i:i+1000]).execute()

    # 3-1) task 변경된 '이전행' old_career=true 표시
    if task_change_keys:
        ids_old = [prev_by_key[k]["id"] for k in task_change_keys if prev_by_key.get(k)]
        for i in range(0, len(ids_old), 1000):
            client.table("kepco_history").update({"old_career": True}) \
                  .in_("id", ids_old[i:i+1000]).execute()

    # 4) 신규 삽입 (insert_keys)
    for k in insert_keys:
        p = new_by_key[k]
        payload = {
            "department": dept,
            "name": p.get("name"),
            "task": p.get("task"),
            "position": p.get("position"),
            "gender": p.get("gender"),
            "start": p.get("start"),
            "end": p.get("end"),
            "career": p.get("career") or [],
            "posted_at": posted_at_raw,
            "pdf_url": pdf_url,
            "current": True,
            "extended": False,
            "old_career": False,
        }
        # 공석은 항상 INSERT(중복 허용)
        if _is_vacant_name(p.get("name")):
            client.table("kepco_history").insert(payload).execute()
            continue
        # 중복: (dept, name, start)
        q = client.table("kepco_history").select("id").eq("department", dept).eq("name", p.get("name"))
        q = q.is_("start", None) if p.get("start") is None else q.eq("start", p.get("start"))
        exist = q.limit(1).execute().data or []
        if not exist:
            client.table("kepco_history").insert(payload).execute()

    # 5) 연장 삽입 (extend_keys)
    for k in extend_keys:
        p = new_by_key[k]
        payload = {
            "department": dept,
            "name": p.get("name"),
            "task": p.get("task"),
            "position": p.get("position"),
            "gender": p.get("gender"),
            "start": p.get("start"),
            "end": p.get("end"),
            "career": p.get("career") or [],
            "posted_at": posted_at_raw,
            "pdf_url": pdf_url,
            "current": True,
            "extended": True,
            "old_career": False,
        }
        client.table("kepco_history").insert(payload).execute()

    # 6) task 교체 삽입 (task_change_keys)
    for k in task_change_keys:
        p = new_by_key[k]
        payload = {
            "department": dept,
            "name": p.get("name"),
            "task": p.get("task"),
            "position": p.get("position"),
            "gender": p.get("gender"),
            "start": p.get("start"),
            "end": p.get("end"),
            "career": p.get("career") or [],
            "posted_at": posted_at_raw,
            "pdf_url": pdf_url,
            "current": True,
            "extended": False,
            "old_career": False,
        }
        client.table("kepco_history").insert(payload).execute()

    logger.info(
        "dept=%s processed: close=%d, keep=%d, insert=%d, extend=%d, task_change=%d",
        dept, len(to_close_keys), len(keep_same), len(insert_keys), len(extend_keys), len(task_change_keys)
    )

def process_all(debug: bool = False, sleep_sec: float = 0.0):
    client = sb()
    sources = fetch_sources_in_order(client)
    logger.info("sources: %d", len(sources))

    for i, src in enumerate(sources, 1):
        dept = (src.get("department") or "").strip()
        posted_at = (src.get("posted_at") or "").strip()
        pdf_url = (src.get("pdf_url") or "").strip()
        if not dept or not posted_at or not pdf_url:
            logger.info("[%d/%d] skip (dept/posted_at/pdf_url 부족): %s", i, len(sources), dept)
            continue
        try:
            merged = _download_pdf(pdf_url)
            process_one_department(client, dept, posted_at, pdf_url, merged)
        except Exception as e:
            logger.exception("[%d/%d] process failed (dept=%s, posted_at=%s): %s",
                             i, len(sources), dept, posted_at, e)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()
    if args.debug:
        logger.setLevel("DEBUG")
    process_all(debug=args.debug)

if __name__ == "__main__":
    main()
