import html
import re
import time
from typing import Dict, Iterator, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

from app.services.supabase_service import get_supabase
from app.services.notify import broadcast_html  # reuse the existing notification helper

BASE = "https://www.motie.go.kr"
LIST_URL = f"{BASE}/kor/article/ATCL6e90bb9de"

# Extract just the numeric article id from onclick="article.view('4475236');"
RE_ARTICLE_ID = re.compile(r"article\.view\('(\d+)'\)")

GOVBOT_UA = "GovBot/1.0 (+https://work.jjickjjicks.com)"

def _req(url: str, ua: str, params: Optional[dict] = None) -> requests.Response:
    headers = {
        "User-Agent": ua or GOVBOT_UA,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    r = requests.get(url, headers=headers, params=params, timeout=15)
    r.raise_for_status()
    return r

def _clean(s: str) -> str:
    # Collapse runs of whitespace into single spaces (like n8n's Trim / Clean Up Text)
    return " ".join((s or "").split())

def _to_iso_date(s: Optional[str]) -> Optional[str]:
    if not s:
        return None
    t = _clean(s).replace(".", "-")
    m = re.search(r"(\d{4})-(\d{2})-(\d{2})", t)
    return m.group(0) if m else None

def crawl_motie(ua: str, page: int = 1) -> Iterator[Dict]:
    """
    목록 파싱:
      title: td:nth-of-type(3) Text
      date : td:nth-of-type(5) Text  (게시일)
      fileUrl: a[href^="/attach/down"] -> href
      link: a[onclick^="article.view"] -> onclick (정규식으로 articleId)
    """
    params = {
        "mno": "",
        "pageIndex": page,
        "rowPageC": 0,
        "displayAuthor": "",
        "searchCategory": 3,  # 인사 카테고리
        "schClear": "on",
        "startDtD": "",
        "endDtD": "",
        "searchCondition": 1,
        "searchKeyword": "",
    }
    res = _req(LIST_URL, ua, params=params)
    soup = BeautifulSoup(res.text, "html.parser")

    for tr in soup.select("table tbody tr"):
        # title/date
        title_el = tr.select_one("td:nth-of-type(3)")
        date_el  = tr.select_one("td:nth-of-type(5)")
        title = _clean(title_el.get_text(" ", strip=True)) if title_el else ""
        date_text  = _clean(date_el.get_text(" ", strip=True)) if date_el else ""
        posted_at = _to_iso_date(date_text)

        # fileUrl (may or may not be present)
        file_a = tr.select_one('a[href^="/attach/down"]')
        file_url = urljoin(BASE, file_a["href"]) if file_a and file_a.has_attr("href") else None

        # Pull the articleId out of the onclick handler
        view_a = tr.select_one('a[onclick^="article.view"]')
        onclick = view_a.get("onclick") if view_a else None
        if not onclick:
            continue
        m = RE_ARTICLE_ID.search(onclick)
        if not m:
            continue
        article_id = m.group(1)  # str

        # Detail-page link
        detail_url = f"{LIST_URL}/{article_id}/view?"

        yield {
            "articleId": article_id,
            "title": title or "산업부 인사발령",  # fallback title: "MOTIE personnel announcement"
            "posted_at": posted_at,  # posting date (YYYY-MM-DD) or None
            "url": detail_url,
            "file_url": file_url,  # attachment download URL, if present
        }

    # Polite delay so consecutive page fetches don't hammer the server
    time.sleep(0.6)
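
# Example: crawling several pages (hypothetical sweep; run() below keeps the
# original single-page behavior):
#   for page in (1, 2, 3):
#       for item in crawl_motie(GOVBOT_UA, page=page):
#           ...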

def run() -> int:
    """
    신규 건을 motie_id(id,title,posted_at)로 저장하고 텔레그램 알림 발송.
    return: 신규 건 수
    """
    ua = GOVBOT_UA
    items = list(crawl_motie(ua, page=1))
    if not items:
        return 0

    sb = get_supabase()
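    # Look up which of the scraped ids are already stored, so only genuinely
    # new articles get inserted and announced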
    ids = [it["articleId"] for it in items]
    existing = sb.table("motie_id").select("id").in_("id", ids).execute().data or []
    exist = {r["id"] for r in existing}

    new_items = [it for it in items if it["articleId"] not in exist]
    if not new_items:
        return 0

    # DB insert
    payload = [{
        "id": it["articleId"],
        "title": it.get("title") or "산업부 인사발령",
        "posted_at": it.get("posted_at")  # date or None
    } for it in new_items]
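    # Chunked insert: 500 rows per request to keep payloads comfortably small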
    for i in range(0, len(payload), 500):
        sb.table("motie_id").insert(payload[i:i+500]).execute()

    # Notify (include the posting date when available)
    for it in new_items:
        url = it["url"]
        title = it.get("title") or "산업부 인사발령"
        posted_at = it.get("posted_at")
        lines = ["산업부 인사발령입니다.", ""]  # "MOTIE personnel announcement."
        if posted_at:
            lines.append(f"게시일: {posted_at}")  # "Posting date: ..."
        # Render the title as a link; escape it so <, >, & in titles don't break the HTML
        lines.append(f'<a href="{url}">[{html.escape(title)}]</a>')
        broadcast_html("\n".join(lines))

    return len(new_items)
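
# Manual-run sketch for local testing. Assumption: executing this module
# directly is acceptable in this repo; the production scheduler keeps
# calling run() itself.
if __name__ == "__main__":
    print(f"new entries: {run()}")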
