# /var/www/html/govbot/app/crawlers/moef.py
import html
import re
from typing import Dict, Iterator, Optional
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup

from app.services.supabase_service import get_supabase
from app.services.notify import broadcast_html  # uses the existing notification util

BASE = "https://www.moef.go.kr"
LIST_URL = f"{BASE}/nw/notice/hr.do"

# javascript:fn_egov_select('menuNo','bbsId','postId')
RE_HREF = re.compile(r"fn_egov_select\('(\d+)','([A-Za-z0-9_]+)','([A-Za-z0-9_]+)'\)")
RE_BRACKET_TAG = re.compile(r"^\s*\[([^\]]+)\]\s*(.*)$")
RE_DATE = re.compile(r"(\d{4}[.-]\d{2}[.-]\d{2})")
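
# Illustrative matches (hypothetical values, not taken from the live site):
#   RE_HREF:        "javascript:fn_egov_select('4050300','MOSFBBS_000000000028','236945')"
#                   -> groups ('4050300', 'MOSFBBS_000000000028', '236945')
#   RE_BRACKET_TAG: "[파견연장] 2025년 인사발령" -> ("파견연장", "2025년 인사발령")
#   RE_DATE:        "게시일 2025.01.06" -> "2025.01.06"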

GOVBOT_UA = "GovBot/1.0 (+https://work.jjickjjicks.com)"

def _req(url: str, ua: str, params: Optional[dict] = None) -> requests.Response:
    headers = {
        "User-Agent": ua or GOVBOT_UA,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    r = requests.get(url, headers=headers, params=params, timeout=20)
    r.raise_for_status()
    return r

def _clean(s: str) -> str:
    return " ".join((s or "").split())

def _to_iso_date(s: Optional[str]) -> Optional[str]:
    if not s:
        return None
    t = _clean(s).replace(".", "-")
    m = re.search(r"\d{4}-\d{2}-\d{2}", t)
    return m.group(0) if m else None
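
# e.g. _to_iso_date("2025.01.06") -> "2025-01-06", and a longer cell such as
# "공고 2025-01-06 조회 123" also yields "2025-01-06" (illustrative inputs).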

def _extract_tag_from_state(container) -> Optional[str]:
    """
    컨테이너 내부에서 class 가 state 로 시작하는 span 탐색 (예: <span class="state1">파견연장</span>)
    """
    if not container:
        return None
    for el in container.find_all("span"):
        classes = el.get("class") or []
        if any(str(c).startswith("state") for c in classes):
            text = _clean(el.get_text(" ", strip=True))
            if text:
                return text
    return None

def _extract_tag_and_title_by_bracket(title_text: str) -> tuple[Optional[str], str]:
    m = RE_BRACKET_TAG.match(title_text or "")
    if m:
        return m.group(1), m.group(2).strip()
    return None, title_text
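
# e.g. _extract_tag_and_title_by_bracket("[전보] 실·국장 인사") -> ("전보", "실·국장 인사"),
# while an untagged title passes through unchanged as (None, title). (Illustrative input.)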

def crawl_moef(ua: str, page_index: Optional[int] = None) -> Iterator[Dict]:
    """
    목록: /nw/notice/hr.do?menuNo=4050300
    - a[href*="javascript:fn_egov_select"] ...
    - 태그: <span class="stateN">텍스트</span> 우선, 없으면 [태그]제목 패턴
    - 게시일: 컨테이너 텍스트에서 YYYY-MM-DD / YYYY.MM.DD 추출
    - page_index 를 넘기면 해당 페이지를 조회. None이면 1페이지(기본 동작 호환)
    """
    params = {"menuNo": "4050300"}
    if page_index is not None:
        params["pageIndex"] = page_index

    res = _req(LIST_URL, ua, params=params)
    soup = BeautifulSoup(res.text, "html.parser")

    for a in soup.select('a[href*="javascript:fn_egov_select"]'):
        href = a.get("href") or ""
        m = RE_HREF.search(href)
        if not m:
            continue
        menu_no, bbs_id, post_id = m.group(1), m.group(2), m.group(3)
        item_id = f"{bbs_id}-{post_id}"

        # Title: prefer the link text of the nearest preceding <h3> heading
        title_text = ""
        h3 = a.find_previous("h3")
        if h3:
            link = h3.find("a")
            if link:
                title_text = _clean(link.get_text(" ", strip=True))
        if not title_text:
            title_text = _clean(a.get_text(" ", strip=True)) or "기재부 인사발령"

        # Pick the surrounding container used for tag/date extraction
        container = a.find_parent(["li", "div", "article", "tr"]) or a.parent

        # Tag: prefer the stateN span
        tag = _extract_tag_from_state(container)
        # Fallback: a "[tag] title" pattern embedded in the title text
        if not tag:
            tag, title_text = _extract_tag_and_title_by_bracket(title_text)

        # Extract the posted date
        posted_at = None
        if container:
            txt = container.get_text(" ", strip=True)
            d = RE_DATE.search(txt or "")
            if d:
                posted_at = _to_iso_date(d.group(1))

        qs = {
            "menuNo": menu_no,
            "searchBbsId": bbs_id,
            "searchNttId": post_id,
            "searchBbsId1": bbs_id,
            "searchNttId1": post_id,
        }
        detail_url = f"{BASE}/nw/notice/hrDetail.do?{urlencode(qs)}"

        yield {
            "item_id": item_id,
            "menuNo": menu_no,
            "bbsId": bbs_id,
            "postId": post_id,
            "title": title_text,
            "tag": tag,
            "posted_at": posted_at,
            "url": detail_url,
        }
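
# A yielded record looks roughly like this (illustrative values only):
# {
#     "item_id": "MOSFBBS_000000000028-236945",
#     "menuNo": "4050300",
#     "bbsId": "MOSFBBS_000000000028",
#     "postId": "236945",
#     "title": "2025년 인사발령",
#     "tag": "파견연장",
#     "posted_at": "2025-01-06",
#     "url": "https://www.moef.go.kr/nw/notice/hrDetail.do?...",
# }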

def run() -> int:
    """
    신규 건을 moef_id(id,bbsId,postId,title,tag,posted_at)로 저장하고 텔레그램 알림 발송.
    return: 신규 건 수
    """
    ua = GOVBOT_UA
    items = list(crawl_moef(ua))
    if not items:
        return 0

    sb = get_supabase()
    ids = [it["item_id"] for it in items]
    existing = sb.table("moef_id").select("id").in_("id", ids).execute().data or []
    existing_ids = {r["id"] for r in existing}

    new_items = [it for it in items if it["item_id"] not in existing_ids]
    if not new_items:
        return 0

    # Insert into the DB in chunks of 500 rows per request
    payload = [{
        "id": it["item_id"],
        "bbsId": it["bbsId"],
        "postId": it["postId"],
        "title": it.get("title") or "기재부 인사발령",
        "tag": it.get("tag"),
        "posted_at": it.get("posted_at"),
    } for it in new_items]
    for i in range(0, len(payload), 500):
        sb.table("moef_id").insert(payload[i:i+500]).execute()

    # Notifications: "[tag] title" link plus posted date
    for it in new_items:
        url = it["url"]
        title = it.get("title") or "기재부 인사발령"
        tag = it.get("tag")
        posted_at = it.get("posted_at")
        # Escape the title so stray <, >, & cannot break the Telegram HTML payload
        title_block = html.escape(f"[{tag}] {title}" if tag else title)

        lines = ["기재부 인사발령입니다.", ""]
        if posted_at:
            lines.append(f"게시일: {posted_at}")
        lines.append(f'<a href="{url}">[{title_block}]</a>')
        broadcast_html("\n".join(lines))

    return len(new_items)
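
if __name__ == "__main__":
    # Minimal manual entry point (a sketch; in production this module is
    # presumably invoked by a scheduler rather than run directly).
    print(f"moef: {run()} new item(s)")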
