# /var/www/html/bot/app/crawler/moef_n8n.py
from html import escape
import re
import time
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple

import requests
from bs4 import BeautifulSoup

from app.services.supabase_service import get_client, logger
from app.services.telegram_sender import broadcast_html

supabase = get_client()

LIST_URL = "https://www.moef.go.kr/nw/notice/hr.do?menuNo=4050300"
DETAIL_URL = "https://www.moef.go.kr/nw/notice/hrDetail.do"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; govbot/1.0; +https://work.jjickjjicks.com)"
}

def _get_with_retry(url: str, *, params=None, headers=None, timeout=20, attempts: int = 4):
    """GET *url* with exponential backoff.

    Retries up to *attempts* times on network/HTTP failures, doubling the
    delay between tries (0.5s, 1s, 2s, ...). HTTP 4xx/5xx responses are
    treated as failures via ``raise_for_status``.

    Returns:
        The successful ``requests.Response``.

    Raises:
        requests.RequestException: the last error, when every attempt fails.
    """
    delay = 0.5
    for attempt in range(1, attempts + 1):
        try:
            r = requests.get(url, params=params, headers=headers, timeout=timeout)
            r.raise_for_status()
            return r
        # Narrowed from bare Exception: only network/HTTP errors are worth
        # retrying; programming errors should surface immediately.
        except requests.RequestException as e:
            if attempt == attempts:
                raise
            logger.warning("GET retry (%s): %s", attempt, e)
            time.sleep(delay)
            delay *= 2
    raise RuntimeError("unreachable")  # loop always returns or raises

# ------------------------------
# crawler_run helpers
# ------------------------------
def _run_start(target: str) -> str:
    """Open a new 'crawler_run' bookkeeping row and return its UUID.

    The insert is best-effort: a failure is logged but never blocks the
    crawl itself.
    """
    import uuid

    run_id = str(uuid.uuid4())
    record = {
        "id": run_id,
        "target": target,
        "status": "running",
        "pages": 0,
        "rows": 0,
        "fail_reason": None,
        "started_at": datetime.now(timezone.utc).isoformat(),
    }
    try:
        supabase.table("crawler_run").insert(record).execute()
    except Exception as e:
        logger.warning(f"[crawler_run insert] {e}")
    return run_id

def _run_finish(run_id: str, *, status: str, pages: int, rows: int, fail_reason: str | None = None):
    """Close the 'crawler_run' row *run_id* with final status and counters.

    ``fail_reason`` is only written when provided. Best-effort: update
    failures are logged, not raised.
    """
    update = {
        "status": status,
        "finished_at": datetime.now(timezone.utc).isoformat(),
        "pages": pages,
        "rows": rows,
    }
    if fail_reason is not None:
        update["fail_reason"] = fail_reason
    try:
        supabase.table("crawler_run").update(update).eq("id", run_id).execute()
    except Exception as e:
        logger.warning(f"[crawler_run finish] {e}")

# ------------------------------
# Helpers
# ------------------------------
def _extract_js_params(href_js: str) -> Optional[Tuple[str, str, str]]:
    """
    javascript:fn_egov_select('4050300','MOSFBBS_...','POST_ID') -> (menuNo, bbsId, postId)
    """
    m = re.search(r"fn_egov_select\('(\d+)','([A-Za-z0-9_]+)','([A-Za-z0-9_]+)'\)", href_js or "")
    if not m:
        return None
    return m.group(1), m.group(2), m.group(3)

def _split_tag_from_title(title_text: str) -> Tuple[Optional[str], str]:
    """
    '[태그] 제목' -> ('태그','제목'), 아니면 (None, 원제)
    """
    if not title_text:
        return None, ""
    m = re.match(r"^\[(.+?)\]\s*(.+)$", title_text.strip())
    if m:
        return m.group(1).strip(), m.group(2).strip()
    return None, title_text.strip()

def _extract_tag_from_state(container: Optional[BeautifulSoup]) -> Optional[str]:
    """Extract a tag label from a list item's 'state*' element.

    Looks for a <span>/<em>/<strong> whose class contains 'state'
    (e.g. <span class="state1">인사발령</span>) and returns its text with
    surrounding brackets/parentheses stripped. Returns None when the
    container is missing/empty or no such element exists.
    """
    if not container:
        return None
    node = container.select_one('span[class*="state"], em[class*="state"], strong[class*="state"]')
    if node is None:
        return None
    label = (node.get_text(" ", strip=True) or "").strip()
    label = label.strip("[]()")
    return label or None

def _parse_date_from_detail_html(html: str) -> Optional[str]:
    """
    상세 페이지에서 '등록일' 또는 날짜 패턴(YYYY.MM.DD / YYYY-MM-DD) -> 'YYYY-MM-DD'
    """
    m = re.search(r"등록일[^0-9]*(\d{4}[.\-](\d{1,2})[.\-](\d{1,2}))", html)
    if not m:
        m = re.search(r"(\d{4}[.\-](\d{1,2})[.\-](\d{1,2}))", html)
    if not m:
        return None
    raw = m.group(1).replace(".", "-")
    try:
        dt = datetime.strptime(raw, "%Y-%m-%d")
        return dt.strftime("%Y-%m-%d")
    except Exception:
        parts = raw.split("-")
        parts = [p.zfill(2) if i > 0 else p for i, p in enumerate(parts)]
        try:
            dt = datetime.strptime("-".join(parts), "%Y-%m-%d")
            return dt.strftime("%Y-%m-%d")
        except Exception:
            return None

def _detail_posted_at(menuNo: str, bbsId: str, postId: str) -> Optional[str]:
    """Fetch a post's detail page and return its posting date ('YYYY-MM-DD' or None)."""
    response = _get_with_retry(
        DETAIL_URL,
        params={
            "searchBbsId1": bbsId,
            "searchNttId1": postId,
            "menuNo": menuNo,
        },
        headers=HEADERS,
        timeout=20,
    )
    return _parse_date_from_detail_html(response.text)

def _exists_in_supabase(row_id: str) -> bool:
    """Return True when the moef_id table already holds a row with this id."""
    query = supabase.table("moef_id").select("id").eq("id", row_id).limit(1)
    return bool(query.execute().data)

def _safe_insert(row: Dict) -> bool:
    """Insert *row* into moef_id, degrading gracefully.

    Tries progressively smaller column sets (dropping tag/posted_at) so
    the insert can succeed regardless of which optional fields the row
    carries or the table accepts. Returns True on the first success,
    False when every attempt fails.
    """
    field_sets = (
        ["id", "bbsId", "postId", "title", "tag", "posted_at"],
        ["id", "bbsId", "postId", "title", "posted_at"],
        ["id", "bbsId", "postId", "title", "tag"],
        ["id", "bbsId", "postId", "title"],
    )
    for fields in field_sets:
        payload = {key: row[key] for key in fields if key in row}
        try:
            supabase.table("moef_id").insert(payload).execute()
        except Exception as e:
            logger.warning(f"insert retry with fields {fields} failed: {e}")
        else:
            return True
    return False

# ------------------------------
# Main
# ------------------------------
def run_once() -> Dict[str, int | str]:
    """Run one crawl pass over the MOEF HR notice list.

    Fetches the list page, and for each post not yet stored in the
    moef_id table: extracts title/tag, fetches the posting date from the
    detail page, inserts a row, and broadcasts a Telegram notification.
    Progress is tracked in a crawler_run row (status 'passed'/'aborted').

    Returns:
        {"inserted", "sent", "run_id", "status"} on success, or
        {"error", "run_id", "status"} when the pass aborts.
    """
    target = "moef_id"
    run_id = _run_start(target)
    pages = 0
    inserted = 0
    sent = 0

    try:
        r = _get_with_retry(LIST_URL, headers=HEADERS, timeout=20)
        pages += 1
        soup = BeautifulSoup(r.text, "html.parser")

        # Each post link on the list page is a javascript:fn_egov_select(...) anchor.
        anchors = soup.select('a[href*="javascript:fn_egov_select"]')

        for a in anchors:
            href_js = a.get("href", "")
            params = _extract_js_params(href_js)
            if not params:
                continue
            menuNo, bbsId, postId = params
            # Dedup key stored in moef_id.id.
            row_id = f"{bbsId}-{postId}"

            if _exists_in_supabase(row_id):
                continue

            # Title: prefer the anchor inside the nearest preceding <h3>,
            # falling back to this anchor's own text.
            # NOTE(review): find_previous may reach into the previous list
            # item's <h3> if the current item has none — confirm against
            # the live page markup.
            title_text = ""
            h3 = a.find_previous("h3")
            if h3:
                link = h3.find("a")
                if link:
                    title_text = link.get_text(" ", strip=True)
            if not title_text:
                title_text = a.get_text(" ", strip=True) or "기재부 인사발령"

            # Container: nearest enclosing li/tr/article/div box around the anchor.
            container = (
                a.find_parent("li")
                or a.find_parent("tr")
                or a.find_parent("article")
                or a.find_parent("div")
                or a.parent
            )

            # 1) Prefer the tag from a 'state*' element in the list DOM.
            tag = _extract_tag_from_state(container)

            # 2) Otherwise parse a '[tag] title' prefix from the title text.
            tag2, clean_title = _split_tag_from_title(title_text)
            if not tag:
                tag = tag2
            else:
                # The state element supplied the tag, so drop the bracketed
                # prefix from the title.
                title_text = clean_title

            # Posting date comes from the detail page; failure is non-fatal
            # and simply leaves posted_at unset.
            try:
                posted_at = _detail_posted_at(menuNo, bbsId, postId)
            except Exception as e:
                logger.warning(f"fetch posted_at failed for {row_id}: {e}")
                posted_at = None

            # clean_title is always the stripped title (bracket prefix
            # removed when present), so it takes precedence here.
            row = {
                "id": row_id,
                "bbsId": bbsId,
                "postId": postId,
                "title": clean_title or title_text,
            }
            if tag:
                row["tag"] = tag
            if posted_at:
                row["posted_at"] = posted_at

            ok = _safe_insert(row)
            if not ok:
                logger.warning(f"insert failed for {row_id}")
                continue

            inserted += 1

            # Telegram notification (best-effort; a send failure does not
            # roll back the insert).
            link = f"{DETAIL_URL}?searchBbsId1={bbsId}&searchNttId1={postId}&menuNo={menuNo}"
            head = f"[({escape(tag)}) {escape(row.get('title',''))}]" if tag else f"[{escape(row.get('title',''))}]"
            message = (
                "기재부 인사발령입니다.\n"
                f"{head}\n"
                f'<a href="{link}">[자세히 보기]</a>'
            )
            try:
                sent_res = broadcast_html(message)
                # broadcast_html's return type is not visible here; treat an
                # int as a recipient count, anything truthy as one send.
                sent += int(sent_res) if isinstance(sent_res, int) else (1 if sent_res else 0)
            except Exception as e:
                logger.warning(f"telegram send failed for {row_id}: {e}")

            # Be polite to the detail endpoint between posts.
            time.sleep(0.5)

        _run_finish(run_id, status="passed", pages=pages, rows=inserted)
        logger.info(f"MOEF run: inserted={inserted}, sent={sent}")
        return {"inserted": inserted, "sent": sent, "run_id": run_id, "status": "passed"}
    except Exception as e:
        # Any unexpected error aborts the whole pass; record the reason on
        # the crawler_run row for later inspection.
        _run_finish(run_id, status="aborted", pages=pages, rows=inserted, fail_reason=str(e))
        logger.exception(f"MOEF run aborted: {e}")
        return {"error": str(e), "run_id": run_id, "status": "aborted"}

# Manual entry point: run a single crawl pass and print its summary dict.
if __name__ == "__main__":
    print(run_once())
