import os
import re
import time
import logging
from html import escape
from typing import Dict, List, Optional, Tuple

import requests
from bs4 import BeautifulSoup

from app.services.supabase_service import get_client
from app.services.telegram_sender import broadcast_html


LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper()
logging.basicConfig(level=LOG_LEVEL, format="[%(levelname)s] %(message)s")
logger = logging.getLogger("me_n8n")

BASE = "https://me.go.kr"

# List: Ministry of Environment > Notices/PR > Press releases > Personnel announcements (boardMasterId=22, menuId=10527)
# The example we were given is a read.do URL for a single post; in general the latest posts are fetched via list.do.
LIST_URL = (
    f"{BASE}/home/web/board/list.do?menuId=10527&boardMasterId=22"
)

DETAIL_PATH = "/home/web/board/read.do"

HEADERS = {"User-Agent": "GovBot/1.0 (+https://work.jjickjjicks.com)"}


# ------------------------------
# crawler_run helpers
# ------------------------------
def SB():
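    """Return the Supabase client from app.services.supabase_service."""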
    return get_client()


def _run_start(target: str) -> str:
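    """Insert a 'running' crawler_run row and return its run id (UUID string)."""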
    import uuid
    from datetime import datetime, timezone

    run_id = str(uuid.uuid4())
    try:
        SB().table("crawler_run").insert({
            "id": run_id,
            "target": target,
            "status": "running",
            "pages": 0,
            "rows": 0,
            "fail_reason": None,
            "started_at": datetime.now(timezone.utc).isoformat(),
        }).execute()
    except Exception as e:
        logger.warning(f"[crawler_run insert] {e}")
    return run_id


def _run_finish(
    run_id: str,
    *,
    status: str,
    pages: int,
    rows: int,
    fail_reason: Optional[str] = None,
):
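    """Mark the crawler_run row as finished with its final status, page/row counts, and optional failure reason."""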
    from datetime import datetime, timezone

    payload = {
        "status": status,
        "finished_at": datetime.now(timezone.utc).isoformat(),
        "pages": pages,
        "rows": rows,
    }
    if fail_reason is not None:
        payload["fail_reason"] = fail_reason
    try:
        SB().table("crawler_run").update(payload).eq("id", run_id).execute()
    except Exception as e:
        logger.warning(f"[crawler_run finish] {e}")


# ------------------------------
# parsing helpers
# ------------------------------
RE_BOARD_ID = re.compile(r"[?&]boardId=(\d+)")
RE_DATE = re.compile(r"(\d{4})[.\-/](\d{1,2})[.\-/](\d{1,2})")


def _clean(s: Optional[str]) -> str:
    if not s:
        return ""
    return " ".join(s.split())


def _abs_url(href: str) -> str:
    href = (href or "").strip()
    if not href:
        return href
    if href.startswith("http://") or href.startswith("https://"):
        return href
    if href.startswith("/"):
        return BASE + href
    # Relative path: resolve it against the site base URL
    return f"{BASE}/{href.lstrip('./')}"


def _extract_board_id(href: str) -> Optional[str]:
    m = RE_BOARD_ID.search(href or "")
    return m.group(1) if m else None


def _find_posted_text(container: BeautifulSoup) -> Optional[str]:
    # Guess the date text from the surrounding table row (tr), list item (li), etc.
    if not container:
        return None

    # 1) Try common class names / tags first
    cand = (
        container.select_one(".date")
        or container.select_one(".reg_date")
        or container.select_one("td:nth-last-of-type(1)")
        or container.select_one("span")
    )
    if cand:
        t = cand.get_text(" ", strip=True)
        if t:
            return t

    # 2) Fall back to searching the whole row text for a date pattern
    try:
        txt = container.get_text(" ", strip=True)
        m = RE_DATE.search(txt)
        if m:
            return m.group(0)
    except Exception:
        pass

    return None


def _normalize_date(s: str) -> Optional[str]:
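    """Return the first date found in the string as YYYY-MM-DD, or None."""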
    if not s:
        return None
    m = RE_DATE.search(s)
    if not m:
        return None
    y, mo, d = m.group(1), m.group(2).zfill(2), m.group(3).zfill(2)
    return f"{y}-{mo}-{d}"


def parse_list_html(html: str) -> List[Dict]:
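    """Parse the board list HTML into dicts with id, title, absolute url, and posted_at (if found)."""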
    soup = BeautifulSoup(html, "html.parser")

    # Collect every read.do link on the list page
    anchors = soup.select('a[href*="/home/web/board/read.do"], a[href*="boardId="]')
    items: List[Dict] = []
    seen: set[str] = set()

    for a in anchors:
        href = a.get("href", "")
        bid = _extract_board_id(href)
        if not bid:
            continue
        if bid in seen:
            continue
        seen.add(bid)

        title = _clean(a.get_text(" ", strip=True))
        if not title:
            # Fall back to a nearby heading (h3, etc.) for the title
            h3 = a.find_previous("h3")
            if h3:
                title = _clean(h3.get_text(" ", strip=True))

        container = a.find_parent("tr") or a.find_parent("li") or a.find_parent("article") or a.find_parent("div") or a.parent
        posted_raw = _find_posted_text(container)
        posted = _normalize_date(posted_raw or "")

        items.append({
            "id": bid,
            "title": title or "환경부 인사발령",
            "url": _abs_url(href),
            "posted_at": posted,
        })

    return items


def _list_url_for(page: int, param: str = "pageIndex") -> str:
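    """Build the list URL for the given page, appending the pagination parameter for pages beyond the first."""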
    if page <= 1:
        return LIST_URL
    sep = "&" if ("?" in LIST_URL) else "?"
    return f"{LIST_URL}{sep}{param}={page}"

def _get_with_retry(url: str, *, headers=None, timeout=25) -> requests.Response:
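    """GET with retries: up to 4 attempts with exponential backoff, re-raising the last error."""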
    delay = 0.5
    for i in range(4):
        try:
            r = requests.get(url, headers=headers, timeout=timeout)
            r.raise_for_status()
            return r
        except Exception as e:
            if i == 3:
                raise
            logger.warning("GET retry (%s): %s", i+1, e)
            time.sleep(delay)
            delay *= 2
    raise RuntimeError("unreachable")


def fetch_and_extract(max_pages: int = 1) -> List[Dict]:
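    """Fetch up to max_pages list pages, dedupe items by board id, and fill in missing dates from detail pages."""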
    max_pages = max(1, int(max_pages or 1))
    seen: set[str] = set()
    out: List[Dict] = []

    for p in range(1, max_pages + 1):
        url = _list_url_for(p, "pageIndex")
        r = _get_with_retry(url, headers=HEADERS, timeout=25)
        page_items = parse_list_html(r.text)

        # Fallback: if no date was found on the list page, try to extract it from the detail page
        def _posted_from_detail(url: str) -> Optional[str]:
            try:
                dr = _get_with_retry(url, headers=HEADERS, timeout=20)
                soup = BeautifulSoup(dr.text, "html.parser")
                # 1) Common selectors
                for sel in (".date", ".reg_date", "time"):
                    el = soup.select_one(sel)
                    if el:
                        s = _normalize_date(el.get_text(" ", strip=True))
                        if s:
                            return s
                # 2) Label-based: look for 등록일/작성일 (registration/creation date) headers
                for dt in soup.find_all(["dt", "th"]):
                    t = (dt.get_text(" ", strip=True) or "").strip()
                    if any(k in t for k in ("등록일", "작성일", "등록일자")):
                        sib = dt.find_next_sibling(["dd", "td"]) or dt.parent.find_next_sibling("tr")
                        if sib:
                            s = _normalize_date(sib.get_text(" ", strip=True))
                            if s:
                                return s
                # 3) Last resort: any date pattern in the whole document text
                s = _normalize_date(soup.get_text(" ", strip=True))
                return s
            except Exception:
                return None

        new_added = 0
        for it in page_items:
            if not it.get("posted_at") and it.get("url"):
                pdate = _posted_from_detail(it["url"])
                if pdate:
                    it["posted_at"] = pdate
            iid = str(it.get("id") or "")
            if not iid or iid in seen:
                continue
            seen.add(iid)
            out.append(it)
            new_added += 1

        if p > 1 and new_added == 0:
            break
    return out


def upsert_and_notify(items: List[Dict]) -> Tuple[int, int]:
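    """Insert rows not yet present in me_id and broadcast a Telegram message for each new one; returns (inserted, sent)."""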
    sb = SB()
    inserted, sent = 0, 0

    for it in items:
        row_id = it.get("id")
        if not row_id:
            continue
        # Skip rows that already exist
        try:
            exist = sb.table("me_id").select("id").eq("id", row_id).limit(1).execute()
            if exist.data:
                continue
        except Exception as e:
            logger.warning(f"exist check failed for {row_id}: {e}")
            continue

        # Insert, retrying with progressively fewer field sets for schema flexibility
        row = {
            "id": row_id,
            "title": (it.get("title") or "").strip(),
        }
        if it.get("posted_at"):
            row["posted_at"] = it.get("posted_at")
        if it.get("url"):
            row["url"] = it.get("url")

        ok = False
        for fields in (
            ["id", "title", "posted_at", "url"],
            ["id", "title", "url"],
            ["id", "title", "posted_at"],
            ["id", "title"],
        ):
            data = {k: row[k] for k in fields if k in row}
            try:
                sb.table("me_id").insert(data).execute()
                ok = True
                break
            except Exception as e:
                logger.warning(f"insert retry {fields} failed for {row_id}: {e}")
        if not ok:
            continue

        inserted += 1

        # Send a Telegram notification for the new post (message body stays in Korean for subscribers)
        title = escape(row.get("title", "환경부 인사발령"))
        url = escape(row.get("url", f"{BASE}{DETAIL_PATH}?boardId={row_id}&menuId=10527&boardMasterId=22"))
        msg = (
            "환경부 인사발령입니다.\n"
            f"[{title}]\n"
            f'<a href="{url}">[자세히 보기]</a>'
        )
        try:
            res = broadcast_html(msg)
            sent += int(res) if isinstance(res, int) else (1 if res else 0)
        except Exception as e:
            logger.warning(f"notify failed for {row_id}: {e}")

    return inserted, sent


# ------------------------------
# main
# ------------------------------
def run_once():
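    """Run one crawl cycle: record a crawler_run, fetch and parse the list, insert new rows, notify, close the run, and return a summary dict."""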
    target = "me_id"
    run_id = _run_start(target)
    pages = 0
    rows = 0

    try:
        # Optional backfill page count, controlled by the ME_ID_BACKFILL_PAGES environment variable
        backfill_pages = 0
        try:
            backfill_pages = int(os.getenv("ME_ID_BACKFILL_PAGES", "0") or 0)
        except Exception:
            backfill_pages = 0
        max_pages = backfill_pages if backfill_pages > 0 else 1

        items = fetch_and_extract(max_pages=max_pages)
        pages += max_pages
        if not items:
            _run_finish(run_id, status="passed", pages=pages, rows=0)
            logger.info("ME n8n-style run: no items")
            return {"source": "ME_ID", "inserted": 0, "sent": 0, "run_id": run_id, "status": "passed"}

        ins, sent = upsert_and_notify(items)
        rows = ins
        _run_finish(run_id, status="passed", pages=pages, rows=rows)
        logger.info("ME n8n-style run: inserted=%s, sent=%s", ins, sent)
        return {"source": "ME_ID", "inserted": ins, "sent": sent, "run_id": run_id, "status": "passed"}
    except Exception as e:
        _run_finish(run_id, status="aborted", pages=pages, rows=rows, fail_reason=str(e))
        logger.exception("ME n8n-style run aborted: %s", e)
        return {"source": "ME_ID", "error": str(e), "run_id": run_id, "status": "aborted"}


if __name__ == "__main__":
    print(run_once())
