# /var/www/html/bot/app/crawler/na_industry_committee.py
from __future__ import annotations

import argparse
import hashlib
import json
import re
import time
import uuid
from datetime import datetime, timezone
from html import unescape
from typing import Dict, Optional, Tuple, List

import requests
from bs4 import BeautifulSoup

from app.services.supabase_service import get_client, logger

# ------------------------------
# Constants
# ------------------------------
LIST_PAGE = "https://industry.na.go.kr/cmmit/mem/cmmitMemList/list.do"
MEMLIST_API = "https://industry.na.go.kr/cmmit/mem/cmmitMemList/memList.json"

BASE_HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; govbot/1.0; +https://work.jjickjjicks.com)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "ko-KR,ko;q=0.9",
}
AJAX_HEADERS = {
    "User-Agent": BASE_HEADERS["User-Agent"],
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "X-Requested-With": "XMLHttpRequest",
    "Origin": "https://industry.na.go.kr",
    "Referer": f"{LIST_PAGE}?menuNo=2000013",
    "Accept-Language": "ko-KR,ko;q=0.9",
}
DEFAULT_PARAMS = {
    "menuNo": "2000013",
    "cntsDivCd": "CMMIT",
    "committeeCd": "",
    "hrCmtId": "",
    "jobResCd": "",
    "polyCd": "",
    "hgNm": "",
}

# election 모드
ELECTION_NONE = "none"   # 완전 생략
ELECTION_FULL = "full"   # 개별 페이지 들어가서 시도(느림)

# ------------------------------
# Supabase helpers
# ------------------------------
def SB(): 
    return get_client()

def _run_start(target: str) -> str:
    run_id = str(uuid.uuid4())
    try:
        SB().table("crawler_run").insert({
            "id": run_id, "target": target, "status": "running",
            "pages": 0, "rows": 0, "fail_reason": None,
            "started_at": datetime.now(timezone.utc).isoformat()
        }).execute()
    except Exception as e:
        logger.warning(f"[crawler_run insert] {e}")
    return run_id

def _run_heartbeat(run_id: str, *, pages: int | None = None, rows: int | None = None, status: str | None = None):
    payload = {}
    if pages is not None: payload["pages"] = pages
    if rows is not None:  payload["rows"]  = rows
    if status is not None: payload["status"] = status
    if payload:
        try:
            SB().table("crawler_run").update(payload).eq("id", run_id).execute()
        except Exception as e:
            logger.warning(f"[crawler_run update] {e}")

def _run_finish(run_id: str, *, status: str, pages: int, rows: int, fail_reason: str | None = None):
    payload = {
        "status": status,
        "finished_at": datetime.now(timezone.utc).isoformat(),
        "pages": pages, "rows": rows
    }
    if fail_reason is not None:
        payload["fail_reason"] = fail_reason
    try:
        SB().table("crawler_run").update(payload).eq("id", run_id).execute()
    except Exception as e:
        logger.warning(f"[crawler_run finish] {e}")

def _raw_insert(run_id: str, kind: str, *, committee_cd: str | None = None,
                url: str | None = None, html: str | None = None, payload: dict | None = None, json_obj: dict | None = None):
    rec = {"run_id": run_id, "kind": kind, "committee_cd": committee_cd}
    if url is not None:     rec["url"] = url
    if html is not None:    rec["html"] = html
    if payload is not None: rec["payload"] = payload
    if json_obj is not None:rec["json"] = json_obj
    try:
        SB().table("congress_raw").insert(rec).execute()
    except Exception as e:
        logger.warning(f"[congress_raw insert] {e}")

# ------------------------------
# Utils: normalize & hashing
# ------------------------------
def _clean(s: str | None) -> str:
    if not s: return ""
    return " ".join(unescape(s).replace("\u00A0", " ").split())

def _norm_text(v: Optional[str]) -> Optional[str]:
    t = _clean(v)
    return t if t else None

RE_PHONE = re.compile(r"(\d{2,4}-\d{3,4}-\d{4})")
def _norm_phone(v: Optional[str]) -> Optional[str]:
    if not v: return None
    v = re.sub(r"\s+", " ", str(v))
    m = RE_PHONE.search(v)
    return m.group(1) if m else v.strip()

def _key_hash(committee: str, full_name: str) -> str:
    # 위원회 멤버십 단위 키(위원회+이름)
    base = f"{_clean(committee)}|{_clean(full_name)}"
    return hashlib.sha256(base.encode()).hexdigest()

def _row_hash(rec: dict) -> str:
    # 변동 추적용 주요 필드
    parts = [
        rec.get("committee_name") or "",
        rec.get("role_name") or "",
        rec.get("full_name") or "",
        rec.get("name_hanja") or "",
        rec.get("party") or "",
        rec.get("district") or "",
        rec.get("phone") or "",
        rec.get("room_no") or "",
        rec.get("photo_url") or "",
        rec.get("profile_url") or "",
        rec.get("member_page_url") or "",
        rec.get("aide_staff") or "",
        rec.get("aide_secretary") or "",
        rec.get("aide_secretary2") or "",
        str(rec.get("election_count") if rec.get("election_count") is not None else ""),
    ]
    return hashlib.sha256("|".join(parts).encode()).hexdigest()

# ------------------------------
# election_count (옵션)
# ------------------------------
def _extract_election_count_from_html(html: str) -> Optional[int]:
    text = re.sub(r"\s+", " ", BeautifulSoup(html, "html.parser").get_text(" ", strip=True))
    for pat in [r"당선횟수\s*[:：]?\s*(\d+)", r"선수\s*[:：]?\s*(\d+)"]:
        m = re.search(pat, text)
        if m:
            try: return int(m.group(1))
            except: pass
    m2 = re.search(r"(초선|재선|삼선|3선|사선|4선|오선|5선|육선|6선|칠선|7선)", text)
    if m2:
        mapping = {"초선":1,"재선":2,"삼선":3,"3선":3,"사선":4,"4선":4,"오선":5,"5선":5,"육선":6,"6선":6,"칠선":7,"7선":7}
        return mapping.get(m2.group(1))
    return None

def _fetch_election_count(session: requests.Session, url: Optional[str], timeout: float = 5.0) -> Optional[int]:
    if not url: return None
    try:
        r = session.get(url, headers={"User-Agent": BASE_HEADERS["User-Agent"]}, timeout=timeout)
        r.raise_for_status()
        return _extract_election_count_from_html(r.text)
    except Exception:
        return None

# ------------------------------
# CSRF / memList
# ------------------------------
def _fetch_csrf(session: requests.Session) -> Tuple[Optional[str], Optional[str]]:
    try:
        r = session.get(f"{LIST_PAGE}?menuNo=2000013", headers=BASE_HEADERS, timeout=15)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        token_el = soup.select_one('meta[name="_csrf"]')
        header_el = soup.select_one('meta[name="_csrf_header"]')
        token = token_el.get("content").strip() if token_el else None
        header = header_el.get("content").strip() if header_el else "X-CSRF-TOKEN"
        return token, header
    except Exception as e:
        logger.warning(f"CSRF fetch failed: {e}")
        return None, None

def _post_memlist(session: requests.Session, payload: dict) -> dict:
    csrf_token, csrf_header = _fetch_csrf(session)
    headers = dict(AJAX_HEADERS)
    if csrf_token:
        headers[csrf_header] = csrf_token
        payload["_csrf"] = csrf_token

    for attempt in range(2):
        try:
            r = session.post(MEMLIST_API, data=payload, headers=headers, timeout=20)
            if r.status_code == 403 and attempt == 0:
                csrf_token, csrf_header = _fetch_csrf(session)
                if csrf_token:
                    headers[csrf_header] = csrf_token
                    payload["_csrf"] = csrf_token
                time.sleep(0.8)
                continue
            r.raise_for_status()
            return r.json()
        except Exception:
            if attempt == 0:
                time.sleep(0.8)
                continue
            raise

# ------------------------------
# STG / HIST helpers
# ------------------------------
def _stg_upsert_rows(run_id: str, rows: List[dict], chunk: int = 500):
    sb = SB()
    total = 0
    for i in range(0, len(rows), chunk):
        part = rows[i:i+chunk]
        if not part: continue
        sb.table("congress_member_stg").upsert(part).execute()
        total += len(part)
    return total

def _count_cur() -> int:
    try:
        res = SB().table("congress_member_cur").select("key_hash", count="exact").execute()
        return res.count or 0
    except Exception:
        return 0

def _validate(stg_count: int, cur_count: int, *, min_abs: int, min_ratio: float) -> tuple[bool,str]:
    if stg_count < min_abs:
        return False, f"too_few_rows: {stg_count} < {min_abs}"
    if cur_count > 0 and stg_count < int(cur_count * min_ratio):
        return False, f"ratio_drop: {stg_count} < {int(cur_count*min_ratio)} (cur={cur_count})"
    return True, "ok"

def _load_cur_map() -> dict[str, dict]:
    rows = (SB().table("congress_member_hist")
            .select("*").eq("is_current", True).is_("valid_to", "null").limit(50000).execute().data) or []
    return { r["key_hash"]: r for r in rows }

def _load_stg_map(run_id: str) -> dict[str, dict]:
    rows = (SB().table("congress_member_stg").select("*").eq("run_id", run_id).limit(50000).execute().data) or []
    return { r["key_hash"]: r for r in rows }

def _apply_scd2(run_id: str):
    sb = SB()
    cur_map = _load_cur_map()
    stg_map = _load_stg_map(run_id)

    now_iso = datetime.now(timezone.utc).isoformat()

    # 1) STG에 없는 키는 닫기 (퇴출)
    to_close_missing: list[str] = [k for k in cur_map.keys() if k not in stg_map]
    if to_close_missing:
        sb.table("congress_member_hist").update({
            "valid_to": now_iso, "is_current": False
        }).in_("key_hash", to_close_missing).eq("is_current", True).execute()

    # 2) row 변경된 키는 기존 current를 먼저 닫기 (교체)
    changed_keys: list[str] = []
    to_add: list[dict] = []

    for k, v in stg_map.items():
        cur = cur_map.get(k)
        if not cur:
            # 신규
            changed_keys.append(None)  # placeholder, 의미 없음
            base = v
            to_add.append({
                "key_hash": k,
                "committee_name": base.get("committee_name"),
                "role_name": base.get("role_name"),
                "full_name": base.get("full_name"),
                "name_hanja": base.get("name_hanja"),
                "party": base.get("party"),
                "district": base.get("district"),
                "phone": base.get("phone"),
                "room_no": base.get("room_no"),
                "photo_url": base.get("photo_url"),
                "profile_url": base.get("profile_url"),
                "member_page_url": base.get("member_page_url"),
                "aide_staff": base.get("aide_staff"),
                "aide_secretary": base.get("aide_secretary"),
                "aide_secretary2": base.get("aide_secretary2"),
                "election_count": base.get("election_count"),
                "row_hash": base.get("row_hash"),
                "valid_from": now_iso,
                "valid_to": None,
                "is_current": True,
            })
        else:
            if cur.get("row_hash") != v.get("row_hash"):
                changed_keys.append(k)
                base = v
                to_add.append({
                    "key_hash": k,
                    "committee_name": base.get("committee_name"),
                    "role_name": base.get("role_name"),
                    "full_name": base.get("full_name"),
                    "name_hanja": base.get("name_hanja"),
                    "party": base.get("party"),
                    "district": base.get("district"),
                    "phone": base.get("phone"),
                    "room_no": base.get("room_no"),
                    "photo_url": base.get("photo_url"),
                    "profile_url": base.get("profile_url"),
                    "member_page_url": base.get("member_page_url"),
                    "aide_staff": base.get("aide_staff"),
                    "aide_secretary": base.get("aide_secretary"),
                    "aide_secretary2": base.get("aide_secretary2"),
                    "election_count": base.get("election_count"),
                    "row_hash": base.get("row_hash"),
                    "valid_from": now_iso,
                    "valid_to": None,
                    "is_current": True,
                })

    # 기존 current 닫기 (신규 None placeholder 제거)
    changed_keys = [k for k in changed_keys if k]
    if changed_keys:
        sb.table("congress_member_hist").update({
            "valid_to": now_iso, "is_current": False
        }).in_("key_hash", changed_keys).eq("is_current", True).execute()

    if to_add:
        sb.table("congress_member_hist").insert(to_add).execute()

# (선택) 기존 호환성을 위해 베이스 테이블도 갱신
def _upsert_base_member(row: dict):
    # NOTE: 이 업서트는 id(PK) 기준으로만 업데이트한다.
    # on_conflict 컬럼 조합이 스키마와 안 맞으면 42P10이 나므로 PK 기반으로만 사용.
    try:
        SB().table("congress_member").upsert(row, on_conflict="id").execute()
    except Exception as e:
        logger.warning(f"[base upsert] {e}")

# ------------------------------
# Core
# ------------------------------
def run_once(
    committee_cd: Optional[str] = None,
    sleep: float = 0.15,
    election: str = ELECTION_FULL,   # 기본값 full
    min_abs: int = 50,
    min_ratio: float = 0.6,
) -> Dict[str, int | str]:
    """
    파이프라인:
      raw(list/html, memlist/json) → STG(run별) → 검증 → HIST(SCD2) → snapshot
    """
    run_id = _run_start("congress_member")
    pages = 0
    stg_rows = 0

    try:
        s = requests.Session()

        # (1) 목록 HTML raw
        try:
            r_list = s.get(f"{LIST_PAGE}?menuNo=2000013", headers=BASE_HEADERS, timeout=20)
            r_list.raise_for_status()
            _raw_insert(run_id, "list", committee_cd=committee_cd, url=r_list.url, html=r_list.text)
            pages += 1
        except Exception as e:
            _raw_insert(run_id, "list_error", committee_cd=committee_cd, url=f"{LIST_PAGE}?menuNo=2000013", html=str(e))
            logger.warning(f"[raw] list fetch failed: {e}")

        # (2) memList.json raw
        payload = dict(DEFAULT_PARAMS)
        if committee_cd:
            payload["committeeCd"] = committee_cd
            payload["hrCmtId"] = committee_cd

        data = _post_memlist(s, payload)
        _raw_insert(run_id, "memlist", committee_cd=committee_cd, url=MEMLIST_API, payload=payload, json_obj=data)
        pages += 1

        result_list: List[dict] = (data or {}).get("resultList", []) or []
        total = len(result_list)
        logger.info(f"[congress] memList size={total}")
        if total == 0:
            _run_finish(run_id, status="failed", pages=pages, rows=0, fail_reason="empty resultList")
            SB().table("congress_member_snapshot").insert({
                "run_id": run_id, "rows": 0, "status": "failed", "note": "empty resultList"
            }).execute()
            return {"run_id": run_id, "status": "failed", "rows": 0}

        # (3) STG 적재
        staging: List[dict] = []
        for idx, it in enumerate(result_list, 1):
            committee = _norm_text(it.get("committeeName"))
            role      = _norm_text(it.get("jobResNm"))
            name      = _norm_text(it.get("hgNm"))
            name_hj   = _norm_text(it.get("hjNm"))
            party     = _norm_text(it.get("polyNm"))
            district  = _norm_text(it.get("origNm"))
            phone     = _norm_phone(it.get("assemTel"))
            room_no   = _norm_text(it.get("roomNo"))
            photo     = _norm_text(it.get("deptImgUrl"))
            pf_url    = _norm_text(it.get("pfLinkUrl"))
            member_pg = _norm_text(it.get("memberPage"))
            staff     = _norm_text(it.get("staff"))
            sec1      = _norm_text(it.get("secretary"))
            sec2      = _norm_text(it.get("secretary2"))

            if not committee or not name:
                if idx % 10 == 0 or idx == total:
                    _run_heartbeat(run_id, pages=pages, rows=staging.__len__())
                continue

            election_count = None
            if election == ELECTION_FULL:
                # 둘 중 먼저 성공하는 값
                election_count = _fetch_election_count(s, member_pg) or _fetch_election_count(s, pf_url)

            base = {
                "committee_name": committee,
                "role_name": role or "",
                "full_name": name,
                "name_hanja": name_hj,
                "party": party,
                "district": district,
                "phone": phone,
                "room_no": room_no,
                "photo_url": photo,
                "profile_url": pf_url,
                "member_page_url": member_pg,
                "aide_staff": staff,
                "aide_secretary": sec1,
                "aide_secretary2": sec2,
                "election_count": election_count,
            }
            k = _key_hash(committee, name)
            h = _row_hash(base)

            staging.append({
                "run_id": run_id,
                **base,
                "key_hash": k,
                "row_hash": h,
            })

            # 호환: 기존 테이블에도 업서트(필요 없으면 이 줄 지워도 됨)
            _upsert_base_member({
                "id": hashlib.sha1(f"{committee}|{name}|{party or ''}|{district or ''}".encode()).hexdigest(),
                **base, "source_json": it,
            })

            if idx % 10 == 0 or idx == total:
                _run_heartbeat(run_id, pages=pages, rows=len(staging))

            time.sleep(sleep)

        if staging:
            stg_rows = _stg_upsert_rows(run_id, staging)
        else:
            stg_rows = 0

        # snapshot: collected
        SB().table("congress_member_snapshot").insert({
            "run_id": run_id, "rows": stg_rows, "status": "collected",
            "note": f"pages={pages}, memList={total}"
        }).execute()
        _run_heartbeat(run_id, status="collected", pages=pages, rows=stg_rows)

        # (4) 검증
        cur_count = _count_cur()
        ok, reason = _validate(stg_rows, cur_count, min_abs=min_abs, min_ratio=min_ratio)
        if not ok:
            SB().table("congress_member_snapshot").update({
                "status": "failed", "note": reason
            }).eq("run_id", run_id).execute()
            _run_finish(run_id, status="failed", pages=pages, rows=stg_rows, fail_reason=reason)
            logger.error(f"[congress] validation failed: {reason}")
            return {"run_id": run_id, "status": "failed", "rows": stg_rows, "reason": reason}

        # (5) SCD2 적용 → HIST
        _apply_scd2(run_id)
        SB().table("congress_member_snapshot").update({
            "status": "passed"
        }).eq("run_id", run_id).execute()
        _run_finish(run_id, status="passed", pages=pages, rows=stg_rows)
        logger.info(f"[congress] run passed: run_id={run_id}, rows={stg_rows}, pages={pages}")
        return {"run_id": run_id, "status": "passed", "rows": stg_rows, "pages": pages}

    except Exception as e:
        logger.exception(f"[congress] aborted: {e}")
        # 실패 기록
        try:
            SB().table("congress_member_snapshot").insert({
                "run_id": run_id, "rows": stg_rows, "status": "aborted", "note": str(e)
            }).execute()
        except Exception:
            pass
        _run_finish(run_id, status="aborted", pages=pages, rows=stg_rows, fail_reason=str(e))
        return {"run_id": run_id, "status": "aborted", "error": str(e)}

# ------------------------------
# CLI
# ------------------------------
def _parse_args() -> argparse.Namespace:
    p = argparse.ArgumentParser(description="NA Industry Committee Members pipeline")
    p.add_argument("--committee", help="특정 위원회만 수집 (예: 9700187)", default=None)
    p.add_argument("--sleep", type=float, default=0.15)
    p.add_argument("--election", choices=[ELECTION_NONE, ELECTION_FULL], default=ELECTION_FULL)  # 기본값 full
    p.add_argument("--min_abs", type=int, default=50)
    p.add_argument("--min_ratio", type=float, default=0.6)
    return p.parse_args()

if __name__ == "__main__":
    args = _parse_args()
    print(run_once(
        committee_cd=args.committee,
        sleep=args.sleep,
        election=args.election,
        min_abs=args.min_abs,
        min_ratio=args.min_ratio,
    ))
