o
    vh@                  	   @   s   d dl Z d dlZd dlmZmZmZmZ d dlmZ d dl	Z	d dl
mZ dZddededee d	e	jfd
dZded	efddZded	efddZded	ee fddZdeded	ee fddZded	ee fddZdS )    N)DictIteratorListOptional)urljoin)BeautifulSoupzhttps://www.motie.go.krurluaparamsreturnc                 C   s,   |pddd}t j| ||dd}|  |S )Nz
govbot/1.0z?text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8)z
User-AgentAccept   )headersr
   timeout)requestsgetraise_for_status)r   r	   r
   r   r r   ./var/www/html/govbot/app/crawlers/motie_org.py_req   s   r   sc                 C   s   d | pd S )N  )joinsplit)r   r   r   r   _clean   s   r   c                 C   s`   t  d}t|| }t|jd}|d}|sdS |dd}td|}|r.t|	dS dS )u   
    /kor/25/empSearch 첫 페이지에서 a.direction.last 의 onclick 속성
    'empSearch.list(숫자)'에서 숫자를 추출해 마지막 페이지로 사용.
    /kor/25/empSearchhtml.parserza.direction.last   onclickr   zempSearch\.list\((\d+)\))
BASEr   r   text
select_oner   researchintgroup)r	   r   ressoupa_lastr    mr   r   r   get_last_page   s   


r,   htmlc              	   C   s  t | d}g }|d}|D ]}|d}d } } } }	}
d}t|dkrt|d jdd	d
}t|d jdd	d
}t|d jdd	d
}|d}|r^|dr^|d d	 }
nd
dd |dd D }td|}|ry|dnd}
t|dkrt|d jdd	d
nd}|st|d jdd	d
}||
d	 }	d	}|st|jdd	d
}|sq| }t|dk rq|d }|d }|d }|d }td|}|r|dn|}
d
|dd |
d	 }	|||||	|
d q|S )u   
    목록 테이블의 각 tr을 파싱하여 name/position/department/task/phone을 추출.
    1) 우선 td 셀 기반 파싱
    2) 실패 시 n8n에서 하던 '행 텍스트를 공백 분리' 백업 파싱
    r   ztable tbody trtdr   F   r   r   Tstripr      za[href^="tel:"]hrefztel:c                 s   s"    | ]}t |jd ddV  qdS )r   Tr0   Nr   get_text).0r.   r   r   r   	<genexpr>B   s     z/iter_emp_rows_from_page_html.<locals>.<genexpr>Nz\b\d{2,4}-\d{3,4}-\d{4}\b      )nameposition
departmenttaskphone)r   selectfind_alllenr   r5   r#   has_attrremoveprefixr1   r   r$   r%   r'   replacer   append)r-   r)   outrowstrtdsr<   r=   r>   r?   r@   parsedtel_atail_txtr+   body_txtrow_textcolumns
last_tokenr   r   r   iter_emp_rows_from_page_html'   sX   



&
rS   pagec                 C   s&   t  d}t|| d|id}t|jS )uR   
    /kor/25/empSearch?pageIndex={page} 한 페이지를 가져와 행 파싱
    r   	pageIndex)r
   )r!   r   rS   r"   )r	   rT   r   r(   r   r   r   fetch_emp_pagei   s   

rV   c                 C   s~   t  d}t|| }t|jd}dd |dD }dd |dd D }t }g }|D ]}||vr<|| || q,|S )	u   
    /kor/26/headquarters 페이지에서
    a[onclick*="headquarters.jsSearchOrgan"] 의 텍스트(부서명)를 모두 수집.
    n8n 로직처럼 첫 번째 항목은 스킵(헤더 가능성)하고 나머지 사용.
    z/kor/26/headquartersr   c                 S   s   g | ]}t |jd ddqS )r   Tr0   r4   )r6   ar   r   r   
<listcomp>z   s    z%fetch_departments.<locals>.<listcomp>z(a[onclick*="headquarters.jsSearchOrgan"]c                 S   s   g | ]}|r|qS r   r   )r6   xr   r   r   rX   }   s    r   N)r!   r   r   r"   rA   setrG   add)r	   r   r(   r)   arrseenuniqrY   r   r   r   fetch_departmentsq   s   

r_   )N)r$   timetypingr   r   r   r   urllib.parser   r   bs4r   r!   strdictResponser   r   r&   r,   rS   rV   r_   r   r   r   r   <module>   s    "	B