o
    ^Ph$                     @  s  d dl mZ d dlZd dlZd dlZd dlZd dlmZ d dlZd dl	m
Z
 d dlmZ dZeddd	Zd<d=ddZd>ddZddd dD Zd?ddZd@d d!Zh d"Zd#d$hZh d%Zh d&Zd'hZh d(Zd)ZdAd-d.ZdBd0d1Zd<dCd3d4ZdDd6d7ZdEd:d;Z dS )F    )annotationsN)List)extract_text_to_fp)LAParamsz%GovBot/1.0 (+https://example.invalid)z*/*zhttps://alio.go.kr/)z
User-AgentAcceptReferer      4@urlstrtimeoutfloatreturnrequests.Responsec                 C  s   t j| t|d}|  |S )N)headersr   )requestsgetHEADERSraise_for_status)r	   r   r r   5/var/www/html/govbot/app/crawlers/kepco_org_parser.py	_download   s   r   	pdf_bytesbytesc                 C  s8   t  }tddddddd}tt | ||d | S )	Ng{Gz?g?g       @g      ?FT)line_marginword_marginchar_margin
boxes_flowdetect_vertical	all_texts)laparams)ioStringIOr   r   BytesIOgetvalue)r   buf_outr    r   r   r   _pdf_to_text   s   r&    c                 c  s    | ]}t |V  qd S N)chr.0cr   r   r   	<genexpr>       r-   )i   i   i   i  sc                 C  sv   | sdS |  dd dd} | dd tD } |  dd	 d
d	 dd dd} td| } tdd| } |  S )Nr'   
     c                 S  s   i | ]}t |d qS r(   )ordr*   r   r   r   
<dictcomp>$   s    z_normalize.<locals>.<dictcomp>u   –-u   —u   （(u   ）)NFKC[ \t]+)replace	translate_ZWunicodedata	normalizeresubstrip)r/   r   r   r   
_normalize    s   $rC   text	List[str]c                 C  s*   t | } dd | dD }dd |D S )Nc                 S  s   g | ]}t |qS r   )rC   r+   xr   r   r   
<listcomp>-       z_prep_lines.<locals>.<listcomp>r1   c                 S  s   g | ]}|r|qS r   r   )r+   lnr   r   r   rH   .   rI   )rC   split)rD   linesr   r   r   _prep_lines*   s   rM   >   	   시작일	   종료일   성명   성별   임기   직위   직책   (시작일)   (종료일)   선임절차   임명권자   주요경력   당연직여부   선임절차규정u   상임기관장u	   기관장>   	   변경전	   변경후
   변경 전
   변경 후   변경사유>   rW   rX   rZ   r[   rY   >   rP   rQ   rR   rS   rT   rU   rV   )u   사장u   원장u	   이사장u   대표이사u	   부사장u	   본부장tok
str | Noneboolc                   s&    sdS     t fddtD S )NFc                 3  s    | ]}  |V  qd S r(   )endswith)r+   r/   ra   r   r   r-   A   s    z$_looks_like_title.<locals>.<genexpr>)rB   anyTITLE_HINT_SUFFIXre   r   re   r   _looks_like_title=   s   rh   tc                 C  s(   | t v p| tv p| tv p| tv p| tv S r(   )
ALL_LABELS
LABEL_FLOWLABEL_BLOCKROW_START_SYNONYMSCHANGE_TOKENS)ri   r   r   r   	_is_labelC   s   ro   endpoint_urlc                 C  s   t | |d}z| }W n ty   i }Y nw dD ]}||}t|tr1| dr1|  S qt	d|j
}|r@|dS dS )u   
    https://alio.go.kr/download/pdf.json?... 응답에서 PDF URL을 추출.
    키가 들쭉날쭉하므로 여러 후보 + 백업 정규식으로 탐색.
    )r   )fileUrlpdfUrlr	   	file_pathfileurlpdfz.pdfzhttps?://[^\s\"']+?\.pdfr   r'   )r   json	Exceptionr   
isinstancer
   lowerrd   r@   searchrD   group)rp   r   r   jkvmr   r   r   _fetch_pdf_url_from_alio_jsonN   s   
r   raw_textc                   s0  t |  t }d fdd}g }d}d}||k r | }|tv r||p(|}|}g }	d}
||k r | }|d	v ru|d
7 }||k rM | dkrM|d
7 }||k rtt | stt | sc | tv rh|d
7 }qM|	 |  |d
7 }	 q/|dv r|d
7 }||k r | dkr|d
7 }||k rt | st | s | tv r|d
7 }q|	 |  |d
7 }	 q/|dkr|d
7 }g }||k rt | s| |  |d
7 }||k rt | rd| }
nt|r|dvrn|d
7 }||k s3|r|	rd|	dd }|ddd |||
fD  t	||d
 }q|t
v rN|| |d
7 }||k rMt | sM| |  |d
7 }||k rMt | r6q|tv r|d
 }|g}||k r}t | s}| |  |d
7 }||k r}t | rf|d| |}q|tv s|tv r|}g }||k r | }|t
v s|tv s|tv rnB|tv s|tv r|| |d
7 }||k rt | s|dkr׈ | }| |  |d
7 }||k rt | rΐq	 |r|d| |}q|| |d
7 }||k sdd |D }d|S )uD  
    규칙:
      - 라벨 체인(직위/성명/직책/성별/임기/(시작일)/(종료일))은 가능한 한 줄로 이어 붙임
        예) '직위 상임기관장 성명 김동철' / '임기 (시작일) 2023... (종료일) 2026...'
      - '주요경력'은 헤더 1줄 + 항목들 각 1줄 유지
      - '선임절차/선임절차규정/당연직여부/임명권자'는 '라벨 + 값'을 한 줄로
      - '변경 전/변경 후/변경사유' 블록은 직전 '직위' 값을 추정해 요약 한 줄:
        예) '상임이사 서근배 공석 의원면직(5.19)'
    idxintr   r
   c                   sT   t | d td| d dD ]} | }|rt|rq|ds#|tv r'|  S qdS )N      u   이사r'   )rangemaxro   rd   rm   )r   r}   ri   Lr   r   guess_position_beforeq   s   z9_linearize_text_for_export.<locals>.guess_position_beforer   r'   >   r\   r^   r   rP   >   r]   r_   r`   r3   >   rP   N   c                 s  s    | ]}|r|V  qd S r(   r   rF   r   r   r   r-      r.   z-_linearize_text_for_export.<locals>.<genexpr>rS   c                 S  s*   g | ]}|r|  rtd d|  qS )r:   r3   )rB   r@   rA   rF   r   r   r   rH      s   * z._linearize_text_for_export.<locals>.<listcomp>r1   )r   r   r   r
   )rM   lenrn   ro   rh   rm   appendjoinrB   r   rl   rk   LABEL_INLINE)r   nr   outilast_pos_valueri   posr|   namesreasontt
reason_bufbodybufr   r   r   _linearize_text_for_exportd   s   



* 





g
r   alio_pdf_json_urldictc                 C  s6   t | }|s
tdt|j}t|}t|}d|iS )u   
    1) alio pdf.json → PDF URL 추출
    2) PDF 다운로드 → 텍스트 추출
    3) 기대 레이아उ트로 선형화
    4) {"text": "..."} 반환
    u2   PDF URL을 alio json에서 찾지 못했습니다.rD   )r   RuntimeErrorr   contentr&   r   )r   pdf_urlru   rD   linearr   r   r   endpoint_to_text_json   s   
r   )r   )r	   r
   r   r   r   r   )r   r   r   r
   )r/   r
   r   r
   )rD   r
   r   rE   )ra   rb   r   rc   )ri   r
   r   rc   )rp   r
   r   r   r   r
   )r   r
   r   r
   )r   r
   r   r   )!
__future__r   r!   r@   rv   r>   typingr   r   pdfminer.high_levelr   pdfminer.layoutr   UAr   r   r&   r   r=   rC   rM   rj   rm   rn   rk   rl   r   rg   rh   ro   r   r   r   r   r   r   r   <module>   s4    






 