
    hN                    8   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	m
Z
mZmZmZ d dlZd dlmZ d dlmZ  ej        dd                                          Z ej        ed	            ej        d
          ZdZdZg dZdZdZdZddiZi edddddZ dZ! ej"        e!d           dOdZ#dPd!Z$dQd"Z%dRd%Z&dSdTd(Z'dUd*Z(dVd-Z)dWdXd5Z*dWdYd8Z+e!d.fdZd=Z,d> Z-d[d@Z.dWd\dBZ/dWd]dDZ0d^d_dGZ1d`dHZ2dadbdIZ3dcdddLZ4dM Z5e6dNk    r e5             dS dS )e    )annotationsN)ListDictAnyOptionalTuple)	PdfReader
get_client	LOG_LEVELINFOz[%(levelname)s] %(message)s)levelformatalio_kepco_sync_testkepco_id_testkepco_org_test)C0247C0042C0043C0066C0082C0220C0259C0248C0306C0305C0236z/https://alio.go.kr/item/itemReportListSusi.jsonz(https://alio.go.kr/item/itemOrganList.doz$https://alio.go.kr/download/pdf.jsonz
User-Agentz*GovBot/1.0 (+https://work.jjickjjicks.com)zhttps://alio.go.krzhttps://alio.go.kr/XMLHttpRequestzapplication/json;charset=UTF-8)OriginRefererzX-Requested-WithContent-Typez/var/www/html/bot/tmp/alioTexist_oksOptional[str]returnstrc                x   | sdS |                      dd                               dd                               dd          }d                    |                                          }|                     dd                               dd                               d	d          }|                                S )
N      	z 
 
z 
z
 )replacejoinsplitstrip)r$   outs     1/var/www/html/bot/scripts/alio_kepco_sync_test.pynormalize_blockr5   .   s     r
))D#


&
&x
5
5
=
=dC
H
HC
((399;;

C
++fd
#
#
+
+E4
8
8
@
@
M
MC99;;    boolc                p    | sdS d                     |                                           }|dv pd|v pd|v S )NTr)   )-u   –u   —u   미정u	   무기한)r0   r1   )r$   vs     r4   is_blank_dater;   6   sG     t
		A##LAL;!;KLr6   c                    dd l }|                    | |          }|r'|                    d                                          nd S )Nr      )researchgroupr2   )re_patr$   r>   ms       r4   pickrC   <   sD    III
		&!A!",1771::,r6   pagesList[Dict[str, Any]]c                ^    d | D             }t          d                    |                    S )Nc                j    g | ]0}||                     d           |                     d           pd1S )textr)   )get).0ps     r4   
<listcomp>z$merge_page_texts.<locals>.<listcomp>B   s;    GGGQ1GvGQUU6]] bGGGr6   r.   )r5   r0   )rD   textss     r4   merge_page_textsrN   A   s/    GG%GGGE499U++,,,r6   merged_textdepartment_hintc                h   dd l }|}|s?|                    d|           }|r'|                    d                                          }|                     d          }|dk    r
| |d          n| }d |                    d|          D             }g }|D ]}	t          |	          }
t          d|
          }t          d|
          }t          d	|
          }t          d
|
          }|                    d|
          }|r'|                    d                                          nd }|r'|                    d                                          nd }t          |          rd n(d	                    |pd                                          }t          |          rd n(d	                    |pd                                          }t          d|
          }d |pd                    d          D             }|od|v }|od|v pd|v }|s|ry|s}|
                    |pd|||||||d           |S )Nr   u-   임원\s*현황\s*\n([^\n]+)\n임원\s*현황r=   u   직위 c                <    g | ]}|                     d           |S )u   직위)
startswith)rJ   secs     r4   rL   z,extract_people_from_text.<locals>.<listcomp>O   s*    ]]]CNNS[D\D\]]]]r6   u   \n(?=직위\s)u   직위\s*([^\n]+?)\s*성명u4   성명\s*([^\n]+?)(?=\s*(?:직책|성별|임기|\n))u-   직책\s*([^\n]+?)(?=\s*(?:성별|임기|\n))u   성별\s*([남여])uG   임기\s*\(시작일\)\s*([^\n(]+?)\s*\(종료일\)\s*([^\n]+?)(?:\n|$)   r+   r)   u   주요경력\s*([\s\S]*?)(?=\n\s*(?:선임절차|선임절차규정|당연직여부|직위\s|기준일|제출일|기관 공시 담당자|$))c                ^    g | ]*}|                                 |                                 +S  )r2   )rJ   lines     r4   rL   z,extract_people_from_text.<locals>.<listcomp>c   s-    \\\4tzz||\$**,,\\\r6   r.   u
   변경 전u
   변경 후)
departmentnamepositiongenderstartendtaskcareer)r>   r?   r@   r2   findr1   r5   rC   r;   r0   append)rO   rP   r>   deptrB   	start_idxbodysectionsr3   rawrT   r[   rZ   r_   r\   dm	start_rawend_rawr]   r^   career_blockr`   junk_position	junk_names                           r4   extract_people_from_textrn   E   s   IIID &IIFTT 	&771::##%%D  ++I&/1nn;yzz""+D]]rxx(94@@]]]H "C # #c""6<<KSQQDcJJ,c22YYacfgg+-7BHHQKK%%'''4	)+5"((1++##%%%%i00Ychh	R?V?V?X?X6Y6Y#G,,Sdd#((GMr;P;P;R;R2S2S \
 
 ]\L,>B+E+Ed+K+K\\\ ?lh&>O|t3N9M	 	I 	 	

*" 	
 	
 		 		 		 		 Jr6   Tuple[int, int, int]c                    	 | pd                     d          \  }}}t          |          t          |          t          |          fS # t          $ r Y dS w xY w)Nr)   .)r   r   r   )r1   int	Exception)r$   yrB   ds       r4   idate_to_tuplerv   y   sc    7//#&&1aAAA''   yys   AA	 	
AAstr_astr_bc                B    t          |           t          |          k    S )za > b ?)rv   )rw   rx   s     r4   is_newerrz      s    %  >%#8#888r6   Fsessionrequests.Sessionapba_idpage_norr   .Tuple[Optional[Dict[str, Any]], Optional[str]]c                D   t          |          |dd}	 |                     t          |d|ddt                     n# t          $ r Y nw xY w|                     t          t          |d          }|r"t          	                    d|||j
                   |                                 |                                }|                    d	          d
k    s|                    d          sdS |d         }|                    d          pi                     d          }||fS )N20305)pageNoapbaIdreportFormRootNoiQO  )r   r   r      )paramstimeoutheaders)r   jsonr   z[%s] LIST page=%s status=%sstatussuccessdata)NN	organInfoapbaNa)r'   rI   PREWARM_URLBASE_HEADERSrs   postLIST_URLXHR_HEADERS_JSONloggerdebugstatus_coderaise_for_statusr   )	r{   r}   r~   r   payloadrjr   organs	            r4   	alio_listr      s2   WgVVG%5GTT 	 	
 	
 	
 	

     	X'7grRRA U2GWamTTT	AuuX)##155==#zV9DXXk""(b--h77E;s   '= 
A
	A
disclosure_noOptional[bytes]c                    |                      t          d|it          d          }|r:t                              d||j        |j                             d                     |j        dk    rd S |j        S )NdisclosureNo<   )r   r   r   z%[PDF] disclosureNo=%s status=%s ct=%sr!      )rI   PDF_JSON_URLr   r   r   r   r   content)r{   r   r   r   s       r4   download_pdfr      ss    L.-)HR^hjkkA {<mQ]\]\e\i\ijx\y\yzzz}t9r6   	pdf_bytesbytesout_dirDict[str, Any]c                   t          j        |d           t           j                            || d          }t           j                            || d          }t	          |d          5 }|                    |            d d d            n# 1 swxY w Y   g }	 t          |          }t          |j        d          D ]G\  }	}
	 |
	                                pd}n# t          $ r d}Y nw xY w|                    |	|d	           Hn3# t          $ r&}t                              d
||           Y d }~nd }~ww xY wt          |          }|t          |          ||d}t	          |dd          5 }t!          j        ||dd           d d d            n# 1 swxY w Y   |rt                              d||           |S )NTr"   z.pdf.jsonwbr=   )r]   r)   )pagerH   z![WARN] PDF read failed for %s: %s)r   
page_countrD   mergedwutf-8encodingFrU   )ensure_asciiindentz[OUT] saved pdf=%s json=%s)osmakedirspathr0   openwriter	   	enumeraterD   extract_textrs   rb   r   warningrN   lenr   dumpr   )r   r   r   r   pdf_path	json_pathfrD   readerir   rH   er   r   s                  r4   save_pdf_and_extract_jsonr      s   K$''''w||G%;%;%;<<HW&=&=&=>>I	h		 		               E	N8$$ Q777 	4 	4GAt((**0b   LL!T223333	4  N N N:M1MMMMMMMMN e$$F,CJJQVbhiiG	iw	/	/	/ <1	'15;;;;< < < < < < < < < < < < < < <  H18YGGGNs`   -BBB*D CD C.+D -C..D 
D;D66D;3FFFc                     t                      S Nr
   rW   r6   r4   sb_getr      s    <<r6   id_valuec                `   	 |                      t                                        d                              d|                              d                                          }t          |j                  S # t          $ r'}t          
                    d||           Y d }~dS d }~ww xY w)Nidr=   z[WARN] exists_id(%s) error: %sF)tableTBL_IDselecteqlimitexecuter7   r   rs   r   r   )sbr   resr   s       r4   	exists_idr      s    hhv%%d++..tX>>DDQGGOOQQCH~~   71EEEuuuuus   A9A< <
B-B((B-rowc                   |r6t                               dt          |                    d                     dS 	 |                     t                                        |                                           dS # t          $ r:}t                               d|                    d          |           Y d }~dS d }~ww xY w)Nz[DRY] insert %s -> %sr   Tz![WARN] insert_id failed id=%s: %sF)	r   infor   rI   r   insertr   rs   r   )r   r   dryr   s       r4   	insert_idr      s    
 +VSWWT]]CCCt
$$,,...t   :CGGDMM1MMMuuuuus   ?A; ;
B?/B::B?rY   c                   |r#t                               dt          |           dS 	 |                     t                                                                        d|                                          }t          |d          r.t          |j	        t                    rt          |j	                  S dS # t          $ r'}t                               d||           Y d }~dS d }~ww xY w)Nz#[DRY] delete %s where department=%sr   rY   r   z-[WARN] delete_org_by_department(%s) error: %s)r   r   TBL_ORGr   deleter   r   hasattr
isinstancer   listr   rs   r   )r   rY   r   r   r   s        r4   delete_org_by_departmentr      s    
 97JOOOqhhw&&((++L*EEMMOO3 	!Jsx$>$> 	!sx== q   F
TUVVVqqqqqs   BB9 9
C*C%%C*  rowsc           
        |sdS d}|r=t                               dt          t          |                     t          |          S t	          dt          |          |          D ]}||||z            }	 |                     t                                        |                                           |t          |          z  }b# t          $ r:}t           	                    d||t          |          z   dz
  |           Y d }~d }~ww xY w|S )Nr   z[DRY] insert %s rows=%dz'[WARN] bulk_insert_org failed %d~%d: %sr=   )
r   r   r   r   ranger   r   r   rs   r   )r   r   chunkr   totalr   partr   s           r4   bulk_insert_orgr      s    qE
 -wD		BBB4yy1c$ii'' [ [AagI	[HHW$$T**22444SYYEE 	[ 	[ 	[NNDa3t99UVXYZZZZZZZZ	[Ls   4AC
D
0DD
c                   	 |                      t                                        d                              d|                              dd                              d                                          }|j        pg }|r6|d                             d          rt          |d         d                   S n3# t          $ r&}t                              d||           Y d}~nd}~ww xY wdS )	uV   
    kepco_id에서 해당 기관의 가장 최신 posted_at('YYYY.MM.DD') 반환
    	posted_atrY   T)descr=   r   z)[WARN] get_latest_posted_at(%s) error: %sN)r   r   r   r   orderr   r   r   rI   r'   rs   r   r   )r   rY   r   r   r   s        r4   get_latest_posted_atr      s    SHHVvk""r,
++u[tu,,uQxxwyy 	 x~2 	-DGKK,, 	-tAw{+,,, S S SBJPQRRRRRRRRS4s   B;B? ?
C/	C**C/c           	     
     t                      dddd t                      }d}t           d          \  }}|r,|sG|                    d          pi                     d          st                              d	            S |p+|                    d          pi                     d          pd
}| d<   |                    d          pg }	r)t                              dt          |	                     t          |          }
r"t                              dt          |
           d}  fd}|	D ]}|                    d          }|r||v r|
                    |            |||           t          |                    d          |
          rx|r6t          |                    d          |                    d                    r@|                    d          |                    d          |                    d          |d}t          dt          d|dz                       D ]}t           |          \  }}|s rt                              d|            n|                    d          pg }r*t                              d|t          |                     |s nd}|D ]X}|                    d          }|r||v r|
                    |            d         } |||            d         |k    r|dz  }Yt                              d||           |dk    r nt          j        d           |r|d         }|d         }t"          j                            t(          | d          }t"          j                            |          sb	 t-           |          }|rt/          ||t(                     n4# t0          $ r'}t                              d||           Y d}~nd}~ww xY wd}t"          j                            |          rwt3          |dd          5 }t5          j        |          }ddd           n# 1 swxY w Y   |                    d           p#t9          |                    d!          pg           }|rt;          ||"          }t=          |#          }r#t                              d$t>          ||           tA          |#          }|dk     d%<   t                              d&|||d         |
           n;t                              d'|           nrt                              d(|
            S ))u   
    페이지 1..pages까지 순회:
      - kepco_id_test: 누락분 INSERT + PDF/JSON 덤프(자료 축적)
      - kepco_org_test: '페이지1'에서 current_latest보다 최신이 있을 때만 재작성
    r   FN)r   new_idsorg_updatedrY   r=   r   r   r   z%[%s] list fail or organ name missing.r)   rY   resultz[%s] page1 items=%dz'[%s] current_latest posted_at in %s: %sc                .   |                      d          }|sd S |                      d          }|                      d          }| d| }t           d| }t          |          s||||||d}t          |          rdxx         d	z  cc<   	 t	          |
          }|rt          ||t                     d S t                              d
|           d S # t          $ r(}	t                              d
||	           Y d }	~	d S d }	~	ww xY wd S )Nr   titleidater9   ?disclosureNo=r   rY   r   r   r   pdf_urlr   r   r=   r   r   r   '[%s] pdf.json download failed (disc=%s)'[%s] pdf save/json failed (disc=%s): %s)
rI   r   r   r   r   r   TMP_DIRr   r   rs   )itemr   discr   r   rec_idr   r   r   r   r}   r   r   r   r{   summarys             r4   upsert_to_id_and_dumpz3process_update_pages.<locals>.upsert_to_id_and_dump)  s   xx'' 	F!!!!""5""!7777V$$ 	\#!%"" C Sc*** (	"""a'"""\($eDDD	 ]-iwV[\\\\\\NN#LgW[\\\\\ \ \ \H'SWYZ[[[[[[[[[\#	\ 	\s   ,C  C   
D*DDr   r   r   )r   r   r   rY   rU   z[%s] page=%d -> no data; stopz[%s] page=%d items=%dr   [%s] page %d -> new ids: %d      ?r   r   z)[%s] late json build failed (disc=%s): %sr   r   r   r   rD   )rP   r   z![%s] delete %s dept=%s -> %d rowsr   z9[%s] ORG updated: dept=%s inserted=%d (posted_at=%s > %s)z$[%s] merged text missing for disc=%sz3[%s] No newer-than-%s item on page1. ORG untouched.)!r   setr   rI   r   r   r   r   r   r   addrz   r   maxr   timesleepr   r   r0   r   existsr   r   rs   r   r   loadrN   rn   r   r   r   )!r{   r}   rD   r   r   	seen_disc
organ_namedata1organ1result1current_latestnewer_for_orgr  itr  r   dataNorganNresultNpage_newbefore_new_idsrc   r   r   r   r   r   pjpeopledeletedinsertedr   r  s!   `` ``                          @@r4   process_update_pagesr    sw    
B QuTXYYG%%I $J gw???ME6  EIIk$:$:$@b#E#Eh#O#O >HHHMEIIk228b==hGGM2J&GLii!!'RG C*GS\\BBB)"j99N a>Q_```.2M\ \ \ \ \ \ \ \ \ \<   vvn%% 	ty((db*--- BFF7OO^44 	! hrvvg@Q@QRY@Z@Z&[&[ $&FF>$:$:VVG__VVG__",	! ! aQ	**++  !'7DFFFv 	 M<gtLLLE))H%%+ 	OLL0'4WNNN 	E 	 	B66.))D 49,,MM$$Y/N!!"j111y!N22A17D(KKKq==E
4  i^,\*GLLT...99	w~~i(( 	^^($eDDD	 ]-iwV[\\\\ ^ ^ ^JGUY[\]]]]]]]]^ 7>>)$$ 	Qiw777 "1Yq\\" " " " " " " " " " " " " " "VVH%%P)9"&&//:OR)P)PF 		R-fdKKKF.r4SAAAG c@'7TXZabbb&r6s;;;H%-\GM"KKSU\^bdln{  }D  oE  GU  V  V  V  VNNA7DQQQQ 	iLLNPWYghhhNs*   5,O" "
P,PP	Q**Q.1Q.2   	max_pagesc           	        t                      }d}d}t                      }t          d|dz             D ]}	t          | ||	|          \  }
}|
s n|r|s|}|
                    d          pg }|s nnd}|D ]-}|                    d          }|                    d          }|                    d          }|sE||v rJ|                    |           |p|pd	}| d
| }t           d| }t          ||          s||||||d}t          |||          r
|dz  }|dz  }	 t          | ||          }|rt          ||t          |           nt                              d||           # t          $ r(}t                              d|||           Y d}~%d}~ww xY w/t                              d||	|           |dk    r nt!          j        d           |||dS )u   
    전 페이지 순회: kepco_id_test 미보유 항목 INSERT + PDF/JSON 덤프만 수행
    (ORG는 건드리지 않음)
    r   Nr=   r   r   r   r   r   r)   r9   r   r   r   r   r   r   r  r  )r   r   rY   )r   r  r   r   rI   r  r   r   r   r   r   r   r   r   rs   r   r
  r  )r{   r}   r!  r   r   r   new_cntr  r  r   r   r   r   r  r   r  r   r   rc   r  r   r   r   r   s                           r4   process_full_bootstrapr$    s   
 
BG $JIaQ'' 3 3$eDDDe 	E 	 	J(##)r 	E "	` "	`D88N++DHHW%%EHHW%%E y  MM$,J,"D%%t%%F%;;T;;GR(( ` "&%)"!&&  R#... "qLGMH` ,Wd% H H HI  a1)T7Z_`````'PRY[_```  ` ` `NN#LgW[]^________`'`, 	17D(KKKq==E
4'LLLs   &A	E00
F":FF"c            	        t          j        d          } |                     ddd           |                     ddd           |                     d	dd
           |                     dt          dd           |                     dt          dd           |                                 }|j        r$t                              t          j	                   t          j                    }|j                            t                     dddg d}|j        rt                              dt"                     t$          D ]}t                              d|           	 t'          |||j        |j        |j                  }n8# t,          $ r+}t                              d||           |dd}Y d }~nd }~ww xY w|dxx         dz  cc<   |dxx         |                    dd          z  cc<   |d                             |           ʐn%t                              d|j                   t$          D ]}t                              d|           	 t7          ||t9          d|j                  |j        |j                  }n9# t,          $ r,}t                              d ||           |dd!d"}Y d }~nd }~ww xY w|dxx         dz  cc<   |dxx         |                    dd          z  cc<   |                    d#          r|d#xx         dz  cc<   |d                             |           t                              d$|           t;          |           d S )%NzALIO KEPCO sync (TEST tables).)descriptionz--debug
store_truez
debug logs)actionhelpz	--dry-runzno DB writesz--fullz2(once) crawl ALL pages per company (no ORG writes)z--max-pagesr   zmax pages for --full)typedefaultr)  z--pagesr=   z3(update mode) fetch N pages per company (default=1)r   )	processedinserted_idsr   	companieszK== FULL bootstrap mode: accumulate ALL pages into %s and dump PDFs/JSONs ==z[%s] ----- start (full) -----)r!  r   r   z[%s] full bootstrap error: %s)r   r   r,  r-  r   r.  zd== UPDATE mode: insert missing IDs (pages= %d) + update ORG only if page1 has strictly newer item ==z[%s] ----- start -----)rD   r   r   z[%s] update error: %sF)r   r   r   r   zdone: %s)argparseArgumentParseradd_argumentrr   
parse_argsr   r   setLevelloggingDEBUGrequestsSessionr   updater   fullr   r   APBA_IDSr$  r!  dry_runrs   r   rI   rb   rD   r  r	  print)parserargsr$   totalsapbar   r   s          r4   mainrA    s   $1QRRRF
	,\JJJ
L~NNN
<pqqq
CBXYYY
	Q=rsssDz '&&&AI\"""aPRSSFy -acijjj 		- 		-DKK7>>>6-aW[Wagkgsttt 6 6 6>aHHH"&1556 ;1$>"""dhhy!&<&<<""";&&t,,,,		- 	z  }A  }G  	H  	H  	H 	- 	-DKK0$777L+At3q$*;M;MUYU_eieqrrr L L L6a@@@"&1UKKL ;1$>"""dhhy!&<&<<"""xx&& +}%%%*%%%;&&t,,,,
KK
F###	&MMMMMs0   #F
F7!F22F71J
J9"J44J9__main__)r$   r%   r&   r'   )r$   r%   r&   r7   )r$   r'   r&   r%   )rD   rE   r&   r'   r   )rO   r'   rP   r%   r&   rE   )r$   r%   r&   ro   )rw   r%   rx   r%   r&   r7   )F)r{   r|   r}   r'   r~   rr   r&   r   )r{   r|   r   r'   r&   r   )r   r   r   r'   r   r'   r&   r   )r   r'   r&   r7   )r   r   r&   r7   )rY   r'   r&   rr   )r   F)r   rE   r&   rr   )rY   r'   r&   r%   )FF)r{   r|   r}   r'   rD   rr   r&   r   )r   FF)r{   r|   r}   r'   r!  rr   r&   r   )7
__future__r   r   sysr   r
  r/  r4  typingr   r   r   r   r   r6  pypdfr	   app.services.supabase_servicer   getenvupperr   basicConfig	getLoggerr   r   r   r:  r   r   r   r   r   r   r   r5   r;   rC   rN   rn   rv   rz   r   r   r   r   r   r   r   r   r   r  r$  rA  __name__rW   r6   r4   <module>rM     sT   " " " " " " 				 



     3 3 3 3 3 3 3 3 3 3 3 3 3 3        4 4 4 4 4 4BIk6**0022	  ),I J J J J		1	2	2 

  
 =85 >"$(4    ' Gd # # # #   M M M M- - - -
- - - -1 1 1 1 1h   9 9 9 9
    .     T[bg     >     	 	 	 	 	            *J J J J JZ?M ?M ?M ?M ?MD- - -^ zDFFFFF r6   