
    r6hp:                       d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
mZmZmZmZ d dlZd dlmZ ej                            ej                            ej                            e          d                    Zeej        vrej                            e           d dlmZ  ej        dd                                          Z ej        ed	
            ej        d          Z dZ!dZ"dZ#dZ$ ej%        e$d           dIdZ&dJdZ'dKdZ(dLdZ)dMdZ*dNd!Z+dOd#Z,dPd%Z-dQd(Z.dRdSd,Z/dTd/Z0dUdVd6Z1dWd<Z2d= Z3dXdAZ4dB Z5dYdEZ6dZdFZ7dG Z8e9dHk    r e8             dS dS )[    )annotationsN)AnyDictListOptionalTuple)	PdfReaderz..
get_client	LOG_LEVELINFOz[%(levelname)s] %(message)s)levelformatkepco_org_collect_testkepco_id_testkepco_org_all_testz$https://alio.go.kr/download/pdf.jsonz/var/www/html/bot/tmp/alioT)exist_oksOptional[str]returnstrc                    | sdS |                      dd          } t          j        dd|           } t          j        dd|           } |                                 S )N z[ \t]+ z
[ ]*\n[ ]*
)replaceresubstrip)r   s    3/var/www/html/bot/scripts/kepco_org_collect_test.py	normalizer"      sS    RR			$A
y#q!!A
}dA&&A7799    boolc                    | sdS t          j        dd|                                           }|dv pt          j        d|          d uS )NT\s+r   >      –   —-u   (미정|무기한))r   r   r    search)r   vs     r!   is_blank_dater,   "   sO    TT
vr1##%%A##Vry1F'J'JRV'VVr#   patc                    t          j        | |t           j                  }|r'|                    d                                          nd S )N)flags   )r   r*   	MULTILINEgroupr    )r-   r   ms      r!   pickr4   '   s@    
	#q---A!",1771::,r#   merged_textc                    t          j        d|           }|r'|                    d                                          nd S )Nu-   임원\s*현황\s*\n([^\n]+)\n임원\s*현황r0   )r   r*   r2   r    )r5   r3   s     r!   extract_department_from_headerr7   +   s:    
	BKPPA!",1771::,r#   texts	List[str]c                    d                     d | D                       }t          j        dd|          }|                                S )Nr   c                0    g | ]}|t          |          S  )r"   ).0ts     r!   
<listcomp>z$merge_page_texts.<locals>.<listcomp>0   s#    999q9	!999r#   z\n{2,})joinr   r   r    )r8   joineds     r!   merge_page_textsrB   /   sC    YY99e999::FVItV,,F<<>>r#   krc                "   | sdS t          j        d|           }|sdS t          |                    d                    t          |                    d                    t          |                    d                    }}}|dd|dd|dS )	u#   'YYYY년 M월 D일' -> 'YYYY-MM-DD'Nu1   (\d{4})\s*년\s*(\d{1,2})\s*월\s*(\d{1,2})\s*일r0         04dr)   02dr   r*   intr2   )rC   r3   ymods        r!   to_iso_from_krrN   4   s    dd
	FKKATT1771::AGGAJJQWWQZZ1rA&&&b&&&q&&&&r#   dotc                "   | sdS t          j        d|           }|sdS t          |                    d                    t          |                    d                    t          |                    d                    }}}|dd|dd|dS )	z'YYYY.MM.DD' -> 'YYYY-MM-DD'Nz(\d{4})\.(\d{1,2})\.(\d{1,2})r0   rE   rF   rG   r)   rH   rI   )rO   r3   rK   rL   rM   s        r!   to_iso_from_dotsrQ   <   s    tt
	2C88ATT1771::AGGAJJQWWQZZ1rA&&&b&&&q&&&&r#   bodyc                D    g t          j        d           D ])}                    |                                           *sg S                     t	                                 fdt          t	                    dz
            D             }d |D             S )Nu   (?m)^\s*직위\bc                f    g | ]-}|         |d z                                                      .S )r0   r    )r=   irR   idxss     r!   r?   z/_split_sections_by_position.<locals>.<listcomp>L   s;    LLLAT!WT!A#Y&'--//LLLr#   r0   c                <    g | ]}|                     d           |S )   직위)
startswith)r=   r   s     r!   r?   z/_split_sections_by_position.<locals>.<listcomp>M   s)    :::!1<<#9#9:A:::r#   )r   finditerappendstartlenrange)rR   r3   sectionsrW   s   `  @r!   _split_sections_by_positionra   E   s    D[,d33  AGGII 	KKD		LLLLLs4yy{9K9KLLLH::x::::r#   sec#Tuple[Optional[str], Optional[str]]c                "   t          j        d|           }|r|                    d                                          |                    d                                          }}t	          |          rd nt          j        dd|          }t	          |          rd nt          j        dd|          }||fS t          j        d|           }|r|                    d                                          |                    d                                          }}t	          |          rd nt          j        dd|          }t	          |          rd nt          j        dd|          }||fS t          j        d|           }|r|                    d                                          |                    d                                          }}t	          |          rd nt          j        dd|          }t	          |          rd nt          j        dd|          }||fS t          j        d|           }|rR|                    d                                          }t	          |          rd nt          j        dd|          }|d fS d	S )
NuG   임기\s*\(시작일\)\s*([^\n(]+?)\s*\(종료일\)\s*([^\n]+?)(?:\n|$)r0   rE   r&   r   uW   임기\s*시작(?:일)?\s*[:：]?\s*([^\n]+?)[\s,/]*종료(?:일)?\s*[:：]?\s*([^\n]+)u>   임기\s*[:：]?\s*([^\n~\-–—]+?)\s*[~\-–—]\s*([^\n]+)u1   임기\s*(?:시작(?:일)?)?\s*[:：]?\s*([^\n]+)NN)r   r*   r2   r    r,   r   )rb   r3   s_rawe_raws_vale_vals         r!   _parse_term_blockrj   O   sP   
	\^abbA wwqzz''))1771::+;+;+=+=u%e,,L"&e2L2L%e,,L"&e2L2Le|
	lnqrrA wwqzz''))1771::+;+;+=+=u%e,,L"&e2L2L%e,,L"&e2L2Le|
	SUXYYA wwqzz''))1771::+;+;+=+=u%e,,L"&e2L2L%e,,L"&e2L2Le|
	FLLA 

  ""%e,,L"&e2L2Ld{:r#   department_hint
debug_discList[Dict[str, Any]]c                   |pt          |           }|                     d          }|dk    r
| |d          n| }t          |          }|s|rt                              d|           g S g }|D ]}t          |          }	|	                    d          sd|	v rt          j        d|	          sAt          d|	          }
t          d|	          }t          d	|	          }t          d
|	          }dd} ||
          dv s ||          dv rt          |	          \  }}t          d|	          }g }|rd t          j        d|          D             }|s|                    |||
|||||rd                    |          nd d           |r7t                              d|t          |          t          |                     |S )NrY   r   u'   [parse][%s] no '직위' sections found.u   직위 변경 전u   변경사유u   (임기|직책|성별)\suA   직위\s*[:：]?\s*([^\n]+?)\s*(?:성명|직책|성별|임기|\n)u>   성명\s*[:：]?\s*([^\n]+?)(?=\s*(?:직책|성별|임기|\n))u7   직책\s*[:：]?\s*([^\n]+?)(?=\s*(?:성별|임기|\n))u   성별\s*[:：]?\s*([남여])xr   r   r   c                2    | pd                     dd          S )Nr   r   )r   )ro   s    r!   tokzparse_people.<locals>.tok   s    !'r1B1B31K1K*Kr#   >   	   변경전rY   >      성명	   변경후u   주요경력\s*([\s\S]*?)(?=\n\s*(?:선임\s*절차|선임절차|선임\s*절차\s*규정|선임절차규정|당연직여부|직위\s|기준일|제출일|기관\s*공시\s*담당자|$))c                ^    g | ]*}|                                 |                                 +S r<   rU   )r=   lns     r!   r?   z parse_people.<locals>.<listcomp>   s-    XXXRRXXZZXbhhjjXXXr#   z\n+r   )
departmentnamepositiongenderr]   endtaskcareerz$[parse][%s] sections=%d -> people=%d)ro   r   r   r   )r7   findra   loggerdebugr"   rZ   r   r*   r4   rj   splitr\   r@   r^   )r5   rk   rl   rw   	start_idxrR   r`   outrawrb   ry   rx   r|   rz   rq   r]   r{   career_blockr}   s                      r!   parse_peopler   m   s;    O$B;$O$OJ  **I&/1nn;yzz""+D*400H  	PLLBJOOO	 "C & &nn >>-.. 	>S3H3HQSQZ[vx{Q|Q|3H\^abbY[^__RTWXX8#>>KKKK3x==222cc$iiCY6Y6Y&s++
s I
 
  	YXX28FL+I+IXXXF 	

$ +1;dii'''t	
 	
 		 		 		 		  b;ZXX[\_X`X`aaaJr#   posted_at_dotDict[Tuple[str, str], str]c                   d}t          j        d|           }|r"t          |                    d                    }|st	          |          }t          j        d|           }|si S t          |                    d                    }i }|                    d          D ]}|                                }|s|                                }	t          |	          dk     rA|	d         }
|	d	         }|	d
         }d	                    |	dd
                                                   }t          j
        d|          r||f}|r|||<   |S )uF  
    '직위 변경 전 성명 변경 후 성명 변경사유' 블록에서
    (position, prev_name) -> actual_end(YYYY-MM-DD) 매핑을 만든다.
    '변경 후'가 공석/결원/-/빈값이면 prev_name의 실제 종료일로 간주.
    날짜는 '기준일 YYYY년 M월 D일'을 우선, 없으면 posted_at 사용.
    NuB   기준일\s*([0-9]{4}\s*년\s*[0-9]{1,2}\s*월\s*[0-9]{1,2}\s*일)r0   u   (직위\s*변경\s*전\s*성명\s*변경\s*후\s*성명\s*변경사유)([\s\S]*?)(?=\n\s*직위\s+(?:상임|비상임|상임기관장|상임감사|상임이사|비상임이사)|\Z)rE   r   rF   r   u   (공석|결원|-))r   r*   rN   r2   rQ   r"   r   r    r^   r@   	fullmatch)r5   r   baser3   block_mblockactual_end_maplinerv   toksreason	next_name	prev_namery   keys                  r!   parse_changesr      sc    D
	WYdeeA *aggajj)) /..i 	@ G  	gmmA&&''E13ND!! + +ZZ\\ 	xxzzt99q==bH	H	HHT#2#Y''--// <,i88 	+Y'C +&*s#r#   Fsessionrequests.Sessiondiscr   Dict[str, Any]c                   t           j                            t          | d          }t           j                            |          r>t          |dd          5 }t          j        |          cd d d            S # 1 swxY w Y   |                     t          d|id          }|
                                 |j        }t           j                            t          | d          }t          |d	          5 }|                    |           d d d            n# 1 swxY w Y   g }	 t          |          }	t          |	j        d
          D ]D\  }
}	 |                                pd}n# t"          $ r d}Y nw xY w|                    |           En3# t"          $ r&}t&                              d||           Y d }~nd }~ww xY wt+          |          }|t-          |          d t          |          D             |d}t          |dd          5 }t          j        ||dd           d d d            n# 1 swxY w Y   |r7t&                              d|t-          |          t-          |                     |S )Nz.jsonrutf-8)encodingdisclosureNo<   )paramstimeoutz.pdfwbr0   )r]   r   z#[WARN] PdfReader failed disc=%s: %sc                $    g | ]\  }}|d z   |dS )r0   )pagetextr<   )r=   rV   r>   s      r!   r?   z(ensure_json_for_disc.<locals>.<listcomp>   s(    GGGsq1Q3**GGGr#   )r   
page_countpagesmergedwFrE   )ensure_asciiindentz%[disc=%s] page_count=%d merged_len=%d)ospathr@   TMP_DIRexistsopenjsonloadgetPDF_JSON_URLraise_for_statuscontentwriter	   	enumerater   extract_text	Exceptionr\   r   warningrB   r^   dumpr   )r   r   r   	json_pathfr   	pdf_bytespdf_pathr8   readerrV   r   r>   er   payloads                   r!   ensure_json_for_discr      sL   Wnnn55I	w~~i    )S7333 	 q9Q<<	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	L.$)?LLA	Iw||G]]]33H	h		 		               E	G8$$ Q777 	 	GAt%%''-2   LLOOOO	  G G G<dAFFFFFFFFG e$$F%jjGGi6F6FGGG	 G 
iw	/	/	/ <1	'15;;;;< < < < < < < < < < < < < < <  ]<dCJJPSTZP[P[\\\Nsr   A;;A?A?8DD!D(*F E*)F *E96F 8E99F 
GF>>GH77H;>H;deptrx   roler]   r{   c                    |  d| d| d|pd d|pd d|pd }t          j        |                    d                                                    S )Nz::r   r   )hashlibmd5encode	hexdigest)r   r   rx   r   r]   r{   r   s          r!   make_idr      si    NNdNNdNNdjbNNEKRNN39"NND;t{{7++,,66888r#   c                     t                      S )Nr
   r<   r#   r!   sbr      s    <<r#   	only_dept	only_discOptional[List[str]]c                4   t                                          t                                        d                              d          }| r|                    d|           }|r|                    d|          }|                                }|j        pg S )Nz(department,disclosure_no,posted_at,titlei rw   disclosure_no)	r   tableTBL_SRCselectlimiteqin_executedata)r   r   qress       r!   fetch_sourcesr     s    


7""#MNNTTU[\\A *DDy)) .EE/9--
))++C8>rr#   c                     t                                          t                                                                        dd                                           d S )Nidr   )r   r   TBL_OUTdeleteneqr   r<   r#   r!   truncate_outr     sF    DDJJw  $$T"--5577777r#   rowsNonec                   t                      }	 |                    t                                        |                                            d S # t
          $ r}t          |          }d|v rjd|v rft                              d           d | D             }|                    t                                        |                                           n Y d }~d S d }~ww xY w)Ncolumn
actual_endz6[WARN] table has no 'actual_end'. retrying without it.c                J    g | ] }d  |                                 D             !S )c                &    i | ]\  }}|d k    ||S )r   r<   )r=   kr+   s      r!   
<dictcomp>z0_try_insert_chunk.<locals>.<listcomp>.<dictcomp>  s(    DDDca!|2C2Ca2C2C2Cr#   )items)r=   r   s     r!   r?   z%_try_insert_chunk.<locals>.<listcomp>  s/    SSSDDQWWYYDDDSSSr#   )	r   r   r   insertr   r   r   r   r   )r   clientr   msgrows2s        r!   _try_insert_chunkr     s    TTF
W$$T**2244444   !ffs??|s22NNSTTTSSdSSSELL!!((//779999 :99999s   ?A 
C$A>CC$c           
        | sd S d}t          dt          |           |          D ]f}| |||z            }	 t          |            # t          $ r:}t                              d||t          |          z   dz
  |           Y d }~_d }~ww xY wd S )Ni  r   z$[WARN] insert chunk %d~%d failed: %sr0   )r_   r^   r   r   r   r   )r   CHUNKrV   partr   s        r!   insert_rowsr     s    E1c$ii'' X XAagI	Xd#### 	X 	X 	XNNA1aD		kRSmUVWWWWWWWW	X	X Xs   A
B	0BB	c                 	   t          j        d          } |                     dd           |                     ddd           |                     d	dd
           |                     dt          d d           |                     dt          d d           |                                 }|j        r$t                              t          j	                   t          j                    }|j                            ddi           |j        r$d |j                            d          D             nd }t!          |j        |          }|                    d            |j        r5|j        s.t                              dt,                     t/                       g }|D ]}|                    d          pd}|                    d          pd}|                    d          pd}	|                    d          pd}
|r|sd	 t3          |||j                  }|                    d          pd}t5          |||          }t7          ||	          }|j        r<t                              d |||	|
pd!t9          |          t9          |                     |D ]{}|                    d"          p|                    d#          pd                                pd }t=          |||                    d$          pd||                    d%          |                    d&                    }|                    |                    d#          pd|                    d$          pdf          }|                    ||||	|
|                    d$          |                    d#          |                    d"          |                    d'          |                    d%          |                    d&          ||                    d(          d)           }z# t@          $ r'}t          !                    d*||           Y d }~d }~ww xY wt                              d+t9          |                     |j        stE          |           t                              d,           tG          d-|j        rd.nt9          |          i           d S )/NzjCollect all executives from kepco_id_test PDFs into kepco_org_all_test (with actual_end from change table))descriptionz--debug
store_true)actionz	--dry-runzDB write skip)r   helpz--rebuild-allz&truncate target table before insertingz--only-deptu*   특정 기관만 (예: 한국전력공사))typedefaultr   z--only-discu   쉼표로 disclosureNo 제한z
User-Agentz*GovBot/1.0 (+https://work.jjickjjicks.com)c                6    g | ]}|                                 S r<   rU   )r=   ro   s     r!   r?   zmain.<locals>.<listcomp>6  s     CCCAaggiiCCCr#   ,c                    |                      d          pd|                      d          pd|                      d          pdfS )Nrw   r   	posted_atr   )r   )r   s    r!   <lambda>zmain.<locals>.<lambda>8  sF    |!4!4!:AEE+<N<N<TRTVWV[V[\kVlVlVrpr s r#   )r   z[INIT] truncate %srw   r   r   r  title)r   r   )rk   rl   )r   z<[disc=%s][%s] posted=%s title=%s -> people=%d, end_events=%dr)   r|   ry   rx   r]   r{   rz   r}   )r   rw   r   r  r  rx   ry   r|   rz   r]   r{   r   r}   z[WARN] disc=%s parse failed: %szto insert rows: %dzdone.insertedr   )$argparseArgumentParseradd_argumentr   
parse_argsr   r   setLevelloggingDEBUGrequestsSessionheadersupdater   r   r   r   sortrebuild_alldry_runinfor   r   r   r   r   r   r^   r    r   r\   r   r   r   print)pargsr   only_disc_listsrc_rowsout_rowsr   r   r   r  r  pjr   peopleend_mappinfor   row_idr   r   s                       r!   mainr#  (  s     -Y  	Z  	Z  	ZANN9\N222NN;|/NJJJNN?<>fNgggNN=sD?kNlllNN=sD?^N___<<>>Dz '&&&  GOL*VWXXXGK~_CC)=)=c)B)BCCCC[_NT^^<<HMMssMttt  ('222%'H )G )Guu\""(buu_%%+EE+&&,"	g$" 	4 	!	G%gt4:FFFBVVH%%+F!&$4PPPF#F)DDDGz _[!4U\cCKKQTU\Q]Q]_ _ _    		&))HUYYz-B-BHbOOQQYUY tUYYv->->-D"dEIIV]L^L^`e`i`ijo`p`pqq$[[%))J*?*?*E2uyyQWGXGXG^\^)_``
 "&%)!*"!IIf-- %		* 5 5!IIf--#ii11"YYw// 99U++",#ii11! !    (  	G 	G 	GNN<dAFFFFFFFF	G KK$c(mm444< H
KK	:DL;qqc(mm
<=====s   HP
Q$QQ__main__)r   r   r   r   )r   r   r   r$   )r-   r   r   r   r   r   )r5   r   r   r   )r8   r9   r   r   )rC   r   r   r   )rO   r   r   r   )rR   r   r   r9   )rb   r   r   rc   re   )r5   r   rk   r   rl   r   r   rm   )r5   r   r   r   r   r   )F)r   r   r   r   r   r$   r   r   )r   r   r   r   rx   r   r   r   r]   r   r{   r   r   r   )r   r   r   r   r   rm   )r   rm   r   r   )r   rm   ):
__future__r   r   sysr   r   r	  r  r   typingr   r   r   r   r   r  pypdfr	   r   abspathr@   dirname__file__ROOT_DIRr\   app.services.supabase_servicer   getenvupperr   basicConfig	getLoggerr   r   r   r   r   makedirsr"   r,   r4   r7   rB   rN   rQ   ra   rj   r   r   r   r   r   r   r   r   r   r#  __name__r<   r#   r!   <module>r4     s6   " " " " " " 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3        7??27<<(A(A4HHII38HOOH 4 4 4 4 4 4BIk6**0022	  ),I J J J J		3	4	4

5
& Gd # # # #   W W W W
- - - -- - - -   
' ' ' '' ' ' '; ; ; ;   <6 6 6 6 6r+ + + +\% % % % %N9 9 9 9
     8 8 8   X X X XH> H> H>T zDFFFFF r#   