
    G)hl(                       d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZm	Z	m
Z
mZmZ d dlmZ d dlZd dlmZ d dlmZ  ej        dd                                          Z ej        ed	
            ej        d          ZdZdZdZdZ ej        ed           d6dZd7dZ d8dZ!d9dZ"d:dZ#d;d<d%Z$d=d>d)Z%d?d-Z&d. Z'd@d/Z(d;d0Z)dAdBd3Z*d4 Z+e,d5k    r e+             dS dS )C    )annotationsN)DictAnyListTupleOptional)date)	PdfReader
get_client	LOG_LEVELINFOz[%(levelname)s] %(message)s)levelformatkepco_history_build_testkepco_id_testkepco_history_testz/var/www/html/bot/tmp/alioz$https://alio.go.kr/download/pdf.jsonT)exist_oksOptional[str]returnstrc                x   | sdS |                      dd                               dd                               dd          }d                    |                                          }|                     dd                               dd                               d	d          }|                                S )
N      	z 
 
z 
z
 )replacejoinsplitstrip)r   outs     5/var/www/html/bot/scripts/kepco_history_build_test.py	normalizer&      s     r
))D#


&
&x
5
5
=
=dC
H
HC
((399;;

C
++fd
#
#
+
+E4
8
8
@
@
M
MC99;;    krc                ,   | sd S dd l }|                    d|           }|sd S t          |                    d                    t          |                    d                    t          |                    d                    }}}|dd|dd|dS )	Nr   u1   (\d{4})\s*년\s*(\d{1,2})\s*월\s*(\d{1,2})\s*일         04d-02d)researchintgroup)r(   r0   mymods         r%   	to_iso_krr8       s     tIII
		FKKA t1771::AGGAJJQWWQZZ1rA&&&b&&&q&&&&r'   boolc                p    | sdS d                     |                                           }|dv pd|v pd|v S )NTr   )r.   u   –u   —u   미정u	   무기한)r!   r"   )r   vs     r%   is_blank_dater<   +   sG     t
		A##LAL;!;KLr'   re_patc                    dd l }|                    | |          }|r'|                    d                                          nd S )Nr   r*   )r0   r1   r3   r#   )r=   r   r0   r4   s       r%   pickr?   1   sD    III
		&!A!",1771::,r'   pagesList[Dict[str, Any]]c                ^    d | D             }t          d                    |                    S )Nc                j    g | ]0}||                     d           |                     d           pd1S )textr   get).0ps     r%   
<listcomp>z$merge_page_texts.<locals>.<listcomp>7   s;    GGGQ1GvGQUU6]] bGGGr'   r   )r&   r!   )r@   textss     r%   merge_page_textsrK   6   s/    GG%GGGETYYu%%&&&r'   Fsessionrequests.SessiondiscDict[str, Any]c                   t           j                            t          | d          }t           j                            |          r>t          |dd          5 }t          j        |          cddd           S # 1 swxY w Y   |                     t          d|id          }|
                                 |j        }t           j                            t          | d	          }t          |d
          5 }|                    |           ddd           n# 1 swxY w Y   g }	 t          |          }	t          |	j        d          D ]G\  }
}	 |                                pd}n# t"          $ r d}Y nw xY w|                    |
|d           Hn3# t"          $ r&}t&                              d||           Y d}~nd}~ww xY wt+          |          }|t-          |          ||d}t          |dd          5 }t          j        ||dd           ddd           n# 1 swxY w Y   |rt&                              d||           |S )u^    /tmp/alio/{disc}.json 없으면 pdf.json → PDF 저장 → 텍스트 추출 → JSON 생성 z.jsonrutf-8)encodingNdisclosureNo<   )paramstimeoutz.pdfwbr*   )startr   )pagerD   z![WARN] PdfReader fail disc=%s: %s)rT   
page_countr@   mergedwFr+   )ensure_asciiindentz[DBG] saved %s / %s)ospathr!   TMP_DIRexistsopenjsonloadrF   PDF_JSON_URLraise_for_statuscontentwriter
   	enumerater@   extract_text	ExceptionappendloggerwarningrK   lendumpdebug)rL   rN   rs   	json_pathfrQ   	pdf_bytespdf_pathr@   readerirZ   rD   er\   payloads                   r%   ensure_json_for_discr|   :   s&   Wnnn55I	w~~i    )S7333 	 q9Q<<	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	L.$)?LLA	Iw||G]]]33H	h		 		               E	E8$$ Q777 	4 	4GAt((**0b   LL!T223333	4  E E E:D!DDDDDDDDE e$$F#3u::Y_``G	iw	/	/	/ <1	'15;;;;< < < < < < < < < < < < < < < A*Hi@@@Nsr   A;;A?A?8DD!D(*F E*)F *E96F 8E99F 
G GG>H##H'*H'r\   department_hint*Tuple[List[Dict[str, Any]], Optional[str]]c                   ddl }|}|s?|                    d|           }|r'|                    d                                          }d}|                    d|           }|r"t	          |                    d                    }|                     d          }|dk    r
| |d         n| }d |                    d|          D             }	g }
|	D ]}t          |          }t          d	|          }t          d
|          }t          d|          }t          d|          }|                    d|          }|r'|                    d                                          nd}|r'|                    d                                          nd}t          |          rdnt	          |          pt          |          }t          |          rdnt	          |          pt          |          }t          d|          }d |pd                    d          D             }|rd
                    |          nd}|od|v }|od|v pd|v }|s|r||s|
                    |pd|||||||d           |
|fS )uF   
    사람 목록과 문서의 '기준일'(ISO)을 함께 반환
    r   Nu-   임원\s*현황\s*\n([^\n]+)\n임원\s*현황r*   u<   기준일\s*([0-9]{4}\s*년\s*\d{1,2}\s*월\s*\d{1,2}\s*일)u   직위 c                <    g | ]}|                     d           |S )u   직위)
startswith)rG   secs     r%   rI   z"extract_people.<locals>.<listcomp>q   s*    ]]]CNNS[D\D\]]]]r'   u   \n(?=직위\s)u   직위\s*([^\n]+?)\s*성명u4   성명\s*([^\n]+?)(?=\s*(?:직책|성별|임기|\n))u-   직책\s*([^\n]+?)(?=\s*(?:성별|임기|\n))u   성별\s*([남여])uG   임기\s*\(시작일\)\s*([^\n(]+?)\s*\(종료일\)\s*([^\n]+?)(?:\n|$)r+   u   주요경력\s*([\s\S]*?)(?=\n\s*(?:선임절차|선임절차규정|당연직여부|직위\s|기준일|제출일|기관 공시 담당자|$))c                ^    g | ]*}|                                 |                                 +S  )r#   )rG   lines     r%   rI   z"extract_people.<locals>.<listcomp>   s2    aaaTXT^T^T`T`atzz||aaar'   r   r   u
   변경 전u
   변경 후)
departmentnamepositiontaskgenderrY   endcareer)r0   r1   r3   r#   r8   findr"   r&   r?   r<   r!   rn   )r\   r}   r0   deptr4   	basis_isom_basis	start_idxbodysectionsr$   rawr   r   r   r   r   dm	start_rawend_raw	start_isoend_isocareer_blockcareer_listcareer_textjunk_position	junk_names                              r%   extract_peopler   ^   s    IIID &IIFOO 	&771::##%%D IiiWY_``G 0gmmA..//	I&&I!*a6)**VD]]rxx(94@@]]]H "C % %nn6<<KSQQDcJJ,c22YYacfgg+-7BHHQKK%%'''4	+-7BHHQKK%%'''4))44fDD)I:N:N:fR[\eRfRf	)'22dDD)G:L:L:dR[\cRdRd \
 
 ba1C0J0J40P0Paaa0;Edii,,, !?lh&>O|t3N9M	 	I 	 	

*" !	
 	
 		 		 		 		 	>r'   r   r   rolec                    |  d| d|pd }t          j        |                    d                                                    S )Nz::r   rR   )hashlibmd5encode	hexdigest)r   r   r   bases       r%   make_keyr      sI    **d**djb**D;t{{7++,,66888r'   c                     t                      S Nr   r   r'   r%   sbr      s    <<r'   c                 :   	 t                                          t                                        d                              d                                          } | j        pg S # t          $ r'}t          	                    d|           g cY d }~S d }~ww xY w)Nz"department,disclosure_no,posted_ati z[WARN] fetch_all_ids error: %s)
r   tableTBL_IDselectlimitexecutedatarm   ro   rp   )resrz   s     r%   fetch_all_idsr      s    ddjj  ''(LMMSSTZ[[cceex~2   7;;;						s   A&A) )
B3BBBc                ~   | r"t                               dt                     d S 	 t                                          t                                                                        dd                                           d S # t          $ r,}t           	                    dt          |           Y d }~d S d }~ww xY w)Nz[DRY] truncate %sidr   z[WARN] truncate %s failed: %s)
ro   infoTBL_OUTr   r   deleteneqr   rm   rp   )dryrz   s     r%   truncate_outr      s    
 '111D


7""$$((b1199;;;;; D D D6CCCCCCCCCDs   AB 
B<!B77B<  rowsc                   |r0t                               dt          t          |                      d S t	          dt          |           |          D ]}| |||z            }	 t                                          t                                        |                                           \# t          $ r@}t           
                    dt          ||t          |          z   dz
  |           Y d }~d }~ww xY wd S )Nz[DRY] insert %s rows=%dr   z&[WARN] insert %s rows %d~%d failed: %sr*   )ro   r   r   rq   ranger   r   insertr   rm   rp   )r   chunkr   ry   partrz   s         r%   bulk_insert_outr      s	   
 -wD		BBB1c$ii'' c cAagI	cDDJJw&&t,,446666 	c 	c 	cNNCWaQRSVW[S\S\Q\]^Q^`abbbbbbbb	c	c cs   !AB--
C776C22C7c                    t          j        d          } |                     dd           |                     dd           |                                 }|j        r$t
                              t          j                   t                      }|
                    d            t          j                    }i }|D ](}|                    d	          pd
}|                    d          pd
}|                    d          pd
}|r|sMt          |||j                  }	|	                    d          pd
}
t          |
|          \  }}|D ]}|                    d          pd
}|                    d          p|                    d          pd
                                pd }t#          |||          }|                    |          }|s||||                    d          |                    d          |                    d          |                    d          |                    d          ||                    d          ||||d||<   
|                    d          s |                    d          r|d         |d<   |                    d          r|d         |d<   dD ]"}|                    |          r||         ||<   #|r||d<   ||d<   ||d<   *t%          |                                          }t)          |j                   t-          ||j                   t
                              dt1          |                     t3          dt1          |          i           d S )NzBuild KEPCO history (TEST))descriptionz--debug
store_true)actionz	--dry-runc                ^    |                      d          pd|                      d          pdfS )Nr   r   	posted_atrE   )rQ   s    r%   <lambda>zmain.<locals>.<lambda>   s-    QUU<006Bk8J8J8PbQ r'   )keyr   r   disclosure_nor   )rs   r\   )r}   r   r   r   r   rY   r   r   )r   r   r   r   r   r   rY   r   
actual_endr   
first_disc	last_discfirst_posted_atlast_posted_at)r   r   r   r   r   r   r   )r   zbuilt history rows: %dr   )argparseArgumentParseradd_argument
parse_argsrs   ro   setLevelloggingDEBUGr   sortrequestsSessionrF   r|   r   r#   r   listvaluesr   dry_runr   r   rq   print)rH   argsr   r   aggrQ   r   rN   idatepjr\   peopler   pinfor   r   r   curkout_rowss                       r%   mainr      s   ,HIIIANN9\N222NN;|N444<<>>Dz '&&&??DIIQQIRRRA%'C 5. 5.uu\""(buu_%%+k""(b 	4 	 "!T<<<!!'R*64HHH	 (	. (	.E99V$$*DIIf%%D:)>)>D"KKMMUQUDD$--C''#,,C !."&  %		* 5 5!IIf--#ii11"YYw// 99U++"+#ii11"&!%',&+ C$ www'' 2EIIg,>,> 2#(>CL99U## .!&uCJ> * *Ayy|| *!&qA 2(1C%#'K (-$%%Q(	.V CJJLL!!HT\""""H$,////
KK(#h--888	63x==
!"""""r'   __main__)r   r   r   r   )r(   r   r   r   )r   r   r   r9   )r=   r   r   r   r   r   )r@   rA   r   r   )F)rL   rM   rN   r   r   rO   r   )r\   r   r}   r   r   r~   )r   r   r   r   r   r   r   r   )r   rA   )r   F)r   rA   )-
__future__r   r`   re   r   r   r   typingr   r   r   r   r   datetimer	   r   pypdfr
   app.services.supabase_servicer   getenvupperr   basicConfig	getLoggerro   r   r   rb   rg   makedirsr&   r8   r<   r?   rK   r|   r   r   r   r   r   r   r   __name__r   r'   r%   <module>r      st   " " " " " " + + + + + + + + + + + + + + + + + + + + 3 3 3 3 3 3 3 3 3 3 3 3 3 3              4 4 4 4 4 4BIk6**0022	  ),I J J J J		5	6	6 '5 Gd # # # #   	' 	' 	' 	'M M M M- - - -
' ' ' '" " " " "H< < < < <|9 9 9 9
     D D D D	c 	c 	c 	c 	cL# L# L#\ zDFFFFF r'   