
    i˫h9                    8   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
mZmZmZmZmZ ej                            ej                            ej                            e          dd                    Zeej        vrej                            e           d dlmZ d dlmZmZmZmZ  ej        dd                                          Z  ej!        d	          Z" ej#                    Z$e$%                     ej&        d
                     e"'                    e$           e"(                    e            dZ)dZ*d Z+dLdMdZ,h dZ-dNdZ.dOdZ/dPdQd"Z0dRd$Z1 ej2        d%ej3                  Z4 ej2        d&ej3                  Z5 ej2        d'          Z6 ej2        d(          Z7 ej2        d)ej3                  Z8 ej2        d*ej3                  Z9dSd,Z:dTd/Z;dUd1Z<dVd8Z=dWd<Z>	 	 	 	 	 dXdYdDZ?	 	 	 	 dZd[dIZ@dJ ZAeBdKk    r eA             dS dS )\    )annotationsN)AnyDictListOptionalTuplez..
get_client)make_sessionpdf_to_page_textsmerge_page_textsparse_people	LOG_LEVELINFOkepco_history_backfillz[%(levelname)s] %(message)skepco_id_testkepco_historyc                     t                      S )Nr	        7/var/www/html/bot/app/crawler/kepco_history_backfill.pysbr      s    <<r     limitintoffsetreturnList[Dict[str, Any]]c                   |                      t                                        d                              dd                              dd                              |||z   dz
            }|                                }|j        pg S )N*pdf_urlzhttp%	posted_atF)desc   )tableTABLE_ID_TESTselectlikeorderrangeexecutedata)clientr   r   qress        r   fetch_id_rowsr0      st    	m	$	$
&++
$y'
"
"
%%%
(
(
%*
+
+	 
 ))++C8>rr   >	      –   —   없음N/An/a Nonenull-xOptional[str]c                    | d S t          |                                           }t          j        dd|          }|t          v rd n|S )Nz\s+ )strstripresub_BLANK_TOKENS)r:   ss     r   _norm_blank_to_nonerD   '   sF    ytAA
vsAA%%441,r   
departmentnamer>   taskboolc                   |pd                                 }|pd                                 }t          |          }	 |                     t                                        d                              d|                              d|                              d          }||                    dd                                          }|j	        sg rdS |                     t                                        d                              d|                              d|                              dd                              d                                          }	t          |	j	        pg           d	k    S |                    d|                                          }t          |j	        pg           d	k    S # t          $ r)}
t                              d
||||
           Y d }
~
dS d }
~
ww xY w)Nr6   idrE   rF   r$   rG   r8   Tr   z$[exists] check failed (%s/%s/%s): %sF)r?   rD   r%   TABLE_HISTORYr'   eqr   is_r+   r,   len	Exceptionloggerwarning)r-   rE   rF   rG   depnmtkbaser/   res2es              r   history_existsrX   .   s   
"
"
$
$C:2



B
d
#
#B]++L#&&FBq	 	
 :((66**2244C B tLL//VD\\Rc**R^^R^^U1XXWYY  tyB''!++''&"%%--//Csx~2&&**   =sBANNNuuuuus&   BG B'G AG 
G7G22G7Frowsdryc           
       	 |st                               d           dS h d		fd|D             }|r7t                               dt          |                     t          |          S d}d}t	          dt          |          |          D ]}||||z            }	 |                     t                                        |                                           |t          |          z  }b# t          $ ro}t           
                    d||t          |          z   dz
  |           t           
                    d	t          j        |d         d
                     Y d }~d }~ww xY wt                               d|           |S )Nz[insert] nothing to insertr   >   endrF   rG   startcareergenderr!   positionr"   
actual_endrE   c                .    g | ]fd D             S )c                D    i | ]}|v |                     |          S r   )get).0krs     r   
<dictcomp>z2insert_history_rows.<locals>.<listcomp>.<dictcomp>P   s*    555a1ff15588fffr   r   )re   rg   allows    @r   
<listcomp>z'insert_history_rows.<locals>.<listcomp>P   s/    DDD!5555U555DDDr   z[DRY][insert] %d rowsi  z&[insert][ERROR] failed chunk %d-%d: %sr$   z[insert][ERROR] sample row: %sFensure_asciiz[insert] inserted rows=%d)rP   debuginforN   r*   r%   rK   insertr+   rO   errorjsondumps)
r-   rY   rZ   cleanedtotalBichunkrW   ri   s
            @r   insert_history_rowsrx   K   s    1222qvvvEDDDDtDDDG
 +S\\:::7||EA1c'llA&& e e!A#	eLL''..u55==???SZZEE 	e 	e 	eLLA1aE

lSTnVWXXXLL94:eAh]b;c;c;cdddddddd	e KK+U333Ls   AC//
E(9A%E##E(urlc                   	 |                      |d          }|j        dk    s|j        s#t                              d||j                   d S t          j        dd          \  }}t          j        |d          5 }|	                    |j                   d d d            n# 1 swxY w Y   |S # t          $ r'}t                              d	||           Y d }~d S d }~ww xY w)
N<   )timeout   z[PDF] fetch failed %s (%s)
alio_hist_z.pdf)prefixsuffixwbz[PDF] error %s: %s)rd   status_codecontentrP   rQ   tempfilemkstemposfdopenwriterO   )sessionry   rg   fdpathfrW   s          r   fetch_pdf_to_tmp_directr   b   s'   KKRK((=CqyNN7amLLL4#<GGGDYr4   	AGGAI	 	 	 	 	 	 	 	 	 	 	 	 	 	 	   +S!444tttttsB   A
B0 .B0 <B#B0 #B''B0 *B'+B0 0
C!:CC!u    변경\s*전\s*(?:이름|성명)u    변경\s*후\s*(?:이름|성명)u6   변경\s*전\s*(?:이름|성명)\s*[:：]?\s*([^\s/]+)u6   변경\s*후\s*(?:이름|성명)\s*[:：]?\s*([^\s/]+)uN   변경\s*전\s*(?:이름|성명)[\s\S]{0,120}?변경\s*후\s*(?:이름|성명)u   [^\n\r]{0,80}?([가-힣]{2,6})\s+([가-힣]{2,6})\s+(?:선임|해임|임명|연임|재선임|중임|퇴임|사임|유지|변경|개편)rC   c                b    | s| S |                      dd          } t          j        dd|           S )N    r=   z[ \t]+)replacer@   rA   )rC   s    r   _compact_textr   z   s6     			(C  A6)S!$$$r   	full_text.Tuple[Optional[str], Optional[str], bool, str]c                6   | sdS t          | d d                   }t                              |          }t                              |          }t	          |r|                    d          nd           }t	          |r|                    d          nd           }|r|r
||||k    dfS t                              |          rt                              |          rt                              |          }|r||	                                |	                                dz            }t                              |          }|rRt	          |                    d                    }	t	          |                    d                    }
|	r|
r
|	|
|	|
k    dfS dS )	N)NNFno_texti  r$   inline_labelsiX     table_near_header)NNF	not_found)r   RE_INLINE_OLDsearchRE_INLINE_NEWrD   groupRE_OLD_LABELRE_NEW_LABELRE_HEADER_BLOCKr\   RE_NAME_PAIR_NEAR)r   headm1m2old_anew_ahbtailpairold_bnew_bs              r   parse_name_change_verboser      s    ,++5D5)**D			d	#	#B			d	#	#Br ;t<<Er ;t<<E ? ?eeun>>4   	O\%8%8%>%> 	O##D)) 	O"&&((S.01D$++D11D O+DJJqMM::+DJJqMM:: OU O %%5.;NNN))r   r   c                    t           j                            t           j                            |                     }|r7t           j                            |          st          j        |d           d S d S d S )NT)exist_ok)r   r   dirnameabspathexistsmakedirs)r   ds     r   
ensure_dirr      so    
--..A &"" &
A%%%%%%& & & &r   log_pathrec_metaDict[str, Any]old_namenew_namereasonc                   t          |            t          j        d          |                    d          |                    d          |                    d          |                    d          |                    d          |                    d          |||dd	}t	          | d
d          5 }|                    t          j        |d          dz              d d d            d S # 1 swxY w Y   d S )Nz%Y-%m-%d %H:%M:%SrJ   rE   titledisclosure_nor"   r!   u2   old_name == new_name → actual_end update skipped)timerJ   rE   r   r   r"   r!   r   r   r   noteautf-8encodingFrk   
)r   r   strftimerd   openr   rq   rr   )r   r   r   r   r   payloadr   s          r   log_same_name_caser      sA   x122ll4  ll<00g&&!o66\\+..<<	**D G 
hg	.	.	. @!	
7777$>???@ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @s   4-C..C25C2starts_from_current_pdf	List[str]r"   c                H   |r|r|sdS d}t          t          |                    D ]}t          |          }|s	 |                     t                                        d|i                              d|                                                              d|                              dd          	                                 |dz  }t                              d|||           # t          $ r'}t                              d|||           Y d }~d }~ww xY w|S )	Nr   ra   rF   r]   r8   r$   u%   [actual_end] %s ← name=%s, start=%sz)[actual_end][ERROR] name=%s, start=%s: %s)sortedsetrD   r%   rK   updaterL   r?   rM   r+   rP   rn   rO   rp   )r-   r   r   r"   updated_totalstst_normrW   s           r   !update_actual_end_for_name_changer      s@    2 ) qMS01122 \ \%b)) 	
	\\\-((flI.//b))**b'""c,''giiiQMKK?HV]^^^^ 	\ 	\ 	\LLDhPWYZ[[[[[[[[	\s   B1C..
D8DDrecrm   same_log_pathno_dup_check	dump_headdump_peopleTuple[int, int]c	                
   |                     d          pd                                }	|	s0t                              d|                     d                     dS |                     d          pd                                }
|                     d          }t	          ||	          }|sdS t          |          pg }|rt          |          nd}|r{	 t          |dd	
          5 }|                    |d d                    d d d            n# 1 swxY w Y   n2# t          $ r%}t          
                    d|           Y d }~nd }~ww xY wt          |          \  }}}}t                              d||||           t          ||
          pg }|rv	 t          |dd	
          5 }t          j        ||dd           d d d            n# 1 swxY w Y   n2# t          $ r%}t          
                    d|           Y d }~nd }~ww xY wt                              dt!          |          |                     d          |
||	           |rD|rA|r|rft#          ||                     d          |                     d          |                     d          |                     d          ||	d|||           t                              d|           ng }|D ]h}|                     d          pd                                |k    r9t%          |                     d                    }|r|                    |           i|rL|st)          | |||           n7t                              d|t+          t-          |                    |           g }t-                      }|D ]}|                     d          pd                                }|                     d          pd                                }t%          |                     d                    }|sz|sY|dk    }|sQ|||pdf}||v rt/          | |||          rt                              d|||           |                    |           ||t%          |                     d                    t%          |                     d                    t%          |                     d                    t%          |                     d                     ||                     d!          pg ||	d"
} |                    |            t                              d#t!          |                     t3          | ||$          }!t!          |          |!fS )%Nr!   r6   z[skip] empty pdf_url id=%srJ   )r   r   rE   r"   wr   r   i@  z[dump_head][WARN] %sz*[name-change] old=%s new=%s same=%s via=%s)department_hintFr   )rl   indentz[dump_people][WARN] %sz9[parse] people_count=%d id=%s dept=%s posted_at=%s pdf=%sr   r   )rJ   rE   r   r   r"   r!   u+   [name-change] same '%s' → skip actual_endrF   r]   z1[DRY] would set actual_end for %s starts=%s to %srG   u   공석z[dup-skip] (%s, %s, %s)r`   r_   r\   r^   )
rE   rF   r`   r_   r]   r\   rG   r^   r"   r!   z[prepare] to_insert=%d)rZ   )rd   r?   rP   rm   r   r   r   r   r   rO   rQ   r   r   rq   dumprn   rN   r   rD   appendr   r   r   rX   addrx   )"r-   r   r   rm   rZ   r   r   r   r   r!   r   r"   pdf_pathtextsmerged_textr   rW   r   r   is_samewhypeoplestarts_for_oldpst_val	to_insert	seen_keysdeptrF   	task_norm	is_vacantkeyrowinserteds"                                     r   process_one_recordr      sR    wwy!!'R..00G 13774==AAAtww|,,299;;O$$I&w88H th''-2E-2:"5)))K 6	6iw777 ,1ETE*+++, , , , , , , , , , , , , , , 	6 	6 	6NN1155555555	6 (A'M'M$Hh
LL=xSZ\_```+GGGM2F 8	8k3999 CQ	&!%BBBBC C C C C C C C C C C C C C C 	8 	8 	8NN3Q77777777	8 KKKFSWWT]]OYQ Q Q  RH R 	R ,"=''$--"%'',"7"7 WWW--%(WW_%=%=!*&3 3 Xs, , , KKExPPPPN 6 6EE&MM'R..00H<<0w@@F 6&--f555 R R5fhXabbbbKK S (&^1D1D*E*EyR R R ')II  l##)r0022f#**,,'f66	 	 		#)I #T9?3)##!&$i@@ LL!:D$	RRRc""" +AEE*,=,=>>)!%%//::(w88&quuU||44eeHoo+"
 
 	
KK(#i..999"69#>>>Hv;;  sx   D 'DD DD DD 
E'EEG )GG GG GG 
H	$HH	logs/name_change_same.txt
limit_pagemax_rowsOptional[int]c                   t                      }t                      }		 |                    t                                        d                              d                                           n4# t          $ r'}
t          	                    dt          |
            d }
~
ww xY wdx}x}}d}	 t          || |          }|sn|D ]}|dz  }t          ||	|||||||	  	        \  }}|dz  }||z  }t                              d|                    d          |||                    d	          |                    d
                     |r||k    r|||dc S t          j        d           |t!          |          z  }|||dS )NrJ   r$   z [INIT] %s table check failed: %sr   T)r   r   )rm   rZ   r   r   r   r   z5[id=%s] people=%d, inserted=%d, dept=%s, posted_at=%srE   r"   )scanned	processedr   g?)r   r   r%   rK   r'   r   r+   rO   rP   rp   r0   r   rn   rd   r   sleeprN   )r   rZ   r   rm   r   r   r   r   r-   r   rW   r   r   r   r   batchr   
cnt_peopleinss                      r   run_allr   ,  s    TTFnnG]##**40066q99AACCCC   7JJJ &'&I&7FfJvFFF 	 	 	CqLG0M)YK  OJ
 NIOHKKOz38M8MswwWbOcOce e e ZI11#*PXYYYYYJsOOOO#e**%& YHMMMs   AA1 1
B";"BB"c            
     .   t          j                    } |                     dd           |                     dt          d           |                     dt          d            |                     dd           |                     d	t          d
           |                     dd           |                     ddd           |                     dt          d d           |                     dt          d d           |                                 }|j        rt                              d           |j	        rXt          |j                   t          |j        dd          5 }|                    d           d d d            n# 1 swxY w Y   t          |j        |j        |j        |j        |j        |j        |j        |j                  }t                              dt-          j        |d                     t1          |           d S )Nz	--dry-run
store_true)actionz--page-sizer   )typedefaultz--maxz--debugz
--same-logr   z--reset-same-logz--no-dup-checku,   중복체크 끄기(문제 원인 진단용))r   helpz--dump-headu)   PDF 병합 텍스트 헤더 저장 경로)r  r  r  z--dump-peopleu#   파싱된 people JSON 저장 경로DEBUGr   r   r   r6   )r   rZ   r   rm   r   r   r   r   zdone: %sFrk   )argparseArgumentParseradd_argumentr   r>   
parse_argsrm   rP   setLevelreset_same_logr   same_logr   r   r   	page_sizedry_runmaxr   r   r   rn   rq   rr   print)apargsr   summarys       r   mainr  O  s   		 	"	"BOOKO555OOMTO:::OOG#tO444OOIlO333OOLs4OOPPPOO&|O<<<OO$\@nOoooOOMT@kOlllOOO#tBgOhhh==??Dz !    4=!!!$-w777 	1GGBKKK	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 >t|dhdjm$2C.d.>  G
 KK
DJwUCCCDDD	'NNNNNs   +FFF__main__)r   r   )r   r   r   r   r   r   )r:   r;   r   r;   )rE   r;   rF   r>   rG   r;   r   rH   )F)rY   r   rZ   rH   r   r   )ry   r>   r   r;   )rC   r>   r   r>   )r   r>   r   r   )r   r>   )
r   r>   r   r   r   r>   r   r>   r   r>   )r   r>   r   r   r"   r;   r   r   )FFNFNN)r   r   rm   rH   rZ   rH   r   r;   r   rH   r   r;   r   r;   r   r   )r   FNFr   FNN)r   r   rZ   rH   r   r   rm   rH   r   r>   r   rH   r   r;   r   r;   r   r   )C
__future__r   r   sysr@   rq   r   r  loggingr   typingr   r   r   r   r   r   r   joinr   __file__ROOT_DIRr   app.services.supabase_servicer
   app.crawler.group_n8nr   r   r   r   getenvupperr   	getLoggerrP   StreamHandler_handlersetFormatter	Formatter
addHandlerr	  r&   rK   r   r0   rB   rD   rX   rx   r   compileMr   r   r   r   r   r   r   r   r   r   r   r   r   r  __name__r   r   r   <module>r)     s   " " " " " " ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; 3 3 3 3 3 3 3 3 3 3 3 3 3 37??27<<(A(A4NNOO38HOOH 4 4 4 4 4 4            BIk6**0022			3	4	4 7 ""   'g'(EFF G G G   (    	          POO- - - -   :    .    rz=rtDDrz=rtDD
TUU
TUU"*nprptuuBJ OD  
% % % %* * * *,& & & &
@ @ @ @$   , 9>6:,12648h! h! h! h! h!T RV6QCG)-!N !N !N !N !NF  : zDFFFFF r   