o
    ê$Âhä:  ã                   @   sJ  d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZmZm	Z	 d dl
m
Z
mZ d dlZd dlmZ d dlmZ e  dd¡ ¡ Zejedd	 e d
¡ZdZed ZddiZe d¡ZdedB defdd„Zdedefdd„Zdedededefdd„Zdedededededefdd„Z ded edefd!d"„Z!dHd&ej"d'edej#fd(d)„Z$d&ej"dee fd*d+„Z%d,edefd-d.„Z&d&ej"d/ede	ee eef fd0d1„Z'de(fd2d3„Z)d4e(d5e(d6e(d7e*de	e+ef f
d8d9„Z,deeef fd:d;„Z-d<edeeef fd=d>„Z.d<efd?d@„Z/dIdDe*d6e(d7e*defdEdF„Z0e1dGkr#e2e0ƒ ƒ dS dS )Jé    N)ÚListÚDictÚTuple)ÚdatetimeÚtimezone)ÚBeautifulSoup)Ú
get_clientÚ	LOG_LEVELÚINFOz[%(levelname)s] %(message)s)ÚlevelÚformatÚme_org_pipelinezhttps://me.go.krz/home/web/index.do?menuId=10428z
User-Agentz*GovBot/1.0 (+https://work.jjickjjicks.com)z\d{2,4}-\d{3,4}-\d{4}ÚsÚreturnc                 C   s"   d  | pd dd¡ dd¡ ¡ ¡S )Nú Ú õ   Â u   â€‹)ÚjoinÚreplaceÚsplit©r   © r   ú0/var/www/html/bot/app/crawler/me_org_pipeline.pyÚ_clean   s   "r   c                 C   s   t | ƒS ©N)r   r   r   r   r   Ú_norm   s   r   ÚnameÚpositionÚ
departmentc                 C   s2   t | ƒ› dt |ƒ› dt |ƒ› }t | ¡ ¡ ¡ S ©Nú|©r   ÚhashlibÚsha256ÚencodeÚ	hexdigest)r   r   r   Úbaser   r   r   Ú	_key_hash   s    r'   ÚtaskÚphonec              
   C   sF   t | ƒ› dt |ƒ› dt |ƒ› dt |ƒ› dt |ƒ› 	}t | ¡ ¡ ¡ S r   r!   )r   r   r   r(   r)   r&   r   r   r   Ú	_row_hash#   s   4r*   Úname_rawÚposition_rawc           	      C   s  t | ƒ}t |ƒ}|r|s|S |}t d|¡d  ¡ }dd„ ||hD ƒ}|D ]S}|d| fD ]}| |¡rBt |d t|ƒ … ƒ d¡}q-t dt |¡› dd	|¡ d¡}t d
t |¡› dd	|¡ d¡}| |¡rxt |d t|ƒ … ƒ d¡}q%t |ƒ}|r|S |S )Nu   [ \t(\[/Â·\-]r   c                 S   ó   g | ]}|r|‘qS r   r   ©Ú.0Úcr   r   r   Ú
<listcomp>/   ó    z,strip_position_from_name.<locals>.<listcomp>r   u    -Â·/â€“â€”z\s*\(\s*z	\s*\)\s*$r   z\s*z\s*\([^)]+\)\s*$)	r   Úrer   ÚstripÚendswithÚlenÚrstripÚsubÚescape)	r+   r,   ÚnÚpÚoriginalr&   Ú
candidatesÚcandÚsufr   r   r   Ústrip_position_from_name(   s&   
€""
€r@   é   é   çffffffæ?ÚsessÚurlc           
      C   sŒ   d }t |ƒD ]=}z| j|t|d}| ¡  |W   S  tyC } z|}|d|  }	t d|d ||	|¡ t |	¡ W Y d }~qd }~ww |‚)N)ÚheadersÚtimeouté   zGET retry %d/%d in %.1fs: %sé   )	ÚrangeÚgetÚHEADERSÚraise_for_statusÚ	ExceptionÚloggerÚwarningÚtimeÚsleep)
rD   rE   rG   ÚtriesÚbackoffÚlastÚiÚrÚerR   r   r   r   Ú
_retry_get<   s   
€ürY   c           	      C   sê   t | tƒ}t|jdƒ}tƒ }g }| d¡D ]]}| d¡pd ¡ }|r'| d¡r(q| d¡r/|n| d¡r8t	| nt	d | }| 
¡ }d|v sWd	|v sWd
|v sWd|v sWd|v sWq||v r\q| |¡ | |t| d¡pm| ¡ ƒdœ¡ q|S )an  Parse the org chart page and extract unit links.

    The ME org page tends to link each unit to a detail page. We conservatively
    collect anchors that look like org detail links. Heuristics:
    - href starts with '/' or 'http'
    - and contains one of: 'org', 'dept', 'team' or ends with 'index.do' while including menuId
    - avoid javascript/hash links
    úhtml.parserÚaÚhrefr   )ú#zjavascript:Úhttpú/zmenuid=10428z/orgÚorgÚdeptÚteamÚtitle)Úlinkrc   )rY   ÚLIST_URLr   ÚtextÚsetÚfind_allrK   r4   Ú
startswithÚBASEÚlowerÚaddÚappendr   Úget_text)	rD   rW   ÚsoupÚseenÚoutr[   r\   rd   Úhr   r   r   Úfetch_org_linksK   s"   
	,(
$rs   ro   c                 C   sp   dD ]}|   |¡}|rt| ¡ ƒ}|rt|ƒdkr|  S qdD ]}|   |¡}|r5t| ¡ ƒ}|r5|  S q dS )N)Úh1Úh2Úh3z
strong.titzdiv.title h2zdiv.title h3rA   )z.breadcrumb li:last-childz.location li:last-childz.path li:last-childr   )Ú
select_oner   rn   r6   )ro   Úselrr   ÚtxÚlir   r   r   Ú_guess_department_from_pageo   s   
€
€r{   rd   c                    sè  t | |ƒ}|j}t|dƒ}t|ƒ}g }| d¡}|D ]S}dd„ | d¡D ƒ}	|	r3tdd„ |	D ƒƒs3q| d¡D ]4}
|
 d¡rAq8|
 d	¡}t|ƒd
k rMq8dd„ |D ƒ}d\}}}}}|	rët|	ƒt|ƒkrët	|	|ƒD ]{\‰ }t
|jdddƒ}t‡ fdd„dD ƒƒr…|}qjt‡ fdd„dD ƒƒr“|}qjt‡ fdd„dD ƒƒr¡|}qjt‡ fdd„dD ƒƒrÛ| d¡}|rÃ| d¡rÃ| d¡ dd¡ ¡ nt
|jdddƒ}t |¡}|rØ| d¡n|}qjdˆ v sãdˆ v rå|}qj|sê|}nbdd„ |D ƒ}t|ƒd krLt|ƒd!kr|d |d" |d# |d
 d |d d … ¡f\}}}}}n"|d |d" |d# }}}t|ƒd
kr:d |d
d … ¡nd}|}t |¡}|rK| d¡}nq8t||ƒ}|sb|sb|sb|sb|sbq8| |||||d$œ¡ q8q|||fS )%NrZ   Útablec                 S   s"   g | ]}t |jd ddƒ ¡ ‘qS ©r   T©r4   )r   rn   rk   )r/   Úthr   r   r   r1   Ž   s    ÿÿz$parse_org_detail.<locals>.<listcomp>r   c                 3   s*    | ]‰ t ‡ fd d„dD ƒƒrˆ V  qdS )c                 3   ó    | ]}|ˆ v V  qd S r   r   ©r/   Úk©rr   r   r   Ú	<genexpr>’   ó   € z-parse_org_detail.<locals>.<genexpr>.<genexpr>)õ   ì„±ëª…õ   ì´ë¦„õ   ì§ìœ„õ   ì§ê¸‰õ   ì—°ë½õ   ì „í™”õ   ë‹´ë‹¹N)Úany)r/   r   rƒ   r   r„   ’   s   €( z#parse_org_detail.<locals>.<genexpr>ÚtrÚtdrB   c                 S   s   g | ]}t |jd ddƒ‘qS r}   )r   rn   )r/   r   r   r   r   r1   œ   s    ÿ)r   r   r   r   r   r   Tr~   c                 3   r€   r   r   r   rƒ   r   r   r„   §   r…   )u   ë¶€ì„œu	   ë¶€ì„œëª…u   ì†Œì†c                 3   r€   r   r   r   rƒ   r   r   r„   ©   r…   )r†   r‡   c                 3   r€   r   r   r   rƒ   r   r   r„   «   r…   )rˆ   r‰   u   ì§ì±…c                 3   r€   r   r   r   rƒ   r   r   r„   ­   r…   )r‹   rŠ   u	   ì—°ë½ì²˜u   ë²ˆí˜¸za[href^="tel:"]r\   ztel:r   r   rŒ   u   ì—…ë¬´c                 S   r-   r   r   r.   r   r   r   r1   ½   r2   é   é   rI   rH   )r   r   r   r)   r(   )rY   rf   r   r{   Úselectrh   r   Úfindr6   Úzipr   rn   rw   rK   r   r4   ÚPHONE_REÚsearchÚgroupr   r@   rm   )rD   rd   rW   Úhtmlro   Úpage_departmentÚrowsÚcandidate_tablesr|   rF   rŽ   ÚtdsÚtextsr   r   r   r)   r(   r   ÚtÚtel_aÚmÚcolsr   rƒ   r   Úparse_org_detail   s   



þ

ÿ
ÿÿý
€€8$

€
ûÿÅ
Er¢   c                 C   s:   z|   d¡jddd ¡ }|jpdW S  ty   Y dS w )NÚ
me_org_curÚkey_hashÚexact)Úcountr   )r|   r’   Úexecuter¦   rN   )ÚsbÚresr   r   r   Ú
_count_curÝ   s   ÿrª   Ú	stg_countÚ	cur_countÚmin_absÚ	min_ratioc                C   s\   | |k rdd| › d|› fS |dkr,| t || ƒk r,dd| › dt || ƒ› d|› dfS dS )	NFztoo_few_rows: z < r   zratio_drop: z (cur=ú))TÚok)Úint)r«   r¬   r­   r®   r   r   r   Ú	_validateå   s
   $r²   c                 C   s.   |   d¡ d¡ d¡ ¡ jpg }dd„ |D ƒS )Nr£   Ú*éPÃ  c                 S   ó   i | ]}|d  |“qS ©r¤   r   ©r/   rW   r   r   r   Ú
<dictcomp>ï   ó    z%_load_current_map.<locals>.<dictcomp>)r|   r’   Úlimitr§   Údata)r¨   rš   r   r   r   Ú_load_current_mapí   s    r¼   Úrun_idc                 C   s6   |   d¡ d¡ d|¡ d¡ ¡ jpg }dd„ |D ƒS )NÚ
me_org_stgr³   r½   r´   c                 S   rµ   r¶   r   r·   r   r   r   r¸   ô   r¹   z!_load_stg_map.<locals>.<dictcomp>)r|   r’   Úeqrº   r§   r»   )r¨   r½   rš   r   r   r   Ú_load_stg_mapò   s   (rÀ   c           	      C   s  t | ƒ}t| |ƒ}g g }}t tj¡ ¡ }| ¡ D ]}||vr%| |¡ q| 	¡ D ]0\}}||vs<|| d |d krZ| ||d |d |d | 
d¡| 
d¡|d |d ddœ
¡ q*|rr|  d	¡ |d
dœ¡ d|¡ dd¡ ¡  |r€|  d	¡ |¡ ¡  d S d S )NÚrow_hashr   r   r   r(   r)   T)
r¤   r   r   r   r(   r)   rÁ   Ú
valid_fromÚvalid_toÚ
is_currentÚme_org_histF)rÃ   rÄ   r¤   rÄ   )r¼   rÀ   r   Únowr   ÚutcÚ	isoformatÚkeysrm   ÚitemsrK   r|   ÚupdateÚin_r¿   r§   Úinsert)	r¨   r½   Úcur_mapÚstg_mapÚto_closeÚto_addÚnow_isor‚   Úvr   r   r   Ú_apply_scd2÷   s@   


€öÿ€ÿÿrÔ   çš™™™™™É?éô  ç333333ã?Ú	sleep_secc                 C   s  t ƒ }z%dD ] }z| |¡ d¡ d¡ ¡  W q ty&   td|› dƒ‚w W nC tyk } z7t d|¡ t	t
 ¡ ƒ}z| d¡ |dd	t	|ƒd
œ¡ ¡  W n	 tyX   Y nw |d	t	|ƒdœW  Y d }~S d }~ww t	t
 ¡ ƒ}| d¡ |dddœ¡ ¡  d}d}z?t ¡ ™}	t|	tƒ}
| d¡ |ddt|
jdœ¡ ¡  t|	ƒ}t|dƒD ]q\}}t|	|d ƒ\}}}| d¡ |d|||dœ¡ ¡  |rg }|D ]2}t|d |d |d ƒ}t|d |d |d | dd¡| dd¡ƒ}| d|i|¥||dœ¥¡ qÏ| d¡ |¡ ¡  |t|ƒ7 }|d7 }t | ¡ qªW d   ƒ n	1 s'w   Y  | d¡ ||dd|› d œ¡ ¡  t|ƒ}t||||d!\}}|s‡| d¡ d"t  !t"j#¡ $¡ |||d#œ¡ %d$|¡ ¡  | d¡ d"|d%œ¡ %d|¡ ¡  t d&|¡ |d"|d'œW S t&||ƒ | d¡ d(d)i¡ %d|¡ ¡  | d¡ d)t  !t"j#¡ $¡ ||d*œ¡ %d$|¡ ¡  t 'd+|||¡ |d)||d,œW S  ty } z0| d¡ d	t  !t"j#¡ $¡ ||t	|ƒd#œ¡ %d$|¡ ¡  t (d-|¡ |d	t	|ƒdœW  Y d }~S d }~ww ).N)Ú
me_org_rawr¾   rÅ   Úme_org_snapshotr£   r³   rI   zSupabase object 'z_' is missing. Run sql/me_org_pipeline.sql in your Supabase project before running this crawler.z [me_org] schema check failed: %sÚcrawler_runÚme_orgÚaborted)ÚidÚtargetÚstatusÚfail_reason)r½   rà   ÚerrorÚrunning)rÞ   rß   rà   r   rÙ   Úlist)r½   ÚkindÚpagerE   r˜   rd   Údetailr   r   r   r(   r   r)   r½   )r¤   rÁ   r¾   rÚ   Ú	collectedzpages=)r½   rš   rà   Únote)r­   r®   Úfailed)rà   Úfinished_atÚpagesrš   rá   rÞ   )rà   ré   z[me_org] validation failed: %s)r½   rà   Úreasonrà   Úpassed)rà   rë   rì   rš   z1[me_org] run passed: run_id=%s, rows=%d, pages=%d)r½   rà   rš   rì   z[me_org] run aborted: %s))r   r|   r’   rº   r§   rN   ÚRuntimeErrorrO   râ   ÚstrÚuuidÚuuid4rÍ   ÚrequestsÚSessionrY   re   Úupsertrf   rs   Ú	enumerater¢   r'   r*   rK   rm   r6   rQ   rR   rª   r²   rË   r   rÆ   r   rÇ   rÈ   r¿   rÔ   ÚinfoÚ	exception)rØ   r­   r®   r¨   rž   Úpreflight_errr½   rì   rè   rD   Úlist_rÚlinksrV   Úitemrš   rE   r˜   ÚpayloadrW   r‚   rr   r¬   r°   rí   rX   r   r   r   Úrun_once  sâ   
ÿÿý
ü
ÿ€ó


û
û&ÿêô
%ÿ
ûÿ"
 
üÿ
ûÿ€õrþ   Ú__main__)rA   rB   rC   )rÕ   rÖ   r×   )3Úosr3   rQ   rñ   r"   ÚloggingÚtypingr   r   r   r   r   ró   Úbs4r   Úapp.services.supabase_servicer   ÚgetenvÚupperr	   ÚbasicConfigÚ	getLoggerrO   rj   re   rL   Úcompiler•   rð   r   r   r'   r*   r@   rô   ÚResponserY   rs   r{   r¢   r±   rª   ÚfloatÚboolr²   r¼   rÀ   rÔ   rþ   Ú__name__Úprintr   r   r   r   Ú<module>   s>   0

"$&\&#
sÿ