o
    ü ±hh#  ã                   @   s<  d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZmZm	Z	 d dl
m
Z
mZ d dlZd dlmZ d dlmZ e  dd¡ ¡ Zejedd	 e d
¡ZdZed ZdZddiZe d¡ZdedB defdd„Zdedefdd„Zdedededefdd„Z dedededededefdd„Z!d ed!edefd"d#„Z"dEd'ej#d(edej$fd)d*„Z%d'ej#dee fd+d,„Z&d'ej#d-ede	ee eef fd.d/„Z'de(fd0d1„Z)d2e(d3e(d4e(d5e*de	e+ef f
d6d7„Z,deeef fd8d9„Z-d:edeeef fd;d<„Z.d:efd=d>„Z/dFdAe*d4e(d5e*defdBdC„Z0e1dDkre2e0ƒ ƒ dS dS )Gé    N)ÚListÚDictÚTuple)ÚdatetimeÚtimezone)ÚBeautifulSoup)Ú
get_clientÚ	LOG_LEVELÚINFOz[%(levelname)s] %(message)s)ÚlevelÚformatÚmoef_org_pipelinezhttps://www.moef.go.krz;/mi/orgnzt/org.do?bbsId=MOSFBBS_000000000097&menuNo=9040100za[href*="orgId="]z
User-Agentz*GovBot/1.0 (+https://work.jjickjjicks.com)z\d{2,4}-\d{3,4}-\d{4}ÚsÚreturnc                 C   s   d  | pd dd¡ ¡ ¡S )Nú Ú õ   Â )ÚjoinÚreplaceÚsplit©r   © r   ú2/var/www/html/bot/app/crawler/moef_org_pipeline.pyÚ_clean   s    r   c                 C   s   t | ƒS )N)r   r   r   r   r   Ú_norm   s    r   ÚnameÚpositionÚ
departmentc                 C   s2   t | ƒ› dt |ƒ› dt |ƒ› }t | ¡ ¡ ¡ S ©Nú|©r   ÚhashlibÚsha256ÚencodeÚ	hexdigest)r   r   r   Úbaser   r   r   Ú	_key_hash   s    r&   ÚtaskÚphonec              
   C   sF   t | ƒ› dt |ƒ› dt |ƒ› dt |ƒ› dt |ƒ› 	}t | ¡ ¡ ¡ S r   r    )r   r   r   r'   r(   r%   r   r   r   Ú	_row_hash   s   4r)   Úname_rawÚposition_rawc           	      C   s  t | ƒ}t |ƒ}|r|s|S |}t d|¡d  ¡ }dd„ ||hD ƒ}|D ]S}|d| fD ]}| |¡rBt |d t|ƒ … ƒ d¡}q-t dt |¡› dd	|¡ d¡}t d
t |¡› dd	|¡ d¡}| |¡rxt |d t|ƒ … ƒ d¡}q%t |ƒ}|r|S |S )Nu   [ \t(\[/Â·\-]r   c                 S   s   g | ]}|r|‘qS r   r   )Ú.0Úcr   r   r   Ú
<listcomp>#   s    z,strip_position_from_name.<locals>.<listcomp>r   u    -Â·/â€“â€”z\s*\(\s*z	\s*\)\s*$r   z\s*z\s*\([^)]+\)\s*$)	r   Úrer   ÚstripÚendswithÚlenÚrstripÚsubÚescape)	r*   r+   ÚnÚpÚoriginalr%   Ú
candidatesÚcandÚsufr   r   r   Ústrip_position_from_name   s   &€""&€r<   é   é   çffffffæ?ÚsessÚurlc           
      C   sŒ   d }t |ƒD ]=}z| j|t|d}| ¡  |W   S  tyC } z|}|d|  }	t d|d ||	|¡ t |	¡ W Y d }~qd }~ww |‚)N)ÚheadersÚtimeouté   zGET retry %d/%d in %.1fs: %sé   )	ÚrangeÚgetÚHEADERSÚraise_for_statusÚ	ExceptionÚloggerÚwarningÚtimeÚsleep)
r@   rA   rC   ÚtriesÚbackoffÚlastÚiÚrÚerN   r   r   r   Ú
_retry_get-   s   ",€þrU   c                 C   s°   t | tƒ}t|jdƒ}tƒ g }}| t¡D ]?}| dd¡pd ¡ }|r(d|vr)q| 	d¡r0|n| 	d¡r9t
| nt
d | }||v rDq| |¡ | || dd¡dœ¡ q|S )	Núhtml.parserÚhrefr   zorgId=Úhttpú/Útitle)ÚlinkrZ   )rU   ÚLIST_URLr   ÚtextÚsetÚselectÚ	SEL_LINKSrG   r0   Ú
startswithÚBASEÚaddÚappend)r@   rS   ÚsoupÚseenÚoutÚarW   r[   r   r   r   Úfetch_org_links7   s   
,

ri   r[   c              	   C   s*  t | |ƒ}|j}t|dƒ}g }| d¡D ]{}| d¡rq| d¡}t|ƒdk r(qt|d jddƒ}t|d	 jddƒ}	t|d
 jddƒ}
t|d jddƒ}t|d jddƒ}|d  	d¡}|rr| 
d¡rr|d  dd¡ ¡ }t|	|
ƒ}|
}|s„|s„|s„|s„|s„q| |||||dœ¡ q|||fS )NrV   ztable trÚthÚtdé   r   T)r0   rE   rD   r>   é   za[href^="tel:"]rW   ztel:r   )r   r   r   r(   r'   )rU   r]   r   r_   ÚfindÚfind_allr2   r   Úget_textÚ
select_oneÚhas_attrr   r0   r<   rd   )r@   r[   rS   Úhtmlre   ÚrowsÚtrÚtdsr   r*   r+   r(   ÚresponsibilityÚtel_ar   r   r   r   r   Úparse_org_detailD   s&   

"

ry   c                 C   s:   z|   d¡jddd ¡ }|jpdW S  ty   Y dS w )NÚmoef_org_curÚkey_hashÚexact)Úcountr   )Útabler_   Úexecuter}   rJ   )ÚsbÚresr   r   r   Ú
_count_curY   s   ÿr‚   Ú	stg_countÚ	cur_countÚmin_absÚ	min_ratioc                C   s\   | |k rdd| › d|› fS |dkr,| t || ƒk r,dd| › dt || ƒ› d|› dfS dS )	NFztoo_few_rows: z < r   zratio_drop: z (cur=ú))TÚok)Úint)rƒ   r„   r…   r†   r   r   r   Ú	_validate`   s   $rŠ   c                 C   s.   |   d¡ d¡ d¡ ¡ jpg }dd„ |D ƒS )Nrz   Ú*éPÃ  c                 S   ó   i | ]}|d  |“qS ©r{   r   ©r,   rS   r   r   r   Ú
<dictcomp>h   ó    z%_load_current_map.<locals>.<dictcomp>)r~   r_   Úlimitr   Údata)r€   rt   r   r   r   Ú_load_current_mapf   s    r”   Úrun_idc                 C   s6   |   d¡ d¡ d|¡ d¡ ¡ jpg }dd„ |D ƒS )NÚmoef_org_stgr‹   r•   rŒ   c                 S   r   rŽ   r   r   r   r   r   r   l   r‘   z!_load_stg_map.<locals>.<dictcomp>)r~   r_   Úeqr’   r   r“   )r€   r•   rt   r   r   r   Ú_load_stg_mapj   s   (r˜   c           	      C   s  t | ƒ}t| |ƒ}g g }}t tj¡ ¡ }| ¡ D ]}||vr%| |¡ q| 	¡ D ]0\}}||vs<|| d |d krZ| ||d |d |d | 
d¡| 
d¡|d |d ddœ
¡ q*|rr|  d	¡ |d
dœ¡ d|¡ dd¡ ¡  |r€|  d	¡ |¡ ¡  d S d S )NÚrow_hashr   r   r   r'   r(   T)
r{   r   r   r   r'   r(   r™   Ú
valid_fromÚvalid_toÚ
is_currentÚmoef_org_histF)r›   rœ   r{   rœ   )r”   r˜   r   Únowr   ÚutcÚ	isoformatÚkeysrd   ÚitemsrG   r~   ÚupdateÚin_r—   r   Úinsert)	r€   r•   Úcur_mapÚstg_mapÚto_closeÚto_addÚnow_isoÚkÚvr   r   r   Ú_apply_scd2n   s(   

€ý€*ÿr­   çš™™™™™É?é   Ú	sleep_secc                 C   s.  t ƒ }tt ¡ ƒ}| d¡ |dddœ¡ ¡  d}d}z=t ¡ ˜}t	|t
ƒ}| d¡ |ddt
|jdœ¡ ¡  t|ƒ}	t|	d	ƒD ]p\}
}t||d
 ƒ\}}}| d¡ |d|
||dœ¡ ¡  |r¢g }|D ]2}t|d |d |d ƒ}t|d |d |d | dd¡| dd¡ƒ}| d|i|¥||dœ¥¡ qe| d¡ |¡ ¡  |t|ƒ7 }|d	7 }t | ¡ qAW d   ƒ n1 s¼w   Y  | d¡ ||dd|› dœ¡ ¡  t|ƒ}t||||d\}}|s| d¡ dt tj¡ ¡ |||dœ¡ d|¡ ¡  | d¡ d|dœ¡ d|¡ ¡  t  !d|¡ |d|dœW S t"||ƒ | d¡ d d!i¡ d|¡ ¡  | d¡ d!t tj¡ ¡ ||d"œ¡ d|¡ ¡  t  #d#|||¡ |d!||d$œW S  t$y– } z0| d¡ d%t tj¡ ¡ ||t|ƒdœ¡ d|¡ ¡  t  %d&|¡ |d%t|ƒd'œW  Y d }~S d }~ww )(NÚcrawler_runÚmoef_orgÚrunning)ÚidÚtargetÚstatusr   Úmoef_org_rawÚlist)r•   ÚkindÚpagerA   rs   rE   r[   Údetailr   r   r   r'   r   r(   r•   )r{   r™   r–   Úmoef_org_snapshotÚ	collectedzpages=)r•   rt   r¶   Únote)r…   r†   Úfailed)r¶   Úfinished_atÚpagesrt   Úfail_reasonr´   )r¶   r¾   z [moef_org] validation failed: %s)r•   r¶   Úreasonr¶   Úpassed)r¶   rÀ   rÁ   rt   z3[moef_org] run passed: run_id=%s, rows=%d, pages=%d)r•   r¶   rt   rÁ   Úabortedz[moef_org] run aborted: %s)r•   r¶   Úerror)&r   ÚstrÚuuidÚuuid4r~   r¥   r   ÚrequestsÚSessionrU   r\   Úupsertr]   ri   Ú	enumeratery   r&   r)   rG   rd   r2   rM   rN   r‚   rŠ   r£   r   rž   r   rŸ   r    r—   rK   rÆ   r­   ÚinforJ   Ú	exception)r°   r…   r†   r€   r•   rÁ   r½   r@   Úlist_rÚlinksrR   Úitemrt   rA   rs   ÚpayloadrS   r«   Úhr„   rˆ   rÃ   rT   r   r   r   Úrun_once   s^   


ÿ

ÿ, ôø$4"
 28€ýrÕ   Ú__main__)r=   r>   r?   )r®   r¯   r?   )3Úosr/   rM   rÈ   r!   ÚloggingÚtypingr   r   r   r   r   rÊ   Úbs4r   Úapp.services.supabase_servicer   ÚgetenvÚupperr	   ÚbasicConfigÚ	getLoggerrK   rb   r\   r`   rH   ÚcompileÚPHONE_RErÇ   r   r   r&   r)   r<   rË   ÚResponserU   ri   ry   r‰   r‚   ÚfloatÚboolrŠ   r”   r˜   r­   rÕ   Ú__name__Úprintr   r   r   r   Ú<module>   s>   0

"
&&
1ÿ