
    shiK                        d Z ddlZddlZddlZddlZddlZddlmZmZm	Z	m
Z
 ddlmZmZ ddlZddlmZ ddlmZ  ej        ej        d            ej        d	          Zd
Ze dZe dZe dZddiZddddZ ej        d          Z ej        d          Zde
e         defdZ dedefdZ!dedededefdZ"dedededededefdZ#ddd d!d"d#ej$        d$ed%efd&Z%d' Z&dXd(Z'd)ede(fd*Z)d+edee         fd,Z*d#ej$        d%ed-edee         fd.Z+d/edee         fd0Z,d#ej$        d-ed1ed2ed3ed4ede
e         fd5Z-d#ej$        d6ede	ee         e
e         ef         fd7Z.d#ej$        de/fd8Z0d#ej$        d9e/de	ee         ef         fd:Z1d;d<d#ej$        d=ed>e2dee         fd?Z3d+e
e         d@e
e         defdAZ4d+e
e         d@e
e         defdBZ5dCee         dee         fdDZ6dEdFdGedCee         dHe/fdIZ7de/fdJZ8dKe/dLe/dMe/dNe2de	e(ef         f
dOZ9d=edPedCee         dee         fdQZ:d=edCee         dee         fdRZ;dYd>e2dMe/dNe2defdVZ<e=dWk    r e> e<                       dS dS )ZuU  
MOTIE 조직도 크롤러(_test) — /view + empSearch 병행 수집
- 메인(조직도)에서 jsSearchOrgan(...) 4개 인자 추출 → POST /view 로 직원표 파싱
- empSearch 전체 페이지 크롤링 병행 → 일부(파견 등) /view 미노출 인원 보강
- 스테이징에서 dedupe → finalize_motie_run_test 로 SCD2 반영
    N)ListDictTupleOptional)datetimetimezone)BeautifulSoup)
get_clientz[%(levelname)s] %(message)s)levelformatmotie_org_pipeline_testzhttps://www.motie.go.krz/kor/26/headquartersz/kor/28/institutionz/kor/25/empSearch
User-Agentz*GovBot/2.2 (+https://work.jjickjjicks.com)z?text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8z#ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7)r   AcceptzAccept-Languagez\d{2,4}-\d{3,4}-\d{4}z8jsSearchOrgan\('([^']*)','([^']*)','([^']*)','([^']*)'\)sreturnc                     d                     | pd                    dd                              dd                                                    S )N      u   ​)joinreplacesplitr   s    8/var/www/html/bot/app/crawler/motie_org_pipeline_test.py_cleanr   1   sD    88QW"%%h44<<XrJJPPRRSSS    c                      t          |           S N)r   r   s    r   _normr   4   s    *r   nameposition
departmentc           	          t          j        t          |            dt          |           dt          |                                                                                     S N|hashlibsha256r   encode	hexdigest)r    r!   r"   s      r   	_key_hashr+   6   sT    >U4[[PP5??PPU:=N=NPPWWYYZZddfffr   taskphonec                    t          j        t          |            dt          |           dt          |           dt          |           dt          |           	                                                                          S r$   r&   r    r!   r"   r,   r-   s        r   	_row_hashr0   9   su    >;;[[x[[5+<+<[[uT{{[[USX\\[[bbdd ikkr      g333333?)headersdatamax_trysleepsessmethodurlc          	         d }t          t                    }	|	                    |pi            t          d|dz             D ]}
	 |                                dk    r | j        |f|	|dd|}n | j        |f|	dd|}|                                 |c S # t          $ rS}|}t          
                    d|                                |
||           t          j        ||
z             Y d }~d }~ww xY w|)N   POST   )r2   r3   timeout)r2   r=   z%s failed(%d/%d): %s)dictBASE_HEADERSupdaterangeupperpostgetraise_for_status	Exceptionloggerwarningtimer5   )r6   r7   r8   r2   r3   r4   r5   kwlasthires                r   _requestrP   >   s.   D\AAHHW]3331gk"" " "
	"||~~''DIcJ14JJrJJDHS>!R>>2>>   HHH 	" 	" 	"DNN16<<>>1gsSSSJuqy!!!!!!!!	" Js   AB
C5"A	C00C5c                      t          | d|fi |S )NGETrP   )r6   r8   rJ   s      r   _getrT   O   s    8D%#C#C#C#CCr   c                 $    t          | d|fd|i|S )Nr;   r3   rS   )r6   r8   r3   rJ   s       r   _postrV   P   s!    htVS.Z.Zt.ZWY.Z.Z'Zr   txtc                 h    t          |           }t          |          o|                    d          S )N)u   과u   관u   국u   실u   팀)r   boolendswith)rW   ts     r   _is_org_namer\   U   s+    sA77Fqzz"EFFFr   ac                    g | j         d}}}||dk     r}dD ]f}|                    |d          D ]L}t          |                                          }t	          |          r||vr|                    d|           Mg|j         }|dz  }||dk     }g t                      }}|D ]0}	|	|vr*|                    |	           |                    |	           1|S )Nr      )strongspanpr]   F)	recursiver:   )	parentfind_allr   get_textr\   insertsetaddappend)
r]   pathelhopsselxtxoutseenrb   s
             r   _infer_pathrs   Y   s    1d"D
.TAXX/ 	' 	'C[[[66 ' 'AJJLL))## '$KK2&&&' Y	 .TAXX CEEC ' 'D==HHQKKKAJr   scopec           	      x   t          | |d|i          j        }t          |d          }i }|                    d          D ]}t	          |                                          }t          |          s3|                    d          pd}t          	                    |          sgfdt          dd	          D             \  }	}
}}||	|
||f}||vr||	|
||t          |          |d
||<   t          |                                          }t                              d|t!          |                     |S )NRefererr2   html.parserza[onclick*="jsSearchOrgan"]onclickr   c              3   f   K   | ]+}                     |                                          V  ,d S r   )groupstrip).0rM   ms     r   	<genexpr>z"_parse_org_tree.<locals>.<genexpr>u   s9      0Y0Y1A1A1C1C0Y0Y0Y0Y0Y0Yr   r:      )r    deptCdVdepth1Iddepth2Iddepth3Idrk   rt   z[%s] targets: %d)rT   textr	   selectr   rf   r\   rD   JS_ORGAN_REsearchrA   rs   listvaluesrG   infolen)r6   r8   rt   htmlsouptargetsr]   r    ocr   r   r   r   keyrq   r~   s                  @r   _parse_org_treer   h   sY   cIs#34449D}--D:<G[[677  ajjll##D!! 	UU9#r"" 	0Y0Y0Y0YUSTVW[[0Y0Y0Y-8XWh(;g"$$$#A GCL w~~
 
 C
KK"E3s88444Jr   r   c                 @   t          | d          }|                    d          }|r|                    d          n)|                    d          p|                    d          }|sg S g }|                    d          D ]}|                    d          }t          |          dk     r,t          |d                                                   }t          |d	                                                   }t          |d
                             dd                    }	|d         }
|
                    d          }|rQ|                    d          r<|                    d          	                    dd          
                                }nWt          |
                    dd                    }t                              |          }|r|                    d          n|}t          |||	|g          s|                    |||	|d           |S )Nrx   zh4.tit-type02tablez.data-tbl tableztbody trtd   r   r:      r   Tr|   r1   a[href^="tel:"]hreftel:r   )r    r!   r,   r-   )r	   
select_one	find_nextr   re   r   r   rf   rD   r   r|   PHONE_REr   r{   anyrj   )r   r   anchorr   rowstrtdsr    r!   r,   
phone_celltel_ar-   	last_textr~   s                  r   _parse_detail_table_from_htmlr      s   }--D___--F)/sFW%%%T__EV5W5W5s[_[j[jkr[s[sE 	Dll:&& X Xkk$s88a<<c!foo''((#a&//++,,c!fooco6677V
%%&788 	3UYYv&& 	3IIf%%--fb99??AAEEz223d2CCDDI	**A"#2AGGAJJJED(D%011 	TxPUVVWWWWKr   r   r   r   r   c                   |dk    rt            d}t          }nt            d}t          }||||d}|dd}	t                              d||           	 t          | |||	          j        }
t          |
          r|
S n# t          $ r Y nw xY wd S )	Nheadquartersz/kor/26/headquarters/viewz/kor/28/institution/viewr   r   r   r   z0application/x-www-form-urlencoded; charset=UTF-8)rv   zContent-Typez[try:POST view] %s %s)r3   r2   )	BASEHEADQUARTERS_URLINSTITUTION_URLrG   r   rV   r   r   rF   )r6   rt   r   r   r   r   view_urlrefererformr2   r   s              r   
_post_viewr      s    555#444" 	 D J G KK'4888T8$@@@E(.. 	K	   4s   (A= =
B
	B
deptc           
         |d          d|d          d|d          d|d          d|d	          	}t          | |d         |d         |d         |d         |d	         
          }|sg d|fS t          |          }|sg d|fS |||fS )zAdept: {"name","scope","deptCdV","depth1Id","depth2Id","depth3Id"}rt   z/view POST deptCdV=r   z, depth1Id=r   z, depth2Id=r   z, depth3Id=r   r   N)r   r   )r6   r   triedr   r   s        r   _fetch_dept_detailr      s    G}  Z  Zi  Z  ZTR\M]  Z  Zjnoyjz  Z  Z  HL  MW  HX  Z  ZEWYj!j!j!  D  4(..D 4ur   c                 4   t          | t          t                    }t          |j        d          }|                    d          }|sdS t          j        d|                    dd                    }|r"t          |
                    d                    ndS )Nrw   rx   za.direction.lastr:   zempSearch\.list\((\d+)\)ry   r   )rT   EMP_BASEEMP_HEADERSr	   r   r   rer   rD   intr{   )r6   rN   r   rK   r~   s        r   _emp_fetch_last_pager      s    T8[111A//D??-..D q
	-txx	2/F/FGGA&3qwwqzz???Q&r   pagec           	      2   t            d| }t          | |t                    }|j        }t	          |d          }g }|                    d          D ]A}|                    d          }t          |          dk     r,t          |d         	                    d	                    }	t          |d
         	                    d	                    }
t          |d         	                    d	                    }|d         }|
                    d          }|rQ|                    d          r<|                    d                              dd                                          }nWt          |	                    dd	                    }t                              |          }|r|                    d          n|}t          |          dk    r9d                    d |dd         D                                                       nd}|r*|r(|                    |d                                          }|dk    r"d|v rt$                              d|	|
           t)          |	|
|||g          s&|                    |	|
|||d           C||fS )Nz?pageIndex=rw   rx   ztable tbody trr   r   r   Tr   r:   r   r   r   r   r   r   c              3   ^   K   | ](}t          |                    d d                    V  )dS )r   Tr   N)r   rf   )r}   r   s     r   r   z"_emp_parse_page.<locals>.<genexpr>   s:      NNbVBKK4K8899NNNNNNr   r1   u   파견u   기획재정부u   skip 파견·기재부: %s/%sr/   )r   rT   r   r   r	   r   re   r   r   rf   r   rD   r   r|   r   r   r{   r   rG   debugr   rj   )r6   r   r8   rN   r   r   r   r   r   r    r!   r"   phone_tdr   r-   r   r~   r,   s                     r   _emp_parse_pager      s   
(
($
(
(CT3,,,A6D}--DDkk*++ $
 $
kk$s88a<<c!fooDo1122#a&///5566CFOO$O7788
r7##$566 	3UYYv&& 	3IIf%%--fb99??AAEEx00D0AABBI	**A"#2AGGAJJJE 3xx!|| HHNNC"INNNNNTTVVV 	
  	3T 	3<<r**0022D !!&74&?&?LL8$IIID(Je<== 	xzSWbghh	
 	
 	
 	
 :r   g333333?	sleep_secrun_idr   c                   t          |           }g }t          d|dz             D ]}t          | |          \  }}|                    d                              |d| |d                                           |                    |           t          j        |           t          
                    d|t          |                     |S )Nr:   motie_org_raw_testzemp:r   r   r   z[empSearch] pages=%d, rows=%d)r   rA   r   r   upsertexecuteextendrI   r5   rG   r   r   )	r6   sbr   r   rK   	collectedrb   r   r   s	            r   _emp_collect_allr     s    %%DI1dQh 	 	$T1--
d 	%&&--zazz4@@	
 	

')))
9
KK/s9~~FFFr   bc                 8   d | pd                     d          D             d |pd                     d          D             z   }t                      g }}|D ]0}||vr*|                    |           |                    |           1d                    |          S )Nc                 ^    g | ]*}|                                 |                                 +S  r   r}   rb   s     r   
<listcomp>z _merge_tasks.<locals>.<listcomp>#  s-    BBB1		BQWWYYBBBr   r   /c                 ^    g | ]*}|                                 |                                 +S r   r   r   s     r   r   z _merge_tasks.<locals>.<listcomp>#  s3    E{E{E{TUqrqxqxqzqzE{aggiiE{E{E{r   z / )r   rh   ri   rj   r   )r]   r   partsrr   rq   rb   s         r   _merge_tasksr   "  s    BBb 4 4BBBE{E{Z[Za_aYhYhilYmYmE{E{E{{Er#D ' 'D==HHQKKKA::c??r   c                 n   | pd                                 } |pd                                 }t          t                              |                     }t          t                              |                    }|r|s| S |r|s|S | s|r|S |s| r| S t	          |           t	          |          k    r| n|S )Nr   )r|   rY   r   	fullmatchr   )r]   r   a_okb_oks       r   _pick_better_phoner   *  s    	
bAR001""1%%&&DtH4F4Fq4I4I/J/J"D"("D"(1H1HA#a&&  11a'r   r   c                 R   i }| D ]}|d         }||vrt          |          ||<   !||         }t          |                    d          |                    d                    |d<   t          |                    d          |                    d                    |d<   |                                D ]R}t          |d         |d         |d         |                    dd          |                    dd                    |d<   St          |                                          S )	Nkey_hashr,   r-   r    r!   r"   r   row_hash)r>   r   rD   r   r   r0   r   )r   by_keyrN   kbasevs         r   _dedupe_stage_rowsr   3  s    F N NjMF??QF1Iay$TXXf%5%5quuV}}EEV*488G+<+<aeeGnnMMW]]__ t t!!F)Qz]AlOQUUSY[]M^M^`a`e`efmoq`r`rss*   r     chunkr   r   c                    t          dt          |          |          D ]L}||||z            }|s|                     |                              |                                           Md S )Nr   )rA   r   r   r   r   )r   r   r   r   rM   parts         r   _chunked_upsertr   @  ss    1c$ii'' / /AagIX
t$$,,..../ /r   c                     	 |                      d                              dd                                          }|j        pdS # t          $ r Y dS w xY w)Nmotie_org_cur_testr   exact)countr   )r   r   r   r   rF   )r   ress     r   
_count_curr   I  sf    hh+,,33Jg3NNVVXXy~A   qqs   AA 
AA	stg_count	cur_countmin_abs	min_ratioc                    | |k     r
dd|  d| fS |dk    r4| t          ||z            k     rdd|  dt          ||z             d| dfS dS )	NFztoo_few_rows: z < r   zratio_drop: z (cur=))Tok)r   )r   r   r   r   s       r   	_validater   P  s    75*R9*R*R*R*R#RR1}}SY)>%?%???  P[^g  P[  P[lopy  }F  qF  mG  mG  P[  P[  OX  P[  P[  P[  I[  B[:r   	dept_namec                 
   g }|D ]p}t          |d         |d         |          }|                    | |d         |d         ||                    dd          |                    dd          |d           qt          |          S )Nr    r!   r,   r   r-   r   r    r!   r"   r,   r-   r   )r+   rj   rD   r   )r   r   r   stagedrN   r   s         r   _prepare_stage_batchr   U  s    F 
 
ai:	::fI*#EE&"%%UU7B''
 
 	 	 	 	 f%%%r   c                 6   g }|D ]}|                     dd          }t          |d         |d         |          }|                    | |d         |d         ||                     dd          |                     dd          |d           t          |          S )u;   empSearch rows → stage rows (department 그대로 사용)r"   r   r    r!   r,   r-   r   )rD   r+   rj   r   )r   r   r   rN   r"   r   s         r   _prepare_stage_batch_empr   d  s    F  UU<,,
ai:
;;fI*$EE&"%%UU7B''
 
 	 	 	 	 f%%%r   皙?   皙?c           
         t                      }t          t          j                              |                    d                              ddd                                           d}d}	 t          j                    5 }t          |t          d          }t          |t          d          }||z   }		 fd|	D             }
|
rt          |d	|
d
           n2# t          $ r%}t                              d|           Y d }~nd }~ww xY w|	D ]}|d         }t!          ||          \  }}}|r[	 |                    d                              |d          d| |d                                           n# t          $ r Y nw xY w|sn|rl	 |                    d                              d|d          d|d         p| |d d         d                                           n# t          $ r Y nw xY w|r8t%          ||          }|r%t          |d|d
           |t'          |          z  }|Q|                    d                              |d          d|d         p| |d                                           |dz  }t)          j        |            t-          ||d          }|r7t/          |          }|r%t          |d|d
           |t'          |          z  }|                    d                              |dd| d                                           t1          |          }t3          ||||          \  }}|st5          j        t8          j                                                  }|                    d                              d ||||d!                               d"                                           |                    d                              d |d#                               d$                                           t          !                    d%|           d |d&cd d d            S |"                    d'd(i                                          }t          #                    d)|j$                   t5          j        t8          j                                                  }|                    d                              d*d+i                               d$                                           |                    d                              d+|||d,                               d"                                           t          #                    d-||           d+||d.cd d d            S # 1 swxY w Y   d S # t          $ r}t5          j        t8          j                                                  }|                    d                              d/|||t          |          d!                               d"                                           t          %                    d0|           d/t          |          d1cY d }~S d }~ww xY w)2Ncrawler_run_testmotie_org_testrunning)idtargetstatusr   r   institutionc                     g | ]C}|d          |d         d                     |                    dg                     |d         dDS )r    r   z > rk   rt   )r   r    coderk   rt   )r   rD   )r}   r[   r   s     r   r   zrun_once.<locals>.<listcomp>  sg     $ $ $  %fIiL!JJquuVR'8'899wZ  $ $ $r   motie_org_head_targets_testr   r   z.motie_org_head_targets_test upsert skipped: %sr    motie_org_trylog_testrt   :)r   r   
tried_urlsr   zMISS:r   i@  r   motie_org_stg_testr:   r  r   motie_org_snapshot_testr   zpages=)r   r   r
  note)r   r   failed)r
  finished_atpagesr   fail_reasonr  )r
  r  r   z&[motie_org_test] validation failed: %s)r   r
  reasonfinalize_motie_run_testp_run_idz,[motie_org_test] finalize_motie_run_test: %sr
  passed)r
  r  r  r   z9[motie_org_test] run passed: run_id=%s, rows=%d, pages=%d)r   r
  r   r  abortedz [motie_org_test] run aborted: %s)r   r
  error)&r
   struuiduuid4r   rg   r   requestsSessionr   r   r   r   rF   rG   rH   r   r   r   r   rI   r5   r   r   r   r   r   nowr   utc	isoformatr@   eqr  rpcr   r3   	exception)r   r   r   r   pages_savedtotal_collectedr6   
hq_targetsinst_targetsr   	snap_rowsrO   r[   r   r   r   r   stageemp_rows	stage_empr   r   r  r$  r   r   s                            @r   run_oncer2  x  s   	BFHH  ''vAQ]f(g(ghhpprrrKOfH ]	i4*41A>RRJ*4-PPL </GT$ $ $ $ #$ $ $	  ^#B(EyX\]]]] T T TOQRSSSSSSSST  $& $&fI	$6tQ$?$?!dE  !899@@'-76Q6Qi6Q6Qafgg !'))))$      !566==&,$TAgJ$T$T99R$T$T$)%4%L? ?   #79999$     60DIIE 6',@%tTTTT'3u::5#HH12299#)aj3^3^1Y<C\S\3^3^hlmm giiiq 
9%%%% (b&CHHHH 64VXFF	 6#B(<itTTTT#s9~~5O HH.//66!?k[qdo[q[qrr giii"2I"?IwZcdddJB Pl8<00::<<+,,33'k[j  |B  C  C "T6""77999233::hX^;_;_``ccdlntuu}}EvNNN"(HOOa]	i ]	i ]	i ]	i ]	i ]	i ]	i ]	if &&2Z4HIIQQSSCKKFQQQ,x|,,6688CHH.//66(7KLLOOPXZ`aaiikkkHH'((//#C+Wfgg bvwwyyyKKS> > >$/\ghh{]	i ]	i ]	i ]	i ]	i ]	i ]	i ]	i ]	i ]	i ]	i ]	i ]	i ]	i ]	i ]	i ]	i ]	i~  H H Hl8<((2244
#$$++ {Tctwxytztz{{	
 	

"T6

77999;Q??? IAGGGGGGGGHs   3V7 2V*9#CV*
D'DV*D&V*3A	E=<V*=
F
V*	F

V*AG-,V*-
G:7V*9G::IV*V7 E V*V7 *V..V7 1V.2V7 7
Z
B>Z?Z
Z
__main__r   )r  r  r  )?__doc__r   rI   r   r'   loggingtypingr   r   r   r   r   r   r"  bs4r	   app.services.supabase_servicer
   basicConfigINFO	getLoggerrG   r   r   r   r   r   r?   compiler   r   r  r   r   r+   r0   r#  rP   rT   rV   rY   r\   rs   r   r   r   r   r   r   r   floatr   r   r   r   r   r   r   r   r   r2  __name__printr   r   r   <module>r@     sh    
			     . . . . . . . . . . . . ' ' ' ' ' ' ' '        4 4 4 4 4 4  ',/L M M M M		4	5	5
 !000 ///%%%IJ ?O<  2:.//bjTUU
Thsm T T T T T +S *S * * * *gC g3 gC gC g g g gC 3 C s 3 SV    
 HLRV`ail   8# S s    " D C C Z Z Z Z
Gc Gd G G G G= T#Y    (*  S T$Z    @ T
    <X% c s c ]` lo t|  ~A  uB    8X- T eDJPXY\P]_bDb>c    *'x/ 'C ' ' ' '-(* -# -%T
C:P - - - -^ UY   8+  E ]abf]g    &HSM hsm     ((3- (HSM (c ( ( ( (!T$Z !DJ ! ! ! ! GK / / /s /$t* / / / / /c      # % TYZ^`cZcTd    
& & &DJ &4PT: & & & &&S &T
 &tDz & & & &(mH mH mHc mHE mHTX mH mH mH mH^ z	E((** r   