
    ;hF                        d Z ddlZddlZddlZddlZddlZddlmZmZm	Z	m
Z
 ddlmZmZ ddlZddlmZ ddlmZ  ej        ej        d            ej        d	          Zd
Ze dZe dZe dZddddZded         iZ ej        d          Z ej        d          Zde
e         defdZ dedefdZ!dedededefdZ"dedededededefdZ#ddd d!d"d#ej$        d$ed%efd&Z%d' Z&dUd(Z'd)ede(fd*Z)dee         fd+Z*d#ej$        d%ed,edee         fd-Z+d.edee         fd/Z,d#ej$        d,ed0ed1ed2ed3ede
e         fd4Z-d#ej$        d5ede	ee         e
e         ef         fd6Z.d#ej$        de/fd7Z0d#ej$        d8e/de	ee         ef         fd9Z1de/fd:Z2d;e/d<e/d=e/d>e3de	e(ef         f
d?Z4d@e
e         dAe
e         defdBZ5d@e
e         dAe
e         defdCZ6dDee         dee         fdEZ7dFdGdHedDee         dIe/fdJZ8dKedLedDee         dee         fdMZ9dKedDee         dee         fdNZ:dVdRe3d=e/d>e3defdSZ;e<dTk    r e= e;                       dS dS )WuJ  
MOTIE 조직도 크롤러(운영) — /view + empSearch 병행 수집
- 메인(조직도)에서 jsSearchOrgan(...) 4개 인자 추출 → POST /view 로 직원표 파싱
- empSearch 전체 페이지 크롤링 병행 → /view 미노출 인원 보강(파견 등)
- 스테이징에서 dedupe → finalize_motie_run 로 SCD2 반영
    N)ListDictTupleOptional)datetimetimezone)BeautifulSoup)
get_clientz[%(levelname)s] %(message)s)levelformatmotie_org_pipelinezhttps://www.motie.go.krz/kor/26/headquartersz/kor/28/institutionz/kor/25/empSearchz*GovBot/3.0 (+https://work.jjickjjicks.com)z?text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8z#ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7)
User-AgentAcceptzAccept-Languager   z\d{2,4}-\d{3,4}-\d{4}z8jsSearchOrgan\('([^']*)','([^']*)','([^']*)','([^']*)'\)sreturnc                     d                     | pd                    dd                              dd                                                    S )N      u   ​)joinreplacesplitr   s    3/var/www/html/bot/app/crawler/motie_org_pipeline.py_cleanr   1   sD    88QW"%%h44<<XrJJPPRRSSS    c                      t          |           S N)r   r   s    r   _normr   4   s    !99r   nameposition
departmentc                     t          |            dt          |           dt          |           }t          j        |                                                                          S N|r   hashlibsha256encode	hexdigest)r    r!   r"   bases       r   	_key_hashr,   7   sU    DkkAAE(OOAAeJ.?.?AAD>$++--((22444r   taskphonec                    t          |            dt          |           dt          |           dt          |           dt          |           	}t          j        |                                                                          S r$   r&   )r    r!   r"   r-   r.   r+   s         r   	_row_hashr0   ;   su    Dkk^^E(OO^^eJ.?.?^^%++^^PUV[P\P\^^D>$++--((22444r      g333333?)headersdatamax_trysleepsessmethodurlc          	         d }t          t                    }	|	                    |pi            t          d|dz             D ]}
	 |                                dk    r | j        |f|	|dd|}n | j        |f|	dd|}|                                 |c S # t          $ rS}|}t          
                    d|                                |
||           t          j        ||
z             Y d }~d }~ww xY w|)N   POST   )r2   r3   timeout)r2   r=   z%s failed(%d/%d): %s)dictBASE_HEADERSupdaterangeupperpostgetraise_for_status	Exceptionloggerwarningtimer5   )r6   r7   r8   r2   r3   r4   r5   kwlasthires                r   _requestrP   B   s.   D\AAHHW]3331gk"" " "
	"||~~''DIcJ14JJrJJDHS>!R>>2>>   HHH 	" 	" 	"DNN16<<>>1gsSSSJuqy!!!!!!!!	" Js   AB
C5"A	C00C5c                      t          | d|fi |S )NGETrP   )r6   r8   rJ   s      r   _getrT   S   s    8D%#C#C#C#CCr   c                 $    t          | d|fd|i|S )Nr;   r3   rS   )r6   r8   r3   rJ   s       r   _postrV   T   s!    htVS.Z.Zt.ZWY.Z.Z'Zr   txtc                 h    t          |           }t          |          o|                    d          S )N)u   과u   관u   국u   실u   팀)r   boolendswith)rW   ts     r   _is_org_namer\   Y   s+    sA77Fqzz"EFFFr   c                    g | j         d}}}||dk     r}dD ]f}|                    |d          D ]L}t          |                                          }t	          |          r||vr|                    d|           Mg|j         }|dz  }||dk     }g t                      }}|D ]0}	|	|vr*|                    |	           |                    |	           1|S )Nr      )strongspanpaF)	recursiver:   )	parentfind_allr   get_textr\   insertsetaddappend)
rb   pathelhopsselxtxoutseenra   s
             r   _infer_pathrs   ]   s    1d"D
.TAXX/ 	' 	'C[[[66 ' 'AJJLL))## '$KK2&&&' Y	 .TAXX CEEC ' 'D==HHQKKKAJr   scopec           	      x   t          | |d|i          j        }t          |d          }i }|                    d          D ]}t	          |                                          }t          |          s3|                    d          pd}t          	                    |          sgfdt          dd	          D             \  }	}
}}||	|
||f}||vr||	|
||t          |          |d
||<   t          |                                          }t                              d|t!          |                     |S )NRefererr2   html.parserza[onclick*="jsSearchOrgan"]onclickr   c              3   f   K   | ]+}                     |                                          V  ,d S r   )groupstrip).0rM   ms     r   	<genexpr>z"_parse_org_tree.<locals>.<genexpr>y   s9      0Y0Y1A1A1C1C0Y0Y0Y0Y0Y0Yr   r:      )r    deptCdVdepth1Iddepth2Iddepth3Idrk   rt   z[%s] targets: %d)rT   textr	   selectr   rf   r\   rD   JS_ORGAN_REsearchrA   rs   listvaluesrG   infolen)r6   r8   rt   htmlsouptargetsrb   r    ocr   r   r   r   keyrq   r~   s                  @r   _parse_org_treer   l   sY   cIs#34449D}--D:<G[[677  ajjll##D!! 	UU9#r"" 	0Y0Y0Y0YUSTVW[[0Y0Y0Y-8XWh(;g"$$$#A GCL w~~
 
 C
KK"E3s88444Jr   r   c                 @   t          | d          }|                    d          }|r|                    d          n)|                    d          p|                    d          }|sg S g }|                    d          D ]}|                    d          }t          |          dk     r,t          |d                                                   }t          |d	                                                   }t          |d
                             dd                    }	|d         }
|
                    d          }|rQ|                    d          r<|                    d          	                    dd          
                                }nWt          |
                    dd                    }t                              |          }|r|                    d          n|}t          |||	|g          s|                    |||	|d           |S )Nrx   zh4.tit-type02tablez.data-tbl tableztbody trtd   r   r:      r   Tr|   r1   a[href^="tel:"]hreftel:r   )r    r!   r-   r.   )r	   
select_one	find_nextr   re   r   r   rf   rD   r   r|   PHONE_REr   r{   anyrj   )r   r   anchorr   rowstrtdsr    r!   r-   
phone_celltel_ar.   	last_textr~   s                  r   _parse_detail_table_from_htmlr      s   }--D___--F)/sFW%%%T__EV5W5W5s[_[j[jkr[s[sE 	Dll:&& X Xkk$s88a<<c!foo''((#a&//++,,c!fooco6677V
%%&788 	3UYYv&& 	3IIf%%--fb99??AAEEz223d2CCDDI	**A"#2AGGAJJJED(D%011 	TxPUVVWWWWKr   r   r   r   r   c                   |dk    rt            d}t          }nt            d}t          }||||d}|dd}	t                              d||           	 t          | |||	          j        }
t          |
          r|
S n# t          $ r Y nw xY wd S )	Nheadquartersz/kor/26/headquarters/viewz/kor/28/institution/viewr   r   r   r   z0application/x-www-form-urlencoded; charset=UTF-8)rv   zContent-Typez[try:POST view] %s %s)r3   r2   )	BASEHEADQUARTERS_URLINSTITUTION_URLrG   r   rV   r   r   rF   )r6   rt   r   r   r   r   view_urlrefererformr2   r   s              r   
_post_viewr      s    555#444" 	 D J G KK'4888T8$@@@E(.. 	K	   4s   (A= =
B
	B
deptc           
         |d          d|d          d|d          d|d          d|d	          	}t          | |d         |d         |d         |d         |d	         
          }|sg d|fS t          |          }|sg d|fS |||fS )zAdept: {"name","scope","deptCdV","depth1Id","depth2Id","depth3Id"}rt   z/view POST deptCdV=r   z, depth1Id=r   z, depth2Id=r   z, depth3Id=r   r   N)r   r   )r6   r   triedr   r   s        r   _fetch_dept_detailr      s    G}  Z  Zi  Z  ZTR\M]  Z  Zjnoyjz  Z  Z  HL  MW  HX  Z  ZEWYj!j!j!  D  4(..D 4ur   c                 4   t          | t          t                    }t          |j        d          }|                    d          }|sdS t          j        d|                    dd                    }|r"t          |
                    d                    ndS )Nrw   rx   za.direction.lastr:   zempSearch\.list\((\d+)\)ry   r   )rT   EMP_BASEEMP_HEADERSr	   r   r   rer   rD   intr{   )r6   rN   r   rK   r~   s        r   _emp_fetch_last_pager      s    T8[111A//D??-..D q
	-txx	2/F/FGGA&3qwwqzz???Q&r   pagec           	      2   t            d| }t          | |t                    }|j        }t	          |d          }g }|                    d          D ]A}|                    d          }t          |          dk     r,t          |d         	                    d	                    }	t          |d
         	                    d	                    }
t          |d         	                    d	                    }|d         }|
                    d          }|rQ|                    d          r<|                    d                              dd                                          }nWt          |	                    dd	                    }t                              |          }|r|                    d          n|}t          |          dk    r9d                    d |dd         D                                                       nd}|r*|r(|                    |d                                          }|dk    r"d|v rt$                              d|	|
           t)          |	|
|||g          s&|                    |	|
|||d           C||fS )Nz?pageIndex=rw   rx   ztable tbody trr   r   r   Tr   r:   r   r   r   r   r   r   c              3   ^   K   | ](}t          |                    d d                    V  )dS )r   Tr   N)r   rf   )r}   r   s     r   r   z"_emp_parse_page.<locals>.<genexpr>   s:      NNbVBKK4K8899NNNNNNr   r1   u   파견u   기획재정부u   skip 파견·기재부: %s/%s)r    r!   r"   r-   r.   )r   rT   r   r   r	   r   re   r   r   rf   r   rD   r   r|   r   r   r{   r   rG   debugr   rj   )r6   r   r8   rN   r   r   r   r   r   r    r!   r"   phone_tdr   r.   r   r~   r-   s                     r   _emp_parse_pager      s   
(
($
(
(CT3,,,A6D}--DDkk*++ #
 #
kk$s88a<<c!fooDo1122#a&///5566CFOO$O7788
r7##$566 	3UYYv&& 	3IIf%%--fb99??AAEEx00D0AABBI	**A"#2AGGAJJJE 3xx!|| HHNNC"INNNNNTTVVV 	
  	3T 	3<<r**0022D !!&74&?&?LL8$IIID(Je<== 	xzSWbghh	
 	
 	
 	
 :r   c                     	 |                      d                              dd                                          }|j        pdS # t          $ r Y dS w xY w)u>   현재(open) 스냅샷 수 — 뷰(motie_org_cur)에서 읽기motie_org_curkey_hashexact)countr   )r   r   executer   rF   )sbress     r   
_count_curr     se    hh''..z.IIQQSSy~A   qqs   AA 
AA	stg_count	cur_countmin_abs	min_ratioc                    | |k     r
dd|  d| fS |dk    r4| t          ||z            k     rdd|  dt          ||z             d| dfS dS )	NFztoo_few_rows: z < r   zratio_drop: z (cur=))Tok)r   )r   r   r   r   s       r   	_validater     s    7>y>>W>>>>1}}SY)>%?%???aYaa3y97L3M3MaaU^aaaaa:r   rb   bc                 8   d | pd                     d          D             d |pd                     d          D             z   }t                      g }}|D ]0}||vr*|                    |           |                    |           1d                    |          S )Nc                 ^    g | ]*}|                                 |                                 +S  r   r}   ra   s     r   
<listcomp>z _merge_tasks.<locals>.<listcomp>'  s-    BBB1		BQWWYYBBBr   r   /c                 ^    g | ]*}|                                 |                                 +S r   r   r   s     r   r   z _merge_tasks.<locals>.<listcomp>'  s3    E{E{E{TUqrqxqxqzqzE{aggiiE{E{E{r   z / )r   rh   ri   rj   r   )rb   r   partsrr   rq   ra   s         r   _merge_tasksr   &  s    BBb 4 4BBBE{E{Z[Za_aYhYhilYmYmE{E{E{{Er#D ' 'D==HHQKKKA::c??r   c                 n   | pd                                 } |pd                                 }t          t                              |                     }t          t                              |                    }|r|s| S |r|s|S | s|r|S |s| r| S t	          |           t	          |          k    r| n|S )Nr   )r|   rY   r   	fullmatchr   )rb   r   a_okb_oks       r   _pick_better_phoner   .  s    	
bA	
bA""1%%&&D""1%%&&D D  D     A#a&&  11a'r   r   c                 R   i }| D ]}|d         }||vrt          |          ||<   !||         }t          |                    d          |                    d                    |d<   t          |                    d          |                    d                    |d<   |                                D ]R}t          |d         |d         |d         |                    dd          |                    dd                    |d<   St          |                                          S )	Nr   r-   r.   r    r!   r"   r   row_hash)r>   r   rD   r   r   r0   r   )r   by_keyrN   kr+   vs         r   _dedupe_stage_rowsr   =  s    F N NjMF??QF1Iay#DHHV$4$4aeeFmmDDV*488G+<+<aeeGnnMMW]]__ 
 
!fIq}aoquuVR7H7H!%%PWY[J\J\
 
*    r     chunkr   r   c                    t          dt          |          |          D ]L}||||z            }|s|                     |                              |                                           Md S )Nr   )rA   r   r   upsertr   )r   r   r   r   rM   parts         r   _chunked_upsertr   M  sw    1c$ii'' / /AE	M" 	
t$$,,....	/ /r   run_id	dept_namec                 
   g }|D ]p}t          |d         |d         |          }|                    | |d         |d         ||                    dd          |                    dd          |d           qt          |          S )Nr    r!   r-   r   r.   r   r    r!   r"   r-   r.   r   )r,   rj   rD   r   )r   r   r   stagedrN   r   s         r   _prepare_stage_batch_from_viewr   T  s    F 
 
ai:	:: &	jM'fb))w++ 
	
 
	
 
	
 
	
 f%%%r   c                 6   g }|D ]}|                     dd          }t          |d         |d         |          }|                    | |d         |d         ||                     dd          |                     dd          |d           t          |          S )Nr"   r   r    r!   r-   r.   r   )rD   r,   rj   r   )r   r   r   rN   depr   s         r   _prepare_stage_batch_from_empr   e  s    F 
 
eeL"%%ai:44 &	jM!fb))w++ 
	
 
	
 
	
 
	
 f%%%r   皙?   皙?	sleep_secc           
         t                      }t          t          j                              }|                    d                              |ddd                                           d}d}	 t          j                    5 }t          |t          d          }t          |t          d          }	||	z   }
|
D ]5}|d         }t          ||          \  }}}|sn|rl	 |                    d	                              |d
|d          d|d         p| |d d         d                                           n# t          $ r Y nw xY w|r8t          |||          }|r%t!          |d|d           |t#          |          z  }|Q|                    d	                              ||d          d|d         p| |d                                           |dz  }t%          j        |            7t)          |          }t+          d|dz             D ]}t-          ||          \  }}|                    d	                              |d| |d                                           |r7t/          ||          }|r%t!          |d|d           |t#          |          z  }|dz  }t%          j        d           	 d d d            n# 1 swxY w Y   |                    d                              ||dd| d                                           t1          |          }t3          ||||          \  }}|st5          j        t8          j                                                  }|                    d                              d||||d                               d|                                           |                    d                              d|d                               d|                                           tB          "                    d |           |d|d!S |#                    d"d#|i                                          }tB          $                    d$|j%                   t5          j        t8          j                                                  }|                    d                              d%d&i                               d|                                           |                    d                              d&|||d'                               d|                                           tB          $                    d(|||           |d&||d)S # t          $ r}t5          j        t8          j                                                  }	 |                    d                              d*|||t          |          d                               d|                                           n# t          $ r Y nw xY wtB          &                    d+|           |d*t          |          d,cY d }~S d }~ww xY w)-Ncrawler_run	motie_orgrunning)idtargetstatusr   r   institutionr    motie_org_rawzMISS:rt   :r   i@  )r   r   r   motie_org_stgr   r   r:   zemp:r   motie_org_snapshot	collectedzpages=)r   r   r  note)r   r   failed)r  finished_atpagesr   fail_reasonr  )r  r
  r   z![motie_org] validation failed: %s)r   r  reasonfinalize_motie_runp_run_idz"[motie_org] finalize_motie_run: %sr  passed)r  r  r  r   z4[motie_org] run passed: run_id=%s, rows=%d, pages=%d)r   r  r   r  abortedz[motie_org] run aborted: %s)r   r  error)'r
   struuiduuid4r   rg   r   requestsSessionr   r   r   r   r   rF   r   r   r   rI   r5   r   rA   r   r   r   r   r   nowr   utc	isoformatr@   eqrG   r  rpcr   r3   	exception)r   r   r   r   r   r  r	  r6   
hq_targetsinst_targetsr   r[   r   	rows_viewr   r   stagerK   ra   rows_emphtml_emp	stage_empr   r   r  r  r   rO   s                               r   run_oncer'  z  s   	BFHH]""&KS\#]#]^^ffhhhEI\H 5	 4*41A>RRJ*4-PPL </G  & &fI	);D!)D)D&	4 ! U 1188&,$TAgJ$T$T99R$T$T$)%4%L: :   #79999$     0:69iXXE 0'OU$OOOO!SZZ/	#HH_--44#)aj3^3^1Y<C\S\3^3^hlmm giii

9%%%% (--D1dQh''    %4T1%=%="( ))00%zazz8LL '))) 4 =fh O OI  4'OYdSSSS!S^^3	

3 M5	  5	  5	  5	  5	  5	  5	  5	  5	  5	  5	  5	  5	  5	  5	 p 	%&&--yKQaZ_QaQabb	
 	

')))rNN	y)WPYZZZ
F 	L,x|,,6688CHH]##**#C%QZkqrr bvwwyyyHH)**11Xv2V2VWWZZ[cekllttvvvLL<fEEE$FKKK ff)J+?@@HHJJ8#(CCCl8<((2244
%&&--x.BCCFFxQWXX``bbb
&&eYWW	
 	

"T6

77999JFT]_deee HiRWXXX 	H 	H 	Hl8<((2244	HH]##**$S5R[lopqlrlrss bvwwyyyy 	 	 	D	6::: IAGGGGGGGG	Hs   2U3 AKAD87K8
EKEE6K<U3 KU3 KE!U3 2E U3 3
Y=0Y.A!XY
XYX0YYY__main__r   )r   r   r   )>__doc__r   rI   r  r'   loggingtypingr   r   r   r   r   r   r  bs4r	   app.services.supabase_servicer
   basicConfigINFO	getLoggerrG   r   r   r   r   r?   r   compiler   r   r  r   r   r,   r0   r  rP   rT   rV   rY   r\   rs   r   r   r   r   r   r   r   r   floatr   r   r   r   r   r   r   r'  __name__printr   r   r   <module>r5     s    
			     . . . . . . . . . . . . ' ' ' ' ' ' ' '        4 4 4 4 4 4  ',/L M M M M		/	0	0
 !000 ///%%% ?O< 
 \,782:.//bjTUU
Thsm T T T T TS S    5C 53 5C 5C 5 5 5 55C 53 5C 5s 53 5SV 5 5 5 5 HLRV`ail   8# S s    " D C C Z Z Z Z
Gc Gd G G G Gd3i    (*  S T$Z    @ T
    <X% c s c ]` lo t|  ~A  uB    8X- T eDJPXY\P]_bDb>c    *'x/ 'C ' ' ' '+(* +# +%T
C:P + + + +`c      # % TYZ^`cZcTd    HSM hsm     ((3- (HSM (c ( ( ( (!T$Z !DJ ! ! ! !  GK / / /s /$t* / / / / /&3 &3 &d4j &UYZ^U_ & & & &"&# &T$Z &DJ & & & &*cH cH cHc cHE cHTX cH cH cH cHJ z	E((** r   