o
    oh,                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	 d dl
Z
d dlmZ d dlmZ d dlmZ e dd Zejed	d
 edZdZe dZdZddiZdd ZdedefddZdddededededee f
ddZe dZ!e d Z"d!ee defd"d#Z#d$edefd%d&Z$d$edee fd'd(Z%d)edee fd*d+Z&d!edee fd,d-Z'd.edee fd/d0Z(dEd2ed3edefd4d5Z)dd6d7d8ede
j*fd9d:Z+dFd<edee fd=d>Z,d?ee de	eef fd@dAZ-dBdC Z.e/dDkre0e.  dS dS )G    N)escape)DictListOptionalTuple)BeautifulSoup
get_client)broadcast_html	LOG_LEVELINFOz[%(levelname)s] %(message)s)levelformatme_n8nzhttps://me.go.krz5/home/web/board/list.do?menuId=10527&boardMasterId=22z/home/web/board/read.doz
User-Agentz*GovBot/1.0 (+https://work.jjickjjicks.com)c                   C   s   t  S Nr    r   r   '/var/www/html/bot/app/crawler/me_n8n.pySB"   s   r   targetreturnc                 C   s   dd l }ddlm}m} t| }zt d|| dddd ||j	
 d  W |S  tyI } ztd|  W Y d }~|S d }~ww )Nr   datetimetimezonecrawler_runrunning)idr   statuspagesrowsfail_reason
started_atz[crawler_run insert] )uuidr   r   struuid4r   tableinsertnowutc	isoformatexecute	Exceptionloggerwarning)r   r!   r   r   run_ider   r   r   
_run_start&   s(   r/   )r   r-   r   r   r   r   c          	   
   C   s   ddl m }m} |||j ||d}|d ur||d< zt d|d| 	  W d S  t
yJ } ztd|  W Y d }~d S d }~ww )Nr   r   )r   finished_atr   r   r   r   r   z[crawler_run finish] )r   r   r&   r'   r(   r   r$   updateeqr)   r*   r+   r,   )	r-   r   r   r   r   r   r   payloadr.   r   r   r   _run_finish:   s   $r4   z[?&]boardId=(\d+)z%(\d{4})[.\-/](\d{1,2})[.\-/](\d{1,2})sc                 C   s   | sdS d |  S )N  )joinsplit)r5   r   r   r   _cleanY   s   r:   hrefc                 C   sR   | pd  } | s
| S | ds| dr| S | drt|  S t d| d S )Nr6   zhttp://zhttps:///z./)strip
startswithBASElstrip)r;   r   r   r   _abs_url_   s   
rA   c                 C   s    t | pd}|r|dS d S )Nr6      )RE_BOARD_IDsearchgroup)r;   mr   r   r   _extract_board_idk   s   rG   	containerc                 C   s   | sd S |  dp|  dp|  dp|  d}|r%|jddd}|r%|S z| jddd}t|}|r:|dW S W d S  tyF   Y d S w )	N.date	.reg_dateztd:nth-last-of-type(1)spanr7   Tr=   r   )
select_oneget_textRE_DATErD   rE   r*   )rH   candttxtrF   r   r   r   _find_posted_textp   s0   

rS   c                 C   s\   | sd S t | }|sd S |d|dd|dd}}}| d| d| S )NrB         -)rO   rD   rE   zfill)r5   rF   ymodr   r   r   _normalize_date   s   
.r[   htmlc                 C   s   t | d}|d}g }t }|D ]c}|dd}t|}|s q||v r%q|| t|jddd}|sE|d}	|	rEt|	jddd}|	d	p[|	d
p[|	dp[|	dp[|j
}
t|
}t|pdd}|||pldt||d q|S )Nhtml.parserz7a[href*="/home/web/board/read.do"], a[href*="boardId="]r;   r6   r7   TrL   h3trliarticlediv   환경부 인사발령)r   titleurl	posted_at)r   selectsetgetrG   addr:   rN   find_previousfind_parentparentrS   r[   appendrA   )r\   soupanchorsitemsseenar;   bidrd   r^   rH   
posted_rawpostedr   r   r   parse_list_html   s6   



.
rw   	pageIndexpageparamc                 C   s2   | dkrt S dt v rdnd}t  | | d|  S )NrB   ?&=)LIST_URL)ry   rz   sepr   r   r   _list_url_for   s   r      headerstimeoutre   c                C   s   d}t dD ]<}ztj| ||d}|  |W   S  tyB } z|dkr& td|d | t| |d9 }W Y d }~qd }~ww t	d)	Ng      ?   r   rU   zGET retry (%s): %srB   rT   unreachable)
rangerequestsri   raise_for_statusr*   r+   r,   timesleepRuntimeError)re   r   r   delayirr.   r   r   r   _get_with_retry   s   

r   rB   	max_pagesc                 C   s   t dt| pd} t }g }td| d D ]d}t|d}t|tdd}t|j}dt	dt
t	 fdd}d	}|D ]6}	|	d
sO|	drO||	d }
|
rO|
|	d
< t	|	dpVd}|r^||v r_q7|| ||	 |d7 }q7|dkry|d	kry |S q|S )NrB   rx   r   r   re   r   c                    s   ztt | tdd}t|jd}dD ]}||}|r)t|jddd}|r)|  W S q|dd	gD ]7}|jdddp;d
  t	 fdddD rh|
ddgpU|j
d}|rht|jddd}|rh|  W S q1t|jddd}|W S  ty~   Y d S w )N   r   r]   )rI   rJ   r   r7   TrL   dtthr6   c                 3   s    | ]}| v V  qd S r   r   .0krQ   r   r   	<genexpr>   s    zAfetch_and_extract.<locals>._posted_from_detail.<locals>.<genexpr>)u	   등록일u	   작성일u   등록일자ddtdr_   )r   HEADERSr   textrM   r[   rN   find_allr=   anyfind_next_siblingrm   r*   )re   drro   selelr5   r   sibr   r   r   _posted_from_detail   s0   


z.fetch_and_extract.<locals>._posted_from_detailr   rf   r   r6   )maxintrh   r   r   r   r   rw   r   r"   r   ri   rj   rn   )r   rr   outpre   r   
page_itemsr   	new_addeditpdateiidr   r   r   fetch_and_extract   s2   




 r   rq   c                    s6  t  }d\}}| D ]}|d}|sq	z|ddd|d }|jr+W q	W n tyI } zt	
d| d|  W Y d }~q	d }~ww ||dpQd d	 |d
rb|d
 d
< |drn|d d< d}g dg dg dddgfD ]:}	 fdd|	D }
z|d|
  d}W  n  ty } zt	
d|	 d| d|  W Y d }~q~d }~ww |sq	|d7 }t dd}t dt t d| d}d| d| d}zt|}|t|trt|n|rdnd7 }W q	 ty } zt	
d| d|  W Y d }~q	d }~ww ||fS )N)r   r   r   me_idrB   zexist check failed for z: rd   r6   )r   rd   rf   re   F)r   rd   rf   re   )r   rd   re   )r   rd   rf   c                    s   i | ]}| v r| | qS r   r   r   rowr   r   
<dictcomp>,  s    z%upsert_and_notify.<locals>.<dictcomp>Tzinsert retry z failed for rc   z	?boardId=z&menuId=10527&boardMasterId=22u"   환경부 인사발령입니다.
[z]
<a href="u   ">[자세히 보기]</a>r   znotify failed for )r   ri   r$   rg   r2   limitr)   datar*   r+   r,   r=   r%   r   r?   DETAIL_PATHr
   
isinstancer   )rq   sbinsertedsentr   row_idexistr.   okfieldsr   rd   re   msgresr   r   r   upsert_and_notify
  sr   

"

( &"r   c            
   
   C   s8  d} t | }d}d}zed}zttddpd}W n ty$   d}Y nw |r-|dkr-|nd}t|d}||7 }|sPt|d|dd td	 d
dd|ddW S t	|\}}|}t|d||d td|| d
|||ddW S  ty }	 z t|d||t
|	d td|	 d
t
|	|ddW  Y d }	~	S d }	~	ww )Nr   r   ME_ID_BACKFILL_PAGES0rB   )r   passed)r   r   r   zME n8n-style run: no itemsME_ID)sourcer   r   r-   r   z&ME n8n-style run: inserted=%s, sent=%saborted)r   r   r   r   zME n8n-style run aborted: %s)r   errorr-   r   )r/   r   osgetenvr*   r   r4   r+   infor   r"   	exception)
r   r-   r   r   backfill_pagesr   rq   insr   r.   r   r   r   run_onceL  s:   

r   __main__)rx   )rB   )1r   reloggingr\   r   typingr   r   r   r   r   bs4r   app.services.supabase_servicer	   app.services.telegram_senderr
   r   upperr   basicConfig	getLoggerr+   r?   r~   r   r   r   r"   r/   r   r4   compilerC   rO   r:   rA   rG   rS   r[   rw   r   Responser   r   r   r   __name__printr   r   r   r   <module>   s^    




&7B
!