o
    ohk$                     @   s  d dl mZ d dlZd dlZd dlmZmZ d dlmZmZm	Z	m
Z
 d dlZd dlmZ d dlmZmZ d dlmZ e ZdZd	Zd
diZdddddefddZdedefddZdddedededededB f
ddZdede	e
eeef  fddZd ede
e	e ef fd!d"Zd#e	e de	e fd$d%Zd&ede	e fd'd(Zd)ed*ed+ede	e fd,d-Z d.ede!fd/d0Z"d1ede!fd2d3Z#deeeeB f fd4d5Z$e%d6kre&e$  dS dS )7    )escapeN)datetimetimezone)DictListOptionalTuple)BeautifulSoup)
get_clientlogger)broadcast_htmlz5https://www.moef.go.kr/nw/notice/hr.do?menuNo=4050300z,https://www.moef.go.kr/nw/notice/hrDetail.doz
User-AgentzCMozilla/5.0 (compatible; govbot/1.0; +https://work.jjickjjicks.com)   paramsheaderstimeouturlc                C   s   d}t dD ]=}ztj| |||d}|  |W   S  tyC } z|dkr' td|d | t| |d9 }W Y d }~qd }~ww t	d)	N      ?   r      zGET retry (%s): %s      unreachable)
rangerequestsgetraise_for_status	Exceptionr   warningtimesleepRuntimeError)r   r   r   r   delayire r&   )/var/www/html/bot/app/crawler/moef_n8n.py_get_with_retry   s   

r(   targetreturnc                 C   s   dd l }t| }ztd|| dddd ttj	
 d  W |S  ty@ } ztd|  W Y d }~|S d }~ww )Nr   crawler_runrunning)idr)   statuspagesrowsfail_reason
started_atz[crawler_run insert] )uuidstruuid4supabasetableinsertr   nowr   utc	isoformatexecuter   r   r   )r)   r3   run_idr%   r&   r&   r'   
_run_start)   s&   
r>   )r1   r=   r.   r/   r0   r1   c             
   C   s   |t tj ||d}|d ur||d< ztd|d| 	  W d S  t
yA } ztd|  W Y d }~d S d }~ww )N)r.   finished_atr/   r0   r1   r+   r-   z[crawler_run finish] )r   r9   r   r:   r;   r6   r7   updateeqr<   r   r   r   )r=   r.   r/   r0   r1   payloadr%   r&   r&   r'   _run_finish:   s   "rC   href_jsc                 C   s4   t d| pd}|sdS |d|d|dfS )za
    javascript:fn_egov_select('4050300','MOSFBBS_...','POST_ID') -> (menuNo, bbsId, postId)
    z=fn_egov_select\('(\d+)','([A-Za-z0-9_]+)','([A-Za-z0-9_]+)'\) Nr   r   r   )researchgroup)rD   mr&   r&   r'   _extract_js_paramsK   s   rJ   
title_textc                 C   sD   | sdS t d|  }|r|d |d fS d|  fS )uL   
    '[태그] 제목' -> ('태그','제목'), 아니면 (None, 원제)
    )NrE   z^\[(.+?)\]\s*(.+)$r   r   N)rF   matchstriprH   )rK   rI   r&   r&   r'   _split_tag_from_titleT   s   rN   	containerc                 C   sB   | sdS |  d}|r|jdddpd }|d}|pdS dS )u~   
    목록 DOM에서 'state*' 클래스를 갖는 태그(예: <span class="state1">인사발령</span>)를 우선 추출
    Nz@span[class*="state"], em[class*="state"], strong[class*="state"] TrM   rE   z[]())
select_oneget_textrM   )rO   eltr&   r&   r'   _extract_tag_from_state_   s   

rV   htmlc                 C   s   t d| }|st d| }|sdS |ddd}zt|d}|dW S  tyY   |d}dd	 t	|D }ztd
|d}|dW  Y S  tyX   Y Y dS w w )
uj   
    상세 페이지에서 '등록일' 또는 날짜 패턴(YYYY.MM.DD / YYYY-MM-DD) -> 'YYYY-MM-DD'
    u3   등록일[^0-9]*(\d{4}[.\-](\d{1,2})[.\-](\d{1,2}))z#(\d{4}[.\-](\d{1,2})[.\-](\d{1,2}))Nr   .-z%Y-%m-%dc                 S   s&   g | ]\}}|d kr| dn|qS )r   r   )zfill).0r#   pr&   r&   r'   
<listcomp>{   s   & z0_parse_date_from_detail_html.<locals>.<listcomp>)rF   rG   rH   replacer   strptimestrftimer   split	enumeratejoin)rW   rI   rawdtpartsr&   r&   r'   _parse_date_from_detail_htmll   s&   
rg   menuNobbsIdpostIdc                 C   s&   ||| d}t t|tdd}t|jS )N)searchBbsId1searchNttId1rh   r   r   )r(   
DETAIL_URLHEADERSrg   text)rh   ri   rj   r   r$   r&   r&   r'   _detail_posted_at   s   
rp   row_idc                 C   s,   t ddd| d }t|jS )Nmoef_idr-   r   )r6   r7   selectrA   limitr<   booldata)rq   resr&   r&   r'   _exists_in_supabase   s   "
rx   rowc                    s   g dg dg dg dg}|D ]6} fdd|D }zt d|  W  dS  tyF } ztd	| d
|  W Y d}~qd}~ww dS )u<   
    tag/posted_at 유무와 무관하게 유연 삽입
    )r-   ri   rj   titletag	posted_at)r-   ri   rj   rz   r|   )r-   ri   rj   rz   r{   r-   ri   rj   rz   c                    s   i | ]}| v r| | qS r&   r&   )r[   kry   r&   r'   
<dictcomp>   s    z _safe_insert.<locals>.<dictcomp>rr   Tzinsert retry with fields z	 failed: NF)r6   r7   r8   r<   r   r   r   )ry   fields_priorityfieldsrv   r%   r&   r   r'   _safe_insert   s   "r   c                  C   s<  d} t | }d}d}d}zbtttdd}|d7 }t|jd}|d}|D ]-}|dd	}	t|	}
|
s6q&|
\}}}| d
| }t	|rGq&d	}|
d}|r^|d}|r^|jddd}|si|jdddphd}|dp|dp|dp|dp|j}t|}t|\}}|s|}n|}zt|||}W n ty } ztd| d|  d }W Y d }~nd }~ww ||||p|d}|r||d< |r||d< t|}|std|  q&|d7 }t d| d| d| }|rdt| dt|d d	 d!nd"t|d d	 d!}d#| d$| d%}zt|}|t|tr't|n|r,dnd7 }W n tyN } ztd&| d|  W Y d }~nd }~ww td' q&t|d(||d) td*| d+|  |||d(d,W S  ty } z!t|d-||t|d. t d/|  t||d-d0W  Y d }~S d }~ww )1Nrr   r   r   )r   r   r   zhtml.parserz$a[href*="javascript:fn_egov_select"]hrefrE   rY   h3arP   TrQ   u   기재부 인사발령litrarticledivzfetch posted_at failed for z: r}   r{   r|   zinsert failed for z?searchBbsId1=z&searchNttId1=z&menuNo=z[(z) rz   ][u!   기재부 인사발령입니다.
z

<a href="u   ">[자세히 보기]</a>ztelegram send failed for r   passed)r.   r/   r0   zMOEF run: inserted=z, sent=)insertedsentr=   r.   aborted)r.   r/   r0   r1   zMOEF run aborted: )errorr=   r.   )!r>   r(   LIST_URLrn   r	   ro   rs   r   rJ   rx   find_previousfindrS   find_parentparentrV   rN   rp   r   r   r   r   rm   r   r   
isinstanceintr   r    rC   infor4   	exception)r)   r=   r/   r   r   r$   soupanchorsr   rD   r   rh   ri   rj   rq   rK   r   linkrO   r{   tag2clean_titler|   r%   ry   okheadmessagesent_resr&   r&   r'   run_once   s   





	@*"r   __main__)'rW   r   rF   r   r   r   typingr   r   r   r   r   bs4r	   app.services.supabase_servicer
   r   app.services.telegram_senderr   r6   r   rm   rn   r4   r(   r>   r   rC   rJ   rN   rV   rg   rp   ru   rx   r   r   __name__printr&   r&   r&   r'   <module>   s8   ( 		h