
    hw)                    D   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZm	Z	m
Z
mZ d dlZd dlmZ dZdZ ej        d          Zdd	d	d
dddddddddZd:dZd;dZd<dZd=dZd>d Zd?d"Zd@d$ZdAd'ZdBd)ZdCdDd/ZdCdEd2ZdFd3ZdCdGd7Z d8 Z!e"d9k    r e!             dS dS )H    )annotationsN)ListDictOptionalTuple)	PdfReaderz/https://alio.go.kr/item/itemReportListSusi.jsonz$https://alio.go.kr/download/pdf.jsonu   (직위\s*변경\s*전\s*성명\s*변경\s*후\s*성명\s*변경사유)([\s\S]*?)(?=\n\s*직위\s+(?:상임|비상임|상임기관장|상임감사|상임이사|비상임이사)|\Z)C0247C0248C0305C0306C0220C0042C0043C0066C0082C0259C0236)u   한국전력공사u   한국전력기술주식회사u   한국전력기술(주)u   한전KPS(주)u	   한전KDNu   한국수력원자력(주)u   한국남동발전(주)u   한국남부발전(주)u   한국동서발전(주)u   한국서부발전(주)u   한국중부발전(주)u   한전원자력연료(주)sstrreturnc                    |                      dd          } t          j        dd|           } t          j        dd|           } t          j        dd|           } |                                 S )N z[ \t]+ z
[ ]*\n[ ]*
z\n{2,})replaceresubstrip)r   s    0/var/www/html/bot/scripts/pdf_extract_changes.py_normalize_textr!   "   s\    			$A
y#q!!A
}dA&&A
y$""A7799    patternOptional[str]c                    t          j        | |          }|r'|                    d                                          nd S )N   r   searchgroupr   )r#   r   ms      r    _pickr+   )   s9    
	'1A!",1771::,r"   textc                    t          j        d|           }|r'|                    d                                          nd S )Nu-   임원\s*현황\s*\n([^\n]+)\n임원\s*현황r&   r'   )r,   r*   s     r    _extract_organ_from_textr.   -   s:    
	BDIIA!",1771::,r"   namec                    | sd S | t           v rt           |          S t                                           D ]\  }}|| v r|c S d S )N)APBA_NAME_TO_IDitems)r/   kvs      r    _apba_from_organ_namer5   2   s`     tt$$%%''  199HHH 4r"   urlc                   t          j        | d          }|                                 t          j        d          \  }}t          j        |d          5 }|                    |j                   d d d            n# 1 swxY w Y   |S )N<   )timeoutz.pdf)suffixwb)	requestsgetraise_for_statustempfilemkstemposfdopenwritecontent)r6   rfdpathfs        r    _download_to_tmprI   ?   s    S"%%%Av...HB	2t		 		              Ks   A??BBpath_or_urlc                `   | }|                      d          rt          |           }	 t          |          }g }|j        D ]Q}	 |                    |                                pd           -# t          $ r |                    d           Y Nw xY wd                    |          }|| k    r3t          j	        
                    |          rt          j        |           n@# || k    r4t          j	        
                    |          rt          j        |           w w w xY wt          |          S )N)zhttp://zhttps://r   r   )
startswithrI   r   pagesappendextract_text	ExceptionjoinrA   rG   existsremover!   )rJ   
local_pathreaderrM   pr,   s         r    extract_text_from_pdfrW   G   s<   J566 3%k22
":&& 	! 	!A!Q^^--34444 ! ! !R     !yy$$
)C)C$Ij!!! $$
)C)C$Ij!!!! %$4   s/   C! )A-,C! -BC! BC! !=Dsection_textc                   t          j        d|           }|r;t          j        dd|                    d                                                    S t          j        d|           }|r;t          j        dd|                    d                                                    S t          j        d|           }|r;t          j        dd|                    d                                                    S d S )NuG   임기\s*\(시작일\)\s*([^\n(]+?)\s*\(종료일\)\s*([^\n]+?)(?:\n|$)z\s+r   r&   u<   임기\s*[:：]?\s*([^\n~\-–—]+?)\s*[~\-–—]\s*[^\n]+u1   임기\s*(?:시작(?:일)?)?\s*[:：]?\s*([^\n]+))r   r(   r   r)   r   )rX   r*   s     r    _parse_start_in_sectionrZ   Z   s    
	\^jkkA 7vfc1771::#3#3#5#5666
	QS_``A 7vfc1771::#3#3#5#5666
	FUUA 7vfc1771::#3#3#5#56664r"   merged_text	List[str]c                @    d t          j        d|           D             S )Nc                <    g | ]}|                     d           |S )u   직위)rL   ).0secs     r    
<listcomp>z,_split_position_sections.<locals>.<listcomp>k   s*    ```Cs~~V^G_G_`C```r"   u   \n(?=직위\s))r   split)r[   s    r    _split_position_sectionsrc   i   s#    ``28$5{CC````r"   pos_hintc                d   t          |           }t          j        dt          j        |           d          }t	          |pd          }|D ]b}|                    |          st          d|          pd}|r!t	          |          r|t	          |          vrMt          |          }|r|c S cd S )Nu   성명\s*[:：]?\s*z(\s|$)r   u   직위\s*([^\n]+?)\s*성명)rc   r   compileescaper!   r(   r+   rZ   )	r[   r/   rd   sectionsname_patpos_hint_normr`   sec_posstarts	            r    #_find_start_in_text_by_name_and_posrm   m   s    '44HzH4HHHIIH#HN33M  s## 	6<<B 	_W55 	-_fOgOg:g:g',, 	LLL	4r"      apba_id	max_pagesintList[Dict[str, str]]c           	     ^   g }ddddd}t          d|dz             D ]}t          |          | dd}t          j        t          |t          j        |          d	
          }	 |                                }n# t          $ r Y  nw xY w|                    d          dk    r n|                    d          pi                     d          pg }|s nY|D ]T}	|	                    |	                    d          |	                    d          |	                    d          d           U|S )Nzhttps://alio.go.krzhttps://alio.go.kr/XMLHttpRequestzapplication/json;charset=UTF-8)OriginRefererzX-Requested-WithzContent-Typer&   20305)pageNoapbaIdreportFormRootNo   )headersdatar9   statussuccessr}   resultdisclosureNoidatetitle)r   r   r   )
ranger   r<   postLIST_URLjsondumpsrP   r=   rN   )
ro   rp   r2   headers_jsonpagebodyrE   r}   r   rows
             r    _list_itemsr   }   sk   "$E&(,8	 L aQ''  d))wGTTM(Ltz$?O?OY[\\\	6688DD 	 	 	EE	88H**E((6""(b--h77=2 	E 	 	CLL # 7 7))))     	 Ls   #A88
BBcurrent_disclookbackc                    t          | d          }|sg S t          fdt          |          D             d           }||dd|z            S ||dz   |dz   |z            S )N   )rp   c              3  T   K   | ]"\  }}|                     d           k    |V  #dS )r   Nr=   )r_   iitr   s      r    	<genexpr>z#_find_prev_items.<locals>.<genexpr>   s=      ZZea266.3I3I\3Y3Y3Y3Y3Y3YZZr"   r&   )r   next	enumerate)ro   r   r   r2   idxs    `   r    _find_prev_itemsr      s    1---E 	
ZZZZy//ZZZ\`
a
aC
{Qqz\""QAh&''r"   c           	        t                               |           }|sg S |                    d                                          }d |                    d          D             }g }t          j        d          }|D ]}|                    |          }|s|                    |                    d                                          |                    d                                          |                    d                                          |                    d                                          d	           |S )
N   c                ^    g | ]*}|                                 |                                 +S  )r   )r_   lns     r    ra   z!parse_changes.<locals>.<listcomp>   s-    BBBBrxxzzBRXXZZBBBr"   r   zN^\s*(?P<position>.+?)\s+(?P<before>\S+)\s+(?P<after>\S+)\s+(?P<reason>.+?)\s*$positionbeforeafterreason)r   r   r   r   )	
HEADER_PATr(   r)   r   rb   r   rf   matchrN   )r[   r*   blocklinesoutline_rer   mms           r    parse_changesr      s4   +&&A 	GGAJJEBB%++d"3"3BBBE "CjjkkG 	 	]]2 	

,,2244hhx((..00XXg&&,,..hhx((..00	
 
 	 	 	 	 Jr"   text_currentchangesNonec           	        |sd S |D ]H}|                     d          s1t          ||d         |                     d                    }|r||d<   Id |D             }|sd S | s"t          |          }t          |          } | sd S t	          | ||          }	|	D ]}
|
                     d          }|st
           d| }	 t          |          }n# t          $ r Y Cw xY w|D ]I}|                     d          rt          ||d         |                     d                    }|r||d<   Jt          d |D                       r d S d S )	N
prev_startr   r   c                <    g | ]}|                     d           |S )r   r   r_   cs     r    ra   z0fill_prev_start_with_history.<locals>.<listcomp>   s)    BBB!aeeL.A.ABABBBr"   r   r   z?disclosureNo=c              3  @   K   | ]}|                     d           V  dS )r   Nr   r   s     r    r   z/fill_prev_start_with_history.<locals>.<genexpr>   s.      99qquu\""999999r"   )	r=   rm   r.   r5   r   PDF_URLrW   rP   all)ro   r   r   r   r   r   psneed_historyorgan
prev_itemsr   discr6   t_prevs                 r    fill_prev_start_with_historyr      s     % %uu\"" 	%4\1X;PQPUPUV`PaPabbB %"$,BBwBBBL  (66'.. 	F!'<(KKKJ  vvn%% 	....	*3//FF 	 	 	H	 	% 	%Auu\"" 4VQx[!%%PZJ[J[\\B %"$,99L99999 	EE	! s   ?C
CCc                 x   d } d }d}t           j        dd          }|s/t          dt           j                   t          j        d           |d         } d}|t          |          k     r||         dk    r9|dz   t          |          k     r#||dz                                            }|dz  }nY||         dk    rH|dz   t          |          k     r2	 t          ||dz                      }n# t          $ r d}Y nw xY w|dz  }n|dz  }|t          |          k     t          |           }t          j        d	|           }|r|                    d          nd
}t          |          }|D ]}	d
|	d<   t          |||||           |s;t          t          j        g dddd                     t          j        d           t          t          j        d|idd                     |D ]P}	|	                    d          pd}
t          d|	d          d|	d          d|
 d|	d          d|	d          d           Qd S )Nrn   r&   z_Usage: python -m scripts.pdf_extract_changes <PDF_URL_or_path> [--apba-id C0305] [--lookback 5])filer   z	--apba-idr   z
--lookbackzdisclosureNo=([0-9]+)r   r   r   u-   변경 전/후 표를 찾지 못했습니다.)r   messageF)ensure_asciiindentr   -u	   [변경] r   z | r   u    (임기 시작일: u   ) → r   u
    (사유: r   ))sysargvprintstderrexitlenr   rq   rP   rW   r   r(   r)   r   r   r   r   r=   )srcro   r   argsr   r,   m_discr   r   r   r   s              r    mainr      s   
CGH8ABB<D ovy  wA  	B  	B  	B  	B
q'C	A
c$ii--7k!!acCIIoo1Q3ioo''GFAA!W$$1s4yytAaCy>>   FAAFA c$ii-- !%%DY/55F&,46<<???"LD!!G  , !,gPXYYYY djR4cddsx  BC  D  D  D  	E  	E  	E 
$*i)a
H
H
HIII } }UU<  'C{!J-{{AhK{{R{{WXY`Wa{{mnowmx{{{||||} }s   C* *C98C9__main__)r   r   r   r   )r#   r   r   r   r   r$   )r,   r   r   r$   )r/   r$   r   r$   )r6   r   r   r   )rJ   r   r   r   )rX   r   r   r$   )r[   r   r   r\   )r[   r   r/   r   rd   r$   r   r$   )rn   )ro   r   rp   rq   r   rr   )ro   r   r   r   r   rq   r   rr   )r[   r   r   rr   )ro   r$   r   r   r   r   r   rr   r   rq   r   r   )#
__future__r   rA   r   r   r   r?   typingr   r   r   r   r<   pypdfr   r   r   rf   r   r1   r!   r+   r.   r5   rI   rW   rZ   rc   rm   r   r   r   r   r   __name__r   r"   r    <module>r      s:   " " " " " " " " " " " " " " " " " " " " " " " " " " . . . . . . . . . . . .        =1 RZ | 
 "&-&")&&&&&")     - - - -- - - -

 
 
 
   ! ! ! !&   a a a a        8	( 	( 	( 	( 	(   *( ( ( ( (V/} /} /}b zDFFFFF r"   