o
    -h/c                     @   sR  d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	m
Z
mZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZmZmZmZ d dlmZm Z  d d	l!m"Z" d d
l#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5 ddl&m6Z6m7Z7m8Z8 ddl9m:Z:m;Z; ddl<m=Z= e>dZ?e@g dZAerddlBmCZC ddlDmEZE ddddddZFdeGdeHfd d!ZId"eeHef deeHef fd#d$ZJG d%d& d&eZKd:d'e(d(e)de(fd)d*ZLd'e(d+e)de(fd,d-ZMG d.d/ d/e-ZNG d0d1 d1eNZOd2e(d3e(ddfd4d5ZPG d6d7 d7eOZQG d8d9 d9eOZRdS );    N)	lru_cache)
TYPE_CHECKINGAnyCallableDict	GeneratorListOptionalPatternTupleUnion)	normalize)warn)PDFPageAggregator)LTCharLTComponentLTContainerLTCurveLTItemLTPageLTTextContainer)PDFPageInterpreter	PDFStackT)PDFPage)	PSLiteral   )utils)T_bboxT_numT_obj
T_obj_list)	Container)PDFStructTreeStructTreeMissing)T_table_settingsTableTableFinderTableSettings)decode_textresolve_allresolve_and_decode)MalformedPDFExceptionPdfminerException)TextMapz^LT)advheight	linewidthptssizesrcsizewidthx0x1y0y1bitsmatrixuprightfontnametext	imagemask
colorspaceevenoddfillnon_stroking_colorstrokestroking_colorstreamnamemcidtag)	PageImage)PDFzSimSun,RegularzSimHei,RegularzSimKai,RegularzSimFang,RegularzSimLi,Regular)s   s   s   _GB2312s   _GB2312s   r<   returnc                 C   sh   d| v r|  dd }| d | | |d  }}nd| }}t|t|dd }t|dd | S )N   +r          )indexCP936_FONTNAMESgetstr)r<   split_atprefixsuffix
suffix_new rX   H/var/www/html/govbot/env/lib/python3.10/site-packages/pdfplumber/page.pyfix_fontname_bytes\   s   
rZ   kwargsc                 C   s   dd |   D S )Nc                 S   s(   i | ]\}}|t |trt|n|qS rX   )
isinstancelisttuple).0keyvaluerX   rX   rY   
<dictcomp>h   s    z'tuplify_list_kwargs.<locals>.<dictcomp>)items)r[   rX   rX   rY   tuplify_list_kwargsg   s   rd   c                       s   e Zd ZU dZdZee ed< dZee	 ed< dde
dee ddfdd	Zdd
dZdddZdef fddZd fddZd fddZ  ZS )"PDFPageAggregatorWithMarkedContentzZExtract layout from a specific page, adding marked-content IDs to
    objects where found.Ncur_mcidcur_tagrH   propsrK   c                 C   s6   t |j| _t|trd|v r|d | _dS d| _dS )z5Handle beginning of tag, setting current MCID if any.MCIDN)r(   rF   rg   r\   dictrf   )selfrH   rh   rX   rX   rY   	begin_tagu   s   
z,PDFPageAggregatorWithMarkedContent.begin_tagc                 C   s   d| _ d| _dS )z/Handle beginning of tag, clearing current MCID.N)rg   rf   rk   rX   rX   rY   end_tag}   s   
z*PDFPageAggregatorWithMarkedContent.end_tagc                 C   s,   | j jr| j jd }| j|_| j|_dS dS )z^Add current MCID to what we hope to be the most recent object created
        by pdfminer.six.rO   N)cur_item_objsrf   rG   rg   rH   )rk   cur_objrX   rX   rY   tag_cur_item   s
   	z/PDFPageAggregatorWithMarkedContent.tag_cur_itemc                    s   t  j|i |}|   |S )z;Hook for rendering characters, adding the `mcid` attribute.)superrender_charrr   )rk   argsr[   r.   	__class__rX   rY   rt      s   z.PDFPageAggregatorWithMarkedContent.render_charc                       t  j|i | |   dS )z7Hook for rendering images, adding the `mcid` attribute.N)rs   render_imagerr   rk   ru   r[   rv   rX   rY   ry         z/PDFPageAggregatorWithMarkedContent.render_imagec                    rx   )zAHook for rendering lines and curves, adding the `mcid` attribute.N)rs   
paint_pathrr   rz   rv   rX   rY   r|      r{   z-PDFPageAggregatorWithMarkedContent.paint_pathNrK   N)__name__
__module____qualname____doc__rf   r	   int__annotations__rg   rS   r   r   rl   rn   rr   floatrt   ry   r|   __classcell__rX   rX   rv   rY   re   n   s   
 

re   box_rawrotationc                 C   sp   t dd | D std|  t| d | d f\}}t| d | d f\}}|dv r2||||fS ||||fS )	Nc                 s   s    | ]	}t |tjV  qd S r}   )r\   numbersNumberr_   xrX   rX   rY   	<genexpr>   s    z!_normalize_box.<locals>.<genexpr>z0Bounding box contains non-number coordinate(s): r   rN   r      )Z   i  )allr+   sorted)r   r   r5   r6   r7   r8   rX   rX   rY   _normalize_box   s   r   	mb_heightc                 C   s    | \}}}}||| ||| fS r}   rX   )r   r   r5   r7   r6   r8   rX   rX   rY   _invert_box   s   r   c                   @   s  e Zd ZU ejdg Zee ed< dZe	ed< dZ
	dgddd	ed
edefddZdhddZedefddZedefddZedeeeef  fddZedefddZedefddZedefddZedeeef fddZdeeef deeef fd d!Zd"edefd#d$Z d%ee! de"eddf fd&d'Z#deeef fd(d)Z$	did*e%e& de'fd+d,Z(	did*e%e& dee) fd-d.Z*	did*e%e& de%e) fd/d0Z+	did*e%e& deeee%e    fd1d2Z,	did*e%e& de%eee%e    fd3d4Z-d5ede.fd6d7Z/					djd8e0ee1e f d9e	d:e	d;ed<e	d=e	d5edeeeef  fd>d?Z2d5edefd@dAZ3d5edefdBdCZ4d5edefdDdEZ5	dkdFe	d<e	d5edefdGdHZ6	dldJe7dKe	dLe	ddMfdNdOZ8	dldJe7dKe	dLe	ddMfdPdQZ9	dldJe7dKe	dLe	ddMfdRdSZ:dTe;ege	f ddUfdVdWZ<d5eddUfdXdYZ=				I	IdmdZe%e0ee>f  d[e%e0ee>f  d\e%e0ee>f  d]e	d^e	dd_fd`daZ?didbe%ee  deeef fdcddZ@defdedfZAdS )nPage_layoutcached_propertiesTis_originalNr   pdfrJ   page_objpage_numberinitial_doctopc                    s   || _ | | _ | _|| _|| _ddtdtdtf fdd}|dd}|d | _t|d	| j}|d
 |d  }t	||| _
dD ]}	|	 jv rYt	t||	| j|}
t| |	 |
 q?d jvrc| j
| _| j
| _t | j| _d S )Nr`   defaultrK   c                    s    t  j| }|d u r|S |S r}   )r)   attrsrR   )r`   r   ra   r   rX   rY   get_attr   s   zPage.__init__.<locals>.get_attrRotater   ih  MediaBoxr   r   )CropBoxTrimBoxBleedBoxArtBoxr   r}   )r   	root_pager   r   r   rS   r   r   r   r   mediaboxr   setattrlowercropboxbboxr   _get_textmapget_textmap)rk   r   r   r   r   r   	_rotationmb_rawr   box_namebox_normalizedrX   r   rY   __init__   s,   



zPage.__init__rK   c                 C   s   |    | j  d S r}   )flush_cacher   cache_clearrm   rX   rX   rY   close   s   z
Page.closec                 C      | j d | j d  S )NrN   r   r   rm   rX   rX   rY   r4         z
Page.widthc                 C   r   )Nr   r   r   rm   rX   rX   rY   r/      r   zPage.heightc                 C   s0   zdd t | j| D W S  ty   g  Y S w )z-Return the structure tree for a page, if any.c                 S   s   g | ]}|  qS rX   )to_dict)r_   elemrX   rX   rY   
<listcomp>   s    z'Page.structure_tree.<locals>.<listcomp>)r"   r   r#   rm   rX   rX   rY   structure_tree   s
   zPage.structure_treec              
   C   sx   t | dr| jS t| jj| j| jjd}t| jj|}z|| j	 W n t
y3 } zt|d }~ww | | _| jS )Nr   )pagenolaparams)hasattrr   re   r   rsrcmgrr   r   r   process_pager   	Exceptionr,   
get_result)rk   deviceinterpretererX   rX   rY   layout   s    

zPage.layoutc                    sx   dt ttf dtdt ttf ffdd dtdtf fdd}tjjp(g }tt||}t	t
r:|S |S )	NptrrK   c                    sF   |d }t |D ]}| \}}||d kr jn j}||| f} q| S )Nr   rN   )ranger4   r/   )r   r   turnsir   ycomprm   rX   rY   rotate_point  s   z!Page.annots.<locals>.rotate_pointannotc                    sb  | d \}}}} ||fj } ||fj }jj}ttg ||R |\}}	}
}| di }|d| d| dd}| D ]>\}}|d urz	|d||< W qE ty   z	|d||< W n ty   j	j
rr td	| d
| d Y nw Y qEw qEjd||| |
||	 j|	 |	||
| ||	 d}|| d| v r| d< | |d< |S )NRectAURITContents)urititlecontentszutf-8zutf-16zCould not decode z of annotation. z will be missing.r   )r   object_typer5   r7   r6   r8   doctoptopbottomr4   r/   Pdata)r   r   r/   r   r   rR   rc   decodeUnicodeDecodeErrorr   raise_unicode_errorsr   r   r   update)r   _a_b_c_dpt0pt1rhr5   r   r6   r   aextraskvparsedr   rk   rX   rY   parse  s\    
zPage.annots.<locals>.parse)r   r   r   r   r)   r   annotsr]   mapr\   CroppedPage_crop_fn)rk   r   rawr   rX   r   rY   r     s   *1

zPage.annotsc                 C   s   dd | j D S )Nc                 S   s   g | ]
}|d  dur|qS )r   NrX   )r_   r   rX   rX   rY   r   T  s    z#Page.hyperlinks.<locals>.<listcomp>)r   rm   rX   rX   rY   
hyperlinksR  s   zPage.hyperlinksc                 C   s    t | dr| jS |  | _| jS )N_objects)r   r   parse_objectsrm   rX   rX   rY   objectsV  s   

zPage.objectsr   c                 C   s*   | j d |d  | j d | j |d  fS )Nr   r   )r   r/   )rk   r   rX   rX   rY   point2coord]  s   *zPage.point2coordobjc           
         s
  t td|jj }dtttf dt	tttf  fdd}t
td t||j }||d<  j|d< dD ]}t||rGtt||j||< q6t|ttfrd| } jjd ur`t jj|n||d	< t|tr|j}t|jtru|jn|jf|d
< t|jtr|jn|jf|d< t|d trt |d |d< n#t|t!frt"t j#|d |d<  fdd|j$D |d< |j%|d<  j&d d \}}	d|v r j'|d  |	 |d<  j'|d  |	 |d<  j(|d  |d< d|v r|dkr|d | |d< |d | |d< |S )N itemrK   c                 S   s$   | \}}|t v rt|}||fS d S r}   )	ALL_ATTRSr)   )r   r   r   resrX   rX   rY   process_attrd  s
   z)Page.process_object.<locals>.process_attrr   r   )ncsscsr=   rD   rB   r<   r1   c                    s$   g | ]^}}|gt  j|R qS rX   )r   r   )r_   cmdr1   rm   rX   rY   r     s   $ z'Page.process_object.<locals>.<listcomp>pathdashrN   r7   r8   r   r   r   r5   r   r6   ))resublt_patrw   r   r   r   rS   r   r	   rj   filterr   __dict__rc   r   r   r*   getattrrF   r\   r   r   get_textr   unicode_normnormalize_unicodegraphicstatescolorr^   ncolorbytesrZ   r   r]   r   original_pathdashing_styler   r/   r   )
rk   r   kindr  attrcsr=   gsmb_x0mb_toprX   rm   rY   process_objecta  sJ   &



zPage.process_objectlayout_objectsc                 c   sR    |D ]#}t |tr | jjd ur| |V  | |jE d H  q| |V  qd S r}   )r\   r   r   r   r  iter_layout_objectsrp   )rk   r  r   rX   rX   rY   r    s   
zPage.iter_layout_objectsc                 C   sR   i }|  | jjD ]}|d }|dv rq	||d u rg ||< || | q	|S )Nr   )anno)r  r   rp   rR   append)rk   r   r   r  rX   rX   rY   r     s   zPage.parse_objectstable_settingsc                 C   s   t |}t| |S r}   )r'   resolver&   rk   r!  tsetrX   rX   rY   debug_tablefinder  s   

zPage.debug_tablefinderc                 C   s   t |}t| |jS r}   )r'   r"  r&   tablesr#  rX   rX   rY   find_tables  s   
zPage.find_tablesc                 C   sX   t |}| |}t|dkrd S dtdttttf fdd}tt	||dd }|S )Nr   r   rK   c                 S   s   t | j | jd | jd fS )Nr   r   )lencellsr   r   rX   rX   rY   sorter  s   zPage.find_table.<locals>.sorter)r`   )
r'   r"  r'  r(  r%   r   r   r   r]   r   )rk   r!  r$  r&  r+  largestrX   rX   rY   
find_table  s   

zPage.find_tablec                    s&   t | |  } fdd|D S )Nc                    s"   g | ]}|j d i  jpi qS )rX   )extracttext_settings)r_   tabler$  rX   rY   r     s   " z'Page.extract_tables.<locals>.<listcomp>)r'   r"  r'  )rk   r!  r&  rX   r1  rY   extract_tables  s   

zPage.extract_tablesc                 C   s6   t |}| |}|d u rd S |jdi |jpi S NrX   )r'   r"  r-  r.  r/  )rk   r!  r$  r0  rX   rX   rY   extract_table  s
   

zPage.extract_tabler[   c                 K   s\   t | jd}d|vr|d| ji d|vr|d| ji i ||}tj| jfi |S )N)layout_bboxlayout_width_charslayout_widthlayout_height_charslayout_height)rj   r   r   r4   r/   r   chars_to_textmapchars)rk   r[   defaultsfull_kwargsrX   rX   rY   r     s   zPage._get_textmappatternregexcase
main_groupreturn_charsreturn_groupsc           	      K   s*   | j di t|}|j||||||dS )N)r?  r@  rA  rB  rC  rX   )r   rd   search)	rk   r>  r?  r@  rA  rB  rC  r[   textmaprX   rX   rY   rD    s   
zPage.searchc                 K   s   | j di t|jS r3  )r   rd   	as_stringrk   r[   rX   rX   rY   extract_text  s   zPage.extract_textc                 K      t j| jfi |S r}   )r   extract_text_simpler;  rG  rX   rX   rY   rJ       zPage.extract_text_simplec                 K   rI  r}   )r   extract_wordsr;  rG  rX   rX   rY   rL    rK  zPage.extract_wordsstripc                 K   s   | j di t|j||dS )N)rM  rB  rX   )r   rd   extract_text_lines)rk   rM  rB  r[   rX   rX   rY   rN    s   zPage.extract_text_linesFr   relativestrictr   c                 C   s   t | |||dS )N)rO  rP  )r   rk   r   rO  rP  rX   rX   rY   crop!  s   z	Page.cropc                 C      t | |||tjdS zS
        Same as .crop, except only includes objects fully within the bbox
        )rO  rP  crop_fn)r   r   within_bboxrQ  rX   rX   rY   rV  &     zPage.within_bboxc                 C   rS  rT  )r   r   outside_bboxrQ  rX   rX   rY   rX  0  rW  zPage.outside_bboxtest_functionFilteredPagec                 C   s
   t | |S r}   )rZ  )rk   rY  rX   rX   rY   r
  :     
zPage.filterc                 K   sB   t | dd }dd | j D |_tj| jfi ||jd< |S )u   
        Removes duplicate chars — those sharing the same text and positioning
        (within `tolerance`) as other characters in the set. Adjust extra_args
        to be more/less restrictive with the properties checked.
        c                 S   s   dS )NTrX   r*  rX   rX   rY   <lambda>C  s    z#Page.dedupe_chars.<locals>.<lambda>c                 S   s   i | ]\}}||qS rX   rX   )r_   r  objsrX   rX   rY   rb   D  s    z%Page.dedupe_chars.<locals>.<dictcomp>char)rZ  r   rc   r   r   dedupe_charsr;  )rk   r[   prX   rX   rY   r_  =  s   zPage.dedupe_chars
resolutionr4   r/   	antialiasforce_mediaboxrI   c           	      C   s   ddl m}m} tdd |||fD }|dkrtd| |dur+d| | j }n|dur6d| | j }|| |p;|||dS )	z
        You can pass a maximum of 1 of the following:
        - resolution: The desired number pixels per inch. Defaults to 72.
        - width: The desired image width in pixels.
        - height: The desired image width in pixels.
        r   )DEFAULT_RESOLUTIONrI   c                 s   s    | ]}|d uV  qd S r}   rX   r   rX   rX   rY   r   X  s    z Page.to_image.<locals>.<genexpr>zUOnly one of these arguments can be provided: resolution, width, height. You provided NH   )ra  rb  rc  )displayrd  rI   sum
ValueErrorr4   r/   )	rk   ra  r4   r/   rb  rc  rd  rI   	num_specsrX   rX   rY   to_imageH  s    zPage.to_imageobject_typesc              	   C   sl   |d u rt | j dg }n|}| j| j| j| j| j| j| j	| j
d}|D ]}t| |d ||d < q&|S )Nr   )r   r   r   r   r   r   r4   r/   s)r]   r   keysr   r   r   r   r   r   r4   r/   r  )rk   rk  _object_typesdtrX   rX   rY   r   i  s   
zPage.to_dictc                 C   s   d| j  dS )Nz<Page:>)r   rm   rX   rX   rY   __repr__|  s   zPage.__repr__r   r~   r}   )TTr   TT)TT)FT)NNNFF)Br   r   r   r!   r   r   rS   r   r   boolpagesr   r   r   r   r   propertyr4   r/   r   r   r   r   r   r    r   r   r   r   r   r   r   r  r   r   r  r   r	   r$   r&   r%  r%   r'  r-  r2  r4  r-   r   r   r
   rD  rH  rJ  rL  rN  r   rR  rV  rX  r   r
  r_  r   rj  r   rr  rX   rX   rX   rY   r      s6  
 

,A"J






	






$!r   c                   @   s(   e Zd ZU dZeed< defddZdS )DerivedPageFr   parent_pagec                 C   sd   || _ |j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _| 	t
j t | j| _d S r}   )rx  r   r   r   r   r   r   r   r   r   r!   r   r   r   r   )rk   rx  rX   rX   rY   r     s   zDerivedPage.__init__N)r   r   r   r   rt  r   r   r   rX   rX   rX   rY   rw    s   
 rw  r   parent_bboxc                 C   st   t | }|dkrtd|  dt | |}|d u r%td|  d| t |}||k r8td|  d| d S )Nr   zBounding box z has an area of zero.z. is entirely outside parent page bounding box z. is not fully within parent page bounding box )r   calculate_arearh  get_bbox_overlap)r   ry  	bbox_areaoverlapoverlap_arearX   rX   rY   test_proposed_bbox  s$   

r  c                       sb   e Zd Zejddfdededeeegef de	de	f
 fdd	Z
ed
eeef fddZ  ZS )r   FTrx  	crop_bboxrU  rO  rP  c                    s   |r|j \}}}} \}	}
}}|	| |
| || || f |r%t |j  dtdtf fdd}t | || _tju rE|j | _ d S  | _ d S )Nr]  rK   c                    s
   |  S r}   rX   )r]  r  rU  rX   rY   r     r[  z&CroppedPage.__init__.<locals>._crop_fn)r   r  r    rs   r   r   r   rX  )rk   rx  r  rU  rO  rP  o_x0o_top_r5   r   r6   r   r   rv   r  rY   r     s   

zCroppedPage.__init__rK   c                    2   t  dr jS  fdd jj D  _ jS )Nr   c                    s   i | ]
\}}|  |qS rX   )r   r_   r   r   rm   rX   rY   rb     s    z'CroppedPage.objects.<locals>.<dictcomp>r   r   rx  r   rc   rm   rX   rm   rY   r     s   


zCroppedPage.objects)r   r   r   r   crop_to_bboxr   r   r   r    rt  r   rv  r   rS   r   r   rX   rX   rv   rY   r     s"     r   c                       sJ   e Zd Zdedeegef f fddZede	e
ef fddZ  ZS )rZ  rx  	filter_fnc                    s   |j | _ || _t | d S r}   )r   r  rs   r   )rk   rx  r  rv   rX   rY   r     s   zFilteredPage.__init__rK   c                    r  )Nr   c                    s"   i | ]\}}|t t j|qS rX   )r]   r
  r  r  rm   rX   rY   rb     s    z(FilteredPage.objects.<locals>.<dictcomp>r  rm   rX   rm   rY   r     s   


zFilteredPage.objects)r   r   r   r   r   r   rt  r   rv  r   rS   r    r   r   rX   rX   rv   rY   rZ    s      rZ  rs  )Sr   r  	functoolsr   typingr   r   r   r   r   r   r	   r
   r   r   unicodedatar   r  warningsr   pdfminer.converterr   pdfminer.layoutr   r   r   r   r   r   r   pdfminer.pdfinterpr   r   pdfminer.pdfpager   pdfminer.psparserr   r   r   _typingr   r   r   r    	containerr!   	structurer"   r#   r0  r$   r%   r&   r'   r(   r)   r*   utils.exceptionsr+   r,   
utils.textr-   compiler	  setr   rf  rI   r   rJ   rQ   r  rS   rZ   rd   re   r   r   r   rw  r  r   rZ  rX   rX   rX   rY   <module>   sZ    0$	
!	"3   I(