o
    -hO]                     @   s4  d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
mZmZmZmZ ddlmZ ddlmZmZmZmZmZmZ dZdZdZdZeeeeef f Zed	eeef f Zer`dd
lm Z  eefdedededefddZ!efdedededefddZ"dedededededefddZ#efdede$defddZ%efdede$defdd Z&	d8dedededefd!d"Z'd#ede	e fd$d%Z(d&e	e de	e	e  fd'd(Z)G d)d* d*e*Z+G d+d, d,e+Z,G d-d. d.e+Z-G d/d0 d0e*Z.g d1Z/g d2Z0G d3d4 d4e1Z2e2d Z3eG d5d	 d	Z4G d6d7 d7e*Z5dS )9    N)	dataclass)
itemgetter)	TYPE_CHECKINGAnyDictListOptionalSetTupleTypeUnion   )utils)T_bboxT_numT_obj
T_obj_iter
T_obj_listT_point   TableSettings)Pageedgesx_tolerancey_tolerancereturnc                 C   sR   g g d}| D ]}||d   | qt|d d|}t|d d|}|| S )zs
    Given a list of edges, snap any within `tolerance` pixels of one another
    to their positional average.
    vhorientationr   x0r   top)appendr   snap_objects)r   r   r   by_orientatione	snapped_v	snapped_h r(   I/var/www/html/govbot/env/lib/python3.10/site-packages/pdfplumber/table.py
snap_edges   s   
	r*   r   	tolerancec           	      C   s   |dkr	d\}}n|dkrd\}}nt dtt| t|d}|d g}|dd	 D ])}|d
 }|| || | krO|| || krNt|||| |d
< q+|| q+|S )z
    Given a list of edges along the same infinite line, join those that
    are within `tolerance` pixels of one another.
    r   )r    x1r   )r!   bottomzOrientation must be 'v' or 'h'keyr   r   N)
ValueErrorlistsortedr   r   resize_objectr"   )	r   r   r+   min_propmax_propsorted_edgesjoinedr%   lastr(   r(   r)   join_edge_group'   s   


r:   snap_x_tolerancesnap_y_tolerancejoin_x_tolerancejoin_y_tolerancec           	         sv   dt dtttf fdd}|dks|dkrt| ||} t| |d}tj||d} fdd|D }ttj	| } | S )	z|
    Using the `snap_edges` and `join_edge_group` methods above,
    merge a list of edges into a more "seamless" list.
    edger   c                 S   s$   | d dkrd| d fS d| d fS )Nr   r   r!   r   r    r(   )r?   r(   r(   r)   	get_groupP   s   zmerge_edges.<locals>.get_groupr   r.   c                 3   s4    | ]\}}t ||d  |d  dkr nV  qdS )r   r   N)r:   ).0kitemsr=   r>   r(   r)   	<genexpr>[   s    
zmerge_edges.<locals>.<genexpr>)
r   r
   strr   r*   r3   	itertoolsgroupbyr2   chain)	r   r;   r<   r=   r>   r@   _sortededge_groupsedge_genr(   rD   r)   merge_edgesD   s   rM   wordsword_thresholdc           	   
      s   t | tdd}t fdd|}ttt j|}t|dkr"g S tttd|}t	ttd|}g }|D ]"}||||d |d || dd	|||d
 |d
 || dd	g7 }q8|S )zi
    Find (imaginary) horizontal lines that connect the tops
    of at least `word_threshold` words.
    r!   r   c                       t |  kS NlenxrO   r(   r)   <lambda>m       z"words_to_edges_h.<locals>.<lambda>r   r    r,   r   )r    r,   r!   r-   widthr   r-   )
r   cluster_objectsr   filterr2   mapobjects_to_rectrS   minmax)	rN   rO   by_toplarge_clustersrectsmin_x0max_x1r   rr(   rV   r)   words_to_edges_he   s4   rf   c                    sF  t | tdd}t | tdd}dtdtfdd}t | |d}|| | }t|dd	 d
}tfdd	|}ttt j	|}	g }
|	D ] t
 fdd|
D }|sY|
  qEt|
dkrbg S tt j|
}tt|tdd
}tttd|}tttd|tttd|fdd|D || ddg S )zy
    Find (imaginary) vertical lines that connect the left, right, or
    center of at least `word_threshold` words.
    r    r   r,   wordr   c                 S   s   t | d | d  d S )Nr    r,      )float)rg   r(   r(   r)   
get_center   s   z$words_to_edges_v.<locals>.get_centerc                 S   s
   t |  S rQ   rR   rT   r(   r(   r)   rW      s   
 z"words_to_edges_v.<locals>.<lambda>r.   c                    rP   rQ   rR   rT   rV   r(   r)   rW      rX   c                 3   s    | ]	}t  |V  qd S rQ   )r   get_bbox_overlaprA   cbboxr(   r)   rE      s    z#words_to_edges_v.<locals>.<genexpr>r   r!   r-   c              	      s*   g | ]}|d  |d     ddqS )r    r   r    r,   r!   r-   heightr   r(   )rA   b)
max_bottommin_topr(   r)   
<listcomp>   s    	z$words_to_edges_v.<locals>.<listcomp>r   rp   )r   rZ   r   r   r   r3   r[   r2   r\   objects_to_bboxanyr"   rS   bbox_to_rectr_   r^   )rN   rO   by_x0by_x1rj   	by_centerclusterssorted_clustersra   bboxescondensed_bboxesoverlapcondensed_rectssorted_rectsrd   r(   )ro   rs   rt   rO   r)   words_to_edges_v   sB   
	r   c           	         s   i } fdddD \}}t |tdddD ][}t |tdddD ]O}|d |d | krp|d |d | krp|d |d | krp|d |d | krp|d |d f}||vr^g g d||< || d	 | || d
 | q!q|S )zi
    Given a list of edges, return the points at which they intersect
    within `tolerance` pixels.
    c                    s"   g | ] t t fd dqS )c                    s   | d  kS )Nr   r(   rT   or(   r)   rW      rX   z3edges_to_intersections.<locals>.<listcomp>.<lambda>)r2   r[   )rA   r   r   r)   ru      s    z*edges_to_intersections.<locals>.<listcomp>r   r    r!   r.   r-   r,   r   r   )r3   r   r"   )	r   r   r   intersectionsv_edgesh_edgesr   r   vertexr(   r   r)   edges_to_intersections   s$   

r   r   c                    s   dt dt dtffdd tt tdtt  dtdtt	 f fdd	fd
dt
tD }ttd|S )a8  
    Given a list of points (`intersections`), return all rectangular "cells"
    that those points describe.

    `intersections` should be a dictionary with (x0, top) tuples as keys,
    and a list of edge objects as values. The edge objects should correspond
    to the edges that touch the intersection.
    p1p2r   c                    s   dt dtt fdd}| d |d kr*| |  d | | d }t|r*dS | d |d krI| |  d	 | | d	 }t|rIdS d
S )Nr   r   c                 S   s   t ttj| S rQ   )setr\   r   obj_to_bboxr   r(   r(   r)   edges_to_set   s   zCintersections_to_cells.<locals>.edge_connects.<locals>.edges_to_setr   r   Tr   r   F)r   r	   r   intersectionrS   )r   r   r   common)r   r(   r)   edge_connects   s   z-intersections_to_cells.<locals>.edge_connectspointsic                    s   |d krd S | |  | |d d  } fdd|D } fdd|D }|D ];} |s0q(|D ]0} |s:q2|d |d f}|v rb||rb||rb d  d |d |d f    S q2q(d S )Nr   c                        g | ]}|d   d  kr|qS )r   r(   rA   rU   ptr(   r)   ru          zFintersections_to_cells.<locals>.find_smallest_cell.<locals>.<listcomp>c                    r   r   r(   r   r   r(   r)   ru     r   r   r(   )r   r   restbelowrightbelow_ptright_ptbottom_right)r   r   n_pointsr   r)   find_smallest_cell
  s,   

$z2intersections_to_cells.<locals>.find_smallest_cellc                 3   s    | ]} |V  qd S rQ   r(   )rA   r   )r   r   r(   r)   rE   %  s    z)intersections_to_cells.<locals>.<genexpr>N)r   boolr2   r3   keysrS   r   intr   r   ranger[   )r   cell_genr(   )r   r   r   r   r   r)   intersections_to_cells   s   
&r   cellsc                    s0  dt dtttttf fdd}t| }t  g }g }t|r|t|}t|D ]<}||}t|dkrC t|O  || || q&t fdd|D }|dkrb t|O  || || q&t||krx|t|  	  |	  t|st|r|t| t
|dd	 d
}	dd |	D }
|
S )z
    Given a list of bounding boxes (`cells`), return a list of tables that
    hold those cells most simply (and contiguously).
    ro   r   c                 S   s(   | \}}}}||f||f||f||ffS rQ   r(   )ro   r    r!   r,   r-   r(   r(   r)   bbox_to_corners/  s   z(cells_to_tables.<locals>.bbox_to_cornersr   c                 3   s    | ]}| v V  qd S rQ   r(   rl   current_cornersr(   r)   rE   H  s    z"cells_to_tables.<locals>.<genexpr>c                 S   s   t dd | D S )Nc                 s   s     | ]}|d  |d fV  qdS )r   r   Nr(   rl   r(   r(   r)   rE   a  s    z4cells_to_tables.<locals>.<lambda>.<locals>.<genexpr>)r^   )tr(   r(   r)   rW   a  s    z!cells_to_tables.<locals>.<lambda>r.   c                 S   s   g | ]
}t |d kr|qS r   rR   )rA   r   r(   r(   r)   ru   b      z#cells_to_tables.<locals>.<listcomp>)r   r
   r   r2   r   rS   r"   removesumclearr3   )r   r   remaining_cellscurrent_cellstablesinitial_cell_countcellcell_cornerscorner_countrJ   filteredr(   r   r)   cells_to_tables)  s:   


r   c                   @   s"   e Zd Zdeee  fddZdS )	CellGroupr   c              	   C   sh   || _ tttdtd |tttdtd |tttdtd |tttdtd |f| _d S Nr   r   rh   r   )r   r^   r\   r   r[   r_   ro   )selfr   r(   r(   r)   __init__g  s   
zCellGroup.__init__N)__name__
__module____qualname__r   r   r   r   r(   r(   r(   r)   r   f  s    r   c                   @      e Zd ZdS )RowNr   r   r   r(   r(   r(   r)   r   q      r   c                   @   r   )ColumnNr   r(   r(   r(   r)   r   u  r   r   c                   @   s   e Zd Zdddee fddZedefddZd	ee	 dee	 fd
dZ
edee	 fddZedee	 fddZdedeeee   fddZdS )Tablepager   r   c                 C   s   || _ || _d S rQ   )r   r   )r   r   r   r(   r(   r)   r   z  s   
zTable.__init__r   c                 C   sJ   | j }tttd|tttd|tttd|tttd|fS r   )r   r^   r\   r   r_   )r   rm   r(   r(   r)   ro   ~  s   z
Table.bboxkindc           
         s   |t u rdnd t  }t| jt| d}ttttt | j}t	|t|}g }|D ]\}} fdd|D |fdd|D }	|
|	 q1|S )Nr   r   r.   c                    s   i | ]}|  |qS r(   r(   )rA   r   )axisr(   r)   
<dictcomp>      z+Table._get_rows_or_cols.<locals>.<dictcomp>c                    s   g | ]}  |qS r(   )getr   )xdictr(   r)   ru     r   z+Table._get_rows_or_cols.<locals>.<listcomp>)r   r   r3   r   r   r2   r   r\   rG   rH   r"   )
r   r   antiaxisrJ   xsgroupedrowsy	row_cellsrowr(   )r   r   r)   _get_rows_or_cols  s   
zTable._get_rows_or_colsc                 C   
   |  tS rQ   )r   r   r   r(   r(   r)   r        
z
Table.rowsc                 C   r   rQ   )r   r   r   r(   r(   r)   columns  r   zTable.columnskwargsc                    s   | j j}g }dtdtdtfdd| jD ][g }fdd|D }jD ]D  d u r.d }n6 fdd|D }t|rbd	|v rX d
  d  |d<  d  d  |d<  |d< tj	|fi |}nd}|
| q%|
| q|S )Ncharro   r   c                 S   sX   | d | d  d }| d | d  d }|\}}}}t ||ko*||k o*||ko*||k S )Nr!   r-   rh   r    r,   )r   )r   ro   v_midh_midr    r!   r,   r-   r(   r(   r)   char_in_bbox  s   z#Table.extract.<locals>.char_in_bboxc                    s   g | ]
} |j r|qS r(   rn   rA   r   )r   r   r(   r)   ru     r   z!Table.extract.<locals>.<listcomp>c                    s   g | ]	}| r|qS r(   r(   r   )r   r   r(   r)   ru     s
    
layoutrh   r   layout_widthr   r   layout_heightlayout_bbox )r   charsr   r   r   r   r   rS   r   extract_textr"   )r   r   r   	table_arrarr	row_chars	cell_text
cell_charsr(   )r   r   r   r)   extract  s,   

zTable.extractN)r   r   r   r   r   r   propertyro   r   r   r   r   r   r   r   rF   r   r(   r(   r(   r)   r   y  s    	"r   )lineslines_stricttextexplicit)snap_tolerancer;   r<   join_tolerancer=   r>   edge_min_lengthmin_words_verticalmin_words_horizontalintersection_toleranceintersection_x_toleranceintersection_y_tolerancec                   @   r   )
UnsetFloatNr   r(   r(   r(   r)   r     r   r   c                   @   s*  e Zd ZU dZeed< dZeed< dZee	e
eef   ed< dZee	e
eef   ed< eZeed< eZeed< eZeed	< eZeed
< eZeed< eZeed< dZeed< eZeed< eZeed< dZeed< eZeed< eZeed< dZ ee!ee"f  ed< dddZ#e$dee% dd fddZ&dS )r   r   vertical_strategyhorizontal_strategyNexplicit_vertical_linesexplicit_horizontal_linesr   r;   r<   r   r=   r>   r   r   r   r   r   r   r   text_settingsr   c                 C   s   t D ]}t| |p
ddk rtd| dqdD ]}t| |d }|tvr2t| ddt dq| jd	u r;i | _d
D ]}|| jvrN| jdd| j|< q=d| jv rX| jd= dD ]\}}t| |tu rnt| |t| | qZd	S )a  Clean up user-provided table settings.

        Validates that the table settings provided consists of acceptable values and
        returns a cleaned up version. The cleaned up version fills out the missing
        values with the default values in the provided settings.

        TODO: Can be further used to validate that the values are of the correct
            type. For example, raising a value error when a non-boolean input is
            provided for the key ``keep_blank_chars``.

        :param table_settings: User-provided table settings.
        :returns: A cleaned up version of the user-provided table settings.
        :raises ValueError: When an unrecognised key is provided.
        r   zTable setting 'z' cannot be negative)
horizontalvertical	_strategyz_strategy must be one of{,}N)r   r   r+   r   ))r;   r   )r<   r   )r=   r   )r>   r   )r   r   )r   r   )	NON_NEGATIVE_SETTINGSgetattrr1   TABLE_STRATEGIESjoinr  r   UNSETsetattr)r   settingr   strategyattrfallbackr(   r(   r)   __post_init__  s4   


zTableSettings.__post_init__settingsc                 C   s   |d u r|  S t || r|S t |tr@i }i }| D ]\}}|d d dkr0|||dd  < q|||< q||d< | di |S td| )N   text_r  zCannot resolve settings: r(   )
isinstancedictrC   r1   )clsr  core_settingsr  rB   r   r(   r(   r)   resolve+  s   


zTableSettings.resolve)r   N)'r   r   r   r   rF   __annotations__r   r   r   r   r   r   r   r   DEFAULT_SNAP_TOLERANCEr   r  r;   r<   DEFAULT_JOIN_TOLERANCEr   r=   r>   r   DEFAULT_MIN_WORDS_VERTICALr   r   DEFAULT_MIN_WORDS_HORIZONTALr   r   r   r   r  r   r   r  classmethodT_table_settingsr  r(   r(   r(   r)   r     s*   
 
3c                   @   s6   e Zd ZdZddddee fddZdefd	d
ZdS )TableFindera0  
    Given a PDF page, find plausible table structures.

    Largely borrowed from Anssi Nurminen's master's thesis:
    http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3

    ... and inspired by Tabula:
    https://github.com/tabulapdf/tabula-extractor/issues/16
    Nr   r   r  c                    s^   | _ t| _   _t j jj jj _	t
 j	 _ fddt jD  _d S )Nc                    s   g | ]}t  j|qS r(   )r   r   )rA   
cell_groupr   r(   r)   ru   T  s    z(TableFinder.__init__.<locals>.<listcomp>)r   r   r  r  	get_edgesr   r   r   r   r   r   r   r   r   )r   r   r  r(   r   r)   r   J  s   

zTableFinder.__init__r   c              
   C   s  | j }dD ]'}t||d }|dkr,t|d| d }t|dk r,td| d| d	q|j}|j}|d
ks;|d
krG| jjdi |jpDi }g }|j	pMg D ]9}	t
|	trit|	D ]}
|
d dkrg||
 qZqN||	|	| jjd | jjd | jjd | jjd  dd qN|dkrt| jjd}n!|dkrtj| jjddd}n|d
krt||jd}n|dkrg }|| }g }|jpg D ]9}	t
|	trt|	D ]}
|
d dkr||
 qq|| jjd | jjd | jjd | jjd  |	|	dd q|dkr	t| jjd}n$|dkrtj| jjddd}n|d
kr&t||jd}n|dkr-g }|| }t|t| }t||j|j|j|jd}tj||jdS )N)r  r  r  r   	explicit__linesrh   zIf z"_strategy == 'explicit', explicit_zD_lines must be specified as a list/tuple of two or more floats/ints.r   r   r   r   r   rp   r   r   line)	edge_typerV   r   r   )r    r,   rY   r!   r-   r   )r;   r<   r=   r>   )
min_lengthr(   )r  r  rS   r1   r   r   r   extract_wordsr  r   r  r  r   obj_to_edgesr"   ro   filter_edgesr   r   r   r   rf   r   r2   rM   r;   r<   r=   r>   r   )r   r  r   r  r   v_strath_stratrN   
v_explicitdescr%   v_baser   
h_explicith_baser   r   r(   r(   r)   r#  X  s   











zTableFinder.get_edgesrQ   )	r   r   r   __doc__r   r   r   r   r#  r(   r(   r(   r)   r!  ?  s    
r!  )r   r   )6rG   dataclassesr   operatorr   typingr   r   r   r   r   r	   r
   r   r   r   r   _typingr   r   r   r   r   r   r  r  r  r  rF   T_intersectionsr   r   r   r*   r:   rM   r   rf   r   r   r   r   objectr   r   r   r   r	  r  ri   r   r  r   r!  r(   r(   r(   r)   <module>   s    , 


"
,
@
?=SZ