o
    hg                  #   @   s  d Z ddlZddlZddlmZ ddlmZmZmZm	Z	m
Z
mZ ddlmZmZmZmZmZ ddlmZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ 													d-dede"de%de%de
e de&de
ee&  de%de'de&de%de
e% de(d e(d!e(d"ed#df"d$d%Z)				&		d.d'e#de%de
ee&  de&d(e(de%de
e d#e%fd)d*Z*				&	d/d'e#de%de
ee&  de&d(e(de
e d#e	e fd+d,Z+dS )0zIFunctions that can be used for the most common use-cases for pdfminer.six    N)StringIO)AnyBinaryIO	ContainerIteratorOptionalcast)HOCRConverterHTMLConverterPDFPageAggregatorTextConverterXMLConverter)ImageWriter)LAParamsLTPage)	PDFDeviceTagExtractor)PDFValueError)PDFPageInterpreterPDFResourceManager)PDFPage)AnyIO
FileOrNameopen_filenametextutf-8       ?normalFinfoutfpoutput_typecodeclaparamsmaxpagespage_numberspasswordscalerotation
layoutmode
output_dirstrip_controldebugdisable_cachingkwargsreturnc              	   K   sL  |r
t  t j d}|rt|}t| d}d}|dkr'|tjkr'tjj}|dkr5t	|||||d}nE|dkrDt
||||||d}n6|dkrTt|||||
||d}n&|d	krbt|||||d
}n|dkrqt|tt||d}n	d| }t||dusJ t||}tj| |||| dD ]}|j|	 d |_|| q|  dS )ak  Parses text from inf-file and writes to outfp file-like object.

    Takes loads of optional arguments but the defaults are somewhat sane.
    Beware laparams: Including an empty LAParams is not the same as passing
    None!

    :param inf: a file-like object to read PDF structure from, such as a
        file handler (using the builtin `open()` function) or a `BytesIO`.
    :param outfp: a file-like object to write the text to.
    :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
        Only 'text' works properly.
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. Default is None
        but may not layout correctly.
    :param maxpages: How many pages to stop parsing after
    :param page_numbers: zero-indexed page numbers to operate on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param scale: Scale factor
    :param rotation: Rotation factor
    :param layoutmode: Default is 'normal', see
        pdfminer.converter.HTMLConverter
    :param output_dir: If given, creates an ImageWriter for extracted images.
    :param strip_control: Does what it says on the tin
    :param debug: Output more logging data
    :param disable_caching: Does what it says on the tin
    :param other:
    :return: nothing, acting as it does on two streams. Use StringIO to get
        strings.
    Ncachingr   )r"   r#   imagewriterxml)r"   r#   r2   stripcontrolhtml)r"   r'   r)   r#   r2   hocr)r"   r#   r4   tag)r"   z1Output type can be text, html, xml or tag but is r$   r&   r1   ih  )logging	getLoggersetLevelDEBUGr   r   sysstdoutbufferr   r   r
   r	   r   r   r   r   r   r   	get_pagesrotateprocess_pageclose)r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r2   rsrcmgrdevicemsginterpreterpage rI   L/var/www/html/govbot/env/lib/python3.10/site-packages/pdfminer/high_level.pyextract_text_to_fp   sx   /	



rK   Tpdf_filer1   c              	   C   s   |du rt  }t| dP}t <}tt|}t|d}	t|	|||d}
t|	|
}tj	|||||dD ]}|
| q2| W  d   W  d   S 1 sOw   Y  W d   dS 1 s_w   Y  dS )aw  Parse and return the text contained in a PDF file.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return: a string containing all of the text extracted.
    Nrbr0   )r"   r#   r8   )r   r   r   r   r   r   r   r   r   r@   rB   getvalue)rL   r&   r%   r$   r1   r"   r#   fpoutput_stringrD   rE   rG   rH   rI   rI   rJ   extract_text   s"   



RrQ   c                 c   s    |du rt  }t| d7}tt|}t|d}t||d}t||}	tj|||||dD ]}
|		|
 |
 }|V  q-W d   dS 1 sGw   Y  dS )a  Extract and yield LTPage objects

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return: LTPage objects
    NrM   r0   )r#   r8   )r   r   r   r   r   r   r   r   r@   rB   
get_result)rL   r&   r%   r$   r1   r#   rO   resource_managerrE   rG   rH   layoutrI   rI   rJ   extract_pages   s(   




"rU   )r   r   Nr   Nr   r   r   r   NFFF)r   Nr   Tr   N)r   Nr   TN),__doc__r9   r=   ior   typingr   r   r   r   r   r   pdfminer.converterr	   r
   r   r   r   pdfminer.imager   pdfminer.layoutr   r   pdfminer.pdfdevicer   r   pdfminer.pdfexceptionsr   pdfminer.pdfinterpr   r   pdfminer.pdfpager   pdfminer.utilsr   r   r   strintfloatboolrK   rQ   rU   rI   rI   rI   rJ   <module>   s     
	

|

-
