
    hhh                    R   d Z ddlmZ ddlZddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZmZmZmZmZmZmZmZ ddlmZ ddlZddlZdd	lmZ dd
lm Z  ddl!m"Z" ddl#m$Z$m%Z% erddl&Z&ddl'Z'ddl(Z(ddl)Z)ddl*m+Z+ g dZ,g dZ-d6dZ. ej/        e0          Z1dZ2dZ3dZ4dZ5h dZ6d7dZ7d8d Z8d8d!Z9d"d#gZ:d9d'Z; G d( d)e           Z< G d* d+e           Z= G d, d-e           Z> G d. d/e           Z? G d0 d1e           Z@ G d2 d3e           ZA G d4 d5e           ZBdS ):z(Module contains common parsers for PDFs.    )annotationsN)datetime)Path)TemporaryDirectory)TYPE_CHECKINGAnyBinaryIOIterableIteratorLiteralMappingOptionalSequenceUnioncast)urlparse)Document)BaseBlobParser)Blob)BaseImageBlobParserRapidOCRBlobParser)TextLinearizationConfig)	DCTDecodeDCT	JPXDecode)	LZWDecodeLZWFlateDecodeFlASCII85DecodeA85ASCIIHexDecodeAHxRunLengthDecodeRLCCITTFaxDecodeCCFJBIG2Decodeimages,Sequence[Union[Iterable[np.ndarray], bytes]]returnstrc                    	 ddl m} n# t          $ r t          d          w xY w |            }d}| D ]6} ||          \  }}|r$d |D             }|d                    |          z  }7|S )zExtract text from images with RapidOCR.

    Args:
        images: Images to extract text from.

    Returns:
        Text extracted from images.

    Raises:
        ImportError: If `rapidocr-onnxruntime` package is not installed.
    r   )RapidOCRzc`rapidocr-onnxruntime` package not found, please install it with `pip install rapidocr-onnxruntime` c                    g | ]
}|d          S )    ).0texts     m/var/www/FlaskApp/flask-venv/lib/python3.11/site-packages/langchain_community/document_loaders/parsers/pdf.py
<listcomp>z5extract_from_images_with_rapidocr.<locals>.<listcomp>Z   s    111$d1g111    
)rapidocr_onnxruntimer.   ImportErrorjoin)r)   r.   ocrr4   imgresult_s          r5   !extract_from_images_with_rapidocrr@   @   s    
1111111 
 
 
1
 
 	


 (**CD & &CHH	 	&11&111FDIIf%%%DKs   	 #z

{image_text}

r8   z
>   sourcecreatorproducertotal_pagescreationdateblobr   contentformatc                    |rR| j         pd}|dk    r |                    dd          }d| d| d}n#|dk    rd	t          j        |d
           d| d}|S )a  Format the content of the image with the source of the blob.

    blob: The blob containing the image.
    format::
      The format for the parsed output.
      - "text" = return the content as is
      - "markdown-img" = wrap the content into an image markdown link, w/ link
      pointing to (`![body)(#)`]
      - "html-img" = wrap the content as the `alt` text of an tag and link to
      (`<img alt="{body}" src="#"/>`)
    #zmarkdown-img]z\\]z![z]()zhtml-imgz
<img alt="T)quotez src="z" />)rA   replacehtmlescape)rF   rG   rH   rA   s       r5   _format_inner_imagerQ   i   s      X#^##ooc622G/7//f///GGz!!W4;wd#C#C#CWW6WWWGNr7   metadatadict[str, Any]c                    t                               |                                           st          d          t	          |                     dd          t                    st          d          | S )zValidate that the metadata has all the standard keys and the page is an integer.

    The standard keys are:
    - source
    - total_page
    - creationdate
    - creator
    - producer

    Validate that page is an integer if it is present.
    z3The PDF parser must valorize the standard metadata.pager   z(The PDF metadata page must be a integer.)_STD_METADATA_KEYSissubsetkeys
ValueError
isinstancegetint)rR   s    r5   _validate_metadatar]      sg     &&x}}77 PNOOOhll61--s33 ECDDDOr7   c                x   i }ddd}|                                  D ]\  }}t          |          t          t          fvrt          |          }|                    d          r
|dd         }|                                }|dv rV	 t          j        |                    dd	          d
          	                    d          ||<   # t          $ r |||<   Y w xY w||v r||||         <   |||<   t          |t                    r|                                ||<   t          |t                    r|||<   |S )zPurge metadata from unwanted keys and normalize key names.

    Args:
        metadata: The original metadata dictionary.

    Returns:
        The cleaned and normalized the key format of metadata dictionary.
    rD   rA   )
page_count	file_path/r1   N)rE   moddate'r/   zD:%Y%m%d%H%M%S%zT)itemstyper,   r\   
startswithlowerr   strptimerN   	isoformatrY   rZ   strip)rR   new_metadatamap_keykvs        r5   _purge_metadatarp      sf    $&L# G       1773*$$AA<< 	!""AGGII+++$"*"3IIc2&&(:# #)C.. Q  $ $ $"#Q$'\\'(L$LOO3 	 ggiiLOO3 	 LOs   ?CCCz




extras	list[str]text_from_pagec                    dfd	 | |d
          }|s=d}d                     t          d |                     }|rt          d         |z   }||z   }|S )a5  Insert extras such as image/table in a text between two paragraphs if possible,
    else at the end of the text.

    Args:
        extras: List of extra content (images/tables) to insert.
        text_from_page: The text content from the page.

    Returns:
        The merged text with extras inserted.
    rr   rs   rt   r,   recursboolr+   Optional[str]c                @   | rt           D ]}|                    |          }|dk    rpd }|r 	| |d |         d          }|r|||d          z   }nEd}d                    t          d |                     }|r||z   }|d |         |z   ||d          z   } nd }n|}|S )NFr/   rq   c                    | S Nr2   xs    r5   <lambda>zO_merge_text_and_extras.<locals>._recurs_merge_text_and_extras.<locals>.<lambda>   s    ! r7   )_PARAGRAPH_DELIMITERrfindr;   filter)
rr   rt   rv   delimposprevious_textall_text
all_extras
str_extras_recurs_merge_text_and_extrass
            r5   r   z=_merge_text_and_extras.<locals>._recurs_merge_text_and_extras   s      	&-    $**511"99$(M (E(E"N4C4$8%) ) % 	#0>#$$3G#G%'
%+[[V1L1L%M%M
% <).);J*4C40:=stt@TT ! E# &  %Hr7   Tr/   rq   c                    | S r|   r2   r}   s    r5   r   z(_merge_text_and_extras.<locals>.<lambda>   s    ! r7   rz   )rr   rs   rt   r,   rv   rw   r+   rx   )r;   r   r   )rr   rt   r   r   r   r   s        @r5   _merge_text_and_extrasr      s         < -,V^TJJH /
[[V!<!<==
 	?-b1J>J!J.Or7   c                  J     e Zd ZdZ	 	 d dedddddd! fdZd"dZd#dZ xZS )$PyPDFParsera  Parse a blob from a PDF using `pypdf` library.

    This class provides methods to parse a blob from a PDF document, supporting various
    configurations such as handling password-protected PDFs, extracting images.
    It integrates the 'pypdf' library for PDF processing and offers synchronous blob
    parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pypdf

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PyPDFParser

            parser = PyPDFParser(
                # password = None,
                mode = "single",
                pages_delimiter = "
",
                # images_parser = TesseractBlobParser(),
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    NFrU   r4   plain)modepages_delimiterimages_parserimages_inner_formatextraction_modeextraction_kwargspasswordOptional[Union[str, bytes]]extract_imagesrw   r   Literal['single', 'page']r   r,   r   Optional[BaseImageBlobParser]r   +Literal['text', 'markdown-img', 'html-img']r   Literal['plain', 'layout']r   Optional[dict[str, Any]]c                  t                                                       |dvrt          d          || _        |r|st	                      }|| _        || _        || _        || _        || _	        || _
        |pi | _        dS )u  Initialize a parser based on PyPDF.

        Args:
            password: Optional password for opening encrypted PDFs.
            extract_images: Whether to extract images from the PDF.
            mode: The extraction mode, either "single" for the entire document or "page"
                for page-wise extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            images_parser: Optional image blob parser.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link, w/ link
                pointing to (`![body)(#)`]
                - "html-img" = wrap the content as the `alt` text of an tag and link to
                (`<img alt="{body}" src="#"/>`)
            extraction_mode: “plain” for legacy functionality, “layout” extract text
                in a fixed width format that closely adheres to the rendered layout in
                the source pdf.
            extraction_kwargs: Optional additional parameters for the extraction
                process.

        Raises:
            ValueError: If the `mode` is not "single" or "page".
        singlerU   mode must be single or pageN)super__init__rY   r   r   r   r   r   r   r   r   r   )
selfr   r   r   r   r   r   r   r   	__class__s
            r5   r   zPyPDFParser.__init__$  s    J 	))):;;;, 	1- 	1.00M*#6  	..!2!8br7   rF   r   r+   Iterator[Document]c              #  n   K   	 ddl n# t          $ r t          d          w xY wd fd}|                                5 } j        | j        	          }t          d
d
ddt          t          |j        pi           z  |j	        t          |j                  dz            }g }t          |j                  D ]\  }} ||          }	                     |          }
t          |
g|	                                          } j        dk    r2t#          |t%          |||j        |         dz                      V  |                    |            j        dk    r8t#           j                            |          t%          |                    V  ddd           dS # 1 swxY w Y   dS )m  
        Lazily parse the blob.
        Insert image, if possible, between two paragraphs.
        In this way, a paragraph can be continued on the next page.

        Args:
            blob: The blob to parse.

        Raises:
            ImportError: If the `pypdf` package is not found.

        Yield:
            An iterator over the parsed documents.
        r   NzE`pypdf` package not found, please install it with `pip install pypdf`rU   pypdf.PageObjectr+   r,   c                    j                             d          r|                                 S  | j        ddj        ij        S )z
            Extract text from image given the version of pypdf.

            Args:
                page: The page object to extract text from.

            Returns:
                str: The extracted text.
            3r   Nr2   )__version__rg   extract_textr   r   )rU   pypdfr   s    r5   _extract_text_from_pagez7PyPDFParser.lazy_parse.<locals>._extract_text_from_pagem  sb      ++C00 ((***(t(  $($8,  r7   r   PyPDFr/   rC   rB   rE   )rA   rD   )rU   )rU   
page_labelpage_contentrR   r   )rU   r   r+   r,   )r   r:   as_bytes_io	PdfReaderr   rp   r   dictrR   rA   lenpages	enumerateextract_images_from_pager   rk   r   r   r]   page_labelsappendr   r;   )r   rF   r   pdf_file_obj
pdf_readerdoc_metadatasingle_textspage_numberrU   rt   images_from_pager   r   s   `           @r5   
lazy_parsezPyPDFParser.lazy_parseW  s     	LLLL 	 	 	W  	
	 	 	 	 	 	 	$  #	<(NNNJ*$"MMtZ06B778 #k#&z'7#8#8  L L%.z/?%@%@ 2 2!T!8!8d!C!C!C#'#@#@#F#F 1%& %''  9&&"%-!3((3.8.D[.Q " "	 	 	 	 	 	 	 !''1111yH$$!%!5!:!:<!H!H/==     A#	 #	 #	 #	 #	 #	 #	 #	 #	 #	 #	 #	 #	 #	 #	 #	 #	 #	s    %EF**F.1F.pypdf._page.PageObjectc           	        | j         sdS ddl}ddlm} dt	          t
          |d                                                   vrdS |d         d                                         }g }|D ]}d}||         d         dk    rt          ||         d	                   |j	        j
        j        u r||         d	         d
d         n||         d	         d         d
d         }|t          v rj||         d         ||         d         }
}	t          j        ||                                         t          j                                      |	|
d          }nu|t$          v rRt          j        |                    t+          j        ||                                                                       }nt.                              d           |t+          j                    }|                    |                              |d           t7          j        |                                d          }t=          | j                             |                    j         }|!                    tE          ||| j#                             tH          %                    tL          '                    tQ          d|                              S )Extract images from a PDF page and get the text using images_to_text.

        Args:
            page: The page object from which to extract images.

        Returns:
            str: The extracted text from the images on the page.
        r/   r   NImagez/XObjectz
/Resourcesz/Subtypez/Imagez/Filterr1   z/Heightz/Widthdtyperz   Unknown PDF Filter!PNG)rH   z	image/png	mime_type
image_text))r   r   PILr   r   r   rX   
get_objectrf   generic_base
NameObject_PDF_FILTER_WITHOUT_LOSSnp
frombufferget_datauint8reshape_PDF_FILTER_WITH_LOSSarrayopenioBytesIOloggerwarning	fromarraysaver   	from_datagetvaluenextr   r   r   rQ   r   _FORMAT_IMAGE_STRrH   _JOIN_IMAGESr;   r   )r   rU   r   r   xObjectr)   objnp_image
img_filterheightwidthimage_bytesrF   r   s                 r5   r   z$PyPDFParser.extract_images_from_page  s    ! 	2T$\(:;;@@BBBB2|$Z0;;== 	 	C Hs|J'833 GCL3448K8VVV CL+ABB// i03ABB7 
 !999$+CL$;WS\(=SEF!}--//rx     gfeR00 H  #888!x

2:gcl>S>S>U>U3V3V(W(WXXHH NN#8999'"$*,,KOOH--22;u2MMM>+*>*>*@*@KXXXD!%d&8&C&CD&I&I!J!J!WJMM+D*d>VWW   !''#((f)=)=>> ( 
 
 	
r7   NF)r   r   r   rw   r   r   r   r,   r   r   r   r   r   r   r   r   rF   r   r+   r   )rU   r   r+   r,   )	__name__
__module____qualname____doc___DEFAULT_PAGES_DELIMITERr   r   r   __classcell__r   s   @r5   r   r      s        . .d 15$19
 +177;KQ6=6:19 19 19 19 19 19 19 19fK K K KZ0
 0
 0
 0
 0
 0
 0
 0
r7   r   c                       e Zd ZdZdZ	 d(ddeddddd) fdZed*d            Zed+d            Z		 	 d,d-d#Z
d.d'Z xZS )/PDFMinerParsera  Parse a blob from a PDF using `pdfminer.six` library.

    This class provides methods to parse a blob from a PDF document, supporting various
    configurations such as handling password-protected PDFs, extracting images, and
    defining extraction mode.
    It integrates the 'pdfminer.six' library for PDF processing and offers synchronous
    blob parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pdfminer.six pillow

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PDFMinerParser

            parser = PDFMinerParser(
                # password = None,
                mode = "single",
                pages_delimiter = "
",
                # extract_images = True,
                # images_to_text = convert_images_to_text_with_tesseract(),
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    FNr   r4   )r   r   r   r   r   concatenate_pagesr   rw   r   rx   r   r   r   r,   r   r   r   r   r   Optional[bool]c               h   t                                                       |dvrt          d          |r|st                      }|| _        || _        || _        || _        || _        || _	        |?t          j        s&dt          _        t                              d           |rdnd| _        dS dS )aH  Initialize a parser based on PDFMiner.

        Args:
            password: Optional password for opening encrypted PDFs.
            mode: Extraction mode to use. Either "single" or "page" for page-wise
                extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            extract_images: Whether to extract images from PDF.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link, w/ link
                pointing to (`![body)(#)`]
                - "html-img" = wrap the content as the `alt` text of an tag and link to
                (`<img alt="{body}" src="#"/>`)
            concatenate_pages: Deprecated. If True, concatenate all PDF pages
                into one a single document. Otherwise, return one document per page.

        Returns:
            This method does not directly return data. Use the `parse` or `lazy_parse`
            methods to retrieve parsed documents with content and metadata.

        Raises:
            ValueError: If the `mode` is not "single" or "page".

        Warnings:
            `concatenate_pages` parameter is deprecated. Use `mode='single' or 'page'
            instead.
        r   r   NTzS`concatenate_pages` parameter is deprecated. Use `mode='single' or 'page'` instead.r   rU   )r   r   rY   r   r   r   r   r   r   r   r   _warn_concatenate_pagesr   r   )	r   r   r   r   r   r   r   r   r   s	           r5   r   zPDFMinerParser.__init__  s    P 	))):;;; 	1- 	1.00M,*#6  	.(!9 9=6=   %6A6DIII )(r7   sUnion[bytes, str]r+   c                0   ddl m t          | t                    r.|                     d          rt          | dd         dd          S 	 d | D             }d	                    fd
|D                       S # t          $ r t          |           cY S w xY w)z
        Decodes a PDFDocEncoding string to Unicode.
        Adds py3 compatibility to pdfminer's version.

        Args:
            s: The string to decode.

        Returns:
            str: The decoded Unicode string.
        r   )PDFDocEncodings      Nzutf-16beignorec              3  b   K   | ]*}t          |t                    rt          |          n|V  +d S r|   )rZ   r,   ord)r3   cs     r5   	<genexpr>z-PDFMinerParser.decode_text.<locals>.<genexpr>Y  s;      CCAjC007CFFFaCCCCCCr7   r/   c              3  (   K   | ]}|         V  d S r|   r2   )r3   or  s     r5   r
  z-PDFMinerParser.decode_text.<locals>.<genexpr>Z  s(      ;;>!,;;;;;;r7   )pdfminer.utilsr  rZ   bytesrg   r,   r;   
IndexError)r  ordsr  s     @r5   decode_textzPDFMinerParser.decode_textH  s     	211111a 	4ALL$=$= 	4quj(333	CCCCCD77;;;;d;;;;;; 	 	 	q66MMM	s   ,A9 9BBr   r   c                8   ddl m} t          | d          r|                                 } t	          | t
                    r't          t          t          j        |                     S t	          | |          rt          	                    | j
                  S t	          | t          t          f          rt          	                    |           S t	          | t                    r9|                                 D ]"\  }}t                              |          | |<   #| S | S )z
        Recursively resolve the metadata values.

        Args:
            obj: The object to resolve and decode. It can be of any type.

        Returns:
            The resolved and decoded object.
        r   )	PSLiteralresolve)pdfminer.psparserr  hasattrr  rZ   listmapr   resolve_and_decoder  namer,   r  r   re   )r   r  rn   ro   s       r5   r  z!PDFMinerParser.resolve_and_decode^  s    	0/////3	"" 	 ++--Cc4   		N=sCCDDDY'' 	!--ch777c5\** 	!--c222T"" 			 > >1'::1==AJ
r7   r/   Tfpr	   cachingrS   c           	        ddl m}m}m}  ||          } ||||          }i }	|j        D ]}
|	                    |
           |	                                D ]c\  }}	 t                              |          |	|<   $# t          $ r3}t                              d|t          |                     Y d}~\d}~ww xY wt          t          |                    |                              |	d<   |	S )ag  
        Extract metadata from a PDF file.

        Args:
            fp: The file pointer to the PDF file.
            password: The password for the PDF file, if encrypted. Defaults to an empty
                string.
            caching: Whether to cache the PDF structure. Defaults to True.

        Returns:
            Metadata of the PDF file.
        r   )PDFDocumentPDFPage	PDFParser)r   r  zD[WARNING] Metadata key "%s" could not be parsed due to exception: %sNrD   )pdfminer.pdfpager  r  r   infoupdatere   r   r  	Exceptionr   r   r,   r   r  create_pages)r   r  r   r  r  r  r   parserdocrR   r"  rn   ro   es                 r5   _get_metadatazPDFMinerParser._get_metadataz  s>   $ 	EDDDDDDDDD 2k&8WEEEH 	" 	"DOOD!!!!NN$$ 	 	DAq
,??BB    $FF	        #&d7+?+?+D+D&E&E"F"Fs   A==
B:)B55B:rF   r   r   c              #     K   	 ddl }ddlm} ddlm}mmm}m}m	m
 ddlm}m} ddlm}	 t!          |j                  dk     rt%          d          n# t$          $ r t%          d	          w xY w|                                5 }
t)                      5 |	                    |
 j        pd
          } |            }t/          ddd
d                     |
 j        pd
          z            }|j        |d<    G  fdd|          }t5          j                     || || |                                }g }t9          |          D ]\  }}                    d                               d           |                    |                                            }|!                                } j"        dk    rP                    d                               d           tG          |tI          |d|iz                      V  |%                    d          r
|dd         }|&                    |            j"        dk    r: j'        (                    |          }tG          |tI          |                    V  ddd           n# 1 swxY w Y   ddd           dS # 1 swxY w Y   dS )a  
        Lazily parse the blob.
        Insert image, if possible, between two paragraphs.
        In this way, a paragraph can be continued on the next page.

        Args:
            blob: The blob to parse.

        Raises:
            ImportError: If the `pdfminer.six` or `pillow` package is not found.

        Yield:
            An iterator over the parsed documents.
        r   N)PDFLayoutAnalyzer)LAParamsLTContainerLTImageLTItemLTPageLTText	LTTextBox)PDFPageInterpreterPDFResourceManager)r  i:>4zThis parser is tested with pdfminer.six version 20201018 or later. Remove pdfminer, and install pdfminer.six with `pip uninstall pdfminer && pip install pdfminer.six`.zMpdfminer package not found, please install it with `pip install pdfminer.six`r/   r   PDFMinerr   rA   c                  @     e Zd Z	 	 dd fdZdfdZ xZS )*PDFMinerParser.lazy_parse.<locals>.Visitorr1   Nrsrcmgrr4  pagenor\   laparamsOptional[LAParams]r+   Nonec                P    t                                          |||           d S )N)r9  r:  )r   r   )r   r8  r9  r:  r   s       r5   r   z3PDFMinerParser.lazy_parse.<locals>.Visitor.__init__  s*     GG$$WVh$OOOOOr7   ltpager0  c           	     :    d	fd |           d S )Nitemr/  r+   r<  c                   t          |           r| D ]} |           n7t          | 	          r'                    |                                            t          | 
          r                    d           d S t          |           rj        rddlm}  |          }|                    |           }t          j        t                    |z            }d|j
        d<   t          j                            |                    j        }                    t          ||j                             d S d S d S )Nr8   r   )ImageWriterrJ   rA   )rZ   writeget_textr   pdfminer.imagerB  export_imager   	from_pathr   rR   r   r   r   rQ   r   )r@  childrB  image_writerfilenamerF   r   r-  r.  r1  r2  renderr   tempdirtext_ios          r5   rK  zIPDFMinerParser.lazy_parse.<locals>.Visitor.receive_layout.<locals>.render  sx   %dK88 ;)- . . &u.'f55 ;#MM$--//:::%dI66 !#MM$/////'g66 !#1 " F F F F F F/:{7/C/C+7+D+DT+J+J'+~d7mmh6N'O'O:=h 7-1$($6$A$A$$G$G." ."". !+ !($7(,j$:R%& %&!" !" !" !" !"" "" !Dr7   )r@  r/  r+   r<  r2   )
mer>  rK  r-  r.  r1  r2  r   rL  rM  s
     @r5   receive_layoutz9PDFMinerParser.lazy_parse.<locals>.Visitor.receive_layout  sX    ! ! ! ! ! ! ! ! ! ! ! ! !8 F6NNNNNr7   )r1   N)r8  r4  r9  r\   r:  r;  r+   r<  )r>  r0  r+   r<  )r   r   r   r   rO  r   )r   r-  r.  r1  r2  r   rL  rM  s   @r5   Visitorr7    s         #$37	P P P P P P P# # # # # # # # # # # # # # # #r7   rP  )r:  rU   r   rz   r   ))pdfminerpdfminer.converterr+  pdfminer.layoutr,  r-  r.  r/  r0  r1  r2  pdfminer.pdfinterpr3  r4  r!  r  r\   r   r:   r   r   	get_pagesr   rp   r)  rA   r   StringIOr   truncateseekprocess_pager   rk   r   r   r]   endswithr   r   r;   )r   rF   rR  r+  r,  r/  r0  r3  r4  r  r   r   r8  r   rP  visitor_for_allall_contentirU   r   document_contentr-  r.  r1  r2  rL  rM  s   `                    @@@@@@r5   r   zPDFMinerParser.lazy_parse  s     	OOO<<<<<<                  RQQQQQQQ0000008'((833!L   4  	 	 	2  	  O	<1C1E1E O	%%lT]=Pb%QQE((**G*'JPRSS$$\DM<OR$PPQ L &*[L"&# &# &# &# &# &# &# &# &# &# &# &# &#+ &# &# &#P kmmG0088::>>> O K$U++ 1 14  ###Q,,T222"++--#>>++9&&$$Q'''LLOOO"%-!3LFA;4N!O!O      
  ((.. 1#+CRC=&&x0000yH$$#'#7#<#<[#I#I !1/==     YO	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	 O	sC   AA A7K G?J(K (J,	,K /J,	0K  KKF)r   rw   r   rx   r   r   r   r,   r   r   r   r   r   r   )r  r  r+   r,   )r   r   r+   r   )r/   T)r  r	   r   r,   r  rw   r+   rS   r   )r   r   r   r   r   r   r   staticmethodr  r  r)  r   r   r   s   @r5   r   r     s        0 0d $  %:B #'*277;KQ,0:B :B :B :B :B :B :B :Bx    \*    \< 	, , , , ,\y y y y y y y yr7   r   c            	           e Zd ZdZ ej                    Z	 	 d)ddedddddd* fdZd+dZ		 d,d-dZ
d.d$Zd/d&Zd0d'Zd1d(Z xZS )2PyMuPDFParsera  Parse a blob from a PDF using `PyMuPDF` library.

    This class provides methods to parse a blob from a PDF document, supporting various
    configurations such as handling password-protected PDFs, extracting images, and
    defining extraction mode.
    It integrates the 'PyMuPDF' library for PDF processing and offers synchronous blob
    parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pymupdf

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PyMuPDFParser

            parser = PyMuPDFParser(
                # password = None,
                mode = "single",
                pages_delimiter = "
",
                # images_parser = TesseractBlobParser(),
                # extract_tables="markdown",
                # extract_tables_settings=None,
                # text_kwargs=None,
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    NFrU   r4   )r   r   r   r   r   extract_tablesextract_tables_settingstext_kwargsr   r   rw   r   rx   r   r   r   r,   r   r   r   r   rd  /Union[Literal['csv', 'markdown', 'html'], None]re  r+   r<  c               >   t                                                       |dvrt          d          |r|dvrt          d          || _        || _        || _        |pi | _        |r|st                      }|| _        || _	        || _
        || _        |	| _        dS )a  Initialize a parser based on PyMuPDF.

        Args:
            password: Optional password for opening encrypted PDFs.
            mode: The extraction mode, either "single" for the entire document or "page"
                for page-wise extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            extract_images: Whether to extract images from the PDF.
            images_parser: Optional image blob parser.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link, w/ link
                pointing to (`![body)(#)`]
                - "html-img" = wrap the content as the `alt` text of an tag and link to
                (`<img alt="{body}" src="#"/>`)
            extract_tables: Whether to extract tables in a specific format, such as
                "csv", "markdown", or "html".
            extract_tables_settings: Optional dictionary of settings for customizing
                table extraction.

        Returns:
            This method does not directly return data. Use the `parse` or `lazy_parse`
            methods to retrieve parsed documents with content and metadata.

        Raises:
            ValueError: If the mode is not "single" or "page".
            ValueError: If the extract_tables format is not "markdown", "html",
            or "csv".
        r   r   )markdownrO   csvzmode must be markdownN)r   r   rY   r   r   r   rf  r   r   r   r   rd  re  )r   rf  r   r   r   r   r   r   rd  re  r   s             r5   r   zPyMuPDFParser.__init__]  s    V 	))):;;; 	6n4OOO4555	. &," 	1- 	1.00M,#6 *,'>$$$r7   rF   r   r   c                ,    |                      |          S r|   )_lazy_parse)r   rF   s     r5   r   zPyMuPDFParser.lazy_parse  s    
 
 	
r7   c              #    K   	 ddl }|p| j        }| j        sNddlm}m}m}m} i ddddddddd	dd
|ddddd|ddddddd|d|dddddddddddd| _        n# t          $ r t          d          w xY wt          j
        5  |                                5 }|j         |j        |          }	n |j        |d          }	|	j        r|	                    | j                   dddd|                     |	|          z  }
g }|	D ]v}|                     |	||                                          }| j        dk    r+t+          |t-          |
d|j        iz                      V  a|                    |           w| j        d k    r8t+          | j                            |          t-          |
                    V  ddd           n# 1 swxY w Y   ddd           dS # 1 swxY w Y   dS )!a  Lazily parse the blob.
        Insert image, if possible, between two paragraphs.
        In this way, a paragraph can be continued on the next page.

        Args:
            blob: The blob to parse.
            text_kwargs: Optional keyword arguments to pass to the `get_text` method.
                If provided at run time, it will override the default text_kwargs.

        Raises:
            ImportError: If the `pypdf` package is not found.

        Yield:
            An iterator over the parsed documents.
        r   N)DEFAULT_JOIN_TOLERANCEDEFAULT_MIN_WORDS_HORIZONTALDEFAULT_MIN_WORDS_VERTICALDEFAULT_SNAP_TOLERANCEclipvertical_strategylineshorizontal_strategyvertical_lineshorizontal_linessnap_tolerancesnap_x_tolerancesnap_y_tolerancejoin_tolerancejoin_x_tolerancejoin_y_toleranceedge_min_length   min_words_verticalmin_words_horizontalintersection_toleranceintersection_x_toleranceintersection_y_tolerance)text_tolerancetext_x_tolerancetext_y_tolerancestrategy	add_lineszGpymupdf package not found, please install it with `pip install pymupdf`pdf)streamfiletypePyMuPDFr/   r   rU   r   r   )pymupdfrf  re  pymupdf.tablern  ro  rp  rq  r:   rc  _lockr   datar   is_encryptedauthenticater   _extract_metadata_get_page_contentrk   r   r   r]   numberr   r   r;   )r   rF   rf  r  rn  ro  rp  rq  r`   r'  r   full_contentrU   r   s                 r5   rl  zPyMuPDFParser._lazy_parse  s     ,)	NNN%9)9K/             0D0 (0 *7	0
 %d0 '0 %&<0 '0 '0 %&<0 '0 '0 &q0 )*D0 +,H0  -a!0" /#0$ /%0& '(()() $!%/0 0 0,2  	 	 	-  	   	 	!!## y9$&',y11CC&',i%HHHC# 4$$T]333 )($&    **355	 6
  " 
6 
6D#55c4MMSSUUHyF**&)1%7 ,/D D& &       %++H55559(("%)%9%>%>|%L%L!3L!A!A     5              	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	sC   A"A' 'BG0%D'GG0G	G0G	 G00G47G4r'  pymupdf.Documentpymupdf.PagerS   c                    |j         di i | j        |}|                     ||          }|                     |          }g }|r|                    |           |r|                    |           t          ||          }|S )a:  Get the text of the page using PyMuPDF and RapidOCR and issue a warning
        if it is empty.

        Args:
            doc: The PyMuPDF document object.
            page: The PyMuPDF page object.
            blob: The blob being parsed.

        Returns:
            str: The text content of the page.
        r2   )rD  rf  _extract_images_from_page_extract_tables_from_pager   r   )	r   r'  rU   rf  rt   r   tables_from_pagerr   r   s	            r5   r  zPyMuPDFParser._get_page_content   s    " 'MM)LD,<)L)LMM99#tDD99$?? 	,MM*+++ 	,MM*+++)&.AAr7   r   c                    t          i ddd|j        |j        t                    dfdj        D                       }dD ]}|j        v rj        |         ||<   |S )zExtract metadata from the document and page.

        Args:
            doc: The PyMuPDF document object.
            blob: The blob being parsed.

        Returns:
            dict: The extracted metadata.
        r  r/   )rC   rB   rE   rA   r`   rD   c                z    i | ]7}t          j        |         t          t          f          )|j        |         8S r2   )rZ   rR   r,   r\   r3   rn   r'  s     r5   
<dictcomp>z3PyMuPDFParser._extract_metadata.<locals>.<dictcomp>1  sK       !#,q/C:>>s|A  r7   )modDatecreationDate)rp   rA   r   rR   )r   r'  rF   rR   rn   s    `   r5   r  zPyMuPDFParser._extract_metadata  s     # )($&"k!%#&s88     \  
 
" - 	. 	.ACL  !l1or7   c                   | j         sdS ddl}|                                }g }|D ]}| j         r|d         } |j        ||          }t	          j        |j        t          j                                      |j	        |j
        d          }	t          j                    }
t          j        |
|	           t          j        |
                                d          }t%          | j                             |                    j        }|                    t-          ||| j                             t0                              t4                              t9          d|                              S )	a	  Extract images from a PDF page and get the text using images_to_text.

        Args:
            doc: The PyMuPDF document object.
            page: The PyMuPDF page object.

        Returns:
            str: The extracted text from the images on the page.
        r/   r   Nr   rz   application/x-npyr   r   )r   r  
get_imagesPixmapr   r   samplesr   r   r   r   r   r   numpyr   r   r   r   r   r   r   r   rQ   r   r   rH   r   r;   r   )r   r'  rU   r  img_listr)   r=   xrefpiximager   rF   r   s                r5   r  z'PyMuPDFParser._extract_images_from_page=  sg    ! 	2??$$ 	 	C! 1v$gnS$//ckBBBJJJ	2  !jll
;...~((**6I   "$"4"?"?"E"EFFS
'j$:RSS   !''#((f)=)=>> ( 
 
 	
r7   c                   | j         dS ddl}t           |j        j        |fi | j                  }|r| j         dk    r$t                              d |D                       S | j         dk    r$t                              d |D                       S | j         dk    r$t                              d	 |D                       S t          d
| j          d          dS )zExtract tables from a PDF page.

        Args:
            page: The PyMuPDF page object.

        Returns:
            str: The extracted tables in the specified format.
        Nr/   r   ri  c                6    g | ]}|                                 S r2   )to_markdownr3   tables     r5   r6   z;PyMuPDFParser._extract_tables_from_page.<locals>.<listcomp>v  s$    )W)W)W%%*;*;*=*=)W)W)Wr7   rO   c                b    g | ],}|                                                     d d d           -S )F)headerindex	bold_rows)	to_pandasto_htmlr  s     r5   r6   z;PyMuPDFParser._extract_tables_from_page.<locals>.<listcomp>y  sR        " ))11#("'&+ 2    r7   rj  c                `    g | ]+}|                                                     d d           ,S )F)r  r  )r  to_csvr  s     r5   r6   z;PyMuPDFParser._extract_tables_from_page.<locals>.<listcomp>  sO       
 "	 ))00#("' 1    r7   zextract_tables z not implemented)	rd  r  r  r  find_tablesre  _JOIN_TABLESr;   rY   )r   rU   r  tables_lists       r5   r  z'PyMuPDFParser._extract_tables_from_paged  s8    &2%GM%dKKd.JKK
 
  	"j00#(()W)W;)W)W)WXXX$..#((  &1  	 	 	 $--#(( 
 &1     !Kd&9KKK   rr7   r   )rf  r   r   rw   r   rx   r   r   r   r,   r   r   r   r   rd  rg  re  r   r+   r<  r   r|   )rF   r   rf  r   r+   r   )r'  r  rU   r  rf  rS   r+   r,   )r'  r  rF   r   r+   r   )r'  r  rU   r  r+   r,   )rU   r  r+   r,   )r   r   r   r   	threadingLockr  r   r   r   rl  r  r  r  r  r   r   s   @r5   rc  rc  $  s       2 2l INE 15$;?
 #'*077;KQJN<@;? ;? ;? ;? ;? ;? ;? ;?z
 
 
 
 15_ _ _ _ _B   :   @%
 %
 %
 %
N, , , , , , , ,r7   rc  c                  d     e Zd ZdZ ej                    Z	 dddedddd fdZddZ	ddZ
 xZS ) PyPDFium2Parserao  Parse a blob from a PDF using `PyPDFium2` library.

    This class provides methods to parse a blob from a PDF document, supporting various
    configurations such as handling password-protected PDFs, extracting images, and
    defining extraction mode.
    It integrates the 'PyPDFium2' library for PDF processing and offers synchronous
    blob parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pypdfium2

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PyPDFium2Parser

            parser = PyPDFium2Parser(
                # password=None,
                mode="page",
                pages_delimiter="
",
                # extract_images = True,
                # images_to_text = convert_images_to_text_with_tesseract(),
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    FNrU   r4   )r   r   r   r   r   r   rw   r   rx   r   r   r   r,   r   r   r   r   r+   r<  c                   t                                                       |dvrt          d          || _        |r|st	                      }|| _        || _        || _        || _        || _	        dS )uk  Initialize a parser based on PyPDFium2.

        Args:
            password: Optional password for opening encrypted PDFs.
            mode: The extraction mode, either "single" for the entire document or "page"
                for page-wise extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            extract_images: Whether to extract images from the PDF.
            images_parser: Optional image blob parser.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link, w/ link
                pointing to (`![body)(#)`]
                - "html-img" = wrap the content as the `alt` text of an tag and link to
                (`<img alt="{body}" src="#"/>`)
            extraction_mode: “plain” for legacy functionality, “layout” for experimental
                layout mode functionality
            extraction_kwargs: Optional additional parameters for the extraction
                process.

        Returns:
            This method does not directly return data. Use the `parse` or `lazy_parse`
            methods to retrieve parsed documents with content and metadata.

        Raises:
            ValueError: If the mode is not "single" or "page".
        r   r   N)
r   r   rY   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   s          r5   r   zPyPDFium2Parser.__init__  s    L 	))):;;;, 	1- 	1.00M*#6  	.r7   rF   r   r   c              #    K   	 ddl }n# t          $ r t          d          w xY wt          j        5  |                                5 }d}	  |j        || j        d          }g }ddddt          |                                          z  }|j	        |d	<   t          |          |d
<   t          |          D ]\  }}|                                }	d                    |	                                                                          }
|	                                 |                     |          }t%          |g|
                                          }|                                 | j        dk    rA|                    d          s|dz  }t-          |t/          i |d|i                    V  |                    |           | j        dk    r8t-          | j                            |          t/          |                    V  |r|                                 n# |r|                                 w w xY wddd           n# 1 swxY w Y   ddd           dS # 1 swxY w Y   dS )r   r   NzKpypdfium2 package not found, please install it with `pip install pypdfium2`T)r   	autoclose	PyPDFium2r/   r   rA   rD   r8   rU   r   r   )	pypdfium2r:   r  r  r   PdfDocumentr   rp   get_metadata_dictrA   r   r   get_textpager;   get_text_range
splitlinescloser  r   rk   r   r[  r   r]   r   r   )r   rF   r  r`   r   r  r   r   rU   	text_pagert   image_from_pager   s                r5   r   zPyPDFium2Parser.lazy_parse  s     	 	 	 	+  	 " 4	+ 4	+!!## 3+y!
1+!6!6!DMT" " "J $&L %0#.(*$ $ (
(D(D(F(FGG	$HL
 .2[L*25j//L/-6z-B-B : :)T$($5$5$7$7	)-%4466AACC* * ")))*.*H*H*N*N#9,-~$ $%'' ! 

9..#+#4#4T#:#: 1 (D 0"*-5);%&*6%&(.%& %&*" *"# # #     )//9999yH,,&)-)=)B)B<)P)P%7%E%E     
 " +"((*** " +"((****+e3+ 3+ 3+ 3+ 3+ 3+ 3+ 3+ 3+ 3+ 3+ 3+ 3+ 3+ 3+4	+ 4	+ 4	+ 4	+ 4	+ 4	+ 4	+ 4	+ 4	+ 4	+ 4	+ 4	+ 4	+ 4	+ 4	+ 4	+ 4	+ 4	+sV   	 #I(IGH(I(III(I	I(I	I((I,/I,pypdfium2._helpers.page.PdfPagec                   | j         sdS ddlm} t          |                    |j        f                    }|sdS g }|D ]}t          j                    }|                                	                                }|j
        dk     rHt          j        ||                                	                                           t          j        |                                d          }t!          | j                             |                    j        }	|                    t)          ||	| j                             |                                 t.                              t2                              |                    S )	r   r/   r   N)r   r  r  r   r   )r   pypdfium2.rawrawr  get_objectsFPDF_PAGEOBJ_IMAGEr   r   
get_bitmapto_numpysizer  r   r   r   r   r   r   r   r   rQ   r   r  r   rH   r   r;   )
r   rU   pdfium_cr)   
str_imagesr  r   r   rF   text_from_images
             r5   r  z)PyPDFium2Parser._extract_images_from_pageK  so    ! 	2((((((d&&x/J.L&MMNN 	2
 	 	E*,,K''))2244H}q  J{E$4$4$6$6$?$?$A$ABBB>+"6"6"8"8DWXXXD"4#5#@#@#F#FGGTO#D/4;STT   KKMMMM ''<3D3DZ3P3P'QQQr7   r`  )r   rw   r   rx   r   r   r   r,   r   r   r   r   r+   r<  r   )rU   r  r+   r,   )r   r   r   r   r  r  r  r   r   r   r  r   r   s   @r5   r  r    s        0 0h INE  %0/ #'*077;KQ0/ 0/ 0/ 0/ 0/ 0/ 0/ 0/dM+ M+ M+ M+^R R R R R R R Rr7   r  c                  :    e Zd ZdZ	 	 	 dddZddZddZddZdS )PDFPlumberParserzParse `PDF` with `PDFPlumber`.NFrf  Optional[Mapping[str, Any]]deduperw   r   r+   r<  c                z    	 ddl }n# t          $ r t          d          w xY w|pi | _        || _        || _        dS )zInitialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
            dedupe: Avoiding the error of duplicate characters if `dedupe=True`.
        r   NzEpillow package not found, please install it with `pip install pillow`)r   r:   rf  r  r   )r   rf  r  r   r   s        r5   r   zPDFPlumberParser.__init__o  sf    	JJJJ 	 	 	W  	 ',",s    !rF   r   r   c              #      K   ddl }                                5 } |j        |           fdj        D             E d{V  ddd           dS # 1 swxY w Y   dS )Lazily parse the blob.r   Nc                   g | ]}t                              |          d z                       |          z   t          j        j        |j        dz
  t          j                  dfi fdj        D                       S )r8   r1   )rA   r`   rU   rD   c                |    i | ]8}t          j        |                   t          t          fv *|j        |         9S r2   )rf   rR   r,   r\   r  s     r5   r  z:PDFPlumberParser.lazy_parse.<locals>.<listcomp>.<dictcomp>  sH        !#CLO44c
BB s|ABBBr7   r   )	r   _process_page_contentr  r   rA   r   r   r   rR   )r3   rU   rF   r'  r   s     r5   r6   z/PDFPlumberParser.lazy_parse.<locals>.<listcomp>  s       & % !%!;!;D!A!A"44T::"; "&*k)-$($4q$8+.sy>>	     %(\   	    r7   )
pdfplumberr   r   r   )r   rF   r  r`   r'  s   ``  @r5   r   zPDFPlumberParser.lazy_parse  s       	9!*/),,C     &  I'         	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   ,AAArU   pdfplumber.page.Pager,   c                |    | j         r$ |                                j        di | j        S  |j        di | j        S )z)Process the page content based on dedupe.r2   )r  dedupe_charsr   rf  )r   rU   s     r5   r  z&PDFPlumberParser._process_page_content  sQ    ; 	H34$$&&3GGd6FGGG t 444#3444r7   c                f   ddl m} | j        sdS g }|j        D ]}|d         d         j        t
          v r|d         d         dk    r|                    t          j        |	                    d|d         d	         |d         d
         f|d         
                                                              d                               |                    t          j        |d         
                                t          j                                      |d         d
         |d         d	         d                     ,|d         d         j        t          v r/|                    |d         
                                           ut!          j        d           t%          |          S )z8Extract images from page and get the text with RapidOCR.r   r   r/   r  FilterBitsPerComponentr1   1WidthHeightLr   rz   r   )r   r   r   r)   r  r   r   r   r   	frombytesr   convertr   r   r   r   warningswarnr@   )r   rU   r   r)   r=   s        r5   r  z*PDFPlumberParser._extract_images_from_page  s   " 	2; 	5 	5C8}X&+/GGGx=!3499MM!OO #!$Xw!7Xx9P Q #H 6 6 8 8  &gcll     MMc(m&<&<&>&>bhOOOWWM(3S]75KR    
 Xx(-1FFFc(m44667777344440888r7   )NFF)rf  r  r  rw   r   rw   r+   r<  r   )rU   r  r+   r,   )r   r   r   r   r   r   r  r  r2   r7   r5   r  r  l  sz        (( 48$	- - - - -,   :5 5 5 59 9 9 9 9 9r7   r  c                  .    e Zd ZdZ	 	 dddddZddZdS )AmazonTextractPDFParsera  Send `PDF` files to `Amazon Textract` and parse them.

    For parsing multi-page PDFs, they have to reside on S3.

    The AmazonTextractPDFLoader calls the
    [Amazon Textract Service](https://aws.amazon.com/textract/)
    to convert PDFs into a Document structure.
    Single and multi-page documents are supported with up to 3000 pages
    and 512 MB of size.

    For the call to be successful an AWS account is required,
    similar to the
    [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)
    requirements.

    Besides the AWS configuration, it is very similar to the other PDF
    loaders, while also supporting JPEG, PNG and TIFF and non-native
    PDF formats.

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    loader=AmazonTextractPDFLoader("example_data/alejandro_rosalez_sample-small.jpeg")
    documents = loader.load()
    ```

    One feature is the linearization of the output.
    When using the features LAYOUT, FORMS or TABLES together with Textract

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    # you can mix and match each of the features
    loader=AmazonTextractPDFLoader(
        "example_data/alejandro_rosalez_sample-small.jpeg",
        textract_features=["TABLES", "LAYOUT"])
    documents = loader.load()
    ```

    it will generate output that formats the text in reading order and
    try to output the information in a tabular structure or
    output the key/value pairs with a colon (key: value).
    This helps most LLMs to achieve better accuracy when
    processing these texts.

    ``Document`` objects are returned with metadata that includes the ``source`` and
    a 1-based index of the page number in ``page``. Note that ``page`` represents
    the index of the result returned from Textract, not necessarily the as-written
    page number in the document.

    N)linearization_configtextract_featuresOptional[Sequence[int]]clientOptional[Any]r  !Optional[TextLinearizationConfig]r+   r<  c                  	 ddl ddlmc m} | _        || _        |fd|D             | _        ng | _        ||| _        n#| j                            dddd          | _        n# t          $ r t          d	          w xY w|s>	 ddl
}|                    d
          | _        dS # t          $ r t          d          w xY w|| _        dS )a5  Initializes the parser.

        Args:
            textract_features: Features to be used for extraction, each feature
                               should be passed as an int that conforms to the enum
                               `Textract_Features`, see `amazon-textract-caller` pkg
            client: boto3 textract client
            linearization_config: Config to be used for linearization of the output
                                  should be an instance of TextLinearizationConfig from
                                  the `textractor` pkg
        r   Nc                :    g | ]}                     |          S r2   )Textract_Features)r3   ftcs     r5   r6   z4AmazonTextractPDFParser.__init__.<locals>.<listcomp>  s4     * * *01B((++* * *r7   Tz# z## *)hide_figure_layouttitle_prefixsection_header_prefixlist_element_prefixzCould not import amazon-textract-caller or amazon-textract-textractor python package. Please install it with `pip install amazon-textract-caller` & `pip install amazon-textract-textractor`.textractzRCould not import boto3 python package. Please install it with `pip install boto3`.)textractcallertextractor.entities.documententitiesdocumentr   
textractorr  r  r   r:   boto3r  boto3_textract_client)r   r  r  r  r  r  r   s         @r5   r   z AmazonTextractPDFParser.__init__  sb   &	''''=========DG(DO ,* * * *5F* * *&& *,&#/,@)),0O,S,S'+!%*/(+	 -T - -)  	 	 	<  	  	0-2\\*-E-E***   !B   *0D&&&s   A%A) )B	B) )CrF   r   r   c              #  l  K   |j         r!t          t          |j                             nd}|rL|j        dk    rA|j        r:| j                            t          |j                   | j        | j                  }nI| j                            |	                                | j        | j        j
        j        | j                  }| j        j                            |          }t          |j                  D ]<\  }}t          |                    | j                  |j        |dz   d          V  =dS )	zIterates over the Blob pages and returns an Iterator with a Document
        for each page, like the other parsers If multi-page document, blob.path
        has to be set to the S3 URI and for single page docs
        the blob.data is taken
        Ns3)input_documentfeaturesr  )r  r  	call_moder  )configr1   rA   rU   r   )pathr   r,   schemenetlocr   call_textractr  r  as_bytesTextract_Call_Mode
FORCE_SYNCr  r   r   r   r   rD  r  rA   )r   rF   url_parse_resulttextract_response_jsonr
  idxrU   s          r5   r   z"AmazonTextractPDFParser.lazy_parse<  sT      8<yJ8C	NN333d 	 '4// ' 0 &*W%:%:"49~~/&*&@ &; & &"" &*W%:%:#}}/'4?&*&@	 &; & &" ?+001GHH"8>22 	 	IC!]]$2K]LL$(KqAA      	 	r7   )NN)r  r  r  r  r  r  r+   r<  r   )r   r   r   r   r   r   r2   r7   r5   r  r    sf        0 0h 6: $=0
 CG=0 =0 =0 =0 =0 =0~! ! ! ! ! !r7   r  c                  *    e Zd ZdZddZddZddZdS )DocumentIntelligenceParserzjLoads a PDF with Azure Document Intelligence
    (formerly Form Recognizer) and chunks at character level.r  r   modelr,   c                J    t          j        d           || _        || _        d S )Na<  langchain_community.document_loaders.parsers.pdf.DocumentIntelligenceParserand langchain_community.document_loaders.pdf.DocumentIntelligenceLoader are deprecated. Please upgrade to langchain_community.document_loaders.DocumentIntelligenceLoader for any file parsing purpose using Azure Document Intelligence service.)r  r  r  r!  )r   r  r!  s      r5   r   z#DocumentIntelligenceParser.__init__d  s/    	
 	
 	
 


r7   rF   r   r>   r+   r   c              #     K   |j         D ]H}d                    d |j        D                       }t          ||j        |j        d          }|V  Id S )N c                    g | ]	}|j         
S r2   )rG   )r3   lines     r5   r6   z=DocumentIntelligenceParser._generate_docs.<locals>.<listcomp>r  s    AAAAAAr7   r  r   )r   r;   rt  r   rA   r   )r   rF   r>   prG   ds         r5   _generate_docsz)DocumentIntelligenceParser._generate_docsp  s{       
	 
	AhhAAAAABBG$"kM   A GGGG
	 
	r7   c              #    K   |                                 5 }| j                            | j        |          }|                                }|                     ||          }|E d{V  ddd           dS # 1 swxY w Y   dS )r  N)r   r  begin_analyze_documentr!  r>   r)  )r   rF   file_objpollerr>   docss         r5   r   z%DocumentIntelligenceParser.lazy_parse}  s        	8[77
HMMF]]__F&&tV44DOOOOOOO	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   AA77A;>A;N)r  r   r!  r,   )rF   r   r>   r   r+   r   r   )r   r   r   r   r   r)  r   r2   r7   r5   r   r   `  s\        A A
 
 
 
   	 	 	 	 	 	r7   r   )r)   r*   r+   r,   )rF   r   rG   r,   rH   r,   r+   r,   )rR   rS   r+   rS   )rr   rs   rt   r,   r+   r,   )Cr   
__future__r   rO   r   loggingr  r  r   pathlibr   tempfiler   typingr   r   r	   r
   r   r   r   r   r   r   r   urllib.parser   r  r   langchain_core.documentsr   )langchain_community.document_loaders.baser   1langchain_community.document_loaders.blob_loadersr   3langchain_community.document_loaders.parsers.imagesr   r   r  r  r   r  )textractor.data.text_linearization_configr   r   r   r@   	getLoggerr   r   r   r   r  r   rV   rQ   r]   rp   r   r   r   r   rc  r  r  r  r   r2   r7   r5   <module>r;     s   . . " " " " " "  				                   ' ' ' ' ' '                          " ! ! ! ! !      - - - - - - D D D D D D B B B B B B       
  RNNNLLLQQQQQQ999    "   > 
	8	$	$* ! UUU    ,   &# # # #N 
 2 2 2 2ja
 a
 a
 a
 a
. a
 a
 a
HJ J J J J^ J J JZ
l l l l lN l l l^VR VR VR VR VRn VR VR VRr[9 [9 [9 [9 [9~ [9 [9 [9|S S S S Sn S S Sl& & & & & & & & & &r7   