
    hh                         d Z ddlZddlZddlmZ ddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlZddlmZ ddlmZ dd	lmZmZ  G d
 dee          Z G d de          ZdS )zLoads word documents.    N)ABC)Path)AnyListUnion)urlparse)Document)
BaseLoader)UnstructuredFileLoadervalidate_unstructured_versionc                   n    e Zd ZdZdeeef         fdZd
dZde	e
         fdZededefd	            ZdS )Docx2txtLoaderzLoad `DOCX` file using `docx2txt` and chunks at character level.

    Defaults to check for local file, but if the file is a web path, it will download it
    to a temporary file, and use that, then clean up the temporary file after completion
    	file_pathc                    t          |          | _        | j        | _        d| j        v r)t          j                            | j                  | _        t          j                            | j                  s|                     | j                  rt          j	        | j                  }|j
        dk    rt          d|j
        z            | j        | _        t          j                    | _        | j                            |j                   | j        j        | _        dS t          j                            | j                  st          d| j        z            dS )zInitialize with file path.~   z3Check the url of your file; returned status code %sz'File path %s is not a valid file or urlN)strr   original_file_pathospath
expanduserisfile_is_valid_urlrequestsgetstatus_code
ValueErrorweb_pathtempfileNamedTemporaryFile	temp_filewritecontentname)selfr   rs      o/var/www/FlaskApp/flask-venv/lib/python3.11/site-packages/langchain_community/document_loaders/word_document.py__init__zDocx2txtLoader.__init__   s-   Y"&.$.  W//??DN w~~dn-- 	Y$2D2DT^2T2T 	YT^,,A}## Im$  
 !NDM%8::DNN  +++!^0DNNN// 	YFWXXX	Y 	Y    returnNc                 \    t          | d          r| j                                         d S d S )Nr!   )hasattrr!   close)r%   s    r'   __del__zDocx2txtLoader.__del__3   s8    4%% 	#N  """""	# 	#r)   c                 l    ddl }t          |                    | j                  d| j        i          gS )zLoad given path as single page.r   Nsource)page_contentmetadata)docx2txtr	   processr   r   )r%   r3   s     r'   loadzDocx2txtLoader.load7   sG     %--dn=="D$;<  
 	
r)   urlc                 p    t          |           }t          |j                  ot          |j                  S )zCheck if the url is valid.)r   boolnetlocscheme)r6   parseds     r'   r   zDocx2txtLoader._is_valid_urlB   s.     #FM"":tFM':'::r)   )r*   N)__name__
__module____qualname____doc__r   r   r   r(   r.   r   r	   r5   staticmethodr8   r    r)   r'   r   r      s         Y%T	"2 Y Y Y Y0# # # #	
d8n 	
 	
 	
 	
 ;3 ;4 ; ; ; \; ; ;r)   r   c                   P     e Zd ZdZ	 d	deeef         dedef fdZde	fdZ
 xZS )
UnstructuredWordDocumentLoadera_  Load `Microsoft Word` file using `Unstructured`.

    Works with both .docx and .doc files.
    You can run the loader in one of two modes: "single" and "elements".
    If you use "single" mode, the document will be returned as a single
    langchain Document object. If you use "elements" mode, the unstructured
    library will split the document into elements such as Title and NarrativeText.
    You can pass in additional unstructured kwargs after mode to apply
    different unstructured settings.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredWordDocumentLoader

    loader = UnstructuredWordDocumentLoader(
        "example.docx", mode="elements", strategy="fast",
    )
    docs = loader.load()

    References
    ----------
    https://unstructured-io.github.io/unstructured/bricks.html#partition-docx
    singler   modeunstructured_kwargsc                 ^    t          |          } t                      j        d||d| dS )a&  

        Args:
            file_path: The path to the Word file to load.
            mode: The mode to use when loading the file. Can be one of "single",
                "multi", or "all". Default is "single".
            **unstructured_kwargs: Any kwargs to pass to the unstructured.
        )r   rE   NrA   )r   superr(   )r%   r   rE   rF   	__class__s       r'   r(   z'UnstructuredWordDocumentLoader.__init__b   s;     	NN	O94OO;NOOOOOr)   r*   c                 r   ddl m}m} 	 dd l} || j                  |j        k    }nJ# t          $ r= t          j        	                    t          | j                            \  }}|dk    }Y nw xY w|rt          d           |rddlm}  |dd| j        i| j        S ddlm}  |dd| j        i| j        S )	Nr   )FileTypedetect_filetypez.docz0.4.11)partition_docfilename)partition_docxrA   ) unstructured.file_utils.filetyperK   rL   magicr   DOCImportErrorr   r   splitextr   r   unstructured.partition.docrM   rF   unstructured.partition.docxrO   )	r%   rK   rL   rQ   is_doc_	extensionrM   rO   s	            r'   _get_elementsz,UnstructuredWordDocumentLoader._get_elementss   s   NNNNNNNN
	)LLL$_T^44DFF 	) 	) 	)7++C,?,?@@LAy&(FFF	)  	4)(333 	W@@@@@@ =UU$.UD<TUUUBBBBBB!>VV4>VT=UVVVs   ( AA/.A/)rD   )r<   r=   r>   r?   r   r   r   r   r(   r   rZ   __classcell__)rI   s   @r'   rC   rC   I   s         6 P Pd#P P  #	P P P P P P"Wt W W W W W W W Wr)   rC   )r?   r   r   abcr   pathlibr   typingr   r   r   urllib.parser   r   langchain_core.documentsr	   )langchain_community.document_loaders.baser
   1langchain_community.document_loaders.unstructuredr   r   r   rC   rA   r)   r'   <module>rc      s<     				              # # # # # # # # # # ! ! ! ! ! !  - - - - - - @ @ @ @ @ @       2; 2; 2; 2; 2;Z 2; 2; 2;jBW BW BW BW BW%; BW BW BW BW BWr)   