
    hh&                        d Z ddlZddlZddlmZmZ ddlmZmZm	Z	m
Z
mZmZmZ ddlmZ ddlmZ erddlmZ ddlmZ dd	lmZ dd
lmZmZmZ  ej        e          Z G d de          Z G d de          Z G d de          ZdS )zQLoader that uses Playwright to load a page, then uses unstructured to parse html.    N)ABCabstractmethod)TYPE_CHECKINGAsyncIteratorDictIteratorListOptionalUnion)Document)
BaseLoader)Browser)Page)Response)r   r   r   c            	       b    e Zd ZdZedddddddefd	            Zedd
dddddefd            ZdS )PlaywrightEvaluatorzAbstract base class for all evaluators.

    Each evaluator should take a page, a browser instance, and a response
    object, process the page as necessary, and return the resulting text.
    pager   browserr   responser   returnc                     dS )a  Synchronously process the page and return the resulting text.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            text: The text content of the page.
        N selfr   r   r   s       p/var/www/FlaskApp/flask-venv/lib/python3.11/site-packages/langchain_community/document_loaders/url_playwright.pyevaluatezPlaywrightEvaluator.evaluate   s	     	    	AsyncPageAsyncBrowserAsyncResponsec                 
   K   dS )a  Asynchronously process the page and return the resulting text.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            text: The text content of the page.
        Nr   r   s       r   evaluate_asyncz"PlaywrightEvaluator.evaluate_async+   s       	r   N)__name__
__module____qualname____doc__r   strr   r"   r   r   r   r   r      s          V i : RU    ^ *8DS	   ^  r   r   c                   h    e Zd ZdZddeee                  fdZddddd	d
defdZddddd	ddefdZ	dS )UnstructuredHtmlEvaluatorz@Evaluate the page HTML content using the `unstructured` library.Nremove_selectorsc                 Z    	 ddl }n# t          $ r t          d          w xY w|| _        dS )z%Initialize UnstructuredHtmlEvaluator.r   NzQunstructured package not found, please install it with `pip install unstructured`)unstructuredImportErrorr*   )r   r*   r,   s      r   __init__z"UnstructuredHtmlEvaluator.__init__?   sX    	 	 	 	-  	 !1    !r   r   r   r   r   r   r   c                 N   ddl m} | j        pg D ]W}|                    |                                          }|D ]+}|                                r|                    d           ,X|                                } ||          }d                    d |D                       S )z3Synchronously process the HTML content of the page.r   partition_htmlelement => element.remove()text

c                 ,    g | ]}t          |          S r   r'   .0els     r   
<listcomp>z6UnstructuredHtmlEvaluator.evaluate.<locals>.<listcomp>W       777CGG777r   	unstructured.partition.htmlr2   r*   locatorall
is_visibler   contentjoin	r   r   r   r   r2   selectorelementselementpage_sources	            r   r   z"UnstructuredHtmlEvaluator.evaluateK   s    >>>>>>-3 	D 	DH||H--1133H# D D%%'' D$$%BCCCD llnn!>{333{{77h777888r   r   r   r    c                   K   ddl m} | j        pg D ]i}|                    |                                           d{V }|D ]7}|                                 d{V r|                    d           d{V  8j|                                 d{V } ||          }d                    d |D                       S )z4Asynchronously process the HTML content of the page.r   r1   Nr3   r4   r6   c                 ,    g | ]}t          |          S r   r8   r9   s     r   r<   z<UnstructuredHtmlEvaluator.evaluate_async.<locals>.<listcomp>g   r=   r   r>   rE   s	            r   r"   z(UnstructuredHtmlEvaluator.evaluate_asyncY   s      	?>>>>>-3 	J 	JH!\\(337799999999H# J J ++-------- J!**+HIIIIIIIIIJ !LLNN******!>{333{{77h777888r   N)
r#   r$   r%   r&   r
   r	   r'   r.   r   r"   r   r   r   r)   r)   <   s        JJ
1 
1$s))< 
1 
1 
1 
19V 9i 9: 9RU 9 9 9 999*89DS9	9 9 9 9 9 9r   r)   c                   
   e Zd ZdZ	 	 	 	 	 	 ddee         dededeee                  dee         d	ee	eef                  d
ee
eej        e         f                  fdZdee         fdZdee         fdZdee         fdZdS )PlaywrightURLLoaderad  Load `HTML` pages with `Playwright` and parse with `Unstructured`.

    This is useful for loading pages that require javascript to render.

    Attributes:
        urls (List[str]): List of URLs to load.
        continue_on_failure (bool): If True, continue loading other URLs on failure.
        headless (bool): If True, the browser will run in headless mode.
        proxy (Optional[Dict[str, str]]): If set, the browser will access URLs
            through the specified proxy.
        browser_session (Optional[Union[str, os.PathLike[str]]]): Path to a file with
            browser session data that can be used to restore the browser session.

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import PlaywrightURLLoader

            urls = ["https://api.ipify.org/?format=json",]
            proxy={
                "server": "https://xx.xx.xx:15818", # https://<host>:<port>
                "username": "username",
                "password": "password"
            }
            loader = PlaywrightURLLoader(urls, proxy=proxy)
            data = loader.load()
    TNurlscontinue_on_failureheadlessr*   	evaluatorproxybrowser_sessionc                     	 ddl }n# t          $ r t          d          w xY w|| _        || _        || _        || _        || _        |r|rt          d          |pt          |          | _	        dS )z%Load a list of URLs using Playwright.r   NzMplaywright package not found, please install it with `pip install playwright`z:`remove_selectors` and `evaluator` cannot be both not None)

playwrightr-   rO   rP   rQ   rS   rT   
ValueErrorr)   rR   )	r   rO   rP   rQ   r*   rR   rS   rT   rV   s	            r   r.   zPlaywrightURLLoader.__init__   s    	 	 	 	+  	 	#6  
. 		 	L  
 #Q&?@P&Q&Qr/   r   c           	   #     K   ddl m}  |            5 }|j                            | j        | j                  }d}| j        rbt          j        	                    | j                  r|
                    | j                  }n"t                              d| j                    ||
                                }| j        D ]}	 |                                }|                    |          }|t!          d|           |                    d           | j                            |||          }|                                 d	|i}	t+          ||	
          V  # t,          $ r4}
| j        r!t                              d| d|
            n|
Y d}
~
d}
~
ww xY w|                                 ddd           dS # 1 swxY w Y   dS )zLoad the specified URLs using Playwright and create Document instances.

        Returns:
            A list of Document instances with loaded content.
        r   )sync_playwrightrQ   rS   Nstorage_stateSession file not found: "page.goto() returned None for url loadsourcepage_contentmetadataError fetching or processing , exception: )playwright.sync_apirY   chromiumlaunchrQ   rS   rT   ospathexistsnew_contextloggerwarningrO   new_pagegotorW   wait_for_load_staterR   r   closer   	ExceptionrP   error)r   rY   pr   contexturlr   r   r5   rc   es              r   	lazy_loadzPlaywrightURLLoader.lazy_load   s[      	877777_ !	!j''dj'QQGG# V7>>$"677 V%11@T1UUGGNN#Td>R#T#TUUU!--//y     "++--D#yy~~H'()Sc)S)STTT,,V444>224(KKDJJLLL (#H"xHHHHHHH       /  QCQQaQQ     	      MMOOOC!	 !	 !	 !	 !	 !	 !	 !	 !	 !	 !	 !	 !	 !	 !	 !	 !	 !	s=   B1GBEG
F)*FGFGGGc                 N   K   d |                                  2              d{V S )Load the specified URLs with Playwright and create Documents asynchronously.
        Use this function when in a jupyter notebook environment.

        Returns:
            A list of Document instances with loaded content.
        c                 "   K   g | 3 d {V }|
6 S rL   r   )r:   docs     r   r<   z-PlaywrightURLLoader.aload.<locals>.<listcomp>   s.      77777777c7777s   N)
alazy_load)r   s    r   aloadzPlaywrightURLLoader.aload   s8       87T__%6%67777777777r   c           	     .  K   ddl m}  |            4 d{V }|j                            | j        | j                   d{V }d}| j        rht          j        	                    | j                  r"|
                    | j                   d{V }n"t                              d| j                    ||
                                 d{V }| j        D ]}	 |                                 d{V }|                    |           d{V }|t!          d|           |                    d           d{V  | j                            |||           d{V }|                                 d{V  d	|i}	t+          ||	
          W V  # t,          $ r4}
| j        r!t                              d| d|
            n|
Y d}
~
d}
~
ww xY w|                                 d{V  ddd          d{V  dS # 1 d{V swxY w Y   dS )r{   r   )async_playwrightNrZ   r[   r]   r^   r_   r`   ra   rd   re   )playwright.async_apir   rg   rh   rQ   rS   rT   ri   rj   rk   rl   rm   rn   rO   ro   rp   rW   rq   rR   r"   rr   r   rs   rP   rt   )r   r   ru   r   rv   rw   r   r   r5   rc   rx   s              r   r~   zPlaywrightURLLoader.alazy_load   sr      	:99999##%% #	" #	" #	" #	" #	" #	" #	"J--t}DJ-WWWWWWWWGG# V7>>$"677 V$+$7$7&*&: %8 % %      GG NN#Td>R#T#TUUU ' 3 3 5 5555555y     !(!1!1!3!3333333D%)YYs^^333333H'()Sc)S)STTT226:::::::::!%!>!>tWh!W!WWWWWWWD**,,&&&&&&& (#H"xHHHHHHHH       /  QCQQaQQ     	      --//!!!!!!!G#	" #	" #	" #	" #	" #	" #	" #	" #	" #	" #	" #	" #	" #	" #	" #	" #	" #	" #	" #	" #	" #	" #	" #	" #	" #	" #	" #	" #	" #	"s=   CHB8FH
G *G
HGH
HH)TTNNNN)r#   r$   r%   r&   r	   r'   boolr
   r   r   r   ri   PathLiker.   r   r   ry   r   r   r~   r   r   r   rN   rN   j   s2        > %)0437*.BFR R3iR "R 	R
 #49-R /0R S#X'R "%R[-=(=">?R R R RB)8H- ) ) ) )V8T(^ 8 8 8 8,"-"9 ," ," ," ," ," ,"r   rN   ) r&   loggingri   abcr   r   typingr   r   r   r   r	   r
   r   langchain_core.documentsr   )langchain_community.document_loaders.baser   r   r   r   r   r   r   r    rf   	getLoggerr#   rm   r   r)   rN   r   r   r   <module>r      s   W W  				 # # # # # # # # V V V V V V V V V V V V V V V V V V - - - - - - @ @ @ @ @ @ <<<<<<<666666>>>>>>;;;;;;;;;; 
	8	$	$# # # # ## # # #L+9 +9 +9 +9 +9 3 +9 +9 +9\^" ^" ^" ^" ^"* ^" ^" ^" ^" ^"r   