
    hhR                    D   d dl mZ d dlZd dlZd dlZd dlZd dlmZmZm	Z	m
Z
mZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ  ej        e          ZddZ G d de          Zeeegef         Zeeeeej        ej         f         gef         Z!eee!f         Z"ddZ#dS )    )annotationsN)CallableIteratorListOptionalSequenceSetUnioncast)Document)extract_sub_links)
BaseLoaderraw_htmlstrurlresponse0Union[requests.Response, aiohttp.ClientResponse]returndictc                   t          |d                              dd          }||d}	 ddlm} n,# t          $ r t
                              d           |cY S w xY w || d          }|                    d	          x}r|                                |d	<   |                    d
ddi          x}r|                    dd          |d<   |                    d          x}	r|	                    dd          |d<   |S )z3Extract metadata from raw html using BeautifulSoup.headerszContent-Type )sourcecontent_typer   )BeautifulSoupztThe bs4 package is required for default metadata extraction. Please install it with `pip install -U beautifulsoup4`.zhtml.parsertitlemetanamedescription)attrscontentNhtmllanglanguage)	getattrgetbs4r   ImportErrorloggerwarningfindget_text)
r   r   r   r   metadatar   soupr   r   r"   s
             v/var/www/FlaskApp/flask-venv/lib/python3.11/site-packages/langchain_community/document_loaders/recursive_url_loader.py_metadata_extractorr0      s=    8Y//33NBGGL|<<H%%%%%%%   F	
 	
 	
  ==11D		'"""u -!NN,,iiv}.EiFFF{ C"-//)T"B"Byy   t 6#xx55Os   2 &AAc                  d    e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 d4dddddd5d%Zd&d'd6d-Zdd&d.d7d2Zd8d3ZdS )9RecursiveUrlLoaderaY  Recursively load all child links from a root URL.

    **Security Note**:
        This loader is a crawler that will start crawling
        at a given URL and then expand to crawl child links recursively.

        Web crawlers should generally NOT be deployed with network access
        to any internal servers.

        Control access to who can submit crawling requests and what network access
        the crawler has.

        While crawling, the crawler may encounter malicious URLs that would lead to a
        server-side request forgery (SSRF) attack.

        To mitigate risks, the crawler by default will only load URLs from the same
        domain as the start URL (controlled via prevent_outside named argument).

        This will mitigate the risk of SSRF attacks, but will not eliminate it.

        For example, if crawling a host which hosts several sites:

        https://some_host/alice_site/
        https://some_host/bob_site/

        A malicious URL on Alice's site could cause the crawler to make a malicious
        GET request to an endpoint on Bob's site. Both sites are hosted on the
        same host, so such a request would not be prevented by default.

        See https://python.langchain.com/docs/security/
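
        For example, to keep a crawl pinned to one site's prefix on a shared host,
        you can combine the documented ``prevent_outside`` and ``base_url``
        parameters. A minimal sketch (the host and path below are illustrative):

        .. code-block:: python

            loader = RecursiveUrlLoader(
                "https://some_host/alice_site/",
                prevent_outside=True,
                base_url="https://some_host/alice_site/",
            )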

    Setup:

        This class has no required additional dependencies. You can optionally install
        ``beautifulsoup4`` for richer default metadata extraction:

        .. code-block:: bash

            pip install -U beautifulsoup4

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import RecursiveUrlLoader

            loader = RecursiveUrlLoader(
                "https://docs.python.org/3.9/",
                # max_depth=2,
                # use_async=False,
                # extractor=None,
                # metadata_extractor=None,
                # exclude_dirs=(),
                # timeout=10,
                # check_response_status=True,
                # continue_on_failure=True,
                # prevent_outside=True,
                # base_url=None,
                # ...
            )

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            # async variant (``alazy_load`` returns an async iterator):
            # async for doc in loader.alazy_load():
            #     docs.append(doc)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            <!DOCTYPE html>

            <html xmlns="http://www.w3.org/1999/xhtml">
            <head>
                <meta charset="utf-8" /><
            {'source': 'https://docs.python.org/3.9/', 'content_type': 'text/html', 'title': '3.9.19 Documentation', 'language': None}

    Async load:
        .. code-block:: python

            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            <!DOCTYPE html>

            <html xmlns="http://www.w3.org/1999/xhtml">
            <head>
                <meta charset="utf-8" /><
            {'source': 'https://docs.python.org/3.9/', 'content_type': 'text/html', 'title': '3.9.19 Documentation', 'language': None}

    Content parsing / extraction:
        By default the loader sets the raw HTML from each link as the Document page
        content. To parse this HTML into a more human/LLM-friendly format you can pass
        in a custom ``extractor`` method:

        .. code-block:: python

            # This example uses `beautifulsoup4` and `lxml`
            import re
            from bs4 import BeautifulSoup

            def bs4_extractor(html: str) -> str:
                soup = BeautifulSoup(html, "lxml")
                return re.sub(r"\n\n+", "\n\n", soup.text).strip()

            loader = RecursiveUrlLoader(
                "https://docs.python.org/3.9/",
                extractor=bs4_extractor,
            )
            print(loader.load()[0].page_content[:200])


        .. code-block:: python

            3.9.19 Documentation

            Download
            Download these documents
            Docs by version

            Python 3.13 (in development)
            Python 3.12 (stable)
            Python 3.11 (security-fixes)
            Python 3.10 (security-fixes)
            Python 3.9 (securit
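
        Note that when the ``extractor`` returns an empty string for a page, that page
        is skipped and no Document is emitted for it. A minimal sketch that uses this
        behavior to drop responses without an ``<html>`` tag (the helper name is
        illustrative):

        .. code-block:: python

            def html_only_extractor(raw_html: str) -> str:
                # Returning "" tells the loader to ignore this document.
                return raw_html if "<html" in raw_html.lower() else ""

            loader = RecursiveUrlLoader(
                "https://docs.python.org/3.9/",
                extractor=html_only_extractor,
            )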

    Metadata extraction:
        Similarly to content extraction, you can specify a metadata extraction function
        to customize how Document metadata is extracted from the HTTP response.

        .. code-block:: python

            import aiohttp
            import requests
            from typing import Union

            def simple_metadata_extractor(
                raw_html: str, url: str, response: Union[requests.Response, aiohttp.ClientResponse]
            ) -> dict:
                content_type = getattr(response, "headers").get("Content-Type", "")
                return {"source": url, "content_type": content_type}

            loader = RecursiveUrlLoader(
                "https://docs.python.org/3.9/",
                metadata_extractor=simple_metadata_extractor,
            )
            loader.load()[0].metadata

        .. code-block:: python

            {'source': 'https://docs.python.org/3.9/', 'content_type': 'text/html'}
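
        A two-argument extractor that takes only ``raw_html`` and ``url`` is also
        accepted; the loader wraps it internally and the response object is simply
        ignored. A minimal sketch (the function name is illustrative):

        .. code-block:: python

            def two_arg_metadata_extractor(raw_html: str, url: str) -> dict:
                return {"source": url, "html_length": len(raw_html)}

            loader = RecursiveUrlLoader(
                "https://docs.python.org/3.9/",
                metadata_extractor=two_arg_metadata_extractor,
            )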

    Filtering URLs:
        You may not always want to pull every URL from a website. There are four parameters
        that allow us to control what URLs we pull recursively. First, we can set the
        ``prevent_outside`` parameter to prevent URLs outside of the ``base_url`` from
        being pulled. Note that the ``base_url`` does not need to be the same as the URL we
        pass in, as shown below. We can also use ``link_regex`` and ``exclude_dirs`` to be
        more specific with the URLs that we select. In this example, we only pull pages
        from the Python docs whose URLs contain the string "index" and that are not
        located in the FAQ section of the site.

        .. code-block:: python

            loader = RecursiveUrlLoader(
                "https://docs.python.org/3.9/",
                prevent_outside=True,
                base_url="https://docs.python.org",
                link_regex=r'<a\s+(?:[^>]*?\s+)?href="([^"]*(?=index)[^"]*)"',
                exclude_dirs=['https://docs.python.org/3.9/faq']
            )
            docs = loader.load()
            [doc.metadata["source"] for doc in docs]

        .. code-block:: python

            ['https://docs.python.org/3.9/',
            'https://docs.python.org/3.9/py-modindex.html',
            'https://docs.python.org/3.9/genindex.html',
            'https://docs.python.org/3.9/tutorial/index.html',
            'https://docs.python.org/3.9/using/index.html',
            'https://docs.python.org/3.9/extending/index.html',
            'https://docs.python.org/3.9/installing/index.html',
            'https://docs.python.org/3.9/library/index.html',
            'https://docs.python.org/3.9/c-api/index.html',
            'https://docs.python.org/3.9/howto/index.html',
            'https://docs.python.org/3.9/distributing/index.html',
            'https://docs.python.org/3.9/reference/index.html',
            'https://docs.python.org/3.9/whatsnew/index.html']

    """

    def __init__(
        self,
        url: str,
        max_depth: Optional[int] = 2,
        use_async: Optional[bool] = None,
        extractor: Optional[Callable[[str], str]] = None,
        metadata_extractor: Optional[_MetadataExtractorType] = None,
        exclude_dirs: Optional[Sequence[str]] = (),
        timeout: Optional[int] = 10,
        prevent_outside: bool = True,
        link_regex: Union[str, re.Pattern, None] = None,
        headers: Optional[dict] = None,
        check_response_status: bool = False,
        continue_on_failure: bool = True,
        *,
        base_url: Optional[str] = None,
        autoset_encoding: bool = True,
        encoding: Optional[str] = None,
        proxies: Optional[dict] = None,
    ) -> None:
        """Initialize with URL to crawl and any subdirectories to exclude.

        Args:
            url: The URL to crawl.
            max_depth: The max depth of the recursive loading.
            use_async: Whether to use asynchronous loading.
                If True, lazy_load function will not be lazy, but it will still work in the
                expected way, just not lazy.
            extractor: A function to extract document contents from raw HTML.
                When extract function returns an empty string, the document is
                ignored. Default returns the raw HTML.
            metadata_extractor: A function to extract metadata from args: raw HTML, the
                source url, and the requests.Response/aiohttp.ClientResponse object
                (args in that order).
                Default extractor will attempt to use BeautifulSoup4 to extract the
                title, description and language of the page.
                .. code-block:: python

                    import requests
                    import aiohttp

                    def simple_metadata_extractor(
                        raw_html: str, url: str, response: Union[requests.Response, aiohttp.ClientResponse]
                    ) -> dict:
                        content_type = getattr(response, "headers").get("Content-Type", "")
                        return {"source": url, "content_type": content_type}

            exclude_dirs: A list of subdirectories to exclude.
            timeout: The timeout for the requests, in seconds. If None, the
                connection will not time out.
            prevent_outside: If True, prevent loading from urls which are not children
                of the root url.
            link_regex: Regex for extracting sub-links from the raw html of a web page.
            headers: Default request headers to use for all requests.
            check_response_status: If True, check HTTP response status and skip
                URLs with error responses (400-599).
            continue_on_failure: If True, continue if getting or parsing a link raises
                an exception. Otherwise, raise the exception.
            base_url: The base url to check for outside links against.
            autoset_encoding: Whether to automatically set the encoding of the response.
                If True, the encoding of the response will be set to the apparent
                encoding, unless the `encoding` argument has already been explicitly set.
            encoding: The encoding of the response. If manually set, the encoding will be
                set to given value, regardless of the `autoset_encoding` argument.
            proxies: A dictionary mapping protocol names to the proxy URLs to be used for requests.
                This allows the crawler to route its requests through specified proxy servers.
                If None, no proxies will be used and requests will go directly to the target URL.
                Example usage:
                .. code-block:: python

                    proxies = {
                        "http": "http://10.10.1.10:3128",
                        "https": "https://10.10.1.10:1080",
                    }
        """
        self.url = url
        self.max_depth = max_depth if max_depth is not None else 2
        self.use_async = use_async if use_async is not None else False
        self.extractor = extractor if extractor is not None else lambda x: x
        metadata_extractor = (
            metadata_extractor
            if metadata_extractor is not None
            else _metadata_extractor
        )
        self.autoset_encoding = autoset_encoding
        self.encoding = encoding
        self.metadata_extractor = _wrap_metadata_extractor(metadata_extractor)
        self.exclude_dirs = exclude_dirs if exclude_dirs is not None else ()

        if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
            raise ValueError(
                f"Base url is included in exclude_dirs. Received base_url: {url} and "
                f"exclude_dirs: {self.exclude_dirs}"
            )

        self.timeout = timeout
        self.prevent_outside = prevent_outside if prevent_outside is not None else True
        self.link_regex = link_regex
        self.headers = headers
        self.check_response_status = check_response_status
        self.continue_on_failure = continue_on_failure
        self.base_url = base_url if base_url is not None else url
        self.proxies = proxies

    def _get_child_links_recursive(
        self, url: str, visited: Set[str], *, depth: int = 0
    ) -> Iterator[Document]:
        """Recursively get all child links starting with the path of the input URL.

        Args:
            url: The URL to crawl.
            visited: A set of visited URLs.
            depth: Current depth of recursion. Stop when depth >= max_depth.
        """
        if depth >= self.max_depth:
            return

        # Add the current URL to visited links to avoid revisiting it.
        visited.add(url)

        # Fetch the page and collect all links reachable from the current URL.
        try:
            response = requests.get(
                url,
                timeout=self.timeout,
                headers=self.headers,
                proxies=self.proxies,
            )

            if self.encoding is not None:
                response.encoding = self.encoding
            elif self.autoset_encoding:
                response.encoding = response.apparent_encoding

            if self.check_response_status and 400 <= response.status_code <= 599:
                raise ValueError(f"Received HTTP status {response.status_code}")
        except Exception as e:
            if self.continue_on_failure:
                logger.warning(
                    f"Unable to load from {url}. Received error {e} of type "
                    f"{e.__class__.__name__}"
                )
                return
            else:
                raise e

        content = self.extractor(response.text)
        if content:
            yield Document(
                page_content=content,
                metadata=self.metadata_extractor(response.text, url, response),
            )

        # Recursively visit the children of the current page.
        sub_links = extract_sub_links(
            response.text,
            url,
            base_url=self.base_url,
            pattern=self.link_regex,
            prevent_outside=self.prevent_outside,
            exclude_prefixes=self.exclude_dirs,
            continue_on_failure=self.continue_on_failure,
        )
        for link in sub_links:
            # Only crawl links that have not been visited yet.
            if link not in visited:
                yield from self._get_child_links_recursive(
                    link, visited, depth=depth + 1
                )

    async def _async_get_child_links_recursive(
        self,
        url: str,
        visited: Set[str],
        *,
        session: Optional[aiohttp.ClientSession] = None,
        depth: int = 0,
    ) -> List[Document]:
        """Recursively get all child links starting with the path of the input URL.

        Args:
            url: The URL to crawl.
            visited: A set of visited URLs.
            depth: To reach the current url, how many pages have been visited.
        """
        if not self.use_async:
            raise ValueError(
                "Async functions forbidden when not initialized with `use_async`"
            )
        if depth >= self.max_depth:
            return []

        # SSL verification is disabled because some crawled sites present invalid
        # certificates; callers can pass in a preconfigured session instead.
        close_session = session is None
        session = (
            session
            if session is not None
            else aiohttp.ClientSession(
                connector=aiohttp.TCPConnector(ssl=False),
                timeout=aiohttp.ClientTimeout(total=self.timeout),
                headers=self.headers,
            )
        )
        visited.add(url)
        try:
            async with session.get(url) as response:
                text = await response.text()
                if self.check_response_status and 400 <= response.status <= 599:
                    raise ValueError(f"Received HTTP status {response.status}")
        except (aiohttp.client_exceptions.InvalidURL, Exception) as e:
            if close_session:
                await session.close()
            if self.continue_on_failure:
                logger.warning(
                    f"Unable to load {url}. Received error {e} of type "
                    f"{e.__class__.__name__}"
                )
                return []
            else:
                raise e

        results = []
        content = self.extractor(text)
        if content:
            results.append(
                Document(
                    page_content=content,
                    metadata=self.metadata_extractor(text, url, response),
                )
            )
        if depth < self.max_depth - 1:
            sub_links = extract_sub_links(
                text,
                url,
                base_url=self.base_url,
                pattern=self.link_regex,
                prevent_outside=self.prevent_outside,
                exclude_prefixes=self.exclude_dirs,
                continue_on_failure=self.continue_on_failure,
            )

            # Recursively crawl the children of the current page in parallel.
            sub_tasks = []
            to_visit = set(sub_links).difference(visited)
            for link in to_visit:
                sub_tasks.append(
                    self._async_get_child_links_recursive(
                        link, visited, session=session, depth=depth + 1
                    )
                )
            next_results = await asyncio.gather(*sub_tasks)
            for sub_result in next_results:
                if isinstance(sub_result, Exception) or sub_result is None:
                    # Ignore failed or empty sub-crawls (e.g. invalid URLs or 404s)
                    # rather than aborting the whole crawl.
                    continue
                # Deduplicate results gathered across concurrent sub-crawls.
                results += [r for r in sub_result if r not in results]
        if close_session:
            await session.close()
        return results

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load web pages.
        When use_async is True, this function will not be lazy,
        but it will still work in the expected way, just not lazy."""
        visited: Set[str] = set()
        if self.use_async:
            results = asyncio.run(
                self._async_get_child_links_recursive(self.url, visited)
            )
            return iter(results or [])
        else:
            return self._get_child_links_recursive(self.url, visited)


_MetadataExtractorType1 = Callable[[str, str], dict]
_MetadataExtractorType2 = Callable[
    [str, str, Union[requests.Response, aiohttp.ClientResponse]], dict
]
_MetadataExtractorType = Union[_MetadataExtractorType1, _MetadataExtractorType2]


def _wrap_metadata_extractor(
    metadata_extractor: _MetadataExtractorType,
) -> _MetadataExtractorType2:
    if len(inspect.signature(metadata_extractor).parameters) == 3:
        return cast(_MetadataExtractorType2, metadata_extractor)
    else:

        def _metadata_extractor_wrapper(
            raw_html: str,
            url: str,
            response: Union[requests.Response, aiohttp.ClientResponse],
        ) -> dict:
            return cast(_MetadataExtractorType1, metadata_extractor)(raw_html, url)

        return _metadata_extractor_wrapper