
    hh;                         d Z ddlZddlZddlZddlmZmZmZmZm	Z	m
Z
mZmZ ddlZddlZddlmZ ddlmZ ddlmZ ddlmZ  ej        e          Z e            dd	d
ddddZdededefdZ G d de          ZdS )zWeb base loader class.    N)AnyAsyncIteratorDictIteratorListOptionalSequenceUnion)
deprecated)Document)
BaseLoader)get_user_agentzJtext/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8zen-US,en;q=0.5zhttps://www.google.com/1z
keep-alive)
User-AgentAcceptzAccept-LanguageRefererDNT
ConnectionzUpgrade-Insecure-Requestssoupurlreturnc                 2   d|i}|                      d          x}r|                                |d<   |                      dddi          x}r|                    dd          |d<   |                      d	          x}r|                    d
d          |d<   |S )z)Build metadata from BeautifulSoup output.sourcetitlemetanamedescription)attrscontentzNo description found.htmllangzNo language found.language)findget_textget)r   r   metadatar   r   r    s         j/var/www/FlaskApp/flask-venv/lib/python3.11/site-packages/langchain_community/document_loaders/web_base.py_build_metadatar(      s    #H		'"""u -!NN,,iiv}.EiFFF{ V"-//)=T"U"Uyy   t F#xx0DEEO    c            &       N   e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d8ddd	d
eeee         f         dee         de	dee         de	de	dee         dee         de
dedeeeef                  de	deeeef                  deeeef                  dede	de	ddf$dZedefd            Z	 d9d ed!e
d"e
d#edef
d$Zd ed%ej        defd&Zd'ee         defd(Zed)eddfd*            Z	 d:d+ed'ee         d)eedf         dee         fd,Zd:d'ee         d)eedf         dee         fd-Z	 d:d'ee         d)eedf         dee         fd.Z	 	 d;d ed)eedf         dee         defd/Zd:d)eedf         defd0Zdee         fd1Z de!e         fd2Z" e#d3d4d56          dee         fd7            Z$dS )<WebBaseLoaderaQ  
    WebBaseLoader document loader integration

    Setup:
        Install ``langchain_community``.

        .. code-block:: bash

            pip install -U langchain_community

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import WebBaseLoader

            loader = WebBaseLoader(
                web_path = "https://www.espn.com/"
                # header_template = None,
                # verify_ssl = True,
                # proxies = None,
                # continue_on_failure = False,
                # autoset_encoding = True,
                # encoding = None,
                # web_paths = (),
                # requests_per_second = 2,
                # default_parser = "html.parser",
                # requests_kwargs = None,
                # raise_for_status = False,
                # bs_get_text_kwargs = None,
                # bs_kwargs = None,
                # session = None,
                # show_progress = True,
                # trust_env = False,
            )

    Lazy load:
        .. code-block:: python

            docs = []
            for doc in loader.lazy_load():
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            ESPN - Serving Sports Fans. Anytime. Anywhere.

            {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}


    Async load:
        .. code-block:: python

            docs = []
            async for doc in loader.alazy_load():
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            ESPN - Serving Sports Fans. Anytime. Anywhere.

            {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}

    .. versionchanged:: 0.3.14

        Deprecated ``aload`` (which was not async) and implemented a native async
        ``alazy_load``. Expand below for more details.

        .. dropdown:: How to update ``aload``

            Instead of using ``aload``, you can use ``load`` for synchronous loading or
            ``alazy_load`` for asynchronous lazy loading.

            Example using ``load`` (synchronous):

            .. code-block:: python

                docs: List[Document] = loader.load()

            Example using ``alazy_load`` (asynchronous):

            .. code-block:: python

                docs: List[Document] = []
                async for doc in loader.alazy_load():
                    docs.append(doc)

            This is in preparation for accommodating an asynchronous ``aload`` in the
            future:

            .. code-block:: python

                docs: List[Document] = await loader.aload()

     NTF    html.parser)show_progress	trust_envweb_pathheader_template
verify_sslproxiescontinue_on_failureautoset_encodingencoding	web_pathsrequests_per_seconddefault_parserrequests_kwargsraise_for_statusbs_get_text_kwargs	bs_kwargssessionr0   r1   r   c                   |r|rt          d          |rt          |          | _        nxt          |t                    r	|g| _        nZt          |t
                    rt          |          | _        n0t          dt          |           dt          |           d          |	| _        |
| _	        |pi | _
        || _        || _        |pi | _        |pi | _        |r|| _        nt!          j                    }|pt$                                          }|                    d          sD	 ddlm}  |            j        |d<   n*# t0          $ r t2                              d           Y nw xY wt7          |          |_        ||_        |r|j                            |           || _        || _         || _!        || _"        || _#        d	S )
a  Initialize loader.

        Args:
            web_paths: Web paths to load from.
            requests_per_second: Max number of concurrent requests to make.
            default_parser: Default parser to use for BeautifulSoup.
            requests_kwargs: kwargs for requests
            raise_for_status: Raise an exception if http status code denotes an error.
            bs_get_text_kwargs: kwargs for beatifulsoup4 get_text
            bs_kwargs: kwargs for beatifulsoup4 web page parsing
            show_progress: Show progress bar when loading pages.
            trust_env: set to True if using proxy to make web requests, for example
                using http(s)_proxy environment variables. Defaults to False.
        zmReceived web_path and web_paths. Only one can be specified. web_path is deprecated, web_paths should be used.z+web_path must be str or Sequence[str] got (z*) or web_paths must be Sequence[str] got ()r   r   )	UserAgentzxfake_useragent not found, using default user agent.To get a realistic header for requests, `pip install fake_useragent`.N)$
ValueErrorlistr9   
isinstancestrr	   	TypeErrortyper:   r;   r<   r=   r0   r>   r?   r@   requestsSessiondefault_header_templatecopyr%   fake_useragentrC   randomImportErrorloggerinfodictheadersverifyr5   updater6   r7   r8   r1   )selfr2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   r0   r1   rC   s                      r'   __init__zWebBaseLoader.__init__   s)   H  		 	D    
	!)__DNN#&& 	&ZDNN(++ 	!(^^DNNLd8nn L L9=iL L L   $7 ,.4" 0*"4":"b 	#"DLL&((G-O1H1M1M1O1OO"&&|44 
	8888884=IKK4FOL11"   KK8     #?33GO'GN 0&&w///"DL#6  0 "s   (E $E('E(c                 j    t          | j                  dk    rt          d          | j        d         S )N   zMultiple webpaths found.r   )lenr9   rD   )rW   s    r'   r2   zWebBaseLoader.web_path   s3    t~""7888~a  r)            ?r   retriescooldownbackoffc                 \  K   t          j        | j                  4 d {V }t          |          D ]F}	 t	          | j        j        | j        j                                                  }| j        j	        sd|d<    |j
        |fi | j        |z  4 d {V 	 }| j        r|                                 |                                 d {V cd d d           d {V  c cd d d           d {V  S # 1 d {V swxY w Y   # t           j        $ r_}	||dz
  k    r t                              d| d|dz    d| d	|	 d
	           t#          j        |||z  z             d {V  Y d }	~	@d }	~	ww xY w	 d d d           d {V  n# 1 d {V swxY w Y   t'          d          )N)r1   )rT   cookiesFsslrZ   Error fetching z with attempt /z: z. Retrying...zretry count exceeded)aiohttpClientSessionr1   rangerS   r@   rT   rb   get_dictrU   r%   r<   r=   textClientConnectionErrorrQ   warningasynciosleeprD   )
rW   r   r^   r_   r`   r@   ikwargsresponsees
             r'   _fetchzWebBaseLoader._fetch   s      (4>BBB 	C 	C 	C 	C 	C 	C 	Cg7^^ C CC#' $ 4 $ 4 = = ? ?$ $ $F  <. .(-u*w{    $ 4v =    5 5 5 5 5 5 5 5!0 8$55777%-]]__4444445 5 5 5 5 5 5 5 5 5 5 5 5 5 5	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 4 C C CGaK''Cc C C 1uC C'.C C23C C C   &mHwz,ABBBBBBBBBBBBBBCC	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C2 /000sf   FA$D
5C8D
#F8
DD
DD
	F
E8AE3-F3E88F
FF	semaphorec                 x  K   |4 d {V  	 |                      |           d {V 	 cd d d           d {V  S # t          $ ra}| j        r5t                              d| d           Y d }~d d d           d {V  dS t                              d| d           |d }~ww xY w# 1 d {V swxY w Y   d S )Nrd   z*, skipping due to continue_on_failure=Truer,   za and aborting, use continue_on_failure=True to continue loading urls after encountering an error.)rs   	Exceptionr6   rQ   rl   	exception)rW   r   rt   rr   s       r'   _fetch_with_rate_limitz$WebBaseLoader._fetch_with_rate_limit  s       	 	 	 	 	 	 	 	![[---------	 	 	 	 	 	 	 	 	 	 	 	 	 	    + NN5# 5 5 5   222	 	 	 	 	 	 	 	 	 	 	 	 	 	   Lc L L L   	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s8   B);
B&%B!*B) B!!B&&B))
B36B3urlsc                   K   t          j        | j                  }g }|D ]?}t          j        |                     ||                    }|                    |           @	 | j        rddlm}  |j	        |dddd d{V S t          j	        |  d{V S # t          $ r+ t          j        d           t          j	        |  d{V cY S w xY w)	z/Fetch all urls concurrently with rate limiting.r   )tqdm_asynciozFetching pagesTrZ   )descasciiminintervalNz2For better logging of progress, `pip install tqdm`)rm   	Semaphorer:   ensure_futurerx   appendr0   tqdm.asyncior{   gatherrP   warningswarn)rW   ry   rt   tasksr   taskr{   s          r'   	fetch_allzWebBaseLoader.fetch_all  s:     %d&>??	 	 	C()D)DS))T)TUUDLL	0! 45555550\0!11          %^U33333333 	0 	0 	0MNOOO //////////	0s   !!B B 2CCparserc                 f    g d}| |vr(t          dd                    |          z   dz             dS )z#Check that parser is valid for bs4.)r/   lxmlxmlzlxml-xmlhtml5libz`parser` must be one of z, .N)rD   join)r   valid_parserss     r'   _check_parserzWebBaseLoader._check_parser.  sN     ONN&&*TYY}-E-EEK   '&r)   resultsc                     ddl m} g }t          |          D ]e\  }}||         }|4|                    d          rd}n| j        }|                     |           |                     |||fi | j                   f|S )z0Unpack fetch results into BeautifulSoup objects.r   BeautifulSoupN.xmlr   )bs4r   	enumerateendswithr;   r   r   r?   )	rW   r   ry   r   r   final_resultsro   resultr   s	            r'   _unpack_fetch_resultsz#WebBaseLoader._unpack_fetch_results7  s     	&%%%%%"7++ 	R 	RIAvq'C~<<'' 1"FF!0F""6***  vv!P!P!P!PQQQQr)   c                     t          j        |                     |                    }|                     |||          S )z2Fetch all urls, then return soups for all results.r   )rm   runr   r   rW   ry   r   r   s       r'   
scrape_allzWebBaseLoader.scrape_allI  s8    +dnnT2233))'4)GGGr)   c                 l   K   |                      |           d{V }|                     |||          S )z8Async fetch all urls, then return soups for all results.Nr   )r   r   r   s       r'   ascrape_allzWebBaseLoader.ascrape_allN  sF       t,,,,,,,,))'4)GGGr)   c                 P   ddl m} ||                    d          rd}n| j        }|                     |            | j        j        |fi | j        }| j        r|                                 | j	        | j	        |_	        n| j
        r|j        |_	         ||j        |fi |pi S )Nr   r   r   r   )r   r   r   r;   r   r@   r%   r<   r=   r8   r7   apparent_encodingrj   )rW   r   r   r?   r   html_docs         r'   _scrapezWebBaseLoader._scrapeU  s     	&%%%%%>||F## -,6"""#4<#C@@4+?@@  	(%%'''=$ $H" 	; ( :H}X]FHHyBHHHr)   c                 F    |                      | j        || j                  S )z?Scrape data from webpage and return it in BeautifulSoup format.)r   r?   )r   r2   r?   )rW   r   s     r'   scrapezWebBaseLoader.scrapeo  s      ||DM&DN|SSSr)   c              #      K   | j         D ]S}|                     || j                  } |j        di | j        }t          ||          }t          ||          V  TdS )z+Lazy load text from the url(s) in web_path.)r?   page_contentr&   Nr-   )r9   r   r?   r$   r>   r(   r   )rW   pathr   rj   r&   s        r'   	lazy_loadzWebBaseLoader.lazy_loadt  s      N 	A 	AD<<<??D 4=;;4#:;;D&tT22Hx@@@@@@@		A 	Ar)   c                   K   |                      | j                   d{V }t          | j        |          D ];\  }} |j        di | j        }t          ||          }t          ||          W V  <dS )z1Async lazy load text from the url(s) in web_path.Nr   r-   )r   r9   zipr$   r>   r(   r   )rW   r   r   r   rj   r&   s         r'   
alazy_loadzWebBaseLoader.alazy_load|  s      ((88888888dng66 	A 	AJD$ 4=;;4#:;;D&tT22Hx@@@@@@@@	A 	Ar)   z0.3.14z1.0zSee API reference for updated usage: https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.web_base.WebBaseLoader.html)sinceremovalmessagec                     |                      | j                  }g }t          | j        |          D ]K\  }} |j        di | j        }t          ||          }|                    t          ||                     L|S )z9Load text from the urls in web_path async into Documents.r   r-   )r   r9   r   r$   r>   r(   r   r   )rW   r   docsr   r   rj   r&   s          r'   aloadzWebBaseLoader.aload  s     //$.11dng66 	H 	HJD$ 4=;;4#:;;D&tT22HKKdXFFFGGGGr)   )r,   NTNFTNr-   r.   r/   NFNNN)r\   r.   r]   )N)NN)%__name__
__module____qualname____doc__r
   rG   r	   r   rS   boolintr   r   rX   propertyr2   floatrs   rm   r   rx   r   r   staticmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r-   r)   r'   r+   r+   *   s       a aJ /1*."&$)!%"&#%#$+48!&7;.2!S#$ #'S# S# S#Xc]*+S# "$S# 	S#
 $S# "S# S# 3-S# C=S# !S# S# "$sCx.1S# S# %T#s(^4S# DcN+S#  !S#$ %S#& 'S#( 
)S# S# S# S#j !# ! ! ! X! OR1 11!$1471FK1	1 1 1 1<#*#4	   &0DI 0# 0 0 0 0( c d    \ IM "&s)5:395E	c   $H HtCy H%T	2B HdSVi H H H H ;?H HIH',S$Y'7H	cH H H H $($(	I II c4i I D>	I
 
I I I I4T TU39- T T T T T
A8H- A A A AA-"9 A A A A ZU	  
tH~ 
 
 
 
 
 
r)   r+   )r   rm   loggingr   typingr   r   r   r   r   r   r	   r
   rf   rJ   langchain_core._apir   langchain_core.documentsr   )langchain_community.document_loaders.baser   $langchain_community.utils.user_agentr   	getLoggerr   rQ   rL   rG   rS   r(   r+   r-   r)   r'   <module>r      sc        V V V V V V V V V V V V V V V V V V V V   * * * * * * - - - - - - @ @ @ @ @ @ ? ? ? ? ? ?		8	$	$ !.""'(!$	 	 	# 	C 	D 	 	 	 	l l l l lJ l l l l lr)   