
    hhY                     b    d dl mZmZmZmZ d dlmZmZ d dlm	Z	 d dl
mZ  G d de          ZdS )    )AnyIteratorListOptional)urljoinurlparse)Document)WebBaseLoaderc                        e Zd ZdZ	 	 	 	 	 ddddeded	ee         d
edededee         f fdZdee	         fdZ
	 ddedee         dee	         fdZdedee         fdZ xZS )GitbookLoaderztLoad `GitBook` data.

    1. load from either a single page, or
    2. load all (relative) paths in the navbar.
    FNmainT)sitemap_urlweb_pageload_all_pathsbase_urlcontent_selectorcontinue_on_failureshow_progressr   c                    |p|| _         | j                             d          r| j         dd         | _         |r|r|}n
| j          d}t                                          |f||           || _        || _        dS )aZ  Initialize with web page and whether to load all paths.

        Args:
            web_page: The web page to load or the starting point from where
                relative paths are discovered.
            load_all_paths: If set to True, all relative paths in the navbar
                are loaded instead of only `web_page`.
            base_url: If `load_all_paths` is True, the relative paths are
                appended to this base url. Defaults to `web_page`.
            content_selector: The CSS selector for the content to load.
                Defaults to "main".
            continue_on_failure: whether to continue loading the sitemap if an error
                occurs loading a url, emitting a warning instead of raising an
                exception. Setting this to True makes the loader more robust, but also
                may result in missing data. Default: False
            show_progress: whether to show a progress bar while loading. Default: True
            sitemap_url: Custom sitemap URL to use when load_all_paths is True.
                Defaults to "{base_url}/sitemap.xml".
        /Nz/sitemap.xml)	web_pathsr   r   )r   endswithsuper__init__r   r   )	selfr   r   r   r   r   r   r   	__class__s	           i/var/www/FlaskApp/flask-venv/lib/python3.11/site-packages/langchain_community/document_loaders/gitbook.pyr   zGitbookLoader.__init__   s    < !,H=!!#&& 	/ M#2#.DM 	: :&"m999k 3' 	 	
 	
 	

 - 0    returnc              #      K    j         r                                 }                     |          } fd|D             }                     |          }t	          ||          D ]!\  }}                     ||          }|r|V  "dS                                  }                     | j                  }|r|V  dS dS )z(Fetch text from one single GitBook page.c                 :    g | ]}t          j        |          S  )r   r   ).0pathr   s     r   
<listcomp>z+GitbookLoader.lazy_load.<locals>.<listcomp>F   s%    LLLTGDM400LLLr   N)r   scrape
_get_paths
scrape_allzip_get_documentweb_path)r   	soup_inforelative_pathsurls
soup_infosurldocs   `      r   	lazy_loadzGitbookLoader.lazy_loadA   s       	I!__Y77NLLLL^LLLD..J"%j$"7"7  	3((C88 III  I$$Y>>C 					 r   soup
custom_urlc                    |                     | j                  }|sdS |                    d                                          }|                     d          }|r|j        nd}|p| j        |d}t          ||          S )z,Fetch content from page and return Document.N
)	separatorh1 )sourcetitle)page_contentmetadata)findr   get_textstriptextr,   r	   )r   r4   r5   page_content_rawcontenttitle_if_existsr<   r>   s           r   r+   zGitbookLoader._get_documentS   s      99T%:;; 	4"++d+;;AACC*//55(7?$$R(9DMEJJWx@@@@r   c                 @    d |                     d          D             S )z'Fetch all relative paths in the navbar.c                 @    g | ]}t          |j                  j        S r#   )r   rB   r%   )r$   locs     r   r&   z,GitbookLoader._get_paths.<locals>.<listcomp>b   s%    HHHC""'HHHr   rH   )find_all)r   r4   s     r   r(   zGitbookLoader._get_paths`   s"    HH4==3G3GHHHHr   )FNr   FT)N)__name__
__module____qualname____doc__strboolr   r   r   r	   r3   r   r+   r   r(   __classcell__)r   s   @r   r   r   	   sP          %"& &$)"/1 &*/1 /1 /1/1 /1 3-	/1
 /1 "/1 /1 c]/1 /1 /1 /1 /1 /1b8H-    & 6:A AA%-c]A	(	A A A AIs ItCy I I I I I I I Ir   r   N)typingr   r   r   r   urllib.parser   r   langchain_core.documentsr	   -langchain_community.document_loaders.web_baser
   r   r#   r   r   <module>rU      s    0 0 0 0 0 0 0 0 0 0 0 0 * * * * * * * * - - - - - - G G G G G GYI YI YI YI YIM YI YI YI YI YIr   