
    hhH                         d Z ddlZddlmZmZmZmZ ddlmZ ddl	m
Z
 ddlmZ  ej        e          Z G d de
          ZdS )	zLLoader that uses bs4 to load HTML files, enriching metadata with page title.    N)AnyDictIteratorUnion)Document)BaseBlobParser)Blobc            	       N    e Zd ZdZddddedededd	fd
Zdedee	         fdZ
d	S )BS4HTMLParserz(Parse HTML files using `Beautiful Soup`.lxml )featuresget_text_separatorr   r   kwargsreturnNc                p    	 ddl }n# t          $ r t          d          w xY wd|i|| _        || _        dS )z#Initialize a bs4 based HTML parser.r   NzUbeautifulsoup4 package not found, please install it with `pip install beautifulsoup4`r   )bs4ImportError	bs_kwargsr   )selfr   r   r   r   s        r/var/www/FlaskApp/flask-venv/lib/python3.11/site-packages/langchain_community/document_loaders/parsers/html/bs4.py__init__zBS4HTMLParser.__init__   se    	JJJJ 	 	 	/  	 %h9&9"4s    !blobc              #   >  K   ddl m} |                                5 } ||fi | j        }ddd           n# 1 swxY w Y   |                    | j                  }|j        rt          |j        j                  }nd}|j	        |d}t          ||          V  dS )z)Load HTML document into document objects.r   )BeautifulSoupNr   )sourcetitle)page_contentmetadata)r   r   as_bytes_ior   get_textr   r   strstringr   r   )r   r   r   fsouptextr   r   s           r   
lazy_parsezBS4HTMLParser.lazy_parse$   s     %%%%%% 	61 =55dn55D	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 }}T455: 	
)**EEE k1
 1
 D8<<<<<<<<s   8<<)__name__
__module____qualname____doc__r"   r   r   r	   r   r   r'        r   r   r      s        22
 "$	5 5 5 5  	5
 5 
5 5 5 5&=t =(: = = = = = =r-   r   )r+   loggingtypingr   r   r   r   langchain_core.documentsr   )langchain_community.document_loaders.baser   1langchain_community.document_loaders.blob_loadersr	   	getLoggerr(   loggerr   r,   r-   r   <module>r5      s    R R  - - - - - - - - - - - - - - - - - - D D D D D D B B B B B B		8	$	$(= (= (= (= (=N (= (= (= (= (=r-   