"""Small helpers that fetch a web page and extract its content.

Each function wraps a different third-party library (LangChain, LlamaIndex,
newspaper, html2text, httpx) so their output can be compared side by side.
"""

import logging
import sys
from typing import Dict, List, Union

import httpx
from bs4 import BeautifulSoup
from langchain_community.document_loaders import WebBaseLoader
from llama_index.core.llms import ChatMessage  # noqa: F401  (unused here; kept in case other code imports it from this module)
from llama_index.readers.web import SimpleWebPageReader
from newspaper import Article

logger = logging.getLogger(__name__)

# Neutral page used by the demo when no URL is given on the command line.
_DEMO_URL = "https://example.com"


def load_web(url: str):
    """Load *url* with LangChain's ``WebBaseLoader``.

    Args:
        url: Address of the page to load.

    Returns:
        Whatever ``WebBaseLoader.load()`` returns (presumably a list of
        LangChain ``Document`` objects -- confirm against the library).
    """
    loader = WebBaseLoader(url)
    return loader.load()


def llama_load_web(url: str):
    """Load *url* with LlamaIndex's ``SimpleWebPageReader``.

    The reader is created with ``html_to_text=True`` and is handed a
    one-element list containing *url*.

    Args:
        url: Address of the page to load.

    Returns:
        The value returned by ``SimpleWebPageReader.load_data``.
    """
    return SimpleWebPageReader(html_to_text=True).load_data([url])


def newspaper_load_web(url: str) -> Union[Dict[str, str], str]:
    """Download and parse *url* with ``newspaper``.

    Args:
        url: Address of the article to load.

    Returns:
        ``{"title": ..., "text": ...}`` on success, or the empty string
        ``""`` when downloading or parsing raises.  The failure is logged
        rather than silently discarded; the ``""`` fallback is kept so
        existing callers that test the result for truthiness keep working.
    """
    article = Article(url)
    try:
        article.download()
        article.parse()
    except Exception:
        # Best-effort loader: report the problem but do not propagate it.
        logger.warning("newspaper could not load %s", url, exc_info=True)
        return ""
    return {
        "title": article.title,
        "text": article.text,
    }


def html2text(url: str) -> str:
    """Return the content of a web page converted by ``html2text``.

    ``HTML2Text.handle`` expects HTML markup, not an address, so when *url*
    starts with ``http://`` or ``https://`` the page is downloaded first.
    Any other input is treated as HTML markup and converted directly.

    Args:
        url: Address of the page to convert, or raw HTML markup.

    Returns:
        The text produced by ``HTML2Text.handle`` with links ignored.
    """
    # Imported locally because this function shares its name with the module.
    from html2text import HTML2Text

    markup = url
    if url.strip().lower().startswith(("http://", "https://")):
        markup = httpx.get(url.strip()).text

    converter = HTML2Text()
    converter.ignore_links = True
    return converter.handle(markup)


def httpxs(url: str) -> str:
    """Fetch *url* with ``httpx.get`` and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``text`` attribute of the response.
    """
    response = httpx.get(url)
    return response.text


def _paragraph_texts(html: str) -> List[str]:
    """Return the non-empty text of every ``<p>`` element found in *html*."""
    soup = BeautifulSoup(html, "html.parser")
    # Extract each paragraph's text once, then drop the empty ones.
    texts = (tag.get_text() for tag in soup.find_all("p"))
    return [text for text in texts if text != ""]


def main() -> None:
    """Fetch a page and print the text of its paragraphs.

    The URL is taken from the first command-line argument and falls back to
    ``_DEMO_URL`` when none is supplied.
    """
    url = sys.argv[1] if len(sys.argv) > 1 else _DEMO_URL
    response = httpx.get(url)
    paragraphs = _paragraph_texts(response.text)
    print(paragraphs)
    text_content = "\n".join(paragraphs)
    print(f"text_content: {text_content}")


if __name__ == "__main__":
    main()