from langchain_community.document_loaders import WebBaseLoader
from llama_index.readers.web import SimpleWebPageReader
from newspaper import Article
from html2text import HTML2Text
import httpx
from bs4 import BeautifulSoup
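
# Small helpers comparing different ways to fetch a web page and extract text:
# LangChain's WebBaseLoader, LlamaIndex's SimpleWebPageReader, newspaper3k,
# html2text, and plain httpx (+ BeautifulSoup in the demo below).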


def load_web(url: str):
    """Load a page as LangChain Documents via WebBaseLoader."""
    loader = WebBaseLoader(url)
    return loader.load()


def llama_load_web(url: str):
    """Load a page as LlamaIndex Documents, converting the HTML to text."""
    docs = SimpleWebPageReader(html_to_text=True).load_data([url])
    return docs


def newspaper_load_web(url: str):
    """Download and parse an article with newspaper3k, returning its title and body text."""
    article = Article(url)
    try:
        article.download()
        article.parse()
        return {
            "title": article.title,
            "text": article.text,
        }
    except Exception:
        # Download or parsing failed (network error, non-article page, ...).
        return {"title": "", "text": ""}


def html2text(url: str):
    """Fetch the page and convert its HTML to plain text with html2text."""
    h = HTML2Text()
    h.ignore_links = True
    # HTML2Text.handle() expects an HTML string, not a URL, so fetch the page first.
    html = httpx.get(url).text
    return h.handle(html)


def httpxs(url: str):
    """Return the raw HTML of the page via a plain httpx GET."""
    r = httpx.get(url)
    return r.text


if __name__ == "__main__":
    url = "https://cn.pornhub.com/video/search?search=hongkongdoll"
    # Quick check without the helpers: fetch the page and print the text of every <p> tag.
    response = httpx.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    paragraphs = [p.get_text() for p in soup.find_all("p") if p.get_text() != ""]
    print(paragraphs)
    text_content = "\n".join(paragraphs)
    print(f"text_content: {text_content}")