SexBot / webloader.py
Pew404's picture
Upload folder using huggingface_hub
13fbd2e verified
from langchain_community.document_loaders import WebBaseLoader
from llama_index.readers.web import SimpleWebPageReader
from newspaper import Article
from llama_index.core.llms import ChatMessage
import httpx
from bs4 import BeautifulSoup
def load_web(url: str):
    """Load a web page through LangChain's ``WebBaseLoader``.

    Args:
        url: Address of the page to load.

    Returns:
        Whatever ``WebBaseLoader(url).load()`` returns (the loader's
        document list).
    """
    documents = WebBaseLoader(url).load()
    return documents
def llama_load_web(url: str):
    """Load a web page through LlamaIndex's ``SimpleWebPageReader``.

    The reader is created with ``html_to_text=True`` and is given a
    single-element list containing ``url``.

    Args:
        url: Address of the page to load.

    Returns:
        The result of ``SimpleWebPageReader.load_data`` for that one URL.
    """
    reader = SimpleWebPageReader(html_to_text=True)
    return reader.load_data([url])
def newspaper_load_web(url: str):
    """Download and parse an article with newspaper's ``Article``.

    This is a best-effort loader: any exception raised while downloading,
    parsing, or reading the parsed fields is logged (with traceback) and
    the empty string is returned instead of propagating the error.

    Args:
        url: Address of the article page.

    Returns:
        A dict with the keys ``"title"`` and ``"text"`` on success, or
        ``""`` when loading failed.
    """
    # Function-scope import, matching the local-import style used by the
    # other helpers in this file.
    import logging

    article = Article(url)
    try:
        article.download()
        article.parse()
        return {
            "title": article.title,
            "text": article.text,
        }
    except Exception:
        # Keep the original "return empty on failure" contract, but leave a
        # trace so failures are no longer completely silent.
        logging.getLogger(__name__).warning(
            "newspaper_load_web failed for %s", url, exc_info=True
        )
        return ""
def html2text(url: str):
    """Run the given string through ``HTML2Text`` with links ignored.

    NOTE(review): despite the parameter name, nothing is fetched here; the
    argument is handed directly to ``HTML2Text.handle``. Presumably callers
    are expected to pass HTML markup rather than an address -- confirm
    against call sites.

    Args:
        url: String passed unchanged to ``HTML2Text.handle``.

    Returns:
        The converter's output for that string.
    """
    from html2text import HTML2Text

    converter = HTML2Text()
    converter.ignore_links = True
    converted = converter.handle(url)
    return converted
def httpxs(url: str):
    """Issue a GET request with ``httpx`` and return the response body text.

    Args:
        url: Address to request.

    Returns:
        The ``text`` attribute of the ``httpx.get`` response.
    """
    # ``httpx`` is already imported at module level, so no local import is
    # needed here.
    return httpx.get(url).text
if __name__ == "__main__":
    # Manual smoke test: fetch one page, collect the text of every <p>
    # element that is not empty, and print it as a list and as a single
    # newline-joined string.
    url = "https://cn.pornhub.com/video/search?search=hongkongdoll"
    response = httpx.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    paragraphs = []
    for tag in soup.find_all("p"):
        content = tag.get_text()
        if content != "":
            paragraphs.append(content)

    print(paragraphs)
    text_content = "\n".join(paragraphs)
    print(f"text_content: {text_content}")