import asyncio
import logging
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin

import aiohttp
import faiss
from bs4 import BeautifulSoup

from config.config import settings

logger = logging.getLogger(__name__)


class FAQService:
    """Crawls the bofrost.de FAQ pages, embeds each question/answer pair,
    and serves semantic search over them via a FAISS index."""

    def __init__(self, model_service):
        # Sentence embedder shared with the rest of the app
        # (e.g. a SentenceTransformer instance exposed by the model service).
        self.embedder = model_service.embedder
        self.faiss_index = None        # built lazily in index_faqs()
        self.faq_data = []             # entries parallel to the FAISS vectors
        self.visited_urls = set()      # guards against re-crawling pages
        self.base_url = "https://www.bofrost.de/faq/"

    async def fetch_faq_pages(self) -> List[Dict[str, Any]]:
        async with aiohttp.ClientSession() as session:
            try:
                # Start with the main FAQ page
                pages = await self.crawl_faq_pages(self.base_url, session)
                return [page for page in pages if page]
            except Exception as e:
                logger.error(f"Error fetching FAQ pages: {e}")
                return []

    async def crawl_faq_pages(self, url: str, session: aiohttp.ClientSession) -> List[Dict[str, Any]]:
        if url in self.visited_urls or not url.startswith(self.base_url):
            return []
        self.visited_urls.add(url)
        pages = []
        try:
            async with session.get(url, timeout=settings.TIMEOUT) as response:
                if response.status == 200:
                    content = await response.text()
                    soup = BeautifulSoup(content, 'html.parser')
                    # Add current page content
                    page_content = await self.parse_faq_content(soup, url)
                    if page_content:
                        pages.append(page_content)
                    # Find and follow FAQ links
                    tasks = []
                    for link in soup.find_all('a', href=True):
                        href = link['href']
                        full_url = urljoin(url, href)
                        if (full_url.startswith(self.base_url) and
                                full_url not in self.visited_urls):
                            tasks.append(self.crawl_faq_pages(full_url, session))
                    if tasks:
                        results = await asyncio.gather(*tasks)
                        for result in results:
                            pages.extend(result)
        except Exception as e:
            logger.error(f"Error crawling FAQ page {url}: {e}")
        return pages

    async def parse_faq_content(self, soup: BeautifulSoup, url: str) -> Optional[Dict[str, Any]]:
        try:
            faqs = []
            faq_items = soup.find_all('div', class_='faq-item')
            for item in faq_items:
                # Extract question
                question_elem = item.find('a', class_='headline-collapse')
                if not question_elem:
                    continue
                question = question_elem.find('span')
                if not question:
                    continue
                question_text = question.text.strip()
                # Extract answer
                content_elem = item.find('div', class_='content-collapse')
                if not content_elem:
                    continue
                wysiwyg = content_elem.find('div', class_='wysiwyg-content')
                if not wysiwyg:
                    continue
                # Extract all text while preserving structure
                answer_parts = []
                for elem in wysiwyg.find_all(['p', 'li']):
                    text = elem.get_text(strip=True)
                    if text:
                        answer_parts.append(text)
                answer_text = ' '.join(answer_parts)
                if question_text and answer_text:
                    faqs.append({
                        "question": question_text,
                        "answer": answer_text
                    })
            if faqs:
                return {
                    "url": url,
                    "faqs": faqs
                }
        except Exception as e:
            logger.error(f"Error parsing FAQ content from {url}: {e}")
        return None
async def index_faqs(self):
faq_pages = await self.fetch_faq_pages()
self.faq_data = []
all_texts = []
for faq_page in faq_pages:
for item in faq_page['faqs']:
# Combine question and answer for better semantic search
combined_text = f"{item['question']} {item['answer']}"
all_texts.append(combined_text)
self.faq_data.append({
"question": item['question'],
"answer": item['answer'],
"source": faq_page['url']
})
if not all_texts:
logger.warning("No FAQ content found to index")
return
# Create embeddings and index them
embeddings = self.embedder.encode(all_texts, convert_to_tensor=True).cpu().detach().numpy()
dimension = embeddings.shape[1]
self.faiss_index = faiss.IndexFlatL2(dimension)
self.faiss_index.add(embeddings)

    async def search_faqs(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        # Build the index lazily on first use.
        if self.faiss_index is None:
            await self.index_faqs()
        if not self.faq_data:
            logger.warning("No FAQ data available for search")
            return []
        query_embedding = self.embedder.encode([query], convert_to_tensor=True).cpu().detach().numpy()
        distances, indices = self.faiss_index.search(query_embedding, top_k)
        results = []
        for i, idx in enumerate(indices[0]):
            # FAISS pads results with -1 when fewer than top_k vectors are indexed.
            if 0 <= idx < len(self.faq_data):
                result = self.faq_data[idx].copy()
                # L2 distance: a lower score means a closer match.
                result["score"] = float(distances[0][i])
                results.append(result)
        return results
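

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the service itself).
# It assumes `model_service` can be any object exposing an `embedder` with a
# SentenceTransformer-style encode() API; the _DemoModelService class and the
# "all-MiniLM-L6-v2" model name below are placeholder assumptions, not
# requirements of this module.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer

    class _DemoModelService:
        """Stand-in for the real model service wired up elsewhere in the app."""

        def __init__(self):
            self.embedder = SentenceTransformer("all-MiniLM-L6-v2")

    async def _demo():
        service = FAQService(_DemoModelService())
        # index_faqs() crawls the bofrost.de FAQ pages and builds the FAISS index.
        await service.index_faqs()
        # search_faqs() returns the closest FAQ entries; lower score = closer match.
        for hit in await service.search_faqs("Wie funktioniert die Lieferung?", top_k=3):
            print(f"{hit['score']:.3f}  {hit['question']}  ({hit['source']})")

    asyncio.run(_demo())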