Spaces:
Runtime error
Runtime error
File size: 1,416 Bytes
2aae583 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
"""Loader that loads Telegram chat json dump."""
import json
import pandas as pd
from pathlib import Path
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
def concatenate_rows(row):
    """Render one chat message as a ``<sender> on <date>: <text>`` entry.

    Args:
        row: Mapping-like object (for example a DataFrame row) that can be
            indexed with the keys ``'date'``, ``'from'`` and ``'text'``.

    Returns:
        The formatted entry followed by a blank line.
    """
    # Fields are looked up in the order date, from, text.
    when, who, body = row['date'], row['from'], row['text']
    return '{} on {}: {}\n\n'.format(who, when, body)
class TelegramChatLoader(BaseLoader):
    """Loader that reads a Telegram chat JSON dump into a single document.

    ``path`` is opened directly as a UTF-8 encoded JSON file whose top-level
    object contains a ``"messages"`` list.
    """

    def __init__(self, path: str):
        """Initialize with the path of the JSON dump to load."""
        self.file_path = path

    def load(self) -> List[Document]:
        """Load the chat dump and return it as one concatenated document.

        Only entries whose ``type`` is ``"message"`` and whose ``text`` is a
        plain string are kept; each one is formatted by ``concatenate_rows``
        and the results are joined in their original order.

        Returns:
            A one-element list holding a ``Document`` whose ``page_content``
            is the concatenated chat text (an empty string when no message
            qualifies) and whose metadata records the source path.

        Raises:
            KeyError: If the JSON object has no ``"messages"`` key.
        """
        p = Path(self.file_path)
        with open(p, encoding="utf8") as f:
            d = json.load(f)

        # Reindexing guarantees the four columns exist (filled with NaN when
        # absent), so an export with no messages at all, or one in which no
        # message carries a given field, is filtered normally instead of
        # failing on a missing column.
        df_messages = pd.json_normalize(d['messages']).reindex(
            columns=["type", "date", "text", "from"]
        )

        # Only keep plain text messages (no services, nor links, hashtags,
        # code, bold ...): those have a non-string "text" value.
        is_message = df_messages["type"] == "message"
        is_plain_text = (
            df_messages["text"].map(lambda value: isinstance(value, str)).astype(bool)
        )
        df_filtered = df_messages.loc[
            is_message & is_plain_text, ["date", "text", "from"]
        ]

        # Joining row by row also works when nothing passed the filter;
        # DataFrame.apply(axis=1) on an empty frame does not yield a Series,
        # which made the previous `.str.cat` call fail.
        text = "".join(concatenate_rows(row) for _, row in df_filtered.iterrows())

        metadata = {"source": str(p)}
        return [Document(page_content=text, metadata=metadata)]
|