File size: 1,416 Bytes
2aae583
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
"""Loader that loads Telegram chat json dump."""
import json
import pandas as pd
from pathlib import Path
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


def concatenate_rows(row):
    """Render one message record as a ``"<from> on <date>: <text>"`` paragraph.

    Args:
        row: Any mapping-like object (e.g. a dict or a pandas Series)
            indexable by the keys ``'date'``, ``'from'`` and ``'text'``.

    Returns:
        The formatted line followed by a blank line (two newlines), so that
        consecutive results can be concatenated directly.
    """
    return "{sender} on {date}: {text}\n\n".format(
        date=row["date"],
        sender=row["from"],
        text=row["text"],
    )


class TelegramChatLoader(BaseLoader):
    """Loader that loads a Telegram chat JSON dump file.

    The file is expected to be a JSON object with a top-level ``"messages"``
    list. Every record whose ``type`` is ``"message"`` and whose ``text`` is
    a plain string is rendered with ``concatenate_rows`` and all rendered
    records are joined into a single document.
    """

    def __init__(self, path: str):
        """Initialize with the path to the JSON dump file."""
        self.file_path = path

    def load(self) -> List[Document]:
        """Load the chat dump and return it as a single document.

        Returns:
            A one-element list holding a ``Document`` whose ``page_content``
            is the concatenation of all kept messages (an empty string when
            there are none) and whose metadata ``"source"`` is the file path.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If the JSON object has no ``"messages"`` entry.
        """
        p = Path(self.file_path)

        with open(p, encoding="utf8") as f:
            d = json.load(f)

        metadata = {"source": str(p)}

        # json_normalize already returns a DataFrame; no extra wrapping needed.
        messages = pd.json_normalize(d["messages"])

        # An empty "messages" list produces a frame without any columns, and
        # without "type"/"text" columns no record can pass the filter below,
        # so return an empty document instead of failing on the column lookup.
        if "type" not in messages.columns or "text" not in messages.columns:
            return [Document(page_content="", metadata=metadata)]

        # Only keep plain text messages: records of another type and records
        # whose text is not a str are dropped (presumably services, links,
        # hashtags, code, bold ... are exported with a non-string text value).
        is_plain_text = (messages["type"] == "message") & messages["text"].apply(
            lambda value: isinstance(value, str)
        )

        # reindex (rather than plain column selection) fills a missing "date"
        # or "from" column with NaN instead of raising KeyError.
        plain_messages = messages.loc[is_plain_text].reindex(
            columns=["date", "text", "from"]
        )

        # Joining over the rows yields "" when nothing survived the filter;
        # DataFrame.apply(axis=1) on an empty frame returns a DataFrame, which
        # has no ``.str`` accessor and would raise.
        text = "".join(
            concatenate_rows(row) for _, row in plain_messages.iterrows()
        )

        return [Document(page_content=text, metadata=metadata)]