File size: 1,068 Bytes
7cf68b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
class LangChainChunker:
    """Split a piece of text into chunks with LangChain's CharacterTextSplitter."""

    def __init__(self, text):
        """Store the text that ``chunker`` will split.

        Args:
            text: The full text to be chunked.
        """
        self.text = text

    def chunker(self, size=1000):
        """Split the stored text on single spaces into chunks.

        Args:
            size: Forwarded to the splitter as ``chunk_size``.

        Returns:
            The result of ``CharacterTextSplitter.split_text`` for the
            stored text.
        """
        # Imported here rather than at module level, so merely importing this
        # module does not require langchain to be installed.
        from langchain.text_splitter import CharacterTextSplitter

        # Not implemented yet: attach the duration of the video to the chunk,
        # i.e. produce [[chunk, duration]].

        text_splitter = CharacterTextSplitter(
            separator=" ",
            chunk_size=size,
            # chunk_overlap is a character count, not a ratio; 0 means
            # consecutive chunks share no text.
            chunk_overlap=0,
        )

        return text_splitter.split_text(self.text)

    def __sizeof__(self) -> int:
        """Return the length of the stored text (characters, for a ``str``).

        Note that this reports the text length, not a memory footprint.
        """
        return len(self.text)


def getSubsText(video_id="", getGenerated=False):
    """Return the transcript of a video as one line of text.

    Lists the transcripts available for ``video_id``, fetches the last entry
    of that listing, formats it with ``TextFormatter`` and replaces every
    newline with a space.

    Args:
        video_id: Video identifier passed to ``list_transcripts``.
        getGenerated: Currently ignored; selecting auto-generated transcripts
            is not implemented yet.

    Returns:
        The formatted transcript text with newlines replaced by spaces. When
        the listing yields no transcripts, an empty string is handed to the
        formatter instead of fetched data.
    """
    from youtube_transcript_api import YouTubeTranscriptApi as ytapi
    from youtube_transcript_api.formatters import TextFormatter

    transcript_list = ytapi.list_transcripts(video_id)

    # TODO: implement getGenerated

    # Only the last listed transcript is used, so find it first and fetch
    # once instead of calling fetch() on every entry and discarding all but
    # the final result.
    last_transcript = None
    for last_transcript in transcript_list:
        pass

    data = last_transcript.fetch() if last_transcript is not None else ""

    return (TextFormatter().format_transcript(data)).replace("\n", " ")