gneya committed on
Commit
1b0b939
·
verified ·
1 Parent(s): 2f5cada

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +23 -0
  2. requirements.txt +8 -0
  3. utils.py +93 -0
app.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from utils import download_mp4_from_youtube
3
+ import asyncio
4
+
5
def ui():
    """Render the Streamlit page and start a download for the submitted URL.

    Shows a title and a single-field form. When the form is submitted with a
    non-empty URL, ``download_mp4_from_youtube(url)`` is driven to completion
    on a dedicated event loop (it is expected to return an awaitable, since
    its result is handed to ``run_until_complete``). Submitting an empty
    field only shows a warning.
    """
    st.title("Youtube Video Summarizer")

    # Create a form for URL input
    with st.form(key='url_form'):
        user_url = st.text_input("Please enter a URL:")
        submit_button = st.form_submit_button(label='Submit')

    # Nothing to do until the user presses Submit.
    if not submit_button:
        return

    url = (user_url or "").strip()
    if not url:
        # Avoid handing an empty string to the downloader.
        st.warning("Please enter a URL before submitting.")
        return

    # A fresh loop is created for this submission and closed afterwards, so
    # repeated submissions do not accumulate open event loops.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(download_mp4_from_youtube(url))
    finally:
        asyncio.set_event_loop(None)
        loop.close()
19
+
20
+
21
+ if __name__ == "__main__":
22
+ ui()
23
+
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ langchain_groq
3
+ deeplake
4
+ yt_dlp
5
+ langchain-community
6
+ python-dotenv
7
+ streamlit
8
+ git+https://github.com/openai/whisper.git
utils.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.document_loaders import ApifyDatasetLoader
2
+ from langchain.utilities import ApifyWrapper
3
+ from langchain.docstore.document import Document
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain.embeddings.cohere import CohereEmbeddings
6
+ from langchain.vectorstores.deeplake import DeepLake
7
+ from langchain_cohere import CohereRerank
8
+ from langchain.retrievers import ContextualCompressionRetriever
9
+ from langchain.memory import ConversationBufferWindowMemory
10
+ import os
11
+ from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain
12
+ from langchain_groq import ChatGroq
13
+ from dotenv import load_dotenv
14
+ load_dotenv()
15
+
16
def _dataset_item_to_document(dataset_item):
    """Convert one crawler dataset item (a dict) into a ``Document``."""
    item_metadata = dataset_item.get("metadata") or {}
    return Document(
        # Fall back to a placeholder when the crawler returned no text.
        page_content=dataset_item.get("text") or "No content available",
        metadata={
            "source": dataset_item.get("url", ""),
            "title": item_metadata.get("title", ""),
        },
    )


def get_and_load_data(start_url="https://en.wikipedia.org/wiki/Artificial_intelligence"):
    """Crawl a page with Apify, chunk it, embed it and store it in Deep Lake.

    Args:
        start_url: Page handed to the ``apify/website-content-crawler`` actor
            as its only start URL. Defaults to the Wikipedia article on
            artificial intelligence.

    The crawled documents are split into 1000-character chunks with a
    20-character overlap and added to the
    ``hub://gneyapandya1234/educational_chatbot`` dataset using Cohere
    embeddings. Nothing is returned.
    """
    # The token is kept under the env var "apify" in this project. Mirror it
    # into APIFY_API_TOKEN (without overriding an existing value) so that
    # ApifyWrapper can find it.
    # NOTE(review): assumes ApifyWrapper reads APIFY_API_TOKEN - confirm
    # against the installed langchain version.
    apify_key = os.getenv("apify")
    if apify_key:
        os.environ.setdefault("APIFY_API_TOKEN", apify_key)

    apify = ApifyWrapper()

    loader = apify.call_actor(
        actor_id="apify/website-content-crawler",
        run_input={"startUrls": [{"url": start_url}]},
        dataset_mapping_function=_dataset_item_to_document,
    )
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=20, length_function=len
    )
    docs_split = text_splitter.split_documents(docs)

    embeddings = CohereEmbeddings(model="embed-english-v2.0")
    username = "gneyapandya1234"
    db_id = "educational_chatbot"

    dbs = DeepLake(dataset_path=f"hub://{username}/{db_id}", embedding_function=embeddings)
    dbs.add_documents(docs_split)
43
+
44
def deeplake():
    """Open the hosted Deep Lake dataset read-only and build its retrievers.

    Returns:
        A ``(dbs, compressor_retriever, retriever)`` tuple: the vector store,
        the retriever wrapped in a Cohere rerank compressor (``top_n=5``),
        and the plain retriever it wraps.
    """
    cohere_embeddings = CohereEmbeddings(model="embed-english-v2.0")
    vector_store = DeepLake(
        dataset_path="hub://gneyapandya1234/educational_chatbot",
        read_only=True,
        embedding_function=cohere_embeddings,
    )

    # Cosine distance, 20 candidates fetched and 20 returned; the
    # "maximal_marginal_relevance" search option is not enabled.
    base_retriever = vector_store.as_retriever()
    base_retriever.search_kwargs.update(
        {"distance_metric": "cos", "fetch_k": 20, "k": 20}
    )

    # The reranker trims the retrieved documents down to the top 5.
    reranking_retriever = ContextualCompressionRetriever(
        base_compressor=CohereRerank(model="rerank-english-v2.0", top_n=5),
        base_retriever=base_retriever,
    )
    print("DOne")
    return vector_store, reranking_retriever, base_retriever
66
+
67
def memory():
    """Build the conversation memory used by the retrieval chain.

    Returns:
        A ``ConversationBufferWindowMemory`` configured with ``k=3``, stored
        under the ``"chat_history"`` key, returning message objects and
        reading the chain output from ``"answer"``.
    """
    return ConversationBufferWindowMemory(
        output_key="answer",
        return_messages=True,
        memory_key="chat_history",
        k=3,
    )
75
def create_llm():
    """Create the Groq chat model (``llama3-70b-8192``).

    The API key is read from the ``GROQ_API_KEY`` environment variable.
    """
    groq_key = os.getenv("GROQ_API_KEY")
    return ChatGroq(api_key=groq_key, model="llama3-70b-8192")
78
+
79
def chain(llm, compression_retriever, memory):
    """Assemble the conversational retrieval chain.

    Args:
        llm: Chat model used by the chain.
        compression_retriever: Retriever the chain queries for context.
        memory: Conversation memory attached to the chain.

    Returns:
        A ``ConversationalRetrievalChain`` built with verbose output enabled
        and source documents included in its results.
    """
    chain_options = {
        "llm": llm,
        "memory": memory,
        "retriever": compression_retriever,
        "verbose": True,
        "return_source_documents": True,
    }
    return ConversationalRetrievalChain.from_llm(**chain_options)
88
def final_function():
    """Wire the LLM, memory and reranking retriever into a QA chain.

    Returns:
        A ``(qa, mem)`` tuple: the conversational retrieval chain and the
        memory object it was built with.
    """
    groq_llm = create_llm()
    conversation_memory = memory()
    # Only the reranking retriever from deeplake() is used here.
    _, reranking_retriever, _ = deeplake()
    qa_chain = chain(groq_llm, reranking_retriever, conversation_memory)
    return qa_chain, conversation_memory