-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdefs.py
144 lines (119 loc) · 5.1 KB
/
defs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# Importing necessary modules and classes
from langchain.text_splitter import RecursiveCharacterTextSplitter, NLTKTextSplitter
from langchain_community.document_loaders import YoutubeLoader, DirectoryLoader, UnstructuredEPubLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
import os
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import (
ChatPromptTemplate,
)
import time
import scrapetube
import csv
# Function to update the Chroma vector store with new YouTube videos
def update_chroma_youtube(vectorstore, video_list):
    """Add the transcripts of the given YouTube videos to a vector store.

    For every URL the Italian transcript is loaded, each loaded document is
    tagged with the video URL under the ``'url'`` metadata key, the documents
    are split into overlapping chunks and the chunks are added to
    ``vectorstore``.

    Args:
        vectorstore: Object exposing ``add_documents``; it is updated in place.
        video_list: YouTube video URLs to ingest. May be empty.

    Returns:
        The same ``vectorstore`` object that was passed in.
    """
    if not video_list:
        return vectorstore

    # Same settings for every video, so the splitter is built only once.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=300)
    for url in video_list:
        loader_yt = YoutubeLoader.from_youtube_url(
            url,
            add_video_info=True,
            language=["it"],
        )
        documents = loader_yt.load()
        if not documents:
            # Nothing was loaded for this video (presumably no transcript is
            # available); indexing documents[0] would raise IndexError.
            continue
        # Tag every loaded document so each chunk can be traced to its video.
        for document in documents:
            document.metadata['url'] = url
        vectorstore.add_documents(text_splitter.split_documents(documents))
    return vectorstore
# Function to create or load the Chroma vector store
def chroma_vectorstore(video_list=None):
    """Load the persisted Chroma vector store, or build it from the EPUB sources.

    If ``./vectorstore/chroma`` exists the store is loaded from it. Otherwise
    the EPUB files in ``./sources/`` are loaded, split with ``NLTKTextSplitter``
    and embedded into a new store persisted at that path. When ``video_list``
    is non-empty the videos are then added via ``update_chroma_youtube``.

    Args:
        video_list: Optional list of YouTube video URLs to add to the store.
            ``None`` (the default) or an empty list adds nothing.

    Returns:
        The Chroma vector store.

    Raises:
        FileNotFoundError: If the store has to be built and no EPUB document
            could be loaded from ``./sources/``.
    """
    save_load = './vectorstore/chroma'
    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
    if os.path.exists(save_load):
        vector_store = Chroma(persist_directory=save_load, embedding_function=embeddings)
        print("A new instance of vector store was loaded from", save_load)
    else:
        loader_epub = DirectoryLoader('./sources/', glob="./*.epub", loader_cls=UnstructuredEPubLoader)
        documents = loader_epub.load()
        if not documents:
            # Fail with an explicit message instead of an opaque IndexError below.
            raise FileNotFoundError("No EPUB documents could be loaded from './sources/'")
        # Source link shown to the user for content coming from the book.
        documents[0].metadata['url'] = 'https://amzn.to/3UX6umg'
        text_splitter = NLTKTextSplitter()
        docs = text_splitter.split_documents(documents)
        vector_store = Chroma.from_documents(
            documents=docs,
            embedding=embeddings,
            persist_directory=save_load
        )
        print("Vector store was saved to", save_load)
    if video_list:
        return update_chroma_youtube(vector_store, video_list)
    return vector_store
# Function to create a conversation chain for the chatbot
def get_conversation_chain(vector_store, system_message:str, human_message:str) -> ConversationalRetrievalChain:
    """Build the conversational retrieval chain used by the chatbot.

    Args:
        vector_store: Store whose ``as_retriever`` supplies the context
            documents (MMR search, ``k`` = 5).
        system_message: System message placed first in the answer prompt.
        human_message: Human message placed second in the answer prompt.

    Returns:
        A ``ConversationalRetrievalChain`` with buffer memory that also
        returns its source documents.
    """
    chat_model = ChatOpenAI(model_name="gpt-4o", temperature=0)
    # output_key tells the memory which output to store, since the chain
    # also returns the source documents.
    history = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
        output_key='answer',
    )
    mmr_retriever = vector_store.as_retriever(search_type='mmr', search_kwargs={'k': 5})
    answer_prompt = ChatPromptTemplate.from_messages([system_message, human_message])
    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        retriever=mmr_retriever,
        memory=history,
        return_source_documents=True,
        combine_docs_chain_kwargs={"prompt": answer_prompt},
    )
# Function to retrieve a list of video URLs from a YouTube playlist
def get_video_list(source) -> list:
    """Return a ``https://youtu.be/<videoId>`` URL for each video in a playlist.

    Args:
        source: Playlist identifier passed to ``scrapetube.get_playlist``.

    Returns:
        List of video URLs, in the order the videos are yielded.
    """
    return [
        "https://youtu.be/" + str(entry['videoId'])
        for entry in scrapetube.get_playlist(source)
    ]
# Generator that yields the text one word at a time, pausing briefly after each word
def stream_data(text, delay=0.02):
    """Yield ``text`` one word at a time, pausing after each word.

    The text is split on single spaces and every piece is yielded with a
    trailing space, so joining the output gives ``text + " "``.

    Args:
        text: The string to stream.
        delay: Seconds to sleep after each yielded word. Defaults to 0.02;
            a value of 0 or less disables the pause.

    Yields:
        Each space-separated piece of ``text`` followed by a space.
    """
    for word in text.split(" "):
        yield word + " "
        if delay > 0:
            time.sleep(delay)
# Function to extract sources from the results of a query
def get_sources(results):
    """Map each source document's URL to a display title.

    Args:
        results: Mapping with a ``'source_documents'`` entry whose items
            expose a ``metadata`` dict containing ``'url'`` and, optionally,
            ``'title'``.

    Returns:
        Dict of ``url -> title``. Documents without a title get the book
        title as a fallback; a later document with the same URL overrides
        an earlier one.
    """
    book_title = 'Office of Cards: Una guida pratica per raggiungere il successo e la felicità nelle grandi aziende (e nella vita)'
    return {
        doc.metadata['url']: doc.metadata.get('title', book_title)
        for doc in results['source_documents']
    }
# Function to read old video IDs from a CSV file
def read_old_videos(csv_file):
    """Read the previously saved video IDs from the first row of a CSV file.

    Args:
        csv_file: Path of the CSV file to read.

    Returns:
        The fields of the first row as a list of strings, or an empty list
        when the file contains no rows.
    """
    with open(csv_file, newline='') as f:
        # Only the first row is used, so the rest of the file is not read;
        # an empty file gives [] instead of raising IndexError.
        return next(csv.reader(f), [])
# Function to save video IDs to a CSV file
def save_video_list(name, video_exp):
    """Write the video IDs as a single, fully quoted CSV row.

    Args:
        name: Path of the CSV file; it is overwritten if it exists.
        video_exp: Sequence of video IDs written as one row.
    """
    with open(name, 'w', newline='') as out_file:
        csv.writer(out_file, quoting=csv.QUOTE_ALL).writerow(video_exp)
# Check if there are new videos in the channel and update the vectorstore
def check_and_update_new_videos(channel_id, old_videos_file, vectorstore):
    """Add videos published since the last check to the vector store.

    The channel's current video IDs are compared with the IDs stored in
    ``old_videos_file``. Videos not seen before are added to ``vectorstore``
    and, once that succeeded, the file is rewritten with the current IDs.
    When there is nothing new, neither the store nor the file is touched.

    Args:
        channel_id: Channel identifier passed to ``scrapetube.get_channel``.
        old_videos_file: Path of the CSV file holding the known video IDs.
        vectorstore: Vector store to update.

    Returns:
        The vector store (updated if new videos were found).
    """
    # Fetching current videos from YouTube channel
    current_videos_id = [video['videoId'] for video in scrapetube.get_channel(channel_id)]
    known_ids = set(read_old_videos(old_videos_file))

    # dict.fromkeys de-duplicates while keeping the channel's listing order,
    # so the videos are added in a deterministic order.
    new_ids = [
        video_id
        for video_id in dict.fromkeys(current_videos_id)
        if video_id not in known_ids
    ]
    if new_ids:
        videos_to_add = ["https://youtu.be/" + str(video_id) for video_id in new_ids]
        vectorstore = update_chroma_youtube(vectorstore, videos_to_add)
        # Persist the ID list only after the store update went through.
        save_video_list(old_videos_file, current_videos_id)
    return vectorstore