# app.py - Streamlit app for question answering over an uploaded PDF or text file.
import tempfile
import time
from pathlib import Path

import streamlit as st
from langchain.chains.question_answering import load_qa_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
from transformers import pipeline
# Page-level settings; Streamlit requires this call before any other element
# is rendered.
_page_settings = {
    "page_title": "Document QA System",
    "page_icon": ":books:",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_page_settings)

# Sidebar: app name, a one-line usage summary, a tip box, and a divider.
_sidebar = st.sidebar
_sidebar.title(":books: Document QA System")
_sidebar.write("Upload a document (PDF or text), process it, and ask questions!")
_sidebar.info("✨ **Tip:** Use this app for quick insights from long documents.")
_sidebar.markdown("---")

# Main area: heading followed by a short description of what the app does.
st.title(":mag_right: Document-Based Question Answering")
st.markdown(
    """
**Upload a PDF or text file** and **ask questions** about its content. This app uses advanced language models to provide accurate answers.
"""
)

# Upload widget; only the listed extensions can be selected.
_accepted_types = ["pdf", "txt"]
uploaded_file = st.file_uploader("Choose a file to upload:", type=_accepted_types)
@st.cache_resource(show_spinner=False)
def _load_embedding_model():
    """Return the sentence-embedding model, constructed once and then reused.

    Streamlit re-executes the whole script on every widget interaction, so an
    uncached constructor would reload the model weights each time the user
    submits a question or toggles a checkbox.
    """
    return HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


@st.cache_resource(show_spinner=False)
def _load_qa_chain():
    """Return the map-reduce QA chain backed by flan-t5-base, built once.

    NOTE(review): cached resources are shared between sessions; confirm the
    pipeline is safe for the expected level of concurrent use.
    """
    qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
    llm = HuggingFacePipeline(pipeline=qa_pipeline)
    return load_qa_chain(llm, chain_type="map_reduce")


if uploaded_file is not None:
    # Display file details
    st.success(f"**Uploaded:** {uploaded_file.name}")

    # Compare the extension case-insensitively so "REPORT.PDF" is accepted.
    suffix = Path(uploaded_file.name).suffix.lower()

    # The loaders read from a filesystem path, so spill the upload to disk.
    # A unique temporary file (instead of a fixed name in the working
    # directory) keeps concurrent sessions from overwriting each other.
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        tmp.write(uploaded_file.getbuffer())
    file_path = tmp.name

    # Progress bar
    progress_bar = st.progress(0)
    progress_bar.progress(10)

    # Load the uploaded document
    st.info("📚 Loading the document...")
    try:
        if suffix == ".pdf":
            loader = PyPDFLoader(file_path)
        elif suffix == ".txt":
            # Specify UTF-8 encoding for text files
            loader = TextLoader(file_path, encoding="utf-8")
        else:
            st.error("Unsupported file format. Please upload a PDF or text file.")
            st.stop()
        try:
            documents = loader.load()
        except Exception as e:
            st.error(f"Error loading document: {str(e)}")
            st.stop()
    finally:
        # The temporary copy is only needed while loading; remove it on every
        # exit path, including the st.stop() calls above.
        Path(file_path).unlink(missing_ok=True)
    progress_bar.progress(30)

    # Split the document into chunks
    st.info("✂️ Splitting the document into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_documents(documents)
    if not chunks:
        # Nothing to index (e.g. an empty file or a PDF without a text layer);
        # stop here with a readable message instead of failing inside FAISS.
        st.error("No extractable text was found in the document.")
        st.stop()
    progress_bar.progress(60)

    # Build the FAISS index over the chunk embeddings
    st.info("🔍 Creating embeddings for efficient search...")
    embedding = _load_embedding_model()
    faiss_index = FAISS.from_documents(chunks, embedding)
    progress_bar.progress(80)

    # Set up the HuggingFace LLM pipeline and QA chain (cached across reruns)
    st.info("🤖 Initializing the question-answering model...")
    qa_chain = _load_qa_chain()
    progress_bar.progress(100)
    st.success("🎉 Document processed and ready for questions!")

    # Question input from user
    st.markdown("---")
    question = st.text_input("**Ask a question:**")

    if question:
        def ask_question_with_retry(question, retries=3, delay=5):
            """Answer *question* from the indexed document, retrying on failure.

            Args:
                question: The user's question text.
                retries: Maximum number of attempts.
                delay: Seconds to wait between failed attempts.

            Returns:
                The value returned by the QA chain, or a fixed apology string
                if every attempt raised an exception.
            """
            for attempt in range(retries):
                try:
                    # Retrieve the 3 most relevant chunks, then run the chain
                    relevant_docs = faiss_index.similarity_search(question, k=3)
                    return qa_chain.run(input_documents=relevant_docs, question=question)
                except Exception as e:
                    print(f"Error: {str(e)}. Retrying... (Attempt {attempt + 1}/{retries})")
                    # Wait only when another attempt follows; sleeping after
                    # the last failure would just delay the fallback message.
                    if attempt < retries - 1:
                        time.sleep(delay)
            return "Sorry, I couldn't get an answer."

        with st.spinner("🔎 Finding the best answer..."):
            answer = ask_question_with_retry(question)

        # Display the answer
        st.markdown("### :speech_balloon: Answer:")
        st.success(answer)

        # Optionally display relevant chunks (debugging or additional info)
        if st.checkbox("Show relevant document chunks"):
            relevant_docs = faiss_index.similarity_search(question, k=3)
            for number, doc in enumerate(relevant_docs, start=1):
                st.markdown(f"**Chunk {number}:** {doc.page_content}")
else:
    st.warning("⚠️ Please upload a PDF or text file to get started.")