Skip to content

Commit

Permalink
Merge branch 'newArticle' into prod
Browse files Browse the repository at this point in the history
  • Loading branch information
seplee committed Apr 13, 2024
2 parents 1d11f20 + 1d777fd commit 4f118f0
Show file tree
Hide file tree
Showing 11 changed files with 515 additions and 19 deletions.
3 changes: 2 additions & 1 deletion api/backend/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
app = Flask(__name__)
app.secret_key = urandom(24)

print(environ["REQUESTS_CA_BUNDLE"])
GOOGLE_CLIENT_ID = environ["GOOGLE_CLIENT_ID"]
GOOGLE_CLIENT_SECRET = environ["GOOGLE_CLIENT_SECRET"]
GOOGLE_DISCOVERY_URL = (
Expand Down Expand Up @@ -269,7 +270,7 @@ def get_audio_for_text(data: ReadTextRequestDto):


@app.route("/api/ask", methods=["POST"])
@login_required
# @login_required
@validate_file_on_request("question.wav")
@validate_request_form(QuestionTextRequestSchema())
def answer_question(file_data: IO, data: QuestionTextRequestDto):
Expand Down
Empty file.
59 changes: 59 additions & 0 deletions api/backend/dataloaders/arxiv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from functools import lru_cache

from langchain_community.document_loaders import ArxivLoader
from langchain_core.documents.base import Document
from transformers import pipeline

TOKEN_CLASSIFIER = "ml6team/keyphrase-extraction-distilbert-inspec"
# yanekyuk/bert-uncased-keyword-extractor -- this model is very fast but does not always find tokens. Perhaps it is the correct choice for general questions
# ml6team/keyphrase-extraction-distilbert-inspec -- use this model for science-y questions:

def supplementary_info(question_text: str, answer_text: str):
    """Return a spoken-style blurb describing an arxiv article related to the question.

    The question text is mined for keywords, which are used as the arxiv
    search query; the top matching article's metadata is folded into a
    follow-up message offering to read the article.

    :param question_text: the user's question; source of the search keywords
    :param answer_text: the generated answer; currently unused (see TODO)
    :return: a multi-line string suitable for text-to-speech
    """
    # Convert question_text into a query for arxiv by extracting its topic keywords.
    query = find_keywords(question_text)
    document = find_arxiv_document(query)

    # TODO: also convert answer_text into a query for arxiv.

    return f"""
    Here is an arxiv article related to your question.
    The title is {document.metadata.get("Title")}.
    The summary is {document.metadata.get("Summary")}.
    The authors are {document.metadata.get("Authors")}.
    It was published in {document.metadata.get("Published")}.
    Would you like to hear the article?
    Would you like to hear a summary of a different article?
    """


@lru_cache(maxsize=1)
def _keyword_pipeline():
    """Load the token-classification pipeline once and reuse it across calls.

    Model loading is expensive; the original code rebuilt the pipeline on
    every invocation of find_keywords.
    """
    return pipeline("token-classification", model=TOKEN_CLASSIFIER)


def find_keywords(text) -> str:
    """Extract the top keywords from *text* for use as a search query.

    If keywords are found, returns a string containing up to the top three
    keywords from the text, joined by spaces.
    If no keywords are found, returns the sentinel string "no-tokens-found".
    """
    keywords = _keyword_pipeline()(text)

    if not keywords:
        return "no-tokens-found"
    # Keep at most the first three detected keyphrases.
    return " ".join(k.get("word") for k in keywords[:3])

def find_arxiv_document(query_string) -> Document:
    """Return the top arxiv document matching *query_string*.

    :param query_string: free-text search query for arxiv
    :raises IndexError: if arxiv returns no documents for the query
        (reachable, e.g., when find_keywords yields "no-tokens-found")
    """
    docs = ArxivLoader(query=query_string, load_max_docs=1).load()
    # Guard the empty-result case with a descriptive message instead of a
    # bare subscript failure; keep IndexError so existing callers still match.
    if not docs:
        raise IndexError(f"no arxiv documents found for query {query_string!r}")
    # consider adding a layer of LLM-comparison here.
    # LLM skims the paper summaries in reference to the question and then selects the most relevant summary.
    # Or, it says that no relevant summaries were found.
    # for now, select and return the top article.
    return docs[0]
# use langchain arxiv retriever to put documents into a vector db



# retrieve arxiv texts from vector db when user asks a question that would require an arxiv text
# ^^ do the same thing for topics that require data from other sources
2 changes: 1 addition & 1 deletion api/backend/handlers/answer_question.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def answer_question_handler(question_audio: IO, data: QuestionTextRequestDto):
# Or would you like to hear a summary of a different article"

text_to_speech = build_text_to_speech()
audio_content = text_to_speech(answer_text, data.emotion, data.speed)
audio_content = text_to_speech(answer_text + followup_answer, data.emotion, data.speed)

file_contents = io.BytesIO(audio_content)
file_contents.seek(0)
Expand Down
11 changes: 11 additions & 0 deletions api/backend/handlers/classify_question.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
def classify_question_type(question_text: str):
    """Guess the kind of supplementary information the user would appreciate.

    Example types: research paper, medium article, recipe, news story.
    """
    # Implementation ideas, in rough order of preference:
    #   1. 🏆 vector-similarity match against a list of candidate answer types
    #   2. a pre-trained model that classifies the kind of question
    #   3. a general-purpose LLM prompted to classify the kind of question
    # For now, every question is treated as a research-paper question.
    return "research_paper"
1 change: 1 addition & 0 deletions api/backend/llm/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def build_text_to_speech():
return modal_coqui_text_to_speech
raise Exception("Unsupported TEXT_TO_SPEECH_MODEL value")


def build_supplementary_info():
match question_model:
case "research_paper":
Expand Down
5 changes: 5 additions & 0 deletions api/cloud/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ RUN groupadd --gid 1010 socketWriters
&& sed -i '1s;^;source ./.venv/bin/activate\n;' /home/appUser/.bashrc \
&& chown -R appUser /app

# Satisfy the Netskope TLS proxy by trusting its CA bundle.
# NOTE(review): PYTHONHTTPSVERIFY=0 below disables HTTPS certificate
# verification process-wide — confirm this is intended outside the proxy.
COPY cloud/nscacert.pem /etc/ssl/certs/nscacert.pem
ENV REQUESTS_CA_BUNDLE="/etc/ssl/certs/nscacert.pem"
ENV PYTHONHTTPSVERIFY=0

USER appUser
ENV PATH="${PATH}:/home/appUser/.local/bin"
RUN pip install --upgrade pip \
Expand Down
24 changes: 12 additions & 12 deletions api/cloud/nginx.conf
Original file line number Diff line number Diff line change
Expand Up @@ -28,21 +28,21 @@ http {
'' close;
}

# server {
# listen 80;
# server_name drive-gooder.com;
# return 301 https://drive-gooder.com$request_uri;
# }
server {
listen 80;
server_name drive-gooder.com;
return 301 https://drive-gooder.com$request_uri;
}

server {
listen 3000;
# listen 443 ssl;
# listen 80;
listen 443 ssl;
listen 80;
server_name drive-gooder.com;

# proxy_ssl_server_name on;
# ssl_certificate /etc/nginx/certs/cert.pem;
# ssl_certificate_key /etc/nginx/certs/key.pem;
proxy_ssl_server_name on;
ssl_certificate /etc/nginx/certs/cert.pem;
ssl_certificate_key /etc/nginx/certs/key.pem;
access_log /var/log/nginx/data-access.log combined;

location /nginx-healthcheck {
Expand Down Expand Up @@ -95,8 +95,8 @@ http {
server {
listen 5003 default_server;
listen [::]:5003 default_server;
server_name localhost;
# does this change to drive-gooder.com?
# does localhost change to drive-gooder.com?
server_name localhost;
root /var/www/html;

location / {
Expand Down
Loading

0 comments on commit 4f118f0

Please sign in to comment.