Merge branch 'main' into prod
seplee committed Apr 20, 2024
2 parents 484678e + 4c58cfb commit e4def43
Showing 7 changed files with 587 additions and 31 deletions.
29 changes: 5 additions & 24 deletions api/backend/dataloaders/arxiv.py
@@ -1,18 +1,17 @@
 from langchain_community.document_loaders import ArxivLoader
 from langchain_core.documents.base import Document
-from transformers import pipeline
-
-TOKEN_CLASSIFIER = "ml6team/keyphrase-extraction-distilbert-inspec"
+import modal
 # yanekyuk/bert-uncased-keyword-extractor -- this model is very fast but does not always find tokens. Perhaps it is the correct choice for general questions
 # ml6team/keyphrase-extraction-distilbert-inspec -- use this model for science-y questions:
 
 def supplementary_info(question_text: str, answer_text: str):
 
+    find_keywords = modal.Function.lookup("find_keyword", "find_keywords")
     # convert question_text into a query for arxiv
     # extract topic from string
-    query = find_keywords(question_text)
+    query = find_keywords.remote(question_text)
     document = find_arxiv_document(query)
 
 
     # convert answer_text into a query for arxiv
@@ -26,24 +25,6 @@ def supplementary_info(question_text: str, answer_text: str):
     Would you like to hear a summary of a different article?"
     """
 
-
-def find_keywords(text) -> str:
-    """
-    If keywords are found, find_keywords returns a string containing the top three keywords from the text.
-    If no keywords are found, find_keywords returns a string containing the word "no-tokens-found".
-    """
-    pipe = pipeline("token-classification", model=TOKEN_CLASSIFIER)
-    keywords = pipe(text)
-
-    if len(keywords) == 0:
-        return "no-tokens-found"
-    if len(keywords) > 3:
-        keywords = keywords[:3]
-    print(keywords)
-    keywords = [k.get("word") for k in keywords]
-    keywords = " ".join(keywords)
-    return keywords
-
 def find_arxiv_document(query_string) -> Document:
     docs = ArxivLoader(query=query_string, load_max_docs=1).load()
     # consider adding a layer of LLM-comparison here.
@@ -56,4 +37,4 @@ def find_arxiv_document(query_string) -> Document:
 
 
 # retrieve arxiv texts from vector db when user asks a question that would require an arxiv text
-# ^^ do the same thing for topics that require data from other sources
+# ^^ do the same thing for topics that require data from other sources
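Taken together, the change moves keyword extraction out of the API process: arxiv.py no longer imports transformers, and instead looks up the deployed Modal function and calls it with .remote(). A minimal sketch of the whole flow, assuming the "find_keyword" app is already deployed and that ArxivLoader documents carry Title/Authors/Summary metadata (the helper name arxiv_followup and the exact metadata keys are assumptions, not code from this repo):

import modal
from langchain_community.document_loaders import ArxivLoader

def arxiv_followup(question_text: str) -> str:
    # hypothetical helper: keyword extraction runs remotely on Modal's GPU worker
    find_keywords = modal.Function.lookup("find_keyword", "find_keywords")
    query = find_keywords.remote(question_text)
    if query == "no-tokens-found":
        return ""
    docs = ArxivLoader(query=query, load_max_docs=1).load()
    if not docs:
        return ""
    meta = docs[0].metadata
    # mirrors the follow-up message drafted in the docstring above
    return (
        f'Here is an arxiv article related to your question. '
        f'The title is "{meta.get("Title", "")}", the authors are '
        f'"{meta.get("Authors", "")}", and the summary is "{meta.get("Summary", "")}". '
        f'Would you like to hear a summary of a different article?'
    )

One payoff of the split is that torch and transformers can be dropped from the API container entirely; they are only installed in the Modal image defined below.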
9 changes: 5 additions & 4 deletions api/backend/handlers/answer_question.py
@@ -23,17 +23,18 @@ def answer_question_handler(question_audio: IO, data: QuestionTextRequestDto):
     ask_question = build_ask_question()
     answer_text = ask_question(question_text, data.text)
 
-    # question_type = classify_question_type(question_text)
-    # supplementary_info = build_supplementary_info(question_type)
-    # followup_answer = supplementary_info(question_text, answer_text)
+    question_type = classify_question_type(question_text)
+    supplementary_info = build_supplementary_info(question_type)
+    followup_answer = supplementary_info(question_text, answer_text)
+    print(followup_answer)
     # add follow-up answer:
     # "here is an arxiv article related to your question.
     # The title is "". The summary is "" and the authors are "".
     # Would you like to hear the article?
     # Or would you like to hear a summary of a different article"
 
     text_to_speech = build_text_to_speech()
-    audio_content = text_to_speech(answer_text, data.emotion, data.speed)
+    audio_content = text_to_speech(answer_text + followup_answer, data.emotion, data.speed)
 
     file_contents = io.BytesIO(audio_content)
     file_contents.seek(0)
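The handler now depends on classify_question_type and build_supplementary_info, neither of which appears in this diff. A hypothetical sketch of the dispatch they imply — the classification heuristic and import path are assumptions:

# hypothetical: the real classify_question_type / build_supplementary_info are not shown in this commit
from backend.dataloaders.arxiv import supplementary_info as arxiv_supplementary_info  # assumed path

def classify_question_type(question_text: str) -> str:
    # placeholder heuristic; could equally be an LLM call
    science_markers = ("paper", "study", "research", "theory")
    return "science" if any(m in question_text.lower() for m in science_markers) else "general"

def build_supplementary_info(question_type: str):
    # factory: return a (question_text, answer_text) -> str provider per question type
    if question_type == "science":
        return arxiv_supplementary_info
    return lambda question_text, answer_text: ""

Note that answer_text + followup_answer is concatenated unconditionally, so whatever provider build_supplementary_info returns must always yield a string; an empty string is the natural no-op.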
56 changes: 56 additions & 0 deletions api/backend/handlers/find_keywords.py
@@ -0,0 +1,56 @@
+import modal
+
+app = modal.App(
+    "find_keyword"
+)
+
+def download_model_to_image(model_dir, model_name):
+    import os
+    from huggingface_hub import snapshot_download
+    from transformers.utils import move_cache
+
+    os.makedirs(model_dir, exist_ok=True)
+
+    snapshot_download(
+        model_name,
+        local_dir=model_dir
+    )
+    move_cache()
+
+TOKEN_CLASSIFIER = "ml6team/keyphrase-extraction-distilbert-inspec"
+MODEL_DIR = "/token_classifier"
+transformers_image = (
+    modal.Image.debian_slim()
+    .pip_install(
+        "torch==2.2.2",
+        "transformers==4.40.0",
+        "hf-transfer==0.1.6",
+        "huggingface_hub==0.22.2",
+    )
+    .run_function(download_model_to_image,
+        timeout=60 * 4,
+        kwargs={
+            "model_dir": MODEL_DIR,
+            "model_name": TOKEN_CLASSIFIER,
+        }
+    )
+)
+
+@app.function(image=transformers_image, gpu="a10g")
+def find_keywords(text) -> str:
+    """
+    If keywords are found, find_keywords returns a string containing the top three keywords from the text.
+    If no keywords are found, find_keywords returns a string containing the word "no-tokens-found".
+    """
+    from transformers import pipeline
+    pipe = pipeline("token-classification", model=MODEL_DIR)
+    keywords = pipe(text)
+
+    if len(keywords) == 0:
+        return "no-tokens-found"
+    if len(keywords) > 3:
+        keywords = keywords[:3]
+    print(keywords)
+    keywords = [k.get("word") for k in keywords]
+    keywords = " ".join(keywords)
+    return keywords
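Because the snapshot is downloaded inside run_function, the model weights are baked into the image at build time, so cold starts skip the Hugging Face download. A hedged usage sketch — deploy the app once, then any client can look it up, exactly as arxiv.py now does (the file path in the deploy command is taken from this diff; the sample question and output are illustrative only):

# after running: modal deploy api/backend/handlers/find_keywords.py
import modal

find_keywords = modal.Function.lookup("find_keyword", "find_keywords")
result = find_keywords.remote("How does attention help transformers model long sequences?")
print(result)  # e.g. "attention transformers sequences", or "no-tokens-found"

One caveat worth flagging: a token-classification pipeline called without an aggregation_strategy returns individual sub-word tokens, so the joined keyword string can contain fragments like "##former"; passing aggregation_strategy="simple" to pipeline() would group them back into whole words.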
2 changes: 1 addition & 1 deletion api/cloud/Dockerfile
@@ -24,7 +24,7 @@ RUN groupadd --gid 1010 socketWriters
 # create appUser
 # gid 1010 is socketWriters
 # uid 1007 will own the /app dir
-# Coqui saves models to the user's home dir, so we need to create a user with a home dir
+# we need to create a user with a home dir
 && useradd -u 1007 -g 1010 -m -p "$(cat password)" appUser \
 && shred -u password \
 # set HOME on non-login user changes for appUser:
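For context, the pattern this fragment belongs to — a fixed-uid, non-login user with a real home directory, created inside one RUN chain — looks roughly like the sketch below; everything outside the lines shown in the diff is an assumption:

# hedged sketch; only the middle lines appear in this commit's diff
RUN groupadd --gid 1010 socketWriters \
    # uid 1007 will own the /app dir; -m creates /home/appUser
    && useradd -u 1007 -g 1010 -m -p "$(cat password)" appUser \
    && shred -u password
ENV HOME=/home/appUser
USER appUser

The shred -u in the same chain removes the password file as soon as useradd has consumed it.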
3 changes: 2 additions & 1 deletion api/cloud/nginx.conf
@@ -28,6 +28,7 @@ http {
         '' close;
     }
 
+    # keep this in place so that we can dev against the docker container locally
     server {
         listen 80;
         server_name drive-gooder.com;
@@ -36,7 +37,7 @@ http {
 
     server {
         listen 3000;
-        listen 443 ssl;
+        listen 443 ssl; # keep this line so that we can dev against the docker container locally
        server_name drive-gooder.com;
 
         proxy_ssl_server_name on;
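The comments pin down why both listeners stay: port 3000 serves plain HTTP for local docker development, while 443 carries production TLS on the same server block. A minimal sketch of the block they describe, with the certificate directives (not part of this diff) shown as assumed placeholders:

# hedged sketch; certificate paths are placeholders, not from this repo
server {
    listen 3000;        # plain HTTP so the container can be exercised locally
    listen 443 ssl;     # TLS for production traffic
    server_name drive-gooder.com;

    ssl_certificate     /path/to/fullchain.pem;  # assumed
    ssl_certificate_key /path/to/privkey.pem;    # assumed

    proxy_ssl_server_name on;
}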