Skip to content

Commit

Permalink
Merge branch 'newArticle' into prod
Browse files Browse the repository at this point in the history
  • Loading branch information
seplee committed Apr 13, 2024
2 parents 1d11f20 + 1d777fd commit 4f118f0
Show file tree
Hide file tree
Showing 11 changed files with 515 additions and 19 deletions.
3 changes: 2 additions & 1 deletion api/backend/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
app = Flask(__name__)
app.secret_key = urandom(24)

print(environ["REQUESTS_CA_BUNDLE"])
GOOGLE_CLIENT_ID = environ["GOOGLE_CLIENT_ID"]
GOOGLE_CLIENT_SECRET = environ["GOOGLE_CLIENT_SECRET"]
GOOGLE_DISCOVERY_URL = (
Expand Down Expand Up @@ -269,7 +270,7 @@ def get_audio_for_text(data: ReadTextRequestDto):


@app.route("/api/ask", methods=["POST"])
@login_required
# @login_required
@validate_file_on_request("question.wav")
@validate_request_form(QuestionTextRequestSchema())
def answer_question(file_data: IO, data: QuestionTextRequestDto):
Expand Down
Empty file.
59 changes: 59 additions & 0 deletions api/backend/dataloaders/arxiv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from functools import lru_cache

from langchain_community.document_loaders import ArxivLoader
from langchain_core.documents.base import Document
from transformers import pipeline

TOKEN_CLASSIFIER = "ml6team/keyphrase-extraction-distilbert-inspec"
# yanekyuk/bert-uncased-keyword-extractor -- this model is very fast but does not always find tokens. Perhaps it is the correct choice for general questions
# ml6team/keyphrase-extraction-distilbert-inspec -- use this model for science-y questions:

def supplementary_info(question_text: str, answer_text: str):
    """Return a spoken-style blurb describing an arxiv article related to the question.

    The question text is mined for keywords, which are used as the arxiv
    search query; the top matching article's metadata is folded into a
    follow-up message offering to read the article.

    :param question_text: the user's question; source of the search keywords
    :param answer_text: the generated answer; currently unused (see TODO)
    :return: a multi-line string suitable for text-to-speech
    """
    # Convert question_text into a query for arxiv by extracting its topic keywords.
    query = find_keywords(question_text)
    document = find_arxiv_document(query)

    # TODO: also convert answer_text into a query for arxiv.

    return f"""
    Here is an arxiv article related to your question.
    The title is {document.metadata.get("Title")}.
    The summary is {document.metadata.get("Summary")}.
    The authors are {document.metadata.get("Authors")}.
    It was published in {document.metadata.get("Published")}.
    Would you like to hear the article?
    Would you like to hear a summary of a different article?
    """


@lru_cache(maxsize=1)
def _keyword_pipeline():
    """Load the token-classification pipeline once and reuse it across calls.

    Model loading is expensive; the original code rebuilt the pipeline on
    every invocation of find_keywords.
    """
    return pipeline("token-classification", model=TOKEN_CLASSIFIER)


def find_keywords(text) -> str:
    """Extract the top keywords from *text* for use as a search query.

    If keywords are found, returns a string containing up to the top three
    keywords from the text, joined by spaces.
    If no keywords are found, returns the sentinel string "no-tokens-found".
    """
    keywords = _keyword_pipeline()(text)

    if not keywords:
        return "no-tokens-found"
    # Keep at most the first three detected keyphrases.
    return " ".join(k.get("word") for k in keywords[:3])

def find_arxiv_document(query_string) -> Document:
    """Return the top arxiv document matching *query_string*.

    :param query_string: free-text search query for arxiv
    :raises IndexError: if arxiv returns no documents for the query
        (reachable, e.g., when find_keywords yields "no-tokens-found")
    """
    docs = ArxivLoader(query=query_string, load_max_docs=1).load()
    # Guard the empty-result case with a descriptive message instead of a
    # bare subscript failure; keep IndexError so existing callers still match.
    if not docs:
        raise IndexError(f"no arxiv documents found for query {query_string!r}")
    # consider adding a layer of LLM-comparison here.
    # LLM skims the paper summaries in reference to the question and then selects the most relevant summary.
    # Or, it says that no relevant summaries were found.
    # for now, select and return the top article.
    return docs[0]
# use langchain arxiv retriever to put documents into a vector db



# retrieve arxiv texts from vector db when user asks a question that would require an arxiv text
# ^^ do the same thing for topics that require data from other sources
2 changes: 1 addition & 1 deletion api/backend/handlers/answer_question.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def answer_question_handler(question_audio: IO, data: QuestionTextRequestDto):
# Or would you like to hear a summary of a different article"

text_to_speech = build_text_to_speech()
audio_content = text_to_speech(answer_text, data.emotion, data.speed)
audio_content = text_to_speech(answer_text + followup_answer, data.emotion, data.speed)

file_contents = io.BytesIO(audio_content)
file_contents.seek(0)
Expand Down
11 changes: 11 additions & 0 deletions api/backend/handlers/classify_question.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
def classify_question_type(question_text: str):
    """Guess the kind of supplementary information the user would appreciate.

    Example types: research paper, medium article, recipe, news story.
    """
    # Implementation ideas, in rough order of preference:
    #   1. 🏆 vector-similarity match against a list of candidate answer types
    #   2. a pre-trained model that classifies the kind of question
    #   3. a general-purpose LLM prompted to classify the kind of question
    # For now, every question is treated as a research-paper question.
    return "research_paper"
1 change: 1 addition & 0 deletions api/backend/llm/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def build_text_to_speech():
return modal_coqui_text_to_speech
raise Exception("Unsupported TEXT_TO_SPEECH_MODEL value")


def build_supplementary_info():
match question_model:
case "research_paper":
Expand Down
5 changes: 5 additions & 0 deletions api/cloud/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ RUN groupadd --gid 1010 socketWriters
&& sed -i '1s;^;source ./.venv/bin/activate\n;' /home/appUser/.bashrc \
&& chown -R appUser /app

# Satisfy the Netskope TLS proxy by trusting its CA bundle.
# NOTE(review): PYTHONHTTPSVERIFY=0 below disables HTTPS certificate
# verification process-wide — confirm this is intended outside the proxy.
COPY cloud/nscacert.pem /etc/ssl/certs/nscacert.pem
ENV REQUESTS_CA_BUNDLE="/etc/ssl/certs/nscacert.pem"
ENV PYTHONHTTPSVERIFY=0

USER appUser
ENV PATH="${PATH}:/home/appUser/.local/bin"
RUN pip install --upgrade pip \
Expand Down
24 changes: 12 additions & 12 deletions api/cloud/nginx.conf
Original file line number Diff line number Diff line change
Expand Up @@ -28,21 +28,21 @@ http {
'' close;
}

# server {
# listen 80;
# server_name drive-gooder.com;
# return 301 https://drive-gooder.com$request_uri;
# }
server {
listen 80;
server_name drive-gooder.com;
return 301 https://drive-gooder.com$request_uri;
}

server {
listen 3000;
# listen 443 ssl;
# listen 80;
listen 443 ssl;
listen 80;
server_name drive-gooder.com;

# proxy_ssl_server_name on;
# ssl_certificate /etc/nginx/certs/cert.pem;
# ssl_certificate_key /etc/nginx/certs/key.pem;
proxy_ssl_server_name on;
ssl_certificate /etc/nginx/certs/cert.pem;
ssl_certificate_key /etc/nginx/certs/key.pem;
access_log /var/log/nginx/data-access.log combined;

location /nginx-healthcheck {
Expand Down Expand Up @@ -95,8 +95,8 @@ http {
server {
listen 5003 default_server;
listen [::]:5003 default_server;
server_name localhost;
# does this change to drive-gooder.com?
# does localhost change to drive-gooder.com?
server_name localhost;
root /var/www/html;

location / {
Expand Down
Loading

0 comments on commit 4f118f0

Please sign in to comment.