-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
515 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
from langchain_community.document_loaders import ArxivLoader | ||
from langchain_core.documents.base import Document | ||
from transformers import pipeline | ||
|
||
# Hugging Face model used by find_keywords() for token classification.
# Candidate models:
# - yanekyuk/bert-uncased-keyword-extractor: very fast, but does not always
#   find tokens. Perhaps the correct choice for general questions.
# - ml6team/keyphrase-extraction-distilbert-inspec: use this model for
#   science-y questions.
TOKEN_CLASSIFIER = "ml6team/keyphrase-extraction-distilbert-inspec"
|
||
def supplementary_info(question_text: str, answer_text: str):
    """Describe an arXiv article related to the user's question.

    Keywords are extracted from ``question_text`` with ``find_keywords`` and
    passed to ``find_arxiv_document`` as the arXiv search query.

    Args:
        question_text: The user's question; source of the search keywords.
        answer_text: Currently unused. Kept so existing callers keep working;
            the plan is to also turn the answer into an arXiv query.

    Returns:
        A multi-line message with the article's title, summary, authors and
        publication date, followed by two follow-up questions for the user.
    """
    # Convert question_text into a query for arXiv by extracting its topic.
    query = find_keywords(question_text)
    if query == "no-tokens-found":
        # find_keywords returns this sentinel when nothing was extracted.
        # Searching arXiv for the literal sentinel is meaningless, so fall
        # back to the raw question text instead.
        query = question_text
    document = find_arxiv_document(query)
    metadata = document.metadata

    # TODO: convert answer_text into a query for arXiv as well.

    return (
        "\n"
        "Here is an arxiv article related to your question.\n"
        f"The title is {metadata.get('Title')}.\n"
        f"The summary is {metadata.get('Summary')}.\n"
        f"The authors are {metadata.get('Authors')}.\n"
        f"It was published in {metadata.get('Published')}.\n"
        "Would you like to hear the article?\n"
        "Would you like to hear a summary of a different article?\n"
    )
|
||
|
||
# Loaded pipelines keyed by model name, so each model is loaded only once
# per process instead of on every find_keywords() call.
_KEYWORD_PIPELINES = {}


def _keyword_pipeline(model_name):
    """Return the token-classification pipeline for ``model_name``, loading it on first use."""
    if model_name not in _KEYWORD_PIPELINES:
        _KEYWORD_PIPELINES[model_name] = pipeline(
            "token-classification",
            model=model_name,
            # Without aggregation the pipeline returns raw word-piece tokens
            # (e.g. "##ment"); "simple" merges them into whole phrases.
            aggregation_strategy="simple",
        )
    return _KEYWORD_PIPELINES[model_name]


def find_keywords(text) -> str:
    """Extract up to three keyphrases from ``text``.

    Args:
        text: The text to extract keyphrases from.

    Returns:
        The first three keyphrases found (in order of appearance), joined by
        single spaces. If none are found, the sentinel string
        ``"no-tokens-found"``.
    """
    entities = _keyword_pipeline(TOKEN_CLASSIFIER)(text)
    if not entities:
        return "no-tokens-found"
    return " ".join(entity.get("word").strip() for entity in entities[:3])
|
||
def find_arxiv_document(query_string) -> Document:
    """Return the top arXiv search result for ``query_string``.

    Args:
        query_string: Search query passed to ``ArxivLoader``.

    Returns:
        The first document returned by the loader.

    Raises:
        IndexError: If the search returns no documents.
    """
    docs = ArxivLoader(query=query_string, load_max_docs=1).load()
    if not docs:
        # Same exception type as a bare docs[0] would raise, but with a
        # message that names the query that produced no results.
        raise IndexError(f"no arXiv documents found for query {query_string!r}")
    # Consider adding a layer of LLM-comparison here: the LLM skims the paper
    # summaries in reference to the question and then selects the most
    # relevant summary, or says that no relevant summaries were found.
    # For now, select and return the top article.
    return docs[0]
# TODO: use the LangChain arXiv retriever to put documents into a vector DB.
# TODO: retrieve arXiv texts from the vector DB when the user asks a question
#       that would require an arXiv text.
#       Do the same thing for topics that require data from other sources.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
def classify_question_type(question_text: str) -> str:
    """Guess at the kind of supplementary information the user would appreciate.

    Some example types: research paper, medium article, recipe, news story.

    NOTE: This is a placeholder. ``question_text`` is not inspected yet and
    the result is always ``"research_paper"``.

    Args:
        question_text: The user's question (currently unused).

    Returns:
        The question type; currently always ``"research_paper"``.
    """
    # How to implement? Options:
    # - (preferred) Use vector-similarity to find the closest match in a list
    #   of possible answers.
    # - Use a pre-trained model to classify the kind of question.
    # - Use a general purpose LLM to classify the kind of question.
    return "research_paper"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.