diff --git a/graph/component/__init__.py b/graph/component/__init__.py
index 611289c648..9e723a26f5 100644
--- a/graph/component/__init__.py
+++ b/graph/component/__init__.py
@@ -12,6 +12,7 @@
 from .baidu import Baidu, BaiduParam
 from .duckduckgo import DuckDuckGo, DuckDuckGoParam
 from .wikipedia import Wikipedia, WikipediaParam
+from .pubmed import PubMed, PubMedParam
 
 
 def component_class(class_name):
diff --git a/graph/component/pubmed.py b/graph/component/pubmed.py
new file mode 100644
index 0000000000..23abd8139d
--- /dev/null
+++ b/graph/component/pubmed.py
@@ -0,0 +1,104 @@
+#
+#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import random
+from abc import ABC
+from functools import partial
+from Bio import Entrez
+import pandas as pd
+import xml.etree.ElementTree as ET
+from graph.settings import DEBUG
+from graph.component.base import ComponentBase, ComponentParamBase
+
+
+class PubMedParam(ComponentParamBase):
+    """
+    Define the PubMed component parameters.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # Upper bound on the number of articles requested (esearch retmax).
+        self.top_n = 5
+        # Contact address handed to Entrez; this default is only a placeholder.
+        self.email = "A.N.Other@example.com"
+
+    def check(self):
+        """Validate that ``top_n`` is a positive integer."""
+        self.check_positive_integer(self.top_n, "Top N")
+
+
+def _node_text(node):
+    """Return the full text of ``node``, nested markup included ("" for None)."""
+    if node is None:
+        return ""
+    return "".join(node.itertext()).strip()
+
+
+def _format_article(article):
+    """Render one ``PubmedArticle`` element as a Title/Url/Abstract string."""
+    title = _node_text(article.find("MedlineCitation/Article/ArticleTitle"))
+    # An abstract may be split into several AbstractText sections, and many
+    # records have none at all; neither case may abort the whole search.
+    sections = [_node_text(part) for part in
+                article.findall("MedlineCitation/Article/Abstract/AbstractText")]
+    abstract = " ".join(part for part in sections if part) or "No abstract available"
+    pmid = _node_text(article.find("MedlineCitation/PMID"))
+    url = ("https://pubmed.ncbi.nlm.nih.gov/" + pmid + "/") if pmid else ""
+    return 'Title:' + title + '\nUrl:' + url + '\n' + 'Abstract:' + abstract
+
+
+class PubMed(ComponentBase, ABC):
+    """Component that searches PubMed with the text it receives as input."""
+
+    component_name = "PubMed"
+
+    def _run(self, history, **kwargs):
+        """
+        Search PubMed for the joined input content.
+
+        Returns a DataFrame with one "content" row per article found, or
+        ``PubMed.be_output("")`` when the input is empty or nothing matched.
+        ``history`` and ``kwargs`` are not used here.
+        """
+        ans = self.get_input()
+        ans = " - ".join(ans["content"]) if "content" in ans else ""
+        if not ans:
+            return PubMed.be_output("")
+
+        Entrez.email = self._param.email
+        handle = Entrez.esearch(db='pubmed', retmax=self._param.top_n, term=ans)
+        try:
+            pubmedids = Entrez.read(handle)['IdList']
+        finally:
+            handle.close()
+        # Nothing matched: skip efetch, which needs at least one id.
+        if not pubmedids:
+            return PubMed.be_output("")
+
+        handle = Entrez.efetch(db='pubmed', id=",".join(pubmedids), retmode="xml")
+        try:
+            pubmedcnt = ET.fromstring(handle.read().decode("utf-8"))
+        finally:
+            handle.close()
+
+        pubmed_res = [{"content": _format_article(child)} for child in pubmedcnt.findall("PubmedArticle")]
+        if not pubmed_res:
+            return PubMed.be_output("")
+
+        df = pd.DataFrame(pubmed_res)
+        if DEBUG:
+            print(df, ":::::::::::::::::::::::::::::::::")
+        return df
diff --git a/requirements.txt b/requirements.txt
index 9d02e96486..8f9fa545a1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 Aspose.Slides==24.2.0
 BCEmbedding==0.1.3
+Bio==1.7.1
 boto3==1.34.140
 botocore==1.34.140
 cachetools==5.3.3
diff --git a/requirements_arm.txt b/requirements_arm.txt
index 579c3fc1fc..23895815fc 100644
--- a/requirements_arm.txt
+++ b/requirements_arm.txt
@@ -151,3 +151,4 @@ duckduckgo_search==6.1.9
 google-generativeai==0.7.2
 groq==0.9.0
 wikipedia==1.4.0
+Bio==1.7.1
diff --git a/requirements_dev.txt b/requirements_dev.txt
index a1ea8b35df..77b8bd6195 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -136,3 +136,4 @@ duckduckgo_search==6.1.9
 google-generativeai==0.7.2
 groq==0.9.0
 wikipedia==1.4.0
+Bio==1.7.1