7 changes: 4 additions & 3 deletions README.md
@@ -33,7 +33,7 @@ A Model Context Protocol (MCP) server for searching and downloading academic pap

## Features

-- **Multi-Source Support**: Search and download papers from arXiv, PubMed, bioRxiv, medRxiv, Google Scholar, IACR ePrint Archive, Semantic Scholar.
+- **Multi-Source Support**: Search and download papers from arXiv, PubMed, bioRxiv, medRxiv, Google Scholar, IACR ePrint Archive, Semantic Scholar, and Scopus.
- **Standardized Output**: Papers are returned in a consistent dictionary format via the `Paper` class.
- **Asynchronous Tools**: Efficiently handles network requests using `httpx`.
- **MCP Integration**: Compatible with MCP clients for LLM context enhancement.
@@ -78,7 +78,8 @@ For users who want to quickly run the server:
"paper_search_mcp.server"
],
"env": {
"SEMANTIC_SCHOLAR_API_KEY": "" // Optional: For enhanced Semantic Scholar features
"SEMANTIC_SCHOLAR_API_KEY": "", // Optional: For enhanced Semantic Scholar features
"SCOPUS_API_KEY": "" // Add your Scopus API Key here
}
}
}
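Keys in the `env` block are exported to the server process as environment variables. As a minimal sketch (the class and constructor below are illustrative, not the actual `ScopusSearcher` API), a searcher could pick its key up like this:

```python
import os
from typing import Optional

class ScopusSearcherSketch:
    """Hypothetical sketch of env-based key handling; not the real ScopusSearcher."""

    def __init__(self, api_key: Optional[str] = None):
        # Fall back to the environment variable set in the client config above.
        self.api_key = api_key or os.environ.get("SCOPUS_API_KEY")
        if not self.api_key:
            raise ValueError("SCOPUS_API_KEY is not set; Scopus search is unavailable.")
```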
@@ -157,13 +158,13 @@ We welcome contributions! Here's how to get started:
- [√] Google Scholar
- [√] IACR ePrint Archive
- [√] Semantic Scholar
+- [√] Scopus
- [ ] PubMed Central (PMC)
- [ ] Science Direct
- [ ] Springer Link
- [ ] IEEE Xplore
- [ ] ACM Digital Library
- [ ] Web of Science
-- [ ] Scopus
- [ ] JSTOR
- [ ] ResearchGate
- [ ] CORE
30 changes: 30 additions & 0 deletions paper_search_mcp/academic_platforms/__init__.py
@@ -0,0 +1,30 @@
# paper_search_mcp/academic_platforms/__init__.py

"""
This package provides modules for searching various academic platforms.
Each module should contain a searcher class that implements a common interface
(e.g., inheriting from a base PaperSource class and having a 'search' method).
"""

from .arxiv import ArxivSearcher
from .biorxiv import BioRxivSearcher
from .google_scholar import GoogleScholarSearcher
# hub.py is not a searcher, so it's not imported here for direct use as a platform
from .iacr import IACRSearcher
from .medrxiv import MedRxivSearcher
from .pubmed import PubMedSearcher
from .scopus import ScopusSearcher
from .semantic import SemanticSearcher
from .shodhganga import ShodhgangaSearcher

__all__ = [
"ArxivSearcher",
"BioRxivSearcher", # Corrected capitalization
"GoogleScholarSearcher",
"IACRSearcher",
"MedRxivSearcher", # Corrected capitalization
"PubMedSearcher",
"ScopusSearcher",
"SemanticSearcher",
"ShodhgangaSearcher",
]
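The docstring above describes the contract each platform module is expected to satisfy. A minimal sketch of that interface, assuming a `PaperSource` base class with an abstract `search` method (`pmc.py` in this PR imports the real one from `.pubmed`):

```python
from abc import ABC, abstractmethod
from typing import List

class PaperSource(ABC):
    """Illustrative contract; each platform module exposes a searcher like this."""

    @abstractmethod
    def search(self, query: str, max_results: int = 10) -> List["Paper"]:
        """Return up to max_results papers matching the query."""
        raise NotImplementedError

class EchoSearcher(PaperSource):
    """Toy implementation, only to show the shape of the interface."""

    def search(self, query: str, max_results: int = 10) -> List["Paper"]:
        return []  # A real searcher would call its platform's API here.
```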
74 changes: 74 additions & 0 deletions paper_search_mcp/academic_platforms/hub.py
@@ -0,0 +1,74 @@
# paper_search_mcp/academic_platforms/hub.py

"""
Central hub for accessing different academic platform searchers.
This allows for dynamic instantiation of searchers based on a key.
"""

from .arxiv import ArxivSearcher
from .biorxiv import BioRxivSearcher
from .google_scholar import GoogleScholarSearcher
from .iacr import IACRSearcher
from .medrxiv import MedRxivSearcher
from .pubmed import PubMedSearcher
from .scopus import ScopusSearcher
from .semantic import SemanticSearcher
from .shodhganga import ShodhgangaSearcher

# A dictionary mapping platform names (keys) to their searcher classes.
# This allows for easy lookup and instantiation of searchers.
AVAILABLE_SEARCHERS = {
"arxiv": ArxivSearcher,
"biorxiv": BiorxivSearcher,
"google_scholar": GoogleScholarSearcher,
"iacr": IACRSearcher,
"medrxiv": MedrxivSearcher,
"pubmed": PubMedSearcher,
"scopus": ScopusSearcher,
"semantic_scholar": SemanticSearcher, # Assuming 'semantic_scholar' as key for SemanticSearcher
"shodhganga": ShodhgangaSearcher,
}
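# Example: AVAILABLE_SEARCHERS["arxiv"]() constructs a fresh ArxivSearcher;
# get_searcher() below wraps this lookup with key normalization and error handling.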

def get_searcher(platform_name: str):
"""
Returns an instance of the searcher for the given platform name.

Args:
platform_name (str): The key for the desired platform
(e.g., "arxiv", "pubmed", "shodhganga").

    Returns:
        An instance of the searcher class for the given platform.

Raises:
ValueError: If the platform_name is not recognized.
"""
platform_name = platform_name.lower()
searcher_class = AVAILABLE_SEARCHERS.get(platform_name)
if searcher_class:
return searcher_class() # Instantiate the class
else:
raise ValueError(f"Unknown platform: {platform_name}. Available platforms are: {list(AVAILABLE_SEARCHERS.keys())}")

if __name__ == '__main__':
# Example usage:
print(f"Available searcher platforms: {list(AVAILABLE_SEARCHERS.keys())}")

try:
arxiv_searcher = get_searcher("arxiv")
print(f"Successfully got searcher for 'arxiv': {type(arxiv_searcher)}")

shodhganga_searcher = get_searcher("shodhganga")
print(f"Successfully got searcher for 'shodhganga': {type(shodhganga_searcher)}")

# Test a non-existent platform
# get_searcher("nonexistent_platform")

except ValueError as e:
print(f"Error: {e}")
except ImportError as e:
print(f"ImportError: {e}. This might indicate an issue with the class names in __init__.py or the files themselves.")
print("Please ensure all Searcher classes (e.g., ArxivSearcher, PubMedSearcher) are correctly defined and imported.")

# TODO: Consider adding a more robust plugin system if the number of platforms grows significantly.
# TODO: Potentially load API keys or configurations here if needed by searchers in the future.
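Note that the new `PMCSearcher` below is not yet registered in `AVAILABLE_SEARCHERS`. Wiring a new platform into the hub would look like this sketch (the registration line is illustrative; the PR itself leaves PMC unregistered):

```python
from paper_search_mcp.academic_platforms.hub import AVAILABLE_SEARCHERS, get_searcher
from paper_search_mcp.academic_platforms.pmc import PMCSearcher

# Hypothetical registration step: hub.py does not include "pmc" yet.
AVAILABLE_SEARCHERS["pmc"] = PMCSearcher

searcher = get_searcher("pmc")  # Resolves via the same lookup as the other platforms.
for paper in searcher.search("crispr gene editing", max_results=3):
    print(paper.paper_id, paper.title)
```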
246 changes: 246 additions & 0 deletions paper_search_mcp/academic_platforms/pmc.py
@@ -0,0 +1,246 @@
# paper_search_mcp/academic_platforms/pmc.py
from typing import List
import requests
from xml.etree import ElementTree as ET
from datetime import datetime
from ..paper import Paper
import os
from .pubmed import PaperSource # Reusing PaperSource
import PyPDF2

class PMCSearcher(PaperSource):
"""Searcher for PubMed Central (PMC) papers."""
ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
PMC_PDF_URL = "https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/pdf"

def search(self, query: str, max_results: int = 10) -> List[Paper]:
"""Search PMC for papers."""
search_params = {
'db': 'pmc',
'term': query,
'retmax': max_results,
'retmode': 'xml'
}
try:
search_response = requests.get(self.ESEARCH_URL, params=search_params)
search_response.raise_for_status()
search_root = ET.fromstring(search_response.content)
except requests.RequestException as e:
print(f"Error during PMC esearch request: {e}")
return []
except ET.ParseError as e:
print(f"Error parsing PMC esearch XML response: {e}")
return []

ids = [id_node.text for id_node in search_root.findall('.//Id') if id_node.text]
if not ids:
return []

fetch_params = {
'db': 'pmc',
'id': ','.join(ids),
'retmode': 'xml'
}
try:
fetch_response = requests.get(self.EFETCH_URL, params=fetch_params)
fetch_response.raise_for_status()
fetch_root = ET.fromstring(fetch_response.content)
except requests.RequestException as e:
print(f"Error during PMC efetch request: {e}")
return []
except ET.ParseError as e:
print(f"Error parsing PMC efetch XML response: {e}")
return []

papers = []
for article_node in fetch_root.findall('.//article'): # PMC uses <article>
            pmcid = None  # Reset per article so error reporting below never reuses a stale ID
            try:
                # Extract PMCID
                pmcid_node = article_node.find(".//article-id[@pub-id-type='pmc']")
                pmcid = pmcid_node.text if pmcid_node is not None else None
if not pmcid:
continue # Skip if no PMCID

# Extract title
title_node = article_node.find(".//article-title")
                title = ("".join(title_node.itertext()).strip() or "N/A") if title_node is not None else "N/A"  # itertext() keeps text nested in inline tags

# Extract authors
authors = []
for contrib_node in article_node.findall(".//contrib[@contrib-type='author']"):
surname_node = contrib_node.find(".//name/surname")
given_names_node = contrib_node.find(".//name/given-names")
surname = surname_node.text if surname_node is not None else ""
given_names = given_names_node.text if given_names_node is not None else ""
authors.append(f"{given_names} {surname}".strip())

# Extract abstract
abstract_text = "N/A"
abstract_element = article_node.find("./front/article-meta/abstract")

if abstract_element is not None:
# Check for structured abstract (sections)
sections = abstract_element.findall("./sec")
if sections: # If <sec> tags are present, parse them
abstract_parts = []
for sec_node in sections:
                            # Join all <p> text within this <sec>; itertext() keeps text nested in inline tags like <italic>
                            sec_content_parts = [t.strip() for t in ("".join(p.itertext()) for p in sec_node.findall(".//p")) if t.strip()]
if sec_content_parts:
abstract_parts.append(" ".join(sec_content_parts))
if abstract_parts:
abstract_text = "\n".join(abstract_parts)
else:
# Try to find a single <p> directly under <abstract>
p_nodes = abstract_element.findall("./p")
if p_nodes:
                            abstract_text_parts = [t.strip() for t in ("".join(p.itertext()) for p in p_nodes) if t.strip()]
if abstract_text_parts:
abstract_text = "\n".join(abstract_text_parts)
# If no <p> directly under abstract, but abstract_element itself has text (less common)
elif abstract_element.text and abstract_element.text.strip():
abstract_text = abstract_element.text.strip()

abstract = abstract_text # Assign to the variable used later


# Extract publication date
pub_date_node = article_node.find(".//pub-date[@pub-type='epub']") # Prefer electronic pub date
if pub_date_node is None: # Fallback to print or other pub-types if epub not found
pub_date_node = article_node.find(".//pub-date[@pub-type='ppub']")
if pub_date_node is None:
pub_date_node = article_node.find(".//pub-date") # Generic fallback

year, month, day = "N/A", "N/A", "N/A"
if pub_date_node is not None:
year_node = pub_date_node.find("./year") # Use relative path
month_node = pub_date_node.find("./month")
day_node = pub_date_node.find("./day")
year = year_node.text if year_node is not None and year_node.text else "N/A"
month = month_node.text if month_node is not None and month_node.text else "01" # Default month/day
day = day_node.text if day_node is not None and day_node.text else "01"

try:
# Handle cases where year might be invalid
if year == "N/A" or not year.isdigit():
year_int = 1900 # Default year
else:
year_int = int(year)

if month == "N/A" or not month.isdigit() or not (1 <= int(month) <= 12):
month_int = 1
else:
month_int = int(month)

if day == "N/A" or not day.isdigit() or not (1 <= int(day) <= 31):
day_int = 1
else:
day_int = int(day)

published = datetime(year_int, month_int, day_int)
except ValueError:
published = datetime(1900, 1, 1) # Default for parsing errors

# Extract DOI
doi_node = article_node.find(".//article-id[@pub-id-type='doi']")
doi = doi_node.text if doi_node is not None and doi_node.text else ""

papers.append(Paper(
paper_id=pmcid, # Use PMCID as the primary ID
title=title,
authors=authors,
abstract=abstract,
url=f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmcid}/", # PMC uses PMCID in URL
pdf_url=self.PMC_PDF_URL.format(pmcid=f"PMC{pmcid}"),
published_date=published,
updated_date=published, # Assuming same as published for now
source='pmc',
categories=[], # PMC API doesn't easily provide categories
keywords=[], # PMC API doesn't easily provide keywords
doi=doi
))
except Exception as e:
print(f"Error parsing PMC article XML (PMCID: {pmcid if 'pmcid' in locals() else 'unknown'}): {e}")
return papers

def download_pdf(self, paper_id: str, save_path: str = "./downloads") -> str:
"""Download the PDF for a PMC paper."""
if not paper_id.startswith("PMC"):
paper_id = f"PMC{paper_id}"

pdf_url = self.PMC_PDF_URL.format(pmcid=paper_id)

os.makedirs(save_path, exist_ok=True)
file_path = os.path.join(save_path, f"{paper_id}.pdf")

try:
response = requests.get(pdf_url, stream=True)
response.raise_for_status()
with open(file_path, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
return file_path
except requests.RequestException as e:
raise ConnectionError(f"Failed to download PDF from {pdf_url}: {e}")

def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
"""Download and read text from a PMC paper's PDF."""
pdf_path = self.download_pdf(paper_id, save_path)

try:
with open(pdf_path, 'rb') as f:
reader = PyPDF2.PdfReader(f)
text = ""
                for page in reader.pages:
                    text += page.extract_text() or ""
return text
except Exception as e:
print(f"Error reading PDF {pdf_path}: {e}")
return "" # Return empty string on error

if __name__ == "__main__":
searcher = PMCSearcher()

print("Testing PMC search functionality...")
query = "crispr gene editing"
max_results = 3
try:
papers = searcher.search(query, max_results=max_results)
print(f"Found {len(papers)} papers for query '{query}':")
for i, paper in enumerate(papers, 1):
print(f"{i}. ID: {paper.paper_id} - {paper.title}")
print(f" Authors: {', '.join(paper.authors)}")
print(f" DOI: {paper.doi}")
print(f" URL: {paper.url}")
print(f" PDF URL: {paper.pdf_url}\n")

if papers:
# Test PDF download and read
test_paper = papers[0]
print(f"\nTesting PDF download and read for PMCID: {test_paper.paper_id}")
try:
pdf_file_path = searcher.download_pdf(test_paper.paper_id)
print(f"PDF downloaded to: {pdf_file_path}")

# Check if file exists and is not empty
if os.path.exists(pdf_file_path) and os.path.getsize(pdf_file_path) > 0:
print("PDF file seems valid.")
paper_text = searcher.read_paper(test_paper.paper_id)
if paper_text:
print(f"Successfully read paper. First 500 chars:\n{paper_text[:500]}...")
else:
print("Could not extract text from PDF, or PDF was empty.")
else:
print(f"PDF file at {pdf_file_path} is missing or empty.")

except ConnectionError as e:
print(f"Connection error during PDF download/read test: {e}")
except Exception as e:
print(f"Error during PDF download/read test: {e}")
else:
print("No papers found to test download/read functionality.")

except Exception as e:
print(f"Error during PMC search test: {e}")