Skip to content

Commit b1c55b5

Browse files
authored
Merge pull request #44 from PRIDE-Archive/stream_apis
Stream apis
2 parents 7751f9e + 67fd0b1 commit b1c55b5

File tree

8 files changed

+122
-9
lines changed

8 files changed

+122
-9
lines changed

README.md

+18-4
Original file line numberDiff line numberDiff line change
@@ -50,23 +50,34 @@ $ pridepy download-all-public-raw-files -a PXD012353 -o /Users/yourname/Download
5050
```
5151

5252
Download a single file by name:
53-
5453
```bash
5554
$ pridepy download-file-by-name -a PXD022105 -o /Users/yourname/Downloads/foldername/ -f checksum.txt -p globus
5655
```
5756

5857
>**NOTE**: Currently we use Globus URLs (when `-p globus` is used) via HTTPS, not the Globus protocol. For more information about Globus, see [Globus documentation](https://www.globus.org/data-transfer).
5958
6059
Search projects with keywords and filters
61-
6260
```bash
6361
$ pridepy search-projects-by-keywords-and-filters --keyword accession:PXD012353
6462
```
65-
Search files with filters
6663

64+
Search files with filters
6765
```bash
6866
$ pridepy get-files-by-filter --filter fileCategory.value==RAW
6967
```
68+
69+
Stream the metadata of all projects as JSON and write it to a file:
70+
```bash
71+
$ pridepy stream-projects-metadata -o all_pride_projects.json
72+
```
73+
74+
Stream the metadata of all files as JSON and write it to a file. A project accession can be passed as an optional parameter to restrict the output to the files of that project:
75+
```bash
76+
$ pridepy stream-files-metadata -o all_pride_files.json
77+
OR
78+
$ pridepy stream-files-metadata -o PXD005011_files.json -a PXD005011
79+
```
80+
7081
Use the command below to view the list of available commands:
7182

7283
```bash
@@ -83,7 +94,10 @@ Commands:
8394
get-files-by-project-accession get files by project accession...
8495
get-private-files Get private files by project...
8596
get-projects get paged projects :return:
86-
get-projects-by-accession get projects by accession...
97+
get-projects-by-accession get projects by accession...
98+
stream-files-metadata Stream all files metadata in...
99+
stream-projects-metadata Stream all projects metadata...
100+
87101
```
88102
# NOTE
89103

environment.yml

+2-1
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,5 @@ dependencies:
1313
- boto3
1414
- botocore
1515
- tqdm
16-
- urllib3
16+
- urllib3
17+
- httpx

pridepy/files/files.py

+18
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ class Files:
5050
This class handles PRIDE API files endpoint.
5151
"""
5252

53+
V3_API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v3"
5354
API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v2"
5455
API_PRIVATE_URL = "https://www.ebi.ac.uk/pride/private/ws/archive/v2"
5556
PRIDE_ARCHIVE_FTP = "ftp.pride.ebi.ac.uk"
@@ -62,6 +63,23 @@ class Files:
6263
def __init__(self):
6364
pass
6465

66+
async def stream_all_files_metadata(self, output_file, accession=None):
    """
    Stream file metadata from the PRIDE v3 API into a local file.

    :param output_file: path of the file the streamed JSON is written to
    :param accession: optional project accession; when given, only the
        files of that project are requested, otherwise all files are
    """
    # The data and the count endpoints share the same prefix; only the
    # prefix depends on whether a project accession was supplied.
    if accession is not None:
        files_endpoint = f"{self.V3_API_BASE_URL}/projects/{accession}/files"
    else:
        files_endpoint = f"{self.V3_API_BASE_URL}/files"

    request_headers = {"Accept": "application/JSON"}

    # The decoded body of the count endpoint is handed on as the
    # progress-bar total.
    record_count = Util.get_api_call(f"{files_endpoint}/count", request_headers).json()

    # Each streamed line matching this key advances the progress bar by one.
    record_marker = '"fileName"'
    await Util.stream_response_to_file(
        output_file,
        record_count,
        record_marker,
        f"{files_endpoint}/all",
        request_headers,
    )
82+
6583
def get_all_paged_files(
6684
self, query_filter, page_size, page, sort_direction, sort_conditions
6785
):

pridepy/pridepy.py

+38-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#!/usr/bin/env python3
2-
2+
import asyncio
33
import logging
44
import click
55
from pridepy.files.files import Files
@@ -254,6 +254,43 @@ def search_projects_by_keywords_and_filters(
254254
)
255255

256256

257+
@main.command()
@click.option(
    "-o",
    "--output_file",
    required=True,
    help="output file to save all the projects metadata",
)
def stream_projects_metadata(output_file):
    """
    Stream all projects metadata in JSON format to a file
    """
    # NOTE: click renders the docstring above as this command's --help text,
    # so it is kept free of reST fields such as ":return:" that would
    # otherwise be shown verbatim to CLI users.
    # stream_all_projects is a coroutine; asyncio.run drives it to completion
    # from this synchronous click entry point.
    asyncio.run(Project().stream_all_projects(output_file))
271+
272+
273+
@main.command()
@click.option(
    "-o",
    "--output_file",
    required=True,
    help="output file to save all the files metadata",
)
@click.option(
    "-a",
    "--accession",
    required=False,
    help="project accession",
)
def stream_files_metadata(accession, output_file):
    """
    Stream all files metadata in JSON format and write it to a file
    """
    # NOTE: click renders the docstring above as this command's --help text,
    # so it is kept free of reST fields such as ":return:" that would
    # otherwise be shown verbatim to CLI users.
    # accession is None when -a is not given; the Files method then requests
    # the metadata of all files instead of a single project's files.
    asyncio.run(Files().stream_all_files_metadata(output_file, accession))
293+
257294
@main.command()
258295
@click.option(
259296
"-ps",

pridepy/project/project.py

+14
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/usr/bin/env python
2+
23
from pridepy.authentication.authentication import Authentication
34
from pridepy.util.api_handling import Util
45

@@ -9,6 +10,7 @@ class Project:
910
"""
1011

1112
API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v2/"
13+
V3_API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v3/"
1214
PRIVATE_API_BASE_URL = "https://www.ebi.ac.uk/pride/private/ws/archive/v2/"
1315

1416
def __init__(self):
@@ -39,6 +41,18 @@ def get_projects(self, page_size, page, sort_direction, sort_conditions):
3941
response = Util.get_api_call(request_url, headers)
4042
return response.json()
4143

44+
async def stream_all_projects(self, output_file):
    """
    Stream the metadata of all projects from the PRIDE v3 API into a local file.

    :param output_file: path of the file the streamed JSON is written to
    """
    request_headers = {"Accept": "application/JSON"}

    # V3_API_BASE_URL already ends with "/", so no separator is added here.
    projects_endpoint = f"{self.V3_API_BASE_URL}projects"

    # The decoded body of the count endpoint is handed on as the
    # progress-bar total.
    record_count = Util.get_api_call(f"{projects_endpoint}/count", request_headers).json()

    # Each streamed line matching this key advances the progress bar by one.
    record_marker = '"projectDescription"'
    await Util.stream_response_to_file(
        output_file,
        record_count,
        record_marker,
        f"{projects_endpoint}/all",
        request_headers,
    )
55+
4256
def get_reanalysis_projects_by_accession(self, accession):
4357
"""
4458
search PRIDE projects by reanalysis accession

pridepy/util/api_handling.py

+28
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
11
#!/usr/bin/env python
2+
import re
3+
import sys
24

5+
import httpx
36
import requests
47
import logging
58
from ratelimit import limits, sleep_and_retry
69
from requests.adapters import HTTPAdapter
10+
from tqdm import tqdm
711
from urllib3.util.retry import Retry
812

913

@@ -30,6 +34,30 @@ def get_api_call(url, headers=None):
3034
)
3135
return response
3236

37+
@staticmethod
@sleep_and_retry
@limits(calls=1000, period=50)
async def stream_response_to_file(out_file, total_records, regex_search_pattern, url, headers=None):
    """
    Stream a GET response line by line into a text file, showing a progress bar.

    :param out_file: path of the file to (over)write with the response lines
    :param total_records: value passed to tqdm as the progress-bar total
    :param regex_search_pattern: regular expression; every written line that
        matches it advances the progress bar by one
    :param url: URL to request
    :param headers: optional HTTP headers sent with the request

    An error status is raised by ``response.raise_for_status()`` before the
    output file is opened. If the output file cannot be opened for lack of
    permissions, an error is printed and the process exits with status 1.
    """
    # The pattern is loop-invariant: compile it once rather than per line.
    record_pattern = re.compile(regex_search_pattern)

    with tqdm(total=total_records, unit_scale=True) as pbar:
        async with httpx.AsyncClient() as client:
            # client.stream() hands back the response so that its body can be
            # consumed incrementally below.
            async with client.stream("GET", url, headers=headers) as response:
                # Fail before touching the output file on an error status.
                response.raise_for_status()
                try:
                    # Explicit UTF-8 so the JSON output does not depend on the
                    # platform's default text encoding.
                    with open(out_file, "w", encoding="utf-8") as cfile:
                        async for line in response.aiter_lines():
                            if not line:
                                # Empty lines are not written to the file.
                                continue
                            cfile.write(line + "\n")
                            # One progress tick per line that contains a record marker.
                            if record_pattern.search(line):
                                pbar.update(1)
                except PermissionError:
                    print("[ERROR] No permissions to write to:", out_file)
                    sys.exit(1)
60+
3361
@staticmethod
3462
@sleep_and_retry
3563
@limits(calls=1000, period=50)

requirements.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@ plotly
77
boto3
88
botocore
99
tqdm
10-
urllib3
10+
urllib3
11+
httpx

setup.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66

77
setup(
88
name="pridepy",
9-
version="0.0.4",
9+
version="0.0.5",
1010
author="PRIDE Team",
11-
author_email="pride-report@ebi.ac.uk",
11+
author_email="pride-support@ebi.ac.uk",
1212
description="Python Client library for PRIDE Rest API",
1313
long_description=long_description,
1414
long_description_content_type="text/markdown",

0 commit comments

Comments
 (0)