Skip to content
This repository was archived by the owner on Mar 15, 2019. It is now read-only.

Commit 4329d40

Browse files
committed
Add tests
1 parent 2af5940 commit 4329d40

File tree

8 files changed

+171
-90
lines changed

8 files changed

+171
-90
lines changed

.vscode/settings.json

+3
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
{
2+
"python.pythonPath": "/Users/abanihi/opt/miniconda3/envs/pangeo/bin/python"
3+
}

intake_cmip5/__init__.py

+3-1
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,10 @@
11
#!/usr/bin/env python
22
"""Top-level package for intake_cmip5."""
33
from ._version import get_versions
4+
import intake_cmip5
5+
from intake_cmip5 import generate_database
46

57
__version__ = get_versions()["version"]
68
del get_versions
79

8-
__all__ = []
10+
__all__ = ["intake_cmip5", "generate_database"]

intake_cmip5/_version.py

+9-13
Original file line number | Diff line number | Diff line change
@@ -68,8 +68,7 @@ def decorate(f):
6868
return decorate
6969

7070

71-
def run_command(commands, args, cwd=None, verbose=False,
72-
hide_stderr=False, env=None):
71+
def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None):
7372
"""Call the given command(s)."""
7473
assert isinstance(commands, list)
7574
p = None
@@ -121,7 +120,7 @@ def versions_from_parentdir(parentdir_prefix, root, verbose):
121120
dirname = os.path.basename(root)
122121
if dirname.startswith(parentdir_prefix):
123122
return {
124-
"version": dirname[len(parentdir_prefix):],
123+
"version": dirname[len(parentdir_prefix) :],
125124
"full-revisionid": None,
126125
"dirty": False,
127126
"error": None,
@@ -191,7 +190,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
191190
# starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
192191
# just "foo-1.0". If we see a "tag: " prefix, prefer those.
193192
TAG = "tag: "
194-
tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)])
193+
tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)])
195194
if not tags:
196195
# Either we're using git < 1.8.3, or there really are no tags. We use
197196
# a heuristic: assume all version tags have a digit. The old git %d
@@ -208,7 +207,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
208207
for ref in sorted(tags):
209208
# sorting will prefer e.g. "2.0" over "2.0rc1"
210209
if ref.startswith(tag_prefix):
211-
r = ref[len(tag_prefix):]
210+
r = ref[len(tag_prefix) :]
212211
if verbose:
213212
print("picking %s" % r)
214213
return {
@@ -242,8 +241,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
242241
if sys.platform == "win32":
243242
GITS = ["git.cmd", "git.exe"]
244243

245-
out, rc = run_command(
246-
GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True)
244+
out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True)
247245
if rc != 0:
248246
if verbose:
249247
print("Directory %s not under git control" % root)
@@ -313,7 +311,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
313311
tag_prefix,
314312
)
315313
return pieces
316-
pieces["closest-tag"] = full_tag[len(tag_prefix):]
314+
pieces["closest-tag"] = full_tag[len(tag_prefix) :]
317315

318316
# distance: number of commits since tag
319317
pieces["distance"] = int(mo.group(2))
@@ -324,8 +322,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
324322
else:
325323
# HEX: no tags
326324
pieces["closest-tag"] = None
327-
count_out, rc = run_command(
328-
GITS, ["rev-list", "HEAD", "--count"], cwd=root)
325+
count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root)
329326
pieces["distance"] = int(count_out) # total number of commits
330327

331328
# commit date: see ISO-8601 comment in git_versions_from_keywords()
@@ -522,8 +519,7 @@ def get_versions():
522519
verbose = cfg.verbose
523520

524521
try:
525-
return git_versions_from_keywords(
526-
get_keywords(), cfg.tag_prefix, verbose)
522+
return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose)
527523
except NotThisMethod:
528524
pass
529525

@@ -561,4 +557,4 @@ def get_versions():
561557
"dirty": None,
562558
"error": "unable to compute version",
563559
"date": None,
564-
}
560+
}

intake_cmip5/generate_database.py

+92-58
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
# -*- coding: utf-8 -*-
2-
import os
2+
import os
33
import pandas as pd
44
from dask import delayed
55
import re
@@ -9,55 +9,77 @@
99
import shutil
1010

1111

12-
1312
HOME = os.environ["HOME"]
1413
INTAKE_CMIP5_DIR = f"{HOME}/.intake_cmip5"
1514

15+
1616
@functools.lru_cache(maxsize=1024, typed=False)
1717
def _parse_dirs(root_dir):
18-
institution_dirs = [os.path.join(root_dir, activity, institution)
19-
for activity in os.listdir(root_dir)
20-
for institution in os.listdir(os.path.join(root_dir, activity))
21-
if os.path.isdir(os.path.join(root_dir, activity, institution))]
22-
23-
model_dirs = [os.path.join(institution_dir, model)
24-
for institution_dir in institution_dirs
25-
for model in os.listdir(institution_dir)
26-
if os.path.isdir(os.path.join(institution_dir, model))]
27-
28-
experiment_dirs = [os.path.join(model_dir, exp)
29-
for model_dir in model_dirs
30-
for exp in os.listdir(model_dir)
31-
if os.path.isdir(os.path.join(model_dir, exp))]
32-
33-
freq_dirs = [os.path.join(experiment_dir, freq)
34-
for experiment_dir in experiment_dirs
35-
for freq in os.listdir(experiment_dir)
36-
if os.path.isdir(os.path.join(experiment_dir, freq))]
37-
38-
realm_dirs = [os.path.join(freq_dir, realm)
39-
for freq_dir in freq_dirs
40-
for realm in os.listdir(freq_dir)
41-
if os.path.isdir(os.path.join(freq_dir, realm))]
42-
18+
institution_dirs = [
19+
os.path.join(root_dir, activity, institution)
20+
for activity in os.listdir(root_dir)
21+
for institution in os.listdir(os.path.join(root_dir, activity))
22+
if os.path.isdir(os.path.join(root_dir, activity, institution))
23+
]
24+
25+
model_dirs = [
26+
os.path.join(institution_dir, model)
27+
for institution_dir in institution_dirs
28+
for model in os.listdir(institution_dir)
29+
if os.path.isdir(os.path.join(institution_dir, model))
30+
]
31+
32+
experiment_dirs = [
33+
os.path.join(model_dir, exp)
34+
for model_dir in model_dirs
35+
for exp in os.listdir(model_dir)
36+
if os.path.isdir(os.path.join(model_dir, exp))
37+
]
38+
39+
freq_dirs = [
40+
os.path.join(experiment_dir, freq)
41+
for experiment_dir in experiment_dirs
42+
for freq in os.listdir(experiment_dir)
43+
if os.path.isdir(os.path.join(experiment_dir, freq))
44+
]
45+
46+
realm_dirs = [
47+
os.path.join(freq_dir, realm)
48+
for freq_dir in freq_dirs
49+
for realm in os.listdir(freq_dir)
50+
if os.path.isdir(os.path.join(freq_dir, realm))
51+
]
52+
4353
return realm_dirs
44-
54+
55+
4556
def _get_entry(directory):
46-
dir_split = directory.split('/')
57+
dir_split = directory.split("/")
4758
entry = {}
48-
entry['realm'] = dir_split[-1]
49-
entry['frequency'] = dir_split[-2]
50-
entry['experiment'] = dir_split[-3]
51-
entry['model'] = dir_split[-4]
52-
entry['institution'] = dir_split[-5]
59+
entry["realm"] = dir_split[-1]
60+
entry["frequency"] = dir_split[-2]
61+
entry["experiment"] = dir_split[-3]
62+
entry["model"] = dir_split[-4]
63+
entry["institution"] = dir_split[-5]
5364
return entry
54-
65+
66+
5567
@delayed
5668
def parse_directory(directory):
57-
exclude = set(["files", "latests"]) # directories to exclude
69+
exclude = set(["files", "latests"]) # directories to exclude
5870

59-
columns = ["ensemble", "experiment", "file_basename", "file_fullpath",
60-
"frequency", "institution", "model", "root", "realm", "varname"]
71+
columns = [
72+
"ensemble",
73+
"experiment",
74+
"file_basename",
75+
"file_fullpath",
76+
"frequency",
77+
"institution",
78+
"model",
79+
"root",
80+
"realm",
81+
"varname",
82+
]
6183
df = pd.DataFrame(columns=columns)
6284

6385
entry = _get_entry(directory)
@@ -68,17 +90,18 @@ def parse_directory(directory):
6890
if not files:
6991
continue
7092
sfiles = sorted([f for f in files if os.path.splitext(f)[1] == ".nc"])
71-
if not sfiles: continue
93+
if not sfiles:
94+
continue
7295

7396
fs = []
7497
for f in sfiles:
7598
try:
7699
f_split = f.split("_")
77-
entry['varname'] = f_split[0]
78-
entry['ensemble'] = f_split[-2]
79-
entry['root'] = root
80-
entry['file_basename'] = f
81-
entry['file_fullpath'] = os.path.join(root, f)
100+
entry["varname"] = f_split[0]
101+
entry["ensemble"] = f_split[-2]
102+
entry["root"] = root
103+
entry["file_basename"] = f
104+
entry["file_fullpath"] = os.path.join(root, f)
82105
fs.append(entry)
83106
except:
84107
continue
@@ -88,34 +111,45 @@ def parse_directory(directory):
88111
else:
89112
temp_df = pd.DataFrame()
90113
temp_df.columns = df.columns
91-
df = pd.concat([temp_df, df], ignore_index=True)
114+
df = pd.concat([temp_df, df], ignore_index=True, sort=False)
92115
return df
93116

94-
def _persist_database(df):
95-
vYYYYMMDD = r'v\d{4}\d{2}\d{2}'
96-
vN = r'v\d{1}'
97-
v = re.compile( "|".join([vYYYYMMDD, vN])) # Combine both regex into one
117+
118+
def _persist_database(df, path):
119+
vYYYYMMDD = (
120+
r"v\d{4}\d{2}\d{2}"
121+
) # TODO: Very dangerous in case the root dir matches the pattern
122+
vN = r"v\d{1}"
123+
v = re.compile("|".join([vYYYYMMDD, vN])) # Combine both regex into one
98124
df["version"] = df.root.str.findall(v)
99-
df["version"] = df["version"].apply(lambda x: x[0] if x else 'v0')
100-
sorted_df = df.sort_values("version").drop_duplicates(subset="file_basename", keep="last")\
101-
.reset_index(drop=True)
125+
df["version"] = df["version"].apply(lambda x: x[0] if x else "v0")
126+
sorted_df = (
127+
df.sort_values("version")
128+
.drop_duplicates(subset="file_basename", keep="last")
129+
.reset_index(drop=True)
130+
)
131+
132+
if path:
133+
INTAKE_CMIP5_DIR = path
134+
102135
print(f"**** Persisting CMIP5 database in {INTAKE_CMIP5_DIR} ****")
103136

104137
if os.path.isdir(INTAKE_CMIP5_DIR):
105138
shutil.rmtree(INTAKE_CMIP5_DIR)
106139
os.makedirs(INTAKE_CMIP5_DIR, exist_ok=True)
107-
140+
108141
sorted_df.to_csv(f"{INTAKE_CMIP5_DIR}/clean_cmip5_database.csv", index=False)
109142
df.to_csv(f"{INTAKE_CMIP5_DIR}/raw_cmip5_database.csv", index=False)
110-
143+
111144
return sorted_df
112145

113-
def create_CMIP5Database(root_dir=None):
146+
147+
def create_CMIP5Database(root_dir=None, db_path=None):
114148
if not os.path.exists(root_dir):
115149
raise NotADirectoryError(f"{root_dir} does not exist")
116-
150+
117151
dirs = _parse_dirs(root_dir)
118152
dfs = [parse_directory(directory) for directory in dirs]
119153
df = dd.from_delayed(dfs).compute()
120-
df = _persist_database(df)
121-
return df
154+
df = _persist_database(df, db_path)
155+
return df

setup.cfg

+5-1
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,11 @@ replace = __version__ = '{new_version}'
1515
universal = 1
1616

1717
[flake8]
18-
exclude = docs
18+
exclude = docs,versioneer.py,intake_cmip5/._version.py
19+
ignore = E203, E266, E501, W503, F401, E722
20+
max-line-length = 88
21+
max-complexity = 18
22+
select = B,C,E,F,W,T4,B9
1923

2024
[aliases]
2125
# Define setup.py command aliases here

setup.py

+5-5
Original file line number | Diff line number | Diff line change
@@ -9,26 +9,26 @@
99
with open("README.md", encoding="utf-8") as readme_file:
1010
readme = readme_file.read()
1111

12-
requirements = open('requirements.txt').read().strip().split('\n')
12+
requirements = open("requirements.txt").read().strip().split("\n")
1313

1414
setup(
1515
maintainer="Anderson Banihirwe",
1616
maintainer_email="[email protected]",
17-
description='An intake plugin for loading CMIP5 data sets',
17+
description="An intake plugin for loading CMIP5 data sets",
1818
install_requires=requirements,
1919
license="https://github.com/NCAR/intake-cmip5/blob/master/LICENSE.rst",
2020
long_description=readme,
2121
long_description_content_type="text/markdown",
2222
keywords=["cmip5", "intake"],
2323
name="intake-cmip5",
2424
packages=find_packages(),
25-
py_modules=['intake_cmip5'],
26-
package_data={'': ['*.yml', '*.yaml']},
25+
py_modules=["intake_cmip5"],
26+
package_data={"": ["*.yml", "*.yaml", "*.csv"]},
2727
include_package_data=True,
2828
url="https://github.com/NCAR/intake-cmip5",
2929
version=versioneer.get_version(),
3030
cmdclass=versioneer.get_cmdclass(),
3131
entry_points="""
3232
""",
3333
zip_safe=False,
34-
)
34+
)

0 commit comments

Comments
 (0)