Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: n-dimensional support #20

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 9 additions & 26 deletions jupyterlab_hdf/baseHandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
# Distributed under the terms of the Modified BSD License.

import h5py
import json
# import json
import os
from tornado import gen
from tornado.httpclient import HTTPError
import simplejson

from notebook.base.handlers import APIHandler
from notebook.utils import url_path_join
Expand All @@ -24,28 +25,20 @@ class HdfBaseManager:
def __init__(self, notebook_dir):
    """Store the notebook server's root directory.

    :param notebook_dir: root dir of the notebook server; relative
        file paths in requests are resolved against it (see ``get``).
    """
    self.notebook_dir = notebook_dir
def _get(self, f, uri, row, col):
def _get(self, f, uri, select):
    """Fetch content for the object at ``uri`` within the open h5py file ``f``.

    Abstract hook: concrete managers (contents vs. data) override this.
    ``select`` is the raw SELECT query-param string, or None.
    """
    raise NotImplementedError
def get(self, relfpath, uri, row, col):
def get(self, relfpath, uri, select):
def _handleErr(code, msg):
raise HTTPError(code, '\n'.join((
msg,
f'relfpath: {relfpath}, uri: {uri}, row: {row}, col: {col}'
f'relfpath: {relfpath}, uri: {uri}, select: {select}'
)))

if not relfpath:
msg = f'The request was malformed; fpath should not be empty.'
_handleErr(400, msg)

if row and not col:
msg = f'The request was malformed; row slice was specified, but col slice was empty.'
_handleErr(400, msg)

if col and not row:
msg = f'The request was malformed; col slice was specified, but row slice was empty.'
_handleErr(400, msg)

fpath = url_path_join(self.notebook_dir, relfpath)

if not os.path.exists(fpath):
Expand All @@ -61,15 +54,14 @@ def _handleErr(code, msg):
_handleErr(401, msg)
try:
with h5py.File(fpath, 'r') as f:
out = self._get(f, uri, row, col)
out = self._get(f, uri, select)
except Exception as e:
msg = (f'Found and opened file, error getting contents from object specified by the uri.\n'
f'Error: {e}')
_handleErr(500, msg)

return out


## handler
class HdfBaseHandler(APIHandler):
managerClass = None
Expand All @@ -86,25 +78,16 @@ def initialize(self, notebook_dir):
@gen.coroutine
def get(self, path):
    """Based on an api request, get either the contents of a group or a
    selected hyperslab of a dataset and return it as serialized JSON.
    """
    uri = '/' + self.get_query_argument('uri').lstrip('/')
    # select is optional; None means "no hyperslab selection"
    select = self.get_query_argument('select', default=None)
    try:
        contents = self.manager.get(path, uri, select)
        # simplejson with ignore_nan=True serializes NaN as null,
        # keeping the payload valid JSON
        self.finish(simplejson.dumps(contents, ignore_nan=True))
    except HTTPError as err:
        self.set_status(err.code)
        response = err.response.body if err.response else str(err.code)
        self.finish('\n'.join((response, err.message)))

def getQueryArguments(self, key, func=None):
    """Split the comma-separated query argument ``key`` into a list.

    Each element is passed through ``func`` when one is supplied.
    Returns None when the argument is absent from the request.
    """
    if key not in self.request.query_arguments:
        return None
    values = self.get_query_argument(key).split(',')
    if func is None:
        return list(values)
    return [func(x) for x in values]
9 changes: 4 additions & 5 deletions jupyterlab_hdf/contents.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,20 @@
class HdfContentsManager(HdfBaseManager):
    """Implements HDF5 contents handling
    """
    def _get(self, f, uri, select):
        obj = f[uri]

        if isinstance(obj, h5py.Group):
            # a group: list its children as content-free dicts
            return [(groupDict if isinstance(val, h5py.Group) else dsetDict)
                    (name=name, uri=uriJoin(uri, name))
                    for name, val in obj.items()]

        # a dataset: return its metadata, plus data if a selection was made
        return dsetDict(
            name=uriName(uri),
            uri=uri,
            content=dsetContentDict(obj, select)
        )


## handler
class HdfContentsHandler(HdfBaseHandler):
"""A handler for HDF5 contents
Expand Down
4 changes: 2 additions & 2 deletions jupyterlab_hdf/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
class HdfDataManager(HdfBaseManager):
    """Implements HDF5 data handling
    """
    def _get(self, f, uri, select):
        """Return the selected hyperslab of the dataset at ``uri`` as nested lists."""
        dset = f[uri]
        return dsetChunk(dset, select)


## handler
Expand Down
107 changes: 101 additions & 6 deletions jupyterlab_hdf/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,17 @@

import re

__all__ = ['chunkSlice', 'dsetChunk', 'dsetContentDict', 'dsetDict', 'groupDict', 'uriJoin', 'uriName']
from tornado.web import HTTPError

__all__ = [
'chunkSlice',
'dsetChunk',
'dsetContentDict',
'dsetDict',
'groupDict',
'uriJoin',
'uriName'
]

## chunk handling
def chunkSlice(chunk, s):
Expand All @@ -14,12 +24,12 @@ def chunkSlice(chunk, s):

return slice(s.start*chunk, s.stop*chunk, s.step)

def dsetChunk(dset, row, col):
return dset[slice(*row), slice(*col)].tolist()

def dsetChunk(dset, select):
    """Read the hyperslab of ``dset`` described by the SELECT query
    param string and return it as (JSON-serializable) nested lists.
    """
    return dset[_getHyperslabSlices(dset.shape, select)].tolist()

## create dicts to be converted to json
def dsetContentDict(dset, row=None, col=None):
def dsetContentDict(dset, select=None):
return dict([
# metadata
('attrs', dict(*dset.attrs.items())),
Expand All @@ -28,7 +38,7 @@ def dsetContentDict(dset, row=None, col=None):
('shape', dset.shape),

# actual data
('data', dsetChunk(dset, row, col) if row and col else None)
('data', dsetChunk(dset, select) if select else None)
])

def dsetDict(name, uri, content=None):
Expand All @@ -55,3 +65,88 @@ def uriJoin(*parts):

def uriName(uri):
    """Return the final path component of an hdf uri."""
    *_, name = uri.split('/')
    return name


def _getHyperslabSlices(dsetshape, select):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One of my big design goals in how I put the frontend, API, and backend together was to avoid the need for any complex parsing function. They're never worth the trouble, there's almost always an edge case or three that you miss, and maintenance is a pain.

Instead, what I'd really like to do is make sure that the frontend is structured such that the slices it asks for don't require anything other than some simple splitting, if even that. I was even thinking that it might be good to split up the slice input box into n segments given a nD dataset (or it might be really annoying; I need to try that one out). The job of the API and backend is then to not lose the information that gets handed to them (by, eg concating all the info on all the slices into a single string).

If you do ever really want a parsing function like this, I'd strongly recommend against rolling your own. Instead, you should search for prior art, or even better, come up with something that works off of ast.literal_eval (which is safe, so long as you can tolerate the possibility of a user causing a segfault at will).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed. I think sending a list of slices in a post body as you've proposed is a cleaner approach. The idea for the select query param came directly from the HDF group's API, so it seemed like a good idea at the time. The complexity of the parser did get out of hand pretty quick, and as you mentioned is error prone.

"""
Parse SELECT query param and return tuple of Python slice objects

:param dsetshape: The shape of the dataset as returned by dset.shape
:param select: The SELECT query param should be in the form:
[<dim1>, <dim2>, ... , <dimn>]
For each dimension, valid formats are:
single integer: n
start and end: n:m
start, end, and stride: n:m:s
:type select: str

:returns: tuple of Python slices based on the SELECT query param
"""

if select is None:
# Default: return entire dataset
return tuple(slice(0, extent) for extent in dsetshape)

if not select.startswith('['):
msg = "Bad Request: selection query missing start bracket"
raise HTTPError(400, reason=msg)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's good practice/encapsulation for a function like this to raise semantic errors (in this case, probably a ValueError). It would then be the responsibility of eg HdfBaseManager (the thing that's actually interacting directly with the HTTP requests) to catch said semantic errors and raise HTTPError as appropriate

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes that makes sense, good call!

if not select.endswith(']'):
msg = "Bad Request: selection query missing end bracket"
raise HTTPError(400, reason=msg)

# strip brackets
select = select[1:-1]

select_array = select.split(',')
if len(select_array) > len(dsetshape):
msg = "Bad Request: number of selected dimensions exceeds the rank of the dataset"
raise HTTPError(400, reason=msg)

slices = []
for dim, dim_slice in enumerate(select_array):
extent = dsetshape[dim]

# default slice values
start = 0
stop = extent
step = 1
if dim_slice.find(':') < 0:
# just a number - append to SLICES, and continue to next iteration
try:
slices.append(int(dim_slice))
continue
except ValueError:
msg = "Bad Request: invalid selection parameter (can't convert to int) for dimension: " + str(dim)
raise HTTPError(400, reason=msg)
stop = start + 1
elif dim_slice == ':':
# select everything (default)
pass
else:
fields = dim_slice.split(":")
if len(fields) > 3:
msg = "Bad Request: Too many ':' seperators for dimension: " + str(dim)
raise HTTPError(400, reason=msg)
try:
if fields[0]:
start = int(fields[0])
if fields[1]:
stop = int(fields[1])
if len(fields) > 2 and fields[2]:
step = int(fields[2])
except ValueError:
msg = "Bad Request: invalid selection parameter (can't convert to int) for dimension: " + str(dim)
raise HTTPError(400, reason=msg)

if start < 0 or start > extent:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here's a good example of why you generally shouldn't write your own parsing function. If we're claiming to support Numpy-like syntax, then both negative indices and indices out of bounds should be treated as valid in slices.

msg = "Bad Request: Invalid selection start parameter for dimension: " + str(dim)
raise HTTPError(400, reason=msg)
if stop > extent:
msg = "Bad Request: Invalid selection stop parameter for dimension: " + str(dim)
raise HTTPError(400, reason=msg)
if step <= 0:
msg = "Bad Request: invalid selection step parameter for dimension: " + str(dim)
raise HTTPError(400, reason=msg)
slices.append(slice(start, stop, step))

return tuple(slices)
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@
],
install_requires=[
'h5py',
'notebook'
'notebook',
'simplejson'
]
)

Expand Down
3 changes: 1 addition & 2 deletions src/dataset.ts
Original file line number Diff line number Diff line change
Expand Up @@ -251,8 +251,7 @@ export class HdfDatasetModelBase extends DataModel {
const params = {
fpath: this._fpath,
uri: this._uri,
col: [colStart, colStop],
row: [rowStart, rowStop]
select: `[${rowStart}:${rowStop}, ${colStart}:${colStop}]`
};
hdfDataRequest(params, this._serverSettings).then(data => {
this._blocks[rowBlock][colBlock] = data;
Expand Down
19 changes: 9 additions & 10 deletions src/hdf.ts
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,12 @@ export function hdfDataRequest(
parameters: IContentsParameters,
settings: ServerConnection.ISettings
): Promise<number[][]> {
// require the uri, row, and col query parameters
const { fpath, uri, row, col } = parameters;
// require the uri query parameter, select is optional
const { fpath, uri, ...select } = parameters;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ... doesn't really mean optional in typescript. Check out https://basarat.gitbooks.io/typescript/docs/spread-operator.html

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops. I'm familiar with spread syntax -- not sure why I have it there!


const fullUrl =
URLExt.join(settings.baseUrl, "hdf", "data", fpath).split("?")[0] +
URLExt.objectToQueryString({ uri, row, col });
URLExt.objectToQueryString({ uri, ...select });

return ServerConnection.makeRequest(fullUrl, {}, settings).then(response => {
if (response.status !== 200) {
Expand All @@ -105,14 +105,13 @@ export interface IContentsParameters {
uri: string;

/**
* Row slice. Up to 3 integers, same syntax as for Python `slice` built-in.
* String representing an array of slices which determine a
* hyperslab selection of an n-dimensional HDF5 dataset.
* Syntax and semantics match those of h5py's Dataset indexing.
* E.g, ?select=[0:3, 1:5, :, 9]
* See jupyterlab_hdf/util.py for full details.
*/
row?: number[];

/**
* Column slice. Up to 3 integers, same syntax as for Python `slice` built-in.
*/
col?: number[];
select?: string;
}

/**
Expand Down
8 changes: 2 additions & 6 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -226,13 +226,9 @@ function addBrowserCommands(
fpath: args["fpath"] as string,
uri: args["uri"] as string
};
if (args["col"]) {
params.col = args["col"] as number[];
if (args["select"]) {
params.select = args["select"] as string;
}
if (args["row"]) {
params.row = args["row"] as number[];
}

return hdfContentsRequest(params, serverSettings);
},
label: "For an HDF5 file at `fpath`, fetch the contents at `uri`"
Expand Down