# dataset_utils.py
"""
License:
This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
"""
import sys

import numpy as np

from hub.exceptions import ModuleNotInstalledException


def slice_split(slice_):
    """Splits a slice into subpath and list of slices"""
    path = ""
    list_slice = []
    for sl in slice_:
        if isinstance(sl, str):
            path += sl if sl.startswith("/") else "/" + sl
        elif isinstance(sl, (int, slice)):
            list_slice.append(sl)
        else:
            raise TypeError(
                "type {} isn't supported in dataset slicing".format(type(sl))
            )
    return path, list_slice
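
# Usage sketch (hedged; the composite keys below are hypothetical): slice_split
# receives whatever key tuple Dataset.__getitem__ was given, e.g.
#
#     slice_split(["images", 0, slice(10, 20)])  # -> ("/images", [0, slice(10, 20)])
#     slice_split(["/images/left", 5])           # -> ("/images/left", [5])

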
def slice_extract_info(slice_, num):
    """Extracts number of samples and offset from slice"""
    if isinstance(slice_, int):
        slice_ = slice_ + num if num and slice_ < 0 else slice_
        if num and (slice_ >= num or slice_ < 0):
            raise IndexError(
                "index out of bounds for dimension with length {}".format(num)
            )
        return (1, slice_)

    if slice_.step is not None and slice_.step < 0:  # negative step not supported
        raise ValueError("Negative step not supported in dataset slicing")

    offset = 0
    if slice_.start is not None:
        slice_ = (
            slice(slice_.start + num, slice_.stop) if slice_.start < 0 else slice_
        )  # make indices positive if possible
        if num and (slice_.start < 0 or slice_.start >= num):
            raise IndexError(
                "index out of bounds for dimension with length {}".format(num)
            )
        offset = slice_.start

    if slice_.stop is not None:
        slice_ = (
            slice(slice_.start, slice_.stop + num) if slice_.stop < 0 else slice_
        )  # make indices positive if possible
        if num and (slice_.stop < 0 or slice_.stop > num):
            raise IndexError(
                "index out of bounds for dimension with length {}".format(num)
            )

    if slice_.start is not None and slice_.stop is not None:
        if (
            slice_.start < 0
            and slice_.stop < 0
            or slice_.start >= 0
            and slice_.stop >= 0
        ):
            # If same signs, bound checking can be done
            if abs(slice_.start) > abs(slice_.stop):
                raise IndexError("start index is greater than stop index")
            num = abs(slice_.stop) - abs(slice_.start)
        else:
            num = 0
            # num = 0 if slice_.stop < slice_.start else slice_.stop - slice_.start
    elif slice_.start is None and slice_.stop is not None:
        num = slice_.stop
    elif slice_.start is not None and slice_.stop is None:
        num = num - slice_.start if num else 0

    return num, offset
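
# Usage sketch (hedged; values hypothetical): for a dimension of known length
# ``num`` the helper returns (number_of_samples, offset), e.g.
#
#     slice_extract_info(5, 10)               # -> (1, 5)  one sample at offset 5
#     slice_extract_info(slice(2, 8), 10)     # -> (6, 2)
#     slice_extract_info(slice(None, 4), 10)  # -> (4, 0)

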
def create_numpy_dict(dataset, index, label_name=False):
    """Creates a nested dictionary with the values from the TensorView objects in the dataset schema.

    Parameters
    ----------
    dataset: hub.api.dataset.Dataset object
        The dataset whose TensorView objects are being used.
    index: int
        The index of the dataset record that is being used.
    label_name: bool, optional
        If the TensorView object is of the ClassLabel type, setting this to True retrieves the label names
        instead of the label-encoded integers; otherwise this parameter is ignored.
    """
    numpy_dict = {}
    for path in dataset._tensors.keys():
        d = numpy_dict
        split = path.split("/")
        # walk (and create) intermediate dicts for every component of the tensor path
        for subpath in split[1:-1]:
            if subpath not in d:
                d[subpath] = {}
            d = d[subpath]
        d[split[-1]] = dataset[path, index].numpy(label_name=label_name)
    return numpy_dict
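
# Shape sketch (hedged; ``ds`` is a hypothetical hub Dataset): tensor paths are
# split on "/" and rebuilt as nested dicts, so a schema with the keys
# "/images/left" and "/label" would yield roughly
#
#     create_numpy_dict(ds, 0)  # -> {"images": {"left": array(...)}, "label": array(...)}

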
def get_value(value):
    """Unwraps zero-dimensional numpy arrays (and such elements of a list) into plain Python scalars."""
    if isinstance(value, np.ndarray) and value.shape == ():
        value = value.item()
    elif isinstance(value, list):
        for i in range(len(value)):
            if isinstance(value[i], np.ndarray) and value[i].shape == ():
                value[i] = value[i].item()
    return value
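
# Example (hedged): zero-dimensional arrays become plain scalars, anything
# else passes through unchanged.
#
#     get_value(np.array(3))                        # -> 3
#     get_value([np.array(1.5), np.array([1, 2])])  # -> [1.5, array([1, 2])]

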
def str_to_int(assign_value, tokenizer):
    """Converts a string (or bytes) value into an array of integers, using a
    tokenizer if one is requested, otherwise falling back to Unicode code points."""
    if isinstance(assign_value, bytes):
        try:
            assign_value = assign_value.decode("utf-8")
        except Exception:
            raise ValueError(
                "Bytes couldn't be decoded to string. Other encodings of bytes are currently not supported"
            )
    if (
        isinstance(assign_value, np.ndarray) and assign_value.dtype.type is np.bytes_
    ) or (
        isinstance(assign_value, list)
        and assign_value
        and isinstance(assign_value[0], bytes)
    ):
        assign_value = [item.decode("utf-8") for item in assign_value]
    if tokenizer is not None:
        if "transformers" not in sys.modules:
            raise ModuleNotInstalledException("transformers")
        import transformers  # deferred import keeps transformers optional

        # a bert-base-cased tokenizer is instantiated regardless of which
        # tokenizer the caller passed in
        tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased")
        assign_value = (
            np.array(tokenizer(assign_value, add_special_tokens=False)["input_ids"])
            if isinstance(assign_value, str)
            else assign_value
        )
        if (
            isinstance(assign_value, list)
            and assign_value
            and isinstance(assign_value[0], str)
        ):
            assign_value = [
                np.array(tokenizer(item, add_special_tokens=False)["input_ids"])
                for item in assign_value
            ]
    else:
        assign_value = (
            np.array([ord(ch) for ch in assign_value])
            if isinstance(assign_value, str)
            else assign_value
        )
        if (
            isinstance(assign_value, list)
            and assign_value
            and isinstance(assign_value[0], str)
        ):
            assign_value = [
                np.array([ord(ch) for ch in item]) for item in assign_value
            ]
    return assign_value
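
# Example (hedged): without a tokenizer, strings are encoded as Unicode code
# points.
#
#     str_to_int("abc", None)  # -> array([97, 98, 99])
#     str_to_int(b"hi", None)  # -> array([104, 105])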