spec.py
"""MLPerf Algorithmic Efficiency API."""
import enum
import time
from typing import Any, Callable, Dict, Iterator, List, Tuple, Union
import abc
import jax
import jax.numpy as jnp
# import numpy as np
# import tensorflow as tf
class LossType(enum.Enum):
SOFTMAX_CROSS_ENTROPY = 0
SIGMOID_CROSS_ENTROPY = 1
MEAN_SQUARED_ERROR = 2
class ForwardPassMode(enum.Enum):
TRAIN = 0
EVAL = 1
# ... ?
class ParamType(enum.Enum):
WEIGHT = 0
BIAS = 1
CONV_WEIGHT = 2
BATCH_NORM = 3
EMBEDDING = 4
class ComparisonDirection(enum.Enum):
MINIMIZE = 0
MAXIMIZE = 1
# Of course, Tensor knows its shape and dtype.
# Tensor = Union[jnp.array, np.array, tf.Tensor, ...]
Tensor = Union[jnp.array] # DeviceArray??
# Define this so that if using pytree iteration utilities, can iterate
# over the model shapes pytree without iterating over the shape tuples.
class ShapeTuple:
def __init__(self, shape_tuple):
self.shape_tuple = shape_tuple
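

# Illustrative example (not part of the spec): wrapping a shape in ShapeTuple
# keeps it a single pytree leaf, e.g.
#   jax.tree_util.tree_leaves({'dense': {'kernel': ShapeTuple((784, 10))}})
# yields one leaf, whereas a plain (784, 10) tuple would be flattened into its
# two ints.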

Shape = Union[
    Tuple[int],
    Tuple[int, int],
    Tuple[int, int, int],
    Tuple[int, int, int, int],
    ShapeTuple]
ParameterShapeTree = Dict[str, Dict[str, Shape]]

# If necessary, these can be zipped together easily given they have the same
# structure, to get an iterator over pairs of leaves.
ParameterKey = str
# Dicts can be arbitrarily nested.
ParameterTree = Dict[ParameterKey, Dict[ParameterKey, Tensor]]
ParameterTypeTree = Dict[ParameterKey, Dict[ParameterKey, ParamType]]
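
# Illustrative example (hypothetical names, not part of the spec): for a tiny
# two-layer MLP, a ParameterTree and the matching ParameterTypeTree might look
# like
#   params = {'dense_0': {'kernel': w0, 'bias': b0},
#             'dense_1': {'kernel': w1, 'bias': b1}}
#   types  = {'dense_0': {'kernel': ParamType.WEIGHT, 'bias': ParamType.BIAS},
#             'dense_1': {'kernel': ParamType.WEIGHT, 'bias': ParamType.BIAS}}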

RandomState = jax.random.PRNGKey

OptimizerState = Any
Hyperparamters = Any
Timing = int
Steps = int

# BN EMAs.
ModelAuxillaryState = Any

UpdateReturn = Tuple[OptimizerState, ParameterTree, ModelAuxillaryState]
InitOptimizerFn = Callable[
    [ParameterShapeTree, Hyperparamters, RandomState],
    OptimizerState]
UpdateParamsFn = Callable[
    [
        ParameterTree,
        ParameterTypeTree,
        ModelAuxillaryState,
        Hyperparamters,
        Tensor,
        Tensor,
        LossType,
        OptimizerState,
        List[Tuple[int, float]],
        int,
        RandomState
    ],
    UpdateReturn]
DataSelectionFn = Callable[
    [
        Iterator[Tuple[Tensor, Tensor]],
        OptimizerState,
        ParameterTree,
        LossType,
        Hyperparamters,
        int,
        RandomState
    ],
    Tuple[Tensor, Tensor]]


class Workload(metaclass=abc.ABCMeta):

  @abc.abstractmethod
  def has_reached_goal(self, eval_result: float) -> bool:
    """Return whether or not the workload goal has been reached."""

  @abc.abstractmethod
  def build_input_queue(
      self,
      data_rng: RandomState,
      split: str,
      batch_size: int):
    """Build the input queue for the workload data.

    This is the only function that is NOT allowed to be called by submitters.
    """

  @abc.abstractmethod
  def param_shapes(self):
    """The shapes of the parameters in the workload model."""

  @abc.abstractmethod
  def model_params_types(self):
    """The types of the parameters in the workload model."""

  @abc.abstractproperty
  def loss_type(self):
    """The type of loss function."""

  @abc.abstractproperty
  def train_mean(self):
    """The mean of the training data."""

  @abc.abstractproperty
  def train_stddev(self):
    """The stddev of the training data."""

  @abc.abstractproperty
  def max_allowed_runtime_sec(self):
    """The max allowed runtime of the workload in seconds."""

  @abc.abstractproperty
  def eval_period_time_sec(self):
    """The eval period of the workload in seconds."""

  @abc.abstractmethod
  def is_output_params(self, param_key: ParameterKey) -> bool:
    """Whether or not a key in ParameterTree is the output layer parameters."""

  @abc.abstractmethod
  def preprocess_for_train(
      self,
      selected_raw_input_batch: Tensor,
      selected_label_batch: Tensor,
      rng: RandomState) -> Tensor:
    """return augmented_and_preprocessed_input_batch"""

  @abc.abstractmethod
  def preprocess_for_eval(
      self,
      raw_input_batch: Tensor,
      train_mean: Tensor,
      train_stddev: Tensor) -> Tensor:
    """return preprocessed_input_batch"""

  # InitModelFn = Callable[
  #     Tuple[ParameterShapeTree, RandomState], ParameterTree]
  @abc.abstractmethod
  def init_model_fn(
      self, rng: RandomState) -> Tuple[ParameterTree, ModelAuxillaryState]:
    """return initial_params, initial_model_state"""

  # ModelFn = Callable[
  #     Tuple[ParameterTree, Tensor, ForwardPassMode, RandomState, bool],
  #     Tensor]
  @abc.abstractmethod
  def model_fn(
      self,
      params: ParameterTree,
      augmented_and_preprocessed_input_batch: Tensor,
      model_state: ModelAuxillaryState,
      mode: ForwardPassMode,
      rng: RandomState,
      update_batch_norm: bool) -> Tuple[Tensor, ModelAuxillaryState]:
    """return logits_batch, updated_model_state"""
    # Possible side effect of updating BN.

  # Keep this separate from the loss function in order to support optimizers
  # that use the logits.
  def output_activation_fn(
      self,
      logits_batch: Tensor,
      loss_type: LossType) -> Tensor:
    if loss_type == LossType.SOFTMAX_CROSS_ENTROPY:
      return jax.nn.softmax(logits_batch, axis=-1)
    if loss_type == LossType.SIGMOID_CROSS_ENTROPY:
      return jax.nn.sigmoid(logits_batch)
    if loss_type == LossType.MEAN_SQUARED_ERROR:
      return logits_batch

  # LossFn = Callable[Tuple[Tensor, Tensor], Tensor]
  # Does NOT apply regularization, which is left to the submitter to do in
  # `update_params`.
  @abc.abstractmethod
  def loss_fn(
      self,
      label_batch: Tensor,
      logits_batch: Tensor,
      loss_type: LossType) -> Tensor:  # differentiable
    """return oned_array_of_losses_per_example"""

  @abc.abstractmethod
  def eval_model(
      self,
      params: ParameterTree,
      model_state: ModelAuxillaryState,
      rng: RandomState):
    """Run a full evaluation of the model."""


class TrainingCompleteError(Exception):
  pass


# Training algorithm track submission functions, to be filled in by the
# submitter.


def init_optimizer_state(
    workload: Workload,
    hyperparameters: Hyperparamters,
    rng: RandomState) -> OptimizerState:
  # return initial_optimizer_state
  pass
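

# Illustrative sketch only, not part of the spec: one way a submission might
# initialize state for momentum SGD. The name `_example_init_optimizer_state`
# and the assumption that `workload.param_shapes()` returns ShapeTuple leaves
# are both hypothetical.
def _example_init_optimizer_state(
    workload: Workload,
    hyperparameters: Hyperparamters,
    rng: RandomState) -> OptimizerState:
  del hyperparameters, rng  # Unused in this sketch.

  def _zeros_like_shape(shape):
    # Unwrap ShapeTuple leaves so jnp.zeros sees a plain tuple.
    if isinstance(shape, ShapeTuple):
      shape = shape.shape_tuple
    return jnp.zeros(shape)

  momentum = jax.tree_util.tree_map(_zeros_like_shape, workload.param_shapes())
  return {'momentum': momentum, 'step': 0}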


_UpdateReturn = Tuple[OptimizerState, ParameterTree, ModelAuxillaryState]


# Each call to this function is considered a "step".
# Can raise a TrainingCompleteError if it believes it has achieved the goal and
# wants to end the run and receive a final free eval. It will not be restarted,
# and if it has not actually achieved the goal it will be scored as not having
# achieved the goal and receive an infinite time score. Most submissions will
# likely wait until the next free eval and not use this functionality.
def update_params(
    workload: Workload,
    current_params: ParameterTree,
    current_params_types: ParameterTypeTree,
    model_state: ModelAuxillaryState,
    hyperparameters: Hyperparamters,
    augmented_and_preprocessed_input_batch: Tensor,
    label_batch: Tensor,
    # This will define the output activation via `output_activation_fn`.
    loss_type: LossType,
    optimizer_state: OptimizerState,
    eval_results: List[Tuple[int, float]],
    global_step: int,
    rng: RandomState) -> _UpdateReturn:
  """Return (updated_optimizer_state, updated_params, updated_model_state)."""
  pass
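

# Illustrative sketch only, not part of the spec: a heavily simplified SGD step
# with the same signature as `update_params`. The name `_example_update_params`
# and the assumption that `hyperparameters` exposes a `learning_rate` attribute
# are hypothetical; a real submission would also handle batch-norm state,
# regularization, etc.
def _example_update_params(
    workload: Workload,
    current_params: ParameterTree,
    current_params_types: ParameterTypeTree,
    model_state: ModelAuxillaryState,
    hyperparameters: Hyperparamters,
    augmented_and_preprocessed_input_batch: Tensor,
    label_batch: Tensor,
    loss_type: LossType,
    optimizer_state: OptimizerState,
    eval_results: List[Tuple[int, float]],
    global_step: int,
    rng: RandomState) -> _UpdateReturn:
  del current_params_types, eval_results, global_step  # Unused in this sketch.

  def _mean_loss(params):
    logits, _ = workload.model_fn(
        params,
        augmented_and_preprocessed_input_batch,
        model_state,
        ForwardPassMode.TRAIN,
        rng,
        update_batch_norm=False)
    return jnp.mean(workload.loss_fn(label_batch, logits, loss_type))

  grads = jax.grad(_mean_loss)(current_params)
  updated_params = jax.tree_util.tree_map(
      lambda p, g: p - hyperparameters.learning_rate * g,
      current_params,
      grads)
  return optimizer_state, updated_params, model_state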


# Not allowed to update the model parameters, hyperparameters, global step, or
# optimizer state.
def data_selection(
    workload: Workload,
    input_queue: Iterator[Tuple[Tensor, Tensor]],
    optimizer_state: OptimizerState,
    current_params: ParameterTree,
    hyperparameters: Hyperparamters,
    global_step: int,
    rng: RandomState) -> Tuple[Tensor, Tensor]:
  """Select data from the infinitely repeating, pre-shuffled input queue.

  Each element of the queue is a single training example and label.

  We left out `current_params_types` because we do not believe that it would
  be necessary for this function.
  """
  # return input_batch, label_batch
  pass
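

# Illustrative sketch only, not part of the spec: the simplest selection policy
# just takes the next examples off the queue and stacks them into a batch. The
# name `_example_data_selection` and the hard-coded batch size are
# hypothetical; a real submission would use get_batch_size(workload_name).
def _example_data_selection(
    workload: Workload,
    input_queue: Iterator[Tuple[Tensor, Tensor]],
    optimizer_state: OptimizerState,
    current_params: ParameterTree,
    hyperparameters: Hyperparamters,
    global_step: int,
    rng: RandomState) -> Tuple[Tensor, Tensor]:
  del workload, optimizer_state, current_params  # Unused in this sketch.
  del hyperparameters, global_step, rng
  batch_size = 128  # Hypothetical placeholder.
  examples, labels = zip(*(next(input_queue) for _ in range(batch_size)))
  return jnp.stack(examples), jnp.stack(labels)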


def get_batch_size(workload_name):
  """Return a batch size to use for a given workload."""
  pass
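

# Illustrative sketch only, not part of the spec: submissions will typically
# map workload names to hand-tuned batch sizes. The names and values below are
# hypothetical placeholders.
_EXAMPLE_BATCH_SIZES = {
    'mnist': 1024,
    'imagenet': 512,
}


def _example_get_batch_size(workload_name):
  """Return a (hypothetical) batch size for the given workload name."""
  return _EXAMPLE_BATCH_SIZES[workload_name]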