Skip to content

Commit 8aa7c02

Browse files
authored
Lite Unify Metric Formatting (#803)
1 parent 19e639a commit 8aa7c02

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

45 files changed

+3720
-3046
lines changed

.github/workflows/build-and-publish.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515
- uses: actions/checkout@v3
1616
- uses: actions/setup-python@v4
1717
with:
18-
python-version: "3.8"
18+
python-version: "3.10"
1919
- name: Build wheel
2020
run: pip install build && python -m build
2121
- name: Publish to PyPI

lite/benchmarks/benchmark_classification.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ def run_benchmarking_analysis(
211211
)
212212

213213
# evaluate
214-
eval_time, _ = time_it(evaluator.compute_precision_recall)()
214+
eval_time, _ = time_it(evaluator.compute_precision_recall_rocauc)()
215215
if eval_time > evaluation_timeout and evaluation_timeout != -1:
216216
raise TimeoutError(
217217
f"Base evaluation timed out with {evaluator.n_datums} datums."

lite/examples/object-detection.ipynb

+95-91
Large diffs are not rendered by default.

lite/examples/tabular_classification.ipynb

+58-43
Large diffs are not rendered by default.

lite/tests/classification/test_accuracy.py

+30-22
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
Classification,
44
DataLoader,
55
MetricType,
6-
compute_metrics,
6+
compute_precision_recall_rocauc,
77
)
88

99

@@ -44,7 +44,7 @@ def test_accuracy_computation():
4444

4545
score_thresholds = np.array([0.25, 0.75], dtype=np.float64)
4646

47-
(_, _, _, accuracy, _, _, _) = compute_metrics(
47+
(_, _, _, accuracy, _, _, _) = compute_precision_recall_rocauc(
4848
data=data,
4949
label_metadata=label_metadata,
5050
score_thresholds=score_thresholds,
@@ -75,15 +75,23 @@ def test_accuracy_basic(basic_classifications: list[Classification]):
7575
"missing_prediction_labels": [],
7676
}
7777

78-
metrics = evaluator.evaluate(score_thresholds=[0.25, 0.75], as_dict=True)
78+
metrics = evaluator.evaluate(score_thresholds=[0.25, 0.75])
7979

80-
actual_metrics = [m for m in metrics[MetricType.Accuracy]]
80+
actual_metrics = [m.to_dict() for m in metrics[MetricType.Accuracy]]
8181
expected_metrics = [
8282
{
8383
"type": "Accuracy",
84-
"value": [2 / 3, 1 / 3],
84+
"value": 2 / 3,
8585
"parameters": {
86-
"score_thresholds": [0.25, 0.75],
86+
"score_threshold": 0.25,
87+
"hardmax": True,
88+
},
89+
},
90+
{
91+
"type": "Accuracy",
92+
"value": 1 / 3,
93+
"parameters": {
94+
"score_threshold": 0.75,
8795
"hardmax": True,
8896
},
8997
},
@@ -102,15 +110,15 @@ def test_accuracy_with_animal_example(
102110
loader.add_data(classifications_animal_example)
103111
evaluator = loader.finalize()
104112

105-
metrics = evaluator.evaluate(score_thresholds=[0.5], as_dict=True)
113+
metrics = evaluator.evaluate(score_thresholds=[0.5])
106114

107-
actual_metrics = [m for m in metrics[MetricType.Accuracy]]
115+
actual_metrics = [m.to_dict() for m in metrics[MetricType.Accuracy]]
108116
expected_metrics = [
109117
{
110118
"type": "Accuracy",
111-
"value": [2.0 / 6.0],
119+
"value": 2 / 6,
112120
"parameters": {
113-
"score_thresholds": [0.5],
121+
"score_threshold": 0.5,
114122
"hardmax": True,
115123
},
116124
},
@@ -129,15 +137,15 @@ def test_accuracy_color_example(
129137
loader.add_data(classifications_color_example)
130138
evaluator = loader.finalize()
131139

132-
metrics = evaluator.evaluate(score_thresholds=[0.5], as_dict=True)
140+
metrics = evaluator.evaluate(score_thresholds=[0.5])
133141

134-
actual_metrics = [m for m in metrics[MetricType.Accuracy]]
142+
actual_metrics = [m.to_dict() for m in metrics[MetricType.Accuracy]]
135143
expected_metrics = [
136144
{
137145
"type": "Accuracy",
138-
"value": [2 / 6],
146+
"value": 2 / 6,
139147
"parameters": {
140-
"score_thresholds": [0.5],
148+
"score_threshold": 0.5,
141149
"hardmax": True,
142150
},
143151
},
@@ -164,15 +172,15 @@ def test_accuracy_with_image_example(
164172
"missing_prediction_labels": [],
165173
}
166174

167-
metrics = evaluator.evaluate(as_dict=True)
175+
metrics = evaluator.evaluate()
168176

169-
actual_metrics = [m for m in metrics[MetricType.Accuracy]]
177+
actual_metrics = [m.to_dict() for m in metrics[MetricType.Accuracy]]
170178
expected_metrics = [
171179
{
172180
"type": "Accuracy",
173-
"value": [0.5],
181+
"value": 0.5,
174182
"parameters": {
175-
"score_thresholds": [0.0],
183+
"score_threshold": 0.0,
176184
"hardmax": True,
177185
},
178186
},
@@ -199,15 +207,15 @@ def test_accuracy_with_tabular_example(
199207
"missing_prediction_labels": [],
200208
}
201209

202-
metrics = evaluator.evaluate(as_dict=True)
210+
metrics = evaluator.evaluate()
203211

204-
actual_metrics = [m for m in metrics[MetricType.Accuracy]]
212+
actual_metrics = [m.to_dict() for m in metrics[MetricType.Accuracy]]
205213
expected_metrics = [
206214
{
207215
"type": "Accuracy",
208-
"value": [5 / 10],
216+
"value": 0.5,
209217
"parameters": {
210-
"score_thresholds": [0.0],
218+
"score_threshold": 0.0,
211219
"hardmax": True,
212220
},
213221
},

lite/tests/classification/test_confusion_matrix.py

+44-15
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,9 @@ def test_confusion_matrix_basic(basic_classifications: list[Classification]):
117117
actual_metrics = evaluator.compute_confusion_matrix(
118118
score_thresholds=[0.25, 0.75],
119119
number_of_examples=1,
120-
as_dict=True,
121120
)
122121

122+
actual_metrics = [m.to_dict() for m in actual_metrics]
123123
expected_metrics = [
124124
{
125125
"type": "ConfusionMatrix",
@@ -146,7 +146,10 @@ def test_confusion_matrix_basic(basic_classifications: list[Classification]):
146146
},
147147
"missing_predictions": {},
148148
},
149-
"parameters": {"score_threshold": 0.25},
149+
"parameters": {
150+
"score_threshold": 0.25,
151+
"maximum_number_of_examples": 1,
152+
},
150153
},
151154
{
152155
"type": "ConfusionMatrix",
@@ -167,7 +170,10 @@ def test_confusion_matrix_basic(basic_classifications: list[Classification]):
167170
"3": {"count": 1, "examples": [{"datum": "uid2"}]}
168171
},
169172
},
170-
"parameters": {"score_threshold": 0.75},
173+
"parameters": {
174+
"score_threshold": 0.75,
175+
"maximum_number_of_examples": 1,
176+
},
171177
},
172178
]
173179
for m in actual_metrics:
@@ -190,9 +196,9 @@ def test_confusion_matrix_unit(
190196

191197
actual_metrics = evaluator.compute_confusion_matrix(
192198
score_thresholds=[0.5],
193-
as_dict=True,
194199
)
195200

201+
actual_metrics = [m.to_dict() for m in actual_metrics]
196202
expected_metrics = [
197203
{
198204
"type": "ConfusionMatrix",
@@ -208,7 +214,10 @@ def test_confusion_matrix_unit(
208214
},
209215
"missing_predictions": {},
210216
},
211-
"parameters": {"score_threshold": 0.5},
217+
"parameters": {
218+
"score_threshold": 0.5,
219+
"maximum_number_of_examples": 0,
220+
},
212221
},
213222
]
214223
for m in actual_metrics:
@@ -232,9 +241,9 @@ def test_confusion_matrix_with_animal_example(
232241
actual_metrics = evaluator.compute_confusion_matrix(
233242
score_thresholds=[0.5],
234243
number_of_examples=6,
235-
as_dict=True,
236244
)
237245

246+
actual_metrics = [m.to_dict() for m in actual_metrics]
238247
expected_metrics = [
239248
{
240249
"type": "ConfusionMatrix",
@@ -277,7 +286,10 @@ def test_confusion_matrix_with_animal_example(
277286
"dog": {"count": 1, "examples": [{"datum": "uid5"}]}
278287
},
279288
},
280-
"parameters": {"score_threshold": 0.5},
289+
"parameters": {
290+
"score_threshold": 0.5,
291+
"maximum_number_of_examples": 6,
292+
},
281293
},
282294
]
283295
for m in actual_metrics:
@@ -301,9 +313,9 @@ def test_confusion_matrix_with_color_example(
301313
actual_metrics = evaluator.compute_confusion_matrix(
302314
score_thresholds=[0.5],
303315
number_of_examples=6,
304-
as_dict=True,
305316
)
306317

318+
actual_metrics = [m.to_dict() for m in actual_metrics]
307319
expected_metrics = [
308320
{
309321
"type": "ConfusionMatrix",
@@ -348,7 +360,10 @@ def test_confusion_matrix_with_color_example(
348360
"red": {"count": 1, "examples": [{"datum": "uid2"}]}
349361
},
350362
},
351-
"parameters": {"score_threshold": 0.5},
363+
"parameters": {
364+
"score_threshold": 0.5,
365+
"maximum_number_of_examples": 6,
366+
},
352367
},
353368
]
354369
for m in actual_metrics:
@@ -380,9 +395,9 @@ def test_confusion_matrix_multiclass(
380395
actual_metrics = evaluator.compute_confusion_matrix(
381396
score_thresholds=[0.05, 0.5, 0.85],
382397
number_of_examples=5,
383-
as_dict=True,
384398
)
385399

400+
actual_metrics = [m.to_dict() for m in actual_metrics]
386401
expected_metrics = [
387402
{
388403
"type": "ConfusionMatrix",
@@ -427,6 +442,7 @@ def test_confusion_matrix_multiclass(
427442
},
428443
"parameters": {
429444
"score_threshold": 0.05,
445+
"maximum_number_of_examples": 5,
430446
},
431447
},
432448
{
@@ -458,7 +474,10 @@ def test_confusion_matrix_multiclass(
458474
"bee": {"count": 1, "examples": [{"datum": "uid1"}]},
459475
},
460476
},
461-
"parameters": {"score_threshold": 0.5},
477+
"parameters": {
478+
"score_threshold": 0.5,
479+
"maximum_number_of_examples": 5,
480+
},
462481
},
463482
{
464483
"type": "ConfusionMatrix",
@@ -478,6 +497,7 @@ def test_confusion_matrix_multiclass(
478497
},
479498
"parameters": {
480499
"score_threshold": 0.85,
500+
"maximum_number_of_examples": 5,
481501
},
482502
},
483503
]
@@ -511,9 +531,9 @@ def test_confusion_matrix_without_hardmax_animal_example(
511531
score_thresholds=[0.05, 0.4, 0.5],
512532
number_of_examples=6,
513533
hardmax=False,
514-
as_dict=True,
515534
)
516535

536+
actual_metrics = [m.to_dict() for m in actual_metrics]
517537
expected_metrics = [
518538
{
519539
"type": "ConfusionMatrix",
@@ -542,7 +562,10 @@ def test_confusion_matrix_without_hardmax_animal_example(
542562
},
543563
"missing_predictions": {},
544564
},
545-
"parameters": {"score_threshold": 0.05},
565+
"parameters": {
566+
"score_threshold": 0.05,
567+
"maximum_number_of_examples": 6,
568+
},
546569
},
547570
{
548571
"type": "ConfusionMatrix",
@@ -559,7 +582,10 @@ def test_confusion_matrix_without_hardmax_animal_example(
559582
},
560583
"missing_predictions": {},
561584
},
562-
"parameters": {"score_threshold": 0.4},
585+
"parameters": {
586+
"score_threshold": 0.4,
587+
"maximum_number_of_examples": 6,
588+
},
563589
},
564590
{
565591
"type": "ConfusionMatrix",
@@ -576,7 +602,10 @@ def test_confusion_matrix_without_hardmax_animal_example(
576602
}
577603
},
578604
},
579-
"parameters": {"score_threshold": 0.5},
605+
"parameters": {
606+
"score_threshold": 0.5,
607+
"maximum_number_of_examples": 6,
608+
},
580609
},
581610
]
582611
for m in actual_metrics:

0 commit comments

Comments (0)