-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathpate_core.py
456 lines (349 loc) · 15.2 KB
/
pate_core.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
# core method from 'Scalable Private Learning with PATE'
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Core functions for RDP analysis in PATE framework.
This library comprises the core functions for doing differentially private
analysis of the PATE architecture and its various Noisy Max and other
mechanisms.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
from absl import app
import numpy as np
import scipy.stats
def _logaddexp(x):
"""Addition in the log space. Analogue of numpy.logaddexp for a list."""
m = max(x)
return m + math.log(sum(np.exp(x - m)))
def _log1mexp(x):
"""Numerically stable computation of log(1-exp(x))."""
if x < -1:
return math.log1p(-math.exp(x))
elif x < 0:
return math.log(-math.expm1(x))
elif x == 0:
return -np.inf
else:
raise ValueError("Argument must be non-positive.")
def compute_eps_from_delta(orders, rdp, delta):
"""Translates between RDP and (eps, delta)-DP.
Args:
orders: A list (or a scalar) of orders.
rdp: A list of RDP guarantees (of the same length as orders).
delta: Target delta.
Returns:
Pair of (eps, optimal_order).
Raises:
ValueError: If input is malformed.
"""
if len(orders) != len(rdp):
raise ValueError("Input lists must have the same length.")
eps = np.array(rdp) - math.log(delta) / (np.array(orders) - 1)
idx_opt = np.argmin(eps)
return eps[idx_opt], orders[idx_opt]
#####################
# RDP FOR THE GNMAX #
#####################
def compute_logq_gaussian(counts, sigma):
"""Returns an upper bound on ln Pr[outcome != argmax] for GNMax.
Implementation of Proposition 7.
Args:
counts: A numpy array of scores.
sigma: The standard deviation of the Gaussian noise in the GNMax mechanism.
Returns:
logq: Natural log of the probability that outcome is different from argmax.
"""
n = len(counts)
variance = sigma**2
idx_max = np.argmax(counts)
counts_normalized = counts[idx_max] - counts
counts_rest = counts_normalized[np.arange(n) != idx_max] # exclude one index
# Upper bound q via a union bound rather than a more precise calculation.
logq = _logaddexp(
scipy.stats.norm.logsf(counts_rest, scale=math.sqrt(2 * variance)))
print("log q:", logq)
# A sketch of a more accurate estimate, which is currently disabled for two
# reasons:
# 1. Numerical instability;
# 2. Not covered by smooth sensitivity analysis.
# from statsmodels.sandbox.distributions import extras
# covariance = variance * (np.ones((n - 1, n - 1)) + np.identity(n - 1))
# from scipy.stats import multivariate_normal
# print(multivariate_normal.cdf(
# counts_rest, np.zeros(n - 1), covariance, maxpts=1e12))
# logq = np.log1p(-multivariate_normal.cdf(
# counts_rest, np.zeros(n - 1), covariance, maxpts=1e12))
# print("log q 2:", logq)
# print("another:", math.log(1 - (1 / n)))
return min(logq, math.log(1 - (1 / n)))
def rdp_data_independent_gaussian(sigma, orders):
"""Computes a data-independent RDP curve for GNMax.
Implementation of Proposition 8.
Args:
sigma: Standard deviation of Gaussian noise.
orders: An array_like list of Renyi orders.
Returns:
Upper bound on RPD for all orders. A scalar if orders is a scalar.
Raises:
ValueError: If the input is malformed.
"""
if sigma < 0 or np.any(orders <= 1): # not defined for alpha=1
raise ValueError("Inputs are malformed.")
variance = sigma**2
if np.isscalar(orders):
return orders / variance
else:
return np.atleast_1d(orders) / variance
def rdp_gaussian(logq, sigma, orders):
"""Bounds RDP from above of GNMax given an upper bound on q (Theorem 6).
Args:
logq: Natural logarithm of the probability of a non-argmax outcome.
sigma: Standard deviation of Gaussian noise.
orders: An array_like list of Renyi orders.
Returns:
Upper bound on RPD for all orders. A scalar if orders is a scalar.
Raises:
ValueError: If the input is malformed.
"""
if logq > 0 or sigma < 0 or np.any(orders <= 1): # not defined for alpha=1
raise ValueError("Inputs are malformed.")
if np.isneginf(logq): # If the mechanism's output is fixed, it has 0-DP.
if np.isscalar(orders):
return 0.
else:
return np.full_like(orders, 0., dtype=np.float)
variance = sigma**2
# Use two different higher orders: mu_hi1 and mu_hi2 computed according to
# Proposition 10.
mu_hi2 = math.sqrt(variance * -logq)
mu_hi1 = mu_hi2 + 1
orders_vec = np.atleast_1d(orders)
ret = orders_vec / variance # baseline: data-independent bound
# Filter out entries where data-dependent bound does not apply.
mask = np.logical_and(mu_hi1 > orders_vec, mu_hi2 > 1)
print('mask len: ', np.sum(mask))
rdp_hi1 = mu_hi1 / variance
rdp_hi2 = mu_hi2 / variance
log_a2 = (mu_hi2 - 1) * rdp_hi2
# Make sure q is in the increasing wrt q range and A is positive.
if (np.any(mask) and logq <= log_a2 - mu_hi2 *
(math.log(1 + 1 / (mu_hi1 - 1)) + math.log(1 + 1 / (mu_hi2 - 1))) and
-logq > rdp_hi2):
# Use log1p(x) = log(1 + x) to avoid catastrophic cancellations when x ~ 0.
log1q = _log1mexp(logq) # log1q = log(1-q)
log_a = (orders - 1) * (
log1q - _log1mexp((logq + rdp_hi2) * (1 - 1 / mu_hi2)))
log_b = (orders - 1) * (rdp_hi1 - logq / (mu_hi1 - 1))
# Use logaddexp(x, y) = log(e^x + e^y) to avoid overflow for large x, y.
log_s = np.logaddexp(log1q + log_a, logq + log_b)
print('choose data dependent bound: ', np.sum((ret > log_s / (orders - 1))[mask]))
ret[mask] = np.minimum(ret, log_s / (orders - 1))[mask]
else:
print("Condition not satisfied:")
# print("np.any(mask): ", np.any(mask))
# print("logq <= log_a2 - mu_hi2 * (math.log(1 + 1 / (mu_hi1 - 1)) + math.log(1 + 1 / (mu_hi2 - 1))):", logq <= log_a2 - mu_hi2 * (math.log(1 + 1 / (mu_hi1 - 1)) + math.log(1 + 1 / (mu_hi2 - 1))))
# print("-logq > rdp_hi2:", -logq > rdp_hi2)
assert np.all(ret >= 0)
if np.isscalar(orders):
return np.asscalar(ret)
else:
return ret
def double_rdp_gaussian(logq, sigma, orders):
"""Bounds RDP from above of GNMax given an upper bound on q (Theorem 6).
Args:
logq: Natural logarithm of the probability of a non-argmax outcome.
sigma: Standard deviation of Gaussian noise.
orders: An array_like list of Renyi orders.
Returns:
Upper bound on RPD for all orders. A scalar if orders is a scalar.
Raises:
ValueError: If the input is malformed.
"""
if logq > 0 or sigma < 0 or np.any(orders <= 1): # not defined for alpha=1
raise ValueError("Inputs are malformed.")
if np.isneginf(logq): # If the mechanism's output is fixed, it has 0-DP.
if np.isscalar(orders):
return 0.
else:
return np.full_like(orders, 0., dtype=np.float)
variance = sigma**2
# Use two different higher orders: mu_hi1 and mu_hi2 computed according to
# Proposition 10.
mu_hi2 = math.sqrt(variance * -logq)
mu_hi1 = mu_hi2 + 1
orders_vec = np.atleast_1d(orders)
ret = orders_vec / variance # baseline: data-independent bound
ret1 = np.array(ret)
# Filter out entries where data-dependent bound does not apply.
mask = np.logical_and(mu_hi1 > orders_vec, mu_hi2 > 1)
print('mask len: ', np.sum(mask))
rdp_hi1 = mu_hi1 / variance
rdp_hi2 = mu_hi2 / variance
log_a2 = (mu_hi2 - 1) * rdp_hi2
# Make sure q is in the increasing wrt q range and A is positive.
if (np.any(mask) and logq <= log_a2 - mu_hi2 *
(math.log(1 + 1 / (mu_hi1 - 1)) + math.log(1 + 1 / (mu_hi2 - 1))) and
-logq > rdp_hi2):
# Use log1p(x) = log(1 + x) to avoid catastrophic cancellations when x ~ 0.
log1q = _log1mexp(logq) # log1q = log(1-q)
log_a = (orders - 1) * (
log1q - _log1mexp((logq + rdp_hi2) * (1 - 1 / mu_hi2)))
log_b = (orders - 1) * (rdp_hi1 - logq / (mu_hi1 - 1))
# Use logaddexp(x, y) = log(e^x + e^y) to avoid overflow for large x, y.
log_s = np.logaddexp(log1q + log_a, logq + log_b)
print('choose data dependent bound: ', np.sum((ret > log_s / (orders - 1))[mask]))
# always use data-dependent bound without comparison
ret1[mask] = (log_s / (orders - 1))[mask]
else:
print("Condition not satisfied")
assert np.all(ret >= 0)
if np.isscalar(orders):
return np.asscalar(ret1), np.asscalar(ret)
else:
return ret1, ret
def is_data_independent_always_opt_gaussian(num_teachers, num_classes, sigma,
orders):
"""Tests whether data-ind bound is always optimal for GNMax.
Args:
num_teachers: Number of teachers.
num_classes: Number of classes.
sigma: Standard deviation of the Gaussian noise.
orders: An array_like list of Renyi orders.
Returns:
Boolean array of length |orders| (a scalar if orders is a scalar). True if
the data-independent bound is always the same as the data-dependent bound.
"""
unanimous = np.array([num_teachers] + [0] * (num_classes - 1))
logq = compute_logq_gaussian(unanimous, sigma)
rdp_dep = rdp_gaussian(logq, sigma, orders)
rdp_ind = rdp_data_independent_gaussian(sigma, orders)
return np.isclose(rdp_dep, rdp_ind)
###################################
# RDP FOR THE THRESHOLD MECHANISM #
###################################
def compute_logpr_answered(t, sigma, counts):
"""Computes log of the probability that a noisy threshold is crossed.
Args:
t: The threshold.
sigma: The stdev of the Gaussian noise added to the threshold.
counts: An array of votes.
Returns:
Natural log of the probability that max is larger than a noisy threshold.
"""
# Compared to the paper, max(counts) is rounded to the nearest integer. This
# is done to facilitate computation of smooth sensitivity for the case of
# the interactive mechanism, where votes are not necessarily integer.
return scipy.stats.norm.logsf(t - round(max(counts)), scale=sigma)
def compute_rdp_data_independent_threshold(sigma, orders):
# The input to the threshold mechanism has stability 1, compared to
# GNMax, which has stability = 2. Hence the sqrt(2) factor below.
return rdp_data_independent_gaussian(2**.5 * sigma, orders)
def compute_rdp_threshold(log_pr_answered, sigma, orders):
logq = min(log_pr_answered, _log1mexp(log_pr_answered))
# The input to the threshold mechanism has stability 1, compared to
# GNMax, which has stability = 2. Hence the sqrt(2) factor below.
return rdp_gaussian(logq, 2**.5 * sigma, orders)
def is_data_independent_always_opt_threshold(num_teachers, threshold, sigma,
orders):
"""Tests whether data-ind bound is always optimal for the threshold mechanism.
Args:
num_teachers: Number of teachers.
threshold: The cut-off threshold.
sigma: Standard deviation of the Gaussian noise.
orders: An array_like list of Renyi orders.
Returns:
Boolean array of length |orders| (a scalar if orders is a scalar). True if
the data-independent bound is always the same as the data-dependent bound.
"""
# Since the data-dependent bound depends only on max(votes), it suffices to
# check whether the data-dependent bounds are better than data-independent
# bounds in the extreme cases when max(votes) is minimal or maximal.
# For both Confident GNMax and Interactive GNMax it holds that
# 0 <= max(votes) <= num_teachers.
# The upper bound is trivial in both cases.
# The lower bound is trivial for Confident GNMax (and a stronger one, based on
# the pigeonhole principle, is possible).
# For Interactive GNMax (Algorithm 2), the lower bound follows from the
# following argument. Since the votes vector is the difference between the
# actual teachers' votes and the student's baseline, we need to argue that
# max(n_j - M * p_j) >= 0.
# The bound holds because sum_j n_j = sum M * p_j = M. Thus,
# sum_j (n_j - M * p_j) = 0, and max_j (n_j - M * p_j) >= 0 as needed.
logq1 = compute_logpr_answered(threshold, sigma, [0])
logq2 = compute_logpr_answered(threshold, sigma, [num_teachers])
rdp_dep1 = compute_rdp_threshold(logq1, sigma, orders)
rdp_dep2 = compute_rdp_threshold(logq2, sigma, orders)
rdp_ind = compute_rdp_data_independent_threshold(sigma, orders)
return np.isclose(rdp_dep1, rdp_ind) and np.isclose(rdp_dep2, rdp_ind)
#############################
# RDP FOR THE LAPLACE NOISE #
#############################
def compute_logq_laplace(counts, lmbd):
"""Computes an upper bound on log Pr[outcome != argmax] for LNMax.
Args:
counts: A list of scores.
lmbd: The lambda parameter of the Laplace distribution ~exp(-|x| / lambda).
Returns:
logq: Natural log of the probability that outcome is different from argmax.
"""
# For noisy max, we only get an upper bound via the union bound. See Lemma 4
# in https://arxiv.org/abs/1610.05755.
#
# Pr[ j beats i*] = (2+gap(j,i*))/ 4 exp(gap(j,i*)
# proof at http://mathoverflow.net/questions/66763/
idx_max = np.argmax(counts)
counts_normalized = (counts - counts[idx_max]) / lmbd
counts_rest = np.array(
[counts_normalized[i] for i in range(len(counts)) if i != idx_max])
logq = _logaddexp(np.log(2 - counts_rest) + math.log(.25) + counts_rest)
return min(logq, math.log(1 - (1 / len(counts))))
def rdp_pure_eps(logq, pure_eps, orders):
"""Computes the RDP value given logq and pure privacy eps.
Implementation of https://arxiv.org/abs/1610.05755, Theorem 3.
The bound used is the min of three terms. The first term is from
https://arxiv.org/pdf/1605.02065.pdf.
The second term is based on the fact that when event has probability (1-q) for
q close to zero, q can only change by exp(eps), which corresponds to a
much smaller multiplicative change in (1-q)
The third term comes directly from the privacy guarantee.
Args:
logq: Natural logarithm of the probability of a non-optimal outcome.
pure_eps: eps parameter for DP
orders: array_like list of moments to compute.
Returns:
Array of upper bounds on rdp (a scalar if orders is a scalar).
"""
orders_vec = np.atleast_1d(orders)
q = math.exp(logq)
log_t = np.full_like(orders_vec, np.inf)
if q <= 1 / (math.exp(pure_eps) + 1):
logt_one = math.log1p(-q) + (
math.log1p(-q) - _log1mexp(pure_eps + logq)) * (
orders_vec - 1)
logt_two = logq + pure_eps * (orders_vec - 1)
log_t = np.logaddexp(logt_one, logt_two)
ret = np.minimum(
np.minimum(0.5 * pure_eps * pure_eps * orders_vec,
log_t / (orders_vec - 1)), pure_eps)
if np.isscalar(orders):
return np.asscalar(ret)
else:
return ret
def main(argv):
del argv # Unused.
if __name__ == "__main__":
app.run(main)