#!/usr/bin/env python
# coding: utf-8
# In this notebook, all the calculations for the likelihood are gathered.
#
# The main function looks for the best-fitting probability distribution, which then makes it possible to determine the likelihood of an event from the CDF of that distribution.
# In[1]:
get_ipython().run_line_magic('matplotlib', 'inline')
import warnings
import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.api as sm
from scipy.stats._continuous_distns import _distn_names
import matplotlib
import matplotlib.pyplot as plt
import statistics
# In[3]:
# Create models from data
def best_fit_distribution(data, bins=5000, ax=None):
    """Model data by finding the best-fitting distribution to the data"""
    # Get histogram of original data
    y, x = np.histogram(data, bins=bins, density=True)
    x = (x + np.roll(x, -1))[:-1] / 2.0

    # Best holders
    best_distributions = []

    # Estimate distribution parameters from data
    for ii, distribution in enumerate([d for d in _distn_names if d not in ['levy_stable', 'studentized_range']]):
        #print("{:>3} / {:<3}: {}".format( ii+1, len(_distn_names), distribution ))
        distribution = getattr(st, distribution)

        # Try to fit the distribution
        try:
            # Ignore warnings from data that can't be fit
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore')

                # Fit dist to data
                params = distribution.fit(data)

                # Separate parts of parameters
                arg = params[:-2]
                loc = params[-2]
                scale = params[-1]

                # Calculate the fitted PDF and the error of the fit
                pdf = distribution.pdf(x, loc=loc, scale=scale, *arg)
                sse = np.sum(np.power(y - pdf, 2.0))  # error calculated here as the sum of squared errors
                # y : observed values from the histogram
                # pdf : predicted values from the distribution function

                # If an axis was passed in, add the fitted PDF to the plot
                try:
                    if ax:
                        pd.Series(pdf, x).plot(ax=ax)
                except Exception:
                    pass

                # Keep this distribution; the best ones are selected by sorting on SSE below
                best_distributions.append((distribution, params, sse))

        except Exception:
            pass

    return sorted(best_distributions, key=lambda x: x[2])
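# In[ ]:
# Illustrative usage sketch (not part of the original analysis; the names `sample_data` and
# `ranked` are assumptions): rank the candidate distributions on a synthetic sample.
# Uncomment to try; looping over all scipy distributions can take a few minutes.
#sample_data = pd.Series(np.random.normal(loc=20.0, scale=3.0, size=2000))
#ranked = best_fit_distribution(sample_data, bins=200)
#best_dist, best_params, best_sse = ranked[0]  # lowest SSE first
#print(best_dist.name, best_params, best_sse)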
# In[4]:
def make_pdf(dist, params, size=10000):
    """Generate the distribution's Probability Density Function (PDF)"""
    # Separate parts of parameters
    arg = params[:-2]
    loc = params[-2]
    scale = params[-1]

    # Get sane start and end points of the distribution
    start = dist.ppf(0.01, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.01, loc=loc, scale=scale)
    end = dist.ppf(0.99, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.99, loc=loc, scale=scale)

    # Build the PDF and turn it into a pandas Series
    x = np.linspace(start, end, size)
    y = dist.pdf(x, loc=loc, scale=scale, *arg)
    pdf = pd.Series(y, x)

    return pdf
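# In[ ]:
# Illustrative sketch (assumption, not from the original notebook): build the PDF series of a
# normal distribution; params follows scipy's (shape..., loc, scale) ordering, here just (loc, scale).
#pdf_norm = make_pdf(st.norm, (20.0, 3.0))
#pdf_norm.plot(lw=2)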
# In[5]:
def look_best_distr(data, climate_var, unit):
    matplotlib.rcParams['figure.figsize'] = (16.0, 12.0)
    matplotlib.style.use('ggplot')

    # Plot for comparison
    plt.figure(figsize=(12, 8))
    ax = data.plot(kind='hist', bins=50, density=True, alpha=0.5, color=list(matplotlib.rcParams['axes.prop_cycle'])[1]['color'])

    # Save plot limits
    dataYLim = ax.get_ylim()

    # Find the best fit distribution
    best_distributions = best_fit_distribution(data, 200, ax)
    best_dist = best_distributions[0]

    # Update plots
    ax.set_ylim(dataYLim)
    ax.set_title(u'Histogram data\n All Fitted Distributions')
    ax.set_xlabel(climate_var + ' ' + unit)
    ax.set_ylabel('Frequency')

    # Make PDF with best params
    pdf = make_pdf(best_dist[0], best_dist[1])

    # Display
    plt.figure(figsize=(12, 8))
    ax = pdf.plot(lw=2, label='PDF', legend=True)
    data.plot(kind='hist', bins=50, density=True, alpha=0.5, label='Data', legend=True, ax=ax)

    param_names = (best_dist[0].shapes + ', loc, scale').split(', ') if best_dist[0].shapes else ['loc', 'scale']
    param_str = ', '.join(['{}={:0.2f}'.format(k, v) for k, v in zip(param_names, best_dist[1])])
    dist_str = '{}({})'.format(best_dist[0].name, param_str)

    ax.set_title(u'Data with best fit distribution \n' + dist_str)
    ax.set_xlabel(climate_var + ' ' + unit)
    ax.set_ylabel('Frequency')

    print('best distribution ' + str(best_dist[0].name))
    return best_dist[0], param_str  # return the best distribution object and its parameters as a string
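# In[ ]:
# Illustrative sketch (assumption): fit all candidate distributions to a synthetic temperature
# sample and recover the best distribution object together with its parameter string.
#sample_tas = pd.Series(np.random.normal(loc=36.0, scale=2.6, size=1000))
#distribution, params_str = look_best_distr(sample_tas, 'tas', 'degC')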
# In[6]:
# Before applying this function, the scipy distribution function attributed to the data (e.g. rice) needs to be imported, otherwise it will not work.
# Idea of a function that gives you directly the likelihood of an event.
# rice is a distribution function
#rice.ppf(0.95,1.8,loc=36.01,scale=2.61)
#rice.cdf(45.512922702726385,1.8,loc=36.01,scale=2.61)
#rice.ppf(0.95,1.8,loc=36.01,scale=2.61)
#rice.ppf(0.05,1.8,loc=36.01,scale=2.61)
# In[7]:
def probability(event, name_distr, params):
    # Separate parts of parameters
    arg = params[:-2]
    loc = params[-2]
    scale = params[-1]

    # Find the distribution object based on its name.
    # The getattr() method returns the value of the named attribute of an object;
    # if not found, it returns the default value provided to the function.
    # https://www.programiz.com/python-programming/methods/built-in/getattr
    distribution = getattr(st, name_distr)  # st is the short name for the module 'scipy.stats'

    pdf_ = distribution.pdf(event, loc=loc, scale=scale, *arg)
    cdf_ = distribution.cdf(event, loc=loc, scale=scale, *arg)
    return pdf_, cdf_
# In[8]:
def range_likelihood(event, unit, distribution, params):
    arg = params[:-2]
    loc = params[-2]
    scale = params[-1]
    #distribution = getattr(st, name_distr) # st is the short name for the module 'scipy.stats'

    # Position of the event within the fitted distribution
    cdf_event = distribution.cdf(event, loc=loc, scale=scale, *arg)

    if cdf_event >= 0.05 and cdf_event <= 0.95:
        if cdf_event >= 0.17 and cdf_event <= 0.83:
            if cdf_event >= 0.25 and cdf_event <= 0.75:
                likelihood = str(round(distribution.ppf(0.25, loc=loc, scale=scale, *arg), 2)) + '-' + str(round(distribution.ppf(0.75, loc=loc, scale=scale, *arg), 2)) + unit + ', probable range'  # more likely than not
                return likelihood
            likelihood = str(round(distribution.ppf(0.17, loc=loc, scale=scale, *arg), 2)) + '-' + str(round(distribution.ppf(0.83, loc=loc, scale=scale, *arg), 2)) + unit + ', likely range'
            return likelihood
        likelihood = str(round(distribution.ppf(0.05, loc=loc, scale=scale, *arg), 2)) + '-' + str(round(distribution.ppf(0.95, loc=loc, scale=scale, *arg), 2)) + unit + ', very likely range'
        return likelihood
    else:
        likelihood = 'very unlikely range'
        return likelihood
    print('Problem, no likelihood was found')
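# In[ ]:
# Illustrative sketch (assumption): here params is scipy's numeric (shape..., loc, scale) tuple,
# e.g. a normal distribution centred on 36 degC with scale 2.5; the event 38 degC sits at
# roughly the 79th percentile, i.e. inside the 17-83% likely range.
#print(range_likelihood(38.0, 'degC', st.norm, (36.0, 2.5)))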
# In[9]:
def likelihood(event, type_event, unit, distribution, params):
    proba_event = type_event_f(event, type_event, distribution, params)
    likelihood = define_likelihood(proba_event)
    return proba_event, likelihood
# In[10]:
def type_event_f(event, type_event, distribution, params):
    # params is a string such as 'b=1.80, loc=36.01, scale=2.61' (the format returned by look_best_distr)
    params1 = params.split(',')
    arg = []
    for p in params1:
        if 'loc' in p:
            temp = p.split('=')
            loc = float(temp[len(temp) - 1])
            continue
        if 'scale' in p:
            temp = p.split('=')
            scale = float(temp[len(temp) - 1])
            continue
        else:
            temp = p.split('=')
            arg.append(float(temp[len(temp) - 1]))
            continue
    #distribution = getattr(st, distribution) # st is the short name for the module 'scipy.stats'
    if type_event == '=':
        proba_event = distribution.pdf(event, loc=loc, scale=scale, *arg)
    if type_event == '>':  # likelihood of the variable being over the threshold
        proba_event = 1 - distribution.cdf(event, loc=loc, scale=scale, *arg)
    if type_event == '<':  # likelihood of the variable being under the threshold
        proba_event = distribution.cdf(event, loc=loc, scale=scale, *arg)
    return proba_event
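# In[ ]:
# Illustrative sketch (assumption): params is the comma-separated string produced by
# look_best_distr, parsed back into loc/scale/shape values by type_event_f.
#print(type_event_f(40, '>', st.norm, 'loc=36.01, scale=2.61'))  # P(X > 40)
#print(type_event_f(40, '<', st.norm, 'loc=36.01, scale=2.61'))  # P(X < 40)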
# In[11]:
def define_likelihood(proba_event):
    if proba_event < 0.05:
        likelihood = 'Rare'
        return likelihood
    if proba_event < 0.2:
        likelihood = 'Unlikely'
        return likelihood
    if proba_event < 0.5:
        likelihood = 'Moderate'
        return likelihood
    if proba_event < 0.8:
        likelihood = 'Likely'
        return likelihood
    if proba_event < 0.95:
        likelihood = 'Almost certain'
        return likelihood
    else:  # over 0.95
        likelihood = 'Certain'
        return likelihood
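# In[ ]:
# Illustrative sketch (assumption): map a probability onto the qualitative scale, and combine
# type_event_f and define_likelihood through likelihood().
#print(define_likelihood(0.63))  # 'Likely'
#print(likelihood(40, '>', 'degC', st.norm, 'loc=36.01, scale=2.61'))  # (probability, label)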
# In[12]:
# The input dataframe df should have no NaN values (apply .dropna() to the input).
# climate_var is 'tas', 'tasmax', 'tasmin' or 'pr'
# name_column is the name of the column containing the values of interest
# event is the threshold value of the event to look at
# type_event is either '=', '>' or '<'
# Examples:
# if we are interested in the event 'the temperature will go over 40', event = 40, type_event = '>'
# if we are interested in the event 'the temperature will go under 40', event = 40, type_event = '<'
def likelihood_accross_models(df, climate_var, unit, name_column, event, type_event):
    proba_event = []
    model_not_in_final_calculation = []
    for model in list(set(df['Model'])):
        (distribution, params) = look_best_distr(df[df['Model'] == model][[name_column]], climate_var, unit)
        print('params ' + str(params))
        proba_event_model = type_event_f(event, type_event, distribution, params)
        print('proba_event_model ' + str(proba_event_model))
        if not np.isnan(proba_event_model):  # very small fitted parameters can make the probability evaluate to NaN
            proba_event.append(proba_event_model)
            print('proba_event_model registered')
        else:
            model_not_in_final_calculation.append(model)
    proba_event_accross_model = statistics.mean(proba_event)
    likelihood = define_likelihood(proba_event_accross_model)
    return proba_event_accross_model, likelihood
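# In[ ]:
# Illustrative sketch (assumption): a toy dataframe with a 'Model' column and a value column,
# in the shape expected by likelihood_accross_models; fitting every scipy distribution per model is slow.
#toy_tas = np.concatenate([np.random.normal(35, 2, 500), np.random.normal(37, 2, 500)])
#toy_df = pd.DataFrame({'Model': ['ModelA'] * 500 + ['ModelB'] * 500, 'tas': toy_tas})
#proba, label = likelihood_accross_models(toy_df.dropna(), 'tas', 'degC', 'tas', 40, '>')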
# In[ ]:
# The input dataframe df should have no NaN values (apply .dropna() to the input).
# CAREFUL --> only input one project at a time, not several in the same dataframe
# climate_var is 'tas', 'tasmax', 'tasmin' or 'pr'
# name_column is the name of the column containing the values of interest
# event is the threshold value of the event to look at
# type_event is either '=', '>' or '<'
# Examples:
# if we are interested in the event 'the temperature will go over 40', event = 40, type_event = '>'
# if we are interested in the event 'the temperature will go under 40', event = 40, type_event = '<'
def likelihood_accross_models_and_ssps(df, climate_var, unit, name_column, event, type_event):
    proba_event = []
    model_not_in_final_calculation = []
    for model in list(set(df['Model'])):
        for ssp in list(set(df['Experiment'])):
            (distribution, params) = look_best_distr(df[(df['Model'] == model) & (df['Experiment'] == ssp)][[name_column]], climate_var, unit)
            print('params ' + str(params))
            proba_event_model = type_event_f(event, type_event, distribution, params)
            print('proba_event_model ' + str(proba_event_model))
            if not np.isnan(proba_event_model):  # very small fitted parameters can make the probability evaluate to NaN
                proba_event.append(proba_event_model)
                print('proba_event_model registered')
            else:
                model_not_in_final_calculation.append(model)
    proba_event_accross_model = statistics.mean(proba_event)
    likelihood = define_likelihood(proba_event_accross_model)
    return proba_event_accross_model, likelihood
# In[ ]: