# TL_functions.py

"""Module with functions to carry out analyses for the TL project"""
from __future__ import division, with_statement
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid.inset_locator import inset_axes
import pypartitions as parts
from macroecotools import AICc
import numpy as np
import scipy
from scipy import stats
import scikits.statsmodels.api as sm
import random
import csv
import signal
from pyper import *
from contextlib import contextmanager

# Define constants: thresholds applied by inclusion_criteria() to decide which data enter the analysis
Q_MIN = 5 # Minimal Q for a (Q, N) combo to be included 
N_MIN = 3 # Minimal N for a (Q, N) combo to be included
n_MIN = 5 # Minimal number of valid (Q, N) points in a study for the study to be included 

class TimeoutException(Exception):
    """Exception raised by the alarm handler of time_limit() when the time allowance runs out"""

@contextmanager
def time_limit(seconds):
    """Context manager to abandon the enclosed step after a given time.
    
    A SIGALRM handler is installed and an alarm is armed for the given number of seconds. If the 
    alarm fires while the enclosed block is still running, TimeoutException is raised inside it.
    On exit (normal or not) the alarm is cancelled and the handler that was in place before is restored.
    SIGALRM is only available on Unix, and signal handlers can only be set from the main thread.
    
    Input:
    seconds - number of seconds before the enclosed block is interrupted (0 arms no alarm)
    
    """
    def signal_handler(signum, frame):
        # Call form of raise works under both Python 2 and Python 3
        raise TimeoutException('Time out!')
    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)
        # signal.signal() returns None when the old handler was not installed from Python; 
        # such a handler cannot be re-installed, so it is left alone.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)

def get_QN_mean_var_data(data_dir):
    """Load the tab-delimited table holding study, Q, N, mean, and variance.
    
    Input:
    data_dir - path of the data file, handed directly to np.genfromtxt
    Output: structured array with fields 'study', 'Q', 'N', 'mean', 'var'
    
    """
    field_names = ['study', 'Q', 'N', 'mean', 'var']
    # NOTE(review): 'i15' and 'f15' are unusual item sizes - confirm the numpy version in use accepts them
    field_types = 'S25, i15, i15, f15, f15'
    return np.genfromtxt(data_dir, dtype = field_types, delimiter = '\t', names = field_names)

def get_study_info(data_dir):
    """Load the tab-delimited table holding study, taxon, and type.
    
    Input:
    data_dir - path of the data file, handed directly to np.genfromtxt
    Output: structured array with string fields 'study', 'taxon', 'type'
    
    """
    field_names = ['study', 'taxon', 'type']
    field_types = 'S25, S25, S25'
    return np.genfromtxt(data_dir, dtype = field_types, delimiter = '\t', names = field_names)

def get_var_sample_file(data_dir, sample_size = 1000):
    """Load the tab-delimited file written by sample_var().
    
    Input:
    data_dir - path of the file
    sample_size - number of sample columns in the file (named 'sample1' ... 'sample<sample_size>')
    Output: structured array with fields 'study', 'Q', 'N', 'mean', 'var' followed by the sample columns
    
    """
    columns = ['study', 'Q', 'N', 'mean', 'var']
    columns += ['sample%d' % k for k in range(1, sample_size + 1)]
    # First three columns are study, Q, N; every remaining column is read as a float
    # NOTE(review): 'i15' is an unusual item size - confirm the numpy version in use accepts it
    column_types = 'S15, i15, i15' + ',<f8' * (len(columns) - 3)
    return np.genfromtxt(data_dir, delimiter = '\t', names = columns, dtype = column_types)

def get_val_ind_sample_file(data_dir, sample_size = 1000):
    """Read in a file with 'study' as the first column, value from empirical TL as the second column,
    
    and one value from each simulated sample as the rest of the columns.
    Fields are tab-delimited; surrounding white space in each field is stripped (autostrip).
    
    Input:
    data_dir - path of the file, handed directly to np.genfromtxt
    sample_size - number of sample columns in the file (named 'sample1' ... 'sample<sample_size>')
    Output: structured array with fields 'study', 'emp_val', and the sample columns
    
    """
    names_data = ['study', 'emp_val']
    # range() gives the same names under Python 2 and also runs under Python 3
    names_data.extend(['sample' + str(i) for i in range(1, sample_size + 1)])
    # 'study' is a string, all other columns are floats
    type_data = 'S15' + ',<f8' * (len(names_data) - 1)
    return np.genfromtxt(data_dir, delimiter = '\t', names = names_data, dtype = type_data, autostrip = True)

def get_tl_par_file(data_dir):
    """Load the space-delimited summary file written by TL_from_sample().
    
    Input:
    data_dir - path of the file, handed directly to np.genfromtxt
    Output: structured array with the study ID followed by 14 numeric summary columns
    
    """
    obs_columns = ['b_obs', 'inter_obs', 'R2_obs', 'p_obs']
    sample_columns = ['b_expc', 'inter_expc', 'R2_expc', 'p_sample']
    b_columns = ['b_z', 'b_lower', 'b_upper']
    inter_columns = ['inter_z', 'inter_lower', 'inter_upper']
    column_names = ['study'] + obs_columns + sample_columns + b_columns + inter_columns
    # NOTE(review): 'f15' is an unusual item size - confirm the numpy version in use accepts it
    column_types = 'S15' + ',f15' * 14
    return np.genfromtxt(data_dir, delimiter = ' ', names = column_names, dtype = column_types)
    
def RandomComposition_weak(q, n):
    """Draw one random weak composition of q into n parts (parts are allowed to be zero).
    
    n - 1 cut points are drawn with replacement from the integers 0, ..., q - 1 and sorted; the parts 
    are the gaps between consecutive cut points, with 0 and q added as the two outer boundaries.
    Because every cut point is below q, the last part is always at least 1.
    
    Input:
    q - total to be divided 
    n - number of parts
    Output: list of n non-negative integers summing to q, in the order of the cut points (not sorted)
    
    """
    cut_points = sorted(np.random.randint(0, q, n - 1))
    # Build the two boundary lists once instead of on every iteration
    lower_bounds = [0] + cut_points
    upper_bounds = cut_points + [q]
    return [upper - lower for lower, upper in zip(lower_bounds, upper_bounds)]

def rand_compositions(q, n, sample_size, zeros):
    """Generate random compositions of q into n parts, each sorted in decreasing order.
    
    sample_size compositions are drawn with RandomComposition_weak(); duplicates are then removed, 
    so the returned list can be shorter than sample_size and its order is not defined.
    
    Input:
    q - total to be divided
    n - number of parts
    sample_size - number of compositions to draw
    zeros - only used as a prefix of the error message printed when a draw is inconsistent
    Output: list of unique compositions, each a list of n integers in decreasing order
    
    """
    import sys # sys is not imported at the top of this file, so bring it in where it is needed
    
    comps = []
    while len(comps) < sample_size:
        comp = RandomComposition_weak(q, n)
        if len(comp) != n or sum(comp) != q:
            # Sanity check on the sampler; same space-separated message as before, 
            # built as one string so that it prints identically under Python 2 and 3
            message = [zeros, 'error: q=', q, '!=sum(comp)=', sum(comp), 'or n=', n, '!=len(comp)=', len(comp)]
            print(' '.join([str(item) for item in message]))
            sys.exit()
        comps.append(sorted(comp, reverse = True))
    
    return [list(unique) for unique in set(tuple(comp) for comp in comps)]

def get_var_for_Q_N(q, n, sample_size, t_limit, analysis):
    """Given q and n, returns a list of variance of length sample size with variance of 
    
    each sample partitions or compositions.
    
    Input:
    q, n - total and number of parts
    sample_size - number of samples to draw
    t_limit - time allowance in seconds for drawing all samples 
    analysis - 'partition' to sample with pypartitions; any other value samples compositions
    Output: list of sample variances (ddof = 1). If the time allowance runs out, 'Timed out!' is 
    printed and the variances obtained so far are returned, so the list is shorter than sample_size.
    
    """
    QN_var = []
    try:
        with time_limit(t_limit):
            for _ in range(sample_size):
                # One sample is drawn per iteration so that partial results survive a timeout
                if analysis == 'partition':
                    QN_parts = parts.rand_partitions(q, n, 1, 'bottom_up', {}, True)
                else: 
                    QN_parts = rand_compositions(q, n, 1, True)
                QN_var.append(np.var(QN_parts[0], ddof = 1))
    except TimeoutException:
        print('Timed out!')
    return QN_var

def sample_var(data, study, sample_size = 1000, t_limit = 7200, analysis = 'partition', out_folder = './out_files/'):
    """Obtain and record the variance of partition or composition samples.
    
    Results for the study are appended to the file 
    out_folder + 'taylor_QN_var_predicted_' + analysis + '_' + str(sample_size) + '_full.txt',
    one tab-delimited row per Q-N combo (the original record followed by the sample variances).
    Nothing is written if sampling for any Q-N combo of the study did not complete.
    
    Input:
    data - data list read in with get_QN_mean_var_data()
    study - ID of study
    sample_size - number of samples to be drawn, default value is 1000
    t_limit - abort sampling procedure for one Q-N combo after t_limit seconds, default value is 7200 (2 hours)
    analysis - partition or composition
    out_folder - folder (with trailing separator) in which the output file is located
    
    """
    data_study = data[data['study'] == study]
    var_parts = []
    for record in data_study:
        q, n = record[1], record[2]
        QN_var = get_var_for_Q_N(q, n, sample_size, t_limit, analysis)
        if len(QN_var) != sample_size:
            return # A Q-N combo was not fully sampled: the whole study is omitted
        var_parts.append(list(record) + QN_var)
    
    out_path = out_folder + 'taylor_QN_var_predicted_' + analysis + '_' + str(sample_size) + '_full.txt'
    # The with-block closes the file even if writing a row fails
    with open(out_path, 'a') as out_write_var:
        for var_row in var_parts:
            out_write_var.write('\t'.join([str(x) for x in var_row]) + '\n')

def get_z_score(emp_var, sim_var_list):
    """Standardize an empirical value against the distribution of simulated values.
    
    Input:
    emp_var - empirical value
    sim_var_list - values obtained from the samples
    Output: (empirical value - sample mean) / sample standard deviation (ddof = 1)
    
    """
    sim_mean = np.mean(sim_var_list)
    sim_sd = np.var(sim_var_list, ddof = 1) ** 0.5
    deviation = emp_var - sim_mean
    return deviation / sim_sd

def quadratic_term(list_of_mean, list_of_var):
    """Regress log(variance) on log(mean) and log(mean)^2 and return the p-value of the squared term.
    
    Records with zero (or negative) variance are left out before taking logs.
    
    """
    log_var = [np.log(v) for v in list_of_var if v > 0]
    log_mean = [np.log(m) for i, m in enumerate(list_of_mean) if list_of_var[i] > 0]
    log_mean_sq = [lm ** 2 for lm in log_mean]
    # Design matrix columns: constant, log(mean), log(mean)^2
    predictors = np.column_stack((log_mean, log_mean_sq))
    design = sm.add_constant(predictors, prepend = True)
    fit = sm.OLS(log_var, design).fit()
    # Index 2 is the squared term because the constant is prepended as column 0
    return fit.pvalues[2]

def TL_from_sample(dat_sample, analysis = 'partition', out_folder = './out_files/'):
    """Obtain the empirical and simulated TL relationship given the output file from sample_var().
    
    Here only the summary statistics are recorded for each study, instead of results from each 
    individual sample, because the analysis can be quickly re-done given the input file, without
    going through the time-limiting step of generating samples from partitions.
    The input dat_sample is in the same format as defined by get_var_sample_file().
    One space-delimited row per study is appended to out_folder + 'TL_form_' + analysis + '.txt'.
    The output file has the following columns: 
    study, empirical b, empirical intercept, empirical R-squared, empirical p-value, mean b, intercept, R-squared from samples, 
    percentage of significant TL in samples (at alpha = 0.05), z-score between empirical and sample b, 2.5 and 97.5 percentile of sample b,
    z-score between empirical and sample intercept, 2.5 and 97.5 percentile of sample intercept.
    
    Input:
    dat_sample - structured array; every field after the first five is treated as one simulated sample
    analysis - partition or composition, only used in the name of the output file
    out_folder - folder (with trailing separator) in which the output file is located
    
    """
    sim_names = dat_sample.dtype.names[5:]
    out_path = out_folder + 'TL_form_' + analysis + '.txt'
    for study in sorted(np.unique(dat_sample['study'])):
        dat_study = dat_sample[dat_sample['study'] == study]
        emp_b, emp_inter, emp_r, emp_p, emp_std_err = stats.linregress(np.log(dat_study['mean']), np.log(dat_study['var']))
        b_list = []
        inter_list = []
        R2_list = []
        n_sig = 0
        for i_sim in sim_names:
            positive = dat_study[i_sim] > 0 # Omit samples of zero variance 
            sim_b, sim_inter, sim_r, sim_p, sim_std_error = stats.linregress(np.log(dat_study['mean'][positive]), 
                                                                             np.log(dat_study[i_sim][positive]))
            b_list.append(sim_b)
            inter_list.append(sim_inter)
            R2_list.append(sim_r ** 2)
            if sim_p < 0.05: n_sig += 1
        psig = n_sig / float(len(sim_names))
        out_row = [study, emp_b, emp_inter, emp_r ** 2, emp_p, np.mean(b_list), np.mean(inter_list), np.mean(R2_list), 
                   psig, get_z_score(emp_b, b_list), np.percentile(b_list, 2.5), np.percentile(b_list, 97.5), 
                   get_z_score(emp_inter, inter_list), np.percentile(inter_list, 2.5), np.percentile(inter_list, 97.5)]
        # Re-opened for every study so that finished studies are on disk if a later one fails; 
        # the with-block closes the file even when the write raises
        with open(out_path, 'a') as out_file:
            out_file.write(' '.join([str(x) for x in out_row]) + '\n')

def get_quadratic_sig_data(dat_sample, analysis = 'partition', out_folder = './out_files/'):
    """Compute the p-value of the quadratic term for each dataset
    
    as well as all of its partitions/compositions and write results to file.
    One row per study is appended to out_folder + 'TL_quad_p_' + analysis + '.txt', holding the study ID, 
    the p-value for the empirical data, and the p-value for each simulated sample, joined with ' \t'.
    
    Input:
    dat_sample - structured array; every field after the first five is treated as one simulated sample
    analysis - partition or composition, only used in the name of the output file
    out_folder - folder (with trailing separator) in which the output file is located
    
    """
    sim_names = dat_sample.dtype.names[5:]
    out_path = out_folder + 'TL_quad_p_' + analysis + '.txt'
    for study in sorted(np.unique(dat_sample['study'])):
        dat_study = dat_sample[dat_sample['study'] == study]
        p_list = [study, quadratic_term(dat_study['mean'], dat_study['var'])]
        for i_sim in sim_names:
            positive = dat_study[i_sim] > 0 # Omit samples of zero variance 
            p_list.append(quadratic_term(dat_study['mean'][positive], dat_study[i_sim][positive]))
        # Re-opened for every study so that finished studies are on disk if a later one fails; 
        # the with-block closes the file even when the write raises
        with open(out_path, 'a') as out_file:
            out_file.write(' \t'.join(map(str, p_list)) + '\n')
    
def inclusion_criteria(dat_study, sig = False):
    """Criteria that datasets need to meet to be included in the analysis
    
    Records with N < N_MIN or Q < Q_MIN are dropped first. The dataset is included if at least n_MIN 
    records remain and, when sig is True, the regression of log(var) on log(mean) has p < 0.05.
    
    Input:
    dat_study - structured array with fields 'N', 'Q', 'mean', 'var' for one study
    sig - whether a significant relationship is required
    Output: True if the dataset is to be included, False otherwise (always a bool)
    
    """
    dat_study = dat_study[(dat_study['N'] >= N_MIN) & (dat_study['Q'] >= Q_MIN)]
    if len(dat_study) < n_MIN:
        return False
    if not sig: # Significance is not required, so the regression is not needed
        return True
    b, inter, rval, pval, std_err = stats.linregress(np.log(dat_study['mean']), np.log(dat_study['var']))
    return bool(pval < 0.05)

# Below are functions for plotting
def plot_obs_expc(obs, expc, expc_upper, expc_lower, obs_type, loglog, legend = False, loc = 2, ax = None):
    """Generic function to generate an observed vs expected figure with 1:1 line, 
    
    with obs on the x-axis, expected on the y-axis, and shading for CI of expected.
    Everything is drawn on ax itself, so a supplied ax does not have to be the current pyplot axes.
    Input: 
    obs - list of observed values
    expc - list of expected values, the same length as obs
    expc_upper - list of the upper percentile of expected values, the same length as obs
    expc_lower - list of the lower percentile of expected values, the same length as obs
    obs_type - list of the same length of obs, specifying whether each obs is spatial (red) or temporal (blue)
    loglog - whether both axes are to be transformed
    legend - if legend is to be included
    loc - if legend is True, the location of the legend (default at upper left)
    ax - axes to draw on; if None, a new 3.5 x 3.5 figure with a single axes is created
    Output: the axes that were drawn on
    
    """
    obs, expc, expc_upper, expc_lower = list(obs), list(expc), list(expc_upper), list(expc_lower)
    if ax is None:
        plt.figure(figsize = (3.5, 3.5))
        ax = plt.subplot(111)
    
    if loglog:
        # Only positive values can be shown on a log axis
        axis_min = 0.9 * min([x for x in obs + expc if x > 0])
        axis_max = 3 * max(obs + expc)
        ax.set_xscale('log')
        ax.set_yscale('log')        
    else:
        axis_min = 0.9 * min(obs + expc)
        axis_max = 1.1 * max(obs + expc)

    # Sort all lists with respect to obs, so that the shaded band is drawn from left to right
    order = sorted(range(len(obs)), key = lambda k: obs[k])
    expc = [expc[i] for i in order]
    expc_upper = [expc_upper[i] for i in order]
    expc_lower = [expc_lower[i] for i in order]
    obs_type = [obs_type[i] for i in order]
    obs = [obs[i] for i in order]
     
    # Replace zeros in expc_lower with the minimal value above zero for the purpose of plotting.
    # Skipped when no value is above zero, as there is nothing to replace the zeros with.
    positive_lower = [x for x in expc_lower if x > 0]
    if positive_lower:
        expc_lower_min = min(positive_lower)
        expc_lower = [expc_lower_min if x == 0 else x for x in expc_lower]
    
    i_spac = [i for i, x in enumerate(obs_type) if x == 'spatial']
    i_temp = [i for i, x in enumerate(obs_type) if x == 'temporal']
    
    ax.fill_between(obs, expc_lower, expc_upper, color = '#FF83FA', alpha = 0.5)
    spat = ax.scatter([obs[i] for i in i_spac], [expc[i] for i in i_spac], c = '#EE4000', 
                      edgecolors = 'none', alpha = 0.5, s = 8, label = 'Spatial')
    temp = ax.scatter([obs[i] for i in i_temp], [expc[i] for i in i_temp], c = '#1C86EE', 
                      edgecolors = 'none', alpha = 0.5, s = 8, label = 'Temporal')   
    ax.plot([axis_min, axis_max], [axis_min, axis_max], 'k-') # 1:1 line
    ax.set_xlim(axis_min, axis_max)
    ax.set_ylim(axis_min, axis_max)
    if legend:
        ax.legend([spat, temp], ['Spatial', 'Temporal'], scatterpoints = 1, loc = loc, prop = {'size': 8})
    ax.tick_params(axis = 'both', which = 'major', labelsize = 6)
    return ax

def plot_obs_expc_alt(obs, expc, obs_type, loglog, ax = None):
    """Alternative visual representation of the obs-expc plot, with no CI range but each dot plotted
    
    semi-transparently to illustrate the heat of different values. 
    Everything is drawn on ax itself, so a supplied ax does not have to be the current pyplot axes.
    Input: 
    obs - list of observed values
    expc - list of lists of expected values, each sublist is of the same length as obs, the number of sublists equal sample size 
    obs_type - list of the same length of obs, specifying whether each obs is spatial (red) or temporal (blue)
    loglog - whether both axes are to be transformed
    ax - axes to draw on; if None, a new 3.5 x 3.5 figure with a single axes is created
    Output: the axes that were drawn on
    
    """
    obs = list(obs)
    expc = [list(expc_sample) for expc_sample in expc]
    n_sample = len(expc)
    if ax is None:
        plt.figure(figsize = (3.5, 3.5))
        ax = plt.subplot(111)
    
    # Axis limits are based on all expected values pooled across samples
    expc_pooled = [x for expc_sample in expc for x in expc_sample]
    if loglog:
        # Only positive values can be shown on a log axis
        axis_min = 0.9 * min([x for x in expc_pooled if x > 0])
        axis_max = 3 * max(expc_pooled)
        ax.set_xscale('log')
        ax.set_yscale('log')        
    else:
        axis_min = 0.9 * min(expc_pooled)
        axis_max = 1.1 * max(expc_pooled)

    # No sorting with respect to obs is needed, because only scatter points (no band) are drawn
    i_spac = [i for i, x in enumerate(obs_type) if x == 'spatial']
    i_temp = [i for i, x in enumerate(obs_type) if x == 'temporal']
    obs_spac = [obs[i] for i in i_spac]
    obs_temp = [obs[i] for i in i_temp]
    
    # Transparency scales with the number of samples; capped at 1, the largest valid alpha
    alpha = min(1.0, 1.0 / n_sample * 10)
    for expc_sample in expc:  
        ax.scatter(obs_spac, [expc_sample[i] for i in i_spac], c = '#EE4000', edgecolors = 'none', alpha = alpha, s = 8)
        ax.scatter(obs_temp, [expc_sample[i] for i in i_temp], c = '#1C86EE', edgecolors = 'none', alpha = alpha, s = 8)   
    ax.plot([axis_min, axis_max], [axis_min, axis_max], 'k-') # 1:1 line
    ax.set_xlim(axis_min, axis_max)
    ax.set_ylim(axis_min, axis_max)
    ax.tick_params(axis = 'both', which = 'major', labelsize = 6)
    return ax

def plot_obs_expc_new(obs, expc, expc_upper, expc_lower, analysis, log, ax = None):
    """Modified version of obs-expc plot suggested by R2. The points are separated by whether their CIs are above, below, 
    
    or overlapping the empirical value. The three groups are plotted from left to right in the order 
    below, overlapping, above, divided by dashed vertical lines; within a group points are sorted by 
    their standardized expected value.
    Everything is drawn on ax itself, so a supplied ax does not have to be the current pyplot axes.
    Input: 
    obs - list of observed values
    expc - list of mean simulated values for the corresponding observed values
    expc_upper - list of the 97.5% quantile of the simulated values
    expc_lower - list of the 2.5% quantile of the simulated values
    analysis - whether it is partitions or compositions (sets the color)
    log - whether the y axis is to be transformed. If True, expc/obs is plotted. If False, expc - obs is plotted.
    ax - axes to draw on; if None, a new 3.5 x 3.5 figure with a single axes is created
    Output: the axes that were drawn on
    
    """
    obs, expc, expc_upper, expc_lower = list(obs), list(expc), list(expc_upper), list(expc_lower)
    if ax is None:
        plt.figure(figsize = (3.5, 3.5))
        ax = plt.subplot(111)
    
    n_obs = len(obs)
    ind_above = [i for i in range(n_obs) if expc_lower[i] > obs[i]]
    ind_below = [i for i in range(n_obs) if expc_upper[i] < obs[i]]
    ind_overlap = [i for i in range(n_obs) if expc_lower[i] <= obs[i] <= expc_upper[i]]
    
    if log:
        expc_standardize = [expc[i] / obs[i] for i in range(n_obs)]
        expc_upper_standardize = [expc_upper[i] / obs[i] for i in range(n_obs)]
        expc_lower_standardize = [expc_lower[i] / obs[i] for i in range(n_obs)]
        # Zeros are left out because they cannot be shown on a log axis
        axis_min = 0.9 * min([x for x in expc_lower_standardize if x != 0])
        axis_max = 1.5 * max(expc_upper_standardize)
    else:
        expc_standardize = [expc[i] - obs[i] for i in range(n_obs)]
        expc_upper_standardize = [expc_upper[i] - obs[i] for i in range(n_obs)]
        expc_lower_standardize = [expc_lower[i] - obs[i] for i in range(n_obs)]
        axis_min = 1.1 * min(expc_lower_standardize)
        axis_max = 1.1 * max(expc_upper_standardize)
   
    if analysis == 'partition': col = '#228B22'
    else: col = '#CD69C9'
    
    # Position on the x-axis: group by group, sorted by standardized expected value within group
    ind_full = [] 
    for group in [ind_below, ind_overlap, ind_above]:
        ind_full.extend(sorted(group, key = lambda i: expc_standardize[i]))

    xaxis_max = len(ind_full)
    for pos, ind in enumerate(ind_full):
        ax.plot([pos, pos], [expc_lower_standardize[ind], expc_upper_standardize[ind]], '-', c = col, linewidth = 0.4)
    ax.scatter(list(range(xaxis_max)), [expc_standardize[i] for i in ind_full], c = col, edgecolors = 'none', s = 8)    
    
    # Reference line where expected equals observed: ratio of 1 or difference of 0
    if log: 
        ax.plot([0, xaxis_max + 1], [1, 1], 'k-', linewidth = 1.5)
        ax.set_yscale('log')
    else: 
        ax.plot([0, xaxis_max + 1], [0, 0], 'k-', linewidth = 1.5)
    
    # Dashed lines dividing the three groups
    for boundary in [len(ind_below) - 0.5, len(ind_below) + len(ind_overlap) - 0.5]:
        ax.plot([boundary, boundary], [axis_min, axis_max], 'k--')
    ax.set_xlim(0, xaxis_max)
    ax.set_ylim(axis_min, axis_max)
    # Booleans rather than 'on'/'off': the string 'off' is not reliably treated as False by matplotlib
    ax.tick_params(axis = 'y', which = 'major', labelsize = 8, labelleft = True)
    ax.tick_params(axis = 'x', which = 'major', top = False, bottom = False, labelbottom = False)
    return ax

def plot_mean_var(mean, obs_var, expc_var, obs_type, loglog = True, ax = None):
    """Plot the observed and expected variance against mean, distinguishing 
    
    between spatial and temporal data.
    Everything is drawn on ax itself, so a supplied ax does not have to be the current pyplot axes.
    Input:
    mean - list of means
    obs_var - list of observed variances, the same length as mean
    expc_var - list of expected variances, the same length as mean (plotted in black)
    obs_type - list of the same length as mean, specifying whether each record is spatial (red) or temporal (blue)
    loglog - whether both axes are to be transformed
    ax - axes to draw on; if None, a new 3.5 x 3.5 figure with a single axes is created
    Output: the axes that were drawn on
    
    """
    mean, obs_var, expc_var = list(mean), list(obs_var), list(expc_var)
    if ax is None:
        plt.figure(figsize = (3.5, 3.5))
        ax = plt.subplot(111)
    
    if loglog:
        ax.set_xscale('log')
        ax.set_yscale('log')        
    
    i_spac = [i for i, x in enumerate(obs_type) if x == 'spatial']
    i_temp = [i for i, x in enumerate(obs_type) if x == 'temporal']
    
    ax.scatter([mean[i] for i in i_spac], [obs_var[i] for i in i_spac], c = '#EE4000', edgecolors = 'none', alpha = 0.5, s = 8)
    ax.scatter([mean[i] for i in i_temp], [obs_var[i] for i in i_temp], c = '#1C86EE', edgecolors = 'none', alpha = 0.5, s = 8)
    ax.scatter(mean, expc_var, c = 'black', edgecolors = 'none', alpha = 0.5, s = 8)
    ax.tick_params(axis = 'both', which = 'major', labelsize = 6)
    ax.set_xlabel('Mean', fontsize = 8)
    ax.set_ylabel('Variance', fontsize = 8)
    return ax

def comp_dens(val_list, cov_factor):
    """Build a Gaussian kernel density estimate of val_list with a fixed covariance factor.
    
    Input:
    val_list - values whose density is to be estimated
    cov_factor - covariance (bandwidth) factor used in place of the default of gaussian_kde
    Output: the scipy.stats.gaussian_kde object, callable on the points where the density is wanted
    
    """
    def fixed_factor():
        return cov_factor
    
    kde = stats.gaussian_kde(val_list)
    kde.covariance_factor = fixed_factor
    # The covariance has to be recomputed for the new factor to take effect
    kde._compute_covariance()
    return kde

def plot_dens(obs, expc, obs_type, ax = None, legend = False, loc = 2, vline = None, xlim = None):
    """Plot the density of observed and expected values, with spatial and temporal observations 
    
    distinguished by color.
    Everything is drawn on ax itself, so a supplied ax does not have to be the current pyplot axes.
    Input:
    obs - list of observed values
    expc - list of expected values (plotted in black)
    obs_type - list of the same length as obs, specifying whether each obs is spatial (red) or temporal (blue)
    ax - axes to draw on; if None, a new 3.5 x 3.5 figure with a single axes is created
    legend - if legend is to be included
    loc - if legend is True, the location of the legend (default at upper left)
    vline - if given, x value at which a dashed vertical line is drawn
    xlim - if given, limits of the x-axis
    Output: the axes that were drawn on
    
    """
    if ax is None:
        plt.figure(figsize = (3.5, 3.5))
        ax = plt.subplot(111)
    
    obs_spatial = [obs[i] for i in range(len(obs)) if obs_type[i] == 'spatial']
    obs_temporal = [obs[i] for i in range(len(obs)) if obs_type[i] == 'temporal']
    full_values = list(obs) + list(expc)
    # NOTE(review): for negative values the factors 0.9 / 1.1 narrow the range on that side - confirm intended
    min_plot = 0.9 * min(full_values)
    max_plot = 1.1 * max(full_values)
    xs = np.linspace(min_plot, max_plot, 200)
    cov_factor = 0.2
    # Each density is evaluated once and reused for the curve and the height of the vertical line
    ys_spatial = comp_dens(obs_spatial, cov_factor)(xs)
    ys_temporal = comp_dens(obs_temporal, cov_factor)(xs)
    ys_expc = comp_dens(expc, cov_factor)(xs)
    spat, = ax.plot(xs, ys_spatial, c = '#EE4000', linewidth = 2)
    temp, = ax.plot(xs, ys_temporal, c = '#1C86EE', linewidth = 2)
    feas, = ax.plot(xs, ys_expc, 'k-', linewidth = 2)
    if vline is not None:
        ymax = 1.1 * max([max(ys_spatial), max(ys_temporal), max(ys_expc)])
        ax.plot((vline, vline), (0, ymax), 'k--')
    if legend:
        ax.legend([spat, temp, feas], ['Spatial', 'Temporal', 'Feasible Set'], loc = loc, prop = {'size': 8})
    ax.tick_params(axis = 'both', which = 'major', labelsize = 6)
    if xlim is not None:
        ax.set_xlim(xlim)
    return ax

def plot_dens_par_comp(obs, pars, comps, ax = None, legend = False, loc = 2, vline = None, xlim = None):
    """Density plot of the spatial and temporal data pooled together, and results from both partitions and compositions.
    
    Everything is drawn on ax itself, so a supplied ax does not have to be the current pyplot axes.
    Input:
    obs - list of empirical values (plotted in black)
    pars - list of values from partitions (green)
    comps - list of values from compositions (purple)
    ax - axes to draw on; if None, a new 3.5 x 3.5 figure with a single axes is created
    legend - if legend is to be included
    loc - if legend is True, the location of the legend (default at upper left)
    vline - if given, x value at which a dashed vertical line is drawn
    xlim - if given, limits of the x-axis
    Output: the axes that were drawn on
    
    """
    if ax is None:
        plt.figure(figsize = (3.5, 3.5))
        ax = plt.subplot(111)
    
    full_values = list(obs) + list(pars) + list(comps)
    # NOTE(review): for negative values the factors 0.9 / 1.1 narrow the range on that side - confirm intended
    min_plot = 0.9 * min(full_values)
    max_plot = 1.1 * max(full_values)
    xs = np.linspace(min_plot, max_plot, 200)
    cov_factor = 0.2
    # Each density is evaluated once and reused for the curve and the height of the vertical line
    ys_obs = comp_dens(obs, cov_factor)(xs)
    ys_par = comp_dens(pars, cov_factor)(xs)
    ys_comp = comp_dens(comps, cov_factor)(xs)
    obs_plot, = ax.plot(xs, ys_obs, 'k-', linewidth = 2)
    par_plot, = ax.plot(xs, ys_par, c = '#228B22', linewidth = 2)
    comp_plot, = ax.plot(xs, ys_comp, c = '#CD69C9', linewidth = 2)
    if vline is not None:
        ymax = 1.1 * max([max(ys_obs), max(ys_par), max(ys_comp)])
        ax.plot((vline, vline), (0, ymax), 'k--')
    if legend:
        ax.legend([obs_plot, par_plot, comp_plot], ['Empirical', 'Partitions', 'Compositions'], loc = loc, prop = {'size': 8})
    ax.tick_params(axis = 'both', which = 'major', labelsize = 6)
    if xlim is not None:
        ax.set_xlim(xlim)
    return ax

def plot_dens_par_comp_single_obs(obs, pars, comps, ax = None, legend = False, loc = 2, vline = None, xlim = None):
    """Density plot of results from both partitions and compositions with value from a single observation.
    
    The observation is shown as a solid vertical line.
    Everything is drawn on ax itself, so a supplied ax does not have to be the current pyplot axes.
    Input:
    obs - the single empirical value
    pars - list of values from partitions (green)
    comps - list of values from compositions (purple)
    ax - axes to draw on; if None, a new 3.5 x 3.5 figure with a single axes is created
    legend - if legend is to be included
    loc - if legend is True, the location of the legend (default at upper left)
    vline - not used by this function
    xlim - limits of the x-axis; if None, the range spanned by all values is used
    Output: the axes that were drawn on
    
    """
    if ax is None:
        plt.figure(figsize = (3.5, 3.5))
        ax = plt.subplot(111)
    
    full_values = list(pars) + list(comps) + [obs]
    # NOTE(review): for negative values the factors 0.9 / 1.1 narrow the range on that side - confirm intended
    min_plot = 0.9 * min(full_values)
    max_plot = 1.1 * max(full_values)
    xs = np.linspace(min_plot, max_plot, 200)
    cov_factor = 0.2
    # Each density is evaluated once and reused for the curve and the height of the vertical line
    ys_par = comp_dens(pars, cov_factor)(xs)
    ys_comp = comp_dens(comps, cov_factor)(xs)
    par_plot, = ax.plot(xs, ys_par, c = '#228B22', linewidth = 2)
    comp_plot, = ax.plot(xs, ys_comp, c = '#CD69C9', linewidth = 2)
    ymax = 1.1 * max([max(ys_par), max(ys_comp)])
    ax.plot((obs, obs), (0, ymax), 'k-', linewidth = 2)
    if legend:
        ax.legend([par_plot, comp_plot], ['Partitions', 'Compositions'], loc = loc, prop = {'size': 10})
    ax.tick_params(axis = 'both', which = 'major', labelsize = 8)
    if xlim is not None:
        ax.set_xlim(xlim)
    else: 
        ax.set_xlim((min_plot, max_plot))
    return ax

def plot_emp_vs_sim(study_id, data_dir = './out_files/', feas_type = 'partition', ax = None, inset = True, legend = False):
    """Plot of empirical and simulated mean-variance relationships for a given data set
    
    to help visually illustrate our results.
    Includes scatter plot of empirical data and its fitted line, scatter plot and fitted line for the 
    first set of simulated variances, and (optionally) the distribution of b across all simulated sets 
    in an inset, with the empirical b marked by a vertical line.
    The main panel is drawn on ax itself and the inset on its own axes, so a supplied ax does not 
    have to be the current pyplot axes.
    
    Input: 
    study_id - ID of the data set of interest, in the form listed in Appendix A. 
    data_dir - folder (with trailing separator) holding 'taylor_QN_var_predicted_<feas_type>_full.txt'
    feas_type - partition or composition; sets the input file and the color
    ax - axes to draw on; if None, a new 3.5 x 3.5 figure with a single axes is created
    inset - whether the inset with the distribution of b is to be included
    legend - if legend is to be included (at lower right)
    Output: the axes of the main panel
    
    """
    if ax is None:
        plt.figure(figsize = (3.5, 3.5))
        ax = plt.subplot(111)
    var_dat = get_var_sample_file(data_dir + 'taylor_QN_var_predicted_' + feas_type + '_full.txt')
    var_study = var_dat[var_dat['study'] == study_id]
    sim_names = var_study.dtype.names[5:] # Every field after the first five is one simulated sequence
    sim_var = var_study[sim_names[0]] # take the first simulated sequence
    
    b_emp, inter_emp, r_emp, p_emp, std_err_emp = stats.linregress(np.log(var_study['mean']), np.log(var_study['var']))
    b_list = []
    for k, name in enumerate(sim_names):
        study_k = var_study[name]
        nonzero = study_k != 0 # Zero variances cannot be log-transformed
        b_k, inter_k, r_k, p_k, std_err_k = stats.linregress(np.log(var_study['mean'][nonzero]), np.log(study_k[nonzero]))
        if k == 0: b_0, inter_0 = b_k, inter_k
        b_list.append(b_k)
   
    if feas_type == 'partition': plot_col = '#228B22'
    else: plot_col = '#CD69C9'
    
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.scatter(var_study['mean'], var_study['var'], s = 8, c = 'black', edgecolors = 'none')
    emp, = ax.plot(var_study['mean'], np.exp(inter_emp) * var_study['mean'] ** b_emp, '-', c = 'black', linewidth = 1.5)
    ax.scatter(var_study['mean'], sim_var, s = 8, c = plot_col, edgecolors = 'none')
    sim, = ax.plot(var_study['mean'], np.exp(inter_0) * var_study['mean'] ** b_0, '-', linewidth = 1.5, c = plot_col)
    ax.tick_params(axis = 'both', which = 'major', labelsize = 9)
    ax.set_xlabel('Mean', labelpad = 4, size = 10)
    ax.set_ylabel('Variance', labelpad = 4, size = 10)
    if legend:
        ax.legend([emp, sim], ['Empirical', (feas_type.title()) + 's'], loc = 4, prop = {'size': 8}) 
    if inset:
        axins = inset_axes(ax, width = "30%", height = "30%", loc = 2)
        cov_factor = 0.2
        xs = np.linspace(0.9 * min(b_list + [b_emp]), 1.1 * max(b_list + [b_emp]), 200)
        ys = comp_dens(b_list, cov_factor)(xs)
        axins.plot(xs, ys, c = plot_col, linewidth = 1.5)
        ymax = 1.1 * max(ys)
        axins.plot((b_emp, b_emp), (0, ymax), 'k-', linewidth = 1.5)
        # Booleans rather than 'on'/'off': the string 'off' is not reliably treated as False by matplotlib
        axins.tick_params(axis = 'y', which = 'major', left = False, right = False, labelleft = False)
        axins.tick_params(axis = 'x', which = 'major', top = False, bottom = False, labelbottom = False)
    return ax