-
Notifications
You must be signed in to change notification settings - Fork 86
/
verification.py
6607 lines (5837 loc) · 233 KB
/
verification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""
Script to automate verification of msprime against known statistical
results and benchmark programs such as ms and Seq-Gen.
Tests are structured in a similar way to Python unittests. Tests
are organised into classes of similar tests. Ideally, each test
in the class is a simple call to a general method with
different parameters (this is called ``_run``, by convention).
Tests must be *independent* and not depend on any shared
state within the test class, other than the ``self.output_dir``
variable which is guaranteed to be set when the method is called.
The output directory is <output-dir>/<class name>/<test name>.
Each test should output one or more diagnostic plots, which have
a clear interpretation as "correct" or "incorrect". QQ-plots
are preferred, where possible. Numerical results can also be
output by using ``logging.debug()``, where appropriate; to
view these, append ``--debug`` to the comand line running
your tests.
Test classes must be a subclass of the ``Test`` class defined
in this module.
To run the tests, first get some help from the CLI:
python3 verification.py --help
This will output some basic help on the tests. Use
python3 verification.py --list
to show all the available tests.
If you run without any arguments, this will run all the tests
sequentially. The progress bar and output behaviour can be
controlled using command line parameters, and running over
multiple processes is possible.
If you wish to run a specific tests, you can provide the
test names as positional arguments, i.e.,
python3 verification.py test_msdoc_outgroup_sequence test_msdoc_recomb_ex
will just run these two specific tests.
Using the ``-c`` option allows you to run all tests in a
given class.
Gotchas:
- Any test superclasses must be abstract. That is, you cannot
inherit from a test class that contains any tests.
- Test method names must be unique across *all* classes.
"""
import argparse
import ast
import collections
import concurrent.futures
import functools
import inspect
import itertools
import json
import logging
import math
import pathlib
import pickle
import random
import subprocess
import sys
import tempfile
import warnings
import allel
import attr
import daiquiri
import dendropy
import matplotlib
import numpy as np
import pandas as pd
import pyslim
import pyvolve
import scipy.special
import scipy.stats
import seaborn as sns
import tqdm
import tskit
from matplotlib import pyplot
import msprime
import msprime.cli as cli
import msprime.pedigrees as pedigrees
from msprime.demography import _matrix_exponential
# Force matplotlib to not use any Xwindows backend.
# Note this must be done before importing statsmodels.
matplotlib.use("Agg")
import statsmodels.api as sm # noqa: E402
_mspms_executable = [sys.executable, "mspms_dev.py"]
_slim_executable = ["./data/slim"]
_ms_executable = ["./data/ms"]
_discoal_executable = ["./data/discoal"]
_scrm_executable = ["./data/scrm"]
_msms_executable = ["java", "-Xmx1G", "-jar", "data/msms.jar"]
def flatten(li):
return [x for sublist in li for x in sublist]
def harmonic_number(n):
return np.sum(1 / np.arange(1, n + 1))
def hk_f(n, z):
"""
Returns Hudson and Kaplan's f_n(z) function. This is based on the exact
value for n=2 and the approximations given in the 1985 Genetics paper.
"""
ret = 0
if n == 2:
ret = (18 + z) / (z**2 + 13 * z + 18)
else:
ret = sum(1 / j**2 for j in range(1, n)) * hk_f(2, z)
return ret
def chisquare_stat(observed, expected):
return np.sum((observed - expected) ** 2 / expected)
def get_predicted_variance(n, R):
# We import this here as it's _very_ slow to import and we
# only use it in this case.
import scipy.integrate
def g(z):
return (R - z) * hk_f(n, z)
res, err = scipy.integrate.quad(g, 0, R)
return R * harmonic_number(n - 1) + 2 * res
def write_slim_script(outfile, format_dict):
slim_str = """
// set up a simple neutral simulation
initialize()
{{
initializeTreeSeq(checkCoalescence=T);
initializeMutationRate(0);
initializeMutationType('m1', 0.5, 'f', 0.0);
// g1 genomic element type: uses m1 for all mutations
initializeGenomicElementType('g1', m1, 1.0);
// uniform chromosome
initializeGenomicElement(g1, 0, {NUM_LOCI});
// uniform recombination along the chromosome
initializeRecombinationRate({RHO});
}}
// create a population
1
{{
{POP_STRS};
sim.tag = 0;
}}
// run for set number of generations
1: late()
{{
if (sim.tag == 0) {{
if (sim.treeSeqCoalesced()) {{
sim.tag = sim.generation;
catn(sim.tag + ': COALESCED');
}}
}}
if (sim.generation == sim.tag * 10) {{
sim.simulationFinished();
catn('Ran a further ' + sim.tag * 10 + ' generations');
sim.treeSeqOutput('{OUTFILE}');
}}
}}
100000 late() {{
catn('No coalescence after 100000 generations!');
}}
"""
with open(outfile, "w") as f:
f.write(slim_str.format(**format_dict))
def write_sweep_slim_script(outfile, format_dict):
slim_str = """
initialize() {{
initializeTreeSeq();
initializeMutationRate(0);
initializeMutationType('m1', 0.5, 'f', 0.0);
initializeMutationType('m2', 0.5, 'f', {s});
initializeGenomicElementType('g1', m1, 1.0);
initializeGenomicElement(g1, 0, {NUMLOCI});
initializeRecombinationRate({r});
}}
s1 200000 late() {{
sim.treeSeqOutput('{OUTFILE}');
sim.simulationFinished();
}}
1 {{
// save this run's identifier, used to save and restore
defineConstant("simID", getSeed());
sim.addSubpop("p1", {POPSIZE});
sim.setValue("flag",0);
}}
2 late() {{
// save the state of the simulation
sim.treeSeqOutput("/tmp/slim_" + simID + ".trees");
target = sample(p1.genomes, 1);
target.addNewDrawnMutation(m2, {SWEEPPOS});
}}
2:2000 late() {{
if (sim.countOfMutationsOfType(m2) == 0)
{{
fixed = (sum(sim.substitutions.mutationType == m2) == 1);
if (fixed){{
sim.setValue("flag", sim.getValue("flag") + 1);
}}
if (fixed)
{{
if (sim.getValue("flag") == 1){{
sim.rescheduleScriptBlock(s1,
start=sim.generation+{TAU}, end=sim.generation+{TAU});
}}
}}
else
{{
sim.readFromPopulationFile("/tmp/slim_" + simID + ".trees");
setSeed(rdunif(1, 0, asInteger(2^62) - 1));
target = sample(p1.genomes, 1);
target.addNewDrawnMutation(m2, {SWEEPPOS});
}}
}}
}}
"""
with open(outfile, "w") as f:
f.write(slim_str.format(**format_dict))
def subsample_simplify_slim_treesequence(ts, sample_sizes):
tables = ts.dump_tables()
samples = set(ts.samples())
num_populations = len(set(tables.nodes.population))
assert len(sample_sizes) == num_populations
subsample = []
for i, size in enumerate(sample_sizes):
# Stride 2 to only sample one chrom per diploid SLiM individual
ss = np.where(tables.nodes.population == i)[0][::2]
ss = list(samples.intersection(ss))
ss = np.random.choice(ss, replace=False, size=size)
subsample.extend(ss)
tables.nodes.individual = None
tables.individuals.clear()
tables.simplify(subsample)
ts = tables.tree_sequence()
return ts
def plot_qq(v1, v2):
sm.graphics.qqplot(v1)
sm.qqplot_2samples(v1, v2, line="45")
def plot_stat_hist(v1, v2, v1_name, v2_name):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
sns.kdeplot(v1, color="b", fill=True, label=v1_name, legend=False)
sns.kdeplot(v2, color="r", fill=True, label=v2_name, legend=False)
pyplot.legend(loc="upper right")
def plot_breakpoints_hist(v1, v2, v1_name, v2_name):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
sns.kdeplot(v1, color="b", label=v1_name, fill=True, legend=False)
sns.kdeplot(v2, color="r", label=v2_name, fill=True, legend=False)
pyplot.legend(loc="upper right")
def all_breakpoints_in_replicates(replicates):
return [right for intervals in replicates for left, right in intervals]
@attr.s
class Test:
"""
The superclass of all tests. The only attribute defined is the output
directory for the test, which is guaranteed to exist when the
test method is called.
"""
output_dir = attr.ib(type=str, default=None)
def _run_sample_stats(self, args):
logging.debug(f"{' '.join(args)}")
p1 = subprocess.Popen(args, stdout=subprocess.PIPE)
p2 = subprocess.Popen(
["./data/sample_stats"], stdin=p1.stdout, stdout=subprocess.PIPE
)
p1.stdout.close()
output = p2.communicate()[0]
p1.wait()
if p1.returncode != 0:
raise ValueError("Error occured in subprocess: ", p1.returncode)
with tempfile.TemporaryFile() as f:
f.write(output)
f.seek(0)
df = pd.read_csv(f, sep="\t")
return df
def _build_filename(self, *args):
return self.output_dir / ("_".join(args[1:]) + ".png")
def _plot_stats(self, stats_type, df1, df2, df1_name, df2_name):
assert set(df1.columns.values) == set(df2.columns.values)
for stat in df1.columns.values:
v1 = df1[stat]
v2 = df2[stat]
if stat == "breakpoints":
plot_breakpoints_hist(flatten(v1), flatten(v2), df1_name, df2_name)
pyplot.xlabel("genome")
f = self._build_filename(stats_type, stat)
pyplot.savefig(f, dpi=72)
else:
plot_qq(v1, v2)
pyplot.xlabel(df1_name)
pyplot.ylabel(df2_name)
f = self._build_filename(stats_type, stat)
pyplot.savefig(f, dpi=72)
pyplot.close("all")
# Put the histograms in their own directory to avoid
# cluttering up the qqplots.
plot_stat_hist(v1, v2, df1_name, df2_name)
histdir = self.output_dir / "histograms"
histdir.mkdir(exist_ok=True)
f = histdir / f.name
pyplot.savefig(f, dpi=72)
pyplot.close("all")
def get_ms_seeds(self):
max_seed = 2**16
seeds = [random.randint(1, max_seed) for j in range(3)]
return ["-seed"] + list(map(str, seeds))
def _run_msprime_mutation_stats(self, args):
return self._run_sample_stats(
_mspms_executable + args.split() + self.get_ms_seeds()
)
class MsTest(Test):
"""
Superclass of tests that perform comparisons with ms. Provides some
infrastructure for common operations.
"""
def _deserialize_breakpoints(self, df):
breakpoints_strs = df["breakpoints"]
breakpoints = [ast.literal_eval(literal) for literal in breakpoints_strs]
df["breakpoints"] = breakpoints
return df
def _exec_coalescent_stats(self, executable, args, seeds=None):
with tempfile.TemporaryFile() as f:
argList = [executable] + args.split() + self.get_ms_seeds()
logging.debug(f"{' '.join(argList)}")
subprocess.call(argList, stdout=f)
f.seek(0)
df = pd.read_table(f)
self._deserialize_breakpoints(df)
return df
def _run_ms_coalescent_stats(self, args):
return self._exec_coalescent_stats("./data/ms_summary_stats", args)
def _run_ms_mutation_stats(self, args):
return self._run_sample_stats(
_ms_executable + args.split() + self.get_ms_seeds()
)
def _run_mutation_stats(self, args):
df_ms = self._run_ms_mutation_stats(args)
df_msp = self._run_msprime_mutation_stats(args)
self._plot_stats("mutation", df_ms, df_msp, "ms", "msp")
def _run_mspms_coalescent_stats(self, args):
logging.debug(f"mspms: {args}")
runner = cli.get_mspms_runner(args.split())
sim = runner.simulator
num_populations = sim.num_populations
replicates = runner.num_replicates
num_trees = [0 for j in range(replicates)]
time = [0 for j in range(replicates)]
ca_events = [0 for j in range(replicates)]
re_events = [0 for j in range(replicates)]
gc_events = [0 for j in range(replicates)]
mig_events = [None for j in range(replicates)]
breakpoints = [[] for j in range(replicates)]
for j in range(replicates):
sim.reset()
sim.run()
num_trees[j] = sim.num_breakpoints + 1
breakpoints[j] = sim.breakpoints
time[j] = sim.time
ca_events[j] = sim.num_common_ancestor_events
re_events[j] = sim.num_recombination_events
gc_events[j] = sim.num_gene_conversion_events
mig_events[j] = [r for row in sim.num_migration_events for r in row]
d = {
"t": time,
"num_trees": num_trees,
"ca_events": ca_events,
"re_events": re_events,
"gc_events": gc_events,
}
for j in range(num_populations**2):
events = [mig_events[k][j] for k in range(replicates)]
d[f"mig_events_{j}"] = events
d["breakpoints"] = breakpoints
df = pd.DataFrame(d)
return df
def _run_coalescent_stats(self, args):
df_msp = self._run_mspms_coalescent_stats(args)
df_ms = self._run_ms_coalescent_stats(args)
self._plot_stats("coalescent", df_msp, df_ms, "msp", "ms")
# end of tests common to MS and random
def _run_variable_recombination_coalescent_stats(self, args):
df_msp = self._run_mspms_coalescent_stats(args)
df_mshot = self._run_mshot_coalescent_stats(args)
self._plot_stats("recomb map coalescent", df_msp, df_mshot, "msp", "msHOT")
def _run_mshot_coalescent_stats(self, args):
return self._exec_coalescent_stats("./data/msHOT_summary_stats", args)
def _run(self, cmd):
self._run_coalescent_stats(cmd)
self._run_mutation_stats(cmd)
class MsDemography(MsTest):
def test_size_change_1(self):
self._run("10 10000 -t 2.0 -eN 0.1 2.0")
def test_growth_rate_change_1(self):
self._run("10 10000 -t 2.0 -eG 0.1 5.0")
def test_growth_rate_change1(self):
self._run("10 10000 -t 2.0 -eG 0.1 5.0")
def test_growth_rate_2_pops1(self):
self._run("10 10000 -t 2.0 -I 2 5 5 2.5 -G 5.0")
def test_growth_rate_2_pops2(self):
self._run("10 10000 -t 2.0 -I 2 5 5 2.5 -G 5.0 -g 1 0.1")
def test_growth_rate_2_pops3(self):
self._run("10 10000 -t 2.0 -I 2 5 5 2.5 -g 1 0.1")
def test_growth_rate_2_pops4(self):
self._run("10 10000 -t 2.0 -I 2 5 5 2.5 -eg 1.0 1 5.0")
def test_pop_size_2_pops1(self):
self._run("100 10000 -t 2.0 -I 2 50 50 2.5 -n 1 0.1")
def test_pop_size_2_pops2(self):
self._run("100 10000 -t 2.0 -I 2 50 50 2.5 -g 1 2 -n 1 0.1")
def test_pop_size_2_pops3(self):
self._run("100 10000 -t 2.0 -I 2 50 50 2.5 -eN 0.5 3.5")
def test_pop_size_2_pops4(self):
self._run("100 10000 -t 2.0 -I 2 50 50 2.5 -en 0.5 1 3.5")
def test_migration_rate_2_pops1(self):
self._run("100 10000 -t 2.0 -I 2 50 50 0 -eM 3 5")
def test_migration_matrix_2_pops1(self):
self._run("100 10000 -t 2.0 -I 2 50 50 -ma x 10 0 x")
def test_migration_matrix_2_pops2(self):
self._run("100 10000 -t 2.0 -I 2 50 50 -m 1 2 10 -m 2 1 50")
def test_migration_rate_change_2_pops1(self):
self._run("100 10000 -t 2.0 -I 2 50 50 -eM 5 10")
def test_migration_matrix_entry_change_2_pops1(self):
self._run("100 10000 -t 2.0 -I 2 50 50 -em 0.5 2 1 10")
def test_migration_matrix_change_2_pops1(self):
self._run("100 10000 -t 2.0 -I 2 50 50 -ema 10.0 2 x 10 0 x")
def migration_matrix_change_2_pops2(self):
cmd = """100 10000 -t 2.0 -I 2 50 50 -ema 1.0
2 x 0.1 0 x -eN 1.1 0.001 -ema 10 2 x 0 10 x"""
self._run(cmd)
def test_population_split_2_pops1(self):
self._run("100 10000 -t 2.0 -I 2 50 50 5.0 -ej 2.0 1 2")
def test_population_split_4_pops1(self):
self._run("100 10000 -t 2.0 -I 4 50 50 0 0 2.0 -ej 0.5 2 1")
def test_population_split_4_pops2(self):
self._run("100 10000 -t 2.0 -I 4 25 25 25 25 -ej 1 2 1 -ej 2 3 1 -ej 3 4 1")
def test_population_split_4_pops3(self):
cmd = (
"100 10000 -t 2.0 -I 4 25 25 25 25 -ej 1 2 1 "
"-em 1.5 4 1 2 -ej 2 3 1 -ej 3 4 1"
)
self._run(cmd)
def test_admixture_1_pop1(self):
self._run("1000 1000 -t 2.0 -es 0.1 1 0.5 -em 0.1 1 2 1")
def test_admixture_1_pop2(self):
self._run("1000 1000 -t 2.0 -es 0.1 1 0.1 -em 0.1 1 2 1")
def test_admixture_1_pop3(self):
self._run("1000 1000 -t 2.0 -es 0.01 1 0.1 -em 0.1 2 1 1")
def test_admixture_1_pop4(self):
self._run("1000 1000 -t 2.0 -es 0.01 1 0.1 -es 0.1 2 0 -em 0.1 3 1 1")
def test_admixture_1_pop5(self):
self._run("1000 1000 -t 2.0 -es 0.01 1 0.1 -ej 1 2 1")
def test_admixture_1_pop6(self):
self._run("1000 1000 -t 2.0 -es 0.01 1 0.0 -eg 0.02 2 5.0 ")
def test_admixture_1_pop7(self):
self._run("1000 1000 -t 2.0 -es 0.01 1 0.0 -en 0.02 2 5.0 ")
def test_admixture_2_pop1(self):
self._run("1000 1000 -t 2.0 -I 2 500 500 1 -es 0.01 1 0.1 -ej 1 3 1")
def test_admixture_2_pop2(self):
self._run("1000 1000 -t 2.0 -I 2 500 500 2 -es 0.01 1 0.75 -em 2.0 3 1 1")
def test_admixture_2_pop3(self):
self._run(
"1000 1000 -t 2.0 -I 2 500 500 2 -es 0.01 1 0.75 -G 5.0 " "-em 2.0 3 1 1"
)
def test_admixture_2_pop4(self):
cmd = (
"1000 1000 -t 2.0 -I 2 500 500 2 -es 0.01 1 0.75 "
"-eg 0.02 1 5.0 -em 0.02 3 1 1"
)
self._run(cmd)
class MsGeneConversion(MsTest):
def _run(self, cmd):
# The mutation stats are a waste of time for GC, they tell us basically
# nothing.
self._run_coalescent_stats(cmd)
def test_gene_conversion_c10_r0(self):
self._run("100 10000 -t 5.0 -r 0 2501 -c 10 1")
def test_gene_conversion_c100_tl1000_r0(self):
self._run("100 10000 -t 5.0 -r 0 2501 -c 100 1000")
def test_gene_conversion_c1000_tl_1(self):
self._run("100 10000 -t 5.0 -r 0.01 2501 -c 1000 1")
def test_gene_conversion_c1000_tl_1000(self):
self._run("100 10000 -t 5.0 -r 0.01 2501 -c 1000 1000")
def test_gene_conversion_c2_r10(self):
self._run("100 10000 -t 5.0 -r 10 2501 -c 2 1")
def test_gene_conversion_c2_tl_10_r10(self):
self._run("100 10000 -t 5.0 -r 10 2501 -c 2 10")
def test_gene_conversion_c2_tl_100(self):
self._run("100 10000 -t 5.0 -r 10 2501 -c 2 100")
def test_gene_conversion_c2_tl_100_r0(self):
self._run("100 10000 -t 5.0 -r 0 2501 -c 2 100")
def test_gene_conversion_c20_tl_1000_r0(self):
self._run("100 10000 -t 5.0 -r 0 2501 -c 20 1000")
class MsDocExamples(MsTest):
def test_msdoc_simple_ex(self):
self._run("4 20000 -t 5.0")
def test_msdoc_recomb_ex(self):
self._run("15 1000 -t 10.04 -r 100.0 2501")
def test_msdoc_structure_ex1(self):
self._run("15 1000 -t 2.0 -I 3 10 4 1 5.0")
def test_msdoc_structure_ex2(self):
self._run("15 1000 -t 2.0 -I 3 10 4 1 5.0 -m 1 2 10.0 -m 2 1 9.0")
def test_msdoc_structure_ex3(self):
self._run("15 1000 -t 10.0 -I 3 10 4 1 -ma x 1.0 2.0 3.0 x 4.0 5.0 6.0 x")
def test_msdoc_outgroup_sequence(self):
self._run("11 1000 -t 2.0 -I 2 1 10 -ej 6.0 1 2")
def test_msdoc_two_species(self):
cmd = (
"15 10000 -t 11.2 -I 2 3 12 -g 1 44.36 -n 2 "
"0.125 -eg 0.03125 1 0.0 -en 0.0625 2 0.05 -ej 0.09375 2 1"
)
self._run(cmd)
def test_msdoc_stepping_stone(self):
cmd = (
"15 10000 -t 3.0 -I 6 0 7 0 0 8 0 -m 1 2 2.5 -m 2 1 2.5 -m 2 3 2.5 "
"-m 3 2 2.5 -m 4 5 2.5 -m 5 4 2.5 -m 5 6 2.5 -m 6 5 2.5 "
"-em 2.0 3 4 2.5 -em 2.0 4 3 2.5"
)
self._run(cmd)
class MsMiscExamples(MsTest):
"""
Miscellaneous examples that have been good for finding bugs.
"""
def test_simultaneous_ex1(self):
self._run("10 10000 -t 2.0 -eN 0.3 0.5 -eG .3 7.0")
def test_zero_growth_rate(self):
self._run("10 10000 -t 2.0 -G 6.93 -eG 0.2 0.0 -eN 0.3 0.5")
def test_konrad_1(self):
cmd = (
"4 1000 -t 2508 -I 2 2 2 0 -n 2 2.59 "
"-ma x 0 1.502 x -ej 0.9485 1 2 -r 23.76 3000"
)
self._run(cmd)
def test_konrad_2(self):
cmd = (
"3 10000 -t 0.423 -I 3 1 1 1 -es 0.0786 1 0.946635 "
"-ej 0.0786 4 3 -ej 0.189256 1 2 -ej 0.483492 2 3"
)
self._run(cmd)
def test_konrad_3(self):
self._run("100 100 -t 2 -I 10 10 10 10 10 10 10 10 10 10 10 0.001 ")
class MsRandom(MsTest):
"""
Some tests made by generating random parameters.
"""
def _run(self, num_populations=1, num_replicates=1000, num_demographic_events=0):
m = random.randint(1, 1000)
r = random.uniform(0.01, 0.1) * m
theta = random.uniform(1, 100)
N = num_populations
sample_sizes = [random.randint(2, 10) for _ in range(N)]
migration_matrix = [random.random() * (j % (N + 1) != 0) for j in range(N**2)]
structure = ""
if num_populations > 1:
structure = "-I {} {} -ma {}".format(
num_populations,
" ".join(str(s) for s in sample_sizes),
" ".join(str(r) for r in migration_matrix),
)
cmd = "{} {} -t {} -r {} {} {}".format(
sum(sample_sizes), num_replicates, theta, r, m, structure
)
# Set some initial growth rates, etc.
if N == 1:
if random.random() < 0.5:
cmd += f" -G {random.random()}"
else:
cmd += f" -eN 0 {random.random()}"
# Add some demographic events
t = 0
for _ in range(num_demographic_events):
t += 0.125
if random.random() < 0.5:
cmd += f" -eG {t} {random.random()}"
else:
cmd += f" -eN {t} {random.random()}"
super()._run(cmd)
def test_ms_random_1(self):
self._run()
def test_ms_random_2(self):
self._run(num_replicates=10**4, num_demographic_events=10)
def test_ms_random_2_pops1(self):
self._run(num_populations=3)
class MsHotTest(MsTest):
def _run(self, cmd):
self._run_variable_recombination_coalescent_stats(cmd)
def test_mshotdoc_hotspot_ex(self):
self._run("10 1000 -t 10.4 -r 10.0 25000 -v 2 100 200 10 7000 8000 20")
def test_mshot_zero_recomb_interval(self):
self._run("10 1000 -t 10.4 -r 10.0 25000 -v 1 5000 13000 0")
def test_mshot_zero_recomb(self):
self._run("10 1000 -t 10.4 -r 10.0 25000 -v 1 100 25000 0")
def test_mshot_high_recomb_variants(self):
hotspots = "4 1000 2000 0 7000 8000 20 12000 15000 10 20000 22000 0"
cmd = f"10 1000 -t 10.4 -r 10.0 25000 -v {hotspots}"
self._run(cmd)
class DiscoalTest(Test):
def get_discoal_seeds(self):
max_seed = 2**16
seeds = [random.randint(1, max_seed) for j in range(3)]
return ["-d"] + list(map(str, seeds))
def _discoal_str_to_ms(self, args):
# convert discoal string to msprime string
tokens = args.split(" ")
# cut out sites param
del tokens[2]
# adjust popIDs
for i in range(len(tokens)):
# pop size change case
if tokens[i] == "-en":
tokens[i + 2] = str(int(tokens[i + 2]) + 1)
# migration rate case
if tokens[i] == "-m":
tokens[i + 1] = str(int(tokens[i + 1]) + 1)
tokens[i + 2] = str(int(tokens[i + 2]) + 1)
msp_str = " ".join(tokens)
return msp_str
def _run_discoal_mutation_stats(self, args):
return self._run_sample_stats(
_discoal_executable + args.split() + self.get_discoal_seeds()
)
def _run_mutation_discoal_stats(self, args):
msp_str = self._discoal_str_to_ms(args)
df_msp = self._run_msprime_mutation_stats(msp_str)
df_d = self._run_sample_stats(
_discoal_executable + args.split() + self.get_discoal_seeds()
)
self._plot_stats("mutation", df_d, df_msp, "discoal", "msp")
def _discoal_str_to_simulation(self, args):
# takes discoal command line as input
# and returns msprime run treeseqs
tokens = args.split(" ")
# positional args
sample_size = int(tokens[0])
nreps = int(tokens[1])
seq_length = int(tokens[2])
# parse discoal command line for params
# init ones we definitely need for comparison
theta = rho = alpha = sweep_site = sweep_mod_time = None
refsize = 1e6
for i in range(3, len(tokens)):
# pop size change case
if tokens[i] == "-en":
raise ValueError(
"sweeps with population size changes remain unimplemented"
)
# migration rate case
if (tokens[i] == "-m") or (tokens[i] == "-p"):
raise ValueError(
"sweeps with multiple populations remain unimplemented"
)
# split or admixture case
if (tokens[i] == "-ea") or (tokens[i] == "-ed"):
raise ValueError("sweeps with splits or admixture not supported")
# sweep params
if tokens[i] == "-x":
sweep_site = float(tokens[i + 1])
if (tokens[i] == "-ws") or (tokens[i] == "-wd") or (tokens[i] == "-wn"):
sweep_mod_time = float(tokens[i + 1])
if tokens[i] == "-a":
alpha = float(tokens[i + 1])
if tokens[i] == "-N":
refsize = float(tokens[i + 1])
# coalescent params
if tokens[i] == "-t":
theta = float(tokens[i + 1])
if tokens[i] == "-r":
rho = float(tokens[i + 1])
mod_list = []
if alpha is not None:
# sweep model
s = alpha / (2 * refsize)
mod = msprime.SweepGenicSelection(
position=np.floor(sweep_site * seq_length),
start_frequency=1.0 / (2 * refsize),
end_frequency=1.0 - (1.0 / (2 * refsize)),
s=s * 2, # discoal fitness model is 1, 1+s, 1+2s
dt=1e-6,
)
mod_list.append(msprime.StandardCoalescent(duration=sweep_mod_time))
mod_list.append(mod)
# if an event is defined from discoal line
# best thing to do is rescale to Ne=0.25
# so that time scale are consistent
# see note at msprime/cli.py line 626
# and following for alternate solution
if sweep_mod_time > 0:
refsize = 0.25
mod.s = alpha / refsize
# append final model
mod_list.append("hudson")
# scale theta and rho
recomb_rate = rho / (4 * refsize * (seq_length - 1))
mu = theta / (4 * refsize * seq_length)
replicates = msprime.sim_ancestry(
[msprime.SampleSet(sample_size, ploidy=1)],
population_size=refsize,
model=mod_list,
recombination_rate=recomb_rate,
sequence_length=seq_length,
discrete_genome=False,
num_replicates=nreps,
)
mutate = functools.partial(
msprime.sim_mutations, discrete_genome=False, rate=mu
)
return map(mutate, replicates)
class DiscoalCompatibility(DiscoalTest):
"""
Basic tests to make sure that we have correctly set up the
discoal interface.
"""
def _run(self, cmd):
self._run_mutation_discoal_stats(cmd)
def test_discoal_simple_ex(self):
self._run("15 1000 100 -t 5.0")
def test_discoal_size_change1(self):
self._run("10 10000 100 -t 10.0 -en 0.1 0 2.0")
def test_discoal_size_change2(self):
self._run("10 10000 100 -t 10.0 -en 0.1 0 0.1")
def test_discoal_size_change3(self):
self._run("10 10000 100 -t 10.0 -en 0.01 0 0.01")
def test_discoal_size_change4(self):
self._run("10 10000 100 -t 10.0 -en 0.01 0 0.5 -en 0.05 0 1.0")
# TODO we need to fix this test and to add a good number of examples.
class DiscoalSweeps(DiscoalTest):
"""
Compare the result of sweeps in msprime and discoal.
"""
def _run(self, args):
df = pd.DataFrame()
data = collections.defaultdict(list)
replicates = self._discoal_str_to_simulation(args)
for ts in replicates:
data["pi"].append(ts.diversity(span_normalise=False))
data["D"].append(ts.Tajimas_D())
data["ss"].append(ts.segregating_sites(span_normalise=False))
data["pi"] = np.array(data["pi"]).flatten()
data["D"] = np.array(data["D"]).flatten()
data["ss"] = np.array(data["ss"]).flatten()
df = pd.DataFrame.from_dict(data)
df = df.fillna(0)
df_d = self._run_discoal_mutation_stats(args)
df_df = df_d[["pi", "D", "ss"]]
logging.debug(f"msp pi mean: {df['pi'].mean()}")
logging.debug(f"discoal pi mean: {df_df['pi'].mean()}")
logging.debug(f"msp ss mean: {df['ss'].mean()}")
logging.debug(f"discoal ss mean: {df_df['ss'].mean()}")
logging.debug(f"msp D mean: {df['D'].mean()}")
logging.debug(f"discoal D mean: {df_df['D'].mean()}")
logging.debug(f"sample sizes msp: {len(df['pi'])} discoal: {len(df_df['pi'])}")
self._plot_stats("mutation", df, df_df, "msp", "discoal")
def test_sweep_ex0(self):
cmd = "10 1000 10000 -t 10.0 -r 10.0"
self._run(cmd)
def test_sweep_no_rec_ex1(self):
cmd = "10 1000 10000 -t 10.0 -r 0.0 -ws 0 -a 100 -x 0.5 -N 10000"
self._run(cmd)
def test_sweep_no_rec_ex2(self):
cmd = "100 1000 10000 -t 10.0 -r 0.0 -ws 0 -a 200 -x 0.5 -N 10000"
self._run(cmd)
def test_sweep_rec_ex1(self):
cmd = "10 1000 10000 -t 10.0 -r 10.0 -ws 0 -a 1000 -x 0.5 -N 10000"
self._run(cmd)
def test_sweep_rec_ex2(self):
cmd = "10 1000 10000 -t 10.0 -r 20.0 -ws 0 -a 1000 -x 0.5 -N 10000"
self._run(cmd)
def test_sweep_rec_ex3(self):
cmd = "10 1000 10000 -t 10.0 -r 100.0 -ws 0 -a 1000 -x 0.5 -N 10000"
self._run(cmd)
def test_sweep_rec_ex4(self):
cmd = "10 1000 10000 -t 10.0 -r 400.0 -ws 0 -a 2000 -x 0.5 -N 10000"
self._run(cmd)
def test_sweep_rec_ex5(self):
cmd = "10 1000 10000 -t 100.0 -r 100.0 -ws 0 -a 250 -x 0.5 -N 10000"
self._run(cmd)
def test_sweep_tau_ex1(self):
cmd = "10 1000 10000 -t 10.0 -r 20.0 -ws 0.001 -a 250 -x 0.5 -N 10000"
self._run(cmd)
def test_sweep_tau_ex2(self):
cmd = "10 1000 10000 -t 10.0 -r 20.0 -ws 0.01 -a 250 -x 0.5 -N 10000"
self._run(cmd)
def test_sweep_tau_ex3(self):
cmd = "10 1000 10000 -t 10.0 -r 20.0 -ws 1.0 -a 250 -x 0.5 -N 10000"
self._run(cmd)
def sample_recap_simplify(slim_ts, sample_size, Ne, r, mu):
"""
takes a ts from slim and samples, recaps, simplifies
"""
demography = msprime.Demography.from_tree_sequence(slim_ts)
demography[1].initial_size = Ne
with warnings.catch_warnings():
warnings.simplefilter(
"ignore", category=msprime.IncompletePopulationMetadataWarning
)
recap = msprime.sim_ancestry(
initial_state=slim_ts,
demography=demography,
recombination_rate=r,
# TODO is this needed now? Shouldn't be, right?
start_time=slim_ts.metadata["SLiM"]["generation"],
)
logging.debug(f"pyslim: slim generation:{slim_ts.metadata['SLiM']['generation']}")
alive_inds = pyslim.individuals_alive_at(recap, 0)
keep_indivs = np.random.choice(alive_inds, sample_size, replace=False)
keep_nodes = []
for i in keep_indivs:
keep_nodes.extend(recap.individual(i).nodes)
logging.debug(f"before simplify {recap.num_nodes} nodes")
sts = recap.simplify(keep_nodes)
logging.debug(f"after simplify {sts.num_nodes} nodes")
logging.debug(f"after simplify {sts.num_trees} trees")
return msprime.mutate(sts, rate=mu)
class SweepVsSlim(Test):
"""
Tests where we compare the msprime sweeps with SLiM simulations.
"""
def run_sweep_slim_comparison(self, slim_args, **kwargs):
df_list = []
kwargs["model"] = "msp"
logging.debug(f"Running: {kwargs}")
seq_length = kwargs.get("seq_length")
pop_size = kwargs.get("pop_size")
s = kwargs.get("s")
tau = kwargs.get("tau")
sample_size = kwargs.get("sample_size")
recombination_rate = kwargs.get("recombination_rate")
num_replicates = kwargs.get("num_replicates")
sweep = msprime.SweepGenicSelection(
position=seq_length / 2,
start_frequency=1.0 / (2 * pop_size),