% Generated by Paperpile. Check out http://paperpile.com for more information.
% BibTeX export options can be customized via Settings -> BibTeX.
@ARTICLE{Pais2014-sr,
title = "Assessing the efficiency of multiple sequence alignment
programs",
author = "Pais, Fabiano Sviatopolk-Mirsky and Ruy, Patr\'{\i}cia de
C\'{a}ssia and Oliveira, Guilherme and Coimbra, Roney Santos",
affiliation = "Center for Excellence in Bioinformatics, Centro de Pesquisas
Ren\'{e} Rachou (CPqRR), Funda\c{c}\~{a}o Oswaldo Cruz
(FIOCRUZ/Minas), Belo Horizonte, MG Brazil.",
abstract = "BACKGROUND: Multiple sequence alignment (MSA) is an extremely
useful tool for molecular and evolutionary biology and there
are several programs and algorithms available for this
purpose. Although previous studies have compared the alignment
accuracy of different MSA programs, their computational time
and memory usage have not been systematically evaluated. Given
the unprecedented amount of data produced by next generation
deep sequencing platforms, and increasing demand for
large-scale data analysis, it is imperative to optimize the
application of software. Therefore, a balance between
alignment accuracy and computational cost has become a
critical indicator of the most suitable MSA program. We
compared both accuracy and cost of nine popular MSA programs,
namely CLUSTALW, CLUSTAL OMEGA, DIALIGN-TX, MAFFT, MUSCLE,
POA, Probalign, Probcons and T-Coffee, against the benchmark
alignment dataset BAliBASE and discuss the relevance of some
implementations embedded in each program's algorithm. Accuracy
of alignment was calculated with the two standard scoring
functions provided by BAliBASE, the sum-of-pairs and
total-column scores, and computational costs were determined
by collecting peak memory usage and time of execution.
RESULTS: Our results indicate that mostly the
consistency-based programs Probcons, T-Coffee, Probalign and
MAFFT outperformed the other programs in accuracy. Whenever
sequences with large N/C terminal extensions were present in
the BAliBASE suite, Probalign, MAFFT and also CLUSTAL OMEGA
outperformed Probcons and T-Coffee. The drawback of these
programs is that they are more memory-greedy and slower than
POA, CLUSTALW, DIALIGN-TX, and MUSCLE. CLUSTALW and MUSCLE
were the fastest programs, being CLUSTALW the least RAM memory
demanding program. CONCLUSIONS: Based on the results presented
herein, all four programs Probcons, T-Coffee, Probalign and
MAFFT are well recommended for better accuracy of multiple
sequence alignments. T-Coffee and recent versions of MAFFT can
deliver faster and reliable alignments, which are specially
suited for larger datasets than those encountered in the
BAliBASE suite, if multi-core computers are available. In
fact, parallelization of alignments for multi-core computers
should probably be addressed by more programs in a near
future, which will certainly improve performance
significantly.",
journal = "Algorithms Mol. Biol.",
volume = 9,
number = 1,
pages = "4",
month = "6~" # mar,
year = 2014
}
@ARTICLE{Freyhult2007-et,
title = "Exploring genomic dark matter: a critical assessment of the
performance of homology search methods on noncoding {RNA}",
author = "Freyhult, Eva K and Bollback, Jonathan P and Gardner, Paul P",
affiliation = "The Linnaeus Centre for Bioinformatics, Uppsala University,
75124 Uppsala, Sweden.",
abstract = "Homology search is one of the most ubiquitous bioinformatic
tasks, yet it is unknown how effective the currently available
tools are for identifying noncoding RNAs (ncRNAs). In this
work, we use reliable ncRNA data sets to assess the
effectiveness of methods such as BLAST, FASTA, HMMer, and
Infernal. Surprisingly, the most popular homology search
methods are often the least accurate. As a result, many
studies have used inappropriate tools for their analyses. On
the basis of our results, we suggest homology search
strategies using the currently available tools and some
directions for future development.",
journal = "Genome Res.",
publisher = "Cold Spring Harbor Lab",
volume = 17,
number = 1,
pages = "117--125",
month = "6~" # jan,
year = 2007,
keywords = "My publications;citations.bib;paulall.bib"
}
@ARTICLE{Yang2009-oc,
title = "Comparison of public peak detection algorithms for {MALDI}
mass spectrometry data analysis",
author = "Yang, Chao and He, Zengyou and Yu, Weichuan",
affiliation = "Department of Electronic and Computer Engineering, The Hong
Kong University of Science and Technology, Clear Water Bay,
Hong Kong, PR China. [email protected]",
abstract = "BACKGROUND: In mass spectrometry (MS) based proteomic data
analysis, peak detection is an essential step for subsequent
analysis. Recently, there has been significant progress in the
development of various peak detection algorithms. However,
neither a comprehensive survey nor an experimental comparison
of these algorithms is yet available. The main objective of
this paper is to provide such a survey and to compare the
performance of single spectrum based peak detection methods.
RESULTS: In general, we can decompose a peak detection
procedure into three consequent parts: smoothing, baseline
correction and peak finding. We first categorize existing peak
detection algorithms according to the techniques used in
different phases. Such a categorization reveals the
differences and similarities among existing peak detection
algorithms. Then, we choose five typical peak detection
algorithms to conduct a comprehensive experimental study using
both simulation data and real MALDI MS data. CONCLUSION: The
results of comparison show that the continuous wavelet-based
algorithm provides the best average performance.",
journal = "BMC Bioinformatics",
volume = 10,
pages = "4",
month = "6~" # jan,
year = 2009
}
@ARTICLE{Altschul1990-ht,
title = "Basic local alignment search tool",
author = "Altschul, S F and Gish, W and Miller, W and Myers, E W and
Lipman, D J",
journal = "J. Mol. Biol.",
volume = 215,
number = 3,
pages = "403--410",
month = oct,
year = 1990,
keywords = "Mendeley Import (Jan 23);paulall.bib"
}
@ARTICLE{Boutros2014-zm,
title = "Toward better benchmarking: challenge-based methods assessment in
cancer genomics",
author = "Boutros, Paul C and Margolin, Adam A and Stuart, Joshua M and
Califano, Andrea and Stolovitzky, Gustavo",
abstract = "Rapid technological development has created an urgent need for
improved evaluation of algorithms for the analysis of cancer
genomics data. We outline how challenge-based assessment may help
fill this gap by leveraging crowd-sourcing to distribute effort
and reduce bias.",
journal = "Genome Biol.",
volume = 15,
number = 9,
pages = "462",
month = "17~" # sep,
year = 2014,
language = "en"
}
@ARTICLE{Boulesteix2013-vb,
title = "A plea for neutral comparison studies in computational
sciences",
author = "Boulesteix, A and Lauer, S and Eugster, M J
A",
affiliation = "Department of Medical Informatics, Biometry and Epidemiology,
Ludwig-Maximilians-University of Munich, Munich, Germany.",
abstract = "In computational science literature including, e.g.,
bioinformatics, computational statistics or machine learning,
most published articles are devoted to the development of
``new methods'', while comparison studies are generally
appreciated by readers but surprisingly given poor
consideration by many journals. This paper stresses the
importance of neutral comparison studies for the objective
evaluation of existing methods and the establishment of
standards by drawing parallels with clinical research. The
goal of the paper is twofold. Firstly, we present a survey of
recent computational papers on supervised classification
published in seven high-ranking computational science
journals. The aim is to provide an up-to-date picture of
current scientific practice with respect to the comparison of
methods in both articles presenting new methods and articles
focusing on the comparison study itself. Secondly, based on
the results of our survey we critically discuss the necessity,
impact and limitations of neutral comparison studies in
computational sciences. We define three reasonable criteria a
comparison study has to fulfill in order to be considered as
neutral, and explicate general considerations on the
individual components of a ``tidy neutral comparison study''.
R codes for completely replicating our statistical analyses
and figures are available from the companion website
http://www.ibe.med.uni-muenchen.de/organisation/mitarbeiter/020\_professuren/boulesteix/plea2013.",
journal = "PLoS One",
volume = 8,
number = 4,
pages = "e61562",
month = "24~" # apr,
year = 2013
}
@ARTICLE{Bayzid2013-hc,
title = "Naive binning improves phylogenomic analyses",
author = "Bayzid, Md Shamsuzzoha and Warnow, Tandy",
affiliation = "Department of Computer Science, The University of Texas at
Austin, Austin, TX 78712, USA.",
abstract = "MOTIVATION: Species tree estimation in the presence of
incomplete lineage sorting (ILS) is a major challenge for
phylogenomic analysis. Although many methods have been
developed for this problem, little is understood about the
relative performance of these methods when estimated gene
trees are poorly estimated, owing to inadequate phylogenetic
signal. RESULTS: We explored the performance of some methods
for estimating species trees from multiple markers on
simulated datasets in which gene trees differed from the
species tree owing to ILS. We included *BEAST, concatenated
analysis and several 'summary methods': BUCKy, MP-EST,
minimize deep coalescence, matrix representation with
parsimony and the greedy consensus. We found that *BEAST and
concatenation gave excellent results, often with substantially
improved accuracy over the other methods. We observed that
*BEAST's accuracy is largely due to its ability to co-estimate
the gene trees and species tree. However, *BEAST is
computationally intensive, making it challenging to run on
datasets with 100 or more genes or with more than 20 taxa. We
propose a new approach to species tree estimation in which the
genes are partitioned into sets, and the species tree is
estimated from the resultant 'supergenes'. We show that this
technique improves the scalability of *BEAST without affecting
its accuracy and improves the accuracy of the summary methods.
Thus, naive binning can improve phylogenomic analysis in the
presence of ILS. CONTACT: [email protected] SUPPLEMENTARY
INFORMATION: Supplementary data are available at
Bioinformatics online.",
journal = "Bioinformatics",
volume = 29,
number = 18,
pages = "2277--2284",
month = "15~" # sep,
year = 2013
}
@ARTICLE{Kolodny2005-ry,
title = "Comprehensive evaluation of protein structure alignment
methods: scoring by geometric measures",
author = "Kolodny, Rachel and Koehl, Patrice and Levitt, Michael",
affiliation = "Department of Structural Biology, Fairchild Building, Stanford
University, Stanford CA 94305, USA. [email protected]",
abstract = "We report the largest and most comprehensive comparison of
protein structural alignment methods. Specifically, we
evaluate six publicly available structure alignment programs:
SSAP, STRUCTAL, DALI, LSQMAN, CE and SSM by aligning all
8,581,970 protein structure pairs in a test set of 2930
protein domains specially selected from CATH v.2.4 to ensure
sequence diversity. We consider an alignment good if it
matches many residues, and the two substructures are
geometrically similar. Even with this definition, evaluating
structural alignment methods is not straightforward. At first,
we compared the rates of true and false positives using
receiver operating characteristic (ROC) curves with the CATH
classification taken as a gold standard. This proved
unsatisfactory in that the quality of the alignments is not
taken into account: sometimes a method that finds less good
alignments scores better than a method that finds better
alignments. We correct this intrinsic limitation by using four
different geometric match measures (SI, MI, SAS, and GSAS) to
evaluate the quality of each structural alignment. With this
improved analysis we show that there is a wide variation in
the performance of different methods; the main reason for this
is that it can be difficult to find a good structural
alignment between two proteins even when such an alignment
exists. We find that STRUCTAL and SSM perform best, followed
by LSQMAN and CE. Our focus on the intrinsic quality of each
alignment allows us to propose a new method, called
``Best-of-All'' that combines the best results of all methods.
Many commonly used methods miss 10-50\% of the good
Best-of-All alignments. By putting existing structural
alignments into proper perspective, our study allows better
comparison of protein structures. By highlighting limitations
of existing methods, it will spur the further development of
better structural alignment methods. This will have
significant biological implications now that structural
comparison has come to play a central role in the analysis of
experimental work on protein structure, protein function and
protein evolution.",
journal = "J. Mol. Biol.",
volume = 346,
number = 4,
pages = "1173--1188",
month = "4~" # mar,
year = 2005
}
@ARTICLE{Sheldrick2008-xy,
title = "A short history of {SHELX}",
author = "Sheldrick, G M",
affiliation = "Department of Structural Chemistry, University of Goettingen,
Tammannstrasse 4, D-37077 Goettingen, Germany.",
abstract = "An account is given of the development of the SHELX system of
computer programs from SHELX-76 to the present day. In
addition to identifying useful innovations that have come into
general use through their implementation in SHELX, a critical
analysis is presented of the less-successful features, missed
opportunities and desirable improvements for future releases
of the software. An attempt is made to understand how a
program originally designed for photographic intensity data,
punched cards and computers over 10000 times slower than an
average modern personal computer has managed to survive for so
long. SHELXL is the most widely used program for
small-molecule refinement and SHELXS and SHELXD are often
employed for structure solution despite the availability of
objectively superior programs. SHELXL also finds a niche for
the refinement of macromolecules against high-resolution or
twinned data; SHELXPRO acts as an interface for macromolecular
applications. SHELXC, SHELXD and SHELXE are proving useful for
the experimental phasing of macromolecules, especially because
they are fast and robust and so are often employed in
pipelines for high-throughput phasing. This paper could serve
as a general literature citation when one or more of the
open-source SHELX programs (and the Bruker AXS version
SHELXTL) are employed in the course of a crystal-structure
determination.",
journal = "Acta Crystallogr. A",
volume = 64,
number = "Pt 1",
pages = "112--122",
month = jan,
year = 2008
}
@ARTICLE{Jelizarow2010-zf,
title = "Over-optimism in bioinformatics: an illustration",
author = "Jelizarow, M and Guillemot, V and Tenenhaus, A
and Strimmer, K and Boulesteix, A",
affiliation = "Department of Medical Informatics, Biometry and Epidemiology,
University of Munich, Munich, Germany.",
abstract = "MOTIVATION: In statistical bioinformatics research, different
optimization mechanisms potentially lead to 'over-optimism' in
published papers. So far, however, a systematic critical study
concerning the various sources underlying this over-optimism
is lacking. RESULTS: We present an empirical study on
over-optimism using high-dimensional classification as
example. Specifically, we consider a 'promising' new
classification algorithm, namely linear discriminant analysis
incorporating prior knowledge on gene functional groups
through an appropriate shrinkage of the within-group
covariance matrix. While this approach yields poor results in
terms of error rate, we quantitatively demonstrate that it can
artificially seem superior to existing approaches if we 'fish
for significance'. The investigated sources of over-optimism
include the optimization of datasets, of settings, of
competing methods and, most importantly, of the method's
characteristics. We conclude that, if the improvement of a
quantitative criterion such as the error rate is the main
contribution of a paper, the superiority of new algorithms
should always be demonstrated on independent validation data.
AVAILABILITY: The R codes and relevant data can be downloaded
from
http://www.ibe.med.uni-muenchen.de/organisation/mitarbeiter/020\_professuren/boulesteix/overoptimism/,
such that the study is completely reproducible.",
journal = "Bioinformatics",
volume = 26,
number = 16,
pages = "1990--1998",
month = "15~" # aug,
year = 2010
}
@ARTICLE{Thompson1994-eu,
title = "{CLUSTAL} W: improving the sensitivity of progressive multiple
sequence alignment through sequence weighting, position-specific
gap penalties and weight matrix choice",
author = "Thompson, J D and Higgins, D G and Gibson, T J",
journal = "Nucleic Acids Res.",
volume = 22,
number = 22,
pages = "4673--4680",
month = nov,
year = 1994,
keywords = "Mendeley Import (Jan 23);paulall.bib"
}
@ARTICLE{Lu2013-fs,
title = "Comparative study of de novo assembly and genome-guided
assembly strategies for transcriptome reconstruction based on
{RNA-Seq}",
author = "Lu, Bingxin and Zeng, Zhenbing and Shi, Tieliu",
affiliation = "Center for Bioinformatics and Computational Biology, Shanghai
Key Laboratory of Regulatory Biology, Institute of Biomedical
Sciences and School of Life Sciences, East China Normal
University, Shanghai 200241, China.",
abstract = "Transcriptome reconstruction is an important application of
RNA-Seq, providing critical information for further analysis
of transcriptome. Although RNA-Seq offers the potential to
identify the whole picture of transcriptome, it still presents
special challenges. To handle these difficulties and
reconstruct transcriptome as completely as possible, current
computational approaches mainly employ two strategies: de novo
assembly and genome-guided assembly. In order to find the
similarities and differences between them, we firstly chose
five representative assemblers belonging to the two classes
respectively, and then investigated and compared their
algorithm features in theory and real performances in
practice. We found that all the methods can be reduced to
graph reduction problems, yet they have different conceptual
and practical implementations, thus each assembly method has
its specific advantages and disadvantages, performing worse
than others in certain aspects while outperforming others in
anther aspects at the same time. Finally we merged assemblies
of the five assemblers and obtained a much better assembly.
Additionally we evaluated an assembler using genome-guided de
novo assembly approach, and achieved good performance. Based
on these results, we suggest that to obtain a comprehensive
set of recovered transcripts, it is better to use a
combination of de novo assembly and genome-guided assembly.",
journal = "Sci. China Life Sci.",
volume = 56,
number = 2,
pages = "143--155",
month = feb,
year = 2013
}
@ARTICLE{Wilson2006-ih,
title = "Where's the real bottleneck in scientific computing?",
author = "Wilson, G V",
abstract = "When I first started doing computational science in 1986, a new
generation of fast, cheap chips had just ushered in the current
era of low-cost supercomputers, in which multiple processors
work in parallel on a single problem. Suddenly, it seemed as
though everyone ...",
journal = "Am. Sci.",
publisher = "americanscientist.org",
year = 2006
}
@ARTICLE{Felsenstein1995-ic,
title = "Phylogeny programs",
author = "Felsenstein, J",
journal = "Internet address: http://evolution.gs.washington.edu/phylip/software.html",
year = 1995
}
@ARTICLE{Mann1947-re,
title = "On a Test of Whether one of Two Random Variables is
Stochastically Larger than the Other",
author = "Mann, H B and Whitney, D R",
abstract = "Let x and y be two random variables with continuous cumulative
distribution functions f and g. A statistic U depending on the
relative ranks of the x's and y's is proposed for testing the
hypothesis f = g. Wilcoxon proposed an equivalent test in the
Biometrics Bulletin, December, 1945, but gave only a few points
of the distribution of his statistic. Under the hypothesis f = g
the probability of obtaining a given U in a sample of n x's and
m y's is the solution of a certain recurrence relation involving
n and m. Using this recurrence relation tables have been
computed giving the probability of U for samples up to n = m =
8. At this point the distribution is almost normal. From the
recurrence relation explicit expressions for the mean, variance,
and fourth moment are obtained. The 2rth moment is shown to have
a certain form which enabled us to prove that the limit
distribution is normal if m, n go to infinity in any arbitrary
manner. The test is shown to be consistent with respect to the
class of alternatives $f(x) > g(x)$ for every x.",
journal = "Ann. Math. Stat.",
publisher = "Institute of Mathematical Statistics",
volume = 18,
number = 1,
pages = "50--60",
year = 1947
}
@ARTICLE{Knowles2008-jj,
title = "Why does a method that fails continue to be used?",
author = "Knowles, L Lacey",
affiliation = "Department of Ecology and Evolutionary Biology, Museum of
Zoology, University of Michigan, Ann Arbor, Michigan 48109,
USA. [email protected]",
abstract = "As a critical framework for addressing a diversity of
evolutionary and ecological questions, any method that
provides accurate and detailed phylogeographic inference would
be embraced. What is difficult to understand is the continued
use of a method that not only fails, but also has never been
shown to work--nested clade analysis is applied widely even
though the conditions under which the method will provide
reliable results have not yet been demonstrated. This
contradiction between performance and popularity is even more
perplexing given the recent methodological and computational
advances for making historical inferences, which include
estimating population genetic parameters and testing different
biogeographic scenarios. Here I briefly review the history of
criticisms and rebuttals that focus specifically on the high
rate of incorrect phylogeographic inference of nested-clade
analysis, with the goal of understanding what drives its
unfettered popularity. In this case, the appeal of what
nested-clade analysis claims to do--not what the method
actually achieves--appears to explain its paradoxical status
as a favorite method that fails. What a method promises, as
opposed to how it performs, must be considered separately when
evaluating whether the method represents a valuable tool for
historical inference.",
journal = "Evolution",
volume = 62,
number = 11,
pages = "2713--2717",
month = nov,
year = 2008
}
@INCOLLECTION{Kandemir2002-sv,
title = "Compiler Optimizations for Low Power Systems",
booktitle = "Power Aware Computing",
author = "Kandemir, Mahmut and Vijaykrishnan, N and Irwin, Mary Jane",
editor = "Graybill, Robert and Melhem, Rami",
publisher = "Springer US",
pages = "191--210",
series = "Series in Computer Science",
year = 2002,
language = "en"
}
@ARTICLE{Norel2011-cq,
title = "The self-assessment trap: can we all be better than average?",
author = "Norel, R and Rice, J J and Stolovitzky, G",
journal = "Mol. Syst. Biol.",
publisher = "EMBO Press",
volume = 7,
number = 1,
pages = "537",
month = "1~" # jan,
year = 2011
}
@ARTICLE{Posada1998-qq,
title = "{MODELTEST}: testing the model of {DNA} substitution",
author = "Posada, D and Crandall, K A",
affiliation = "Department of Zoology, Brigham Young University, 574 WIDB,
Provo, UT 84602-5255, USA. [email protected]",
abstract = "SUMMARY: The program MODELTEST uses log likelihood scores to
establish the model of DNA evolution that best fits the data.
AVAILABILITY: The MODELTEST package, including the source code
and some documentation is available at
http://bioag.byu.edu/zoology/crandall\_lab/modeltest.html.",
journal = "Bioinformatics",
volume = 14,
number = 9,
pages = "817--818",
year = 1998
}
@INCOLLECTION{Otwinowski1997-xj,
title = "[20] Processing of {X-ray} diffraction data collected in
oscillation mode",
booktitle = "Methods in Enzymology",
author = "Otwinowski, Z and Minor, W",
abstract = "Publisher Summary X-ray data can be collected with zero-, one-,
and two-dimensional detectors, zero-dimensional (single counter)
being the simplest and two-dimensional the most efficient in
terms of measuring diffracted X-rays in all directions. To
analyze the single-crystal diffraction data collected with these
detectors, several computer programs have been developed.
Two-dimensional detectors and related software are now
predominantly used to measure and integrate diffraction from
single crystals of biological macromolecules. Macromolecular
crystallography is an iterative process. To monitor the
progress, the HKL package provides two tools: (1) statistics,
both weighted ($\chi$2) and unweighted (R-merge), where the
Bayesian reasoning and multicomponent error model helps obtain
proper error estimates and (2) visualization of the process,
which helps an operator to confirm that the process of data
reduction, including the resulting statistics, is correct and
allows the evaluation of the problems for which there are no
good statistical criteria. Visualization also provides
confidence that the point of diminishing returns in data
collection and reduction has been reached. At that point, the
effort should be directed to solving the structure. The methods
presented in the chapter have been applied to solve a large
variety of problems, from inorganic molecules with 5 {\AA} unit
cell to rotavirus of 700 {\AA} diameters crystallized in 700
\texttimes{} 1000 \texttimes{} 1400 {\AA} cell.",
publisher = "Academic Press",
volume = "276",
pages = "307--326",
year = 1997
}
@ARTICLE{Marx2013-zi,
title = "Biology: The big challenges of big data",
author = "Marx, V",
journal = "Nature",
volume = 498,
number = 7453,
pages = "255--260",
month = "13~" # jun,
year = 2013,
language = "en"
}
@ARTICLE{Woolley2010-ld,
title = "Evidence for a collective intelligence factor in the
performance of human groups",
author = "Woolley, A W and Chabris, C F and
Pentland, A and Hashmi, N and Malone, T W",
affiliation = "Carnegie Mellon University, Tepper School of Business,
Pittsburgh, PA 15213, USA. [email protected]",
abstract = "Psychologists have repeatedly shown that a single statistical
factor--often called ``general intelligence''--emerges from
the correlations among people's performance on a wide variety
of cognitive tasks. But no one has systematically examined
whether a similar kind of ``collective intelligence'' exists
for groups of people. In two studies with 699 people, working
in groups of two to five, we find converging evidence of a
general collective intelligence factor that explains a group's
performance on a wide variety of tasks. This ``c factor'' is
not strongly correlated with the average or maximum individual
intelligence of group members but is correlated with the
average social sensitivity of group members, the equality in
distribution of conversational turn-taking, and the proportion
of females in the group.",
journal = "Science",
volume = 330,
number = 6004,
pages = "686--688",
month = "29~" # oct,
year = 2010
}
@ARTICLE{Easterbrook1991-wp,
title = "Publication bias in clinical research",
author = "Easterbrook, P J and Berlin, J A and Gopalan, R and Matthews,
D R",
affiliation = "Division of Internal Medicine, Johns Hopkins University School
of Medicine, Baltimore, MD 21205.",
abstract = "In a retrospective survey, 487 research projects approved by
the Central Oxford Research Ethics Committee between 1984 and
1987, were studied for evidence of publication bias. As of
May, 1990, 285 of the studies had been analysed by the
investigators, and 52\% of these had been published. Studies
with statistically significant results were more likely to be
published than those finding no difference between the study
groups (adjusted odds ratio [OR] 2.32; 95\% confidence
interval [CI] 1.25-4.28). Studies with significant results
were also more likely to lead to a greater number of
publications and presentations and to be published in journals
with a high citation impact factor. An increased likelihood of
publication was also associated with a high rating by the
investigator of the importance of the study results, and with
increasing sample size. The tendency towards publication bias
was greater with observational and laboratory-based
experimental studies (OR = 3.79; 95\% CI = 1.47-9.76) than
with randomised clinical trials (OR = 0.84; 95\% CI =
0.34-2.09). We have confirmed the presence of publication bias
in a cohort of clinical research studies. These findings
suggest that conclusions based only on a review of published
data should be interpreted cautiously, especially for
observational studies. Improved strategies are needed to
identify the results of unpublished as well as published
studies.",
journal = "Lancet",
volume = 337,
number = 8746,
pages = "867--872",
month = "13~" # apr,
year = 1991,
keywords = "Biomedical and Behavioral Research; Central Oxford Research
Ethics Committee; Empirical Approach"
}
@ARTICLE{Lowe1997-hq,
title = "{tRNAscan-SE}: a program for improved detection of transfer {RNA}
genes in genomic sequence",
author = "Lowe, T M and Eddy, S R",
journal = "Nucleic Acids Res.",
volume = 25,
number = 5,
pages = "955--964",
month = mar,
year = 1997,
keywords = "paulall.bib"
}
@ARTICLE{Gadbury2004-ga,
title = "Power and sample size estimation in high dimensional biology",
author = "Gadbury, G L and Page, G P and Edwards, J and others",
abstract = "Abstract Genomic scientists often test thousands of hypotheses
in a single experiment. One example is a microarray experiment
that seeks to determine differential gene expression among
experimental groups. Planning such experiments involves a
determination of ...",
journal = "Stat. Methods Med. Res.",
publisher = "smm.sagepub.com",
year = 2004
}
@ARTICLE{Minor2000-dv,
title = "Strategies for macromolecular synchrotron crystallography",
author = "Minor, W and Tomchick, D and Otwinowski, Z",
affiliation = "Department of Molecular Physiology and Biological Physics,
University of Virginia, Charlottesville, VA 22903, USA.",
journal = "Structure",
volume = 8,
number = 5,
pages = "R105--10",
month = "15~" # may,
year = 2000
}
@ARTICLE{Jones1991-ik,
title = "Improved methods for building protein models in electron
density maps and the location of errors in these models",
author = "Jones, T A and Zou, J Y and Cowan, S W and Kjeldgaard, M",
affiliation = "Department of Molecular Biology, BMC, Uppsala, Sweden.",
abstract = "Map interpretation remains a critical step in solving the
structure of a macromolecule. Errors introduced at this early
stage may persist throughout crystallographic refinement and
result in an incorrect structure. The normally quoted
crystallographic residual is often a poor description for the
quality of the model. Strategies and tools are described that
help to alleviate this problem. These simplify the
model-building process, quantify the goodness of fit of the
model on a per-residue basis and locate possible errors in
peptide and side-chain conformations.",
journal = "Acta Crystallogr. A",
volume = "47 ( Pt 2)",
pages = "110--119",
month = "1~" # mar,
year = 1991
}
@ARTICLE{Wallner2005-qi,
title = "All are not equal: a benchmark of different homology modeling
programs",
author = "Wallner, Bj{\"{o}}rn and Elofsson, Arne",
affiliation = "Stockholm Bioinformatics Center, Albanova University Center,
Stockholm University, Stockholm, Sweden. [email protected]",
abstract = "Modeling a protein structure based on a homologous structure
is a standard method in structural biology today. In this
process an alignment of a target protein sequence onto the
structure of a template(s) is used as input to a program that
constructs a 3D model. It has been shown that the most
important factor in this process is the correctness of the
alignment and the choice of the best template structure(s),
while it is generally believed that there are no major
differences between the best modeling programs. Therefore, a
large number of studies to benchmark the alignment qualities
and the selection process have been performed. However, to our
knowledge no large-scale benchmark has been performed to
evaluate the programs used to transform the alignment to a 3D
model. In this study, a benchmark of six different homology
modeling programs- Modeller, SegMod/ENCAD, SWISS-MODEL,
3D-JIGSAW, nest, and Builder-is presented. The performance of
these programs is evaluated using physiochemical correctness
and structural similarity to the correct structure. From our
analysis it can be concluded that no single modeling program
outperform the others in all tests. However, it is quite clear
that three modeling programs, Modeller, nest, and SegMod/
ENCAD, perform better than the others. Interestingly, the
fastest and oldest modeling program, SegMod/ ENCAD, performs
very well, although it was written more than 10 years ago and
has not undergone any development since. It can also be
observed that none of the homology modeling programs builds
side chains as well as a specialized program (SCWRL), and
therefore there should be room for improvement.",
journal = "Protein Sci.",
volume = 14,
number = 5,
pages = "1315--1327",
month = may,
year = 2005
}
@ARTICLE{Merton1968-cb,
title = "The {Matthew Effect in Science}",
author = "Merton, R K",
journal = "Science",
address = "Washington",
volume = 159,
number = 3810,
pages = "56--63",
year = 1968
}
@ARTICLE{Joppa2013-vj,
title = "Troubling Trends in Scientific Software Use",
author = "Joppa, L N and McInerny, G and Harper, R and
Salido, L and Takeda, K and O'Hara, K and Gavaghan,
D and Emmott, S",
abstract = "Software pervades every domain of science ( 1 -- 3 ), perhaps
nowhere more decisively than in modeling. In key scientific
areas of great societal importance, models and the software that
implement them define both how science is done and what science
is done ( 4 , 5 ). Across all science, this dependence has led
to concerns around the need for open access to software ( 6 , 7
), centered on the reproducibility of research ( 1 , 8 -- 10 ).
From fields such as high-performance computing, we learn key
insights and best practices for how to develop, standardize, and
implement software ( 11 ). Open and systematic approaches to the
development of software are essential for all sciences. But for
many scientists this is not sufficient. We describe problems
with the adoption and use of scientific software.",
journal = "Science",
publisher = "American Association for the Advancement of Science",
volume = 340,
number = 6134,
pages = "814--815",
month = "17~" # may,
year = 2013,
language = "en"
}
@ARTICLE{Fourment2008-vl,
title = "A comparison of common programming languages used in
bioinformatics",
author = "Fourment, M and Gillings, M R",
affiliation = "Department of Biological Sciences, Macquarie University,
Sydney, NSW 2109, Australia. [email protected]",
abstract = "BACKGROUND: The performance of different programming languages
has previously been benchmarked using abstract mathematical
algorithms, but not using standard bioinformatics algorithms.
We compared the memory usage and speed of execution for three
standard bioinformatics methods, implemented in programs using
one of six different programming languages. Programs for the
Sellers algorithm, the Neighbor-Joining tree construction
algorithm and an algorithm for parsing BLAST file outputs were
implemented in C, C++, C\#, Java, Perl and Python. RESULTS:
Implementations in C and C++ were fastest and used the least
memory. Programs in these languages generally contained more
lines of code. Java and C\# appeared to be a compromise
between the flexibility of Perl and Python and the fast
performance of C and C++. The relative performance of the
tested languages did not change from Windows to Linux and no
clear evidence of a faster operating system was found. Source
code and additional information are available from
http://www.bioinformatics.org/benchmark/. CONCLUSION: This
benchmark provides a comparison of six commonly used
programming languages under two different operating systems.
The overall comparison shows that a developer should choose an
appropriate language carefully, taking into account the
performance expected and the library availability for each
language.",
journal = "BMC Bioinformatics",
volume = 9,
pages = "82",
month = "5~" # feb,
year = 2008
}
@ARTICLE{Puton2014-hy,
title = "{CompaRNA}: a server for continuous benchmarking of automated
methods for {RNA} secondary structure prediction",
author = "Puton, Tomasz and Kozlowski, Lukasz P and Rother, Kristian M and
Bujnicki, Janusz M",
journal = "Nucleic Acids Res.",
volume = 42,
number = 8,
pages = "5403--5406",
month = apr,
year = 2014
}
@ARTICLE{Swenson2010-un,
title = "A simulation study comparing supertree and combined analysis
methods using {SMIDGen}",
author = "Swenson, M S and Barban{\c{c}}on, F and Warnow, T and others",
abstract = "Abstract Background: Supertree methods comprise one approach to
reconstructing large molecular phylogenies given multi-marker
datasets: trees are estimated on each marker and then combined
into a tree (the `` supertree '') on the entire set of taxa.
Supertrees can be ...",
journal = "Algorithms for Molecular Biology",
publisher = "biomedcentral.com",
year = 2010
}
@BOOK{Carroll2010-fn,
title = "Alice in {Wonderland} \& Through the {Looking Glass}",
author = "Carroll, Lewis",
publisher = "Bibliolis Books",
year = 2010
}
@ARTICLE{Bao2011-lv,
title = "Evaluation of next-generation sequencing software in mapping
and assembly",
author = "Bao, S and Jiang, R and Kwan, W and Wang,
B and Ma, X and Song, Y",
affiliation = "Department of Biochemistry, Center for Reproduction,
Development and Growth, The University of Hong Kong, Hong
Kong, Hong Kong.",
abstract = "Next-generation high-throughput DNA sequencing technologies
have advanced progressively in sequence-based genomic research
and novel biological applications with the promise of
sequencing DNA at unprecedented speed. These new
non-Sanger-based technologies feature several advantages when
compared with traditional sequencing methods in terms of
higher sequencing speed, lower per run cost and higher
accuracy. However, reads from next-generation sequencing (NGS)
platforms, such as 454/Roche, ABI/SOLiD and Illumina/Solexa,
are usually short, thereby restricting the applications of NGS
platforms in genome assembly and annotation. We presented an
overview of the challenges that these novel technologies meet
and particularly illustrated various bioinformatics attempts
on mapping and assembly for problem solving. We then compared
the performance of several programs in these two fields, and
further provided advices on selecting suitable tools for
specific biological applications.",
journal = "J. Hum. Genet.",
volume = 56,
number = 6,
pages = "406--414",
month = jun,
year = 2011
}
@ARTICLE{Tikk2010-qd,
title = "A comprehensive benchmark of kernel methods to extract
protein-protein interactions from literature",
author = "Tikk, Domonkos and Thomas, Philippe and Palaga, Peter and
Hakenberg, J{\"{o}}rg and Leser, Ulf",
affiliation = "Knowledge Management in Bioinformatics, Computer Science
Department, Humboldt-Universit{\"{a}}t zu Berlin, Berlin,
Germany. [email protected]",
abstract = "The most important way of conveying new findings in biomedical
research is scientific publication. Extraction of
protein-protein interactions (PPIs) reported in scientific
publications is one of the core topics of text mining in the
life sciences. Recently, a new class of such methods has been
proposed - convolution kernels that identify PPIs using deep
parses of sentences. However, comparing published results of
different PPI extraction methods is impossible due to the use
of different evaluation corpora, different evaluation metrics,
different tuning procedures, etc. In this paper, we study
whether the reported performance metrics are robust across
different corpora and learning settings and whether the use of
deep parsing actually leads to an increase in extraction
quality. Our ultimate goal is to identify the one method that
performs best in real-life scenarios, where information
extraction is performed on unseen text and not on specifically
prepared evaluation data. We performed a comprehensive
benchmarking of nine different methods for PPI extraction that
use convolution kernels on rich linguistic information.
Methods were evaluated on five different public corpora using
cross-validation, cross-learning, and cross-corpus evaluation.
Our study confirms that kernels using dependency trees
generally outperform kernels based on syntax trees. However,
our study also shows that only the best kernel methods can
compete with a simple rule-based approach when the evaluation
prevents information leakage between training and test
corpora. Our results further reveal that the F-score of many
approaches drops significantly if no corpus-specific parameter
optimization is applied and that methods reaching a good AUC
score often perform much worse in terms of F-score. We
conclude that for most kernels no sensible estimation of PPI
extraction performance on new text is possible, given the
current heterogeneity in evaluation data. Nevertheless, our
study shows that three kernels are clearly superior to the
other methods.",
journal = "PLoS Comput. Biol.",
volume = 6,
number = 7,
pages = "e1000837",
month = "1~" # jul,
year = 2010
}
@ARTICLE{Harzing2008-wb,
title = "Comparing the Google Scholar h-index with the {ISI} journal
impact factor",
author = "Harzing, A W and van der Wal, R",
abstract = "Abstract Publication in academic journals is a key criterion for
appointment, tenure and promotion in universities. Many
universities weigh publications according to the quality or
impact of the journal. Traditionally, journal quality has been
assessed through the ISI Journal Impact Factor (JIF).",
journal = "Research in Int. Management Products",
publisher = "harzing.com",
year = 2008
}
@ARTICLE{Moran2003-ve,
title = "Arguments for Rejecting the Sequential Bonferroni in Ecological
Studies",
author = "Moran, Matthew D",
journal = "Oikos",
publisher = "Nordic Society Oikos, Wiley",
volume = 100,
number = 2,
pages = "403--405",
year = 2003
}
@ARTICLE{Altschul2013-bv,
title = "The anatomy of successful computational biology software",
author = "Altschul, S and Demchak, B and Durbin, R and
Gentleman, R and Krzywinski, M and Li, H and
Nekrutenko, A and Robinson, J and Rasband, W and
Taylor, J and Trapnell, C",
affiliation = "National Center for Biotechnology Information, Bethesda,
Maryland.",
journal = "Nat. Biotechnol.",
volume = 31,
number = 10,
pages = "894--897",
month = oct,