-
Notifications
You must be signed in to change notification settings - Fork 0
/
biblio.bib
867 lines (865 loc) · 87.7 KB
/
biblio.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
Automatically generated by Mendeley Desktop 1.19.5
Any changes to this file will be lost if it is regenerated by Mendeley.
BibTeX export options can be customized via Options -> BibTeX in Mendeley Desktop
@article{SVMRFE,
author = {Guyon, Isabelle and Weston, Jason and Barnhill, Stephen and Vapnik, Vladimir},
doi = {10.1023/A:1012487302797},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Guyon et al. - 2002 - Gene Selection for Cancer Classification using Support Vector Machines.pdf:pdf},
issn = {08856125},
journal = {Machine Learning},
number = {1--3},
pages = {389--422},
publisher = {Kluwer Academic Publishers},
title = {{Gene Selection for Cancer Classification using Support Vector Machines}},
url = {http://link.springer.com/10.1023/A:1012487302797},
volume = {46},
year = {2002}
}
@article{NSC,
author = {Tibshirani, Robert and Hastie, Trevor and Narasimhan, Balasubramanian and Chu, Gilbert},
doi = {10.1214/ss/1056397488},
file = {:home/stachu/dyskD/ubuntu/programy/master/NSC.pdf:pdf},
issn = {0883-4237},
journal = {Statistical Science},
keywords = {Sample classification,gene expression arrays},
month = feb,
number = {1},
pages = {104--117},
publisher = {Institute of Mathematical Statistics},
title = {{Class Prediction by Nearest Shrunken Centroids, with Applications to DNA Microarrays}},
url = {http://projecteuclid.org/euclid.ss/1056397488},
volume = {18},
year = {2003}
}
@article{metastasis76,
abstract = {BACKGROUND Genome-wide measures of gene expression can identify patterns of gene activity that subclassify tumours and might provide a better means than is currently available for individual risk assessment in patients with lymph-node-negative breast cancer. METHODS We analysed, with Affymetrix Human U133a GeneChips, the expression of 22 000 transcripts from total RNA of frozen tumour samples from 286 lymph-node-negative patients who had not received adjuvant systemic treatment. FINDINGS In a training set of 115 tumours, we identified a 76-gene signature consisting of 60 genes for patients positive for oestrogen receptors (ER) and 16 genes for ER-negative patients. This signature showed 93{\%} sensitivity and 48{\%} specificity in a subsequent independent testing set of 171 lymph-node-negative patients. The gene profile was highly informative in identifying patients who developed distant metastases within 5 years (hazard ratio 5{\textperiodcentered}67 [95{\%} CI 2{\textperiodcentered}59–12{\textperiodcentered}4]), even when corrected for traditional prognostic factors in multivariate analysis (5{\textperiodcentered}55 [2{\textperiodcentered}46–12{\textperiodcentered}5]). The 76-gene profile also represented a strong prognostic factor for the development of metastasis in the subgroups of 84 premenopausal patients (9{\textperiodcentered}60 [2{\textperiodcentered}28–40{\textperiodcentered}5]), 87 postmenopausal patients (4{\textperiodcentered}04 [1{\textperiodcentered}57–10{\textperiodcentered}4]), and 79 patients with tumours of 10–20 mm (14{\textperiodcentered}1 [3{\textperiodcentered}34–59{\textperiodcentered}2]), a group of patients for whom prediction of prognosis is especially difficult. INTERPRETATION The identified signature provides a powerful tool for identification of patients at high risk of distant recurrence. 
The ability to identify patients who have a favourable prognosis could, after independent confirmation, allow clinicians to avoid adjuvant systemic therapy or to choose less aggressive therapeutic options.},
author = {Wang, Yixin and Klijn, Jan GM and Zhang, Yi and Sieuwerts, Anieta M and Look, Maxime P and Yang, Fei and Talantov, Dmitri and Timmermans, Mieke and {Meijer-van Gelder}, Marion E and Yu, Jack and Jatkoe, Tim and Berns, Els MJJ and Atkins, David and Foekens, John A},
doi = {10.1016/S0140-6736(05)17947-1},
file = {:home/stachu/dyskD/ubuntu/programy/master/metastasis/metastasis76Genes.pdf:pdf},
issn = {0140-6736},
journal = {The Lancet},
month = {feb},
number = {9460},
pages = {671--679},
publisher = {Elsevier},
title = {{Gene-expression profiles to predict distant metastasis of lymph-node-negative primary breast cancer}},
url = {https://www.sciencedirect.com/science/article/pii/S0140673605179471},
volume = {365},
year = {2005}
}
@article{thousandsSamples,
abstract = {Predicting at the time of discovery the prognosis and metastatic potential of cancer is a major challenge in current clinical research. Numerous recent studies searched for gene expression signatures that outperform traditionally used clinical parameters in outcome prediction. Finding such a signature will free many patients of the suffering and toxicity associated with adjuvant chemotherapy given to them under current protocols, even though they do not need such treatment. A reliable set of predictive genes also will contribute to a better understanding of the biological mechanism of metastasis. Several groups have published lists of predictive genes and reported good predictive performance based on them. However, the gene lists obtained for the same clinical types of patients by different groups differed widely and had only very few genes in common. This lack of agreement raised doubts about the reliability and robustness of the reported predictive gene lists, and the main source of the problem was shown to be the small number of samples that were used to generate the gene lists. Here, we introduce a previously undescribed mathematical method, probably approximately correct (PAC) sorting, for evaluating the robustness of such lists. We calculate for several published data sets the number of samples that are needed to achieve any desired level of reproducibility. For example, to achieve a typical overlap of 50{\%} between two predictive lists of genes, breast cancer studies would need the expression profiles of several thousand early discovery patients.},
author = {Ein-Dor, Liat and Zuk, Or and Domany, Eytan},
doi = {10.1073/pnas.0601231103},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Ein-Dor, Zuk, Domany - 2006 - Thousands of samples are needed to generate a robust gene list for predicting outcome in cancer(2).pdf:pdf},
issn = {0027-8424},
journal = {Proceedings of the National Academy of Sciences of the United States of America},
month = {apr},
number = {15},
pages = {5923--8},
pmid = {16585533},
publisher = {National Academy of Sciences},
title = {{Thousands of samples are needed to generate a robust gene list for predicting outcome in cancer.}},
url = {http://www.ncbi.nlm.nih.gov/pubmed/16585533 http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=PMC1458674},
volume = {103},
year = {2006}
}
@article{sPCAold,
abstract = {Principal component analysis (PCA) is widely used in data processing and dimensionality reduction. However, PCA suffers from the fact that each principal component is a linear combination of all the original variables, thus it is often difficult to interpret the results. We introduce a new method called sparse principal component analysis (SPCA) using the lasso (elastic net) to produce modified principal components with sparse loadings. We first show that PCA can be formulated as a regression-type optimization problem; sparse loadings are then obtained by imposing the lasso (elastic net) constraint on the regression coefficients. Efficient algorithms are proposed to fit our SPCA models for both regular multivariate data and gene expression arrays. We also give a new formula to compute the total variance of modified principal components. As illustrations, SPCA is applied to real and simulated data with encouraging results.},
author = {Zou, Hui and Hastie, Trevor and Tibshirani, Robert},
doi = {10.1198/106186006X113430},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Zou, Hastie, Tibshirani - Unknown - Sparse Principal Component Analysis.pdf:pdf},
issn = {1061-8600},
journal = {Journal of Computational and Graphical Statistics},
keywords = {Arrays,Gene expression,Lasso/elastic net,Multivariate analysis,Singular value decomposition,Thresholding},
month = {jun},
number = {2},
pages = {265--286},
publisher = {Taylor {\&} Francis},
title = {{Sparse Principal Component Analysis}},
url = {https://web.stanford.edu/{~}hastie/Papers/spc{\_}jcgs.pdf http://www.tandfonline.com/doi/abs/10.1198/106186006X113430},
volume = {15},
year = {2006}
}
@article{Metastasis2,
abstract = {Prognostic and predictive factors are indispensable tools in the treatment of patients with neoplastic disease. For the most part, such factors rely on a few specific cell surface, histological, or gross pathologic features. Gene expression assays have the potential to supplement what were previously a few distinct features with many thousands of features. We have developed Bayesian regression models that provide predictive capability based on gene expression data derived from DNA microarray analysis of a series of primary breast cancer samples. These patterns have the capacity to discriminate breast tumors on the basis of estrogen receptor status and also on the categorized lymph node status. Importantly, we assess the utility and validity of such models in predicting the status of tumors in crossvalidation determinations. The practical value of such approaches relies on the ability not only to assess relative probabilities of clinical outcomes for future samples but also to provide an honest assessment of the uncertainties associated with such predictive classifications on the basis of the selection of gene subsets for each validation analysis. This latter point is of critical importance in the ability to apply these methodologies to clinical assessment of tumor phenotype.},
annote = {Second important},
author = {West, M. and Blanchette, C. and Dressman, H. and Huang, E. and Ishida, S. and Spang, R. and Zuzan, H. and Olson, J. A. and Marks, J. R. and Nevins, J. R.},
doi = {10.1073/pnas.201162998},
file = {:home/stachu/dyskD/ubuntu/programy/master/predictiingBC.pdf:pdf},
issn = {0027-8424},
journal = {Proceedings of the National Academy of Sciences},
month = {sep},
number = {20},
pages = {11462--11467},
pmid = {11562467},
title = {{Predicting the clinical status of human breast cancer by using gene expression profiles}},
url = {http://www.pnas.org/cgi/doi/10.1073/pnas.201162998},
volume = {98},
year = {2001}
}
@article{Metastasis1,
abstract = {Breast cancer patients with the same stage of disease can have markedly different treatment responses and overall outcome. The strongest predictors for metastases (for example, lymph node status and histological grade) fail to classify accurately breast tumours according to their clinical behaviour. Chemotherapy or hormonal therapy reduces the risk of distant metastases by approximately one-third; however, 70-80{\%} of patients receiving this treatment would have survived without it. None of the signatures of breast cancer gene expression reported to date allow for patient-tailored therapy strategies. Here we used DNA microarray analysis on primary breast tumours of 117 young patients, and applied supervised classification to identify a gene expression signature strongly predictive of a short interval to distant metastases ('poor prognosis' signature) in patients without tumour cells in local lymph nodes at diagnosis (lymph node negative). In addition, we established a signature that identifies tumours of BRCA1 carriers. The poor prognosis signature consists of genes regulating cell cycle, invasion, metastasis and angiogenesis. This gene expression profile will outperform all currently used clinical parameters in predicting disease outcome. Our findings provide a strategy to select patients who would benefit from adjuvant therapy.},
annote = {MOST IMPORTANT but only hierarchical clustering},
author = {{Van't Veer}, Laura J. and Dai, Hongyue and {Van de Vijver}, Marc J. and He, Yudong D. and Hart, Augustinus A M and Mao, Mao and Peterse, Hans L. and {Van Der Kooy}, Karin and Marton, Matthew J. and Witteveen, Anke T. and Schreiber, George J. and Kerkhoven, Ron M. and Roberts, Chris and Linsley, Peter S. and Bernards, Ren{\'{e}} and Friend, Stephen H.},
doi = {10.1038/415530a},
file = {:home/stachu/dyskD/ubuntu/programy/master/metastasis/metastasis{\_}breast{\_}cancer.pdf:pdf},
issn = {00280836},
journal = {Nature},
number = {6871},
pages = {530--536},
title = {{Gene expression profiling predicts clinical outcome of breast cancer}},
volume = {415},
year = {2002}
}
@article{preprocessing,
abstract = {In this paper we report exploratory analyses of high-density oligonucleotide array data from the Affymetrix GeneChip system with the objective of improving upon currently used measures of gene expression. Our analyses make use of three data sets: a small experimental study consisting of five MGU74A mouse GeneChip arrays, part of the data from an extensive spike-in study conducted by Gene Logic and Wyeth's Genetics Institute involving 95 HG-U95A human GeneChip arrays; and part of a dilution study conducted by Gene Logic involving 75 HG-U95A GeneChip arrays. We display some familiar features of the perfect match and mismatch probe (PM and MM) values of these data, and examine the variance-mean relationship with probe-level data from probes believed to be defective, and so delivering noise only. We explain why we need to normalize the arrays to one another using probe level intensities. We then examine the behavior of the PM and MM using spike-in data and assess three commonly used summary measures: Affymetrix's (i) average difference (AvDiff) and (ii) MAS 5.0 signal, and (iii) the Li and Wong multiplicative model-based expression index (MBEI). The exploratory data analyses of the probe level data motivate a new summary measure that is a robust multi-array average (RMA) of background-adjusted, normalized, and log-transformed PM values. We evaluate the four expression summary measures using the dilution study data, assessing their behavior in terms of bias, variance and (for MBEI and RMA) model fit. Finally, we evaluate the algorithms in terms of their ability to detect known levels of differential expression using the spike-in data. We conclude that there is no obvious downside to using RMA and attaching a standard error (SE) to this quantity using a linear model which removes probe-specific affinities.},
author = {Irizarry, R. A. and Hobbs, Bridget and Collin, Francois and Beazer-Barclay, Yasmin D and Antonellis, Kristen J and Scherf, Uwe and Speed, Terence P},
doi = {10.1093/biostatistics/4.2.249},
file = {:home/stachu/dyskD/ubuntu/programy/master/greg/ExploNormSummries.pdf:pdf},
issn = {14654644},
journal = {Biostatistics},
month = {apr},
number = {2},
pages = {249--264},
pmid = {12925520},
title = {{Exploration, normalization, and summaries of high density oligonucleotide array probe level data}},
url = {http://www.ncbi.nlm.nih.gov/pubmed/12925520 https://academic.oup.com/biostatistics/article-lookup/doi/10.1093/biostatistics/4.2.249},
volume = {4},
year = {2003}
}
@article{TumorMolecularClass,
annote = {Molecular classification - not based on data science},
author = {Golub, T. R. and Slonim, D. K. and Tamayo, P. and Huard, C. and Gaasenbeek, M. and Mesirov, J. P. and Coller, H. and Loh, M. L. and Downing, J. R. and Caligiuri, M. A. and Bloomfield, C. D. and Lander, E. S.},
doi = {10.1126/science.286.5439.531},
file = {:home/stachu/dyskD/ubuntu/programy/master/popular/MolecularClassificationMostPopular.pdf:pdf},
journal = {Science},
number = {5439},
pages = {531--537},
title = {{Molecular Classification of Cancer: Class Discovery and Class Prediction by Gene Expression Monitoring}},
url = {http://www.sciencemag.org/cgi/content/abstract/286/5439/531},
volume = {286},
year = {1999}
}
@phdthesis{MLCC,
  author = {Wilczy{\'{n}}ski, Stanis{\l}aw},
  file   = {:home/stachu/dyskD/ubuntu/programy/master/myBachelor.pdf:pdf},
  pages  = {28},
  school = {University of Wroc{\l}aw},
  title  = {{Reduction of dimensionality by sparse subspace clustering}},
  type   = {Bachelor's thesis},
  year   = {2017}
}
@book{PCA,
abstract = {2nd ed. 1. Introduction -- 2. Properties of Population Principal Components -- 3. Properties of Sample Principal Components -- 4. Interpreting Principal Components: Examples -- 5. Graphical Representation of Data Using Principal Components -- 6. Choosing a Subset of Principal Components or Variables -- 7. Principal Component Analysis and Factor Analysis -- 8. Principal Components in Regression Analysis -- 9. Principal Components Used with Other Multivariate Techniques -- 10. Outlier Detection, Influential Observations and Robust Estimation -- 11. Rotation and Interpretation of Principal Components. 12. PCA for Time Series and Other Non-Independent Data -- 13. Principal Component Analysis for Special Types of Data -- 14. Generalizations and Adaptations of Principal Component Analysis -- A. Computation of Principal Components.},
author = {Jolliffe, I. T.},
edition = {Second},
file = {:home/stachu/dyskD/ubuntu/programy/master/PCA.pdf:pdf},
isbn = {0387954422},
pages = {487},
publisher = {Springer},
title = {{Principal component analysis}},
url = {https://books.google.pl/books/about/Principal{\_}Component{\_}Analysis.html?id={\_}olByCrhjwIC{\&}redir{\_}esc=y},
year = {2002}
}
@article{TumorPLS,
annote = {PLS},
author = {Nguyen, D. V. and Rocke, D. M.},
doi = {10.1093/bioinformatics/18.1.39},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Nguyen, Rocke - 2002 - Tumor classification by partial least squares using microarray gene expression data.pdf:pdf},
issn = {1367-4803},
journal = {Bioinformatics},
month = jan,
number = {1},
pages = {39--50},
title = {{Tumor classification by partial least squares using microarray gene expression data}},
url = {https://academic.oup.com/bioinformatics/article-lookup/doi/10.1093/bioinformatics/18.1.39},
volume = {18},
year = {2002}
}
@article{honey,
abstract = {1. Olivier Ledoit 1. A managing director in the Equities Division of Credit Suisse First Boston in London, UK. (olivier{\{}at{\}}ledoit.net) 2. Michael Wolf 1. A an associate professor of economics and business at the Universitat Pompeu Fabra in Barcelona, Spain. (michael.wolf{\{}at{\}}upf.edu) The central message of this article is that no one should use the sample covariance matrix for portfolio optimization. It is subject to estimation error of the kind most likely to perturb a mean-variance optimizer. Instead, a matrix can be obtained from the sample covariance matrix through a transformation called shrinkage. This tends to pull the most extreme coefficients toward more central values, systematically reducing estimation error when it matters most. Statistically, the challenge is to know the optimal shrinkage intensity. Shrinkage reduces portfolio tracking error relative to a benchmark index, and substantially raises the manager's realized information ratio.},
author = {Ledoit, Olivier and Wolf, Michael},
doi = {10.3905/jpm.2004.110},
file = {:home/stachu/dyskD/ubuntu/programy/master/honey.pdf:pdf},
issn = {0095-4918},
journal = {The Journal of Portfolio Management},
month = {jul},
number = {4},
pages = {110--119},
publisher = {Institutional Investor Journals Umbrella},
title = {{Honey, I Shrunk the Sample Covariance Matrix}},
url = {http://jpm.iijournals.com/lookup/doi/10.3905/jpm.2004.110},
volume = {30},
year = {2004}
}
@article{pesel,
abstract = {We discuss the problem of estimating the number of principal components in principal components analysis (PCA). Despite the importance of the problem and the multitude of solutions proposed in literature, it comes as a surprise that there does not exist a coherent asymptotic framework, which would justify different approaches depending on the actual size of the dataset. In this article, we address this issue by presenting an approximate Bayesian approach based on Laplace approximation and introducing a general method of developing criteria for model selection, called PEnalized SEmi-integrated Likelihood (PESEL). Our general framework encompasses a variety of existing approaches based on probabilistic models, like the Bayesian Information Criterion for Probabilistic PCA (PPCA), and enables the construction of new criteria, depending on the size of the dataset at hand and additional prior information. Specifically, we apply PESEL to derive two new criteria for datasets where the number of variables ...},
archivePrefix = {arXiv},
arxivId = {1606.05333},
author = {Sobczyk, Piotr and Bogdan, Ma{\l}gorzata and Josse, Julie},
doi = {10.1080/10618600.2017.1340302},
eprint = {1606.05333},
file = {:home/stachu/dyskD/ubuntu/programy/master/pesel.pdf:pdf},
issn = {1061-8600},
journal = {Journal of Computational and Graphical Statistics},
keywords = {Bayesian model selection,Dimension estimation,Laplace approximation,Principal component analysis},
month = jun,
number = {4},
pages = {826--839},
publisher = {Taylor {\&} Francis},
title = {{Bayesian dimensionality reduction with PCA using penalized semi-integrated likelihood}},
url = {https://www.tandfonline.com/doi/full/10.1080/10618600.2017.1340302},
volume = {26},
year = {2017}
}
@article{GEO,
abstract = {The Gene Expression Omnibus (GEO) database is an international public repository that archives and freely distributes high-throughput gene expression and other functional genomics data sets. Created in 2000 as a worldwide resource for gene expression studies, GEO has evolved with rapidly changing technologies and now accepts high-throughput data for many other data applications, including those that examine genome methylation, chromatin structure, and genome-protein interactions. GEO supports community-derived reporting standards that specify provision of several critical study elements including raw data, processed data, and descriptive metadata. The database not only provides access to data for tens of thousands of studies, but also offers various Web-based tools and strategies that enable users to locate data relevant to their specific interests, as well as to visualize and analyze the data. This chapter includes detailed descriptions of methods to query and download GEO data and use the analysis and visualization tools. The GEO homepage is at http://www.ncbi.nlm.nih.gov/geo/.},
author = {Clough, Emily and Barrett, Tanya},
doi = {10.1007/978-1-4939-3578-9_5},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Clough, Barrett - 2016 - The Gene Expression Omnibus Database.pdf:pdf},
issn = {1940-6029},
journal = {Methods in molecular biology (Clifton, N.J.)},
keywords = {Data mining,Database,Functional genomics,Gene expression,High-throughput sequencing,Microarray},
pages = {93--110},
pmid = {27008011},
publisher = {NIH Public Access},
title = {{The Gene Expression Omnibus Database.}},
url = {http://www.ncbi.nlm.nih.gov/pubmed/27008011 http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=PMC4944384},
volume = {1418},
year = {2016}
}
@mastersthesis{MasterArts,
abstract = {In cancer research there is a large need to automate parts of the process of diagnosis, this is mainly to reduce cost, make it faster and more accurate. Gene expression data of tumor samples is known to contain much information about the disease and with that information that can help with diagnosing and even curing the patient. Datamining methods are an obvious candidate to aid in this automation and therefore have been deployed on gene expression data in a number of research papers. However, all these researchers face the same problem, the limited amount of samples and the large number of features. Due to the many genes in the human genome and the only few tumor samples that have been processed for their gene expression data, the data set has much more features than there are samples. This makes any type of data analysis hard due to easy overfitting on data with these properties. Thusly, there is a need for a way of effectively reducing the dimensionality of the samples, without removing the information of interest, to enable more effective data analysis of the reduced set. In this thesis, I propose a framework that is capable of reducing the dimensionality of gene expression samples for case-specific purposes. I explore multiple types of dimensionality reduction from basic statistical ones to novel deep learning algorithms. This research concludes with suggesting a combination of multiple linear prediction algorithms for feature selection in a case-specific fashion. With these types of algorithms some problems exist with the selections stability and robustness, I designed a framework aimed at improving these properties of the resulting selection. The framework combines multiple of these algorithms with cross-folding to end up with a sufficiently stable set of features that can be used for further analysis. 
Apart from this main result, the framework produces metrics that indicate the quality of the selection and it does additional genetic analysis and plotting relevant for field experts. Besides proposing and arguing the setup and validity of this framework, the implemented framework is tested on several medically relevant use cases and the results of these tests are presented and analyzed in this thesis as well. These results show the effectiveness of the framework on certain use-cases and the limits of the gene expression data. They prove that the framework is a solution to the proposed problem and show that the framework could add value for medical professionals during their daily practices.},
annote = {Best one, includes pathway enrichment},
author = {Arts, Sako},
file = {:home/stachu/dyskD/ubuntu/programy/master/masterThesis.pdf:pdf},
school = {Eindhoven University of Technology},
title = {{Dimensionality Reduction of Gene Expression Data}},
type = {Master's Thesis},
url = {https://pure.tue.nl/ws/portalfiles/portal/109115723/CSE642{\_}Final{\_}Version{\_}Master{\_}Thesis{\_}v1{\_}0{\_}Sako{\_}Arts.pdf},
year = {2018}
}
@book{deeplearning,
author = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
publisher = {MIT Press},
title = {{Deep Learning}},
url = {http://www.deeplearningbook.org},
year = {2016}
}
@article{TumorClass3,
abstract = {Personalized drug design requires the classification of cancer patients as accurate as possible. With advances in genome sequencing and microarray technology, a large amount of gene expression data has been and will continuously be produced from various cancerous patients. Such cancer-alerted gene expression data allows us to classify tumors at the genomewide level. However, cancer-alerted gene expression datasets typically have much more number of genes (features) than that of samples (patients), which imposes a challenge for classification of tumors. In this paper, a new method is proposed for cancer diagnosis using gene expression data by casting the classification problem as finding sparse representations of test samples with respect to training samples. The sparse representation is computed by the l1 -regularized least square method. To investigate its performance, the proposed method is applied to six tumor gene expression datasets and compared with various support vector machine (SVM) methods. The experimental results have shown that the performance of the proposed method is comparable with or better than those of SVMs. In addition, the proposed method is more efficient than SVMs as it has no need of model selection.},
annote = {Good intro},
author = {Hang, Xiyi and Wu, Fang-Xiang},
doi = {10.1155/2009/403689},
file = {:home/stachu/dyskD/ubuntu/programy/master/popular/SparseRepresentationForGEClassification.pdf:pdf},
issn = {1110-7243},
journal = {Journal of Biomedicine and Biotechnology},
pages = {1--6},
title = {{Sparse Representation for Classification of Tumors Using Gene Expression Data}},
volume = {2009},
year = {2009}
}
@article{matplotlib,
  author  = {Hunter, John D.},
  doi     = {10.1109/MCSE.2007.55},
  issn    = {1521-9615},
  journal = {Computing in Science {\&} Engineering},
  number  = {3},
  pages   = {90--95},
  title   = {{Matplotlib: A 2D Graphics Environment}},
  url     = {http://ieeexplore.ieee.org/document/4160265/},
  volume  = {9},
  year    = {2007}
}
@book{geneExpr,
abstract = {Seventh edition. Known world-wide as the standard introductory text to this important and exciting area, the seventh edition of Gene Cloning and DNA Analysis addresses new and growing areas of research whilst retaining the philosophy of the previous editions. Assuming the reader has little prior knowledge of the subject, its importance, the principles of the techniques used and their applications are all carefully laid out, with over 250 clearly presented four-colour illustrations. In addition to a number of informative changes to the text throughout the book, the chapters on DNA sequencing and genome studies have been rewritten to reflect the continuing rapid developments in this area of DNA analysis: -In depth description of the next generation sequencing methods and descriptions of their applications in studying genomes and transcriptomes -New material on the use of ChiP-seq to locate protein-binding sites -Extended coverage of the strategies used to assemble genome sequences -Description of how the Neanderthal genome has been sequenced and what that sequence tells us about interbreeding between Neanderthals and Homo sapiens Gene Cloning and DNA Analysis remains an essential introductory text to a wide range of biological sciences students; including genetics and genomics, molecular biology, biochemistry, immunology and applied biology. It is also a perfect introductory text for any professional needing to learn the basics of the subject. All libraries in universities where medical, life and biological sciences are studied and taught should have copies available on their shelves.},
author = {Brown, Terence A.},
isbn = {9781119072560},
publisher = {Wiley-Blackwell},
title = {{Gene Cloning and DNA Analysis: An Introduction}},
url = {https://www.wiley.com/en-us/Gene+Cloning+and+DNA+Analysis{\%}3A+An+Introduction{\%}2C+7th+Edition-p-9781119072560},
year = {2016}
}
@article{fDNN,
abstract = {In predictive model development, gene expression data is associated with the unique challenge that the number of samples (n) is much smaller than the amount of features (p). This “n ≪ p” property has prevented classification of gene expression data from deep learning techniques, which have been proved powerful under “n {\textgreater} p” scenarios in other application fields, such as image classification. Further, the sparsity of effective features with unknown correlation structures in gene expression profiles brings more challenges for classification tasks. To tackle these problems, we propose a newly developed classifier named Forest Deep Neural Network (fDNN), to integrate the deep neural network architecture with a supervised forest feature detector. Using this built-in feature detector, the method is able to learn sparse feature representations and feed the representations into a neural network to mitigate the overfitting problem. Simulation experiments and real data analyses using two RNA-seq expression datasets are conducted to evaluate fDNN's capability. The method is demonstrated a useful addition to current predictive models with better classification performance and more meaningful selected features compared to ordinary random forests and deep neural networks.},
author = {Kong, Yunchuan and Yu, Tianwei},
doi = {10.1038/s41598-018-34833-6},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Kong, Yu - 2018 - A Deep Neural Network Model using Random Forest to Extract Feature Representation for Gene Expression Data Classificat.pdf:pdf},
issn = {2045-2322},
journal = {Scientific Reports},
keywords = {Gene expression,Statistics},
month = dec,
number = {1},
pages = {16477},
publisher = {Nature Publishing Group},
title = {{A Deep Neural Network Model using Random Forest to Extract Feature Representation for Gene Expression Data Classification}},
url = {http://www.nature.com/articles/s41598-018-34833-6},
volume = {8},
year = {2018}
}
@book{dataClassification,
abstract = {This book homes in on three primary aspects of data classification: the core methods for data classification including probabilistic classification, decision trees, rule-based methods, and SVM methods; different problem domains and scenarios such as multimedia data, text data, biological data, categorical data, network data, data streams and uncertain data: and different variations of the classification problem such as ensemble methods, visual methods, transfer learning, semi-supervised methods and active learning. These advanced methods can be used to enhance the quality of the underlying classification results.},
author = {Aggarwal, Charu C.},
edition = {First},
file = {:home/stachu/dyskD/ubuntu/programy/master/popular/Data Classification.pdf:pdf},
isbn = {9781466586741},
publisher = {Chapman and Hall/CRC},
title = {{Data Classification: Algorithms and Applications}},
url = {https://www.crcpress.com/Data-Classification-Algorithms-and-Applications/Aggarwal/p/book/9781466586741},
year = {2014}
}
@inproceedings{pytorch,
author = {Paszke, Adam and Gross, Sam and Chintala, Soumith and Chanan, Gregory and Yang, Edward and DeVito, Zachary and Lin, Zeming and Desmaison, Alban and Antiga, Luca and Lerer, Adam},
booktitle = {NIPS-W},
title = {{Automatic differentiation in PyTorch}},
year = {2017}
}
@article{dataOrigin,
abstract = {Breast cancer is a heterogeneous disease and has been classified into five molecular subtypes based on gene expression profiles. Signaling processes linked to different breast cancer molecular subtypes and different clinical outcomes are still poorly understood. Aberrant regulation of Wnt signaling has been implicated in breast cancer progression. In particular Ror1/2 receptors and several other members of the non-canonical Wnt signaling pathway were associated with aggressive breast cancer behavior. However, Wnt signals are mediated via multiple complex pathways, and it is clinically important to determine which particular Wnt cascades, including their domains and targets, are deregulated in poor prognosis breast cancer. To investigate activation and outcome of the Ror2-dependent non-canonical Wnt signaling pathway, we overexpressed the Ror2 receptor in MCF-7 and MDA-MB231 breast cancer cells, stimulated the cells with its ligand Wnt5a, and we knocked-down Ror1 in MDA-MB231 cells. We measured the invasive capacity of perturbed cells to assess phenotypic changes, and mRNA was profiled to quantify gene expression changes. Differentially expressed genes were integrated into a literature-based non-canonical Wnt signaling network. The results were further used in the analysis of an independent dataset of breast cancer patients with metastasis-free survival annotation. Overexpression of the Ror2 receptor, stimulation with Wnt5a, as well as the combination of both perturbations enhanced invasiveness of MCF-7 cells. The expression-responsive targets of Ror2 overexpression in MCF-7 induced a Ror2/Wnt module of the non-canonical Wnt signaling pathway. These targets alter regulation of other pathways involved in cell remodeling processing and cell metabolism. Furthermore, the genes of the Ror2/Wnt module were assessed as a gene signature in patient gene expression data and showed an association with clinical outcome. 
In summary, results of this study indicate a role of a newly defined Ror2/Wnt module in breast cancer progression and present a link between Ror2 expression and increased cell invasiveness.},
author = {Bayerlov{\'{a}}, Michaela and Menck, Kerstin and Klemm, Florian and Wolff, Alexander and Pukrop, Tobias and Binder, Claudia and Bei{\ss}barth, Tim and Bleckmann, Annalen},
doi = {10.3389/fonc.2017.00135},
file = {:home/stachu/dyskD/ubuntu/programy/master/greg/Ror2{\_}Signaling{\_}and{\_}Its{\_}Relevance{\_}in{\_}Breast{\_}Cancer{\_}.pdf:pdf},
issn = {2234-943X},
journal = {Frontiers in Oncology},
keywords = {Ror2,Wnt signaling,breast cancer,metastasis,module,network integration},
month = jun,
pages = {135},
pmid = {28695110},
title = {{Ror2 Signaling and Its Relevance in Breast Cancer Progression}},
url = {http://www.ncbi.nlm.nih.gov/pubmed/28695110 http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=PMC5483589 http://journal.frontiersin.org/article/10.3389/fonc.2017.00135/full},
volume = {7},
year = {2017}
}
@manual{samManual,
annote = {R package version 3.0},
author = {Tibshirani, R and Seo, Michael J and Chu, G and Narasimhan, Balasubramanian and Li, Jun},
file = {:home/stachu/dyskD/ubuntu/programy/master/samManual.pdf:pdf},
title = {{samr: SAM: Significance Analysis of Microarrays}},
url = {https://cran.r-project.org/package=samr},
year = {2018}
}
@article{PLSover,
abstract = {Partial least squares (PLS) is an efficient statistical regression technique that is highly suited for the analysis of genomic and proteomic data. In this article, we review both the theory underlying PLS as well as a host of bioinformatics applications of PLS. In particular, we provide a systematic comparison of the PLS approaches currently employed, and discuss analysis problems as diverse as, e.g. tumor classification from transcriptome data, identification of relevant genes, survival analysis and modeling of gene networks and transcription factor activities.},
author = {Boulesteix, Anne-Laure and Strimmer, Korbinian},
doi = {10.1093/bib/bbl016},
file = {:home/stachu/dyskD/ubuntu/programy/master/PLSOverview.pdf:pdf},
issn = {1467-5463},
journal = {Briefings in Bioinformatics},
month = may,
number = {1},
pages = {32--44},
pmid = {16772269},
title = {{Partial least squares: a versatile tool for the analysis of high-dimensional genomic data}},
url = {http://www.ncbi.nlm.nih.gov/pubmed/16772269 https://academic.oup.com/bib/article-lookup/doi/10.1093/bib/bbl016},
volume = {8},
year = {2006}
}
@article{RF,
author = {Breiman, Leo},
doi = {10.1023/A:1010933404324},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Breiman - 2001 - Random Forests.pdf:pdf},
issn = {0885-6125},
journal = {Machine Learning},
number = {1},
pages = {5--32},
publisher = {Kluwer Academic Publishers},
title = {{Random Forests}},
url = {http://link.springer.com/10.1023/A:1010933404324},
volume = {45},
year = {2001}
}
@article{TumorClusts,
abstract = {Gene expression data are the representation of nonlinear interactions among genes and environmental factors. Computing analysis of these data is expected to gain knowledge of gene functions and disease mechanisms. Clustering is a classical exploratory technique of discovering similar expression patterns and function modules. However, gene expression data are usually of high dimensions and relatively small samples, which results in the main difficulty for the application of clustering algorithms. Principal component analysis (PCA) is usually used to reduce the data dimensions for further clustering analysis. While PCA estimates the similarity between expression profiles based on the Euclidean distance, which cannot reveal the nonlinear connections between genes. This paper uses nonlinear dimensionality reduction (NDR) as a preprocessing strategy for feature selection and visualization, and then applies clustering algorithms to the reduced feature spaces. In order to estimate the effectiveness of NDR for capturing biologically relevant structures, the comparative analysis between NDR and PCA is exploited to five real cancer expression datasets. Results show that NDR can perform better than PCA in visualization and clustering analysis of complex gene expression data. {\textcopyright} 2010 Elsevier Ltd.},
author = {Shi, Jinlong and Luo, Zhigang},
doi = {10.1016/j.compbiomed.2010.06.007},
file = {:home/stachu/dyskD/ubuntu/programy/master/popular/nonlinearDimRedForClassification.pdf:pdf},
issn = {0010-4825},
journal = {Computers in Biology and Medicine},
keywords = {Cancer tissue,Clustering analysis,Gene expression,Nonlinear dimensionality reduction,Visualization},
number = {8},
pages = {723--732},
publisher = {Elsevier},
title = {{Nonlinear dimensionality reduction of gene expression data for visualization and clustering analysis of cancer tissue samples}},
url = {http://dx.doi.org/10.1016/j.compbiomed.2010.06.007},
volume = {40},
year = {2010}
}
@article{MetastasisScores,
abstract = {Mapping the pathways that give rise to metastasis is one of the key challenges of breast cancer research. Recently, several large-scale studies have shed light on this problem through analysis of gene expression profiles to identify markers correlated with metastasis. Here, we apply a protein-network-based approach that identifies markers not as individual genes but as subnetworks extracted from protein interaction databases. The resulting subnetworks provide novel hypotheses for pathways involved in tumor progression. Although genes with known breast cancer mutations are typically not detected through analysis of differential expression, they play a central role in the protein network by interconnecting many differentially expressed genes. We find that the subnetwork markers are more reproducible than individual marker genes selected without network information, and that they achieve higher accuracy in the classification of metastatic versus non-metastatic tumors.},
annote = {A bit different - uses prior knowledge about some markers, structure - however states the scores under some conditions (USEFUL)},
author = {Chuang, Han Yu and Lee, Eunjung and Liu, Yu Tsueng and Lee, Doheon and Ideker, Trey},
doi = {10.1038/msb4100180},
file = {:home/stachu/dyskD/ubuntu/programy/master/metastasis/network{\_}metasis{\_}with{\_}percentage{\_}accuracy.pdf:pdf},
issn = {1744-4292},
journal = {Molecular Systems Biology},
keywords = {Breast cancer metastasis,Classification,Microarrays,Pathways,Protein networks},
number = {140},
pages = {1--10},
title = {{Network-based classification of breast cancer metastasis}},
volume = {3},
year = {2007}
}
@article{Correlation,
author = {Toloşi, Laura and Lengauer, Thomas},
doi = {10.1093/bioinformatics/btr300},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Toloşi, Lengauer - 2011 - Classification with correlated features unreliability of feature ranking and solutions.pdf:pdf},
issn = {1367-4803},
journal = {Bioinformatics},
month = jul,
number = {14},
pages = {1986--1994},
publisher = {Oxford University Press},
title = {{Classification with correlated features: unreliability of feature ranking and solutions}},
url = {https://academic.oup.com/bioinformatics/article-lookup/doi/10.1093/bioinformatics/btr300},
volume = {27},
year = {2011}
}
@book{ESL2,
address = {New York, NY},
author = {Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome},
doi = {10.1007/978-0-387-84858-7},
file = {:home/stachu/dyskD/ubuntu/programy/master/popular/ESLII.pdf:pdf},
isbn = {978-0-387-84857-0},
publisher = {Springer New York},
series = {Springer Series in Statistics},
title = {{The Elements of Statistical Learning}},
url = {http://link.springer.com/10.1007/978-0-387-84858-7},
year = {2009}
}
@article{MetastasisComparison,
abstract = {In many microarray studies, classifiers have been constructed based on gene signatures to predict clinical outcomes for various cancer sufferers. However, signatures originating from different studies often suffer from poor robustness when used in the classification of datasets independent from which they were generated from. In this paper, we present an unsupervised feature learning framework by integrating a principal component analysis algorithm and autoencoder neural network to identify different characteristics from gene expression profiles. As the foundation for the obtained features, an ensemble classifier based on the AdaBoost algorithm (PCA-AE-Ada) was constructed to predict clinical outcomes in breast cancer. During the experiments, we established an additional classifier with the same classifier learning strategy (PCA-Ada) in order to perform as a baseline to the proposed method, where the only difference is the training inputs. The AUC (area under the receiver operating characteristic curve) index, MCC (Matthews correlation coefficient) index, ACC (accuracy), and other evaluation parameters of the proposed method were tested on several independent breast cancer datasets and compared with representative gene signature-based algorithms including the baseline method. Experimental results demonstrate that the proposed method using deep learning techniques performs better than others.},
author = {Zhang, Dejun and Zou, Lu and Zhou, Xionghui and He, Fazhi},
doi = {10.1109/ACCESS.2018.2837654},
file = {:home/stachu/dyskD/ubuntu/programy/master/metastasis/IEEE Xplore Full-Text PDF{\_}.html:html},
issn = {2169-3536},
journal = {IEEE Access},
keywords = {Cancer prognosis,deep learning,ensemble classifier,principal component analysis},
month = may,
pages = {28936--28944},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
title = {{Integrating Feature Selection and Feature Extraction Methods with Deep Learning to Predict Clinical Outcome of Breast Cancer}},
volume = {6},
year = {2018}
}
@article{Metastasis4,
abstract = {Breast cancer starts as a local disease, but it can metastasize to the lymph nodes and distant organs. At primary diagnosis, prognostic markers are used to assess whether the transition to systemic disease is likely to have occurred. The prevailing model of metastasis reflects this view--it suggests that metastatic capacity is a late, acquired event in tumorigenesis. Others have proposed the idea that breast cancer is intrinsically a systemic disease. New molecular technologies, such as DNA microarrays, support the idea that metastatic capacity might be an inherent feature of breast tumours. These data have important implications for prognosis prediction and our understanding of metastasis.},
annote = {Purely medical},
author = {Weigelt, Britta and Peterse, Johannes L. and {Van't Veer}, Laura J.},
doi = {10.1038/nrc1670},
file = {:home/stachu/dyskD/ubuntu/programy/master/metastasis/Breast{\_}cancer{\_}metastasis{\_}Markers{\_}and{\_}Models.pdf:pdf},
issn = {1474-175X},
journal = {Nature Reviews Cancer},
number = {8},
pages = {591--602},
title = {{Breast cancer metastasis: Markers and models}},
volume = {5},
year = {2005}
}
@book{ISL,
abstract = {An Introduction to Statistical Learning provides an accessible overview of the field of statistical learning, an essential toolset for making sense of the vast and complex data sets that have emerged in fields ranging from biology to finance to marketing to astrophysics in the past twenty years. This book presents some of the most important modeling and prediction techniques, along with relevant applications. Topics include linear regression, classification, resampling methods, shrinkage approaches, tree-based methods, support vector machines, clustering, and more. Color graphics and real-world examples are used to illustrate the methods presented. Since the goal of this textbook is to facilitate the use of these statistical learning techniques by practitioners in science, industry, and other fields, each chapter contains a tutorial on implementing the analyses and methods presented in R, an extremely popular open source statistical software platform. Two of the authors co-wrote The Elements of Statistical Learning (Hastie, Tibshirani and Friedman, 2nd edition 2009), a popular reference book for statistics and machine learning researchers. An Introduction to Statistical Learning covers many of the same topics, but at a level accessible to a much broader audience. This book is targeted at statisticians and non-statisticians alike who wish to use cutting-edge statistical learning techniques to analyze their data. The text assumes only a previous course in linear regression and no knowledge of matrix algebra.},
author = {James, Gareth and Witten, Daniela and Hastie, Trevor and Tibshirani, Robert},
doi = {10.1007/978-1-4614-7138-7},
file = {:home/stachu/dyskD/ubuntu/programy/master/popular/ISLR Seventh Printing.pdf:pdf},
isbn = {978-1-4614-7137-0},
issn = {0929-8673},
pmid = {10911016},
publisher = {Springer Publishing Company, Incorporated},
title = {{An Introduction to Statistical Learning: With Applications in R}},
url = {http://link.springer.com/10.1007/978-1-4614-7138-7},
year = {2014}
}
@incollection{TumorClass4,
address = {Berlin, Heidelberg},
author = {Lee, George and Rodriguez, Carlos and Madabhushi, Anant},
booktitle = {Bioinformatics Research and Applications},
doi = {10.1007/978-3-540-72031-7_16},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Lee, Rodriguez, Madabhushi - 2007 - An Empirical Comparison of Dimensionality Reduction Methods for Classifying Gene and Protein Express.pdf:pdf},
pages = {170--181},
publisher = {Springer Berlin Heidelberg},
title = {{An Empirical Comparison of Dimensionality Reduction Methods for Classifying Gene and Protein Expression Datasets}},
url = {http://link.springer.com/10.1007/978-3-540-72031-7{\_}16},
year = {2007}
}
@article{NCBI2,
abstract = {The Gene Expression Omnibus (GEO, http://www.ncbi.nlm.nih.gov/geo/) is an international public repository for high-throughput microarray and next-generation sequence functional genomic data sets submitted by the research community. The resource supports archiving of raw data, processed data and metadata which are indexed, cross-linked and searchable. All data are freely available for download in a variety of formats. GEO also provides several web-based tools and strategies to assist users to query, analyse and visualize data. This article reports current status and recent database developments, including the release of GEO2R, an R-based web application that helps users analyse GEO data.},
author = {Barrett, Tanya and Wilhite, Stephen E. and Ledoux, Pierre and Evangelista, Carlos and Kim, Irene F. and Tomashevsky, Maxim and Marshall, Kimberly A. and Phillippy, Katherine H. and Sherman, Patti M. and Holko, Michelle and Yefanov, Andrey and Lee, Hyeseung and Zhang, Naigong and Robertson, Cynthia L. and Serova, Nadezhda and Davis, Sean and Soboleva, Alexandra},
doi = {10.1093/nar/gks1193},
file = {:home/stachu/dyskD/ubuntu/programy/master/greg/NCBIGeo.pdf:pdf},
issn = {0305-1048},
journal = {Nucleic Acids Research},
month = nov,
number = {D1},
pages = {D991--D995},
pmid = {23193258},
title = {{NCBI GEO: archive for functional genomics data sets—update}},
url = {http://www.ncbi.nlm.nih.gov/pubmed/23193258 http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=PMC3531084 http://academic.oup.com/nar/article/41/D1/D991/1067995/NCBI-GEO-archive-for-functional-genomics-data},
volume = {41},
year = {2012}
}
@article{TumorsClass1,
abstract = {A reliable and precise classification of tumors is essential for successful diagnosis and treatment of cancer. cDNA microarrays and high-density oligonucleotide chips are novel biotechnologies increasingly used in cancer research. By allowing the monitoring of expression levels in cells for thousands of genes simultaneously, microarray experiments may lead to a more complete understanding of the molecular variations among tumors and hence to a finer and more informative classification. The ability to successfully distinguish between tumor classes (already known or yet to be discovered) using gene expression data is an important aspect of this novel approach to cancer classification. This article compares the performance of different discrimination methods for the classification of tumors based on gene expression data. The methods include nearest-neighbor classifiers, linear discriminant analysis, and classification trees. Recent machine learning approaches, such as bagging and boosting, are also considere...},
annote = {Tumors classification 1},
author = {Dudoit, Sandrine and Fridlyand, Jane and Speed, Terence P},
doi = {10.1198/016214502753479248},
file = {:home/stachu/dyskD/ubuntu/programy/master/ComparisonOfDiscriminative.pdf:pdf},
issn = {0162-1459},
journal = {Journal of the American Statistical Association},
keywords = {Cancer,Discriminant analysis,Microarray experiment,Supervised learning,Tumor classification,Variable selection},
month = mar,
number = {457},
pages = {77--87},
publisher = {Taylor {\&} Francis},
title = {{Comparison of Discrimination Methods for the Classification of Tumors Using Gene Expression Data}},
url = {http://www.tandfonline.com/doi/abs/10.1198/016214502753479248},
volume = {97},
year = {2002}
}
@article{ModelSelection,
abstract = {The correct use of model evaluation, model selection, and algorithm selection techniques is vital in academic machine learning research as well as in many industrial settings. This article reviews different techniques that can be used for each of these three subtasks and discusses the main advantages and disadvantages of each technique with references to theoretical and empirical studies. Further, recommendations are given to encourage best yet feasible practices in research and applications of machine learning. Common methods such as the holdout method for model evaluation and selection are covered, which are not recommended when working with small datasets. Different flavors of the bootstrap technique are introduced for estimating the uncertainty of performance estimates, as an alternative to confidence intervals via normal approximation if bootstrapping is computationally feasible. Common cross-validation techniques such as leave-one-out cross-validation and k-fold cross-validation are reviewed, the bias-variance trade-off for choosing k is discussed, and practical tips for the optimal choice of k are given based on empirical evidence. Different statistical tests for algorithm comparisons are presented, and strategies for dealing with multiple comparisons such as omnibus tests and multiple-comparison corrections are discussed. Finally, alternative methods for algorithm selection, such as the combined F-test 5x2 cross-validation and nested cross-validation, are recommended for comparing machine learning algorithms when datasets are small.},
archivePrefix = {arXiv},
arxivId = {1811.12808},
author = {Raschka, Sebastian},
eprint = {1811.12808},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Raschka - 2018 - Model Evaluation, Model Selection, and Algorithm Selection in Machine Learning.pdf:pdf},
month = nov,
title = {{Model Evaluation, Model Selection, and Algorithm Selection in Machine Learning}},
url = {http://arxiv.org/abs/1811.12808},
year = {2018}
}
@manual{varclust,
annote = {R package version 0.9.4},
author = {Sobczyk, Piotr and Wilczynski, Stanislaw and Josse, Julie and Bogdan, Malgorzata},
title = {{varclust: Variables Clustering}},
url = {https://cran.r-project.org/package=varclust},
year = {2019}
}
@article{decisionThresh,
abstract = {Standard classification algorithms are generally designed to maximize the number of correct predictions (concordance). The criterion of maximizing the concordance may not be appropriate in certain applications. In practice, some applications may emphasize high sensitivity (e.g., clinical diagnostic tests) and others may emphasize high specificity (e.g., epidemiology screening studies). This paper considers effects of the decision threshold on sensitivity, specificity, and concordance for four classification methods: logistic regression, classification tree, Fisher's linear discriminant analysis, and a weighted k-nearest neighbor. We investigated the use of decision threshold adjustment to improve performance of either sensitivity or specificity of a classifier under specific conditions. We conducted a Monte Carlo simulation showing that as the decision threshold increases, the sensitivity decreases and the specificity increases; but, the concordance values in an interval around the maximum concordance are similar. For specified sensitivity and specificity levels, an optimal decision threshold might be determined in an interval around the maximum concordance that meets the specified requirement. Three example data sets were analyzed for illustrations.},
author = {Chen, J. J. and Tsai, C.-A. and Moon, H. and Ahn, H. and Young, J. J. and Chen, C.-H.},
doi = {10.1080/10659360600787700},
file = {:home/stachu/dyskD/ubuntu/programy/master/Decisionthreshold.pdf:pdf},
issn = {1062-936X},
journal = {SAR and QSAR in Environmental Research},
month = jun,
number = {3},
pages = {337--352},
pmid = {16815772},
title = {{Decision threshold adjustment in class prediction}},
url = {http://www.ncbi.nlm.nih.gov/pubmed/16815772 http://www.tandfonline.com/doi/abs/10.1080/10659360600787700},
volume = {17},
year = {2006}
}
@article{RDA,
author = {Friedman, Jerome H.},
doi = {10.2307/2289860},
file = {:home/stachu/dyskD/ubuntu/programy/master/RDA.pdf:pdf},
issn = {0162-1459},
journal = {Journal of the American Statistical Association},
month = mar,
number = {405},
pages = {165--175},
title = {{Regularized Discriminant Analysis}},
url = {https://www.jstor.org/stable/2289860?origin=crossref},
volume = {84},
year = {1989}
}
@article{cvOverfit,
author = {Cawley, Gavin C. and Talbot, Nicola L. C.},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Cawley, Talbot - 2010 - On Over-fitting in Model Selection and Subsequent Selection Bias in Performance Evaluation.pdf:pdf},
issn = {1533-7928},
journal = {Journal of Machine Learning Research},
number = {Jul},
pages = {2079--2107},
title = {{On Over-fitting in Model Selection and Subsequent Selection Bias in Performance Evaluation}},
url = {http://jmlr.csail.mit.edu/papers/v11/cawley10a.html},
volume = {11},
year = {2010}
}
@misc{NHGRI,
abstract = {The National Human Genome Research Institute (NHGRI) has produced this series of fact sheets to explain complex concepts in genomics research to a non-scientific audience. Teachers, students and the general public alike will find the materials clearly written and easy to understand.},
author = {{National Human Genome Research Institute}},
howpublished = {https://www.genome.gov/about-genomics/fact-sheets},
title = {{Fact Sheets about Genomics}},
url = {https://www.genome.gov/about-genomics/fact-sheets},
year = {2015}
}
@article{TumorClass2,
abstract = {The classification of different tumor types is of great importance in cancer diagnosis and drug discovery. However, most previous cancer classification studies are clinical based and have limited diagnostic ability. Cancer classification using gene expression data is known to contain the keys for addressing the fundamental problems relating to cancer diagnosis and drug discovery. The recent advent of DNA microarray technique has made simultaneous monitoring of thousands of gene expressions possible. With this abundance of gene expression data, researchers have started to explore the possibilities of cancer classification using gene expression data. Quite a number of methods have been proposed in recent years with promising results. But there are still a lot of issues which need to be addressed and understood. In order to gain a deep insight into the cancer classification problem, it is necessary to take a closer look at the problem, the proposed solutions and the related issues all together. In this survey paper, we present a comprehensive overview of various proposed cancer classification methods and evaluate them based on their computation time, classification accuracy and ability to reveal biologically meaningful gene information. We also introduce and evaluate various proposed gene selection methods which we believe should be an integral preprocessing step for cancer classification. In order to obtain a full picture of cancer classification, we also discuss several issues related to cancer classification, including the biological significance vs. statistical significance of a cancer classifier, the asymmetrical classification errors for cancer classifiers, and the gene contamination problem.},
annote = {Tumor classification 2},
author = {Lu, Ying and Han, Jiawei},
doi = {10.1016/S0306-4379(02)00072-8},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Lu, Han - 2003 - Cancer classification using gene expression data.pdf:pdf},
issn = {0306-4379},
journal = {Information Systems},
month = jun,
number = {4},
pages = {243--268},
publisher = {Pergamon},
title = {Cancer classification using gene expression data},
url = {https://www.sciencedirect.com/science/article/pii/S0306437902000728},
volume = {28},
year = {2003}
}
@article{sklearn2,
abstract = {Scikit-learn is an increasingly popular machine learning li- brary. Written in Python, it is designed to be simple and efficient, accessible to non-experts, and reusable in various contexts. In this paper, we present and discuss our design choices for the application programming interface (API) of the project. In particular, we describe the simple and elegant interface shared by all learning and processing units in the library and then discuss its advantages in terms of composition and reusability. The paper also comments on implementation details specific to the Python ecosystem and analyzes obstacles faced by users and developers of the library.},
archivePrefix = {arXiv},
arxivId = {1309.0238},
author = {Buitinck, Lars and Louppe, Gilles and Blondel, Mathieu and Pedregosa, Fabian and Mueller, Andreas and Grisel, Olivier and Niculae, Vlad and Prettenhofer, Peter and Gramfort, Alexandre and Grobler, Jaques and Layton, Robert and Vanderplas, Jake and Joly, Arnaud and Holt, Brian and Varoquaux, Ga{\"{e}}l},
eprint = {1309.0238},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Buitinck et al. - 2013 - API design for machine learning software experiences from the scikit-learn project.pdf:pdf},
month = sep,
title = {{API} design for machine learning software: experiences from the scikit-learn project},
url = {http://arxiv.org/abs/1309.0238},
year = {2013}
}
@article{quantileNorm,
abstract = {MOTIVATION When running experiments that involve multiple high density oligonucleotide arrays, it is important to remove sources of variation between arrays of non-biological origin. Normalization is a process for reducing this variation. It is common to see non-linear relations between arrays and the standard normalization provided by Affymetrix does not perform well in these situations. RESULTS We present three methods of performing normalization at the probe intensity level. These methods are called complete data methods because they make use of data from all arrays in an experiment to form the normalizing relation. These algorithms are compared to two methods that make use of a baseline array: a one number scaling based algorithm and a method that uses a non-linear normalizing relation by comparing the variability and bias of an expression measure. Two publicly available datasets are used to carry out the comparisons. The simplest and quickest complete data method is found to perform favorably. AVAILABILITY Software implementing all three of the complete data normalization methods is available as part of the R package Affy, which is a part of the Bioconductor project http://www.bioconductor.org. SUPPLEMENTARY INFORMATION Additional figures may be found at http://www.stat.berkeley.edu/{\~{}}bolstad/normalize/index.html},
author = {Bolstad, Benjamin M. and Irizarry, Rafael A. and {\AA}strand, Magnus and Speed, Terence P.},
doi = {10.1093/bioinformatics/19.2.185},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Bolstad et al. - Unknown - A Comparison of Normalization Methods for High Density Oligonucleotide Array Data Based on Variance and Bias.pdf:pdf},
issn = {1367-4803},
journal = {Bioinformatics},
month = jan,
number = {2},
pages = {185--193},
pmid = {12538238},
title = {A Comparison of Normalization Methods for High Density Oligonucleotide Array Data Based on Variance and Bias},
url = {https://academic.oup.com/bioinformatics/article-lookup/doi/10.1093/bioinformatics/19.2.185},
volume = {19},
year = {2003}
}
@article{FeatureSelection,
abstract = {We summarise various ways of performing dimensionality reduction on high-dimensional microarray data. Many different feature selection and feature extraction methods exist and they are being widely used. All these methods aim to remove redundant and irrelevant features so that classification of new instances will be more accurate. A popular source of data is microarrays, a biological platform for gathering gene expressions. Analysing microarrays can be difficult due to the size of the data they provide. In addition the complicated relations among the different genes make analysis more difficult and removing excess features can improve the quality of the results. We present some of the most popular methods for selecting significant features and provide a comparison between them. Their advantages and disadvantages are outlined in order to provide a clearer idea of when to use each one of them for saving computational time and resources.},
author = {Hira, Zena M. and Gillies, Duncan F.},
doi = {10.1155/2015/198363},
file = {:home/stachu/dyskD/ubuntu/programy/master/popular/FearueSelectionOverview.pdf:pdf},
issn = {1687-8027},
journal = {Advances in Bioinformatics},
number = {1},
pages = {1--13},
publisher = {Hindawi Publishing Corporation},
title = {A Review of Feature Selection and Feature Extraction Methods Applied on Microarray Data},
volume = {2015},
year = {2015}
}
@article{Lasso,
author = {Tibshirani, Robert},
doi = {10.2307/2346101},
file = {:home/stachu/dyskD/ubuntu/programy/master/popular/LassoShrinkage.pdf:pdf},
journal = {Journal of the Royal Statistical Society. Series B (Methodological)},
number = {1},
pages = {267--288},
title = {Regression Shrinkage and Selection via the Lasso},
volume = {58},
year = {1996}
}
@article{SAM,
abstract = {Microarrays can measure the expression of thousands of genes to identify changes in expression between different biological states. Methods are needed to determine the significance of these changes while accounting for the enormous number of genes. We describe a method, Significance Analysis of Microarrays (SAM), that assigns a score to each gene on the basis of change in gene expression relative to the standard deviation of repeated measurements. For genes with scores greater than an adjustable threshold, SAM uses permutations of the repeated measurements to estimate the percentage of genes identified by chance, the false discovery rate (FDR). When the transcriptional response of human cells to ionizing radiation was measured by microarrays, SAM identified 34 genes that changed at least 1.5-fold with an estimated FDR of 12{\%}, compared with FDRs of 60 and 84{\%} by using conventional methods of analysis. Of the 34 genes, 19 were involved in cell cycle regulation and 3 in apoptosis. Surprisingly, four nucleotide excision repair genes were induced, suggesting that this repair pathway for UV-damaged DNA might play a previously unrecognized role in repairing DNA damaged by ionizing radiation.},
author = {Tusher, V G and Tibshirani, R and Chu, G},
doi = {10.1073/pnas.091062498},
file = {:home/stachu/dyskD/ubuntu/programy/master/SAM.pdf:pdf},
issn = {0027-8424},
journal = {Proceedings of the National Academy of Sciences of the United States of America},
month = apr,
number = {9},
pages = {5116--5121},
pmid = {11309499},
publisher = {National Academy of Sciences},
title = {Significance analysis of microarrays applied to the ionizing radiation response},
url = {http://www.ncbi.nlm.nih.gov/pubmed/11309499},
volume = {98},
year = {2001}
}
@article{saga,
abstract = {In this work we introduce a new optimisation method called SAGA in the spirit of SAG, SDCA, MISO and SVRG, a set of recently proposed incremental gradient algorithms with fast linear convergence rates. SAGA improves on the theory behind SAG and SVRG, with better theoretical convergence rates, and has support for composite objectives where a proximal operator is used on the regulariser. Unlike SDCA, SAGA supports non-strongly convex problems directly, and is adaptive to any inherent strong convexity of the problem. We give experimental results showing the effectiveness of our method.},
archivePrefix = {arXiv},
arxivId = {1407.0202},
author = {Defazio, Aaron and Bach, Francis R and Lacoste-Julien, Simon},
eprint = {1407.0202},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Defazio, Bach, Lacoste-Julien - 2014 - SAGA A Fast Incremental Gradient Method With Support for Non-Strongly Convex Composite Objectives.pdf:pdf},
journal = {CoRR},
month = jul,
title = {{SAGA}: A Fast Incremental Gradient Method With Support for Non-Strongly Convex Composite Objectives},
url = {http://arxiv.org/abs/1407.0202},
volume = {abs/1407.0202},
year = {2014}
}
@article{nestedCV,
abstract = {BACKGROUND Cross-validation (CV) is an effective method for estimating the prediction error of a classifier. Some recent articles have proposed methods for optimizing classifiers by choosing classifier parameter values that minimize the CV error estimate. We have evaluated the validity of using the CV error estimate of the optimized classifier as an estimate of the true error expected on independent data. RESULTS We used CV to optimize the classification parameters for two kinds of classifiers; Shrunken Centroids and Support Vector Machines (SVM). Random training datasets were created, with no difference in the distribution of the features between the two classes. Using these "null" datasets, we selected classifier parameter values that minimized the CV error estimate. 10-fold CV was used for Shrunken Centroids while Leave-One-Out-CV (LOOCV) was used for the SVM. Independent test data was created to estimate the true error. With "null" and "non null" (with differential expression between the classes) data, we also tested a nested CV procedure, where an inner CV loop is used to perform the tuning of the parameters while an outer CV is used to compute an estimate of the error. The CV error estimate for the classifier with the optimal parameters was found to be a substantially biased estimate of the true error that the classifier would incur on independent data. Even though there is no real difference between the two classes for the "null" datasets, the CV error estimate for the Shrunken Centroid with the optimal parameters was less than 30{\%} on 18.5{\%} of simulated training data-sets. For SVM with optimal parameters the estimated error rate was less than 30{\%} on 38{\%} of "null" data-sets. Performance of the optimized classifiers on the independent test set was no better than chance. 
The nested CV procedure reduces the bias considerably and gives an estimate of the error that is very close to that obtained on the independent testing set for both Shrunken Centroids and SVM classifiers for "null" and "non-null" data distributions. CONCLUSION We show that using CV to compute an error estimate for a classifier that has itself been tuned using CV gives a significantly biased estimate of the true error. Proper use of CV for estimating true error of a classifier developed using a well defined algorithm requires that all steps of the algorithm, including classifier parameter tuning, be repeated in each CV loop. A nested CV procedure provides an almost unbiased estimate of the true error.},
author = {Varma, Sudhir and Simon, Richard},
doi = {10.1186/1471-2105-7-91},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Varma, Simon - 2006 - Bias in error estimation when using cross-validation for model selection.pdf:pdf},
issn = {1471-2105},
journal = {BMC Bioinformatics},
month = feb,
pages = {91},
pmid = {16504092},
publisher = {BioMed Central},
title = {Bias in error estimation when using cross-validation for model selection},
url = {http://www.ncbi.nlm.nih.gov/pubmed/16504092},
volume = {7},
year = {2006}
}
@article{liblinear,
author = {Fan, Rong-En and Chang, Kai-Wei and Hsieh, Cho-Jui and Wang, Xiang-Rui and Lin, Chih-Jen},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Fan et al. - 2008 - LIBLINEAR A Library for Large Linear Classification.pdf:pdf},
issn = {1533-7928},
journal = {Journal of Machine Learning Research},
number = {Aug},
pages = {1871--1874},
title = {{LIBLINEAR}: A Library for Large Linear Classification},
url = {http://www.jmlr.org/papers/v9/fan08a.html},
volume = {9},
year = {2008}
}
@article{BreastCancerClassification,
abstract = {Comprehensive gene expression patterns generated from cDNA microarrays were correlated with detailed clinico-pathological characteristics and clinical outcome in an unselected group of 99 node-negative and node-positive breast cancer patients. Gene expression patterns were found to be strongly associated with estrogen receptor (ER) status and moderately associated with grade, but not associated with menopausal status, nodal status, or tumor size. Hierarchical cluster analysis segregated the tumors into two main groups based on their ER status, which correlated well with basal and luminal characteristics. Cox proportional hazards regression analysis identified 16 genes that were significantly associated with relapse-free survival at a stringent significance level of 0.001 to account for multiple comparisons. Of 231 genes previously reported by others [van't Veer, L. J., et al. (2002) Nature 415, 530-536] as being associated with survival, 93 probe elements overlapped with the set of 7,650 probe elements represented on the arrays used in this study. Hierarchical cluster analysis based on the set of 93 probe elements segregated our population into two distinct subgroups with different relapse-free survival (P {\textless} 0.03). The number of these 93 probe elements showing significant univariate association with relapse-free survival (P {\textless} 0.05) in the present study was 14, representing 11 unique genes. Genes involved in cell cycle, DNA replication, and chromosomal stability were consistently elevated in the various poor prognostic groups. In addition, glutathione S-transferase M3 emerged as an important survival marker in both studies. When taken together with other array studies, our results highlight the consistent biological and clinical associations with gene expression profiles.},
annote = {Very famous study - clasifing types of breast cancer (only mentioned metastasis)},
author = {Sotiriou, C. and Neo, S.-Y. and McShane, L. M. and Korn, E. L. and Long, P. M. and Jazaeri, A. and Martiat, P. and Fox, S. B. and Harris, A. L. and Liu, E. T.},
doi = {10.1073/pnas.1732912100},
file = {:home/stachu/dyskD/ubuntu/programy/master/popular/BreastCancerClassification.pdf:pdf},
issn = {0027-8424},
journal = {Proceedings of the National Academy of Sciences of the United States of America},
number = {18},
pages = {10393--10398},
title = {Breast cancer classification and prognosis based on gene expression profiles from a population-based study},
volume = {100},
year = {2003}
}
@book{MLprob,
abstract = {This textbook offers a comprehensive and self-contained introduction to the field of machine learning, based on a unified, probabilistic approach. The coverage combines breadth and depth, offering necessary background material on such topics as probability, optimization, and linear algebra as well as discussion of recent developments in the field, including conditional random fields, L1 regularization, and deep learning. The book is written in an informal, accessible style, complete with pseudo-code for the most important algorithms. All topics are copiously illustrated with color images and worked examples drawn from such application domains as biology, text processing, computer vision, and robotics. Rather than providing a cookbook of different heuristic methods, the book stresses a principled model-based approach, often using the language of graphical models to specify models in a concise and intuitive way. Almost all the models described have been implemented in a MATLAB software package--PMTK (probabilistic modeling toolkit)--that is freely available online},
address = {Cambridge, MA},
author = {Murphy, Kevin P.},
file = {:home/stachu/dyskD/ubuntu/programy/master/popular/MLprob.pdf:pdf},
isbn = {9780262018029},
pages = {1067},
publisher = {MIT Press},
title = {Machine Learning: A Probabilistic Perspective},
year = {2012}
}
@article{randomSearch,
author = {Bergstra, James and Bengio, Yoshua},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Bergstra, Bengio - 2012 - Random Search for Hyper-Parameter Optimization.pdf:pdf},
issn = {1533-7928},
journal = {Journal of Machine Learning Research},
number = {Feb},
pages = {281--305},
title = {Random Search for Hyper-Parameter Optimization},
url = {http://www.jmlr.org/papers/v13/bergstra12a.html},
volume = {13},
year = {2012}
}
@article{SAM2,
abstract = {BACKGROUND The Significance Analysis of Microarrays (SAM) is a popular method for detecting significantly expressed genes and controlling the false discovery rate (FDR). Recently, it has been reported in the literature that the FDR is not well controlled by SAM. Due to the vast application of SAM in microarray data analysis, it is of great importance to have an extensive evaluation of SAM and its associated R-package (sam2.20). RESULTS Our study has identified several discrepancies between SAM and sam2.20. One major difference is that SAM and sam2.20 use different methods for estimating FDR. Such discrepancies may cause confusion among the researchers who are using SAM or are developing the SAM-like methods. We have also shown that SAM provides no meaningful estimates of FDR and this problem has been corrected in sam2.20 by using a different formula for estimating FDR. However, we have found that, even with the improvement sam2.20 has made over SAM, sam2.20 may still produce erroneous and even conflicting results under certain situations. Using an example, we show that the problem of sam2.20 is caused by its use of asymmetric cutoffs which are due to the large variability of null scores at both ends of the order statistics. An obvious approach without the complication of the order statistics is the conventional symmetric cutoff method. For this reason, we have carried out extensive simulations to compare the performance of sam2.20 and the symmetric cutoff method. Finally, a simple modification is proposed to improve the FDR estimation of sam2.20 and the symmetric cutoff method. CONCLUSION Our study shows that the most serious drawback of SAM is its poor estimation of FDR. Although this drawback has been corrected in sam2.20, the control of FDR by sam2.20 is still not satisfactory. 
The comparison between sam2.20 and the symmetric cutoff method reveals that the relative performance of sam2.20 to the symmetric cutff method depends on the ratio of induced to repressed genes in a microarray data, and is also affected by the ratio of DE to EE genes and the distributions of induced and repressed genes. Numerical simulations show that the symmetric cutoff method has the biggest advantage over sam2.20 when there are equal number of induced and repressed genes (i.e., the ratio of induced to repressed genes is 1). As the ratio of induced to repressed genes moves away from 1, the advantage of the symmetric cutoff method to sam2.20 is gradually diminishing until eventually sam2.20 becomes significantly better than the symmetric cutoff method when the differentially expressed (DE) genes are either all induced or all repressed genes. Simulation results also show that our proposed simple modification provides improved control of FDR for both sam2.20 and the symmetric cutoff method.},
author = {Zhang, Shunpu},
doi = {10.1186/1471-2105-8-230},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Zhang - 2007 - A comprehensive evaluation of SAM, the SAM R-package and a simple modification to improve its performance.pdf:pdf},
issn = {1471-2105},
journal = {BMC Bioinformatics},
month = jun,
pages = {230},
pmid = {17603887},
publisher = {BioMed Central},
title = {A comprehensive evaluation of {SAM}, the {SAM} {R}-package and a simple modification to improve its performance},
url = {http://www.ncbi.nlm.nih.gov/pubmed/17603887},
volume = {8},
year = {2007}
}
@article{SPCAnew,
abstract = {Principal component analysis (PCA) is a widely used tool for data analysis and dimension reduction in applications throughout science and engineering. However, the principal components (PCs) can sometimes be difficult to interpret, because they are linear combinations of all the original variables. To facilitate interpretation, sparse PCA produces modified PCs with sparse loadings, i.e. loadings with very few non-zero elements. In this paper, we propose a new sparse PCA method, namely sparse PCA via regularized SVD (sPCA-rSVD). We use the connection of PCA with singular value decomposition (SVD) of the data matrix and extract the PCs through solving a low rank matrix approximation problem. Regularization penalties are introduced to the corresponding minimization problem to promote sparsity in PC loadings. An efficient iterative algorithm is proposed for computation. Two tuning parameter selection methods are discussed. Some theoretical results are established to justify the use of sPCA-rSVD when only the data covariance matrix is available. In addition, we give a modified definition of variance explained by the sparse PCs. The sPCA-rSVD provides a uniform treatment of both classical multivariate data and high-dimension-low-sample-size (HDLSS) data. Further understanding of sPCA-rSVD and some existing alternatives is gained through simulation studies and real data examples, which suggests that sPCA-rSVD provides competitive results.},
annote = {Describes the implementation from easyspc},
author = {Shen, Haipeng and Huang, Jianhua Z.},
doi = {10.1016/j.jmva.2007.06.007},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Shen, Huang - 2008 - Sparse principal component analysis via regularized low rank matrix approximation.pdf:pdf},
issn = {0047-259X},
journal = {Journal of Multivariate Analysis},
month = jul,
number = {6},
pages = {1015--1034},
publisher = {Academic Press},
title = {Sparse principal component analysis via regularized low rank matrix approximation},
url = {https://www.sciencedirect.com/science/article/pii/S0047259X07000887},
volume = {99},
year = {2008}
}
@article{sklearn1,
author = {Pedregosa, Fabian and Varoquaux, Ga{\"{e}}l and Gramfort, Alexandre and Michel, Vincent and Thirion, Bertrand and Grisel, Olivier and Blondel, Mathieu and Prettenhofer, Peter and Weiss, Ron and Dubourg, Vincent and Vanderplas, Jake and Passos, Alexandre and Cournapeau, David and Brucher, Matthieu and Perrot, Matthieu and Duchesnay, {\'{E}}douard},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Pedregosa et al. - 2011 - Scikit-learn Machine Learning in Python.pdf:pdf},
issn = {1533-7928},
journal = {Journal of Machine Learning Research},
number = {Oct},
pages = {2825--2830},
title = {{Scikit-learn}: Machine Learning in {Python}},
url = {http://jmlr.csail.mit.edu/papers/v12/pedregosa11a.html},
volume = {12},
year = {2011}
}
@book{PythonML,
address = {Birmingham, UK},
author = {Raschka, Sebastian and Mirjalili, Vahid},
edition = {Second},
file = {:home/stachu/dyskD/ubuntu/programy/master/Python Machine Learning.pdf:pdf},
isbn = {978-1787125933},
keywords = {Clustering,Data Science,Deep Learning,Machine Learning,Neural Networks,Programming,Supervised Learning},
publisher = {Packt Publishing},
title = {{Python} Machine Learning},
year = {2017}
}
@article{svmWeights,
author = {Statnikov, Alexander and Hardin, D and Aliferis, Constantin},
file = {:home/stachu/dyskD/ubuntu/programy/master/Using{\_}SVM{\_}Weight-Based{\_}Methods{\_}to{\_}Identify{\_}Causall.pdf:pdf},
journal = {Sign},
title = {Using {SVM} Weight-Based Methods to Identify Causally Relevant and Non-Causally Relevant Variables},
volume = {1},
year = {2006}
}
@article{sag,
abstract = {We propose the stochastic average gradient (SAG) method for optimizing the sum of a finite number of smooth convex functions. Like stochastic gradient (SG) methods, the SAG method's iteration cost is independent of the number of terms in the sum. However, by incorporating a memory of previous gradient values the SAG method achieves a faster convergence rate than black-box SG methods. The convergence rate is improved from O(1/k{\^{}}{\{}1/2{\}}) to O(1/k) in general, and when the sum is strongly-convex the convergence rate is improved from the sub-linear O(1/k) to a linear convergence rate of the form O(p{\^{}}k) for p $\backslash$textless{\{}{\}} 1. Further, in many cases the convergence rate of the new method is also faster than black-box deterministic gradient methods, in terms of the number of gradient evaluations. Numerical experiments indicate that the new algorithm often dramatically outperforms existing SG and deterministic gradient methods, and that the performance may be further improved through the use of non-uniform sampling strategies.},
archivePrefix = {arXiv},
arxivId = {1309.2388},
author = {Schmidt, Mark and {Le Roux}, Nicolas and Bach, Francis},
doi = {10.1007/s10107-016-1030-6},
eprint = {1309.2388},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Schmidt, Roux, Bach - 2013 - Minimizing Finite Sums with the Stochastic Average Gradient.pdf:pdf},
issn = {0025-5610},
journal = {Mathematical Programming},
number = {1-2},
pages = {83--112},
publisher = {Springer Berlin Heidelberg},
title = {Minimizing finite sums with the stochastic average gradient},
url = {http://link.springer.com/10.1007/s10107-016-1030-6},
volume = {162},
year = {2017}
}
@article{LRgene,
author = {Zhu, J. and Hastie, Trevor},
doi = {10.1093/biostatistics/kxg046},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Zhu, Hastie - 2004 - Classification of gene microarrays by penalized logistic regression.pdf:pdf},
issn = {1465-4644},
journal = {Biostatistics},
month = jul,
number = {3},
pages = {427--443},
publisher = {Oxford University Press},
title = {Classification of gene microarrays by penalized logistic regression},
url = {https://academic.oup.com/biostatistics/article-lookup/doi/10.1093/biostatistics/kxg046},
volume = {5},
year = {2004}
}
@article{dimRedInCV,
abstract = {In the context of cancer diagnosis and treatment, we consider the problem of constructing an accurate prediction rule on the basis of a relatively small number of tumor tissue samples of known type containing the expression data on very many (possibly thousands) genes. Recently, results have been presented in the literature suggesting that it is possible to construct a prediction rule from only a few genes such that it has a negligible prediction error rate. However, in these results the test error or the leave-one-out cross-validated error is calculated without allowance for the selection bias. There is no allowance because the rule is either tested on tissue samples that were used in the first instance to select the genes being used in the rule or because the cross-validation of the rule is not external to the selection process; that is, gene selection is not performed in training the rule at each stage of the cross-validation process. We describe how in practice the selection bias can be assessed and corrected for by either performing a cross-validation or applying the bootstrap external to the selection process. We recommend using 10-fold rather than leave-one-out cross-validation, and concerning the bootstrap, we suggest using the so-called .632+ bootstrap error estimate designed to handle overfitted prediction rules. Using two published data sets, we demonstrate that when correction is made for the selection bias, the cross-validated error is no longer zero for a subset of only a few genes.},
author = {Ambroise, Christophe and McLachlan, Geoffrey J},
doi = {10.1073/pnas.102102699},
file = {:home/stachu/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Ambroise, McLachlan - 2002 - Selection bias in gene extraction on the basis of microarray gene-expression data.pdf:pdf},
issn = {0027-8424},
journal = {Proceedings of the National Academy of Sciences of the United States of America},
month = may,
number = {10},
pages = {6562--6566},
pmid = {11983868},
publisher = {National Academy of Sciences},
title = {Selection bias in gene extraction on the basis of microarray gene-expression data},
url = {http://www.ncbi.nlm.nih.gov/pubmed/11983868},
volume = {99},
year = {2002}
}