@book{gelman_data_2006,
title = {Data analysis using regression and multilevel/hierarchical models},
publisher = {Cambridge University Press},
author = {Gelman, Andrew and Hill, Jennifer},
year = {2006},
}
@book{gelman_bayesian_2013,
title = {Bayesian {Data} {Analysis}, {Third} {Edition}},
isbn = {978-1-4398-4095-5},
abstract = {Now in its third edition, this classic book is widely considered the leading text on Bayesian methods, lauded for its accessible, practical approach to analyzing data and solving research problems. Bayesian Data Analysis, Third Edition continues to take an applied approach to analysis using up-to-date Bayesian methods. The authors—all leaders in the statistics community—introduce basic concepts from a data-analytic perspective before presenting advanced methods. Throughout the text, numerous worked examples drawn from real applications and research emphasize the use of Bayesian inference in practice. New to the Third Edition Four new chapters on nonparametric modeling Coverage of weakly informative priors and boundary-avoiding priors Updated discussion of cross-validation and predictive information criteria Improved convergence monitoring and effective sample size calculations for iterative simulation Presentations of Hamiltonian Monte Carlo, variational Bayes, and expectation propagation New and revised software code The book can be used in three different ways. For undergraduate students, it introduces Bayesian inference starting from first principles. For graduate students, the text presents effective current approaches to Bayesian modeling and computation in statistics and related fields. For researchers, it provides an assortment of Bayesian methods in applied statistics. Additional materials, including data sets used in the examples, solutions to selected exercises, and software instructions, are available on the book’s web page.},
language = {en},
publisher = {CRC Press},
author = {Gelman, Andrew and Carlin, John B. and Stern, Hal S. and Dunson, David B. and Vehtari, Aki and Rubin, Donald B.},
month = nov,
year = {2013},
keywords = {Mathematics / Probability \& Statistics / General, Computers / Mathematical \& Statistical Software, Psychology / Research \& Methodology},
}
@book{kruschke_doing_2010,
title = {Doing {Bayesian} {Data} {Analysis}: {A} {Tutorial} {Introduction} with {R}},
isbn = {978-0-12-381486-9},
shorttitle = {Doing {Bayesian} {Data} {Analysis}},
abstract = {There is an explosion of interest in Bayesian statistics, primarily because recently created computational methods have finally made Bayesian analysis tractable and accessible to a wide audience. Doing Bayesian Data Analysis, A Tutorial Introduction with R and BUGS, is for first year graduate students or advanced undergraduates and provides an accessible approach, as all mathematics is explained intuitively and with concrete examples. It assumes only algebra and ‘rusty’ calculus. Unlike other textbooks, this book begins with the basics, including essential concepts of probability and random sampling. The book gradually climbs all the way to advanced hierarchical modeling methods for realistic data. The text provides complete examples with the R programming language and BUGS software (both freeware), and begins with basic programming examples, working up gradually to complete programs for complex analyses and presentation graphics. These templates can be easily adapted for a large variety of students and their own research needs. The textbook bridges students from their undergraduate training into modern Bayesian methods. -Accessible, including the basics of essential concepts of probability and random sampling -Examples with R programming language and BUGS software -Comprehensive coverage of all scenarios addressed by non-Bayesian textbooks: t-tests, analysis of variance (ANOVA) and comparisons in ANOVA, multiple regression, and chi-square (contingency table analysis). -Coverage of experiment planning -R and BUGS computer programming code on website -Exercises have explicit purposes and guidelines for accomplishment},
language = {en},
publisher = {Academic Press},
author = {Kruschke, John},
month = nov,
year = {2010},
keywords = {Mathematics / General, Mathematics / Applied},
}
@article{ferrari_beta_2004,
title = {Beta {Regression} for {Modelling} {Rates} and {Proportions}},
volume = {31},
issn = {0266-4763},
url = {http://www.tandfonline.com/doi/abs/10.1080/0266476042000214501},
doi = {10.1080/0266476042000214501},
abstract = {This paper proposes a regression model where the response is beta distributed using a parameterization of the beta law that is indexed by mean and dispersion parameters. The proposed model is useful for situations where the variable of interest is continuous and restricted to the interval (0, 1) and is related to other variables through a regression structure. The regression parameters of the beta regression model are interpretable in terms of the mean of the response and, when the logit link is used, of an odds ratio, unlike the parameters of a linear regression that employs a transformed response. Estimation is performed by maximum likelihood. We provide closed-form expressions for the score function, for Fisher's information matrix and its inverse. Hypothesis testing is performed using approximations obtained from the asymptotic normality of the maximum likelihood estimator. Some diagnostic measures are introduced. Finally, practical applications that employ real data are presented and discussed.},
number = {7},
urldate = {2014-03-13},
journal = {Journal of Applied Statistics},
author = {Ferrari, Silvia and Cribari-Neto, Francisco},
year = {2004},
pages = {799--815},
}
@book{barrett_causal_2024,
title = {Causal {Inference} in {R}},
url = {https://www.r-causal.org/},
language = {en},
urldate = {2023-12-02},
author = {Barrett, Malcolm and McGowan, Lucy D’Agostino and Gerke, Travis},
year = {2024},
}
@book{hastie_elements_2017,
title = {The {Elements} of {Statistical} {Learning}: {Data} {Mining}, {Inference}, and {Prediction}},
edition = {2},
url = {https://hastie.su.domains/ElemStatLearn/},
urldate = {2023-12-02},
author = {Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome},
year = {2017},
}
@book{goodfellow_deep_2016,
title = {Deep {Learning}},
url = {https://www.deeplearningbook.org/},
urldate = {2023-12-02},
author = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
year = {2016},
}
@misc{scikit-learn_116_2023,
title = {1.16. {Probability} calibration},
url = {https://scikit-learn.org/stable/modules/calibration.html},
abstract = {When performing classification you often want not only to predict the class label, but also obtain a probability of the respective label. This probability gives you some kind of confidence on the p...},
language = {en},
urldate = {2023-12-02},
journal = {scikit-learn},
author = {{scikit-learn}},
year = {2023},
}
@book{james_introduction_2021,
address = {New York, NY},
series = {Springer {Texts} in {Statistics}},
title = {An {Introduction} to {Statistical} {Learning}},
volume = {103},
isbn = {978-1-4614-7137-0 978-1-4614-7138-7},
url = {http://link.springer.com/10.1007/978-1-4614-7138-7},
urldate = {2023-12-02},
publisher = {Springer New York},
author = {James, Gareth and Witten, Daniela and Hastie, Trevor and Tibshirani, Robert},
year = {2021},
doi = {10.1007/978-1-4614-7138-7},
}
@article{lang_mlr3_2019,
title = {mlr3: {A} modern object-oriented machine learning framework in {R}},
volume = {4},
issn = {2475-9066},
shorttitle = {mlr3},
url = {https://joss.theoj.org/papers/10.21105/joss.01903},
doi = {10.21105/joss.01903},
number = {44},
urldate = {2023-12-02},
journal = {Journal of Open Source Software},
author = {Lang, Michel and Binder, Martin and Richter, Jakob and Schratz, Patrick and Pfisterer, Florian and Coors, Stefan and Au, Quay and Casalicchio, Giuseppe and Kotthoff, Lars and Bischl, Bernd},
month = dec,
year = {2019},
pages = {1903},
}
@book{gelman_regression_2020,
edition = {1},
title = {Regression and {Other} {Stories}},
isbn = {978-1-139-16187-9 978-1-107-02398-7 978-1-107-67651-0},
url = {https://www.cambridge.org/highereducation/product/9781139161879/book},
abstract = {Most textbooks on regression focus on theory and the simplest of examples. Real statistical problems, however, are complex and subtle. This is not a book about the theory of regression. It is about using regression to solve real problems of comparison, estimation, prediction, and causal inference. Unlike other books, it focuses on practical issues such as sample size and missing data and a wide range of goals and techniques. It jumps right in to methods and computer code you can use immediately. Real examples, real stories from the authors' experience demonstrate what regression can do and its limitations, with practical advice for understanding assumptions and implementing methods for experiments and observational studies. They make a smooth transition to logistic regression and GLM. The emphasis is on computation in R and Stan rather than derivations, with code available online. Graphics and presentation aid understanding of the models and model fitting.},
urldate = {2023-12-02},
publisher = {Cambridge University Press},
author = {Gelman, Andrew and Hill, Jennifer and Vehtari, Aki},
month = jul,
year = {2020},
doi = {10.1017/9781139161879},
}
@book{wooldridge_introductory_2012,
address = {Mason, OH},
edition = {5},
title = {Introductory {Econometrics}: {A} {Modern} {Approach}},
isbn = {978-1-111-53104-1},
shorttitle = {Introductory {Econometrics}},
abstract = {Discover how empirical researchers today actually think about and apply econometric methods with the practical, professional approach in Wooldridge's INTRODUCTORY ECONOMETRICS: A MODERN APPROACH, 5E. Unlike traditional books on the subject, INTRODUCTORY ECONOMETRICS' unique presentation demonstrates how econometrics has moved beyond just a set of abstract tools to become a genuinely useful tool for answering questions in business, policy evaluation, and forecasting environments. Organized around the type of data being analyzed, the book uses a systematic approach that only introduces assumptions as they are needed, which makes the material easier to understand and ultimately leads to better econometric practices. Packed with timely, relevant applications, the text incorporates close to 100 intriguing data sets in six formats and offers updates that reflect the latest emerging developments in the field.},
language = {English},
publisher = {Cengage Learning},
author = {Wooldridge, Jeffrey M.},
month = sep,
year = {2012},
}
@misc{google_machine_2023,
title = {Machine {Learning} {\textbar} {Google} for {Developers}},
url = {https://developers.google.com/machine-learning},
abstract = {Educational resources for machine learning.},
language = {en},
urldate = {2023-12-02},
author = {Google},
year = {2023},
}
@article{rovine_peirce_2004,
title = {Peirce and {Bowditch}},
volume = {58},
issn = {0003-1305},
url = {https://doi.org/10.1198/000313004X964},
doi = {10.1198/000313004X964},
abstract = {Henry Pickering Bowditch and Charles Sanders Peirce made important contributions to the ideas of regression and correlation. This is particularly interesting as these contributions came well before the work of Galton and Pearson. This article discusses the work of Bowditch related to the development of regression and presents Peirce's coefficient of the science of the method, an association coefficient for a 2 × 2 contingency table.},
number = {3},
urldate = {2023-12-02},
journal = {The American Statistician},
author = {Rovine, Michael J and Anderson, Douglas R},
month = aug,
year = {2004},
note = {Publisher: Taylor \& Francis},
keywords = {Association, Coefficient, History of statistics},
pages = {232--236},
}
@book{grolemund_welcome_2023,
title = {{R} for {Data} {Science}},
url = {https://r4ds.hadley.nz/},
abstract = {This book will teach you how to do data science with R: You’ll learn how to get your data into R, get it into the most useful structure, transform it, visualise it and model it. In this book, you will find a practicum of skills for data science. Just as a chemist learns how to clean test tubes and stock a lab, you’ll learn how to clean data and draw plots—and many other things besides. These are the skills that allow data science to happen, and here you will find the best practices for doing each of these things with R. You’ll learn how to use the grammar of graphics, literate programming, and reproducible research to save time. You’ll also learn how to manage cognitive resources to facilitate discoveries when wrangling, visualising, and exploring data.},
language = {en},
urldate = {2023-12-02},
author = {Wickham, Hadley and Grolemund, Garrett},
year = {2023},
}
@book{greene_econometric_2017,
title = {Econometric {Analysis}},
edition = {8},
url = {https://pages.stern.nyu.edu/~wgreene/Text/econometricanalysis.htm},
urldate = {2023-12-02},
author = {Greene, William},
year = {2017},
}
@misc{brownlee_gentle_2016,
title = {Gentle {Introduction} to the {Bias}-{Variance} {Trade}-{Off} in {Machine} {Learning}},
url = {https://machinelearningmastery.com/gentle-introduction-to-the-bias-variance-trade-off-in-machine-learning/},
abstract = {Supervised machine learning algorithms can best be understood through the lens of the bias-variance trade-off. In this post, you will discover the Bias-Variance Trade-Off and how to use it to better understand machine learning algorithms and get better performance on your data. Let’s get started. Update Oct/2019: Removed discussion of parametric/nonparametric models (thanks Alex). Overview […]},
language = {en-US},
urldate = {2023-12-03},
journal = {MachineLearningMastery.com},
author = {Brownlee, Jason},
month = mar,
year = {2016},
}
@misc{google_introduction_2023,
title = {Introduction {\textbar} {Machine} {Learning}},
url = {https://developers.google.com/machine-learning/decision-forests},
language = {en},
urldate = {2023-12-03},
journal = {Google for Developers},
author = {Google},
year = {2023},
}
@misc{scikit-learn_nested_2023,
title = {Nested versus non-nested cross-validation},
url = {https://scikit-learn.org/stable/auto_examples/model_selection/plot_nested_cross_validation_iris.html},
abstract = {This example compares non-nested and nested cross-validation strategies on a classifier of the iris data set. Nested cross-validation (CV) is often used to train a model in which hyperparameters al...},
language = {en},
urldate = {2023-12-03},
journal = {scikit-learn},
author = {{scikit-learn}},
year = {2023},
}
@misc{schmidhuber_annotated_2022,
title = {Annotated {History} of {Modern} {AI} and {Deep} {Learning}},
url = {http://arxiv.org/abs/2212.11279},
doi = {10.48550/arXiv.2212.11279},
abstract = {Machine learning is the science of credit assignment: finding patterns in observations that predict the consequences of actions and help to improve future performance. Credit assignment is also required for human understanding of how the world works, not only for individuals navigating daily life, but also for academic professionals like historians who interpret the present in light of past events. Here I focus on the history of modern artificial intelligence (AI) which is dominated by artificial neural networks (NNs) and deep learning, both conceptually closer to the old field of cybernetics than to what's been called AI since 1956 (e.g., expert systems and logic programming). A modern history of AI will emphasize breakthroughs outside of the focus of traditional AI text books, in particular, mathematical foundations of today's NNs such as the chain rule (1676), the first NNs (linear regression, circa 1800), and the first working deep learners (1965-). From the perspective of 2022, I provide a timeline of the -- in hindsight -- most important relevant events in the history of NNs, deep learning, AI, computer science, and mathematics in general, crediting those who laid foundations of the field. The text contains numerous hyperlinks to relevant overview sites from my AI Blog. It supplements my previous deep learning survey (2015) which provides hundreds of additional references. Finally, to round it off, I'll put things in a broader historic context spanning the time since the Big Bang until when the universe will be many times older than it is now.},
urldate = {2023-12-03},
publisher = {arXiv},
author = {Schmidhuber, Juergen},
month = dec,
year = {2022},
note = {arXiv:2212.11279 [cs]},
keywords = {Computer Science - Neural and Evolutionary Computing},
annote = {Comment: 75 pages, over 500 references. arXiv admin note: substantial text overlap with arXiv:2005.05744},
}
@article{welchowski_techniques_2022,
title = {Techniques to {Improve} {Ecological} {Interpretability} of {Black}-{Box} {Machine} {Learning} {Models}},
volume = {27},
issn = {1537-2693},
url = {https://doi.org/10.1007/s13253-021-00479-7},
doi = {10.1007/s13253-021-00479-7},
abstract = {Statistical modeling of ecological data is often faced with a large number of variables as well as possible nonlinear relationships and higher-order interaction effects. Gradient boosted trees (GBT) have been successful in addressing these issues and have shown a good predictive performance in modeling nonlinear relationships, in particular in classification settings with a categorical response variable. They also tend to be robust against outliers. However, their black-box nature makes it difficult to interpret these models. We introduce several recently developed statistical tools to the environmental research community in order to advance interpretation of these black-box models. To analyze the properties of the tools, we applied gradient boosted trees to investigate biological health of streams within the contiguous USA, as measured by a benthic macroinvertebrate biotic index. Based on these data and a simulation study, we demonstrate the advantages and limitations of partial dependence plots (PDP), individual conditional expectation (ICE) curves and accumulated local effects (ALE) in their ability to identify covariate–response relationships. Additionally, interaction effects were quantified according to interaction strength (IAS) and Friedman’s $H^2$ statistic. Interpretable machine learning techniques are useful tools to open the black-box of gradient boosted trees in the environmental sciences. This finding is supported by our case study on the effect of impervious surface on the benthic condition, which agrees with previous results in the literature. Overall, the most important variables were ecoregion, bed stability, watershed area, riparian vegetation and catchment slope. These variables were also present in most identified interaction effects. In conclusion, graphical tools (PDP, ICE, ALE) enable visualization and easier interpretation of GBT but should be supported by analytical statistical measures. Future methodological research is needed to investigate the properties of interaction tests. Supplementary materials accompanying this paper appear on-line.},
language = {en},
number = {1},
urldate = {2023-12-03},
journal = {Journal of Agricultural, Biological and Environmental Statistics},
author = {Welchowski, Thomas and Maloney, Kelly O. and Mitchell, Richard and Schmid, Matthias},
month = mar,
year = {2022},
keywords = {Boosting, Interaction terms, Interpretable machine learning, Macroinvertebrates, Stream health},
pages = {175--197},
}
@book{biecek_explanatory_2020,
title = {Explanatory {Model} {Analysis}},
url = {https://ema.drwhy.ai/},
abstract = {This book introduces a unified language for exploration, explanation and examination of predictive machine learning models.},
urldate = {2023-12-03},
author = {Biecek, Przemyslaw and Burzykowski, Tomasz},
year = {2020},
}
@misc{bycroft_llm_2023,
title = {{LLM} {Visualization}},
url = {https://bbycroft.net/llm},
urldate = {2023-12-03},
author = {Bycroft, Brendan},
year = {2023},
}
@article{kunzel_metalearners_2019,
title = {Metalearners for estimating heterogeneous treatment effects using machine learning},
volume = {116},
url = {https://www.pnas.org/doi/abs/10.1073/pnas.1804597116},
doi = {10.1073/pnas.1804597116},
abstract = {There is growing interest in estimating and analyzing heterogeneous treatment effects in experimental and observational studies. We describe a number of metaalgorithms that can take advantage of any supervised learning or regression method in machine learning and statistics to estimate the conditional average treatment effect (CATE) function. Metaalgorithms build on base algorithms—such as random forests (RFs), Bayesian additive regression trees (BARTs), or neural networks—to estimate the CATE, a function that the base algorithms are not designed to estimate directly. We introduce a metaalgorithm, the X-learner, that is provably efficient when the number of units in one treatment group is much larger than in the other and can exploit structural properties of the CATE function. For example, if the CATE function is linear and the response functions in treatment and control are Lipschitz-continuous, the X-learner can still achieve the parametric rate under regularity conditions. We then introduce versions of the X-learner that use RF and BART as base learners. In extensive simulation studies, the X-learner performs favorably, although none of the metalearners is uniformly the best. In two persuasion field experiments from political science, we demonstrate how our X-learner can be used to target treatment regimes and to shed light on underlying mechanisms. A software package is provided that implements our methods.},
number = {10},
urldate = {2023-12-09},
journal = {Proceedings of the National Academy of Sciences},
author = {Künzel, Sören R. and Sekhon, Jasjeet S. and Bickel, Peter J. and Yu, Bin},
month = mar,
year = {2019},
pages = {4156--4165},
}
@article{pearl_causal_2009,
title = {Causal inference in statistics: {An} overview},
volume = {3},
issn = {1935-7516},
shorttitle = {Causal inference in statistics},
url = {https://projecteuclid.org/journals/statistics-surveys/volume-3/issue-none/Causal-inference-in-statistics-An-overview/10.1214/09-SS057.full},
doi = {10.1214/09-SS057},
abstract = {This review presents empirical researchers with recent advances in causal inference, and stresses the paradigmatic shifts that must be undertaken in moving from traditional statistical analysis to causal analysis of multivariate data. Special emphasis is placed on the assumptions that underly all causal inferences, the languages used in formulating those assumptions, the conditional nature of all causal and counterfactual claims, and the methods that have been developed for the assessment of such claims. These advances are illustrated using a general theory of causation based on the Structural Causal Model (SCM) described in Pearl (2000a), which subsumes and unifies other approaches to causation, and provides a coherent mathematical foundation for the analysis of causes and counterfactuals. In particular, the paper surveys the development of mathematical tools for inferring (from a combination of data and assumptions) answers to three types of causal queries: (1) queries about the effects of potential interventions, (also called “causal effects” or “policy evaluation”) (2) queries about probabilities of counterfactuals, (including assessment of “regret,” “attribution” or “causes of effects”) and (3) queries about direct and indirect effects (also known as “mediation”). Finally, the paper defines the formal and conceptual relationships between the structural and potential-outcome frameworks and presents tools for a symbiotic analysis that uses the strong features of both.},
urldate = {2023-12-10},
journal = {Statistics Surveys},
author = {Pearl, Judea},
month = jan,
year = {2009},
note = {Publisher: Amer. Statist. Assoc., the Bernoulli Soc., the Inst. Math. Statist., and the Statist. Soc. Canada},
keywords = {causal effects, causes of effects, confounding, counterfactuals, graphical methods, mediation, policy evaluation, potential-outcome, structural equation models},
pages = {96--146},
}
@misc{pearl_causal_2022,
title = {Causal {Inference}: {History}, {Perspectives}, {Adventures}, and {Unification} ({An} {Interview} with {Judea} {Pearl})},
url = {https://muse.jhu.edu/pub/56/article/867087/summary},
urldate = {2023-12-10},
author = {Pearl, Judea},
year = {2022},
}
@book{morgan_counterfactuals_2014,
title = {Counterfactuals and {Causal} {Inference}: {Methods} and {Principles} for {Social} {Research}, 2nd {Edition}},
shorttitle = {Counterfactuals and {Causal} {Inference}},
url = {https://stars.library.ucf.edu/etextbooks/298},
author = {Morgan, Stephen and Winship, Christopher},
month = jan,
year = {2014},
file = {"Counterfactuals and Causal Inference\: Methods and Principles for Socia" by Stephen L. Morgan and Christopher Winship:/Users/micl/Zotero/storage/U74QKHFA/298.html:text/html},
}
@misc{facure_alves_causal_2022,
title = {Causal {Inference} for the {Brave} and {True}},
url = {https://matheusfacure.github.io/python-causality-handbook/landing-page.html},
urldate = {2023-12-10},
author = {Facure Alves, Matheus},
year = {2022},
}
@book{molnar_interpretable_2023,
title = {Interpretable {Machine} {Learning}},
url = {https://christophm.github.io/interpretable-ml-book/},
abstract = {Machine learning algorithms usually operate as black boxes and it is unclear how they derived a certain decision. This book is a guide for practitioners to make machine learning decisions interpretable.},
urldate = {2023-12-10},
author = {Molnar, Christoph},
year = {2023},
}
@misc{pok_how_2020,
title = {How uplift modeling works},
url = {https://ambiata.com/blog/2020-07-07-uplift-modeling/},
urldate = {2023-12-10},
author = {Pok, Wilson},
year = {2020},
}
@misc{shevchenko_types_2023,
title = {Types of customers — scikit-uplift documentation},
url = {https://www.uplift-modeling.com/en/v0.5.1/user_guide/introduction/clients.html},
urldate = {2023-12-10},
author = {Shevchenko, Maksim},
year = {2023},
}
@misc{zhang_dive_2023,
title = {Dive into {Deep} {Learning}},
url = {https://d2l.ai/index.html},
urldate = {2023-12-10},
author = {Zhang, Aston and Lipton, Zachary C. and Li, Mu and Smola, Alexander J.},
year = {2023},
}
@misc{vanderplas_python_2016,
title = {Python {Data} {Science} {Handbook}},
url = {https://www.oreilly.com/library/view/python-data-science/9781491912126/},
abstract = {For many researchers, Python is a first-class tool mainly because of its libraries for storing, manipulating, and gaining insight from data. Several resources exist for individual pieces of this data …},
language = {en},
urldate = {2023-12-10},
author = {VanderPlas, Jake},
year = {2016},
note = {ISBN: 9781491912058},
}
@book{cunningham_causal_2023,
title = {Causal {Inference}: {The} {Mixtape}},
url = {https://mixtape.scunning.com/},
urldate = {2023-12-10},
author = {Cunningham, Scott},
year = {2023},
}
@misc{causalml_causalml_2023,
title = {{CausalML}},
url = {https://causalml.readthedocs.io/en/latest/index.html},
urldate = {2023-12-10},
author = {{causalml}},
year = {2023},
}
@misc{masis_interpretable_2023,
title = {Interpretable {Machine} {Learning} with {Python} - {Second} {Edition}},
url = {https://www.packtpub.com/product/interpretable-machine-learning-with-python-second-edition/9781803235424},
abstract = {A deep dive into the key aspects and challenges of machine learning interpretability using a comprehensive toolkit, including SHAP, feature importance, and causal inference, to build fairer, safer, and more reliable models.},
language = {en},
urldate = {2023-12-10},
journal = {Packt},
author = {Masis, Serg},
year = {2023},
}
@misc{faraway_linear_2014,
title = {Linear {Models} with {R}},
url = {https://www.routledge.com/Linear-Models-with-R/Faraway/p/book/9781439887332},
abstract = {A Hands-On Way to Learning Data Analysis
Part of the core of statistics, linear models are used to make predictions and explain the relationship between the response and the predictors. Understanding linear models is crucial to a broader competence in the practice of statistics. Linear Models with R, Second Edition explains how to use linear models in physical science, engineering, social science, and business applications. The book incorporates several improvements that reflect how the world o},
language = {en},
urldate = {2023-12-10},
journal = {Routledge \& CRC Press},
author = {Faraway, Julian},
year = {2014},
}
@book{wood_generalized_2017,
address = {Boca Raton},
edition = {2},
title = {Generalized {Additive} {Models}: {An} {Introduction} with {R}, {Second} {Edition}},
isbn = {978-1-315-37027-9},
shorttitle = {Generalized {Additive} {Models}},
abstract = {The first edition of this book has established itself as one of the leading references on generalized additive models (GAMs), and the only book on the topic to be introductory in nature with a wealth of practical examples and software implementation. It is self-contained, providing the necessary background in linear models, linear mixed models, and generalized linear models (GLMs), before presenting a balanced treatment of the theory and applications of GAMs and related models.
The author bases his approach on a framework of penalized regression splines, and while firmly focused on the practical aspects of GAMs, discussions include fairly full explanations of the theory underlying the methods. Use of R software helps explain the theory and illustrates the practical application of the methodology. Each chapter contains an extensive set of exercises, with solutions in an appendix or in the book’s R data package gamair, to enable use as a course text or for self-study.},
publisher = {Chapman and Hall/CRC},
author = {Wood, Simon N.},
month = may,
year = {2017},
doi = {10.1201/9781315370279},
}
@book{harrell_regression_2015,
address = {Cham},
edition = {2},
series = {Springer {Series} in {Statistics}},
title = {Regression {Modeling} {Strategies}: {With} {Applications} to {Linear} {Models}, {Logistic} and {Ordinal} {Regression}, and {Survival} {Analysis}},
isbn = {978-3-319-19424-0 978-3-319-19425-7},
shorttitle = {Regression {Modeling} {Strategies}},
url = {https://link.springer.com/10.1007/978-3-319-19425-7},
language = {en},
urldate = {2023-12-10},
publisher = {Springer International Publishing},
author = {Harrell, Frank E.},
year = {2015},
doi = {10.1007/978-3-319-19425-7},
keywords = {Regression analysis, Generalized least squares, knitr reproducible documents, Linear models, Logistic regression, Predictive modeling, R statistical software, Survival analysis},
}
@misc{gelman_what_2013,
title = {What are the key assumptions of linear regression? {\textbar} {Statistical} {Modeling}, {Causal} {Inference}, and {Social} {Science}},
url = {https://statmodeling.stat.columbia.edu/2013/08/04/19470/},
urldate = {2023-12-10},
author = {Gelman, Andrew},
year = {2013},
}
@book{kuhn_tidy_2023,
title = {Tidy {Modeling} with {R}},
url = {https://www.tmwr.org/},
abstract = {The tidymodels framework is a collection of R packages for modeling and machine learning using tidyverse principles. This book provides a thorough introduction to how to use tidymodels, and an outline of good methodology and statistical practice for phases of the modeling process.},
urldate = {2023-12-10},
author = {Kuhn, Max and Silge, Julia},
year = {2023},
}
@misc{ucla_advanced_research_computing_faq_2023,
title = {{FAQ}: {What} are pseudo {R}-squareds?},
url = {https://stats.oarc.ucla.edu/other/mult-pkg/faq/general/faq-what-are-pseudo-r-squareds/},
urldate = {2023-12-11},
author = {{UCLA Advanced Research Computing}},
year = {2023},
}
@book{roback_beyond_2021,
title = {Beyond {Multiple} {Linear} {Regression}},
url = {https://bookdown.org/roback/bookdown-BeyondMLR/},
abstract = {An applied textbook on generalized linear models and multilevel models for advanced undergraduates, featuring many real, unique data sets. It is intended to be accessible to undergraduate students who have successfully completed a regression course. Even though there is no mathematical prerequisite, we still introduce fairly sophisticated topics such as likelihood theory, zero-inflated Poisson, and parametric bootstrapping in an intuitive and applied manner. We believe strongly in case studies featuring real data and real research questions; thus, most of the data in the textbook arises from collaborative research conducted by the authors and their students, or from student projects. Our goal is that, after working through this material, students will develop an expanded toolkit and a greater appreciation for the wider world of data and statistical modeling.},
urldate = {2023-12-12},
author = {Roback, Paul and Legler, Julie},
year = {2021},
}
@book{clark_mixed_2023,
title = {Mixed {Models} with {R}},
url = {https://m-clark.github.io/mixed-models-with-R/},
abstract = {This is an introduction to using mixed models in R. It covers the most common techniques employed, with demonstration primarily via the lme4 package. Discussion includes extensions into generalized mixed models, Bayesian approaches, and realms beyond.},
urldate = {2023-12-12},
author = {Clark, Michael},
year = {2023},
}
@book{clark_generalized_2022,
title = {Generalized {Additive} {Models}},
url = {https://m-clark.github.io/generalized-additive-models/},
abstract = {An introduction to generalized additive models (GAMs) is provided, with an emphasis on generalization from familiar linear models. It makes extensive use of the mgcv package in R. Discussion includes common approaches, standard extensions, and relations to other techniques. More technical modeling details are described and demonstrated as well.},
urldate = {2023-12-12},
author = {Clark, Michael},
year = {2022},
}
@book{clark_practical_2020,
title = {Practical {Data} {Science}},
url = {https://m-clark.github.io/data-processing-and-visualization/},
abstract = {The focus of this document is on data science tools and techniques in R, including basic programming knowledge, visualization practices, modeling, and more, along with exercises to practice further. In addition, the demonstrations of most content in Python is available via Jupyter notebooks.},
urldate = {2023-12-12},
author = {Clark, Michael},
year = {2020},
}
@book{clark_bayesian_2022,
title = {Bayesian {Basics}},
url = {https://m-clark.github.io/bayesian-basics/},
abstract = {This document provides an introduction to Bayesian data analysis. It is conceptual in nature, but uses the probabilistic programming language Stan for demonstration (and its implementation in R via rstan). From elementary examples, guidance is provided for data preparation, efficient modeling, diagnostics, and more.},
urldate = {2023-12-12},
author = {Clark, Michael},
year = {2022},
}
@book{koenker_quantile_2005,
title = {Quantile regression},
volume = {38},
url = {https://books.google.com/books?hl=en&lr=&id=WjOdAgAAQBAJ&oi=fnd&pg=PT12&dq=koenker+quantile+regression&ots=CQFHSt5o-W&sig=G1TpKPHo-BRdJ8qWcBrIBI2FQAs},
urldate = {2023-12-14},
publisher = {Cambridge University Press},
author = {Koenker, Roger},
year = {2005},
}
@article{koenker_galton_2000,
title = {Galton, {Edgeworth}, {Frisch}, and prospects for quantile regression in econometrics},
volume = {95},
issn = {0304-4076},
url = {https://www.sciencedirect.com/science/article/pii/S0304407699000433},
doi = {10.1016/S0304-4076(99)00043-3},
abstract = {The work of three leading figures in the early history of econometrics is used to motivate some recent developments in the theory and application of quantile regression. We stress not only the robustness advantages of this form of semiparametric statistical method, but also the opportunity to recover a more complete description of the statistical relationship between variables. A recent proposal for a more X-robust form of quantile regression based on maximal depth ideas is described along with an interesting historical antecedent. Finally, the notorious computational burden of median regression, and quantile regression more generally, is addressed. It is argued that recent developments in interior point methods for linear programming together with some new preprocessing ideas make it possible to compute quantile regressions as quickly as least-squares regressions throughout the entire range of problem sizes encountered in econometrics.},
number = {2},
urldate = {2023-12-14},
journal = {Journal of Econometrics},
author = {Koenker, Roger},
month = apr,
year = {2000},
keywords = {Interior point methods, Least absolute error regression, Linear programming, Quantile regression, Regression depth},
pages = {347--374},
}
@misc{wikipedia_relationships_2023,
title = {Relationships among probability distributions},
copyright = {Creative Commons Attribution-ShareAlike License},
url = {https://en.wikipedia.org/wiki/Relationships_among_probability_distributions},
abstract = {In probability theory and statistics, there are several relationships among probability distributions. These relations can be categorized in the following groups:
One distribution is a special case of another with a broader parameter space
Transforms (function of a random variable);
Combinations (function of several variables);
Approximation (limit) relationships;
Compound relationships (useful for Bayesian inference);
Duality;
Conjugate priors.},
language = {en},
urldate = {2023-12-17},
journal = {Wikipedia},
author = {Wikipedia},
month = oct,
year = {2023},
note = {Page Version ID: 1180084573},
}
@misc{murphy_machine_2012,
title = {Machine {Learning}: {A} {Probabilistic} {Perspective}},
url = {https://mitpress.mit.edu/9780262018029/machine-learning/},
abstract = {A comprehensive introduction to machine learning that uses probabilistic models and inference as a unifying approach. Today's Web-enabled deluge of electronic...},
language = {en-US},
urldate = {2023-12-17},
journal = {MIT Press},
author = {Murphy, Kevin P.},
year = {2012},
}
@misc{murphy_probabilistic_2023,
title = {Probabilistic {Machine} {Learning}},
url = {https://mitpress.mit.edu/9780262046824/probabilistic-machine-learning/},
abstract = {A detailed and up-to-date introduction to machine learning, presented through the unifying lens of probabilistic modeling and Bayesian decision theory. This b...},
language = {en-US},
urldate = {2023-12-17},
journal = {MIT Press},
author = {Murphy, Kevin P.},
year = {2023},
}
@misc{databricks_what_2019,
title = {What is {AdaGrad}?},
url = {https://www.databricks.com/glossary/adagrad},
abstract = {Adaptive Gradient Algorithm (Adagrad) is an algorithm for gradient-based optimization and is well-suited when dealing with sparse data.},
language = {en-US},
urldate = {2023-12-18},
journal = {Databricks},
author = {DataBricks},
month = feb,
year = {2019},
}
@misc{statquest_with_josh_starmer_gradient_2019,
title = {Gradient {Descent}, {Step}-by-{Step}},
url = {https://www.youtube.com/watch?v=sDv4f4s2SB8},
abstract = {Gradient Descent is the workhorse behind most of Machine Learning. When you fit a machine learning method to a training dataset, you're probably using Gradient Descent. It can optimize parameters in a wide variety of settings. Since it's so fundamental to Machine Learning, I decided to make a "step-by-step" video that shows you exactly how it works. NOTE: This video assumes you are already familiar with Least Squares and Linear Regression.},
urldate = {2023-12-18},
author = {{StatQuest with Josh Starmer}},
month = feb,
year = {2019},
}
@misc{statquest_with_josh_starmer_stochastic_2019,
title = {Stochastic {Gradient} {Descent}, {Clearly} {Explained}!!!},
url = {https://www.youtube.com/watch?v=vMh0zPT0tLI},
abstract = {Even though Stochastic Gradient Descent sounds fancy, it is just a simple addition to "regular" Gradient Descent. This video sets up the problem that Stochastic Gradient Descent solves and then shows how it does it. Along the way, we discuss situations where Stochastic Gradient Descent is most useful, and some cool features that aren't that obvious. Corrections: at 9:03, the values for the intercept and slope should be the most recent estimates, 0.86 and 0.68, instead of the original random values, 0 and 1; at 9:33, the slope should be 0.7.},
urldate = {2023-12-18},
author = {{StatQuest with Josh Starmer}},
month = may,
year = {2019},
}
@misc{brownlee_gradient_2021,
title = {Gradient {Descent} {With} {AdaGrad} {From} {Scratch}},
url = {https://machinelearningmastery.com/gradient-descent-with-adagrad-from-scratch/},
abstract = {Gradient descent is an optimization algorithm that follows the negative gradient of an objective function in order to locate the minimum of the function. A limitation of gradient descent is that it uses the same step size (learning rate) for each input variable. This can be a problem on objective functions that have different amounts […]},
language = {en-US},
urldate = {2023-12-18},
journal = {MachineLearningMastery.com},
author = {Brownlee, Jason},
month = jun,
year = {2021},
}
@misc{carpenter_prior_2023,
title = {Prior {Choice} {Recommendations}},
url = {https://github.com/stan-dev/stan/wiki/Prior-Choice-Recommendations},
abstract = {Stan development repository. The master branch contains the current release. The develop branch contains the latest stable development. See the Developer Process Wiki for details.},
language = {en},
urldate = {2023-12-18},
journal = {GitHub},
author = {Carpenter, Bob},
year = {2023},
}
@misc{mcelreath_statistical_2020,
title = {Statistical {Rethinking}: {A} {Bayesian} {Course} with {Examples} in {R} and {STAN}},
shorttitle = {Statistical {Rethinking}},
url = {https://www.routledge.com/Statistical-Rethinking-A-Bayesian-Course-with-Examples-in-R-and-STAN/McElreath/p/book/9780367139919},
abstract = {Statistical Rethinking: A Bayesian Course with Examples in R and Stan builds your knowledge of and confidence in making inferences from data. Reflecting the need for scripting in today's model-based statistics, the book pushes you to perform step-by-step calculations that are usually automated. This unique computational approach ensures that you understand enough of the details to make reasonable choices and interpretations in your own modeling work.
The text presents causal inference and gener},
language = {en},
urldate = {2023-12-18},
journal = {Routledge \& CRC Press},
author = {McElreath, Richard},
year = {2020},
}
@book{kuhn_applied_2023,
title = {Applied {Machine} {Learning} for {Tabular} {Data}},
url = {https://aml4td.org/},
language = {en},
urldate = {2023-12-21},
author = {Kuhn, Max and Johnson, Kjell},
month = dec,
year = {2023},
}
@book{fleuret_little_2023,
title = {The {Little} {Book} of {Deep} {Learning}},
url = {https://fleuret.org/francois/lbdl.html},
abstract = {This book is a short introduction to deep learning for readers with a STEM background},
urldate = {2023-12-28},
author = {Fleuret, François},
year = {2023},
}
@article{belkin_reconciling_2019,
title = {Reconciling modern machine learning practice and the bias-variance trade-off},
volume = {116},
issn = {0027-8424, 1091-6490},
url = {http://arxiv.org/abs/1812.11118},
doi = {10.1073/pnas.1903070116},
abstract = {Breakthroughs in machine learning are rapidly changing science and society, yet our fundamental understanding of this technology has lagged far behind. Indeed, one of the central tenets of the field, the bias-variance trade-off, appears to be at odds with the observed behavior of methods used in the modern machine learning practice. The bias-variance trade-off implies that a model should balance under-fitting and over-fitting: rich enough to express underlying structure in data, simple enough to avoid fitting spurious patterns. However, in the modern practice, very rich models such as neural networks are trained to exactly fit (i.e., interpolate) the data. Classically, such models would be considered over-fit, and yet they often obtain high accuracy on test data. This apparent contradiction has raised questions about the mathematical foundations of machine learning and their relevance to practitioners. In this paper, we reconcile the classical understanding and the modern practice within a unified performance curve. This "double descent" curve subsumes the textbook U-shaped bias-variance trade-off curve by showing how increasing model capacity beyond the point of interpolation results in improved performance. We provide evidence for the existence and ubiquity of double descent for a wide spectrum of models and datasets, and we posit a mechanism for its emergence. This connection between the performance and the structure of machine learning models delineates the limits of classical analyses, and has implications for both the theory and practice of machine learning.},
number = {32},
urldate = {2023-12-28},
journal = {Proceedings of the National Academy of Sciences},
author = {Belkin, Mikhail and Hsu, Daniel and Ma, Siyuan and Mandal, Soumik},
month = aug,
year = {2019},
note = {arXiv:1812.11118 [cs, stat]},
keywords = {Statistics - Machine Learning, Computer Science - Machine Learning},
pages = {15849--15854},
file = {arXiv Fulltext PDF:/Users/micl/Zotero/storage/2W6R7FQB/Belkin et al. - 2019 - Reconciling modern machine learning practice and t.pdf:application/pdf;arXiv.org Snapshot:/Users/micl/Zotero/storage/LZXDXDDD/1812.html:text/html},
}
@misc{bai_understanding_2021,
title = {Understanding the {Under}-{Coverage} {Bias} in {Uncertainty} {Estimation}},
url = {http://arxiv.org/abs/2106.05515},
doi = {10.48550/arXiv.2106.05515},
abstract = {Estimating the data uncertainty in regression tasks is often done by learning a quantile function or a prediction interval of the true label conditioned on the input. It is frequently observed that quantile regression -- a vanilla algorithm for learning quantiles with asymptotic guarantees -- tends to \emph{under-cover} relative to the desired coverage level in reality. While various fixes have been proposed, a more fundamental understanding of why this under-coverage bias happens in the first place remains elusive. In this paper, we present a rigorous theoretical study on the coverage of uncertainty estimation algorithms in learning quantiles. We prove that quantile regression suffers from an inherent under-coverage bias, in a vanilla setting where we learn a realizable linear quantile function and there is more data than parameters. More quantitatively, for $\alpha > 0.5$ and small $d/n$, the $\alpha$-quantile learned by quantile regression roughly achieves coverage $\alpha - (\alpha - 1/2) \cdot d/n$ regardless of the noise distribution, where $d$ is the input dimension and $n$ is the number of training data. Our theory reveals that this under-coverage bias stems from a certain high-dimensional parameter estimation error that is not implied by existing theories on quantile regression. Experiments on simulated and real data verify our theory and further illustrate the effect of various factors such as sample size and model capacity on the under-coverage bias in more practical setups.},
urldate = {2023-12-29},
publisher = {arXiv},
author = {Bai, Yu and Mei, Song and Wang, Huan and Xiong, Caiming},
month = jun,
year = {2021},
note = {arXiv:2106.05515 [cs, math, stat]},
keywords = {Mathematics - Statistics Theory, Statistics - Machine Learning, Computer Science - Machine Learning},
file = {arXiv Fulltext PDF:/Users/micl/Zotero/storage/C9GNLBYX/Bai et al. - 2021 - Understanding the Under-Coverage Bias in Uncertain.pdf:application/pdf;arXiv.org Snapshot:/Users/micl/Zotero/storage/BZ9YWUQY/2106.html:text/html},
}
@book{cohen_statistical_2009,
address = {New York, NY},
edition = {2nd ed., reprint},
title = {Statistical power analysis for the behavioral sciences},
isbn = {978-0-8058-0283-2},
abstract = {Statistical Power Analysis for the Behavioral Sciences, Revised Edition emphasizes the importance of statistical power analysis. This edition discusses the concepts and types of power analysis, t test for means, significance of a product moment rs, and differences between correlation coefficients. The test that a proportion is .50 and sign test, differences between proportions, and chi-square tests for goodness of fit and contingency tables are also elaborated. This text likewise covers the F tests of variance proportions in multiple regression/correlation analysis and computational procedures. This publication is intended for behavioral and biosocial scientists who use statistical inference, but also serves as a supplementary textbook for intermediate level courses in applied statistics in behavioral/biosocial science.},
language = {en},
publisher = {Psychology Press},
author = {Cohen, Jacob},
year = {2009},
file = {Cohen - 2009 - Statistical power analysis for the behavioral scie.pdf:/Users/micl/Zotero/storage/YLULXAN9/Cohen - 2009 - Statistical power analysis for the behavioral scie.pdf:application/pdf},
}
@misc{fortuner_machine_2023,
title = {Machine {Learning} {Glossary}},
url = {https://ml-cheatsheet.readthedocs.io/en/latest/index.html},
urldate = {2024-01-12},
author = {Fortuner, Brendan},
year = {2023},
file = {Machine Learning Glossary — ML Glossary documentation:/Users/micl/Zotero/storage/HJ3MZAGU/index.html:text/html},
}
@misc{boykis_what_2023,
title = {What are embeddings?},
url = {http://vickiboykis.com/what_are_embeddings/index.html},
abstract = {A deep-dive into machine learning embeddings.},
language = {en},
urldate = {2024-01-12},
author = {Boykis, Vicki},
year = {2023},
file = {Snapshot:/Users/micl/Zotero/storage/9GRBNJMZ/what_are_embeddings.html:text/html},
}
@misc{stackexchange_are_2015,
type = {Forum post},
title = {Are there any differences between tensors and multidimensional arrays?},
url = {https://math.stackexchange.com/q/1134809},
urldate = {2024-01-16},
journal = {Mathematics Stack Exchange},
author = {{StackExchange}},
month = feb,
year = {2015},
file = {Snapshot:/Users/micl/Zotero/storage/VRE8CNB8/are-there-any-differences-between-tensors-and-multidimensional-arrays.html:text/html},
}
@misc{howard_practical_2024,
title = {Practical {Deep} {Learning} for {Coders}},
url = {https://course.fast.ai/},
abstract = {A free course designed for people with some coding experience, who want to learn how to apply deep learning and machine learning to practical problems.},
language = {en},
urldate = {2024-01-16},
journal = {Practical Deep Learning for Coders},
author = {Howard, Jeremy},
year = {2024},
file = {Snapshot:/Users/micl/Zotero/storage/YQCZDBMT/course.fast.ai.html:text/html},
}
@misc{heiss_marginalia_2022,
title = {Marginalia: {A} guide to figuring out what the heck marginal effects, marginal slopes, average marginal effects, marginal effects at the mean, and all these other marginal things are},
shorttitle = {Marginalia},
url = {https://www.andrewheiss.com/blog/2022/05/20/marginalia/#what-are-marginal-effects},
abstract = {Define what marginal effects even are, and then explore the subtle differences between average marginal effects, marginal effects at the mean, and marginal effects at representative values with the marginaleffects and emmeans R packages},
language = {en},
urldate = {2024-02-11},
journal = {Andrew Heiss},
author = {Heiss, Andrew},
year = {2022},
file = {Snapshot:/Users/micl/Zotero/storage/2RQHBN5I/marginalia.html:text/html},
}
@misc{arel-bundock_marginal_2024,
title = {Marginal {Effects} {Zoo}},
url = {https://marginaleffects.com/},
urldate = {2024-02-11},
author = {Arel-Bundock, Vincent},
year = {2024},
file = {Marginal Effects Zoo:/Users/micl/Zotero/storage/2W6CT52S/marginaleffects.com.html:text/html},
}
@misc{quantmetry_mapie_2024,
title = {{MAPIE} - {Model} {Agnostic} {Prediction} {Interval} {Estimator}},
note = {Documentation for version 0.8.2},
url = {https://mapie.readthedocs.io/en/latest/},
urldate = {2024-03-06},
author = {{Quantmetry}},
year = {2024},
file = {MAPIE - Model Agnostic Prediction Interval Estimator — MAPIE 0.8.2 documentation:/Users/micl/Zotero/storage/N4XWTVTF/latest.html:text/html},
}
@misc{hvitfeldt_feature_2024,
title = {Feature {Engineering} {A}-{Z}},
url = {https://feaz-book.com/},
language = {en},
urldate = {2024-03-10},
journal = {Feature Engineering A-Z},
author = {Hvitfeldt, Emil},
month = jan,
year = {2024},
file = {Snapshot:/Users/micl/Zotero/storage/IM3A6CL5/feaz-book.com.html:text/html},
}
@misc{google_classification_2024,
title = {Classification: {ROC} {Curve} and {AUC}},
shorttitle = {Classification},
url = {https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc},
language = {en},
urldate = {2024-03-12},
journal = {Google for Developers},
author = {{Google}},
year = {2024},
file = {Snapshot:/Users/micl/Zotero/storage/2LNC83PH/roc-and-auc.html:text/html},
}
@misc{penn_state_54_2018,
title = {5.4 - {A} {Matrix} {Formulation} of the {Multiple} {Regression} {Model}},
url = {https://online.stat.psu.edu/stat462/node/132/},
urldate = {2024-03-12},
author = {{Penn State Department of Statistics}},
year = {2018},
note = {Source of img/matrix\_linreg.png},
file = {5.4 - A Matrix Formulation of the Multiple Regression Model | STAT 462:/Users/micl/Zotero/storage/8AKZVNBW/132.html:text/html},
}
@misc{chernozhukov_applied_2024,
title = {Applied {Causal} {Inference} {Powered} by {ML} and {AI}},
url = {http://arxiv.org/abs/2403.02467},
abstract = {An introduction to the emerging fusion of machine learning and causal inference. The book presents ideas from classical structural equation models (SEMs) and their modern AI equivalent, directed acyclical graphs (DAGs) and structural causal models (SCMs), and covers Double/Debiased Machine Learning methods to do inference in such models using modern predictive tools.},
urldate = {2024-03-12},
publisher = {arXiv},
author = {Chernozhukov, Victor and Hansen, Christian and Kallus, Nathan and Spindler, Martin and Syrgkanis, Vasilis},
month = mar,
year = {2024},
note = {arXiv:2403.02467 [cs, econ, stat]},
keywords = {Statistics - Methodology, Statistics - Machine Learning, Computer Science - Machine Learning, Economics - Econometrics},
file = {arXiv.org Snapshot:/Users/micl/Zotero/storage/5WLWIMVI/2403.html:text/html;Full Text PDF:/Users/micl/Zotero/storage/VI7X2QTU/Chernozhukov et al. - 2024 - Applied Causal Inference Powered by ML and AI.pdf:application/pdf},
}
@book{mckinney_python_2023,
edition = {3},
title = {Python for {Data} {Analysis}},
url = {https://wesmckinney.com/book/},
language = {en},
urldate = {2024-03-19},
author = {McKinney, Wes},
year = {2023},
file = {Snapshot:/Users/micl/Zotero/storage/GFE2YV2S/book.html:text/html},
}
@book{navarro_learning_2018,
title = {Learning {Statistics} with {R}},
url = {https://learningstatisticswithr.com},
language = {en},
author = {Navarro, Danielle},