\documentclass[handout]{beamer}
%\usepackage{pgfpages}
%%\setbeameroption{show notes}
%\setbeameroption{show notes on second screen=right}
\setbeamertemplate{caption}[numbered]
\usepackage{tikz}
\usetikzlibrary{calc, matrix, arrows, automata, shapes, positioning, shadows, trees}
\usepackage{graphicx}
\usepackage{multimedia}
\usepackage[latin1]{inputenc}
\usecolortheme{lily}
\setbeamercovered{transparent}
\usepackage{pgfplots}
\usepackage{pgfplotstable}
\usepackage{color}
\usepackage{subfigure}
\tikzset{basic/.style={draw}}
\tikzset{input/.style={basic, circle}}
\tikzset{bias/.style={basic, rectangle}}
\tikzset{neuron/.style={basic, circle}}
\tikzset{>=stealth, font=\scriptsize}
\tikzset{sectors/.style n args={2}{%
circle, draw, minimum width=3.444em,
append after command={%
\pgfextra{ %
\draw (\tikzlastnode.north) -- (\tikzlastnode.south);
\path (\tikzlastnode.center) -- node[] {#1} (\tikzlastnode.west);
\path (\tikzlastnode.center) -- node[] {#2} (\tikzlastnode.east);
}
}
}
}
% argmax
\newcommand{\argmax}[1]{\underset{#1}{\operatorname{arg}\,\operatorname{max}}\;}
\title[\insertdate]{Nonparametric Bayes}
\author{Omar Guti\'errez}
\institute{@trinogz}
\date{June 15, 2015}
\begin{document}
\begin{frame}
\titlepage
Mostly based on \textbf{A Tutorial on Bayesian Nonparametric Models} by Samuel J. Gershman and David M. Blei.
\end{frame}
\note{
Nothing
}
\begin{frame}
\frametitle{Outline}
\tableofcontents
\end{frame}
\note{
Nothing
}
\section{Introduction}
\begin{frame}{Introduction}
\begin{itemize}
%\item Before we start: statistics was already there, even before Machine Learning (ML)
\item What we do in ML is fit a model to the data
\item That is, we adjust the values of certain parameters
\end{itemize}
\end{frame}
\note{
\begin{itemize}
\item Machine Learning in a nutshell
\begin{itemize}
\item Cloud - Internet
\item Ubiquitous computing - IoT
\item Eigenfaces
\item Statistics - Data Science
\item That is marketing :)
\end{itemize}
\end{itemize}
}
\begin{frame}{Linear Regression}
\begin{figure}[H]
\centering
\input{diagrams/regression.tex}
\caption{Linear Regression}
\end{figure}
\end{frame}
\note{
In linear models, we adjust the $\beta$ values
}
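\begin{frame}[fragile]{Sketch: adjusting the $\beta$ values by least squares}
A minimal sketch with synthetic data (NumPy; the numbers are illustrative only):
{\scriptsize
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
x = rng.uniform(0, 10, size=50)
y = 2.0 * x + 1.0 + rng.normal(scale=1.0, size=50)  # true betas: 2.0, 1.0

X = np.column_stack([x, np.ones_like(x)])     # design matrix [x, 1]
beta, *_ = np.linalg.lstsq(X, y, rcond=None)  # least-squares fit
print(beta)                                   # approximately [2.0, 1.0]
\end{verbatim}
}
\end{frame}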
\begin{frame}{Neural Networks}
\begin{figure}[H]
\centering
\input{diagrams/perceptron_and.tex}
\caption{Perceptron}
\end{figure}
\end{frame}
\note{
In neural networks, we adjust the value of the weights.
}
\begin{frame}{Hidden Markov Models}
\begin{figure}[H]
\centering
\input{diagrams/hidden.tex}
\caption{Hidden Markov Models}
\end{figure}
\end{frame}
\note{
In HMMs, we adjust the transition and emission probabilities and infer the hidden states
}
\begin{frame}{Bertrand Russell's Inductivist Turkey}
\begin{figure}[H]
\centering
\subfigure[One Model]{
\input{diagrams/model_a.tex}
}\quad
\subfigure[Another Model]{
\input{diagrams/model_b.tex}
}
\caption{A comparison of models}
\end{figure}
\end{frame}
\note{
Now that we know more about models, which is the best model?
}
\begin{frame}{Bertrand Russell's Inductivist Turkey}
\begin{figure}[H]
\centering
\subfigure[One Model]{
\input{diagrams/model_c.tex}
}\quad
\subfigure[Another Model]{
\input{diagrams/model_d.tex}
}
\caption{A comparison of models}
\end{figure}
\end{frame}
\note{
However, these models are incomplete
}
\begin{frame}{Bayesian Learning}
\Huge
\begin{equation}
\textcolor{red}{P(h|D)} =
\frac{
\textcolor{green}{P(D|h)}
\textcolor{blue}{P(h)}
}
{\textcolor{brown}{P(D)}}
\end{equation}
\end{frame}
\note{
\begin{itemize}
\item And now, let's talk about the models we are concerned with
\item The interesting patterns in our datasets are governed by probability distributions ...
\item ... and by reasoning about these distributions we can make inferences
\item Bayesian reasoning gives us a basis for manipulating probabilities
\item Using Bayes' theorem does not make a method Bayesian, just as using calculus does not make us Newtonian
\item $P(h)$: prior probability of hypothesis $h$, our prior knowledge
\item $P(D)$: prior probability of the training data
\item $P(D|h)$: likelihood of the data under $h$
\item $P(h|D)$: posterior probability of $h$ given the data $D$
\end{itemize}
}
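\begin{frame}[fragile]{Sketch: Bayes' rule with two hypotheses}
A minimal numeric sketch; the priors and likelihoods below are made up for illustration:
{\scriptsize
\begin{verbatim}
# P(h|D) = P(D|h) P(h) / P(D), for two competing hypotheses
prior      = {"h1": 0.7, "h2": 0.3}   # P(h)
likelihood = {"h1": 0.2, "h2": 0.9}   # P(D|h)

evidence = sum(likelihood[h] * prior[h] for h in prior)              # P(D)
posterior = {h: likelihood[h] * prior[h] / evidence for h in prior}  # P(h|D)
print(posterior)   # h2 becomes more probable after seeing D
\end{verbatim}
}
\end{frame}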
\begin{frame}{MAP and Maximum Likelihood Estimation}
\Large
\begin{equation}
\begin{aligned}
h_{MAP} &\equiv \argmax {h \in H} P(h|D)\\
&= \argmax {h \in H} \frac{P(D|h)P(h)}{P(D)}\\
&= \argmax {h \in H} P(D|h)P(h)\\
h_{MLE} &= \argmax {h \in H} P(D|h)
\end{aligned}
\end{equation}
\end{frame}
\note{
\begin{itemize}
\item If you want to learn machine learning, take this to heart
\item Now we have seen the very basic idea behind Bayesian reasoning...
\item ... we will return to this later
\end{itemize}
}
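\begin{frame}[fragile]{Sketch: MLE vs.\ MAP for a coin's bias}
A minimal sketch over a grid of hypotheses; the data and the prior are illustrative assumptions:
{\scriptsize
\begin{verbatim}
import numpy as np

data = np.array([1, 1, 1, 0, 1, 1, 0, 1])        # 6 heads, 2 tails
thetas = np.linspace(0.01, 0.99, 99)              # hypotheses for P(heads)

heads, tails = data.sum(), len(data) - data.sum()
likelihood = thetas**heads * (1 - thetas)**tails  # P(D|h)
prior = np.exp(-((thetas - 0.5)**2) / 0.02)       # favours a fair coin

print("MLE:", thetas[np.argmax(likelihood)])          # 0.75
print("MAP:", thetas[np.argmax(likelihood * prior)])  # pulled towards 0.5
\end{verbatim}
}
\end{frame}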
\section{Example: clustering}
\begin{frame}{Data is a mess}
\begin{itemize}
\item The articles in Wikipedia
\item The species in the planet
\item The hashtags on Twitter
\end{itemize}
\end{frame}
\note{
Let's think about the following examples...
The data keeps evolving
}
\subsection{Traditional Approach}
\begin{frame}{How the problem is \textit{sometimes} addressed}
\begin{itemize}
\item Let's start with the classic approach
\item Let's do clustering
\item Let's use Gaussian Mixture Models (GMM)
\item We can fit several models and compare them with a model-selection criterion such as BIC (a short code sketch follows the figures)
% add an original scatterplot
% add an image with clusters classified
% add an image with BIC
\end{itemize}
\end{frame}
\note{
\begin{itemize}
\item We can use Gaussian Mixture Models
\item How many components should I use in my mixture model?
\item Make the connection between maximum likelihood, EM, and GMMs
\item Make the connection between maximum likelihood and linear models
%\item Learning more from the data as we see it.
\end{itemize}
}
\begin{frame}{How the problem is \textit{sometimes} addressed}
\begin{figure}[H]
\centering
\subfigure[2]{
\includegraphics[width=0.30\textwidth]{code/generated_images/bic_compo_2.jpg}
}\quad
\subfigure[3]{
\includegraphics[width=0.30\textwidth]{code/generated_images/bic_compo_3.jpg}
}\quad
\subfigure[4]{
\includegraphics[width=0.30\textwidth]{code/generated_images/bic_compo_4.jpg}
}\quad
\subfigure[5]{
\includegraphics[width=0.30\textwidth]{code/generated_images/bic_compo_5.jpg}
}
\subfigure[6]{
\includegraphics[width=0.30\textwidth]{code/generated_images/bic_compo_6.jpg}
}
\caption{A comparison of GMM clusterings with different numbers of components}
\end{figure}
\end{frame}
\begin{frame}{How the problem is \textit{sometimes} addressed}
\begin{figure}[H]
\centering
\includegraphics[width=0.75\textwidth]{code/generated_images/result_bic.jpg}
\caption{Bayesian Information Criterion (BIC)}
\end{figure}
\end{frame}
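\begin{frame}[fragile]{Sketch: picking the number of components with BIC}
A minimal sketch of the finite-mixture approach above, using scikit-learn's
\texttt{GaussianMixture} on synthetic data (not the exact code behind the figures):
{\scriptsize
\begin{verbatim}
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(loc=c, scale=0.5, size=(100, 2))
               for c in ([0, 0], [4, 4], [0, 5])])  # three synthetic clusters

bics = {k: GaussianMixture(n_components=k, random_state=0).fit(X).bic(X)
        for k in range(2, 7)}
best_k = min(bics, key=bics.get)                    # lowest BIC wins
print(bics, "-> best number of components:", best_k)  # expected: 3
\end{verbatim}
}
\end{frame}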
\subsection{Alternative Approach}
\begin{frame}{How we can \textit{alternatively} approach the problem}
\begin{itemize}
\item Another interesting approach is to use Bayesian Nonparametric (BNP) models
\item BNP models build a model that can adapt its complexity to the data
\end{itemize}
\end{frame}
\note{
\begin{itemize}
\item Learning more from the data as we see it
\end{itemize}
}
\begin{frame}{Bayesian nonparametric models}
\begin{figure}[H]
\centering
\input{diagrams/bayesian_models.tex}
\end{figure}
\end{frame}
\begin{frame}{Chinese Restaurant Process}
\begin{figure}[H]
\centering
\includegraphics[width=0.75\textwidth]{figures/crp2.png}\par
\end{figure}
\begin{itemize}
\item Infinite number of tables
\item A sequence of customers entering the restaurant and sitting down
\item The first customer enters and sits at the first table
\item The second customer enters and sits...
\begin{itemize}
\item at the first table with probability $\frac{1}{1 + \alpha}$
\item at the second table with probability $\frac{\alpha}{1 + \alpha}$
\end{itemize}
\end{itemize}
\end{frame}
\note{
\begin{itemize}
\item We realize that the CRP defines a clustering: $K$ groups, each of size $N_k$ (a short simulation sketch follows)
\end{itemize}
}
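\begin{frame}[fragile]{Sketch: simulating the Chinese Restaurant Process}
A minimal simulation of the seating rule above; $\alpha$ and the number of customers are arbitrary choices:
{\scriptsize
\begin{verbatim}
import numpy as np

def crp(n_customers, alpha, seed=0):
    rng = np.random.default_rng(seed)
    tables = [1]                         # first customer sits alone
    for n in range(2, n_customers + 1):
        # N_k/(n-1+alpha) for table k, alpha/(n-1+alpha) for a new one
        probs = np.array(tables + [alpha]) / (n - 1 + alpha)
        choice = rng.choice(len(probs), p=probs)
        if choice == len(tables):
            tables.append(1)             # open a new table
        else:
            tables[choice] += 1          # join an existing table
    return tables

print(crp(100, alpha=1.0))  # a handful of tables of uneven sizes
\end{verbatim}
}
\end{frame}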
\begin{frame}{How we can \textit{alternatively} approach the problem}
{\centering
\begin{figure}[H]
\includegraphics[width=0.75\textwidth]{code/generated_images/dpgmm.jpg}
\caption{Points clustered with an infinite GMM}
\end{figure}
}
\end{frame}
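\begin{frame}[fragile]{Sketch: an ``infinite'' GMM in scikit-learn}
A minimal sketch with synthetic data, using the truncated Dirichlet-process mixture
in \texttt{BayesianGaussianMixture} (not the exact code behind the figure):
{\scriptsize
\begin{verbatim}
import numpy as np
from sklearn.mixture import BayesianGaussianMixture

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(loc=c, scale=0.5, size=(100, 2))
               for c in ([0, 0], [4, 4], [0, 5])])  # three synthetic clusters

dpgmm = BayesianGaussianMixture(
    n_components=10,   # truncation level, not the answer
    weight_concentration_prior_type="dirichlet_process",
    random_state=0,
).fit(X)

labels = dpgmm.predict(X)
print("clusters actually used:", np.unique(labels).size)  # typically ~3
\end{verbatim}
}
\end{frame}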
\begin{frame}{What else can be done?}
{\centering
\begin{figure}[H]
\includegraphics[width=0.75\textwidth]{figures/mnist.png}
\caption{Digit recognition (datamicroscopes)}
\end{figure}
}
\end{frame}
\begin{frame}{What else can be done?}
{\centering
\begin{figure}[H]
\includegraphics[width=0.75\textwidth]{figures/topic_modeling.png}
\caption{Topic Modeling (datamicroscopes)}
\end{figure}
}
\end{frame}
\section{Conclusions}
\begin{frame}{Recap: Bayesian parametric vs nonparametric models}
\begin{itemize}
\item Traditional approach (finite)
\begin{itemize}
\item The number of parameters $\theta$ (e.g., the number of clusters) is prespecified
\item We have a prior distribution over parameters $P(\theta)$
\item For example, in a Gaussian mixture model, each cluster is modelled
with a parametric distribution (e.g., a Gaussian)
\end{itemize}
\item Bayesian nonparametric models
\begin{itemize}
\item We assume that there is an \textbf{infinite} number of latent clusters
\item A finite number of clusters is \textit{inferred} from data
\item The number of clusters grows as new data points are observed
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}{Libraries in Python}
\begin{itemize}
\item scikit-learn (sklearn)
\item datamicroscopes
\end{itemize}
\end{frame}
%\section{Bayesian Inference}
%\begin{frame}{Bayesian Inference}
% \begin{itemize}
% \item $P(A|D) \propto P(D|A) P(A)$
% \end{itemize}
%\end{frame}
%\section{Generative Model}
%\begin{frame}{Generative Model}
% \begin{itemize}
% \item *
% \end{itemize}
%\end{frame}
\begin{frame}{What else to learn?}
\begin{itemize}
\item What is the Beta distribution?
\item What is the Dirichlet distribution?
\item Dirichlet process
\end{itemize}
\end{frame}
\begin{frame}{References}
\begin{itemize}
\item \textbf{Machine Learning} by Tom Mitchell
\item \textbf{A Tutorial on Bayesian Nonparametric Models} by Samuel J. Gershman and David M. Blei
\item \textbf{datamicroscopes} library
\end{itemize}
\end{frame}
\begin{frame}
\huge{Thank you}\\
\huge{Questions?}\\
\end{frame}
\end{document}