1
- # FROM quay.io/jupyter/pyspark-notebook@sha256:95ef0825db8d4ce411bafe93be230d8ff59d0c2a45038ca1fd4ac4c3c5474ddb
2
- # FROM quay.io/jupyter/pyspark-notebook:aarch64-2023-12-18
3
- FROM quay.io/jupyter/pyspark-notebook@sha256:9863dd81c0b2d047232c60f9b96aabb9a3c561161960eea48761618a891daa35
1
+ FROM vnijs/rsm-msba-arm:2.9.1
4
2
5
3
LABEL Vincent Nijs
"[email protected] "
6
4
@@ -13,301 +11,8 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"]
13
11
14
12
USER root
15
13
16
- # fixes the issue where sudo requires terminal for password when starting postgres
17
- RUN echo "${NB_USER} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
18
-
19
- RUN apt-get update -qq && apt-get -y --no-install-recommends install \
20
- supervisor \
21
- openssh-server \
22
- libcurl4-openssl-dev \
23
- zsh \
24
- vim \
25
- vifm \
26
- wget \
27
- rsync \
28
- lsb-release \
29
- git \
30
- netcat \
31
- htop \
32
- openjdk-17-jdk-headless \
33
- ant \
34
- ca-certificates-java \
35
- && apt-get clean \
36
- && update-ca-certificates -f;
37
-
38
- ENV CMDSTAN_VERSION="2.33.1"
39
- ENV PANDAS_VERSION="2.1.4"
40
- # ENV PANDAS_VERSION="2.0.3" # pyspark image still using 2.0.3
41
- ENV PYARROW_VERSION="14.0.1"
42
- RUN mamba install --quiet --yes -c conda-forge \
43
- pandas=${PANDAS_VERSION} \
44
- cmdstan=${CMDSTAN_VERSION} \
45
- cmdstanpy \
46
- sqlalchemy \
47
- psycopg2 \
48
- ipython-sql \
49
- beautifulsoup4 \
50
- scikit-learn \
51
- mlxtend \
52
- xgboost \
53
- lightgbm \
54
- graphviz \
55
- lime \
56
- shap \
57
- spacy \
58
- pydotplus \
59
- networkx \
60
- seaborn \
61
- plotnine \
62
- selenium \
63
- sqlalchemy \
64
- pyLDAvis \
65
- python-dotenv \
66
- statsmodels \
67
- linearmodels \
68
- jupyterlab_widgets \
69
- jupytext \
70
- black \
71
- isort \
72
- nltk \
73
- jupyter-server-proxy \
74
- jupyter-rsession-proxy \
75
- streamlit \
76
- xlrd \
77
- openpyxl \
78
- pyarrow=${PYARROW_VERSION} \
79
- python-duckdb \
80
- duckdb-engine \
81
- bash_kernel \
82
- sympy \
83
- simpy \
84
- awscli \
85
- bokeh \
86
- dask-kubernetes \
87
- dask-ml \
88
- findspark \
89
- pyspark \
90
- plotly \
91
- && python -m bash_kernel.install
92
-
93
- # causing issues with 1/12/2023 update
94
- # snowflake-connector-python
95
-
96
- COPY files/setup-ml-frameworks.sh setup.sh
97
- RUN chmod 755 setup.sh \
98
- && ./setup.sh \
99
- && rm setup.sh
100
-
101
- # make system (conda) R the first choice
102
- ENV R_VERSION=4.3.2
103
- ENV TERM=xterm
104
- ENV R_HOME=/opt/conda/lib/R
105
- ENV LD_LIBRARY_PATH="/opt/conda/lib:/usr/local/lib:${LD_LIBRARY_PATH}"
106
- ENV PATH="/usr/local/bin:$PATH"
107
-
108
- RUN mamba install --quiet --yes -c conda-forge \
109
- c-compiler \
110
- "r-base>=${R_VERSION}" \
111
- r-curl \
112
- binutils \
113
- libgit2 \
114
- freetype \
115
- libpng \
116
- libtiff \
117
- libjpeg-turbo \
118
- libxml2 \
119
- unixodbc \
120
- rpy2 \
121
- jupyterlab-variableinspector \
122
- jupyterlab_code_formatter \
123
- openssh \
124
- git \
125
- && ln -s /opt/conda/bin/R /usr/local/bin/R \
126
- && ln -s /opt/conda/bin/Rscript /usr/local/bin/Rscript
127
-
128
- # not available through conda-forge for both arm and amd
129
- # or the conda version is causing issues
130
- RUN pip install \
131
- jupyterlab-skip-traceback \
132
- radian \
133
- polars \
134
- connectorx \
135
- xlsx2csv \
136
- jupysql \
137
- shiny \
138
- shinywidgets \
139
- pyrsm
140
-
141
- # catboost # not available for arm64
142
-
143
- # connectorx is default for sql stuff in polars but is not built for aarch64
144
- # had to do that manually with a docker file
145
- # see https://github.com/sfu-db/connector-x/issues/386
146
- ENV wheel_name=connectorx-0.3.2-cp311-cp311-manylinux_2_34_aarch64.whl
147
- COPY files/connectorx/${wheel_name} ${wheel_name}
148
- RUN pip install ${wheel_name}
149
-
150
- RUN echo "R_LIBS_USER='~/.rsm-msba/R/${R_VERSION}'" >> ${R_HOME}/etc/Renviron.site
151
- RUN echo '.libPaths(unique(c(Sys.getenv("R_LIBS_USER"), .libPaths())))' >> ${R_HOME}/etc/Rprofile.site
152
-
153
- COPY files/setup-tidyverse.sh setup.sh
154
- RUN chmod +x setup.sh \
155
- && ./setup.sh \
156
- && rm setup.sh
157
-
158
- # packages needed for radiant and reproducible analysis
159
- COPY files/setup-radiant.sh setup.sh
160
- RUN chmod +x setup.sh \
161
- && ./setup.sh \
162
- && rm setup.sh
163
-
164
- # tooling for Bayesian Machine Learning class
165
- # COPY files/setup-bml.sh setup.sh
166
- # RUN chmod +x setup.sh \
167
- # && ./setup.sh \
168
- # && rm setup.sh
169
-
170
- # adding postgres
171
- # mostly from https://docs.docker.com/engine/examples/postgresql_service/
172
- ENV POSTGRES_VERSION=14
173
-
174
- # upgrade to postgres 14
175
- RUN apt -y update && \
176
- apt -y upgrade && \
177
- apt -y install gpgv dirmngr wget vim && \
178
- sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list' && \
179
- wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - && \
180
- apt -y update && \
181
- apt-get install -y \
182
- postgresql-${POSTGRES_VERSION} \
183
- postgresql-client-${POSTGRES_VERSION} \
184
- postgresql-contrib-${POSTGRES_VERSION}
185
-
186
- # Run the rest of the commands as the postgres user
187
- RUN addgroup ${NB_USER} postgres \
188
- && addgroup postgres users \
189
- && chown -R postgres:postgres /etc/postgresql/${POSTGRES_VERSION}/ \
190
- && chown -R postgres:postgres /var/lib/postgresql/${POSTGRES_VERSION}/ \
191
- && chmod -R u=rwX,go= /var/lib/postgresql/${POSTGRES_VERSION}/
192
-
193
- USER postgres
194
-
195
- ARG PGPASSWORD=${PGPASSWORD:-postgres}
196
- ENV PGPASSWORD=${PGPASSWORD}
197
-
198
- # create a postgres role for ${NB_USER} with "postgres" as the password
199
- # create a database "rsm-docker" owned by the ${NB_USER} role.
200
- RUN /etc/init.d/postgresql start \
201
- && psql --command "CREATE USER ${NB_USER} WITH SUPERUSER PASSWORD '${PGPASSWORD}';" \
202
- && createdb -O ${NB_USER} rsm-docker
203
-
204
- COPY files/postgresql.conf /etc/postgresql/${POSTGRES_VERSION}/main/postgresql.conf
205
- COPY files/pg_hba.conf /etc/postgresql/${POSTGRES_VERSION}/main/pg_hba.conf
206
-
207
- USER root
208
-
209
- # populate version number in conf file: replace every __version__ placeholder
- # with the value of POSTGRES_VERSION. The sed script is one double-quoted
- # argument so the shell expands the variable without splitting the script
- # into several words (which sed would reject as an unterminated `s' command).
210
- RUN sed -i "s/__version__/${POSTGRES_VERSION}/g" "/etc/postgresql/${POSTGRES_VERSION}/main/postgresql.conf"
211
-
212
- RUN addgroup ${NB_USER} postgres \
213
- && chown -R postgres:postgres /etc/postgresql/${POSTGRES_VERSION}/main/ \
214
- && fix-permissions /etc/postgresql/${POSTGRES_VERSION}/main/
215
-
216
- # from https://github.com/ucsd-ets/rsm-msba-datahub/blob/master/Dockerfile
217
- # RUN chmod -R 777 /etc/postgresql/${POSTGRES_VERSION}
218
- # RUN chmod -R 777 /var/lib/postgresql/
219
-
220
- # oh-my-zsh (need to install wget and curl again ...)
221
- RUN apt-get update -qq && apt-get -y --no-install-recommends install wget curl \
222
- && sh -c "$(curl -fsSL https://raw.github.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" \
223
- && git clone https://github.com/zsh-users/zsh-completions ${ZSH_CUSTOM:=~/.oh-my-zsh/custom}/plugins/zsh-completions \
224
- && git clone https://github.com/zsh-users/zsh-autosuggestions ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-autosuggestions \
225
- && git clone https://github.com/zsh-users/zsh-syntax-highlighting.git ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-syntax-highlighting \
226
- && git clone https://github.com/supercrabtree/k ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/k \
227
- && git clone --depth=1 https://github.com/romkatv/powerlevel10k.git ${ZSH_CUSTOM:-$HOME/.oh-my-zsh/custom}/themes/powerlevel10k \
228
- && cp -R /home/jovyan/.oh-my-zsh /etc/skel/.oh-my-zsh
229
-
230
- COPY files/zshrc /etc/skel/.zshrc
231
- COPY files/p10k.zsh /etc/skel/.p10k.zsh
232
- COPY files/usethis /usr/local/bin/usethis
233
- COPY files/clean.sh /usr/local/bin/clean
234
-
235
- # settings for local install of python packages
236
- ARG PYBASE=/home/${NB_USER}/.rsm-msba
237
- ENV PYBASE=${PYBASE}
238
- ENV PYTHONUSERBASE=${PYBASE} \
239
- JUPYTER_PATH=${PYBASE}/share/jupyter \
240
- JUPYTER_DATA_DIR=${PYBASE}/share/jupyter \
241
- JUPYTER_CONFIG_DIR=${PYBASE}/jupyter \
242
- JUPYTER_RUNTIME_DIR=/tmp/jupyter/runtime \
243
- RSTUDIO_WHICH_R=/usr/local/bin/R \
244
- SHELL=/bin/zsh \
245
- ZDOTDIR=/home/${NB_USER}/.rsm-msba/zsh \
246
- CMDSTAN="/opt/cmdstan/cmdstan-${CMDSTAN_VERSION}"
247
-
248
- COPY files/install-rstudio.sh setup.sh
249
- RUN chmod 755 setup.sh \
250
- && ./setup.sh \
251
- && rm setup.sh
252
-
253
- # setup quarto - can be used with Rstudio
254
- # and when connecting to running container
255
- # from VSCode
256
- COPY files/setup-quarto.sh setup.sh
257
- RUN chmod +x setup.sh \
258
- && ./setup.sh \
259
- && rm setup.sh
260
-
261
- # updating the supervisord.conf file for Jupyter and the notebook_config file
262
- COPY files/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
263
- COPY files/condarc /opt/conda/.condarc
264
- RUN mkdir -p /var/log/supervisor \
265
- && fix-permissions /var/log/supervisor \
266
- && fix-permissions /etc/supervisor/conf.d/ \
267
- && fix-permissions "${CONDA_DIR}"
268
-
269
- # copy base conda environment management script
270
- COPY files/ccenv.sh /usr/local/bin/ccenv
271
- COPY files/cl.sh /usr/local/bin/cl
272
- COPY files/cr.sh /usr/local/bin/cr
273
- COPY files/ci.sh /usr/local/bin/ci
274
- COPY files/ce.sh /usr/local/bin/ce
275
-
276
- # Copy the launch script into the image
277
- COPY launch-${DOCKERHUB_NAME}.sh /opt/launch.sh
278
- COPY files/setup.sh /usr/local/bin/setup
279
- RUN fix-permissions /etc/skel \
280
- && fix-permissions /usr/local/bin \
281
- && chmod 755 /usr/local/bin/*
282
-
283
- # get pgweb
284
- RUN wget -O pgweb.zip https://github.com/sosedoff/pgweb/releases/download/v0.11.11/pgweb_linux_arm64_v7.zip \
285
- && unzip pgweb.zip -d pgweb_dir \
286
- && rm pgweb.zip \
287
- && mv pgweb_dir/* /usr/local/bin/pgweb \
288
- && rm -rf pgweb_dir
289
-
290
- # setting up jupyter-server-proxy extensions pgweb, gitgadget, and radiant
291
- RUN pip install git+https://github.com/vnijs/jupyter-pgweb-proxy.git \
292
- && pip install git+https://github.com/vnijs/jupyter-gitgadget-proxy.git \
293
- && pip install git+https://github.com/vnijs/jupyter-radiant-proxy.git
294
-
295
- # extra setup steps (files/setup-extra.sh)
296
- COPY files/setup-extra.sh setup.sh
297
- RUN chmod +x setup.sh \
298
- && ./setup.sh \
299
- && rm setup.sh
300
-
301
- RUN mamba update --yes pandoc \
302
- && mamba clean --all -f -y \
303
- && fix-permissions "${CONDA_DIR}" \
304
- && fix-permissions "/home/${NB_USER}"
305
-
306
- # packages needed for arrow
307
- COPY files/setup-arrow.sh setup.sh
308
- RUN chmod +x setup.sh \
309
- && ./setup.sh \
310
- && rm setup.sh
14
+ # install lsof from apt; the package cache and lists are removed in the same
+ # layer so they do not end up stored in the image
+ RUN apt-get update -qq && apt-get -y --no-install-recommends install lsof \
+     && apt-get clean \
+     && rm -rf /var/lib/apt/lists/*
15
+ # upgrade pyrsm; --no-cache-dir keeps pip's download cache out of the layer
+ RUN pip install --no-cache-dir --upgrade pyrsm
311
16
312
17
# setup hadoop
313
18
# JAVA_HOME is set to the java-17-openjdk-arm64 JVM directory. The key=value
# form is used because the space-separated `ENV key value` syntax is legacy
# and discouraged by the Dockerfile reference.
ENV JAVA_HOME="/usr/lib/jvm/java-17-openjdk-arm64/"
@@ -325,7 +30,6 @@ ADD files/scalable_analytics/hdfs-site.xml $HADOOP_HOME/etc/hadoop/
325
30
# Add the init/start/stop DFS scripts from files/scalable_analytics/ to /opt/hadoop/.
ADD files/scalable_analytics/init-dfs.sh /opt/hadoop/
326
31
ADD files/scalable_analytics/start-dfs.sh /opt/hadoop/
327
32
ADD files/scalable_analytics/stop-dfs.sh /opt/hadoop/
328
- ADD files/scalable_analytics/hadoop.sh /usr/bin/hadoop
329
33
# Recursively hand ownership of ${HADOOP_HOME} to ${NB_USER}, then set mode 755
# on the *.sh files directly under ${HADOOP_HOME} and on /usr/bin/hadoop.
# NOTE(review): this change removes the ADD that placed hadoop.sh at
# /usr/bin/hadoop, yet the last chmod still targets that path. Presumably the
# base image already provides it; confirm, otherwise this RUN step fails.
RUN chown -R ${NB_USER} ${HADOOP_HOME} \
330
34
&& chmod 755 ${HADOOP_HOME}/*.sh \
331
35
&& chmod 755 /usr/bin/hadoop
0 commit comments