Skip to content

Commit f1beceb

Browse files
committed
2.9.2 hadoop
1 parent 10e2b6d commit f1beceb

File tree

9 files changed

+703
-604
lines changed

9 files changed

+703
-604
lines changed

files/scalable_analytics/core-site.xml

+1-1
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,6 @@
19 19
<configuration>
20 20
<property>
21 21
<name>fs.defaultFS</name>
22-
<value>hdfs://localhost:9000</value>
22+
<value>hdfs://localhost:9100</value>
23 23
</property>
24 24
</configuration>

files/scalable_analytics/hadoop.sh

-7
This file was deleted.

files/scalable_analytics/init-dfs.sh

+5-2
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,7 @@
1 1
#!/bin/bash
2 2

3-
$HADOOP_HOME/bin/hdfs namenode -format
4-
echo `${HADOOP_HOME}/bin/hdfs getconf -confKey dfs.datanode.data.dir` | cut -c8- | xargs rm -r
3+
mkdir -p /tmp/hadoop-root/dfs/name
4+
mkdir -p /tmp/hadoop-jovyan/dfs/data
5+
sed -i '$a\# Add the line for suppressing the NativeCodeLoader warning \nlog4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR,console' /$HADOOP_HOME/etc/hadoop/log4j.properties
6+
$HADOOP_HOME/bin/hdfs namenode -format -force
7+
echo `${HADOOP_HOME}/bin/hdfs getconf -confKey dfs.datanode.data.dir` | cut -c8- | xargs rm -r

files/setup-hadoop.sh

+1-1
Original file line number | Diff line number | Diff line change
@@ -11,4 +11,4 @@ curl -sL --retry 3 \
11 11
| tar -x --strip-components=1 -C $HADOOP_HOME \
12 12
&& rm -rf $HADOOP_HOME/share/doc \
13 13
&& chown -R ${NB_USER} $HADOOP_HOME \
14-
&& mkdir "${HADOOP_HOME}/logs"
14+
&& mkdir -p "${HADOOP_HOME}/logs"

rsm-msba-arm/Dockerfile

+3-299
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,4 @@
1-
# FROM quay.io/jupyter/pyspark-notebook@sha256:95ef0825db8d4ce411bafe93be230d8ff59d0c2a45038ca1fd4ac4c3c5474ddb
2-
# FROM quay.io/jupyter/pyspark-notebook:aarch64-2023-12-18
3-
FROM quay.io/jupyter/pyspark-notebook@sha256:9863dd81c0b2d047232c60f9b96aabb9a3c561161960eea48761618a891daa35
1+
FROM vnijs/rsm-msba-arm:2.9.1
4 2

5 3
LABEL Vincent Nijs "[email protected]"
6 4

@@ -13,301 +11,8 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"]
13 11

14 12
USER root
15 13

16-
# fixes the issue where sudo requires terminal for password when starting postgres
17-
RUN echo "${NB_USER} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
18-
19-
RUN apt-get update -qq && apt-get -y --no-install-recommends install \
20-
supervisor \
21-
openssh-server \
22-
libcurl4-openssl-dev \
23-
zsh \
24-
vim \
25-
vifm \
26-
wget \
27-
rsync \
28-
lsb-release \
29-
git \
30-
netcat \
31-
htop \
32-
openjdk-17-jdk-headless \
33-
ant \
34-
ca-certificates-java \
35-
&& apt-get clean \
36-
&& update-ca-certificates -f;
37-
38-
ENV CMDSTAN_VERSION="2.33.1"
39-
ENV PANDAS_VERSION="2.1.4"
40-
# ENV PANDAS_VERSION="2.0.3" # pyspark image still using 2.0.3
41-
ENV PYARROW_VERSION="14.0.1"
42-
RUN mamba install --quiet --yes -c conda-forge \
43-
pandas=${PANDAS_VERSION} \
44-
cmdstan=${CMDSTAN_VERSION} \
45-
cmdstanpy \
46-
sqlalchemy \
47-
psycopg2 \
48-
ipython-sql \
49-
beautifulsoup4 \
50-
scikit-learn \
51-
mlxtend \
52-
xgboost \
53-
lightgbm \
54-
graphviz \
55-
lime \
56-
shap \
57-
spacy \
58-
pydotplus \
59-
networkx \
60-
seaborn \
61-
plotnine \
62-
selenium \
63-
sqlalchemy \
64-
pyLDAvis \
65-
python-dotenv \
66-
statsmodels \
67-
linearmodels \
68-
jupyterlab_widgets \
69-
jupytext \
70-
black \
71-
isort \
72-
nltk \
73-
jupyter-server-proxy \
74-
jupyter-rsession-proxy \
75-
streamlit \
76-
xlrd \
77-
openpyxl \
78-
pyarrow=${PYARROW_VERSION} \
79-
python-duckdb \
80-
duckdb-engine \
81-
bash_kernel \
82-
sympy \
83-
simpy \
84-
awscli \
85-
bokeh \
86-
dask-kubernetes \
87-
dask-ml \
88-
findspark \
89-
pyspark \
90-
plotly \
91-
&& python -m bash_kernel.install
92-
93-
# causing issues with 1/12/2023 update
94-
# snowflake-connector-python
95-
96-
COPY files/setup-ml-frameworks.sh setup.sh
97-
RUN chmod 755 setup.sh \
98-
&& ./setup.sh \
99-
&& rm setup.sh
100-
101-
# make system (conda) R the first choice
102-
ENV R_VERSION=4.3.2
103-
ENV TERM=xterm
104-
ENV R_HOME=/opt/conda/lib/R
105-
ENV LD_LIBRARY_PATH="/opt/conda/lib:/usr/local/lib:${LD_LIBRARY_PATH}"
106-
ENV PATH="/usr/local/bin:$PATH"
107-
108-
RUN mamba install --quiet --yes -c conda-forge \
109-
c-compiler \
110-
"r-base>=${R_VERSION}" \
111-
r-curl \
112-
binutils \
113-
libgit2 \
114-
freetype \
115-
libpng \
116-
libtiff \
117-
libjpeg-turbo \
118-
libxml2 \
119-
unixodbc \
120-
rpy2 \
121-
jupyterlab-variableinspector \
122-
jupyterlab_code_formatter \
123-
openssh \
124-
git \
125-
&& ln -s /opt/conda/bin/R /usr/local/bin/R \
126-
&& ln -s /opt/conda/bin/Rscript /usr/local/bin/Rscript
127-
128-
# not available through conda-forge for both arm and amd
129-
# or the conda version is causing issues
130-
RUN pip install \
131-
jupyterlab-skip-traceback \
132-
radian \
133-
polars \
134-
connectorx \
135-
xlsx2csv \
136-
jupysql \
137-
shiny \
138-
shinywidgets \
139-
pyrsm
140-
141-
# catboost # not available for arm64
142-
143-
# connectorx is default for sql stuff in polars but is not built for aarch64
144-
# had to do that manually with a docker file
145-
# see https://github.com/sfu-db/connector-x/issues/386
146-
ENV wheel_name=connectorx-0.3.2-cp311-cp311-manylinux_2_34_aarch64.whl
147-
COPY files/connectorx/${wheel_name} ${wheel_name}
148-
RUN pip install ${wheel_name}
149-
150-
RUN echo "R_LIBS_USER='~/.rsm-msba/R/${R_VERSION}'" >> ${R_HOME}/etc/Renviron.site
151-
RUN echo '.libPaths(unique(c(Sys.getenv("R_LIBS_USER"), .libPaths())))' >> ${R_HOME}/etc/Rprofile.site
152-
153-
COPY files/setup-tidyverse.sh setup.sh
154-
RUN chmod +x setup.sh \
155-
&& ./setup.sh \
156-
&& rm setup.sh
157-
158-
# packages need for radiant a reproducible analysis
159-
COPY files/setup-radiant.sh setup.sh
160-
RUN chmod +x setup.sh \
161-
&& ./setup.sh \
162-
&& rm setup.sh
163-
164-
# tooling for Bayesian Machine Learning class
165-
# COPY files/setup-bml.sh setup.sh
166-
# RUN chmod +x setup.sh \
167-
# && ./setup.sh \
168-
# && rm setup.sh
169-
170-
# adding postgres
171-
# mostly from https://docs.docker.com/engine/examples/postgresql_service/
172-
ENV POSTGRES_VERSION=14
173-
174-
# upgrade to postgres 14
175-
RUN apt -y update && \
176-
apt -y upgrade && \
177-
apt -y install gpgv dirmngr wget vim && \
178-
sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list' && \
179-
wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - && \
180-
apt -y update && \
181-
apt-get install -y \
182-
postgresql-${POSTGRES_VERSION} \
183-
postgresql-client-${POSTGRES_VERSION} \
184-
postgresql-contrib-${POSTGRES_VERSION}
185-
186-
# Run the rest of the commands as the postgres user
187-
RUN addgroup ${NB_USER} postgres \
188-
&& addgroup postgres users \
189-
&& chown -R postgres:postgres /etc/postgresql/${POSTGRES_VERSION}/ \
190-
&& chown -R postgres:postgres /var/lib/postgresql/${POSTGRES_VERSION}/ \
191-
&& chmod -R u=rwX,go= /var/lib/postgresql/${POSTGRES_VERSION}/
192-
193-
USER postgres
194-
195-
ARG PGPASSWORD=${PGPASSWORD:-postgres}
196-
ENV PGPASSWORD=${PGPASSWORD}
197-
198-
# create a postgres role for ${NB_USER} with "postgres" as the password
199-
# create a database "rsm-docker" owned by the ${NB_USER} role.
200-
RUN /etc/init.d/postgresql start \
201-
&& psql --command "CREATE USER ${NB_USER} WITH SUPERUSER PASSWORD '${PGPASSWORD}';" \
202-
&& createdb -O ${NB_USER} rsm-docker
203-
204-
COPY files/postgresql.conf /etc/postgresql/${POSTGRES_VERSION}/main/postgresql.conf
205-
COPY files/pg_hba.conf /etc/postgresql/${POSTGRES_VERSION}/main/pg_hba.conf
206-
207-
USER root
208-
209-
# populate version number in conf file
210-
RUN sed -i 's/__version__/'"$POSTGRES_VERSION"'/g' /etc/postgresql/${POSTGRES_VERSION}/main/postgresql.conf
211-
212-
RUN addgroup ${NB_USER} postgres \
213-
&& chown -R postgres:postgres /etc/postgresql/${POSTGRES_VERSION}/main/ \
214-
&& fix-permissions /etc/postgresql/${POSTGRES_VERSION}/main/
215-
216-
# from https://github.com/ucsd-ets/rsm-msba-datahub/blob/master/Dockerfile
217-
# RUN chmod -R 777 /etc/postgresql/${POSTGRES_VERSION}
218-
# RUN chmod -R 777 /var/lib/postgresql/
219-
220-
# oh-my-zsh (need to install wget and curl again ...)
221-
RUN apt-get update -qq && apt-get -y --no-install-recommends install wget curl \
222-
&& sh -c "$(curl -fsSL https://raw.github.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" \
223-
&& git clone https://github.com/zsh-users/zsh-completions ${ZSH_CUSTOM:=~/.oh-my-zsh/custom}/plugins/zsh-completions \
224-
&& git clone https://github.com/zsh-users/zsh-autosuggestions ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-autosuggestions \
225-
&& git clone https://github.com/zsh-users/zsh-syntax-highlighting.git ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-syntax-highlighting \
226-
&& git clone https://github.com/supercrabtree/k ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/k \
227-
&& git clone --depth=1 https://github.com/romkatv/powerlevel10k.git ${ZSH_CUSTOM:-$HOME/.oh-my-zsh/custom}/themes/powerlevel10k \
228-
&& cp -R /home/jovyan/.oh-my-zsh /etc/skel/.oh-my-zsh
229-
230-
COPY files/zshrc /etc/skel/.zshrc
231-
COPY files/p10k.zsh /etc/skel/.p10k.zsh
232-
COPY files/usethis /usr/local/bin/usethis
233-
COPY files/clean.sh /usr/local/bin/clean
234-
235-
# settings for local install of python packages
236-
ARG PYBASE=/home/${NB_USER}/.rsm-msba
237-
ENV PYBASE=${PYBASE}
238-
ENV PYTHONUSERBASE=${PYBASE} \
239-
JUPYTER_PATH=${PYBASE}/share/jupyter \
240-
JUPYTER_DATA_DIR=${PYBASE}/share/jupyter \
241-
JUPYTER_CONFIG_DIR=${PYBASE}/jupyter \
242-
JUPYTER_RUNTIME_DIR=/tmp/jupyter/runtime \
243-
RSTUDIO_WHICH_R=/usr/local/bin/R \
244-
SHELL=/bin/zsh \
245-
ZDOTDIR=/home/${NB_USER}/.rsm-msba/zsh \
246-
CMDSTAN="/opt/cmdstan/cmdstan-${CMDSTAN_VERSION}"
247-
248-
COPY files/install-rstudio.sh setup.sh
249-
RUN chmod 755 setup.sh \
250-
&& ./setup.sh \
251-
&& rm setup.sh
252-
253-
# setup quarto - can be used with Rstudio
254-
# and when connecting to running container
255-
# from VSCode
256-
COPY files/setup-quarto.sh setup.sh
257-
RUN chmod +x setup.sh \
258-
&& ./setup.sh \
259-
&& rm setup.sh
260-
261-
# updating the supervisord.conf file for Jupyter and the notebook_config file
262-
COPY files/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
263-
COPY files/condarc /opt/conda/.condarc
264-
RUN mkdir -p /var/log/supervisor \
265-
&& fix-permissions /var/log/supervisor \
266-
&& fix-permissions /etc/supervisor/conf.d/ \
267-
&& fix-permissions "${CONDA_DIR}"
268-
269-
# copy base conda environment management script
270-
COPY files/ccenv.sh /usr/local/bin/ccenv
271-
COPY files/cl.sh /usr/local/bin/cl
272-
COPY files/cr.sh /usr/local/bin/cr
273-
COPY files/ci.sh /usr/local/bin/ci
274-
COPY files/ce.sh /usr/local/bin/ce
275-
276-
# Copy the launch script into the image
277-
COPY launch-${DOCKERHUB_NAME}.sh /opt/launch.sh
278-
COPY files/setup.sh /usr/local/bin/setup
279-
RUN fix-permissions /etc/skel \
280-
&& fix-permissions /usr/local/bin \
281-
&& chmod 755 /usr/local/bin/*
282-
283-
# get pgweb
284-
RUN wget -O pgweb.zip https://github.com/sosedoff/pgweb/releases/download/v0.11.11/pgweb_linux_arm64_v7.zip \
285-
&& unzip pgweb.zip -d pgweb_dir \
286-
&& rm pgweb.zip \
287-
&& mv pgweb_dir/* /usr/local/bin/pgweb \
288-
&& rm -rf pgweb_dir
289-
290-
# setting up jupyter-server-proxy extensions pgweb, gitgadget, and radiant
291-
RUN pip install git+https://github.com/vnijs/jupyter-pgweb-proxy.git \
292-
&& pip install git+https://github.com/vnijs/jupyter-gitgadget-proxy.git \
293-
&& pip install git+https://github.com/vnijs/jupyter-radiant-proxy.git
294-
295-
# packages need for radiant a reproducible analysis
296-
COPY files/setup-extra.sh setup.sh
297-
RUN chmod +x setup.sh \
298-
&& ./setup.sh \
299-
&& rm setup.sh
300-
301-
RUN mamba update --yes pandoc \
302-
&& mamba clean --all -f -y \
303-
&& fix-permissions "${CONDA_DIR}" \
304-
&& fix-permissions "/home/${NB_USER}"
305-
306-
# packages need for arrow
307-
COPY files/setup-arrow.sh setup.sh
308-
RUN chmod +x setup.sh \
309-
&& ./setup.sh \
310-
&& rm setup.sh
14+
RUN apt-get update -qq && apt-get -y --no-install-recommends install lsof
15+
RUN pip install pyrsm --upgrade
311 16

312 17
# setup hadoop
313 18
ENV JAVA_HOME "/usr/lib/jvm/java-17-openjdk-arm64/"
@@ -325,7 +30,6 @@ ADD files/scalable_analytics/hdfs-site.xml $HADOOP_HOME/etc/hadoop/
325 30
ADD files/scalable_analytics/init-dfs.sh /opt/hadoop/
326 31
ADD files/scalable_analytics/start-dfs.sh /opt/hadoop/
327 32
ADD files/scalable_analytics/stop-dfs.sh /opt/hadoop/
328-
ADD files/scalable_analytics/hadoop.sh /usr/bin/hadoop
329 33
RUN chown -R ${NB_USER} ${HADOOP_HOME} \
330 34
&& chmod 755 ${HADOOP_HOME}/*.sh \
331 35
&& chmod 755 /usr/bin/hadoop

0 commit comments

Comments
 (0)