Skip to content
Closed
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
.idea_modules/
.project
.pydevproject
.python-version
.ruby-version
.scala_dependencies
.settings
/lib/
Expand Down
2 changes: 1 addition & 1 deletion dev/create-release/do-release-docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ fcreate_secure "$GPG_KEY_FILE"
$GPG --export-secret-key --armor "$GPG_KEY" > "$GPG_KEY_FILE"

run_silent "Building spark-rm image with tag $IMGTAG..." "docker-build.log" \
docker build -t "spark-rm:$IMGTAG" --build-arg UID=$UID "$SELF/spark-rm"
docker build --no-cache -t "spark-rm:$IMGTAG" --build-arg UID=$UID "$SELF/spark-rm"

# Write the release information to a file with environment variables to be used when running the
# image.
Expand Down
61 changes: 32 additions & 29 deletions dev/create-release/spark-rm/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@
# Includes:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@dongjoon-hyun and @wangyum might be better suited to review this.

# * Java 8
# * Ivy
# * Python (2.7.15/3.6.7)
# * Python 3.7
# * Ruby 2.7
# * R-base/R-base-dev (3.6.1)
# * Ruby 2.3 build utilities

FROM ubuntu:18.04

Expand All @@ -33,15 +33,11 @@ ENV DEBCONF_NONINTERACTIVE_SEEN true
# These arguments are just for reuse and not really meant to be customized.
ARG APT_INSTALL="apt-get install --no-install-recommends -y"

ARG BASE_PIP_PKGS="setuptools wheel"
ARG PIP_PKGS="pyopenssl numpy sphinx"
ARG PIP_PKGS="sphinx==2.3.1 mkdocs==1.0.4 numpy==1.18.1"
ARG GEM_PKGS="jekyll:4.0.0 jekyll-redirect-from:0.16.0 rouge:3.15.0"

# Install extra needed repos and refresh.
# - CRAN repo
# - Ruby repo (for doc generation)
#
# This is all in a single "RUN" command so that if anything changes, "apt update" is run to fetch
# the most current package versions (instead of potentially using old versions cached by docker).
RUN apt-get clean && apt-get update && $APT_INSTALL gnupg ca-certificates && \
echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/' >> /etc/apt/sources.list && \
gpg --keyserver keyserver.ubuntu.com --recv-key E298A3A825C0D65DFD57CBB651716619E084DAB9 && \
Expand All @@ -50,36 +46,43 @@ RUN apt-get clean && apt-get update && $APT_INSTALL gnupg ca-certificates && \
rm -rf /var/lib/apt/lists/* && \
apt-get clean && \
apt-get update && \
$APT_INSTALL software-properties-common && \
apt-add-repository -y ppa:brightbox/ruby-ng && \
apt-get update && \
# Install openjdk 8.
$APT_INSTALL openjdk-8-jdk && \
update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java && \
# Install build / source control tools
$APT_INSTALL curl wget git maven ivy subversion make gcc lsof libffi-dev \
pandoc pandoc-citeproc libssl-dev libcurl4-openssl-dev libxml2-dev && \
pandoc pandoc-citeproc libssl-dev libcurl4-openssl-dev libxml2-dev

# Put pyenv's bin and shims directories at the FRONT of PATH. pyenv selects the interpreter through
# its shims, so they must take precedence over any `python`/`pip` found in the system directories;
# appending them lets a system interpreter shadow the pyenv-managed one.
ENV PATH "/root/.pyenv/bin:/root/.pyenv/shims:$PATH"
# Install pyenv from a pinned installer commit. `-f` stops curl from handing an HTTP error page to
# bash as if it were the installer script; `-sS` hides the progress meter but still prints errors.
RUN curl -fsSL https://github.com/pyenv/pyenv-installer/raw/dd3f7d0914c5b4a416ca71ffabdf2954f2021596/bin/pyenv-installer | bash
Copy link
Member

@HyukjinKwon HyukjinKwon Mar 31, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, I suspect it was not tested due to the limitation described in the PR description:

Generating SQL API Markdown files.
20/03/31 06:41:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Traceback (most recent call last):
  File "/opt/spark-rm/output/spark/sql/gen-sql-api-docs.py", line 21, in <module>
    from pyspark.java_gateway import launch_gateway
  File "/opt/spark-rm/output/spark/python/lib/pyspark.zip/pyspark/__init__.py", line 51, in <module>
  File "/opt/spark-rm/output/spark/python/lib/pyspark.zip/pyspark/context.py", line 22, in <module>
ImportError: No module named threading
log4j:WARN No appenders could be found for logger (org.apache.spark.util.ShutdownHookManager).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.

It seems the installed Python is broken. `threading` is a standard-library module that has existed in both Python 2 and Python 3, but it does not seem to exist in the Python installed here.

Let me revert this to make RC preparation easier.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, that's strange, because I was able to build enough of the Dockerfile to get past this point, and I did not hit this problem.

What I couldn't test was to run everything via do-release-docker.sh.

# Development headers installed before pyenv installs Python below.
# NOTE(review): presumably these are needed so the resulting interpreter gets its bz2, readline and
# sqlite3 modules -- confirm against pyenv's documented build prerequisites.
RUN $APT_INSTALL libbz2-dev libreadline-dev libsqlite3-dev
# Install Python 3.7.6 with pyenv and select it as pyenv's global version.
RUN pyenv install 3.7.6
RUN pyenv global 3.7.6
# Print the version of the `python` found on PATH into the build log.
RUN python --version
# Upgrade pip, then print its version into the build log.
RUN pip install --upgrade pip
RUN pip --version
# Install the Python packages listed in the PIP_PKGS build argument.
RUN pip install $PIP_PKGS

# Put rbenv's bin and shims directories at the FRONT of PATH so that `ruby`/`gem` always resolve to
# the rbenv-managed Ruby; appended entries would lose to any Ruby found in the system directories.
ENV PATH "/root/.rbenv/bin:/root/.rbenv/shims:$PATH"
# Install rbenv from a pinned installer commit.
RUN curl -fsSL https://github.com/rbenv/rbenv-installer/raw/108c12307621a0aa06f19799641848dde1987deb/bin/rbenv-installer | bash
# Install Ruby 2.7.0 with rbenv and select it as rbenv's global version.
RUN rbenv install 2.7.0
RUN rbenv global 2.7.0
# Print the version of the `ruby` found on PATH into the build log.
RUN ruby --version
# NOTE(review): g++ is installed right before the gems -- presumably for gems with native
# extensions; confirm.
RUN $APT_INSTALL g++
RUN gem --version
# Install the gems (with pinned versions) listed in the GEM_PKGS build argument, without docs.
RUN gem install --no-document $GEM_PKGS

RUN \
curl -sL https://deb.nodesource.com/setup_11.x | bash && \
$APT_INSTALL nodejs && \
# Install needed python packages. Use pip for installing packages (for consistency).
$APT_INSTALL libpython3-dev python3-pip && \
# Change default python version to python3.
update-alternatives --install /usr/bin/python python /usr/bin/python2.7 1 && \
update-alternatives --install /usr/bin/python python /usr/bin/python3.6 2 && \
update-alternatives --set python /usr/bin/python3.6 && \
pip3 install $BASE_PIP_PKGS && \
pip3 install $PIP_PKGS && \
# Install R packages and dependencies used when building.
# R depends on pandoc*, libssl (which are installed above).
$APT_INSTALL nodejs

# Install R packages and dependencies used when building.
# R depends on pandoc*, libssl (which are installed above).
RUN \
$APT_INSTALL r-base r-base-dev && \
$APT_INSTALL texlive-latex-base texlive texlive-fonts-extra texinfo qpdf && \
Rscript -e "install.packages(c('curl', 'xml2', 'httr', 'devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2', 'e1071', 'survival'), repos='https://cloud.r-project.org/')" && \
Rscript -e "devtools::install_github('jimhester/lintr')" && \
# Install tools needed to build the documentation.
$APT_INSTALL ruby2.3 ruby2.3-dev mkdocs && \
gem install jekyll --no-rdoc --no-ri -v 3.8.6 && \
gem install jekyll-redirect-from -v 0.15.0 && \
gem install rouge
Rscript -e "devtools::install_github('jimhester/lintr')"

WORKDIR /opt/spark-rm/output

Expand Down
38 changes: 32 additions & 6 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,41 @@ whichever version of Spark you currently have checked out of revision control.
The Spark documentation build uses a number of tools to build HTML docs and API docs in Scala, Java,
Python, R and SQL.

You need to have [Ruby](https://www.ruby-lang.org/en/documentation/installation/) and
[Python](https://docs.python.org/2/using/unix.html#getting-and-installing-the-latest-version-of-python)
installed. Also install the following libraries:
You need to have Ruby 2 and Python 3 installed. A handy way to install and manage various versions of Ruby and Python is with [`rbenv`] and [`pyenv`].
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we revert these changes to the documentation? Otherwise, I am good with the other changes.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alternatively, we could have a separate section that describes how to use rbenv and pyenv; that works for me too.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I moved them to their own sub-section. If we think it's distracting / not helpful, I can remove it.


[`rbenv`]: https://github.com/rbenv/rbenv
[`pyenv`]: https://github.com/pyenv/pyenv

On macOS you can install them with Homebrew:

```sh
brew install rbenv pyenv
```

To activate them, you'll need to run these commands or add them to the end of your `.bash_profile`:

```sh
eval "$(rbenv init -)"
eval "$(pyenv init -)"
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One alternative to running these init commands is to prefix the below commands so that they do the right thing without depending on any environment variables being set by init.

So, for example, pip install would become pyenv exec pip install and jekyll would become rbenv exec jekyll.

```

You can now use them to install specific versions of Ruby and Python and associate them with
the Spark home directory. Whenever you navigate to this directory or any of its subdirectories, these versions of Ruby and Python will be automatically activated.

```sh
$ sudo gem install jekyll jekyll-redirect-from rouge
rbenv install 2.7.0
pyenv install 3.7.6

cd /path/to/spark/root
rbenv local 2.7.0
pyenv local 3.7.6
```

Note: If you are on a system with both Ruby 1.9 and Ruby 2.0 you may need to replace gem with gem2.0.
Now you can install the following libraries:

```sh
gem install jekyll:4.0.0 jekyll-redirect-from:0.16.0 rouge:3.15.0
```

### R Documentation

Expand All @@ -58,7 +84,7 @@ Note: Other versions of roxygen2 might work in SparkR documentation generation b
To generate API docs for any language, you'll need to install these libraries:

```sh
$ sudo pip install sphinx mkdocs numpy
pip install sphinx==2.3.1 mkdocs==1.0.4 numpy==1.18.1
```

## Generating the Documentation HTML
Expand Down
156 changes: 2 additions & 154 deletions python/docs/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
# You can set these variables from the command line.
# Extra command-line options passed through to sphinx-build.
SPHINXOPTS ?=
# Name or path of the sphinx-build executable.
SPHINXBUILD ?= sphinx-build
# Paper size key; selects the matching PAPEROPT_<size> variable (a4 or letter).
PAPER ?=
# Directory that receives all build output.
BUILDDIR ?= _build

# Export PYTHONPATH (the parent directory plus the bundled py4j source zip) to recipe commands.
export PYTHONPATH=$(realpath ..):$(realpath ../lib/py4j-0.10.8.1-src.zip)
Expand All @@ -15,38 +14,9 @@ $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx in
endif

# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(SPHINXOPTS) .

.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext

# Print a one-line description of every documented target.
help:
@echo "Please use \`make <target>' where <target> is one of"
@echo " html to make standalone HTML files"
@echo " dirhtml to make HTML files named index.html in directories"
@echo " singlehtml to make a single large HTML file"
@echo " pickle to make pickle files"
@echo " json to make JSON files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " qthelp to make HTML files and a qthelp project"
@echo " devhelp to make HTML files and a Devhelp project"
@echo " epub to make an epub"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " latexpdf to make LaTeX files and run them through pdflatex"
@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
@echo " text to make text files"
@echo " man to make manual pages"
@echo " texinfo to make Texinfo files"
@echo " info to make Texinfo files and run them through makeinfo"
@echo " gettext to make PO message catalogs"
@echo " changes to make an overview of all changed/added/deprecated items"
@echo " xml to make Docutils-native XML files"
@echo " pseudoxml to make pseudoxml-XML files for display purposes"
@echo " linkcheck to check all external links for integrity"
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
.PHONY: help clean html

# Remove everything under the build directory.
# Guard first: with an empty BUILDDIR (e.g. `make BUILDDIR= clean`) the rm below would expand to
# `rm -rf /*`, so abort before any command runs.
clean:
	$(if $(strip $(BUILDDIR)),,$(error BUILDDIR is empty; refusing to run 'rm -rf /*'))
	rm -rf $(BUILDDIR)/*
Expand All @@ -55,125 +25,3 @@ html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

# Build HTML files named index.html in directories, under $(BUILDDIR)/dirhtml.
dirhtml:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/$@."

# Build a single large HTML file under $(BUILDDIR)/singlehtml.
singlehtml:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/$@."

# Build pickle files under $(BUILDDIR)/pickle.
pickle:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished; now you can process the pickle files."

# Build JSON files under $(BUILDDIR)/json.
json:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished; now you can process the JSON files."

# Build HTML files plus an HTML help project under $(BUILDDIR)/htmlhelp.
htmlhelp:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/$@."

# Build HTML files plus a qthelp project under $(BUILDDIR)/qthelp, then print follow-up commands.
qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@# The inner quotes are escaped so the shell prints them instead of ending the string early.
	@echo "Build finished; now you can run \"qcollectiongenerator\" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pyspark.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pyspark.qhc"

# Build HTML files plus a Devhelp project under $(BUILDDIR)/devhelp, then print viewing steps.
# `$$HOME` reaches the shell as `$HOME`.
devhelp:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/pyspark"
	@echo "# ln -s $(BUILDDIR)/$@ $$HOME/.local/share/devhelp/pyspark"
	@echo "# devhelp"

# Build an epub under $(BUILDDIR)/epub.
epub:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/$@."

# Build LaTeX files under $(BUILDDIR)/latex; PAPER=a4 or PAPER=letter selects the paper size.
latex:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/$@."
	@echo "Run \`make' in that directory to run these through (pdf)latex" \
	      "(use \`make latexpdf' here to do that automatically)."

# Build LaTeX files, then run the generated makefile's all-pdf target in $(BUILDDIR)/latex.
latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through pdflatex..."
	$(MAKE) --directory=$(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

# Build LaTeX files, then run the generated makefile's all-pdf-ja target (platex + dvipdfmx)
# in $(BUILDDIR)/latex.
latexpdfja:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through platex and dvipdfmx..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
	@# Name the tools that actually ran; the message previously said "pdflatex".
	@echo "platex/dvipdfmx finished; the PDF files are in $(BUILDDIR)/latex."

# Build plain-text files under $(BUILDDIR)/text.
text:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/$@."

# Build manual pages under $(BUILDDIR)/man.
man:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/$@."

# Build Texinfo files under $(BUILDDIR)/texinfo.
texinfo:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/$@."
	@echo "Run \`make' in that directory to run these through makeinfo" \
	      "(use \`make info' here to do that automatically)."

# Build Texinfo files, then run the generated makefile's info target in $(BUILDDIR)/texinfo.
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	@# Use $(MAKE), not literal `make`, so -j/-n and command-line variables reach the sub-make.
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."

# Build PO message catalogs under $(BUILDDIR)/locale. Uses I18NSPHINXOPTS, not ALLSPHINXOPTS.
gettext:
	$(SPHINXBUILD) -b $@ $(I18NSPHINXOPTS) $(BUILDDIR)/locale
	@echo
	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."

# Build an overview of all changed/added/deprecated items under $(BUILDDIR)/changes.
changes:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "The overview file is in $(BUILDDIR)/$@."

# Check all external links for integrity; results go under $(BUILDDIR)/linkcheck.
linkcheck:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/$@/output.txt."

# Run all doctests embedded in the documentation; results go under $(BUILDDIR)/doctest.
doctest:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo "Testing of doctests in the sources finished, look at the " \
	      "results in $(BUILDDIR)/$@/output.txt."

# Build Docutils-native XML files under $(BUILDDIR)/xml.
xml:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The XML files are in $(BUILDDIR)/$@."

# Build pseudo-XML files (for display purposes) under $(BUILDDIR)/pseudoxml.
pseudoxml:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/$@."
Loading