diff --git a/.circleci/config.yml b/.circleci/config.yml index dfa66dadf7..a498260b45 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -15,6 +15,84 @@ executors: working_directory: /chronon docker: - image: houpy0829/chronon-ci:base--f87f50dc520f7a73894ae024eb78bd305d5b08e2 + modern_ubuntu_executor: + resource_class: xlarge + working_directory: /chronon + docker: + - image: cimg/openjdk:11.0 + modern_ubuntu_executor_xxlarge: + resource_class: 2xlarge + working_directory: /chronon + docker: + - image: cimg/openjdk:11.0 + +commands: + install_build_dependencies: + description: "Install Thrift, Conda, SBT, and Mill on Ubuntu 22.04" + steps: + - run: + name: Install system dependencies + command: | + sudo apt-get update + sudo apt-get install -y \ + automake \ + bison \ + cmake \ + flex \ + g++ \ + git \ + libboost-dev \ + libboost-filesystem-dev \ + libboost-program-options-dev \ + libboost-system-dev \ + libboost-test-dev \ + libevent-dev \ + libssl-dev \ + libtool \ + make \ + pkg-config + - run: + name: Install Thrift 0.11.0 from source + command: | + export THRIFT_VERSION=0.11.0 + curl -sSL "http://archive.apache.org/dist/thrift/$THRIFT_VERSION/thrift-$THRIFT_VERSION.tar.gz" -o thrift.tar.gz + mkdir -p /tmp/thrift + tar zxf thrift.tar.gz -C /tmp/thrift --strip-components=1 + rm thrift.tar.gz + cd /tmp/thrift + ./configure --without-python --without-cpp + make + sudo make install + cd / + rm -rf /tmp/thrift + thrift --version + - run: + name: Install SBT + command: | + echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | sudo tee /etc/apt/sources.list.d/sbt.list + curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | sudo apt-key add + sudo apt-get update + sudo apt-get install -y sbt + - run: + name: Install Miniconda + command: | + wget -q https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh + bash ~/miniconda.sh -b -p $HOME/miniconda + echo 'export PATH=$HOME/miniconda/bin:$PATH' >> $BASH_ENV + source $BASH_ENV + conda create -y -n chronon_py python=3.7 + conda install -y -q -n chronon_py --no-deps virtualenv + $HOME/miniconda/envs/chronon_py/bin/pip install \ + flake8==5.0.4 flake8-quotes==3.3.1 thrift==0.11.0 click==7.0 thrift_json==0.1.0 nose>=1.3.7 + $HOME/miniconda/envs/chronon_py/bin/pip install build + - run: + name: Install Mill with checksum verification + command: | + curl -L https://github.com/com-lihaoyi/mill/releases/download/0.10.15/0.10.15 -o mill + echo "d90132b1a4ebe4d55d2bc43b3f18b5d6e8e3d12d89a83f83ad2276867e127916 mill" | sha256sum -c - + chmod +x mill + sudo mv mill /usr/local/bin/mill + mill --version jobs: "Pull Docker Image": @@ -195,6 +273,64 @@ jobs: destination: spark_warehouse.tar.gz when: on_fail + "Mill -- Compile All": + executor: modern_ubuntu_executor + steps: + - checkout + - install_build_dependencies + - run: + name: Compile all modules with Mill + shell: /bin/bash -leuxo pipefail + command: | + source $BASH_ENV + conda activate chronon_py + # Prepare scala version-specific sources for Mill + mill api.prepareScalaSources + # Compile all modules + mill api.compile aggregator.compile online.compile spark.compile flink.compile + + "Mill -- Tests": + executor: modern_ubuntu_executor_xxlarge + steps: + - checkout + - install_build_dependencies + - run: + name: Run all tests with Mill + shell: /bin/bash -leuxo pipefail + command: | + source $BASH_ENV + conda activate chronon_py + mill api.prepareScalaSources + mill api.test aggregator.test online.test spark.test flink.test + - run: + name: Compress spark-warehouse + command: | + cd /tmp/ && tar -czvf spark-warehouse.tar.gz chronon/spark-warehouse + when: on_fail + - store_artifacts: + path: /tmp/spark-warehouse.tar.gz + destination: spark_warehouse.tar.gz + when: on_fail + + "Mill -- Python API Build": + executor: modern_ubuntu_executor + steps: + - checkout + - install_build_dependencies + - run: + name: Build Python API with Mill + shell: /bin/bash -leuxo pipefail + command: | + source $BASH_ENV + conda activate chronon_py + # Set project root for Mill commands + export CHRONON_ROOT=/chronon + # Generate Python Thrift and build wheel + mill generatePythonThrift + mill buildPythonApi + - store_artifacts: + path: /chronon/api/py/dist + workflows: build_test_deploy: jobs: @@ -221,5 +357,14 @@ workflows: requires: - "Pull Docker Image" - "Scala 13 -- Iceberg Table Utils Tests": + requires: + - "Pull Docker Image" + - "Mill -- Compile All": + requires: + - "Pull Docker Image" + - "Mill -- Tests": + requires: + - "Pull Docker Image" + - "Mill -- Python API Build": requires: - "Pull Docker Image" \ No newline at end of file diff --git a/aggregator/package.mill b/aggregator/package.mill new file mode 100644 index 0000000000..e312494507 --- /dev/null +++ b/aggregator/package.mill @@ -0,0 +1,69 @@ +package build.aggregator + +import mill._ +import mill.javalib._ +import mill.javalib.publish._ +import mill.scalalib.SbtModule + +object `package` extends SbtModule with PublishModule { + + def scalaVersion = "2.12.12" + + def mvnDeps = Seq( + mvn"com.google.code.gson:gson:2.8.6", + mvn"com.yahoo.datasketches:sketches-core:0.13.4", + mvn"org.apache.commons:commons-lang3:3.12.0" + ) + + def moduleDeps = super.moduleDeps ++ Seq(build.api) + + def compileMvnDeps = Seq( + mvn"org.apache.spark::spark-core:3.1.1", + mvn"org.apache.spark::spark-sql:3.1.1" + ) + + def pomSettings = PomSettings( + "Chronon is a feature engineering platform", + "ai.chronon", + "https://github.com/airbnb/chronon", + Seq(License( + "Apache 2", + "Apache 2", + "http://www.apache.org/licenses/LICENSE-2.0.txt", + false, + false, + "repo" + )), + VersionControl( + Some("https://github.com/airbnb/chronon"), + Some("scm:git@github.com:airbnb/chronon.git"), + None, + None + ), + Seq(Developer( + "nikhilsimha", + "Nikhil Simha", + "http://nikhilsimha.com", + None, + None + )) + ) + + def publishVersion = "awhittier-mill-0.0.110-SNAPSHOT" + + object test extends SbtTests with TestModule.Junit4 { + + def mvnDeps = Seq( + mvn"junit:junit:4.13.2", + mvn"com.novocode:junit-interface:0.11", + mvn"org.scalatest::scalatest:3.2.15", + mvn"org.apache.commons:commons-math3:3.6.1" + ) + + def moduleDeps = super.moduleDeps ++ Seq(build.api.test) + + def testSandboxWorkingDir = false + def testParallelism = false + + } +} diff --git a/api/package.mill b/api/package.mill new file mode 100644 index 0000000000..b0f6e5658c --- /dev/null +++ b/api/package.mill @@ -0,0 +1,105 @@ +package build.api + +import mill._ +import mill.javalib._ +import mill.javalib.publish._ +import mill.scalalib.SbtModule + +object `package` extends SbtModule with PublishModule { + + def scalaVersion = "2.12.12" + + def mvnDeps = Seq( + mvn"org.apache.thrift:libthrift:0.13.0", + mvn"org.scala-lang.modules::scala-collection-compat:2.6.0", + mvn"org.scala-lang:scala-reflect:2.12.12" + ) + + def compileMvnDeps = Seq( + mvn"org.apache.spark::spark-core:3.1.1", + mvn"org.apache.spark::spark-sql:3.1.1" + ) + + def pomSettings = PomSettings( + "Chronon is a feature engineering platform", + "ai.chronon", + "https://github.com/airbnb/chronon", + Seq(License( + "Apache 2", + "Apache 2", + "http://www.apache.org/licenses/LICENSE-2.0.txt", + false, + false, + "repo" + )), + VersionControl( + Some("https://github.com/airbnb/chronon"), + Some("scm:git@github.com:airbnb/chronon.git"), + None, + None + ), + Seq(Developer( + "nikhilsimha", + "Nikhil Simha", + "http://nikhilsimha.com", + None, + None + )) + ) + + def publishVersion = "awhittier-mill-0.0.110-SNAPSHOT" + + /** + * Prepare Scala version-specific sources for Mill compilation. + * + * Mill's security model prevents reading from scala-2.12/ during Task execution, + * so we copy version-specific sources to the main scala/ directory before compilation. + * + * This is a one-time setup command - run it once per clean build: + * ./mill api.prepareScalaSources + * + * SBT users don't need this - sbt automatically picks up scala-2.12/ directories. + */ + def prepareScalaSources() = Task.Command { + val scala212Dir = millSourcePath0 / "src" / "main" / "scala-2.12" + val scalaDir = millSourcePath0 / "src" / "main" / "scala" + + os.walk(scala212Dir) + .filter(_.ext == "scala") + .foreach { sourceFile => + val relativePath = sourceFile.relativeTo(scala212Dir) + val targetFile = scalaDir / relativePath + os.makeDir.all(targetFile / os.up) + os.copy.over(sourceFile, targetFile, createFolders = true) + println(s"✓ Copied: ${sourceFile.last}") + } + println(s"✓ Scala 2.12 sources prepared for Mill compilation") + () + } + + // Generate Java sources from Thrift + def generatedSources = Task { + val thriftFile = millSourcePath0 / "thrift" / "api.thrift" + val outDir = Task.dest / "java" + os.remove.all(outDir) + os.makeDir.all(outDir) + os.proc("thrift", "--gen", "java", "-out", outDir, thriftFile) + .call(stdout = os.Inherit) + + os.walk(outDir).filter(_.ext == "java").map(PathRef(_)) + } + + object test extends SbtTests with TestModule.Junit4 { + + def mvnDeps = Seq( + mvn"com.novocode:junit-interface:0.11", + mvn"junit:junit:4.13.2", + mvn"org.scalatest::scalatest:3.2.15", + mvn"org.scalatestplus::mockito-3-4:3.2.10.0" + ) + + def testSandboxWorkingDir = false + def testParallelism = false + + } +} diff --git a/api/py/python-api-build.sh b/api/py/python-api-build.sh index a984be5ee0..144b17f28c 100755 --- a/api/py/python-api-build.sh +++ b/api/py/python-api-build.sh @@ -26,6 +26,23 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) echo "Removing old distributions..." rm -f $SCRIPT_DIR/dist/* +# Create and activate venv if it doesn't exist +VENV_DIR="$SCRIPT_DIR/../../venv" +if [[ ! -d "$VENV_DIR" ]]; then + echo "Creating Python virtual environment..." + python3 -m venv "$VENV_DIR" +fi + +# Activate venv +echo "Activating Python virtual environment..." +source "$VENV_DIR/bin/activate" + +# Install build dependencies in venv if needed +if ! python3 -c "import build" 2>/dev/null; then + echo "Installing Python build dependencies..." + pip install --quiet build twine +fi + # The default action is "build" if [[ -z "${ACTION}" ]] || [[ "${ACTION}" == "build" ]]; then PYPI_REPOSITORY="internal" diff --git a/build.mill b/build.mill new file mode 100644 index 0000000000..81179bdff7 --- /dev/null +++ b/build.mill @@ -0,0 +1,116 @@ +//| mill-version: 1.0.6 +package build + +import mill._ +import mill.javalib._ +import mill.javalib.publish._ +import mill.scalalib.SbtModule + +// Compute project root - use environment variable override for CI, otherwise detect from git +val PROJECT_ROOT = os.Path(sys.env.getOrElse("CHRONON_ROOT", + os.proc("git", "rev-parse", "--show-toplevel").call().out.text().trim() +)) + +object `package` extends SbtModule with PublishModule { + + def artifactName = "chronon" + + def scalaVersion = "2.12.12" + + def pomSettings = PomSettings( + "Chronon is a feature engineering platform", + "ai.chronon", + "https://github.com/airbnb/chronon", + Seq(License( + "Apache 2", + "Apache 2", + "http://www.apache.org/licenses/LICENSE-2.0.txt", + false, + false, + "repo" + )), + VersionControl( + Some("https://github.com/airbnb/chronon"), + Some("scm:git@github.com:airbnb/chronon.git"), + None, + None + ), + Seq(Developer( + "nikhilsimha", + "Nikhil Simha", + "http://nikhilsimha.com", + None, + None + )) + ) + + def publishVersion = "0.0.110-SNAPSHOT" + + /** + * Generate Python Thrift sources from api.thrift + * Usage: ./mill generatePythonThrift + */ + def generatePythonThrift() = Task.Command { + // Use captured project root directory + val root = PROJECT_ROOT + val thriftFile = root / "api" / "thrift" / "api.thrift" + val outDir = root / "api" / "py" / "ai" / "chronon" + + println(s"Generating Python Thrift files from: $thriftFile") + println(s"Output directory: $outDir") + + // Remove old generated files + if (os.exists(outDir / "api")) { + os.remove.all(outDir / "api") + } + + // Generate Python Thrift files + os.proc("thrift", "--gen", "py", "-out", outDir, thriftFile) + .call(cwd = root, stdout = os.Inherit, stderr = os.Inherit) + + println("Python Thrift generation completed!") + () + } + + /** + * Build Python API wheel + * Usage: ./mill buildPythonApi + * Usage: ./mill buildPythonApi --action release + */ + def buildPythonApi(action: String = "build") = Task.Command { + println(s"Building Python API with action: $action") + + // Generate Python Thrift first + generatePythonThrift()() + + // Use captured project root directory + val root = PROJECT_ROOT + + // Get version and branch for Python API (mirroring sbt's git.versionProperty) + val branchStr = os.proc("git", "rev-parse", "--abbrev-ref", "HEAD") + .call(cwd = root) + .out + .text() + .trim() + .replace("/", "-") + + // Read version from version.sbt + val baseVersion = "0.0.110-SNAPSHOT" + val versionStr = if (branchStr == "main" || branchStr == "master") { + baseVersion + } else { + s"${branchStr}-${baseVersion}" + } + + println(s"Version: $versionStr, Branch: $branchStr") + + // Run the Python API build script + val script = root / "api" / "py" / "python-api-build.sh" + os.proc(script, versionStr, branchStr, action) + .call(cwd = root, stdout = os.Inherit, stderr = os.Inherit) + + println(s"Python API $action completed!") + () + } + +} diff --git a/flink/package.mill b/flink/package.mill new file mode 100644 index 0000000000..34629ae45f --- /dev/null +++ b/flink/package.mill @@ -0,0 +1,73 @@ +package build.flink + +import mill._ +import mill.javalib._ +import mill.javalib.publish._ +import mill.scalalib.SbtModule + +object `package` extends SbtModule with PublishModule { + + def scalaVersion = "2.12.12" + + def mvnDeps = Seq( + mvn"org.apache.avro:avro:1.8.2", + mvn"org.apache.flink::flink-streaming-scala:1.16.1", + mvn"org.apache.flink:flink-clients:1.16.1", + mvn"org.apache.flink:flink-metrics-dropwizard:1.16.1", + mvn"org.apache.flink:flink-test-utils:1.16.1" + ) + + def moduleDeps = super.moduleDeps ++ Seq(build.aggregator, build.online) + + def compileMvnDeps = Seq( + mvn"org.apache.spark::spark-core:3.1.1", + mvn"org.apache.spark::spark-hive:3.1.1", + mvn"org.apache.spark::spark-sql-kafka-0-10:3.1.1", + mvn"org.apache.spark::spark-sql:3.1.1", + mvn"org.apache.spark::spark-streaming:3.1.1" + ) + + def pomSettings = PomSettings( + "Chronon is a feature engineering platform", + "ai.chronon", + "https://github.com/airbnb/chronon", + Seq(License( + "Apache 2", + "Apache 2", + "http://www.apache.org/licenses/LICENSE-2.0.txt", + false, + false, + "repo" + )), + VersionControl( + Some("https://github.com/airbnb/chronon"), + Some("scm:git@github.com:airbnb/chronon.git"), + None, + None + ), + Seq(Developer( + "nikhilsimha", + "Nikhil Simha", + "http://nikhilsimha.com", + None, + None + )) + ) + + def publishVersion = "cb0ceedc95c0a4602315424f558e418f4c2628ef-SNAPSHOT" + + object test extends SbtTests with TestModule.Junit4 { + + def mvnDeps = Seq( + mvn"junit:junit:4.13.2", + mvn"com.novocode:junit-interface:0.11", + mvn"org.scalatest::scalatest:3.2.15" + ) + + def moduleDeps = super.moduleDeps ++ Seq(build.aggregator.test) + + def testSandboxWorkingDir = false + def testParallelism = false + + } +} diff --git a/mill b/mill new file mode 100755 index 0000000000..136be4dfe1 --- /dev/null +++ b/mill @@ -0,0 +1,339 @@ +#!/usr/bin/env sh + +# This is a wrapper script, that automatically selects or downloads Mill from Maven Central or GitHub release pages. +# +# This script determines the Mill version to use by trying these sources +# - env-variable `MILL_VERSION` +# - local file `.mill-version` +# - local file `.config/mill-version` +# - `mill-version` from YAML frontmatter of current buildfile +# - if accessible, find the latest stable version available on Maven Central (https://repo1.maven.org/maven2) +# - env-variable `DEFAULT_MILL_VERSION` +# +# If a version has the suffix '-native' a native binary will be used. +# If a version has the suffix '-jvm' an executable jar file will be used, requiring an already installed Java runtime. +# If no such suffix is found, the script will pick a default based on version and platform. +# +# Once a version was determined, it tries to use either +# - a system-installed mill, if found and it's version matches +# - an already downloaded version under ~/.cache/mill/download +# +# If no working mill version was found on the system, +# this script downloads a binary file from Maven Central or Github Pages (this is version dependent) +# into a cache location (~/.cache/mill/download). +# +# Mill Project URL: https://github.com/com-lihaoyi/mill +# Script Version: 1.0.6 +# +# If you want to improve this script, please also contribute your changes back! +# This script was generated from: dist/scripts/src/mill.sh +# +# Licensed under the Apache License, Version 2.0 + +set -e + +if [ -z "${DEFAULT_MILL_VERSION}" ] ; then + DEFAULT_MILL_VERSION="1.0.6" +fi + + +if [ -z "${GITHUB_RELEASE_CDN}" ] ; then + GITHUB_RELEASE_CDN="" +fi + + +MILL_REPO_URL="https://github.com/com-lihaoyi/mill" + +if [ -z "${CURL_CMD}" ] ; then + CURL_CMD=curl +fi + +# Explicit commandline argument takes precedence over all other methods +if [ "$1" = "--mill-version" ] ; then + echo "The --mill-version option is no longer supported." 1>&2 +fi + +MILL_BUILD_SCRIPT="" + +if [ -f "build.mill" ] ; then + MILL_BUILD_SCRIPT="build.mill" +elif [ -f "build.mill.scala" ] ; then + MILL_BUILD_SCRIPT="build.mill.scala" +elif [ -f "build.sc" ] ; then + MILL_BUILD_SCRIPT="build.sc" +fi + +# Please note, that if a MILL_VERSION is already set in the environment, +# We reuse it's value and skip searching for a value. + +# If not already set, read .mill-version file +if [ -z "${MILL_VERSION}" ] ; then + if [ -f ".mill-version" ] ; then + MILL_VERSION="$(tr '\r' '\n' < .mill-version | head -n 1 2> /dev/null)" + elif [ -f ".config/mill-version" ] ; then + MILL_VERSION="$(tr '\r' '\n' < .config/mill-version | head -n 1 2> /dev/null)" + elif [ -n "${MILL_BUILD_SCRIPT}" ] ; then + # `s/.*://`: + # This is a greedy match that removes everything from the beginning of the line up to (and including) the last + # colon (:). This effectively isolates the value part of the declaration. + # + # `s/#.*//`: + # This removes any comments at the end of the line. + # + # `s/['\"]//g`: + # This removes all single and double quotes from the string, wherever they appear (g is for "global"). + # + # `s/^[[:space:]]*//; s/[[:space:]]*$//`: + # These two expressions trim any leading or trailing whitespace ([[:space:]] matches spaces and tabs). + MILL_VERSION="$(grep -E "//\|.*mill-version" "${MILL_BUILD_SCRIPT}" | sed -E "s/.*://; s/#.*//; s/['\"]//g; s/^[[:space:]]*//; s/[[:space:]]*$//")" + fi +fi + +MILL_USER_CACHE_DIR="${XDG_CACHE_HOME:-${HOME}/.cache}/mill" + +if [ -z "${MILL_DOWNLOAD_PATH}" ] ; then + MILL_DOWNLOAD_PATH="${MILL_USER_CACHE_DIR}/download" +fi + +# If not already set, try to fetch newest from Github +if [ -z "${MILL_VERSION}" ] ; then + # TODO: try to load latest version from release page + echo "No mill version specified." 1>&2 + echo "You should provide a version via a '//| mill-version: ' comment or a '.mill-version' file." 1>&2 + + mkdir -p "${MILL_DOWNLOAD_PATH}" + LANG=C touch -d '1 hour ago' "${MILL_DOWNLOAD_PATH}/.expire_latest" 2>/dev/null || ( + # we might be on OSX or BSD which don't have -d option for touch + # but probably a -A [-][[hh]mm]SS + touch "${MILL_DOWNLOAD_PATH}/.expire_latest"; touch -A -010000 "${MILL_DOWNLOAD_PATH}/.expire_latest" + ) || ( + # in case we still failed, we retry the first touch command with the intention + # to show the (previously suppressed) error message + LANG=C touch -d '1 hour ago' "${MILL_DOWNLOAD_PATH}/.expire_latest" + ) + + # POSIX shell variant of bash's -nt operator, see https://unix.stackexchange.com/a/449744/6993 + # if [ "${MILL_DOWNLOAD_PATH}/.latest" -nt "${MILL_DOWNLOAD_PATH}/.expire_latest" ] ; then + if [ -n "$(find -L "${MILL_DOWNLOAD_PATH}/.latest" -prune -newer "${MILL_DOWNLOAD_PATH}/.expire_latest")" ]; then + # we know a current latest version + MILL_VERSION=$(head -n 1 "${MILL_DOWNLOAD_PATH}"/.latest 2> /dev/null) + fi + + if [ -z "${MILL_VERSION}" ] ; then + # we don't know a current latest version + echo "Retrieving latest mill version ..." 1>&2 + LANG=C ${CURL_CMD} -s -i -f -I ${MILL_REPO_URL}/releases/latest 2> /dev/null | grep --ignore-case Location: | sed s'/^.*tag\///' | tr -d '\r\n' > "${MILL_DOWNLOAD_PATH}/.latest" + MILL_VERSION=$(head -n 1 "${MILL_DOWNLOAD_PATH}"/.latest 2> /dev/null) + fi + + if [ -z "${MILL_VERSION}" ] ; then + # Last resort + MILL_VERSION="${DEFAULT_MILL_VERSION}" + echo "Falling back to hardcoded mill version ${MILL_VERSION}" 1>&2 + else + echo "Using mill version ${MILL_VERSION}" 1>&2 + fi +fi + +MILL_NATIVE_SUFFIX="-native" +MILL_JVM_SUFFIX="-jvm" +FULL_MILL_VERSION=$MILL_VERSION +ARTIFACT_SUFFIX="" +set_artifact_suffix(){ + if [ "$(expr substr $(uname -s) 1 5 2>/dev/null)" = "Linux" ]; then + if [ "$(uname -m)" = "aarch64" ]; then + ARTIFACT_SUFFIX="-native-linux-aarch64" + else + ARTIFACT_SUFFIX="-native-linux-amd64" + fi + elif [ "$(uname)" = "Darwin" ]; then + if [ "$(uname -m)" = "arm64" ]; then + ARTIFACT_SUFFIX="-native-mac-aarch64" + else + ARTIFACT_SUFFIX="-native-mac-amd64" + fi + else + echo "This native mill launcher supports only Linux and macOS." 1>&2 + exit 1 + fi +} + +case "$MILL_VERSION" in + *"$MILL_NATIVE_SUFFIX") + MILL_VERSION=${MILL_VERSION%"$MILL_NATIVE_SUFFIX"} + set_artifact_suffix + ;; + + *"$MILL_JVM_SUFFIX") + MILL_VERSION=${MILL_VERSION%"$MILL_JVM_SUFFIX"} + ;; + + *) + case "$MILL_VERSION" in + 0.1.*) ;; + 0.2.*) ;; + 0.3.*) ;; + 0.4.*) ;; + 0.5.*) ;; + 0.6.*) ;; + 0.7.*) ;; + 0.8.*) ;; + 0.9.*) ;; + 0.10.*) ;; + 0.11.*) ;; + 0.12.*) ;; + *) + set_artifact_suffix + esac + ;; +esac + +MILL="${MILL_DOWNLOAD_PATH}/$MILL_VERSION$ARTIFACT_SUFFIX" + +try_to_use_system_mill() { + if [ "$(uname)" != "Linux" ]; then + return 0 + fi + + MILL_IN_PATH="$(command -v mill || true)" + + if [ -z "${MILL_IN_PATH}" ]; then + return 0 + fi + + SYSTEM_MILL_FIRST_TWO_BYTES=$(head --bytes=2 "${MILL_IN_PATH}") + if [ "${SYSTEM_MILL_FIRST_TWO_BYTES}" = "#!" ]; then + # MILL_IN_PATH is (very likely) a shell script and not the mill + # executable, ignore it. + return 0 + fi + + SYSTEM_MILL_PATH=$(readlink -e "${MILL_IN_PATH}") + SYSTEM_MILL_SIZE=$(stat --format=%s "${SYSTEM_MILL_PATH}") + SYSTEM_MILL_MTIME=$(stat --format=%y "${SYSTEM_MILL_PATH}") + + if [ ! -d "${MILL_USER_CACHE_DIR}" ]; then + mkdir -p "${MILL_USER_CACHE_DIR}" + fi + + SYSTEM_MILL_INFO_FILE="${MILL_USER_CACHE_DIR}/system-mill-info" + if [ -f "${SYSTEM_MILL_INFO_FILE}" ]; then + parseSystemMillInfo() { + LINE_NUMBER="${1}" + # Select the line number of the SYSTEM_MILL_INFO_FILE, cut the + # variable definition in that line in two halves and return + # the value, and finally remove the quotes. + sed -n "${LINE_NUMBER}p" "${SYSTEM_MILL_INFO_FILE}" |\ + cut -d= -f2 |\ + sed 's/"\(.*\)"/\1/' + } + + CACHED_SYSTEM_MILL_PATH=$(parseSystemMillInfo 1) + CACHED_SYSTEM_MILL_VERSION=$(parseSystemMillInfo 2) + CACHED_SYSTEM_MILL_SIZE=$(parseSystemMillInfo 3) + CACHED_SYSTEM_MILL_MTIME=$(parseSystemMillInfo 4) + + if [ "${SYSTEM_MILL_PATH}" = "${CACHED_SYSTEM_MILL_PATH}" ] \ + && [ "${SYSTEM_MILL_SIZE}" = "${CACHED_SYSTEM_MILL_SIZE}" ] \ + && [ "${SYSTEM_MILL_MTIME}" = "${CACHED_SYSTEM_MILL_MTIME}" ]; then + if [ "${CACHED_SYSTEM_MILL_VERSION}" = "${MILL_VERSION}" ]; then + MILL="${SYSTEM_MILL_PATH}" + return 0 + else + return 0 + fi + fi + fi + + SYSTEM_MILL_VERSION=$(${SYSTEM_MILL_PATH} --version | head -n1 | sed -n 's/^Mill.*version \(.*\)/\1/p') + + cat < "${SYSTEM_MILL_INFO_FILE}" +CACHED_SYSTEM_MILL_PATH="${SYSTEM_MILL_PATH}" +CACHED_SYSTEM_MILL_VERSION="${SYSTEM_MILL_VERSION}" +CACHED_SYSTEM_MILL_SIZE="${SYSTEM_MILL_SIZE}" +CACHED_SYSTEM_MILL_MTIME="${SYSTEM_MILL_MTIME}" +EOF + + if [ "${SYSTEM_MILL_VERSION}" = "${MILL_VERSION}" ]; then + MILL="${SYSTEM_MILL_PATH}" + fi +} +try_to_use_system_mill + +# If not already downloaded, download it +if [ ! -s "${MILL}" ] || [ "$MILL_TEST_DRY_RUN_LAUNCHER_SCRIPT" = "1" ] ; then + case $MILL_VERSION in + 0.0.* | 0.1.* | 0.2.* | 0.3.* | 0.4.* ) + DOWNLOAD_SUFFIX="" + DOWNLOAD_FROM_MAVEN=0 + ;; + 0.5.* | 0.6.* | 0.7.* | 0.8.* | 0.9.* | 0.10.* | 0.11.0-M* ) + DOWNLOAD_SUFFIX="-assembly" + DOWNLOAD_FROM_MAVEN=0 + ;; + *) + DOWNLOAD_SUFFIX="-assembly" + DOWNLOAD_FROM_MAVEN=1 + ;; + esac + case $MILL_VERSION in + 0.12.0 | 0.12.1 | 0.12.2 | 0.12.3 | 0.12.4 | 0.12.5 | 0.12.6 | 0.12.7 | 0.12.8 | 0.12.9 | 0.12.10 | 0.12.11 ) + DOWNLOAD_EXT="jar" + ;; + 0.12.* ) + DOWNLOAD_EXT="exe" + ;; + 0.* ) + DOWNLOAD_EXT="jar" + ;; + *) + DOWNLOAD_EXT="exe" + ;; + esac + + DOWNLOAD_FILE=$(mktemp mill.XXXXXX) + if [ "$DOWNLOAD_FROM_MAVEN" = "1" ] ; then + DOWNLOAD_URL="https://repo1.maven.org/maven2/com/lihaoyi/mill-dist${ARTIFACT_SUFFIX}/${MILL_VERSION}/mill-dist${ARTIFACT_SUFFIX}-${MILL_VERSION}.${DOWNLOAD_EXT}" + else + MILL_VERSION_TAG=$(echo "$MILL_VERSION" | sed -E 's/([^-]+)(-M[0-9]+)?(-.*)?/\1\2/') + DOWNLOAD_URL="${GITHUB_RELEASE_CDN}${MILL_REPO_URL}/releases/download/${MILL_VERSION_TAG}/${MILL_VERSION}${DOWNLOAD_SUFFIX}" + unset MILL_VERSION_TAG + fi + + if [ "$MILL_TEST_DRY_RUN_LAUNCHER_SCRIPT" = "1" ] ; then + echo $DOWNLOAD_URL + echo $MILL + exit 0 + fi + # TODO: handle command not found + echo "Downloading mill ${MILL_VERSION} from ${DOWNLOAD_URL} ..." 1>&2 + ${CURL_CMD} -f -L -o "${DOWNLOAD_FILE}" "${DOWNLOAD_URL}" + chmod +x "${DOWNLOAD_FILE}" + mkdir -p "${MILL_DOWNLOAD_PATH}" + mv "${DOWNLOAD_FILE}" "${MILL}" + + unset DOWNLOAD_FILE + unset DOWNLOAD_SUFFIX +fi + +if [ -z "$MILL_MAIN_CLI" ] ; then + MILL_MAIN_CLI="${0}" +fi + +MILL_FIRST_ARG="" +if [ "$1" = "--bsp" ] || [ "${1#"-i"}" != "$1" ] || [ "$1" = "--interactive" ] || [ "$1" = "--no-server" ] || [ "$1" = "--no-daemon" ] || [ "$1" = "--repl" ] || [ "$1" = "--help" ] ; then + # Need to preserve the first position of those listed options + MILL_FIRST_ARG=$1 + shift +fi + +unset MILL_DOWNLOAD_PATH +unset MILL_OLD_DOWNLOAD_PATH +unset OLD_MILL +unset MILL_VERSION +unset MILL_REPO_URL + +# -D mill.main.cli is for compatibility with Mill 0.10.9 - 0.13.0-M2 +# We don't quote MILL_FIRST_ARG on purpose, so we can expand the empty value without quotes +# shellcheck disable=SC2086 +exec "${MILL}" $MILL_FIRST_ARG -D "mill.main.cli=${MILL_MAIN_CLI}" "$@" diff --git a/online/package.mill b/online/package.mill new file mode 100644 index 0000000000..e286776de4 --- /dev/null +++ b/online/package.mill @@ -0,0 +1,150 @@ +package build.online + +import mill._ +import mill.javalib._ +import mill.javalib.publish._ +import mill.scalalib.SbtModule + +// online: default shaded build with embedded dependencies +object `package` extends SbtModule with PublishModule { + + def scalaVersion = "2.12.12" + + def mvnDeps = Seq( + mvn"com.datadoghq:java-dogstatsd-client:2.7", + mvn"com.github.ben-manes.caffeine:caffeine:2.8.5", + mvn"io.netty:netty-buffer:4.1.68.Final", + mvn"net.jodah:typetools:0.4.1", + mvn"org.apache.spark::spark-core:3.1.1", + mvn"org.apache.spark::spark-hive:3.1.1", + mvn"org.apache.spark::spark-sql-kafka-0-10:3.1.1", + mvn"org.apache.spark::spark-sql:3.1.1", + mvn"org.apache.spark::spark-streaming:3.1.1", + mvn"org.rogach::scallop:4.0.1", + mvn"org.scala-lang.modules::scala-java8-compat:0.9.0" + ) + + def moduleDeps = super.moduleDeps ++ Seq(build.aggregator) + + def pomSettings = PomSettings( + "Chronon is a feature engineering platform", + "ai.chronon", + "https://github.com/airbnb/chronon", + Seq(License( + "Apache 2", + "Apache 2", + "http://www.apache.org/licenses/LICENSE-2.0.txt", + false, + false, + "repo" + )), + VersionControl( + Some("https://github.com/airbnb/chronon"), + Some("scm:git@github.com:airbnb/chronon.git"), + None, + None + ), + Seq(Developer( + "nikhilsimha", + "Nikhil Simha", + "http://nikhilsimha.com", + None, + None + )) + ) + + def publishVersion = "awhittier-mill-0.0.110-SNAPSHOT" + + object test extends SbtTests with TestModule.Junit4 { + + def mvnDeps = Seq( + mvn"junit:junit:4.13.2", + mvn"com.novocode:junit-interface:0.11", + mvn"org.scalatest::scalatest:3.2.15" + ) + + def moduleDeps = super.moduleDeps ++ Seq(build.aggregator.test) + + def testSandboxWorkingDir = false + def testParallelism = false + + } + + // online_unshaded: unshaded build with provided dependencies + object unshaded extends SbtModule with PublishModule { + + def artifactName = "online_unshaded" + + def scalaVersion = "2.12.12" + + // Use same source directory as online (parent directory) + def millSourcePath: os.Path = os.pwd / "online" + + def mvnDeps = Seq( + mvn"com.datadoghq:java-dogstatsd-client:2.7", + mvn"com.github.ben-manes.caffeine:caffeine:2.8.5", + mvn"io.netty:netty-buffer:4.1.68.Final", + mvn"net.jodah:typetools:0.4.1", + mvn"org.rogach::scallop:4.0.1", + mvn"org.scala-lang.modules::scala-java8-compat:0.9.0" + ) + + def compileMvnDeps = Seq( + mvn"com.fasterxml.jackson.core:jackson-core:2.10.0", + mvn"com.fasterxml.jackson.core:jackson-databind:2.10.0", + mvn"com.fasterxml.jackson.module::jackson-module-scala:2.10.0", + mvn"org.apache.avro:avro:1.8.2", + mvn"org.apache.spark::spark-core:3.1.1", + mvn"org.apache.spark::spark-hive:3.1.1", + mvn"org.apache.spark::spark-sql-kafka-0-10:3.1.1", + mvn"org.apache.spark::spark-sql:3.1.1", + mvn"org.apache.spark::spark-streaming:3.1.1" + ) + + def moduleDeps = super.moduleDeps ++ Seq(build.aggregator) + + def pomSettings = PomSettings( + "Chronon is a feature engineering platform", + "ai.chronon", + "https://github.com/airbnb/chronon", + Seq(License( + "Apache 2", + "Apache 2", + "http://www.apache.org/licenses/LICENSE-2.0.txt", + false, + false, + "repo" + )), + VersionControl( + Some("https://github.com/airbnb/chronon"), + Some("scm:git@github.com:airbnb/chronon.git"), + None, + None + ), + Seq(Developer( + "nikhilsimha", + "Nikhil Simha", + "http://nikhilsimha.com", + None, + None + )) + ) + + def publishVersion = "awhittier-mill-0.0.110-SNAPSHOT" + + object test extends SbtTests with TestModule.Junit4 { + + def mvnDeps = Seq( + mvn"junit:junit:4.13.2", + mvn"com.novocode:junit-interface:0.11", + mvn"org.scalatest::scalatest:3.2.15" + ) + + def moduleDeps = super.moduleDeps ++ Seq(build.aggregator.test) + + def testSandboxWorkingDir = false + def testParallelism = false + + } + } +} diff --git a/online/src/test/scala/ai/chronon/online/FetcherCacheTest.scala b/online/src/test/scala/ai/chronon/online/FetcherCacheTest.scala index 8625cf56a3..e6297fdd32 100644 --- a/online/src/test/scala/ai/chronon/online/FetcherCacheTest.scala +++ b/online/src/test/scala/ai/chronon/online/FetcherCacheTest.scala @@ -328,7 +328,7 @@ class FetcherCacheTest extends MockitoHelper { servingInfo.outputCodec.decodeMap, servingInfo, keys) - verify(servingInfo.outputCodec.decodeMap(any()), times(1)) // decoding did happen + verify(outputCodec, times(1)).decodeMap(any()) // decoding did happen assertEquals(mapResponse, decodedMapResponse) } } diff --git a/service/package.mill b/service/package.mill new file mode 100644 index 0000000000..d55916ed90 --- /dev/null +++ b/service/package.mill @@ -0,0 +1,71 @@ +package build.service + +import mill._ +import mill.javalib._ +import mill.javalib.publish._ +import mill.scalalib.SbtModule + +object `package` extends SbtModule with PublishModule { + + def scalaVersion = "2.12.12" + + def mvnDeps = Seq( + mvn"ch.qos.logback:logback-classic:1.2.11", + mvn"com.typesafe:config:1.4.3", + mvn"io.micrometer:micrometer-registry-statsd:1.13.6", + mvn"io.netty:netty-all:4.1.111.Final", + mvn"io.vertx:vertx-config:4.5.10", + mvn"io.vertx:vertx-core:4.5.10", + mvn"io.vertx:vertx-micrometer-metrics:4.5.10", + mvn"io.vertx:vertx-web:4.5.10", + mvn"org.slf4j:slf4j-api:1.7.36" + ) + + def moduleDeps = super.moduleDeps ++ Seq(build.online) + + def pomSettings = PomSettings( + "Chronon is a feature engineering platform", + "ai.chronon", + "https://github.com/airbnb/chronon", + Seq(License( + "Apache 2", + "Apache 2", + "http://www.apache.org/licenses/LICENSE-2.0.txt", + false, + false, + "repo" + )), + VersionControl( + Some("https://github.com/airbnb/chronon"), + Some("scm:git@github.com:airbnb/chronon.git"), + None, + None + ), + Seq(Developer( + "nikhilsimha", + "Nikhil Simha", + "http://nikhilsimha.com", + None, + None + )) + ) + + def publishVersion = "cb0ceedc95c0a4602315424f558e418f4c2628ef-SNAPSHOT" + + object test extends SbtTests with TestModule.Junit4 { + + def mvnDeps = Seq( + mvn"com.novocode:junit-interface:0.11", + mvn"io.vertx:vertx-codegen:4.5.10", + mvn"io.vertx:vertx-unit:4.5.10", + mvn"junit:junit:4.13.2", + mvn"org.mockito:mockito-core:4.11.0" + ) + + def moduleDeps = super.moduleDeps ++ Seq(build.online.test) + + def testSandboxWorkingDir = false + def testParallelism = false + + } +} diff --git a/spark/package.mill b/spark/package.mill new file mode 100644 index 0000000000..8a39147414 --- /dev/null +++ b/spark/package.mill @@ -0,0 +1,144 @@ +package build.spark + +import mill._ +import mill.javalib._ +import mill.javalib.publish._ +import mill.scalalib.SbtModule + +// spark_uber: provided dependencies (for shaded/uber jar) +object `package` extends SbtModule with PublishModule { + + def artifactName = "spark_uber" + + def scalaVersion = "2.12.12" + + def mvnDeps = Seq( + mvn"com.fasterxml.jackson.core:jackson-core:2.10.0", + mvn"com.fasterxml.jackson.core:jackson-databind:2.10.0", + mvn"com.fasterxml.jackson.module::jackson-module-scala:2.10.0" + ) + + def moduleDeps = super.moduleDeps ++ Seq(build.aggregator, build.online) + + def compileMvnDeps = Seq( + mvn"io.delta::delta-core:1.0.1", + mvn"org.apache.iceberg::iceberg-spark-runtime-3.2:1.1.0", + mvn"org.apache.spark::spark-core:3.1.1", + mvn"org.apache.spark::spark-hive:3.1.1", + mvn"org.apache.spark::spark-sql-kafka-0-10:3.1.1", + mvn"org.apache.spark::spark-sql:3.1.1", + mvn"org.apache.spark::spark-streaming:3.1.1" + ) + + def pomSettings = PomSettings( + "Chronon is a feature engineering platform", + "ai.chronon", + "https://github.com/airbnb/chronon", + Seq(License( + "Apache 2", + "Apache 2", + "http://www.apache.org/licenses/LICENSE-2.0.txt", + false, + false, + "repo" + )), + VersionControl( + Some("https://github.com/airbnb/chronon"), + Some("scm:git@github.com:airbnb/chronon.git"), + None, + None + ), + Seq(Developer( + "nikhilsimha", + "Nikhil Simha", + "http://nikhilsimha.com", + None, + None + )) + ) + + def publishVersion = "awhittier-mill-0.0.110-SNAPSHOT" + + object test extends SbtTests with TestModule.Junit4 { + + def mvnDeps = Seq( + mvn"junit:junit:4.13.2", + mvn"com.novocode:junit-interface:0.11", + mvn"org.scalatest::scalatest:3.2.15", + mvn"org.xerial.snappy:snappy-java:1.1.8.4" + ) + + def moduleDeps = super.moduleDeps ++ Seq(build.aggregator.test) + + def testSandboxWorkingDir = false + def testParallelism = false + + } + + // spark_embedded: embedded dependencies (not provided) + object embedded extends SbtModule with PublishModule { + + def artifactName = "spark_embedded" + + def scalaVersion = "2.12.12" + + // Use same source directory as spark_uber (parent directory) + def millSourcePath: os.Path = os.pwd / "spark" + + def mvnDeps = Seq( + mvn"com.fasterxml.jackson.core:jackson-core:2.10.0", + mvn"com.fasterxml.jackson.core:jackson-databind:2.10.0", + mvn"com.fasterxml.jackson.module::jackson-module-scala:2.10.0", + mvn"io.delta::delta-core:1.0.1", + mvn"org.apache.iceberg::iceberg-spark-runtime-3.2:1.1.0", + mvn"org.apache.spark::spark-core:3.1.1", + mvn"org.apache.spark::spark-hive:3.1.1", + mvn"org.apache.spark::spark-sql-kafka-0-10:3.1.1", + mvn"org.apache.spark::spark-sql:3.1.1", + mvn"org.apache.spark::spark-streaming:3.1.1" + ) + + def moduleDeps = super.moduleDeps ++ Seq(build.aggregator, build.online) + + def pomSettings = PomSettings( + "Chronon is a feature engineering platform", + "ai.chronon", + "https://github.com/airbnb/chronon", + Seq(License( + "Apache 2", + "Apache 2", + "http://www.apache.org/licenses/LICENSE-2.0.txt", + false, + false, + "repo" + )), + VersionControl( + Some("https://github.com/airbnb/chronon"), + Some("scm:git@github.com:airbnb/chronon.git"), + None, + None + ), + Seq(Developer( + "nikhilsimha", + "Nikhil Simha", + "http://nikhilsimha.com", + None, + None + )) + ) + + def publishVersion = "awhittier-mill-0.0.110-SNAPSHOT" + + // Tests disabled for embedded variant (matching sbt build) + object test extends SbtTests with TestModule.Junit4 { + def mvnDeps = Seq( + mvn"junit:junit:4.13.2", + mvn"com.novocode:junit-interface:0.11", + mvn"org.scalatest::scalatest:3.2.15" + ) + + def testSandboxWorkingDir = false + def testParallelism = false + } + } +} diff --git a/spark/src/test/scala/ai/chronon/spark/test/MetadataExporterTest.scala b/spark/src/test/scala/ai/chronon/spark/test/MetadataExporterTest.scala index 5019dcf4c3..46d39808f4 100644 --- a/spark/src/test/scala/ai/chronon/spark/test/MetadataExporterTest.scala +++ b/spark/src/test/scala/ai/chronon/spark/test/MetadataExporterTest.scala @@ -78,10 +78,13 @@ class MetadataExporterTest extends TestCase { val sampleDf = DataFrameGen .events(spark, sampleData, 10000, partitions = 30) sampleDf.save(sampleTable) - val confResource = getClass.getResource("/") + + // Use the actual file system path to test resources instead of getResource + // which can return a JAR path in Mill + val testResourcesPath = new File("spark/src/test/resources").getAbsolutePath val tmpDir: File = Files.createTempDir() - MetadataExporter.run(confResource.getPath, Some(tmpDir.getAbsolutePath)) - printFilesInDirectory(s"${confResource.getPath}/joins/team") + MetadataExporter.run(testResourcesPath, Some(tmpDir.getAbsolutePath)) + printFilesInDirectory(s"$testResourcesPath/joins/team") printFilesInDirectory(s"${tmpDir.getAbsolutePath}/joins") // Read the files. val file = Source.fromFile(s"${tmpDir.getAbsolutePath}/joins/example_join.v1")