diff --git a/docs/source/user-guide/datasources.md b/docs/source/user-guide/datasources.md index 5634df8e07..1e5a7df241 100644 --- a/docs/source/user-guide/datasources.md +++ b/docs/source/user-guide/datasources.md @@ -111,5 +111,41 @@ Verify the native scan type should be `CometNativeScan`. More on [HDFS Reader](../../../native/hdfs/README.md) +### Local HDFS development + +- Configure the local machine network. Add the hostnames to `/etc/hosts` +```commandline +127.0.0.1 localhost namenode datanode1 datanode2 datanode3 +::1 localhost namenode datanode1 datanode2 datanode3 +``` + +- Start the local HDFS cluster with 3 datanodes; the namenode URL is `namenode:9000` +```commandline +docker compose -f kube/local/hdfs-docker-compose.yml up +``` + +- Check that the local namenode is up and running on `http://localhost:9870/dfshealth.html#tab-overview` +- Build the project with HDFS support +```commandline +JAVA_HOME="/opt/homebrew/opt/openjdk@11" make release PROFILES="-Pspark-3.5" COMET_FEATURES=hdfs RUSTFLAGS="-L /opt/homebrew/opt/openjdk@11/libexec/openjdk.jdk/Contents/Home/lib/server" +``` + +- Run a local test +```scala + + withSQLConf( + CometConf.COMET_ENABLED.key -> "true", + CometConf.COMET_EXEC_ENABLED.key -> "true", + CometConf.COMET_NATIVE_SCAN_IMPL.key -> CometConf.SCAN_NATIVE_DATAFUSION, + SQLConf.USE_V1_SOURCE_LIST.key -> "parquet", + "fs.defaultFS" -> "hdfs://namenode:9000", + "dfs.client.use.datanode.hostname" -> "true") { + val df = spark.read.parquet("/tmp/2") + df.show(false) + df.explain("extended") + } + +``` +Or use `spark-shell` with HDFS support as described [above](#using-experimental-native-datafusion-reader) ## S3 In progress diff --git a/kube/local/hadoop.env new file mode 100644 index 0000000000..2ed6d086d1 --- /dev/null +++ b/kube/local/hadoop.env @@ -0,0 +1,31 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +CORE_CONF_fs_defaultFS=hdfs://namenode:9000 +CORE_CONF_hadoop_http_staticuser_user=root +CORE_CONF_hadoop_proxyuser_hue_hosts=* +CORE_CONF_hadoop_proxyuser_hue_groups=* +CORE_CONF_io_compression_codecs=org.apache.hadoop.io.compress.SnappyCodec +CORE_CONF_hadoop_tmp_dir=/hadoop-data +CORE_CONF_dfs_client_use_datanode_hostname=true +CORE_CONF_dfs_datanode_use_datanode_hostname=true + +HDFS_CONF_dfs_webhdfs_enabled=true +HDFS_CONF_dfs_permissions_enabled=false +HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false +HDFS_CONF_dfs_client_use_datanode_hostname=true +HDFS_CONF_dfs_datanode_use_datanode_hostname=true \ No newline at end of file diff --git a/kube/local/hdfs-docker-compose.yml b/kube/local/hdfs-docker-compose.yml new file mode 100644 index 0000000000..774b28f502 --- /dev/null +++ b/kube/local/hdfs-docker-compose.yml @@ -0,0 +1,100 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +version: "3" + +services: + namenode: + image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8 + container_name: namenode + restart: always + ports: + - 9870:9870 + - 9000:9000 + volumes: + - /tmp/hadoop/dfs/name:/hadoop/dfs/name + environment: + - CLUSTER_NAME=test + env_file: + - hadoop.env + + datanode1: + image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8 + container_name: datanode1 + hostname: datanode1 + restart: always + ports: + - 9866:9866 + - 9864:9864 + depends_on: + - namenode + volumes: + - /tmp/hadoop/dfs/data1:/hadoop/dfs/data + environment: + SERVICE_PRECONDITION: "namenode:9870" + env_file: + - hadoop.env + datanode2: + image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8 + container_name: datanode2 + hostname: datanode2 + restart: always + ports: + - 9867:9866 + - 9865:9864 + depends_on: + - namenode + volumes: + - /tmp/hadoop/dfs/data2:/hadoop/dfs/data + environment: + SERVICE_PRECONDITION: "namenode:9870" + env_file: + - hadoop.env + datanode3: + image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8 + container_name: datanode3 + hostname: datanode3 + restart: always + ports: + - 9868:9866 + - 9863:9864 + depends_on: + - namenode + volumes: + - /tmp/hadoop/dfs/data3:/hadoop/dfs/data + environment: + SERVICE_PRECONDITION: "namenode:9870" + env_file: + - hadoop.env + + resourcemanager: + image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.2.1-java8 + container_name: resourcemanager + restart: always + environment: + SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode1:9864 datanode2:9864 datanode3:9864 datanode1:9866 
datanode2:9866 datanode3:9866" + env_file: + - hadoop.env + + nodemanager1: + image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8 + container_name: nodemanager + restart: always + environment: + SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode1:9864 datanode2:9864 datanode3:9864 datanode1:9866 datanode2:9866 datanode3:9866 resourcemanager:8088" + env_file: + - hadoop.env \ No newline at end of file