36 changes: 36 additions & 0 deletions docs/source/user-guide/datasources.md
@@ -111,5 +111,41 @@ Verify that the native scan type is `CometNativeScan`.

See the [HDFS Reader](../../../native/hdfs/README.md) README for more details.

### Local HDFS development

- Configure the local machine network. HDFS clients resolve datanodes by hostname, so add the cluster hostnames to `/etc/hosts`:
```commandline
127.0.0.1 localhost namenode datanode1 datanode2 datanode3
::1 localhost namenode datanode1 datanode2 datanode3
```

- Start the local HDFS cluster (3 datanodes; the namenode URL is `namenode:9000`):
```commandline
docker compose -f kube/local/hdfs-docker-compose.yml up
```
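To run the cluster in the background and confirm the containers came up, the same file works in detached mode (a sketch; `ps` only shows container status, not HDFS health):
```commandline
docker compose -f kube/local/hdfs-docker-compose.yml up -d
docker compose -f kube/local/hdfs-docker-compose.yml ps
```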

- Check that the local namenode is up and running at `http://localhost:9870/dfshealth.html#tab-overview`.
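To script the same check, one option is the namenode's standard JMX servlet (a sketch; the `NameNodeStatus` MBean name is the stock Hadoop one):
```commandline
# Expect "State" : "active" in the JSON response once the namenode is healthy
curl -s 'http://localhost:9870/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus'
```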
- Build the project with HDFS support. The `JAVA_HOME` and `RUSTFLAGS` paths below assume a Homebrew OpenJDK 11 install on macOS; adjust them for your platform:
```commandline
JAVA_HOME="/opt/homebrew/opt/openjdk@11" make release PROFILES="-Pspark-3.5" COMET_FEATURES=hdfs RUSTFLAGS="-L /opt/homebrew/opt/openjdk@11/libexec/openjdk.jdk/Contents/Home/lib/server"
```
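The test below reads Parquet data from `/tmp/2` in HDFS. A minimal way to seed that path, assuming a `spark-shell` session from a build with HDFS support (the path mirrors the test; adjust as needed):
```scala
// Hypothetical seeding step: write a small Parquet dataset to HDFS at /tmp/2.
// The hostname setting mirrors the config used by the test below.
spark.sparkContext.hadoopConfiguration.set("dfs.client.use.datanode.hostname", "true")
spark.range(1000).write.mode("overwrite").parquet("hdfs://namenode:9000/tmp/2")
```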

- Run a local test. The snippet below assumes a test suite where Spark's `withSQLConf` helper is in scope:
```scala
// Enable Comet with the native DataFusion scan and point the
// Hadoop client at the local dockerized HDFS cluster
withSQLConf(
  CometConf.COMET_ENABLED.key -> "true",
  CometConf.COMET_EXEC_ENABLED.key -> "true",
  CometConf.COMET_NATIVE_SCAN_IMPL.key -> CometConf.SCAN_NATIVE_DATAFUSION,
  SQLConf.USE_V1_SOURCE_LIST.key -> "parquet",
  "fs.defaultFS" -> "hdfs://namenode:9000",
  "dfs.client.use.datanode.hostname" -> "true") {
  val df = spark.read.parquet("/tmp/2")
  df.show(false)
  df.explain("extended")
}
```
Alternatively, use `spark-shell` with HDFS support as described [above](#using-experimental-native-datafusion-reader).
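For reference, a hypothetical `spark-shell` invocation (`$COMET_JAR` is a placeholder for the jar produced by the build step; the `spark.hadoop.*` prefix is Spark's standard way to pass Hadoop settings through):
```commandline
$SPARK_HOME/bin/spark-shell \
    --jars $COMET_JAR \
    --conf spark.plugins=org.apache.spark.CometPlugin \
    --conf spark.comet.enabled=true \
    --conf spark.comet.exec.enabled=true \
    --conf spark.hadoop.fs.defaultFS=hdfs://namenode:9000 \
    --conf spark.hadoop.dfs.client.use.datanode.hostname=true
```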
## S3
In progress
31 changes: 31 additions & 0 deletions kube/local/hadoop.env
@@ -0,0 +1,31 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

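# Note (based on the bde2020 image convention): the entrypoint maps these
# variables into Hadoop config files, CORE_CONF_* -> core-site.xml and
# HDFS_CONF_* -> hdfs-site.xml; in property names "_" becomes ".",
# "__" a literal "_", and "___" a "-".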
CORE_CONF_fs_defaultFS=hdfs://namenode:9000
CORE_CONF_hadoop_http_staticuser_user=root
CORE_CONF_hadoop_proxyuser_hue_hosts=*
CORE_CONF_hadoop_proxyuser_hue_groups=*
CORE_CONF_io_compression_codecs=org.apache.hadoop.io.compress.SnappyCodec
CORE_CONF_hadoop_tmp_dir=/hadoop-data
CORE_CONF_dfs_client_use_datanode_hostname=true
CORE_CONF_dfs_datanode_use_datanode_hostname=true

HDFS_CONF_dfs_webhdfs_enabled=true
HDFS_CONF_dfs_permissions_enabled=false
HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false
HDFS_CONF_dfs_client_use_datanode_hostname=true
HDFS_CONF_dfs_datanode_use_datanode_hostname=true
100 changes: 100 additions & 0 deletions kube/local/hdfs-docker-compose.yml
@@ -0,0 +1,100 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

version: "3"

services:
  namenode:
    image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8
    container_name: namenode
    restart: always
    ports:
      - 9870:9870
      - 9000:9000
    volumes:
      - /tmp/hadoop/dfs/name:/hadoop/dfs/name
    environment:
      - CLUSTER_NAME=test
    env_file:
      - hadoop.env

  datanode1:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
    container_name: datanode1
    hostname: datanode1
    restart: always
    ports:
      - 9866:9866
      - 9864:9864
    depends_on:
      - namenode
    volumes:
      - /tmp/hadoop/dfs/data1:/hadoop/dfs/data
    environment:
      SERVICE_PRECONDITION: "namenode:9870"
    env_file:
      - hadoop.env

  datanode2:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
    container_name: datanode2
    hostname: datanode2
    restart: always
    ports:
      - 9867:9866
      - 9865:9864
    depends_on:
      - namenode
    volumes:
      - /tmp/hadoop/dfs/data2:/hadoop/dfs/data
    environment:
      SERVICE_PRECONDITION: "namenode:9870"
    env_file:
      - hadoop.env

  datanode3:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
    container_name: datanode3
    hostname: datanode3
    restart: always
    ports:
      - 9868:9866
      - 9863:9864
    depends_on:
      - namenode
    volumes:
      - /tmp/hadoop/dfs/data3:/hadoop/dfs/data
    environment:
      SERVICE_PRECONDITION: "namenode:9870"
    env_file:
      - hadoop.env

  resourcemanager:
    image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.2.1-java8
    container_name: resourcemanager
    restart: always
    environment:
      SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode1:9864 datanode2:9864 datanode3:9864 datanode1:9866 datanode2:9866 datanode3:9866"
    env_file:
      - hadoop.env

  nodemanager1:
    image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8
    container_name: nodemanager
    restart: always
    environment:
      SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode1:9864 datanode2:9864 datanode3:9864 datanode1:9866 datanode2:9866 datanode3:9866 resourcemanager:8088"
    env_file:
      - hadoop.env