Skip to content

Commit 8ab7b1b

Browse files
authored
Overhaul data-load-tools (#2715)
- Add Volume support for data loading
- Refactor data-load-tools
- Speed up ingestion by downloading with multiple threads
1 parent b03bc60 commit 8ab7b1b

File tree

56 files changed

+1285
-849
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+1285
-849
lines changed

.github/workflows/gss.yml

+2-2
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ jobs:
7676
export RUSTC_WRAPPER=/usr/local/bin/sccache
7777
sccache --start-server
7878
cd ${GITHUB_WORKSPACE}/interactive_engine
79-
mvn clean install -P groot,groot-assembly -Drust.compile.mode=debug -DskipTests -Dgroot.compile.feature="column_filter_push_down" --quiet
79+
mvn clean install -P groot -Drust.compile.mode=debug -DskipTests -Dgroot.compile.feature="column_filter_push_down" --quiet
8080
8181
sccache --show-stats
8282
@@ -92,7 +92,7 @@ jobs:
9292
export SCCACHE_DIR=~/.cache/sccache
9393
export RUSTC_WRAPPER=/usr/local/bin/sccache
9494
cd ${GITHUB_WORKSPACE}/interactive_engine
95-
mvn clean install -P groot,groot-assembly -Drust.compile.mode=debug -DskipTests --quiet
95+
mvn clean install -P groot -Drust.compile.mode=debug -DskipTests --quiet
9696
9797
sccache --show-stats
9898

.github/workflows/k8s-ci.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -692,7 +692,7 @@ jobs:
692692
minikube image load graphscope/learning:${SHORT_SHA}
693693
694694
export PYTHONPATH=${GITHUB_WORKSPACE}/python:${PYTHONPATH}
695-
cd ${GITHUB_WORKSPACE}/interactive_engine && mvn clean install --quiet -DskipTests -Drust.compile.skip=true -P graphscope,graphscope-assembly
695+
cd ${GITHUB_WORKSPACE}/interactive_engine && mvn clean install --quiet -DskipTests -Drust.compile.skip=true -P graphscope
696696
cd ${GITHUB_WORKSPACE}/interactive_engine/tests
697697
# ./function_test.sh 8111 1
698698
./function_test.sh 8112 2

Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ interactive: $(INTERACTIVE_DIR)/assembly/target/graphscope.tar.gz
144144

145145
$(INTERACTIVE_DIR)/assembly/target/graphscope.tar.gz:
146146
cd $(INTERACTIVE_DIR) && \
147-
mvn package -DskipTests -Drust.compile.mode=$(BUILD_TYPE) -P graphscope,graphscope-assembly -Drevision=$(VERSION) --quiet
147+
mvn package -DskipTests -Drust.compile.mode=$(BUILD_TYPE) -P graphscope -Drevision=$(VERSION) --quiet
148148

149149
learning-install: learning
150150
mkdir -p $(INSTALL_PREFIX)

docs/storage_engine/groot.md

+52-38
Large diffs are not rendered by default.

interactive_engine/assembly/README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@
22
This will build graphscope or groot into an assembly archive.
33

44
# Usage
5-
`mvn package -P graphscope,graphscope-assembly` will generate a graphscope.tar.gz under `target/`.
6-
`mvn package -P groot,groot-assembly` will generate a groot.tar.gz under `target/`.
5+
`mvn package -P graphscope` will generate a graphscope.tar.gz under `target/`.
6+
`mvn package -P groot` will generate a groot.tar.gz under `target/`.

interactive_engine/assembly/groot.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.0.0"
22
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
33
xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.0.0 http://maven.apache.org/xsd/assembly-2.0.0.xsd">
4-
<id>groot-assembly</id>
4+
<id>groot</id>
55
<formats>
66
<format>tar.gz</format>
77
</formats>

interactive_engine/assembly/pom.xml

+2-2
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
</activation>
2121
</profile>
2222
<profile>
23-
<id>graphscope-assembly</id>
23+
<id>graphscope</id>
2424
<build>
2525
<plugins>
2626
<plugin>
@@ -49,7 +49,7 @@
4949
</dependencies>
5050
</profile>
5151
<profile>
52-
<id>groot-assembly</id>
52+
<id>groot</id>
5353
<build>
5454
<plugins>
5555
<plugin>

interactive_engine/assembly/src/bin/groot/store_ctl.sh

+2-14
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,7 @@
22
#
33
# groot command tool
44

5-
set -x
6-
set -e
7-
set -o pipefail
5+
set -xeo pipefail
86

97
usage() {
108
cat <<END
@@ -20,7 +18,6 @@ cat <<END
2018
2119
start_max_node start max_node of gaia
2220
start_server start individual groot server
23-
start_load_tools start load_tools
2421
END
2522
}
2623

@@ -91,12 +88,6 @@ start_max_node() {
9188
"$@" > >(tee -a "${LOG_DIR}/${LOG_NAME}.out") 2> >(tee -a "${LOG_DIR}/${LOG_NAME}.err" >&2)
9289
}
9390

94-
start_load_tools() {
95-
_setup_env
96-
java -cp "${GROOT_HOME}/lib/data-load-tool-0.0.1-SNAPSHOT.jar" \
97-
com.alibaba.graphscope.groot.dataload.LoadTool "$@"
98-
}
99-
10091
# start groot server
10192
start_server() {
10293
_setup_env
@@ -138,13 +129,10 @@ while test $# -ne 0; do
138129
-h|--help) usage; exit ;;
139130
start_max_node) start_max_node "gaia" "$@"; exit;;
140131
start_server) start_server "$@"; exit;;
141-
start_load_tools) start_load_tools "$@"; exit;;
142132
*)
143133
echo "unrecognized option or command '${arg}'"
144134
usage; exit;;
145135
esac
146136
done
147137

148-
set +e
149-
set +o pipefail
150-
set +x
138+
set +xeo pipefail
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
package com.alibaba.graphscope.groot.common.config;
2+
3+
// Holder for the string keys of the data-loading configuration properties.
// The class declares only public static final String constants; it has no
// methods, constructors, or instance state of its own.
public class DataLoadConfig {
4+
5+
// Get property
// NOTE(review): presumably the keys in this part are read from the job's
// configuration; confirm against the callers.
6+
7+
/** universal configurations **/
// The section header above uses Javadoc-style delimiters, so Javadoc tooling
// attaches it to the next field only, not to the whole group.
8+
public static final String GRAPH_ENDPOINT = "graph.endpoint";
9+
10+
public static final String COLUMN_MAPPING_CONFIG = "column.mapping.config";
11+
12+
public static final String LOAD_AFTER_BUILD = "load.after.build";
13+
14+
public static final String SPLIT_SIZE = "split.size";
15+
16+
public static final String UNIQUE_PATH = "unique.path"; // generated automatically for each task
17+
public static final String USER_NAME = "auth.username";
18+
public static final String PASS_WORD = "auth.password";
19+
20+
/** job on HDFS configurations **/
21+
22+
// Input and output
23+
public static final String INPUT_PATH = "input.path";
24+
25+
public static final String OUTPUT_PATH = "output.path";
26+
public static final String SEPARATOR = "separator";
27+
public static final String SKIP_HEADER = "skip.header";
28+
public static final String LDBC_CUSTOMIZE = "ldbc.customize";
29+
/* end */
30+
31+
/** job on ODPS configurations **/
32+
public static final String DATA_SINK_TYPE = "data.sink.type"; // hdfs, oss, volume
33+
// The table format is `project.table` or `table`;
34+
// For partitioned table, the format is `project.table|p1=1/p2=2` or `table|p1=1/p2=2`
35+
public static final String OUTPUT_TABLE = "output.table"; // a dummy table
36+
/* end */
37+
38+
// Set property
// NOTE(review): presumably the keys in this part are written by the loading
// tools themselves rather than supplied by the user; confirm.
39+
public static final String SCHEMA_JSON = "schema.json";
40+
public static final String COLUMN_MAPPINGS = "column.mappings";
41+
public static final String META_INFO = "meta.info";
42+
43+
// Unlike the dotted keys above, this value is a bare upper-case name.
// NOTE(review): presumably a file name rather than a property key, going by
// the constant's name; confirm.
public static final String META_FILE_NAME = "META";
44+
45+
/** OSS configurations **/
46+
public static final String OSS_ENDPOINT = "oss.endpoint";
47+
48+
public static final String OSS_ACCESS_ID = "oss.access.id";
49+
public static final String OSS_ACCESS_KEY = "oss.access.key";
50+
51+
public static final String OSS_BUCKET_NAME = "oss.bucket.name";
52+
public static final String OSS_OBJECT_NAME = "oss.object.name";
53+
public static final String OSS_INFO_URL = "oss.info.url";
54+
/* end */
55+
56+
/** ODPS Volume configurations **/
57+
public static final String ODPS_VOLUME_PROJECT = "odps.volume.project";
58+
59+
public static final String ODPS_VOLUME_NAME = "odps.volume.name";
60+
public static final String ODPS_VOLUME_PARTSPEC = "odps.volume.partspec";
61+
62+
public static final String ODPS_ACCESS_ID = "odps.access.id";
63+
public static final String ODPS_ACCESS_KEY = "odps.access.key";
64+
public static final String ODPS_ENDPOINT = "odps.endpoint";
65+
/* end */
66+
67+
}

0 commit comments

Comments
 (0)