Skip to content

Commit

Permalink
add new graph schema and bulk load file, but to be revised
Browse files Browse the repository at this point in the history
stash

impl new schema reading

fix
  • Loading branch information
zhanglei1949 committed Aug 9, 2023
1 parent 055654c commit c36b226
Show file tree
Hide file tree
Showing 22 changed files with 825 additions and 296 deletions.
9 changes: 9 additions & 0 deletions .github/workflows/flex.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,12 @@ jobs:
cmake .. && sudo make -j$(nproc)
export FLEX_DATA_DIR=../../../../storages/rt_mutable_graph/modern_graph/
./run_grin_test
- name: Test Graph Loading
  env:
    # Values under `env:` are not expanded by the shell, so the workspace
    # path must come from the Actions expression context.
    # Points at the directory holding the modern_graph input files.
    FLEX_DATA_DIR: ${{ github.workspace }}/flex/storages/rt_mutable_graph/modern_graph/
  run: |
    # Run from the build directory: the relative ./tests/... binary path and
    # the ../storages/... config paths below only resolve from there.
    cd ${GITHUB_WORKSPACE}/flex/build/
    # Every continued line needs a trailing backslash; otherwise the bulk-load
    # config and output directory are executed as a separate command.
    GLOG_v=10 ./tests/rt_mutable_graph/test_graph_loading \
      ../storages/rt_mutable_graph/modern_graph/modern_graph_new.yaml \
      ../storages/rt_mutable_graph/modern_graph/bulk_load_new.yaml /tmp/csr-data-dir/
2 changes: 1 addition & 1 deletion flex/bin/rt_server.cc
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ int main(int argc, char** argv) {

auto ret = gs::Schema::LoadFromYaml(graph_schema_path, bulk_load_config_path);
db.Init(std::get<0>(ret), std::get<1>(ret), std::get<2>(ret),
std::get<3>(ret), data_path, shard_num);
std::get<3>(ret), std::get<4>(ret), data_path, shard_num);

t0 += grape::GetCurrentTime();

Expand Down
2 changes: 1 addition & 1 deletion flex/bin/sync_server.cc
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ int main(int argc, char** argv) {

auto ret = gs::Schema::LoadFromYaml(graph_schema_path, bulk_load_config_path);
db.Init(std::get<0>(ret), std::get<1>(ret), std::get<2>(ret),
std::get<3>(ret), data_path, shard_num);
std::get<3>(ret), std::get<4>(ret), data_path, shard_num);

t0 += grape::GetCurrentTime();

Expand Down
8 changes: 4 additions & 4 deletions flex/engines/graph_db/database/graph_db.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,10 @@ GraphDB& GraphDB::get() {
void GraphDB::Init(
const Schema& schema,
const std::vector<std::pair<std::string, std::string>>& vertex_files,
const std::vector<std::tuple<std::string, std::string, std::string,
std::string>>& edge_files,
const std::vector<std::string>& plugins, const std::string& data_dir,
int thread_num) {
const std::vector<std::tuple<std::string, std::string, std::string, int32_t,
int32_t, std::string>>& edge_files,
const std::vector<std::string>& plugins, const LoadConfig& load_config,
const std::string& data_dir, int thread_num) {
std::filesystem::path data_dir_path(data_dir);
if (!std::filesystem::exists(data_dir_path)) {
std::filesystem::create_directory(data_dir_path);
Expand Down
6 changes: 3 additions & 3 deletions flex/engines/graph_db/database/graph_db.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,9 @@ class GraphDB {
const Schema& schema,
const std::vector<std::pair<std::string, std::string>>& vertex_files,
const std::vector<std::tuple<std::string, std::string, std::string,
std::string>>& edge_files,
const std::vector<std::string>& plugins, const std::string& data_dir,
int thread_num = 1);
int32_t, int32_t, std::string>>& edge_files,
const std::vector<std::string>& plugins, const LoadConfig& config,
const std::string& data_dir, int thread_num = 1);

/** @brief Create a transaction to read vertices and edges.
*
Expand Down
1 change: 0 additions & 1 deletion flex/engines/hqps_db/core/utils/hqps_type.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ inline bool operator==(const Dist& a, const Dist& b) {

// distance in path.
using dist_t = Dist;
static constexpr label_t INVALID_LABEL_ID = std::numeric_limits<label_t>::max();
using offset_t = size_t;
using vertex_set_key_t = size_t;
static constexpr vid_t INVALID_VID = std::numeric_limits<vid_t>::max();
Expand Down
120 changes: 72 additions & 48 deletions flex/storages/rt_mutable_graph/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,64 +25,88 @@ The configuration file ([modern graph example](./modern_graph/modern_graph.yaml)
Here is an example of a configuration file:

```yaml
graph:
graph_store: mutable_csr
vertex:
- label_name: person
name: modern
store_type: mutable_csr
stored_procedures:
directory: plugins
enable_lists:
- libxxx.so
schema:
vertex_types:
- type_name: person
x_csr_params:
max_vertex_num: 100
properties:
- name: _ID
type: int64
- name: name
type: String
- name: age
type: int32
max_vertex_num: 100
- label_name: software
- property_id: 0
property_name: id
property_type:
primitive_type: DT_SIGNED_INT64
- property_id: 1
property_name: name
property_type:
primitive_type: DT_STRING
- property_id: 2
property_name: age
property_type:
primitive_type: DT_SIGNED_INT32
primary_keys:
- id
- type_name: software
x_csr_params:
max_vertex_num: 100
properties:
- name: _ID
type: int64
- name: name
type: String
- name: lang
type: String
max_vertex_num: 100
edge:
- src_label_name: person
dst_label_name: software
edge_label_name: created
- property_id: 0
property_name: id
property_type:
primitive_type: DT_SIGNED_INT64
x_csr_params:
- property_id: 1
property_name: name
property_type:
primitive_type: DT_STRING
- property_id: 2
property_name: lang
property_type:
primitive_type: DT_STRING
primary_keys:
- id
edge_types:
- type_name: knows
x_csr_params:
incoming_edge_strategy: None
outgoing_edge_strategy: Multiple
vertex_type_pair_relations:
source_vertex: person
destination_vertex: person
relation: MANY_TO_MANY
properties:
- name: _SRC
type: int64
- name: _DST
type: int64
- name: weight
type: double
incoming_edge_strategy: None
outgoing_edge_strategy: Single
- src_label_name: person
dst_label_name: person
edge_label_name: knows
- property_id: 0
property_name: weight
property_type:
primitive_type: DT_DOUBLE
- type_name: created
x_csr_params:
incoming_edge_strategy: None
outgoing_edge_strategy: Single
vertex_type_pair_relations:
source_vertex: person
destination_vertex: software
relation: ONE_TO_MANY
properties:
- name: _SRC
type: int64
- name: _DST
type: int64
- name: weight
type: double
incoming_edge_strategy: None
outgoing_edge_strategy: Multiple

stored_procedures:
- libxxx.so
- property_id: 0
property_name: weight
property_type:
primitive_type: DT_DOUBLE
```
Notes:
- `_ID`, `_SRC`, `_DST` are reserved words, they are the external id of vertices, only int64 type is supported.
- `max_vertex_num` limits the number of vertices of this type:
- Currently we only support one primary key, and the type has to be `DT_SIGNED_INT64`.
- All implementation-related configurations are put under `x_csr_params`.
- `max_vertex_num` limits the number of vertices of this type:
- The limit number is used to `mmap` memory, so it only takes virtual memory before vertices are actually inserted.
- If `max_vertex_num` is not set, a default large number (e.g.: 2^48) will be used.
- `incoming/outgoing_edge_strategy` specifies the storage strategy for the incoming or outgoing edges of this type. There are three kinds of strategies:
- `incoming/outgoing_edge_strategy` specifies the storage strategy for the incoming or outgoing edges of this type. There are three kinds of strategies:
- None: no edge will be stored
- Single: only one edge will be stored
- Multiple(default): multiple edges will be stored
Expand Down
30 changes: 30 additions & 0 deletions flex/storages/rt_mutable_graph/load_config.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/** Copyright 2020 Alibaba Group Holding Limited.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef STORAGE_RT_MUTABLE_GRAPH_LOAD_CONFIG_H_
#define STORAGE_RT_MUTABLE_GRAPH_LOAD_CONFIG_H_

#include <string>

namespace gs {
// Provide meta info about bulk loading.
//
// Plain aggregate of three string settings. This header defines no defaults
// and performs no validation, so each field is an empty string until a caller
// assigns it.
// NOTE(review): presumably populated from the `loading_config` section of the
// bulk-load YAML (data_source / method / meta_data.delimiter) -- confirm
// against the schema loader.
struct LoadConfig {
// Kind of storage the input data is read from.
// Values listed by the author: "file", "hdfs", "oss", "s3".
std::string data_source_;
// Field separator used in the input files.
// Values listed by the author: "\t", ",", " ", "|".
std::string delimiter_;
// Loading mode. Values listed by the author: init, append, overwrite.
std::string method_;
};
} // namespace gs

#endif // STORAGE_RT_MUTABLE_GRAPH_LOAD_CONFIG_H_
114 changes: 91 additions & 23 deletions flex/storages/rt_mutable_graph/modern_graph/bulk_load.yaml
Original file line number Diff line number Diff line change
@@ -1,23 +1,91 @@
graph:
vertex:
- label_name: person
files:
- path: person.csv
format: standard_csv
- label_name: software
files:
- path: software.csv
format: standard_csv
edge:
- src_label_name: person
dst_label_name: software
edge_label_name: created
files:
- path: person_created_software.csv
format: standard_csv
- src_label_name: person
dst_label_name: person
edge_label_name: knows
files:
- path: person_knows_person.csv
format: standard_csv
graph: modern
loading_config:
data_source: file # file, oss, s3, hdfs
# data_location: # specify it or use FLEX_DATA_DIR env.
method: init # append, overwrite
meta_data:
delimiter: "|" # other loading configuration places here
vertex_mappings:
- type_name: person # must align with the schema
inputs:
- path: person.csv
format: standard_csv
# Define how each data field, after splitting, maps to a property in the schema.
# Data field indices start from 0.
# It is not necessary to load all data fields, so the mapping can be partial,
# but the number of loaded data fields must equal the number of properties.
# column_mappings:
# - column:
# index: 0 # can be omitted if the index is the same as the property index
# name: id # can be omitted if the name is not known
# property: id
# - column:
# index: 1
# name: name
# property: name
# - column:
# index: 2
# name: age
# property: age
- type_name: software
inputs:
- path: software.csv
format: standard_csv
# column_mappings:
# - column:
# index: 0 # can be omitted if the index is the same as the property index
# name: id # can be omitted if the name is not known
# property: id # must align with the schema
# - column:
# index: 1
# name: name
# property: name
# - column:
# index: 2
# name: lang
# property: lang
# - column:
# index: 3
# name: creationDateTime
# property: creationDateTime
edge_mappings:
- type_triplet:
edge: knows
source_vertex: person
destination_vertex: person
inputs:
- path: person_knows_person.csv
format: standard_csv
source_vertex_mappings:
- column: # multiple data fields may be needed to identify a vertex
index: 0
name: src_id
destination_vertex_mappings:
- column:
index: 1
name: dst_id
column_mappings:
- column:
index: 2
name: weight
property: weight
- type_triplet:
edge: created
source_vertex: person
destination_vertex: software
inputs:
- path: person_created_software.csv
format: standard_csv
source_vertex_mappings:
- column:
index: 0
name: src_id
destination_vertex_mappings:
- column:
index: 1
name: dst_id
column_mappings:
- column:
index: 2
name: weight
property: weight
Loading

0 comments on commit c36b226

Please sign in to comment.