From 55c72dbbe868f85720c9f3d6db337a81af4f29ad Mon Sep 17 00:00:00 2001 From: Javier Alvarado Date: Fri, 30 Nov 2018 18:05:07 -0800 Subject: [PATCH 1/2] Support custom tokenizers during bulk load. --- dgraph/cmd/bulk/loader.go | 31 ++++++++++++++------------- dgraph/cmd/bulk/run.go | 44 ++++++++++++++++++++++++--------------- 2 files changed, 43 insertions(+), 32 deletions(-) diff --git a/dgraph/cmd/bulk/loader.go b/dgraph/cmd/bulk/loader.go index d0759f8d33d..ad3a43cc906 100644 --- a/dgraph/cmd/bulk/loader.go +++ b/dgraph/cmd/bulk/loader.go @@ -41,21 +41,22 @@ import ( ) type options struct { - RDFDir string - SchemaFile string - DgraphsDir string - TmpDir string - NumGoroutines int - MapBufSize int64 - ExpandEdges bool - SkipMapPhase bool - CleanupTmp bool - NumShufflers int - Version bool - StoreXids bool - ZeroAddr string - HttpAddr string - IgnoreErrors bool + RDFDir string + SchemaFile string + DgraphsDir string + TmpDir string + NumGoroutines int + MapBufSize int64 + ExpandEdges bool + SkipMapPhase bool + CleanupTmp bool + NumShufflers int + Version bool + StoreXids bool + ZeroAddr string + HttpAddr string + IgnoreErrors bool + CustomTokenizers string MapShards int ReduceShards int diff --git a/dgraph/cmd/bulk/run.go b/dgraph/cmd/bulk/run.go index 9bd4fe9048a..2ace3e89bc7 100644 --- a/dgraph/cmd/bulk/run.go +++ b/dgraph/cmd/bulk/run.go @@ -19,6 +19,7 @@ package bulk import ( "encoding/json" "fmt" + "github.com/dgraph-io/dgraph/tok" "log" "net/http" _ "net/http/pprof" @@ -26,6 +27,7 @@ import ( "path/filepath" "runtime" "strconv" + "strings" "github.com/dgraph-io/dgraph/x" "github.com/spf13/cobra" @@ -84,27 +86,30 @@ func init() { "Number of reduce shards. This determines the number of dgraph instances in the final "+ "cluster. Increasing this potentially decreases the reduce stage runtime by using "+ "more parallelism, but increases memory usage.") + flag.String("custom_tokenizers", "", + "Comma separated list of tokenizer plugins") } func run() { opt := options{ - RDFDir: Bulk.Conf.GetString("rdfs"), - SchemaFile: Bulk.Conf.GetString("schema_file"), - DgraphsDir: Bulk.Conf.GetString("out"), - TmpDir: Bulk.Conf.GetString("tmp"), - NumGoroutines: Bulk.Conf.GetInt("num_go_routines"), - MapBufSize: int64(Bulk.Conf.GetInt("mapoutput_mb")), - ExpandEdges: Bulk.Conf.GetBool("expand_edges"), - SkipMapPhase: Bulk.Conf.GetBool("skip_map_phase"), - CleanupTmp: Bulk.Conf.GetBool("cleanup_tmp"), - NumShufflers: Bulk.Conf.GetInt("shufflers"), - Version: Bulk.Conf.GetBool("version"), - StoreXids: Bulk.Conf.GetBool("store_xids"), - ZeroAddr: Bulk.Conf.GetString("zero"), - HttpAddr: Bulk.Conf.GetString("http"), - IgnoreErrors: Bulk.Conf.GetBool("ignore_errors"), - MapShards: Bulk.Conf.GetInt("map_shards"), - ReduceShards: Bulk.Conf.GetInt("reduce_shards"), + RDFDir: Bulk.Conf.GetString("rdfs"), + SchemaFile: Bulk.Conf.GetString("schema_file"), + DgraphsDir: Bulk.Conf.GetString("out"), + TmpDir: Bulk.Conf.GetString("tmp"), + NumGoroutines: Bulk.Conf.GetInt("num_go_routines"), + MapBufSize: int64(Bulk.Conf.GetInt("mapoutput_mb")), + ExpandEdges: Bulk.Conf.GetBool("expand_edges"), + SkipMapPhase: Bulk.Conf.GetBool("skip_map_phase"), + CleanupTmp: Bulk.Conf.GetBool("cleanup_tmp"), + NumShufflers: Bulk.Conf.GetInt("shufflers"), + Version: Bulk.Conf.GetBool("version"), + StoreXids: Bulk.Conf.GetBool("store_xids"), + ZeroAddr: Bulk.Conf.GetString("zero"), + HttpAddr: Bulk.Conf.GetString("http"), + IgnoreErrors: Bulk.Conf.GetBool("ignore_errors"), + MapShards: Bulk.Conf.GetInt("map_shards"), + ReduceShards: Bulk.Conf.GetInt("reduce_shards"), + CustomTokenizers: Bulk.Conf.GetString("custom_tokenizers"), } x.PrintVersion() @@ -125,6 +130,11 @@ func run() { opt.NumShufflers, opt.ReduceShards) os.Exit(1) } + if opt.CustomTokenizers != "" { + for _, soFile := range strings.Split(opt.CustomTokenizers, ",") { + tok.LoadCustomTokenizer(soFile) + } + } opt.MapBufSize <<= 20 // Convert from MB to B. From 9bcc40c5b07976d13f74bd3b48cc630e04f7a0a5 Mon Sep 17 00:00:00 2001 From: Javier Alvarado Date: Tue, 11 Dec 2018 17:13:28 -0800 Subject: [PATCH 2/2] Run fule through goimports --- dgraph/cmd/bulk/run.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dgraph/cmd/bulk/run.go b/dgraph/cmd/bulk/run.go index 2ace3e89bc7..63e731b3bf0 100644 --- a/dgraph/cmd/bulk/run.go +++ b/dgraph/cmd/bulk/run.go @@ -19,7 +19,6 @@ package bulk import ( "encoding/json" "fmt" - "github.com/dgraph-io/dgraph/tok" "log" "net/http" _ "net/http/pprof" @@ -29,6 +28,7 @@ import ( "strconv" "strings" + "github.com/dgraph-io/dgraph/tok" "github.com/dgraph-io/dgraph/x" "github.com/spf13/cobra" )