From e95e4352e6e7a0e82dda289ae3bc33f9de4ad662 Mon Sep 17 00:00:00 2001 From: Ibrahim Jarif Date: Fri, 24 Jan 2020 20:44:27 +0530 Subject: [PATCH 1/6] Expose Badger Table and Vlog loading mode in Bulk Loader --- dgraph/cmd/bulk/loader.go | 9 ++++++++- dgraph/cmd/bulk/reduce.go | 28 ++++++++++++++++++++++++++++ dgraph/cmd/bulk/run.go | 22 +++++++++++++++++----- 3 files changed, 53 insertions(+), 6 deletions(-) diff --git a/dgraph/cmd/bulk/loader.go b/dgraph/cmd/bulk/loader.go index 43d73ce8607..1a463b08be6 100644 --- a/dgraph/cmd/bulk/loader.go +++ b/dgraph/cmd/bulk/loader.go @@ -49,7 +49,6 @@ type options struct { OutDir string ReplaceOutDir bool TmpDir string - BadgerKeyFile string // used only in enterprise build. nil otherwise. NumGoroutines int MapBufSize uint64 SkipMapPhase bool @@ -67,6 +66,14 @@ type options struct { ReduceShards int shardOutputDirs []string + + // ........... Badger options .......... + // BadgerTables is the name of the mode used to load the badger tables. + BadgerTables string + // BadgerVlog is the name of the mode used to load the badger value log. + BadgerVlog string + // BadgerKeyFile is the file containing the key used for encryption. Enterprise only feature. + BadgerKeyFile string } type state struct { diff --git a/dgraph/cmd/bulk/reduce.go b/dgraph/cmd/bulk/reduce.go index 20f9b783928..b689b7ca068 100644 --- a/dgraph/cmd/bulk/reduce.go +++ b/dgraph/cmd/bulk/reduce.go @@ -40,6 +40,7 @@ import ( "github.com/dgraph-io/dgraph/worker" "github.com/dgraph-io/dgraph/x" "github.com/gogo/protobuf/proto" + "github.com/golang/glog" ) type reducer struct { @@ -111,6 +112,9 @@ func (r *reducer) createBadger(i int) *badger.DB { // TOOD(Ibrahim): Remove this once badger is updated. opt.ZSTDCompressionLevel = 1 + // Over-write badger options based on the options provided by the user. 
+ r.setBadgerOptions(&opt) + db, err := badger.OpenManaged(opt) x.Check(err) @@ -121,6 +125,30 @@ func (r *reducer) createBadger(i int) *badger.DB { return db } +func (r *reducer) setBadgerOptions(opt *badger.Options) { + glog.Infof("Setting Badger table load option: %s", r.state.opt.BadgerTables) + switch r.state.opt.BadgerTables { + case "mmap": + opt.TableLoadingMode = bo.MemoryMap + case "ram": + opt.TableLoadingMode = bo.LoadToRAM + case "disk": + opt.TableLoadingMode = bo.FileIO + default: + x.Fatalf("Invalid Badger Table Loading mode: %s", r.state.opt.BadgerTables) + } + + glog.Infof("Setting Badger value log load option: %s", r.state.opt.BadgerVlog) + switch r.state.opt.BadgerVlog { + case "mmap": + opt.ValueLogLoadingMode = bo.MemoryMap + case "disk": + opt.ValueLogLoadingMode = bo.FileIO + default: + x.Fatalf("Invalid Badger ValueLog Loading mode: %s", r.state.opt.BadgerVlog) + } +} + type mapIterator struct { fd *os.File reader *bufio.Reader diff --git a/dgraph/cmd/bulk/run.go b/dgraph/cmd/bulk/run.go index 23c6a23fb43..73487bf0ade 100644 --- a/dgraph/cmd/bulk/run.go +++ b/dgraph/cmd/bulk/run.go @@ -65,10 +65,6 @@ func init() { flag.String("tmp", "tmp", "Temp directory used to use for on-disk scratch space. Requires free space proportional"+ " to the size of the RDF file and the amount of indexing used.") - flag.String("encryption_key_file", "", - "The file that stores the encryption key. The key size must be 16, 24, or 32 bytes long. "+ - "The key size determines the corresponding block size for AES encryption "+ - "(AES-128, AES-192, and AES-256 respectively). Enterprise feature.") flag.IntP("num_go_routines", "j", int(math.Ceil(float64(runtime.NumCPU())/4.0)), "Number of worker threads to use. MORE THREADS LEAD TO HIGHER RAM USAGE.") @@ -101,6 +97,19 @@ func init() { "Comma separated list of tokenizer plugins") flag.Bool("new_uids", false, "Ignore UIDs in load files and assign new ones.") + + // Options around how to set up Badger. 
+ flag.String("badger.tables", "mmap", + "[ram, mmap, disk] Specifies how Badger LSM tree is stored. "+ + "Option sequence consume most to least RAM while providing best to worst read "+ + "performance respectively.") + flag.String("badger.vlog", "mmap", + "[mmap, disk] Specifies how Badger Value log is stored."+ + " mmap consumes more RAM, but provides better performance.") + flag.String("encryption_key_file", "", + "The file that stores the encryption key. The key size must be 16, 24, or 32 bytes long. "+ + "The key size determines the corresponding block size for AES encryption "+ + "(AES-128, AES-192, and AES-256 respectively). Enterprise feature.") } func run() { @@ -111,7 +120,6 @@ func run() { OutDir: Bulk.Conf.GetString("out"), ReplaceOutDir: Bulk.Conf.GetBool("replace_out"), TmpDir: Bulk.Conf.GetString("tmp"), - BadgerKeyFile: Bulk.Conf.GetString("encryption_key_file"), NumGoroutines: Bulk.Conf.GetInt("num_go_routines"), MapBufSize: uint64(Bulk.Conf.GetInt("mapoutput_mb")), SkipMapPhase: Bulk.Conf.GetBool("skip_map_phase"), @@ -126,6 +134,10 @@ func run() { ReduceShards: Bulk.Conf.GetInt("reduce_shards"), CustomTokenizers: Bulk.Conf.GetString("custom_tokenizers"), NewUids: Bulk.Conf.GetBool("new_uids"), + + BadgerTables: Bulk.Conf.GetString("badger.tables"), + BadgerVlog: Bulk.Conf.GetString("badger.vlog"), + BadgerKeyFile: Bulk.Conf.GetString("encryption_key_file"), } x.PrintVersion() From 40eb06d366d7cb0fa822dcb873f5b0ef97e65ce1 Mon Sep 17 00:00:00 2001 From: Martin Martinez Rivera Date: Fri, 24 Jan 2020 13:56:00 -0800 Subject: [PATCH 2/6] Add flag to set compression level. 
--- dgraph/cmd/bulk/loader.go | 2 ++ dgraph/cmd/bulk/reduce.go | 10 +++++++--- dgraph/cmd/bulk/run.go | 9 ++++++--- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/dgraph/cmd/bulk/loader.go b/dgraph/cmd/bulk/loader.go index 1a463b08be6..5dfa7a28283 100644 --- a/dgraph/cmd/bulk/loader.go +++ b/dgraph/cmd/bulk/loader.go @@ -74,6 +74,8 @@ type options struct { BadgerVlog string // BadgerKeyFile is the file containing the key used for encryption. Enterprise only feature. BadgerKeyFile string + // Badger is the compression level to use while writing to badger. + BadgerCompressionLevel int } type state struct { diff --git a/dgraph/cmd/bulk/reduce.go b/dgraph/cmd/bulk/reduce.go index b689b7ca068..d12683d0663 100644 --- a/dgraph/cmd/bulk/reduce.go +++ b/dgraph/cmd/bulk/reduce.go @@ -109,9 +109,6 @@ func (r *reducer) createBadger(i int) *badger.DB { WithLogger(nil).WithMaxCacheSize(1 << 20). WithEncryptionKey(enc.ReadEncryptionKeyFile(r.opt.BadgerKeyFile)) - // TOOD(Ibrahim): Remove this once badger is updated. - opt.ZSTDCompressionLevel = 1 - // Over-write badger options based on the options provided by the user. r.setBadgerOptions(&opt) @@ -147,6 +144,13 @@ func (r *reducer) setBadgerOptions(opt *badger.Options) { default: x.Fatalf("Invalid Badger ValueLog Loading mode: %s", r.state.opt.BadgerVlog) } + + // Set the compression level. Default to 1 if the compression level is set to + // zero or a negative number. + opt.ZSTDCompressionLevel = r.state.opt.BadgerCompressionLevel + if r.state.opt.BadgerCompressionLevel < 1 { + opt.ZSTDCompressionLevel = 1 + } } type mapIterator struct { diff --git a/dgraph/cmd/bulk/run.go b/dgraph/cmd/bulk/run.go index 73487bf0ade..67e63568b0f 100644 --- a/dgraph/cmd/bulk/run.go +++ b/dgraph/cmd/bulk/run.go @@ -110,6 +110,8 @@ func init() { "The file that stores the encryption key. The key size must be 16, 24, or 32 bytes long. 
"+ "The key size determines the corresponding block size for AES encryption "+ "(AES-128, AES-192, and AES-256 respectively). Enterprise feature.") + flag.Int("badger.compression_level", 1, + "The compression level for Badger. A higher value uses more resources.") } func run() { @@ -135,9 +137,10 @@ func run() { CustomTokenizers: Bulk.Conf.GetString("custom_tokenizers"), NewUids: Bulk.Conf.GetBool("new_uids"), - BadgerTables: Bulk.Conf.GetString("badger.tables"), - BadgerVlog: Bulk.Conf.GetString("badger.vlog"), - BadgerKeyFile: Bulk.Conf.GetString("encryption_key_file"), + BadgerTables: Bulk.Conf.GetString("badger.tables"), + BadgerVlog: Bulk.Conf.GetString("badger.vlog"), + BadgerKeyFile: Bulk.Conf.GetString("encryption_key_file"), + BadgerCompressionLevel: Bulk.Conf.GetInt("badger.compression_level"), } x.PrintVersion() From 98bf77e1762cf409353b125cca0b57cf47059a9d Mon Sep 17 00:00:00 2001 From: Ibrahim Jarif Date: Mon, 27 Jan 2020 15:08:26 +0530 Subject: [PATCH 3/6] Address review comments --- dgraph/cmd/bulk/reduce.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dgraph/cmd/bulk/reduce.go b/dgraph/cmd/bulk/reduce.go index d12683d0663..a97a44ea7a4 100644 --- a/dgraph/cmd/bulk/reduce.go +++ b/dgraph/cmd/bulk/reduce.go @@ -145,11 +145,11 @@ func (r *reducer) setBadgerOptions(opt *badger.Options) { x.Fatalf("Invalid Badger ValueLog Loading mode: %s", r.state.opt.BadgerVlog) } - // Set the compression level. Default to 1 if the compression level is set to - // zero or a negative number. + // Set the compression level. opt.ZSTDCompressionLevel = r.state.opt.BadgerCompressionLevel if r.state.opt.BadgerCompressionLevel < 1 { - opt.ZSTDCompressionLevel = 1 + x.Fatalf("Invalid compression level: %d. 
It should be greater than zero", + r.state.opt.BadgerCompressionLevel) } } From 6e3b4b29c47cd26a43350669c30d64a2941394b8 Mon Sep 17 00:00:00 2001 From: Ibrahim Jarif Date: Fri, 31 Jan 2020 13:03:25 +0530 Subject: [PATCH 4/6] Remove table and vlog loading mode option --- dgraph/cmd/bulk/reduce.go | 23 ----------------------- dgraph/cmd/bulk/run.go | 9 --------- 2 files changed, 32 deletions(-) diff --git a/dgraph/cmd/bulk/reduce.go b/dgraph/cmd/bulk/reduce.go index a97a44ea7a4..6bb45e04b0f 100644 --- a/dgraph/cmd/bulk/reduce.go +++ b/dgraph/cmd/bulk/reduce.go @@ -40,7 +40,6 @@ import ( "github.com/dgraph-io/dgraph/worker" "github.com/dgraph-io/dgraph/x" "github.com/gogo/protobuf/proto" - "github.com/golang/glog" ) type reducer struct { @@ -123,28 +122,6 @@ func (r *reducer) createBadger(i int) *badger.DB { } func (r *reducer) setBadgerOptions(opt *badger.Options) { - glog.Infof("Setting Badger table load option: %s", r.state.opt.BadgerTables) - switch r.state.opt.BadgerTables { - case "mmap": - opt.TableLoadingMode = bo.MemoryMap - case "ram": - opt.TableLoadingMode = bo.LoadToRAM - case "disk": - opt.TableLoadingMode = bo.FileIO - default: - x.Fatalf("Invalid Badger Table Loading mode: %s", r.state.opt.BadgerTables) - } - - glog.Infof("Setting Badger value log load option: %s", r.state.opt.BadgerVlog) - switch r.state.opt.BadgerVlog { - case "mmap": - opt.ValueLogLoadingMode = bo.MemoryMap - case "disk": - opt.ValueLogLoadingMode = bo.FileIO - default: - x.Fatalf("Invalid Badger ValueLog Loading mode: %s", r.state.opt.BadgerVlog) - } - // Set the compression level. opt.ZSTDCompressionLevel = r.state.opt.BadgerCompressionLevel if r.state.opt.BadgerCompressionLevel < 1 { diff --git a/dgraph/cmd/bulk/run.go b/dgraph/cmd/bulk/run.go index 67e63568b0f..56027c31bd1 100644 --- a/dgraph/cmd/bulk/run.go +++ b/dgraph/cmd/bulk/run.go @@ -99,13 +99,6 @@ func init() { "Ignore UIDs in load files and assign new ones.") // Options around how to set up Badger. 
- flag.String("badger.tables", "mmap", - "[ram, mmap, disk] Specifies how Badger LSM tree is stored. "+ - "Option sequence consume most to least RAM while providing best to worst read "+ - "performance respectively.") - flag.String("badger.vlog", "mmap", - "[mmap, disk] Specifies how Badger Value log is stored."+ - " mmap consumes more RAM, but provides better performance.") flag.String("encryption_key_file", "", "The file that stores the encryption key. The key size must be 16, 24, or 32 bytes long. "+ "The key size determines the corresponding block size for AES encryption "+ @@ -137,8 +130,6 @@ func run() { CustomTokenizers: Bulk.Conf.GetString("custom_tokenizers"), NewUids: Bulk.Conf.GetBool("new_uids"), - BadgerTables: Bulk.Conf.GetString("badger.tables"), - BadgerVlog: Bulk.Conf.GetString("badger.vlog"), BadgerKeyFile: Bulk.Conf.GetString("encryption_key_file"), BadgerCompressionLevel: Bulk.Conf.GetInt("badger.compression_level"), } From de06ad9734bd59c2d2568cb5fcff839b6fd0bb34 Mon Sep 17 00:00:00 2001 From: Ibrahim Jarif Date: Fri, 31 Jan 2020 13:05:18 +0530 Subject: [PATCH 5/6] fix options --- dgraph/cmd/bulk/loader.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dgraph/cmd/bulk/loader.go b/dgraph/cmd/bulk/loader.go index 5dfa7a28283..0597aa03de2 100644 --- a/dgraph/cmd/bulk/loader.go +++ b/dgraph/cmd/bulk/loader.go @@ -68,10 +68,6 @@ type options struct { shardOutputDirs []string // ........... Badger options .......... - // BadgerTables is the name of the mode used to load the badger tables. - BadgerTables string - // BadgerVlog is the name of the mode used to load the badger value log. - BadgerVlog string // BadgerKeyFile is the file containing the key used for encryption. Enterprise only feature. BadgerKeyFile string // Badger is the compression level to use while writing to badger. 
From 1c6031ee76263ce15b80e4563470922c1b85decb Mon Sep 17 00:00:00 2001 From: Ibrahim Jarif Date: Wed, 5 Feb 2020 00:01:22 +0530 Subject: [PATCH 6/6] fix comment --- dgraph/cmd/bulk/loader.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dgraph/cmd/bulk/loader.go b/dgraph/cmd/bulk/loader.go index 5dfa7a28283..86c8770a468 100644 --- a/dgraph/cmd/bulk/loader.go +++ b/dgraph/cmd/bulk/loader.go @@ -74,7 +74,7 @@ type options struct { BadgerVlog string // BadgerKeyFile is the file containing the key used for encryption. Enterprise only feature. BadgerKeyFile string - // Badger is the compression level to use while writing to badger. + // BadgerCompressionLevel is the compression level to use while writing to badger. BadgerCompressionLevel int }