Skip to content

Commit 1b301e2

Browse files
authored
Codexnull/bulk schema (#2762)
* Don't skip predicates with value type of default when loading the schema. (#2616) * Allow running test.sh from another directory. * Keep all predicates from bulk import schema, not just the ones used. * Make set of predicates the union of predicates in the schema and rdf. * Add test for schema after export/bulk load. * Add more schema test cases.
1 parent 3da37ce commit 1b301e2

File tree

5 files changed

+250
-5
lines changed

5 files changed

+250
-5
lines changed

contrib/scripts/test-bulk-schema.sh

+227
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
#!/bin/bash
2+
# verify fix of https://github.com/dgraph-io/dgraph/issues/2616
3+
4+
# Script name (basename of $0); used as the prefix of every log message.
readonly ME=${0##*/}

# Absolute, symlink-resolved directory that contains this script; used to
# locate ../wait-for-it.sh. dirname is used instead of stripping "/*" from
# BASH_SOURCE so that invoking the script without a slash in its path
# (e.g. "bash test-bulk-schema.sh") yields the directory, not the script file.
SRCDIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P) || exit 1
readonly SRCDIR

# Random port offset so that concurrent runs are unlikely to collide.
# The same offset is handed to "dgraph zero"/"dgraph alpha" via -o.
# NOTE(review): 5080/7080/8080 are presumably dgraph's default base ports.
declare -ri PORT_OFFSET=$((RANDOM % 1000))
declare -ri ZERO_PORT=$((5080 + PORT_OFFSET))
declare -ri ALPHA_PORT=$((7080 + PORT_OFFSET))
declare -ri HTTP_PORT=$((8080 + PORT_OFFSET))
10+
11+
# INFO msg...  -- print an informational message on stdout, prefixed with $ME.
# "$*" (not "$@") is used because the arguments are joined into one string.
INFO() { printf '%s\n' "$ME: $*"; }

# ERROR msg... -- print an error message on stderr, prefixed with $ME.
ERROR() { printf '%s\n' "$ME: $*" >&2; }

# FATAL msg... -- report the error on stderr and terminate the script with
# exit status 1.
FATAL() { ERROR "$@"; exit 1; }
14+
15+
# Abort as soon as any unchecked command fails.
set -e

INFO "running bulk load schema test"

# Everything (server data, logs, fixtures) lives under a fresh temporary
# directory. The expansions are quoted so a TMPDIR or script name containing
# whitespace does not split into several words.
WORKDIR=$(mktemp --tmpdir -d "$ME.tmp-XXXXXX")
INFO "using workdir $WORKDIR"
cd "$WORKDIR"
22+
23+
# StartZero -- launch "dgraph zero" in the background and wait until its
# port accepts connections.
#
# Globals:  reads PORT_OFFSET, ZERO_PORT, SRCDIR; sets ZERO_PID.
# Outputs:  server output is written to zero.log in the current directory.
# Exits:    via FATAL if wait-for-it.sh (invoked with -q -t 30) reports that
#           the port did not come up.
function StartZero
{
  INFO "starting zero server on port $ZERO_PORT"
  dgraph zero -o "$PORT_OFFSET" --my="localhost:$ZERO_PORT" \
    >zero.log 2>&1 </dev/null &
  ZERO_PID=$!
  sleep 1
  # SRCDIR is quoted: the checkout path may contain whitespace.
  "$SRCDIR/../wait-for-it.sh" -q -t 30 "localhost:$ZERO_PORT" \
    || FATAL "failed to start zero"
}
33+
34+
# BulkLoadSampleData -- write the sample schema, then bulk load the
# "1million" sample data set while it is being downloaded.
#
# Globals:  reads ZERO_PORT.
# Outputs:  creates 1million.schema, the FIFO 1million.rdf.gz and bulk.log in
#           the current directory.
# Exits:    via FATAL if the bulk loader or the download fails.
function BulkLoadSampleData
{
  INFO "bulk loading sample data"
  cat >1million.schema <<'EOF'
director.film: uid @reverse .
genre: uid @reverse .
initial_release_date: dateTime @index(year) .
name: string @index(term) @lang .
EOF

  # The download is streamed to the loader through a FIFO so the archive
  # never has to be stored on disk. --fail makes curl return an error on an
  # HTTP failure instead of piping an error page into the loader.
  mkfifo 1million.rdf.gz
  curl -fLsS 'https://github.com/dgraph-io/tutorial/blob/master/resources/1million.rdf.gz?raw=true' >> 1million.rdf.gz &
  local curl_pid=$!

  dgraph bulk -z "localhost:$ZERO_PORT" -s 1million.schema -r 1million.rdf.gz \
    >bulk.log 2>&1 </dev/null \
    || FATAL "bulk load of sample data failed, see $PWD/bulk.log"

  # A background job does not trip "set -e"; check the download explicitly.
  wait "$curl_pid" || FATAL "failed to download sample data"
}
48+
49+
# StartAlpha -- launch "dgraph alpha" in the background, attached to the zero
# server, and wait until its port accepts connections.
#
# Globals:  reads PORT_OFFSET, ALPHA_PORT, ZERO_PORT, SRCDIR; sets ALPHA_PID.
# Outputs:  server output is written to alpha.log in the current directory.
# Exits:    via FATAL if wait-for-it.sh (invoked with -q -t 30) reports that
#           the port did not come up.
function StartAlpha
{
  INFO "starting alpha server on port $ALPHA_PORT"
  dgraph alpha -o "$PORT_OFFSET" --my="localhost:$ALPHA_PORT" \
    --zero="localhost:$ZERO_PORT" --lru_mb=2048 \
    >alpha.log 2>&1 </dev/null &
  ALPHA_PID=$!
  sleep 1
  # SRCDIR is quoted: the checkout path may contain whitespace.
  "$SRCDIR/../wait-for-it.sh" -q -t 30 "localhost:$ALPHA_PORT" \
    || FATAL "failed to start alpha"
}
59+
60+
# UpdateDatabase -- extend the running alpha's schema with three extra
# predicates (one of them of type "default") and store one value for the
# default-typed predicate.
#
# Globals:  reads HTTP_PORT.
# Exits:    via FATAL if either HTTP request fails. curl runs with -sS so
#           the progress meter is hidden but real errors still reach stderr
#           (previously all output was discarded and the script just died).
function UpdateDatabase
{
  INFO "adding predicate with default type to schema"
  curl -fsS "localhost:$HTTP_PORT/alter" -X POST -d$'
predicate_with_no_uid_count:string .
predicate_with_default_type:default .
predicate_with_index_no_uid_count:string @index(exact) .
' >/dev/null || FATAL "failed to alter schema"

  curl -fsS "localhost:$HTTP_PORT/mutate" -X POST -H 'X-Dgraph-CommitNow: true' -d $'
{
set {
_:company1 <predicate_with_default_type> "CompanyABC" .
}
}
' >/dev/null || FATAL "failed to add sample mutation"
}
77+
78+
# QuerySchema OUT_FILE -- query the alpha's schema over HTTP and write the
# "data" member of the response to OUT_FILE as JSON with sorted keys and
# 2-space indentation, followed by a newline, so that two dumps can be
# compared with diff.
#
# Globals:    reads HTTP_PORT.
# Arguments:  $1 - output file (required; the shell aborts with
#             "no out file" when it is missing).
function QuerySchema
{
  INFO "running schema query"
  local out_file=${1:?no out file}
  # out_file is quoted so that paths containing whitespace work.
  curl -sS "localhost:$HTTP_PORT/query" -XPOST -d'schema {}' \
    | python -c "import json,sys; d=json.load(sys.stdin); json.dump(d['data'],sys.stdout,sort_keys=True,indent=2,separators=(',',': '))" > "$out_file"
  # json.dump does not emit a trailing newline; add one.
  echo >> "$out_file"
  #INFO "schema is: " && cat $out_file
}
86+
87+
# DoExport -- ask the running alpha to export its data via /admin/export.
#
# Globals:  reads HTTP_PORT.
# Exits:    via FATAL if the HTTP request fails. curl runs with -sS so the
#           progress meter is hidden but real errors still reach stderr.
function DoExport
{
  INFO "running export"
  curl -fsS "localhost:$HTTP_PORT/admin/export" >/dev/null \
    || FATAL "export request failed"
  sleep 1
}
93+
94+
# BulkLoadExportedData -- bulk load the schema and rdf files found under
# ../dir1/export, then move the resulting posting directory (out/0/p) into
# the current directory.
#
# Globals:  reads ZERO_PORT.
# Outputs:  loader output is written to bulk.log in the current directory.
function BulkLoadExportedData
{
  INFO "bulk loading exported data"

  # The name of the export subdirectory is not known here; glob for it.
  local -a exported_schema=( ../dir1/export/*/g01.schema.gz )
  local -a exported_rdf=( ../dir1/export/*/g01.rdf.gz )

  dgraph bulk -z localhost:$ZERO_PORT \
    -s "${exported_schema[@]}" \
    -r "${exported_rdf[@]}" \
    >bulk.log 2>&1 </dev/null

  mv out/0/p .
}
103+
104+
# BulkLoadFixtureData -- write a small schema/rdf fixture pair covering the
# schema test cases below, bulk load it, and move the resulting posting
# directory (out/0/p) into the current directory.
#
# Globals:  reads ZERO_PORT.
# Outputs:  creates fixture.schema, fixture.rdf and bulk.log in the current
#           directory.
function BulkLoadFixtureData
{
  INFO "bulk loading fixture data"

  # schema test cases:
  #
  # 1. predicate with non-default type (name)
  # 2. predicate with default type (genre)
  # 3. predicate not used in rdf (language)
  printf '%s\n' \
    'name:string @index(term) .' \
    'genre:default .' \
    'language:string .' \
    >fixture.schema

  # rdf test cases:
  #
  # 4. predicate not in schema (revenue)
  printf '%s\n' \
    '_:et <name> "E.T. the Extra-Terrestrial" .' \
    '_:et <genre> "Science Fiction" .' \
    '_:et <revenue> "792.9" .' \
    >fixture.rdf

  dgraph bulk -z localhost:$ZERO_PORT -s fixture.schema -r fixture.rdf \
    >bulk.log 2>&1 </dev/null
  mv out/0/p .
}
132+
133+
# StopServers -- terminate the zero and alpha servers started by StartZero
# and StartAlpha and block until both processes have exited.
#
# Globals:  reads ZERO_PID, ALPHA_PID.
function StopServers
{
  INFO "killing zero server at pid $ZERO_PID"
  INFO "killing alpha server at pid $ALPHA_PID"
  kill "$ZERO_PID" "$ALPHA_PID"
  # Wait for the processes themselves rather than sleeping for a fixed time:
  # the next phase restarts servers on the same ports, so they must be gone.
  # The status of a signalled job is non-zero; ignore it so that "set -e"
  # does not abort the script here.
  wait "$ZERO_PID" "$ALPHA_PID" 2>/dev/null || true
}
140+
141+
# Cleanup -- remove the temporary working directory and everything in it.
#
# Globals:  reads WORKDIR.
function Cleanup
{
  INFO "removing $WORKDIR"
  # Quoted so a path with whitespace is removed as one directory (and nothing
  # else); ":?" aborts instead of running rm with an empty path.
  rm -rf -- "${WORKDIR:?}"
}
146+
147+
# Phase 1 (dir1): bulk load the sample data, start an alpha on it, add extra
# predicates over HTTP, record the schema in dir1/schema.out and export the
# database. The servers are stopped before leaving the directory.
mkdir dir1
148+
pushd dir1 >/dev/null
149+
150+
StartZero
151+
BulkLoadSampleData
152+
StartAlpha
153+
UpdateDatabase
154+
# schema as seen before the export
QuerySchema "schema.out"
155+
DoExport
156+
StopServers
157+
158+
popd >/dev/null
159+
# Phase 2 (dir2): bulk load the files exported in phase 1 into a fresh
# cluster and record the resulting schema in dir2/schema.out.
mkdir dir2
160+
pushd dir2 >/dev/null
161+
162+
StartZero
163+
BulkLoadExportedData
164+
StartAlpha
165+
# schema as seen after export + bulk import
QuerySchema "schema.out"
166+
StopServers
167+
168+
popd >/dev/null
169+
170+
# The schema recorded before the export (dir1) and the one recorded after
# bulk loading the exported files (dir2) must be identical.
INFO "verifying schema is same before export and after bulk import"
diff dir1/schema.out dir2/schema.out || FATAL "schema incorrect"
INFO "schema is correct"
173+
174+
# Phase 3 (dir3): bulk load the hand-written fixture schema/rdf into a fresh
# cluster and record the resulting schema in dir3/schema.out.
mkdir dir3
175+
pushd dir3 >/dev/null
176+
177+
StartZero
178+
BulkLoadFixtureData
179+
StartAlpha
180+
QuerySchema "schema.out"
181+
StopServers
182+
183+
popd >/dev/null
184+
185+
# final schema should include *all* predicates regardless of whether they were
# introduced by the schema or rdf file, used or not used, or of default type
# or non-default type
#
# The expected document must match what QuerySchema writes byte for byte:
# JSON with sorted keys and 2-space indentation, so the indentation below is
# significant.
INFO "verifying schema contains all predicates"
diff - dir3/schema.out <<EOF || FATAL "schema incorrect"
{
  "schema": [
    {
      "list": true,
      "predicate": "_predicate_",
      "type": "string"
    },
    {
      "predicate": "genre",
      "type": "default"
    },
    {
      "predicate": "language",
      "type": "string"
    },
    {
      "index": true,
      "predicate": "name",
      "tokenizer": [
        "term"
      ],
      "type": "string"
    },
    {
      "predicate": "revenue",
      "type": "default"
    }
  ]
}
EOF

INFO "schema is correct"

# Only reached when every check passed; on failure the working directory is
# left in place.
Cleanup

exit 0
226+
227+
# eof

dgraph/cmd/bulk/run.go

-2
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ package bulk
1818

1919
import (
2020
"encoding/json"
21-
"flag"
2221
"fmt"
2322
"log"
2423
"net/http"
@@ -113,7 +112,6 @@ func run() {
113112
os.Exit(0)
114113
}
115114
if opt.RDFDir == "" || opt.SchemaFile == "" {
116-
flag.Usage()
117115
fmt.Fprint(os.Stderr, "RDF and schema file(s) must be specified.\n")
118116
os.Exit(1)
119117
}

dgraph/cmd/bulk/schema.go

+15-1
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,21 @@ func (s *schemaStore) getPredicates(db *badger.DB) []string {
120120
func (s *schemaStore) write(db *badger.DB) {
121121
// Write schema always at timestamp 1, s.state.writeTs may not be equal to 1
122122
// if bulk loader was restarted or other similar scenarios.
123-
preds := s.getPredicates(db)
123+
124+
// Get predicates from the schema store so that the db includes all
125+
// predicates from the schema file.
126+
preds := make([]string, 0, len(s.m))
127+
for pred := range s.m {
128+
preds = append(preds, pred)
129+
}
130+
131+
// Add predicates from the db so that final schema includes predicates
132+
// used in the rdf file but not included in the schema file.
133+
for _, pred := range s.getPredicates(db) {
134+
if _, ok := s.m[pred]; ! ok {
135+
preds = append(preds, pred)
136+
}
137+
}
124138

125139
txn := db.NewTransactionAt(math.MaxUint64, true)
126140
defer txn.Discard()

schema/schema.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,7 @@ func LoadFromDb() error {
282282
var s pb.SchemaUpdate
283283
err := item.Value(func(val []byte) error {
284284
if len(val) == 0 {
285-
return nil
285+
s = pb.SchemaUpdate{Predicate: attr, ValueType: pb.Posting_DEFAULT}
286286
}
287287
x.Checkf(s.Unmarshal(val), "Error while loading schema from db")
288288
State().Set(attr, s)

test.sh

+7-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
#!/bin/bash
22

3-
source contrib/scripts/functions.sh
3+
# run from directory containing this script
4+
cd ${BASH_SOURCE[0]%/*}
5+
6+
source ./contrib/scripts/functions.sh
47
function run {
58
go test -short=true $@ |\
69
GREP_COLORS='mt=01;32' egrep --line-buffered --color=always '^ok\ .*|$' |\
@@ -31,6 +34,9 @@ echo
3134
echo "Running tests. Ignoring vendor folder."
3235
runAll || exit $?
3336

37+
# Run non-go tests.
38+
./contrib/scripts/test-bulk-schema.sh
39+
3440
echo
3541
echo "Running load-test.sh"
3642
./contrib/scripts/load-test.sh

0 commit comments

Comments
 (0)