Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve sync performance for pull-mirrors #19125

Merged
merged 25 commits into from
Mar 31, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
aa3a762
optimize tag-release sync procedure for pull-mirrors
petergardfjall Mar 15, 2022
33f5a5e
optimize tag-release sync procedure for pull-mirrors
petergardfjall Mar 18, 2022
5cd3114
godoc
petergardfjall Mar 18, 2022
4ce50be
please linter
petergardfjall Mar 18, 2022
d87be92
please linter: copyright notices
petergardfjall Mar 18, 2022
0e67646
test foreachref.Format
petergardfjall Mar 24, 2022
1fc0bb4
test foreachref.Parser
petergardfjall Mar 24, 2022
6d30da1
stream git for-each-ref output to Parser
petergardfjall Mar 24, 2022
5c91e3d
copyright header for test files
petergardfjall Mar 24, 2022
2847d13
gofumpt
petergardfjall Mar 24, 2022
5d6c962
explicitly ignore return value
petergardfjall Mar 24, 2022
73bb43e
explicitly ignore return value (again)
petergardfjall Mar 24, 2022
4274804
please linter: no encoding/json
petergardfjall Mar 24, 2022
638d690
rename foreachref.Parser test function
petergardfjall Mar 24, 2022
38a1b66
distinguish external imports
petergardfjall Mar 24, 2022
20f6dc5
distinguish external imports part 2
petergardfjall Mar 24, 2022
a5220cf
sort prior to pagination
petergardfjall Mar 24, 2022
fcd1de6
include payload for annotated tags with signature
petergardfjall Mar 28, 2022
ef6352c
less verbose signature payload construction
petergardfjall Mar 28, 2022
2f3908c
code cleanup: remove dead code
petergardfjall Mar 31, 2022
aa267d2
delimiters can be byte slices
petergardfjall Mar 31, 2022
61eae9e
avoid closing writer twice
petergardfjall Mar 31, 2022
fd7f58c
avoid repetitive memory allocations
petergardfjall Mar 31, 2022
c1da4e0
make sure tests add newline to every reference in simulated output
petergardfjall Mar 31, 2022
b5ef68f
Merge branch 'main' into pull-mirror-tag-sync-optimization
wxiaoguang Mar 31, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions modules/git/foreachref/format.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// Copyright 2022 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package foreachref
petergardfjall marked this conversation as resolved.
Show resolved Hide resolved

import (
"encoding/hex"
"fmt"
"io"
"strings"
)

// Default delimiters used by NewFormat.
var (
	// nullChar is a single null byte.
	nullChar = []byte{0x00}
	// dualNullChar is two consecutive null bytes.
	dualNullChar = []byte{0x00, 0x00}
)

// Format supports specifying and parsing an output format for 'git
// for-each-ref'. See git-for-each-ref(1) for available fields.
type Format struct {
	// fieldNames holds the field names that are passed, as %(fieldname)
	// placeholders, to the '--format' flag of for-each-ref. See
	// git-for-each-ref(1) for available fields.
	fieldNames []string

	// fieldDelim is the character sequence that is used to separate fields
	// for each reference. fieldDelim and refDelim should be selected to not
	// interfere with each other and to not be present in field values.
	fieldDelim []byte
	// fieldDelimStr is a string representation of fieldDelim. Used to save
	// us from repetitive reallocation whenever we need the delimiter as a
	// string.
	fieldDelimStr string
	// refDelim is the character sequence used to separate references from
	// each other in the output. fieldDelim and refDelim should be selected
	// to not interfere with each other and to not be present in field
	// values.
	refDelim []byte
}

// NewFormat returns a Format that requests the given fieldNames, in the given
// order. Fields are delimited by a single null character and references by
// two null characters. See git-for-each-ref(1) for available fields.
func NewFormat(fieldNames ...string) Format {
	var f Format
	f.fieldNames = fieldNames
	f.fieldDelim = nullChar
	f.fieldDelimStr = string(nullChar)
	f.refDelim = dualNullChar
	return f
}

// Flag builds the value to pass to the '--format' flag of for-each-ref. Each
// requested field is rendered as its name, a space and the %(name)
// placeholder; fields are joined by the field delimiter and the whole
// reference is terminated by the reference delimiter.
func (f Format) Flag() string {
	// Delimiters are hex-escaped so that control characters can be used as
	// delimiters. For example, '%00' for null character or '%0a' for
	// newline.
	fieldSep := f.hexEscaped(f.fieldDelim)

	var b strings.Builder
	for i, name := range f.fieldNames {
		if i > 0 {
			b.WriteString(fieldSep)
		}
		// field key followed by the placeholder for the field value
		fmt.Fprintf(&b, "%s %%(%s)", name, name)
	}
	b.WriteString(f.hexEscaped(f.refDelim))
	return b.String()
}

// Parser wraps r in a Parser that expects 'git for-each-ref' output laid out
// according to this Format. It is a convenience for NewParser(r, f).
func (f Format) Parser(r io.Reader) *Parser {
	parser := NewParser(r, f)
	return parser
}

// hexEscaped produces a hex-escaped representation of every byte in delim:
// each byte becomes a '%' followed by its two-digit lowercase hexadecimal
// value. For example, "\n\x00" turns into "%0a%00".
func (f Format) hexEscaped(delim []byte) string {
	var escaped strings.Builder
	// every input byte yields exactly three output bytes: '%' and two digits
	escaped.Grow(3 * len(delim))

	var digits [2]byte
	for i := range delim {
		// encode one byte at a time; sub-slicing avoids a per-byte allocation
		hex.Encode(digits[:], delim[i:i+1])
		escaped.WriteByte('%')
		escaped.Write(digits[:])
	}
	return escaped.String()
}
67 changes: 67 additions & 0 deletions modules/git/foreachref/format_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Copyright 2022 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package foreachref_test

import (
"testing"

"code.gitea.io/gitea/modules/git/foreachref"

"github.com/stretchr/testify/require"
)

// TestFormat_Flag checks the '--format' flag value rendered by Format.Flag
// for different sets of requested fields.
func TestFormat_Flag(t *testing.T) {
	type flagCase struct {
		name        string
		givenFormat foreachref.Format
		wantFlag    string
	}

	cases := []flagCase{
		{
			name: "references are delimited by dual null chars",
			// no reference fields requested, so --format only holds the
			// reference delimiter
			givenFormat: foreachref.NewFormat(),
			wantFlag:    "%00%00",
		},
		{
			name:        "a field is a space-separated key-value pair",
			givenFormat: foreachref.NewFormat("refname:short"),
			// a single field followed by the reference delimiter
			wantFlag: "refname:short %(refname:short)%00%00",
		},
		{
			name:        "fields are separated by a null char field-delimiter",
			givenFormat: foreachref.NewFormat("refname:short", "author"),
			wantFlag:    "refname:short %(refname:short)%00author %(author)%00%00",
		},
		{
			name:        "multiple fields",
			givenFormat: foreachref.NewFormat("refname:short", "objecttype", "objectname"),
			wantFlag:    "refname:short %(refname:short)%00objecttype %(objecttype)%00objectname %(objectname)%00%00",
		},
	}

	for i := range cases {
		tc := cases[i] // each subtest gets its own copy of the case
		t.Run(tc.name, func(t *testing.T) {
			gotFlag := tc.givenFormat.Flag()

			require.Equal(t, tc.wantFlag, gotFlag, "unexpected for-each-ref --format string. wanted: '%s', got: '%s'", tc.wantFlag, gotFlag)
		})
	}
}
131 changes: 131 additions & 0 deletions modules/git/foreachref/parser.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
// Copyright 2022 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package foreachref
petergardfjall marked this conversation as resolved.
Show resolved Hide resolved

import (
"bufio"
"bytes"
"fmt"
"io"
"strings"
)

// Parser parses 'git for-each-ref' output according to a given output Format.
type Parser struct {
petergardfjall marked this conversation as resolved.
Show resolved Hide resolved
// tokenizes 'git for-each-ref' output into "reference paragraphs".
scanner *bufio.Scanner

// format represents the '--format' string that describes the expected
// 'git for-each-ref' output structure.
format Format

// err holds the last encountered error during parsing.
err error
}

// NewParser creates a 'git for-each-ref' output parser that will parse all
// references in the provided Reader. The references in the output are assumed
// to follow the specified Format.
func NewParser(r io.Reader, format Format) *Parser {
	// In addition to the reference delimiter we specified in the --format,
	// `git for-each-ref` will always add a newline after every reference.
	refDelim := make([]byte, 0, len(format.refDelim)+1)
	refDelim = append(refDelim, format.refDelim...)
	refDelim = append(refDelim, '\n')

	scanner := bufio.NewScanner(r)

	// Split input into delimiter-separated "reference blocks".
	scanner.Split(func(data []byte, atEOF bool) (int, []byte, error) {
		// All input has been consumed. Returning a nil token lets the
		// scanner stop; an empty non-nil token would instead be reported as
		// yet another (empty) reference on every subsequent Scan.
		if atEOF && len(data) == 0 {
			return 0, nil, nil
		}
		// Scan until delimiter, marking end of reference.
		if delimIdx := bytes.Index(data, refDelim); delimIdx >= 0 {
			return delimIdx + len(refDelim), data[:delimIdx], nil
		}
		// If we're at EOF, we have a final, non-terminated reference.
		// Return it.
		if atEOF {
			return len(data), data, nil
		}
		// Not yet a full reference. Request more data.
		return 0, nil, nil
	})

	return &Parser{
		scanner: scanner,
		format:  format,
	}
}

// Next returns the next reference as a collection of key-value pairs. nil
// denotes EOF but is also returned on errors, both when the underlying output
// could not be read or tokenized and when a reference could not be parsed.
// The Err method should always be consulted after Next returning nil.
//
// It could, for example return something like:
//
//	{ "objecttype": "tag", "refname:short": "v1.16.4", "object": "f460b7543ed500e49c133c2cd85c8c55ee9dbe27" }
func (p *Parser) Next() map[string]string {
	if !p.scanner.Scan() {
		// Scan returns false both at EOF and on failure. Record a scanner
		// failure so that Err does not report it as a clean EOF.
		if err := p.scanner.Err(); err != nil {
			p.err = err
		}
		return nil
	}

	fields, err := p.parseRef(p.scanner.Text())
	if err != nil {
		p.err = err
		return nil
	}
	return fields
}

// Err reports the most recent error recorded by the parser, or nil if none
// has been recorded. Call it whenever Next has returned nil to tell a failure
// apart from a regular end of input.
func (p *Parser) Err() error {
	return p.err
}

// parseRef parses out all key-value pairs from a single reference block, such as
//
//	"objecttype tag\0refname:short v1.16.4\0object f460b7543ed500e49c133c2cd85c8c55ee9dbe27"
//
// The block must hold exactly one field per field name of the parser's Format,
// in the same order as in the Format; otherwise an error is returned. An empty
// block yields a nil map and a nil error.
func (p *Parser) parseRef(refBlock string) (map[string]string, error) {
	if refBlock == "" {
		// nothing to parse: an empty block holds no reference
		return nil, nil
	}

	fields := strings.Split(refBlock, p.format.fieldDelimStr)
	if len(fields) != len(p.format.fieldNames) {
		return nil, fmt.Errorf("unexpected number of reference fields: wanted %d, was %d",
			len(p.format.fieldNames), len(fields))
	}

	fieldValues := make(map[string]string, len(fields))
	for i, field := range fields {
		field = strings.TrimSpace(field)

		// a field is "<key> <value>"; the value itself may contain spaces,
		// so only the first space separates key from value
		var fieldKey string
		var fieldVal string
		firstSpace := strings.Index(field, " ")
		if firstSpace > 0 {
			fieldKey = field[:firstSpace]
			fieldVal = field[firstSpace+1:]
		} else {
			// could be the case if the requested field had no value
			fieldKey = field
		}

		// enforce the format order of fields
		if p.format.fieldNames[i] != fieldKey {
			return nil, fmt.Errorf("unexpected field name at position %d: wanted: '%s', was: '%s'",
				i, p.format.fieldNames[i], fieldKey)
		}

		fieldValues[fieldKey] = fieldVal
	}

	return fieldValues, nil
}
Loading