Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ require (
fortio.org/log v1.17.1
github.com/Masterminds/semver/v3 v3.3.1
github.com/andybalholm/brotli v1.1.1
github.com/avast/retry-go v3.0.0+incompatible
github.com/cenkalti/backoff/v4 v4.3.0
github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,8 @@ github.com/ashanbrown/forbidigo v1.6.0 h1:D3aewfM37Yb3pxHujIPSpTf6oQk9sc9WZi8ger
github.com/ashanbrown/forbidigo v1.6.0/go.mod h1:Y8j9jy9ZYAEHXdu723cUlraTqbzjKF1MUyfOKL+AjcU=
github.com/ashanbrown/makezero v1.2.0 h1:/2Lp1bypdmK9wDIq7uWBlDF1iMUpIIS4A+pF6C9IEUU=
github.com/ashanbrown/makezero v1.2.0/go.mod h1:dxlPhHbDMC6N6xICzFBSK+4njQDdK8euNO0qjQMtGY4=
github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0=
github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY=
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
Expand Down
49 changes: 33 additions & 16 deletions internal/wasm/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,16 @@
// option sets for configuring the cache.
CacheOptions

// permissionCheckCache is the cache for permission check for private OCI images.
// The permission check is run periodically by a background goroutine and the result is cached.
permissionCheckCache *permissionCache

// logger
logger logging.Logger
}

func (c *localFileCache) Start(ctx context.Context) {
c.permissionCheckCache.Start(ctx)
go c.purge(ctx)
}

Expand Down Expand Up @@ -131,9 +136,12 @@
func newLocalFileCache(options CacheOptions, logger logging.Logger) *localFileCache {
options = options.sanitize()
cache := &localFileCache{
httpFetcher: NewHTTPFetcher(options.HTTPRequestTimeout, options.HTTPRequestMaxRetries, logger),
modules: make(map[moduleKey]*cacheEntry),
checksums: make(map[string]*checksumEntry),
httpFetcher: NewHTTPFetcher(options.HTTPRequestTimeout, options.HTTPRequestMaxRetries, logger),
modules: make(map[moduleKey]*cacheEntry),
checksums: make(map[string]*checksumEntry),
permissionCheckCache: newPermissionCache(
permissionCacheOptions{},
logger),
CacheOptions: options,
logger: logger,
}
Expand Down Expand Up @@ -220,7 +228,7 @@
if ce != nil {
// We still need to check if the pull secret is correct if it is a private OCI image.
if u.Scheme == "oci" && ce.isPrivate {
if err = c.checkPermission(ctx, u, insecure, opts); err != nil {
if _, err := c.permissionCheckCache.IsAllowed(ctx, u, opts.PullSecret, insecure); err != nil {

Check warning on line 231 in internal/wasm/cache.go

View check run for this annotation

Codecov / codecov/patch

internal/wasm/cache.go#L231

Added line #L231 was not covered by tests
return nil, err
}
}
Expand Down Expand Up @@ -250,7 +258,24 @@
if len(opts.PullSecret) > 0 {
isPrivate = true
}
if imageBinaryFetcher, dChecksum, err = c.prepareFetch(ctx, u, insecure, opts); err != nil {

imageBinaryFetcher, dChecksum, err = c.prepareFetch(ctx, u, insecure, opts.PullSecret)

if isPrivate {
e := &permissionCacheEntry{
image: u,
fetcherOption: &ImageFetcherOption{
Insecure: insecure,
PullSecret: opts.PullSecret,
},
lastCheck: time.Now(),
lastAccess: time.Now(),
checkError: err,
}
c.permissionCheckCache.Put(e)
}

Check warning on line 276 in internal/wasm/cache.go

View check run for this annotation

Codecov / codecov/patch

internal/wasm/cache.go#L265-L276

Added lines #L265 - L276 were not covered by tests

if err != nil {
wasmRemoteFetchTotal.WithFailure(reasonManifestError).Increment()
return nil, fmt.Errorf("could not fetch Wasm OCI image: %w", err)
}
Expand Down Expand Up @@ -287,24 +312,16 @@
return c.addEntry(key, b, isPrivate)
}

func (c *localFileCache) checkPermission(ctx context.Context, u *url.URL, insecure bool, opts GetOptions) error {
// Try to get the image metadata to check if the pull secret is correct.
if _, _, err := c.prepareFetch(ctx, u, insecure, opts); err != nil {
return fmt.Errorf("failed to login to private registry: %w", err)
}
return nil
}

// prepareFetch won't fetch the binary, but it will prepare the binaryFetcher and actualDigest.
func (c *localFileCache) prepareFetch(
ctx context.Context, url *url.URL, insecure bool, opts GetOptions) (
ctx context.Context, url *url.URL, insecure bool, pullSecret []byte) (
binaryFetcher func() ([]byte, error), actualDigest string, err error,
) {
imgFetcherOps := ImageFetcherOption{
Insecure: insecure,
}
if len(opts.PullSecret) > 0 {
imgFetcherOps.PullSecret = opts.PullSecret
if len(pullSecret) > 0 {
imgFetcherOps.PullSecret = pullSecret

Check warning on line 324 in internal/wasm/cache.go

View check run for this annotation

Codecov / codecov/patch

internal/wasm/cache.go#L324

Added line #L324 was not covered by tests
}
fetcher := NewImageFetcher(ctx, imgFetcherOps, c.logger)
if binaryFetcher, actualDigest, err = fetcher.PrepareFetch(url.Host + url.Path); err != nil {
Expand Down
266 changes: 266 additions & 0 deletions internal/wasm/premissioncache.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,266 @@
// Copyright Envoy Gateway Authors
// SPDX-License-Identifier: Apache-2.0
// The full text of the Apache license is available in the LICENSE file at
// the root of the repo.

// Copyright Istio Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package wasm

import (
"context"
"encoding/hex"
"errors"
"fmt"
"net/http"
"net/url"
"sync"
"time"

"github.com/avast/retry-go"
"github.com/google/go-containerregistry/pkg/v1/remote/transport"

"github.com/envoyproxy/gateway/internal/logging"
)

type permissionCacheOptions struct {
// checkInterval is the interval to recheck the permission for the cached permission entries.
checkInterval time.Duration

// permissionExpiry is the expiry time for permission cache entry.
// The permission cache entry will be updated by rechecking the OCI image permission against the pull secret.
permissionExpiry time.Duration

// cacheExpiry is the expiry time for the permission cache.
// The permission cache will be removed if it is not accessed for the specified expiry time.
// This is used to purge the cache.
cacheExpiry time.Duration
}

// sanitize validates and sets the default values for the permission cache options.
func (o *permissionCacheOptions) sanitize() {
if o.checkInterval == 0 {
o.checkInterval = 5 * time.Minute
}
if o.permissionExpiry == 0 {
o.permissionExpiry = 1 * time.Hour
}
if o.cacheExpiry == 0 {
o.cacheExpiry = 24 * time.Hour
}
}

// permissionCache is a cache for permission check for private OCI images.
// After a new permission is put into the cache, it will be checked periodically by a background goroutine.
// It is used to avoid blocking the translator due to the permission check.
type permissionCache struct {
sync.Mutex
permissionCacheOptions

cache map[string]*permissionCacheEntry
logger logging.Logger
}

// permissionCacheEntry is an entry in the permission cache.
type permissionCacheEntry struct {
// The oci image URL.
image *url.URL
// fetcherOption contains the pull secret for the image.
fetcherOption *ImageFetcherOption
// The last time the pull secret is checked against the image.
lastCheck time.Time
// The error returned by the OCI registry when checking the permission.
// If error is not nil, the permission is not allowed.
// If it's a permission error, it's represented by a transport.Error with 401 or 403 HTTP status code.
// But it's not necessarily a permission error, it could be other errors like network error, non-exist image, etc.
// In this case, the permission is also not allowed.
checkError error
// The last time the cache entry is accessed.
lastAccess time.Time
}

func (e *permissionCacheEntry) key() string {
return permissionCacheKey(e.image, e.fetcherOption.PullSecret)
}

// isPermissionExpired returns true if the permission check is older
// than the specified expiry duration. If this is true, the entry
// should be rechecked.
func (e *permissionCacheEntry) isPermissionExpired(expiry time.Duration) bool {
return time.Now().After(e.lastCheck.Add(expiry))
}

// isCacheExpired returns true if the cache entry has not been accessed
// for the specified expiry duration. If this is true, the entry
// should be removed.
func (e *permissionCacheEntry) isCacheExpired(expiry time.Duration) bool {
return time.Now().After(e.lastAccess.Add(expiry))
}

// permissionCacheKey generates a key for a permission cache entry.
// The key is the hex encoded of concatenation of the image URL and the pull secret.
func permissionCacheKey(image *url.URL, pullSecret []byte) string {
b := make([]byte, len(image.String())+len(pullSecret))
copy(b, image.String())
copy(b[len(image.String()):], pullSecret)
return hex.EncodeToString(b)
}

// newPermissionCache creates a new permission cache with a given TTL.
func newPermissionCache(options permissionCacheOptions, logger logging.Logger) *permissionCache {
options.sanitize()
return &permissionCache{
cache: make(map[string]*permissionCacheEntry),
permissionCacheOptions: options,
logger: logger,
}
}

// checkAndUpdatePermission checks the permission of the image against the pull secret and updates the cache entry.
func (p *permissionCache) checkAndUpdatePermission(ctx context.Context, e *permissionCacheEntry) error {
fetcher := NewImageFetcher(ctx, *e.fetcherOption, p.logger)
_, _, err := fetcher.PrepareFetch(e.image.Host + e.image.Path)
e.checkError = err
e.lastCheck = time.Now()
return err
}

// start starts a background goroutine to periodically check the permission for the cached permission entries.
func (p *permissionCache) Start(ctx context.Context) {
go func() {
ticker := time.NewTicker(p.checkInterval)
defer ticker.Stop()
for {
select {
case <-ticker.C:
func() {
p.Lock()
defer p.Unlock()
for _, e := range p.cache {
if e.isCacheExpired(p.cacheExpiry) {
p.logger.Info("removing permission cache entry", "image", e.image.String())
delete(p.cache, e.key())
continue
}
if e.isPermissionExpired(p.permissionExpiry) {
const retryAttempts = 3
const retryDelay = 1 * time.Second
p.logger.Info("rechecking permission for image", "image", e.image.String())
err := retry.Do(
func() error {
err := p.checkAndUpdatePermission(ctx, e)
if err != nil && isRetriableError(err) {
p.logger.Error(
err,
"failed to check permission for image, will retry again",
"image",
e.image.String())
return err
}

Check warning on line 171 in internal/wasm/premissioncache.go

View check run for this annotation

Codecov / codecov/patch

internal/wasm/premissioncache.go#L165-L171

Added lines #L165 - L171 were not covered by tests
return nil
},
retry.Attempts(retryAttempts),
retry.DelayType(retry.BackOffDelay),
retry.Delay(retryDelay),
retry.Context(ctx),
)
if err != nil {
p.logger.Error(
err,
fmt.Sprintf("failed to recheck permission for image after %d attempts", retryAttempts),
"image",
e.image.String())
}

Check warning on line 185 in internal/wasm/premissioncache.go

View check run for this annotation

Codecov / codecov/patch

internal/wasm/premissioncache.go#L180-L185

Added lines #L180 - L185 were not covered by tests
}
}
}()
case <-ctx.Done():
return
}
}
}()
}

// isRetriableError checks if the error is retriable.
// If the error is a permission error, it's not retriable. For example, 401 and 403 HTTP status code.
func isRetriableError(err error) bool {
var terr *transport.Error
if errors.As(err, &terr) {
if terr.StatusCode == http.StatusUnauthorized || terr.StatusCode == http.StatusForbidden {
return false
}
}
return true

Check warning on line 205 in internal/wasm/premissioncache.go

View check run for this annotation

Codecov / codecov/patch

internal/wasm/premissioncache.go#L205

Added line #L205 was not covered by tests
}

// put puts a new permission cache entry into the cache.
func (p *permissionCache) Put(e *permissionCacheEntry) {
p.Lock()
defer p.Unlock()
e.lastAccess = time.Now()
e.lastCheck = time.Now()
p.cache[e.key()] = e
}

// IsAllowed checks if the given image is allowed to be accessed with the provided pull secret.
// If the permission is not found in the cache, this method will block until the permission is checked and cached.
// This blocking won't be too long as it's only for the first time permission check and won't retry. Subsequent
// permission checks will be done in a background goroutine by the permission cache.
//
// If any error occurs, the permission is considered not allowed.
// The error can be a permission error or other errors like network error, non-exist image, etc.
func (p *permissionCache) IsAllowed(ctx context.Context, image *url.URL, pullSecret []byte, insecure bool) (bool, error) {
p.Lock()
defer p.Unlock()
key := permissionCacheKey(image, pullSecret)
if e, ok := p.cache[key]; ok {
e.lastAccess = time.Now()
return e.checkError == nil, e.checkError
}

e := &permissionCacheEntry{
image: image,
fetcherOption: &ImageFetcherOption{
Insecure: insecure,
PullSecret: pullSecret,
},
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the drawback of this, is, if permissions change on the server side, I'll have to wait 1 hr to update my wasm image

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the primary issue here is - fetching a WASM Image including checking permissions blocks the route linked to this EEP as well as other routes and endpoints and other xds from being pushed to the data plane. Ideally we'd like this two tasks to be happening in parallel.
Can we fetch and check permissions in a separate go routine to unblock the translator ? If the wasm fetcher / checker go routine finds errors it could flag it in the status using watchable
Hoping the WASM URL is based off policy ns-name so data plane requests will fail in case of a deny

Copy link
Member Author

@zhaohuabing zhaohuabing Mar 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we fetch and check permissions in a separate go routine to unblock the translator ? If the wasm fetcher / checker go routine finds errors it could flag it in the status using watchable
Hoping the WASM URL is based off policy ns-name so data plane requests will fail in case of a deny

We can't fetch the OCI image async. The sha256 checksum of the wasm module is a mandatory field for xDS Wasm remote code source, and we can't get the checksum without fetching the image.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the drawback of this, is, if permissions change on the server side, I'll have to wait 1 hr to update my wasm image

Permission change actually won't invalidate cached wasm image, it only prevents the requests from unauthorized EEPs, unauthorized EEP translation will fail.

The cached images are purged after not being touched for expiry duration, which is handled in the wasm image chace.

// Do not retry if the permission check fails because we don't want to block the translator for too long.
// The permission check will be retried in the background goroutine by the permission cache.
if err := p.checkAndUpdatePermission(ctx, e); err != nil {
p.logger.Error(err, "failed to check permission for image", "image", e.image.String())
}
e.lastAccess = time.Now()
p.cache[key] = e
return e.checkError == nil, e.checkError
}

// getForTest is a test helper to get a permission cache entry from the cache.
func (p *permissionCache) getForTest(key string) (permissionCacheEntry, bool) {
p.Lock()
defer p.Unlock()
entry, ok := p.cache[key]
if !ok {
return permissionCacheEntry{}, false
}
return *entry, true
}

// deleteForTest is a test helper to delete a permission cache entry from the cache.
func (p *permissionCache) deleteForTest(key string) {
p.Lock()
defer p.Unlock()
delete(p.cache, key)
}
Loading