Changes from 8 commits
3 changes: 3 additions & 0 deletions pkg/datastore/filedb/BUILD.bazel
@@ -5,10 +5,13 @@ go_library(
srcs = [
"codec.go",
"filedb.go",
"filter.go",
"iterator.go",
],
importpath = "github.com/pipe-cd/pipecd/pkg/datastore/filedb",
visibility = ["//visibility:public"],
deps = [
"//pkg/cache:go_default_library",
"//pkg/datastore:go_default_library",
"//pkg/filestore:go_default_library",
"@org_uber_go_zap//:go_default_library",
103 changes: 100 additions & 3 deletions pkg/datastore/filedb/filedb.go
@@ -17,15 +17,18 @@ package filedb
import (
"context"
"fmt"
"path/filepath"

"go.uber.org/zap"

"github.com/pipe-cd/pipecd/pkg/cache"
"github.com/pipe-cd/pipecd/pkg/datastore"
"github.com/pipe-cd/pipecd/pkg/filestore"
)

type FileDB struct {
backend filestore.Store
cache cache.Cache
logger *zap.Logger
}

@@ -37,9 +40,10 @@ func WithLogger(logger *zap.Logger) Option {
}
}

func NewFileDB(fs filestore.Store, opts ...Option) (*FileDB, error) {
func NewFileDB(fs filestore.Store, c cache.Cache, opts ...Option) (*FileDB, error) {
fd := &FileDB{
backend: fs,
cache: c,
logger: zap.NewNop(),
}
for _, opt := range opts {
@@ -61,11 +65,100 @@ func (f *FileDB) fetch(ctx context.Context, path string) ([]byte, error) {
}

func (f *FileDB) Find(ctx context.Context, col datastore.Collection, opts datastore.ListOptions) (datastore.Iterator, error) {
_, ok := col.(datastore.ShardStorable)
scol, ok := col.(datastore.ShardStorable)
if !ok {
return nil, datastore.ErrUnsupported
}
return nil, datastore.ErrUnimplemented

var (
kind = col.Kind()
shards = scol.ListInUsedShards()
// Map of object data parts, keyed by the object id.
objects map[string][][]byte
)

// Fetch the first part of all objects under a specified "directory".
for _, shard := range shards {
dpath := makeHotStorageDirPath(kind, shard)
parts, err := f.backend.List(ctx, dpath)
if err != nil {
f.logger.Error("failed to find entities",
zap.String("kind", kind),
zap.Error(err),
)
return nil, err
}

if objects == nil {
objects = make(map[string][][]byte, len(parts))
}
for _, obj := range parts {
id := filepath.Base(obj.Path)

// Check whether the raw data stored under this path has already been fetched,
// based on its etag value, which is used as the key in the `f.cache` store.
cdata, err := f.cache.Get(obj.Etag)
if err == nil {
objects[id] = append(objects[id], cdata.([]byte))
continue
}
Member:
How about grouping and storing all models of a kind using HashCache https://github.com/pipe-cd/pipecd/blob/master/pkg/cache/rediscache/hashcache.go? Then we can fetchAll to check instead of calling the cache every time like this.

Member:
And I think we need to design the key name carefully instead of directly using the Etag value, to avoid conflict with other keys in Redis. At other places we're prefixing the name.

Member Author (@khanhtc1202, Mar 3, 2022):
I feel you. In that case, it's possible to have some objects updated while the others are not, meaning we would still loop to check which one is updated and which one is not. There is also a minor point to care about: since I store the whole object content as the value for the etag key, if we group it as a single value and use GetAll, it's a bit dangerous in case there are a lot of objects. Of course, as the downside of the current implementation, we may have a bunch of requests to the cache to check whether each object is updated or not based on its etag. Wdyt about this trade-off 🤔

Member Author (@khanhtc1202, Mar 3, 2022):
> And I think we need to design the key name carefully instead of directly using the Etag value to avoid conflict with other keys in Redis. At other places we're prefixing the name.

Nice catch, let me address it 🙆‍♂️ Tbh, I made the key id_shard_etag at first, but it felt overdone, so let me add the etag_ prefix to this key as in other places.

Member:
> it's possible to have some objects updated while the others are not

Yes, it is. For the outdated ones, we call Cache HSET to update after fetching directly from the file store.

> if we group it as a single value and use GetAll, it's a bit dangerous in case there are a lot of objects

I see. Storing by HashCache, I mean the field key is the object ID instead of the eTag; the eTag is included in the value together with the object content. So the number of cache entries will not increase when an object is updated.
When storing by eTag, the number of entries in the cache will increase quickly whenever an object has been updated. In that case, we will need a TTL or something like that to deal with that problem.

Wdyt?

Member Author (@khanhtc1202, Mar 3, 2022):
How about this:

  • Normal cache (not hashcache)
  • FieldKey: entity_id_shard
  • FieldValue: {etag: etag_value, data: data}

HashCache only helps us reduce the number of cache fetch requests, but the downside is that we need two for loops to work out which entries should be updated, and the value under the hash key can become too large since we store the whole object content.

Member:
What concerns me about that way is that it still requires N requests to our Redis.
But definitely, it is simpler, so let's apply that way for now and see how it goes. 👍

Member Author (@khanhtc1202, Mar 3, 2022):
> What concerns me about that way is that it still requires N requests to our Redis.

I feel you, then why not both 🤔
I mean we can make our cache a bit more complicated to handle this case; we would have:

hashcache, as you drafted, with:

  • hashkey: List_{kind}
  • field key: entityId_shard
  • field value: etagValue

a cache to store object data, with:

  • key: etag_entityId (I mean etag_ is the prefix, not the value of the etag)
  • value: {etag: etagValue, data: data}

And whenever we update the etagValue in the hashcache, we update the cache storing the object content as well. Wdyt 👀

Member Author (@khanhtc1202):
Oops, I forgot that we would still need a separate cache request to fetch the content. Please forget the above suggestion 🙏

Member Author (@khanhtc1202):
Updated, PTAL when you have time 😉


// If there is no value attached with the given etag key, fetch the
// content under the object path.
data, err := f.fetch(ctx, obj.Path)
if err != nil {
f.logger.Error("failed to fetch entity part",
zap.String("kind", kind),
zap.String("id", id),
zap.Error(err),
)
return nil, err
}

// Store fetched data to cache.
if err = f.cache.Put(obj.Etag, data); err != nil {
f.logger.Error("failed to store entity part to cache",
zap.String("kind", kind),
zap.String("id", id),
zap.String("key", obj.Etag),
zap.Error(err),
)
}

objects[id] = append(objects[id], data)
}
}

entities := make([]interface{}, 0, len(objects))
for id, obj := range objects {
e := col.Factory()()
if err := decode(col, e, obj...); err != nil {
f.logger.Error("failed to build entity",
zap.String("kind", kind),
zap.String("id", id),
zap.Error(err),
)
return nil, err
}

pass, err := filter(col, e, opts.Filters)
if err != nil {
f.logger.Error("failed to filter entity",
zap.String("kind", kind),
zap.String("id", id),
zap.Error(err),
)
return nil, err
}

if pass {
entities = append(entities, e)
}
}

return &Iterator{
data: entities,
}, nil
}

func (f *FileDB) Get(ctx context.Context, col datastore.Collection, id string, v interface{}) error {
@@ -141,3 +234,7 @@ func makeHotStorageFilePath(kind, id string, shard datastore.Shard) string {
// TODO: Find a way to separate files by project to avoid fetch resources cross project.
return fmt.Sprintf("%s/%s/%s.json", kind, shard, id)
}

func makeHotStorageDirPath(kind string, shard datastore.Shard) string {
return fmt.Sprintf("%s/%s/", kind, shard)
}
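
Following up on the review thread above, a minimal sketch of the etag-prefixed cache key the discussion settled on. The helper name and the exact "etag_" prefix are assumptions for illustration; this revision of the diff still passes obj.Etag to the cache directly.

// Hypothetical helper: prefix the etag before using it as a cache key so it
// cannot collide with other keys stored in the same Redis instance.
func etagCacheKey(etag string) string {
	return "etag_" + etag
}

// Inside Find, the lookups and writes would then become roughly:
//   cdata, err := f.cache.Get(etagCacheKey(obj.Etag))
//   ...
//   err = f.cache.Put(etagCacheKey(obj.Etag), data)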
33 changes: 33 additions & 0 deletions pkg/datastore/filedb/filter.go
@@ -0,0 +1,33 @@
// Copyright 2022 The PipeCD Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package filedb

import (
"github.com/pipe-cd/pipecd/pkg/datastore"
)

// TODO: Implement filterable interface for each collection.
type filterable interface {
Match(e interface{}, filters []datastore.ListFilter) (bool, error)
}

func filter(col datastore.Collection, e interface{}, filters []datastore.ListFilter) (bool, error) {
fcol, ok := col.(filterable)
if !ok {
return false, datastore.ErrUnsupported
}

return fcol.Match(e, filters)
}
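
For orientation, a minimal sketch of a collection satisfying the filterable interface above. The entity type, the field being matched, and the assumption that datastore.ListFilter exposes Field and Value are illustrative only, not part of this change.

package filedb

import (
	"fmt"

	"github.com/pipe-cd/pipecd/pkg/datastore"
)

// exampleEntity is a hypothetical stored model used only for this sketch.
type exampleEntity struct {
	Id       string
	Disabled bool
}

// exampleCollection wraps a datastore.Collection and adds Match, so the
// filter helper above can use it.
type exampleCollection struct {
	datastore.Collection
}

func (exampleCollection) Match(e interface{}, filters []datastore.ListFilter) (bool, error) {
	v, ok := e.(*exampleEntity)
	if !ok {
		return false, fmt.Errorf("unexpected entity type: %T", e)
	}
	for _, f := range filters {
		// Only simple equality on a known field is sketched here.
		switch f.Field {
		case "Disabled":
			if f.Value != v.Disabled {
				return false, nil
			}
		default:
			return false, datastore.ErrUnsupported
		}
	}
	return true, nil
}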
29 changes: 29 additions & 0 deletions pkg/datastore/filedb/iterator.go
@@ -0,0 +1,29 @@
// Copyright 2022 The PipeCD Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package filedb

import "github.com/pipe-cd/pipecd/pkg/datastore"

type Iterator struct {
data []interface{}
}

func (it *Iterator) Next(dst interface{}) error {
Collaborator:
dst is unused in Next

return datastore.ErrUnimplemented
}

func (it *Iterator) Cursor() (string, error) {
return "", datastore.ErrUnimplemented
}
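
Not part of this change, but for orientation: one possible shape for Next once it is implemented, addressing the "dst is unused" comment above by copying the next buffered entity into dst. The index field, the JSON round-trip, and the assumption that the datastore package exposes an iterator-done error are all illustrative.

package filedb

import (
	"encoding/json"

	"github.com/pipe-cd/pipecd/pkg/datastore"
)

type iteratorSketch struct {
	data  []interface{}
	index int
}

// Next copies the current entity into dst and advances the iterator.
func (it *iteratorSketch) Next(dst interface{}) error {
	if it.index >= len(it.data) {
		// Assumed sentinel; the real datastore package may use a different error.
		return datastore.ErrIteratorDone
	}
	raw, err := json.Marshal(it.data[it.index])
	if err != nil {
		return err
	}
	it.index++
	return json.Unmarshal(raw, dst)
}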