Skip to content
This repository has been archived by the owner on Dec 9, 2022. It is now read-only.

LOG-1772: Canoe: productionise Paddle #2

Merged
merged 12 commits into from
Oct 16, 2017
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
tmp
dist
paddle
36 changes: 34 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,40 @@
# Paddle

Paddle is a command line tool for data archival and processing.
Paddle is a command line tool for canoe data archival and processing.

Work in progress.
## Setup

Make sure you have Go installed on your machine and that you checkout the repo to
the right folder. By default should be:

```
mkdir -p ~/go/src/github.com/deliveroo
cd ~/go/src/github.com/deliveroo
git clone [email protected]:deliveroo/paddle.git
cd paddle
```

You will need create a `$HOME/.paddle.yaml` that contains the bucket name, e.g:

```
> cat $HOME/.paddle.yaml
bucket: roo-bucket
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might be worth pointing out that these settings can be configured through ENV as well (e.g. BUCKET).

```

You will also need to create a `$HOME/.aws/config` or `$HOME/.aws/credentials` so Paddle can connect to AWS, e.g.:

```
> cat $HOME/.aws/credentials
[default]
aws_access_key_id=xxx
aws_secret_access_key=yyy
region=eu-west-1
```

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unnecessary newline


```
$ go build
```
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should specify that Go must be installed only for development / tests. for actual usage, we'll be generating a binary which we can include instructions in the Readme for.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍


## Usage

Expand Down
99 changes: 66 additions & 33 deletions cli/data/get.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,21 @@ import (
"strings"
)

type S3Target struct {
bucket string
prefix string
path string
}

func (s *S3Target) copy() *S3Target {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this here to ensure that we aren't mutating?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand enough yet about Go to grasp if I should be just passing structs by value and allow the runtime to clone those from my understanding but yet.. here was just do that scenario where you want to be a good citzen:

class A
  def initialize(options = {})
     opts = options.dup
     opts.merge(...)
  end
end

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I wouldn't have this function, if anything we can just copy below; and copy is a built-in so it's not a great name. I would just pass by value, no reason to pass by reference here.

clone := *s
return &clone
}

func (t *S3Target) fullPath() string {
return fmt.Sprintf("%s/%s/%s", t.bucket, t.prefix, t.path);
}

var getBranch string
var getCommitPath string

Expand All @@ -44,7 +59,14 @@ $ paddle data get -b experimental trained-model/version1 dest/path
if !viper.IsSet("bucket") {
exitErrorf("Bucket not defined. Please define 'bucket' in your config file.")
}
fetchPath(viper.GetString("bucket"), args[0], getBranch, getCommitPath, args[1])

source := S3Target{
bucket: viper.GetString("bucket"),
prefix: fmt.Sprintf("%s/%s", args[0], getBranch),
path: getCommitPath,
}

copyPathToDestination(&source, args[1])
},
}

Expand All @@ -53,69 +75,80 @@ func init() {
getCmd.Flags().StringVarP(&getCommitPath, "path", "p", "HEAD", "Path to fetch (instead of HEAD)")
}

func fetchPath(bucket string, version string, branch string, path string, destination string) {
sess := session.Must(session.NewSessionWithOptions(session.Options{
func copyPathToDestination(source *S3Target, destination string) {
session := session.Must(session.NewSessionWithOptions(session.Options{
SharedConfigState: session.SharedConfigEnable,
}))

if path == "HEAD" {
svc := s3.New(sess)
headPath := fmt.Sprintf("%s/%s/HEAD", version, branch)
fmt.Println(headPath)
out, err := svc.GetObject(&s3.GetObjectInput{
Bucket: aws.String(bucket),
Key: aws.String(headPath),
})
if err != nil {
exitErrorf("%v", err)
}
buf := new(bytes.Buffer)
buf.ReadFrom(out.Body)
path = buf.String()
} else {
path = fmt.Sprintf("%s/%s/%s", version, branch, path)
/*
* HEAD contains the path to latest folder
*/
if source.path == "HEAD" {
source = source.copy()
source.path = readHEAD(session, source)
}

fmt.Println("Copying " + source.fullPath() + " to " + destination)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I kinda liked a more minimal output to be honest, no biggie though

copy(session, source, destination)
}

func readHEAD(session *session.Session, source *S3Target) string {
svc := s3.New(session)
key := fmt.Sprintf("%s/HEAD", source.prefix)

out, err := svc.GetObject(&s3.GetObjectInput{
Bucket: aws.String(source.bucket),
Key: aws.String(key),
})

if err != nil {
exitErrorf("%v", err)
}
fmt.Println("Fetching " + path)
getBucketObjects(sess, bucket, path, destination)

buf := new(bytes.Buffer)
buf.ReadFrom(out.Body)
return buf.String()
}

func getBucketObjects(sess *session.Session, bucket string, prefix string, dest string) {
func copy(session *session.Session, source *S3Target, destination string) {
query := &s3.ListObjectsV2Input{
Bucket: aws.String(bucket),
Prefix: aws.String(prefix),
Bucket: aws.String(source.bucket),
Prefix: aws.String(source.prefix + "/" + source.path),
}
svc := s3.New(sess)
svc := s3.New(session)

truncatedListing := true

for truncatedListing {
resp, err := svc.ListObjectsV2(query)
response, err := svc.ListObjectsV2(query)

if err != nil {
fmt.Println(err.Error())
return
}
getObjectsAll(bucket, resp, svc, prefix, dest)
query.ContinuationToken = resp.NextContinuationToken
truncatedListing = *resp.IsTruncated
copyToLocalFiles(svc, response.Contents, source, destination)

// Check if more results
query.ContinuationToken = response.NextContinuationToken
truncatedListing = *response.IsTruncated
}
}

func getObjectsAll(bucket string, bucketObjectsList *s3.ListObjectsV2Output, s3Client *s3.S3, prefix string, dest string) {
for _, key := range bucketObjectsList.Contents {
func copyToLocalFiles(s3Client *s3.S3, objects []*s3.Object, source *S3Target, destination string) {
for _, key := range objects {
destFilename := *key.Key
if strings.HasSuffix(*key.Key, "/") {
fmt.Println("Got a directory")
continue
}
out, err := s3Client.GetObject(&s3.GetObjectInput{
Bucket: aws.String(bucket),
Bucket: aws.String(source.bucket),
Key: key.Key,
})
if err != nil {
exitErrorf("%v", err)
}
destFilePath := dest + "/" + strings.TrimPrefix(destFilename, prefix+"/")
destFilePath := destination + "/" + strings.TrimPrefix(destFilename, source.prefix + "/")
err = os.MkdirAll(filepath.Dir(destFilePath), 0777)
fmt.Print(destFilePath)
destFile, err := os.Create(destFilePath)
Expand Down
9 changes: 2 additions & 7 deletions cli/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,8 @@ var cfgFile string
// RootCmd represents the base command when called without any subcommands
var RootCmd = &cobra.Command{
Use: "paddle",
Short: "A brief description of your application",
Long: `A longer description that spans multiple lines and likely contains
examples and usage of using your application. For example:

Cobra is a CLI library for Go that empowers applications.
This application is a tool to generate the needed files
to quickly create a Cobra application.`,
Short: "Canoe tool for data archival and processing",
Long: "Canoe tool for data archival and processing",
// Uncomment the following line if your bare application
// has an action associated with it:
// Run: func(cmd *cobra.Command, args []string) { },
Expand Down