r/glue_crawler - s3 sample size #20203

Merged: 8 commits, Jul 19, 2021

3 changes: 3 additions & 0 deletions .changelog/20203.txt
@@ -0,0 +1,3 @@
```release-note:enhancement
resource/aws_glue_crawler: Add `sample_size` argument in `s3_target` block.
```
67 changes: 32 additions & 35 deletions aws/resource_aws_glue_crawler.go
@@ -79,15 +79,10 @@ func resourceAwsGlueCrawler() *schema.Resource {
Elem: &schema.Schema{Type: schema.TypeString},
},
"schema_change_policy": {
- Type: schema.TypeList,
- Optional: true,
- DiffSuppressFunc: func(k, old, new string, d *schema.ResourceData) bool {
- if old == "1" && new == "0" {
- return true
- }
- return false
- },
- MaxItems: 1,
+ Type: schema.TypeList,
+ Optional: true,
+ DiffSuppressFunc: suppressMissingOptionalConfigurationBlock,
+ MaxItems: 1,
Elem: &schema.Resource{
Schema: map[string]*schema.Schema{
"delete_behavior": {
@@ -130,6 +125,11 @@ func resourceAwsGlueCrawler() *schema.Resource {
Optional: true,
Elem: &schema.Schema{Type: schema.TypeString},
},
"sample_size": {
Type: schema.TypeInt,
Optional: true,
ValidateFunc: validation.IntBetween(1, 249),
},
},
},
},
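
The 1 to 249 bounds match the documented limit for the Glue `SampleSize` field (see the website change at the bottom of this diff). `validation.IntBetween` from the plugin SDK's `helper/validation` package returns a `SchemaValidateFunc`, so out-of-range values fail at plan time rather than at the API. A minimal probe of the validator; the import path shown is the v1 SDK's and may differ by SDK major version:

```go
package main

import (
	"fmt"

	"github.com/hashicorp/terraform-plugin-sdk/helper/validation"
)

func main() {
	f := validation.IntBetween(1, 249)

	// In range: accepted, no errors returned.
	_, errs := f(100, "sample_size")
	fmt.Println(len(errs)) // 0

	// Out of range: rejected during plan, before any Glue API call.
	_, errs = f(250, "sample_size")
	fmt.Println(len(errs)) // 1
}
```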
@@ -233,15 +233,10 @@ func resourceAwsGlueCrawler() *schema.Resource {
ValidateFunc: validation.StringIsJSON,
},
"lineage_configuration": {
- Type: schema.TypeList,
- Optional: true,
- MaxItems: 1,
- DiffSuppressFunc: func(k, old, new string, d *schema.ResourceData) bool {
- if old == "1" && new == "0" {
- return true
- }
- return false
- },
+ Type: schema.TypeList,
+ Optional: true,
+ MaxItems: 1,
+ DiffSuppressFunc: suppressMissingOptionalConfigurationBlock,
Elem: &schema.Resource{
Schema: map[string]*schema.Schema{
"crawler_lineage_settings": {
@@ -254,15 +249,10 @@ func resourceAwsGlueCrawler() *schema.Resource {
},
},
"recrawl_policy": {
- Type: schema.TypeList,
- Optional: true,
- MaxItems: 1,
- DiffSuppressFunc: func(k, old, new string, d *schema.ResourceData) bool {
- if old == "1" && new == "0" {
- return true
- }
- return false
- },
+ Type: schema.TypeList,
+ Optional: true,
+ MaxItems: 1,
+ DiffSuppressFunc: suppressMissingOptionalConfigurationBlock,
Elem: &schema.Resource{
Schema: map[string]*schema.Schema{
"recrawl_behavior": {
@@ -403,16 +393,14 @@ func updateCrawlerInput(d *schema.ResourceData, crawlerName string) (*glue.UpdateCrawlerInput, error) {

crawlerInput.TablePrefix = aws.String(d.Get("table_prefix").(string))

- if configuration, ok := d.GetOk("configuration"); ok {
- crawlerInput.Configuration = aws.String(configuration.(string))
- }

if v, ok := d.GetOk("configuration"); ok {
configuration, err := structure.NormalizeJsonString(v)
if err != nil {
return nil, fmt.Errorf("Configuration contains an invalid JSON: %v", err)
}
crawlerInput.Configuration = aws.String(configuration)
+ } else {
+ crawlerInput.Configuration = aws.String("")
}

if securityConfiguration, ok := d.GetOk("security_configuration"); ok {
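
Two behavior changes land in this hunk: `configuration` is now round-tripped through `structure.NormalizeJsonString`, so formatting-only differences in the JSON no longer produce a diff, and the new `else` branch explicitly resets the crawler configuration to an empty string when the argument is removed (the new step added to `TestAccAWSGlueCrawler_Configuration` below exercises exactly that). A minimal sketch of the normalization, assuming the plugin SDK's `helper/structure` package (v1 import path shown):

```go
package main

import (
	"fmt"

	"github.com/hashicorp/terraform-plugin-sdk/helper/structure"
)

func main() {
	// Same JSON, different formatting and key order: unmarshal/remarshal
	// yields one canonical string (encoding/json sorts object keys).
	a, _ := structure.NormalizeJsonString(`{"Version":1.0,"Grouping":{"TableGroupingPolicy":"CombineCompatibleSchemas"}}`)
	b, _ := structure.NormalizeJsonString(`{
  "Grouping": {"TableGroupingPolicy": "CombineCompatibleSchemas"},
  "Version": 1.0
}`)
	fmt.Println(a == b) // true, so no spurious plan diff
}
```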
@@ -521,13 +509,18 @@ func expandGlueS3Target(cfg map[string]interface{}) *glue.S3Target {
Path: aws.String(cfg["path"].(string)),
}

- if connection, ok := cfg["connection_name"]; ok {
- target.ConnectionName = aws.String(connection.(string))
+ if v, ok := cfg["connection_name"]; ok {
+ target.ConnectionName = aws.String(v.(string))
}

- if exclusions, ok := cfg["exclusions"]; ok {
- target.Exclusions = expandStringList(exclusions.([]interface{}))
+ if v, ok := cfg["exclusions"]; ok {
+ target.Exclusions = expandStringList(v.([]interface{}))
}

+ if v, ok := cfg["sample_size"]; ok && v.(int) > 0 {
+ target.SampleSize = aws.Int64(int64(v.(int)))
+ }

return target
}
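
The `v.(int) > 0` guard is doing real work here: the plugin SDK returns the zero value for an unset `TypeInt`, so `ok` alone cannot distinguish "omitted" from an explicit `0`. Because `ValidateFunc` already restricts user input to 1 through 249, zero can only mean "unset", and `SampleSize` stays `nil` so the Glue API keeps its default of crawling every file. A self-contained illustration with a hypothetical target map:

```go
package main

import "fmt"

func main() {
	// cfg mimics what the SDK hands expandGlueS3Target when sample_size
	// was never set in HCL: the key exists but holds the int zero value.
	cfg := map[string]interface{}{
		"path":        "s3://bucket1",
		"sample_size": 0,
	}

	if v, ok := cfg["sample_size"]; ok && v.(int) > 0 {
		fmt.Println("sending SampleSize:", v)
	} else {
		// Taken for the map above: SampleSize is omitted from the request.
		fmt.Println("omitting SampleSize")
	}
}
```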

@@ -768,6 +761,10 @@ func flattenGlueS3Targets(s3Targets []*glue.S3Target) []map[string]interface{} {
attrs["path"] = aws.StringValue(s3Target.Path)
attrs["connection_name"] = aws.StringValue(s3Target.ConnectionName)

+ if s3Target.SampleSize != nil {
+ attrs["sample_size"] = aws.Int64Value(s3Target.SampleSize)
+ }

result = append(result, attrs)
}
return result
73 changes: 69 additions & 4 deletions aws/resource_aws_glue_crawler_test.go
@@ -38,10 +38,11 @@ func testSweepGlueCrawlers(region string) error {
for _, crawler := range page.Crawlers {
name := aws.StringValue(crawler.Name)

log.Printf("[INFO] Deleting Glue Crawler: %s", name)
_, err := conn.DeleteCrawler(&glue.DeleteCrawlerInput{
Name: aws.String(name),
})
r := resourceAwsGlueCrawler()
d := r.Data(nil)
d.SetId(name)

err := r.Delete(d, client)
if err != nil {
log.Printf("[ERROR] Failed to delete Glue Crawler %s: %s", name, err)
}
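
The sweeper now deletes through the resource's own `Delete` function instead of calling `DeleteCrawler` directly, so test sweeps and real destroys share one code path, including any retry or not-found handling that lives there. The Delete implementation itself is not shown in this diff; a sketch of what it plausibly looks like under this provider's v1-SDK conventions, where `AWSClient`, `glueconn`, and the error message are assumptions:

```go
func resourceAwsGlueCrawlerDelete(d *schema.ResourceData, meta interface{}) error {
	conn := meta.(*AWSClient).glueconn

	// Only d.Id() is consulted, which is why the sweeper can pass a bare
	// ResourceData seeded with nothing but d.SetId(name).
	log.Printf("[DEBUG] Deleting Glue Crawler: %s", d.Id())
	_, err := conn.DeleteCrawler(&glue.DeleteCrawlerInput{
		Name: aws.String(d.Id()),
	})
	if err != nil {
		return fmt.Errorf("error deleting Glue Crawler (%s): %w", d.Id(), err)
	}

	return nil
}
```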
@@ -629,6 +630,42 @@ func TestAccAWSGlueCrawler_S3Target_ConnectionName(t *testing.T) {
})
}

func TestAccAWSGlueCrawler_S3Target_SampleSize(t *testing.T) {
var crawler glue.Crawler
rName := acctest.RandomWithPrefix("tf-acc-test")
resourceName := "aws_glue_crawler.test"

resource.ParallelTest(t, resource.TestCase{
PreCheck: func() { testAccPreCheck(t) },
ErrorCheck: testAccErrorCheck(t, glue.EndpointsID),
Providers: testAccProviders,
CheckDestroy: testAccCheckAWSGlueCrawlerDestroy,
Steps: []resource.TestStep{
{
Config: testAccGlueCrawlerConfig_S3TargetSampleSize(rName, 1),
Check: resource.ComposeTestCheckFunc(
testAccCheckAWSGlueCrawlerExists(resourceName, &crawler),
resource.TestCheckResourceAttr(resourceName, "s3_target.#", "1"),
resource.TestCheckResourceAttr(resourceName, "s3_target.0.sample_size", "1"),
),
},
{
ResourceName: resourceName,
ImportState: true,
ImportStateVerify: true,
},
{
Config: testAccGlueCrawlerConfig_S3TargetSampleSize(rName, 2),
Check: resource.ComposeTestCheckFunc(
testAccCheckAWSGlueCrawlerExists(resourceName, &crawler),
resource.TestCheckResourceAttr(resourceName, "s3_target.#", "1"),
resource.TestCheckResourceAttr(resourceName, "s3_target.0.sample_size", "2"),
),
},
},
})
}

func TestAccAWSGlueCrawler_S3Target_Exclusions(t *testing.T) {
var crawler glue.Crawler
rName := acctest.RandomWithPrefix("tf-acc-test")
@@ -953,6 +990,13 @@ func TestAccAWSGlueCrawler_Configuration(t *testing.T) {
ImportState: true,
ImportStateVerify: true,
},
+ {
+ Config: testAccGlueCrawlerConfig_Configuration(rName, ""),
+ Check: resource.ComposeTestCheckFunc(
+ testAccCheckAWSGlueCrawlerExists(resourceName, &crawler),
+ resource.TestCheckResourceAttr(resourceName, "configuration", ""),
+ ),
+ },
},
})
}
@@ -2550,3 +2594,24 @@
}
`, rName, policy)
}

func testAccGlueCrawlerConfig_S3TargetSampleSize(rName string, size int) string {
return testAccGlueCrawlerConfig_Base(rName) + fmt.Sprintf(`
resource "aws_glue_catalog_database" "test" {
name = %[1]q
}

resource "aws_glue_crawler" "test" {
depends_on = [aws_iam_role_policy_attachment.test-AWSGlueServiceRole]

database_name = aws_glue_catalog_database.test.name
name = %[1]q
role = aws_iam_role.test.name

s3_target {
sample_size = %[2]d
path = "s3://bucket1"
}
}
`, rName, size)
}
1 change: 1 addition & 0 deletions website/docs/r/glue_crawler.html.markdown
@@ -168,6 +168,7 @@ The following arguments are supported:
* `path` - (Required) The path to the Amazon S3 target.
* `connection_name` - (Optional) The name of a connection which allows the crawler to access data in S3 within a VPC.
* `exclusions` - (Optional) A list of glob patterns used to exclude files from the crawl.
+ * `sample_size` - (Optional) Sets the number of files in each leaf folder to be crawled when crawling sample files in a dataset. If not set, all the files are crawled. A valid value is an integer between 1 and 249.

### Catalog Target
