diff --git a/dlp/README.md b/dlp/README.md index 2a96a80446b..8cda47a223a 100644 --- a/dlp/README.md +++ b/dlp/README.md @@ -66,49 +66,50 @@ Options: -f, --maxFindings [number] [default: 0] maximum number of results to retrieve -q, --includeQuote [boolean] [default: true] include matching string in results - -t, --infoTypes restrict to limited set of infoTypes [ default: []] - [ eg. PHONE_NUMBER US_PASSPORT] + -t, --infoTypes set of infoTypes to search for [eg. PHONE_NUMBER US_PASSPORT] + -customDictionaries set of comma-separated dictionary words to search for as customInfoTypes + -customRegexes set of regex patterns to search for as customInfoTypes ``` ### Examples - Inspect a string: ``` - java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Inspect -s "My phone number is (123) 456-7890 and my email address is me@somedomain.com" + java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Inspect -s "My phone number is (123) 456-7890 and my email address is me@somedomain.com" --infoTypes PHONE_NUMBER EMAIL_ADDRESS + java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Inspect -s "My phone number is (123) 456-7890 and my email address is me@somedomain.com" -customDictionaries me@somedomain.com -customRegexes "\(\d{3}\) \d{3}-\d{4}" ``` - Inspect a local file (text / image): ``` - java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Inspect -f resources/test.txt - java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Inspect -f resources/test.png + java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Inspect -f src/test/resources/test.txt --infoTypes PHONE_NUMBER EMAIL_ADDRESS + java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Inspect -f src/test/resources/test.png --infoTypes PHONE_NUMBER EMAIL_ADDRESS ``` - Inspect a file on Google Cloud Storage: ``` - java -cp target/dlp-samples-1.0-jar-with-dependencies.jar 
com.example.dlp.Inspect -gcs -bucketName my-bucket -fileName my-file.txt + java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Inspect -gcs -bucketName my-bucket -fileName my-file.txt --infoTypes PHONE_NUMBER EMAIL_ADDRESS ``` - Inspect a Google Cloud Datastore kind: ``` - java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Inspect -ds -kind my-kind + java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Inspect -ds -kind my-kind --infoTypes PHONE_NUMBER EMAIL_ADDRESS ``` -## Automatic redaction of sensitive data -[Automatic redaction](https://cloud.google.com/dlp/docs/classification-redaction) produces an output with sensitive data matches removed. +## Automatic redaction of sensitive data from images +[Automatic redaction](https://cloud.google.com/dlp/docs/redacting-sensitive-data-images) produces an output image with sensitive data matches removed. ``` Commands: - -s Source input string - -r String to replace detected info types + -f Source image file + -o Destination image file Options: --help Show help -minLikelihood choices: "LIKELIHOOD_UNSPECIFIED", "VERY_UNLIKELY", "UNLIKELY", "POSSIBLE", "LIKELY", "VERY_LIKELY"] [default: "LIKELIHOOD_UNSPECIFIED"] specifies the minimum reporting likelihood threshold. - -infoTypes restrict operation to limited set of info types [ default: []] - [ eg. PHONE_NUMBER US_PASSPORT] + -infoTypes set of infoTypes to search for [eg. 
PHONE_NUMBER US_PASSPORT] ``` ### Example -- Replace sensitive data in text with `_REDACTED_`: +- Redact phone numbers and email addresses from `test.png`: ``` - java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Redact -s "My phone number is (123) 456-7890 and my email address is me@somedomain.com" -r "_REDACTED_" + java -cp target/dlp-samples-1.0-jar-with-dependencies.jar com.example.dlp.Redact -f src/test/resources/test.png -o test-redacted.png -infoTypes PHONE_NUMBER EMAIL_ADDRESS ``` ## Integration tests diff --git a/dlp/src/main/java/com/example/dlp/DeIdentification.java b/dlp/src/main/java/com/example/dlp/DeIdentification.java index cd6932454af..7415cc79593 100644 --- a/dlp/src/main/java/com/example/dlp/DeIdentification.java +++ b/dlp/src/main/java/com/example/dlp/DeIdentification.java @@ -57,6 +57,7 @@ import java.time.format.DateTimeParseException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.stream.Collectors; import org.apache.commons.cli.CommandLine; @@ -81,7 +82,11 @@ public class DeIdentification { * @param projectId ID of Google Cloud project to run the API under. 
*/ private static void deIdentifyWithMask( - String string, Character maskingCharacter, int numberToMask, String projectId) { + String string, + List<InfoType> infoTypes, + Character maskingCharacter, + int numberToMask, + String projectId) { // instantiate a client try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) { @@ -108,6 +113,11 @@ private static void deIdentifyWithMask( .addTransformations(infoTypeTransformationObject) .build(); + InspectConfig inspectConfig = + InspectConfig.newBuilder() + .addAllInfoTypes(infoTypes) + .build(); + DeidentifyConfig deidentifyConfig = DeidentifyConfig.newBuilder() .setInfoTypeTransformations(infoTypeTransformationArray) @@ -117,6 +127,7 @@ private static void deIdentifyWithMask( DeidentifyContentRequest request = DeidentifyContentRequest.newBuilder() .setParent(ProjectName.of(projectId).toString()) + .setInspectConfig(inspectConfig) .setDeidentifyConfig(deidentifyConfig) .setItem(contentItem) .build(); @@ -147,6 +158,7 @@ private static void deIdentifyWithMask( */ private static void deIdentifyWithFpe( String string, + List<InfoType> infoTypes, FfxCommonNativeAlphabet alphabet, String keyName, String wrappedKey, @@ -188,6 +200,11 @@ private static void deIdentifyWithFpe( .addTransformations(infoTypeTransformationObject) .build(); + InspectConfig inspectConfig = + InspectConfig.newBuilder() + .addAllInfoTypes(infoTypes) + .build(); + // Create the deidentification request object DeidentifyConfig deidentifyConfig = DeidentifyConfig.newBuilder() @@ -197,6 +214,7 @@ private static void deIdentifyWithFpe( DeidentifyContentRequest request = DeidentifyContentRequest.newBuilder() .setParent(ProjectName.of(projectId).toString()) + .setInspectConfig(inspectConfig) .setDeidentifyConfig(deidentifyConfig) .setItem(contentItem) .build(); @@ -513,6 +531,10 @@ public static void main(String[] args) throws Exception { Options commandLineOptions = new Options(); commandLineOptions.addOptionGroup(optionsGroup); + Option infoTypesOption = 
Option.builder("infoTypes").hasArg(true).required(false).build(); + infoTypesOption.setArgs(Option.UNLIMITED_VALUES); + commandLineOptions.addOption(infoTypesOption); + Option maskingCharacterOption = Option.builder("maskingCharacter").hasArg(true).required(false).build(); commandLineOptions.addOption(maskingCharacterOption); @@ -575,12 +597,21 @@ public static void main(String[] args) throws Exception { String projectId = cmd.getOptionValue(projectIdOption.getOpt(), ServiceOptions.getDefaultProjectId()); + List<InfoType> infoTypesList = Collections.emptyList(); + if (cmd.hasOption(infoTypesOption.getOpt())) { + infoTypesList = new ArrayList<>(); + String[] infoTypes = cmd.getOptionValues(infoTypesOption.getOpt()); + for (String infoType : infoTypes) { + infoTypesList.add(InfoType.newBuilder().setName(infoType).build()); + } + } + if (cmd.hasOption("m")) { // deidentification with character masking int numberToMask = Integer.parseInt(cmd.getOptionValue(numberToMaskOption.getOpt(), "0")); char maskingCharacter = cmd.getOptionValue(maskingCharacterOption.getOpt(), "*").charAt(0); String val = cmd.getOptionValue(deidentifyMaskingOption.getOpt()); - deIdentifyWithMask(val, maskingCharacter, numberToMask, projectId); + deIdentifyWithMask(val, infoTypesList, maskingCharacter, numberToMask, projectId); } else if (cmd.hasOption("f")) { // deidentification with FPE String wrappedKey = cmd.getOptionValue(wrappedKeyOption.getOpt()); @@ -591,7 +622,8 @@ public static void main(String[] args) throws Exception { FfxCommonNativeAlphabet.valueOf( cmd.getOptionValue( alphabetOption.getOpt(), FfxCommonNativeAlphabet.ALPHA_NUMERIC.name())); - deIdentifyWithFpe(val, alphabet, keyName, wrappedKey, projectId, surrogateType); + deIdentifyWithFpe( + val, infoTypesList, alphabet, keyName, wrappedKey, projectId, surrogateType); } else if (cmd.hasOption("d")) { //deidentify with date shift String inputCsv = cmd.getOptionValue(inputCsvPathOption.getOpt()); diff --git 
a/dlp/src/main/java/com/example/dlp/Inspect.java b/dlp/src/main/java/com/example/dlp/Inspect.java index d4adc6d33ba..8defeb657c7 100644 --- a/dlp/src/main/java/com/example/dlp/Inspect.java +++ b/dlp/src/main/java/com/example/dlp/Inspect.java @@ -27,6 +27,10 @@ import com.google.privacy.dlp.v2.CloudStorageOptions; import com.google.privacy.dlp.v2.ContentItem; import com.google.privacy.dlp.v2.CreateDlpJobRequest; +import com.google.privacy.dlp.v2.CustomInfoType; +import com.google.privacy.dlp.v2.CustomInfoType.Dictionary; +import com.google.privacy.dlp.v2.CustomInfoType.Dictionary.WordList; +import com.google.privacy.dlp.v2.CustomInfoType.Regex; import com.google.privacy.dlp.v2.DatastoreOptions; import com.google.privacy.dlp.v2.DlpJob; import com.google.privacy.dlp.v2.Finding; @@ -52,6 +56,7 @@ import java.nio.file.Files; import java.nio.file.Paths; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.concurrent.TimeUnit; @@ -82,6 +87,7 @@ private static void inspectString( Likelihood minLikelihood, int maxFindings, List<InfoType> infoTypes, + List<CustomInfoType> customInfoTypes, boolean includeQuote, String projectId) { // instantiate a client @@ -91,6 +97,7 @@ private static void inspectString( InspectConfig inspectConfig = InspectConfig.newBuilder() .addAllInfoTypes(infoTypes) + .addAllCustomInfoTypes(customInfoTypes) .setMinLikelihood(minLikelihood) .setLimits(findingLimits) .setIncludeQuote(includeQuote) @@ -146,6 +153,7 @@ private static void inspectFile( Likelihood minLikelihood, int maxFindings, List<InfoType> infoTypes, + List<CustomInfoType> customInfoTypes, boolean includeQuote, String projectId) { // Instantiates a client @@ -189,6 +197,7 @@ private static void inspectFile( InspectConfig inspectConfig = InspectConfig.newBuilder() .addAllInfoTypes(infoTypes) + .addAllCustomInfoTypes(customInfoTypes) .setMinLikelihood(minLikelihood) .setLimits(findingLimits) .setIncludeQuote(includeQuote) @@ -242,6 +251,7 @@ private static void 
inspectGcsFile( String fileName, Likelihood minLikelihood, List<InfoType> infoTypes, + List<CustomInfoType> customInfoTypes, int maxFindings, String topicId, String subscriptionId, @@ -266,6 +276,7 @@ private static void inspectGcsFile( InspectConfig inspectConfig = InspectConfig.newBuilder() .addAllInfoTypes(infoTypes) + .addAllCustomInfoTypes(customInfoTypes) .setMinLikelihood(minLikelihood) .setLimits(findingLimits) .build(); @@ -363,6 +374,7 @@ private static void inspectDatastore( String kind, Likelihood minLikelihood, List<InfoType> infoTypes, + List<CustomInfoType> customInfoTypes, int maxFindings, String topicId, String subscriptionId) { @@ -388,6 +400,7 @@ private static void inspectDatastore( InspectConfig inspectConfig = InspectConfig.newBuilder() .addAllInfoTypes(infoTypes) + .addAllCustomInfoTypes(customInfoTypes) .setMinLikelihood(minLikelihood) .setLimits(findingLimits) .build(); @@ -486,6 +499,7 @@ private static void inspectBigquery( String tableId, Likelihood minLikelihood, List<InfoType> infoTypes, + List<CustomInfoType> customInfoTypes, int maxFindings, String topicId, String subscriptionId) { @@ -511,6 +525,7 @@ private static void inspectBigquery( InspectConfig inspectConfig = InspectConfig.newBuilder() .addAllInfoTypes(infoTypes) + .addAllCustomInfoTypes(customInfoTypes) .setMinLikelihood(minLikelihood) .setLimits(findingLimits) .build(); @@ -629,6 +644,16 @@ public static void main(String[] args) throws Exception { infoTypesOption.setArgs(Option.UNLIMITED_VALUES); commandLineOptions.addOption(infoTypesOption); + Option customDictionariesOption = + Option.builder("customDictionaries").hasArg(true).required(false).build(); + customDictionariesOption.setArgs(Option.UNLIMITED_VALUES); + commandLineOptions.addOption(customDictionariesOption); + + Option customRegexesOption = + Option.builder("customRegexes").hasArg(true).required(false).build(); + customRegexesOption.setArgs(Option.UNLIMITED_VALUES); + commandLineOptions.addOption(customRegexesOption); + Option includeQuoteOption = 
Option.builder("includeQuote").hasArg(true).required(false).build(); commandLineOptions.addOption(includeQuoteOption); @@ -695,13 +720,62 @@ public static void main(String[] args) throws Exception { infoTypesList.add(InfoType.newBuilder().setName(infoType).build()); } } + + List<CustomInfoType> customInfoTypesList = new ArrayList<>(); + if (cmd.hasOption(customDictionariesOption.getOpt())) { + String[] dictionaryStrings = cmd.getOptionValues(customDictionariesOption.getOpt()); + for (int i = 0; i < dictionaryStrings.length; i++) { + String[] dictionaryWords = dictionaryStrings[i].split(","); + CustomInfoType customInfoType = + CustomInfoType + .newBuilder() + .setInfoType( + InfoType.newBuilder().setName(String.format("CUSTOM_DICTIONARY_%s", i))) + .setDictionary( + Dictionary + .newBuilder() + .setWordList( + WordList + .newBuilder() + .addAllWords(Arrays.asList(dictionaryWords)))) + .build(); + customInfoTypesList.add(customInfoType); + } + } + if (cmd.hasOption(customRegexesOption.getOpt())) { + String[] patterns = cmd.getOptionValues(customRegexesOption.getOpt()); + for (int i = 0; i < patterns.length; i++) { + CustomInfoType customInfoType = + CustomInfoType + .newBuilder() + .setInfoType(InfoType.newBuilder().setName(String.format("CUSTOM_REGEX_%s", i))) + .setRegex(Regex.newBuilder().setPattern(patterns[i])) + .build(); + customInfoTypesList.add(customInfoType); + } + } + // string inspection if (cmd.hasOption("s")) { String val = cmd.getOptionValue(stringOption.getOpt()); - inspectString(val, minLikelihood, maxFindings, infoTypesList, includeQuote, projectId); + inspectString( + val, + minLikelihood, + maxFindings, + infoTypesList, + customInfoTypesList, + includeQuote, + projectId); } else if (cmd.hasOption("f")) { String filePath = cmd.getOptionValue(fileOption.getOpt()); - inspectFile(filePath, minLikelihood, maxFindings, infoTypesList, includeQuote, projectId); + inspectFile( + filePath, + minLikelihood, + maxFindings, + infoTypesList, + customInfoTypesList, + 
includeQuote, + projectId); // gcs file inspection } else if (cmd.hasOption("gcs")) { String bucketName = cmd.getOptionValue(bucketNameOption.getOpt()); @@ -711,6 +785,7 @@ public static void main(String[] args) throws Exception { fileName, minLikelihood, infoTypesList, + customInfoTypesList, maxFindings, topicId, subscriptionId, @@ -726,6 +801,7 @@ public static void main(String[] args) throws Exception { kind, minLikelihood, infoTypesList, + customInfoTypesList, maxFindings, topicId, subscriptionId); @@ -739,6 +815,7 @@ public static void main(String[] args) throws Exception { tableId, minLikelihood, infoTypesList, + customInfoTypesList, maxFindings, topicId, subscriptionId); diff --git a/dlp/src/test/java/com/example/dlp/DeIdentificationIT.java b/dlp/src/test/java/com/example/dlp/DeIdentificationIT.java index ec796c60bbb..9ec1d44ef6b 100644 --- a/dlp/src/test/java/com/example/dlp/DeIdentificationIT.java +++ b/dlp/src/test/java/com/example/dlp/DeIdentificationIT.java @@ -63,6 +63,7 @@ public void testDeidStringMasksCharacters() throws Exception { DeIdentification.main( new String[] { "-m", text, + "-infoTypes", "US_SOCIAL_SECURITY_NUMBER", "-maskingCharacter", "x", "-numberToMask", "5" }); @@ -79,6 +80,8 @@ public void testDeidReidFpe() throws Exception { new String[] { "-f", "\"" + text + "\"", + "-infoTypes", + "US_SOCIAL_SECURITY_NUMBER", "-wrappedKey", wrappedKey, "-keyName", diff --git a/dlp/src/test/java/com/example/dlp/InspectIT.java b/dlp/src/test/java/com/example/dlp/InspectIT.java index 60f078dcb26..17814cfc395 100644 --- a/dlp/src/test/java/com/example/dlp/InspectIT.java +++ b/dlp/src/test/java/com/example/dlp/InspectIT.java @@ -64,17 +64,52 @@ public void testStringInspectionReturnsInfoTypes() throws Exception { assertThat(output, containsString("EMAIL_ADDRESS")); } + @Test + public void testStringInspectionReturnsCustomInfoTypes() throws Exception { + String text = + "\"My phone number is (234) 456-7890 and my email address is gary@somedomain.com\""; 
+ Inspect.main( + new String[] { + "-s", + text, + "-customDictionaries", + "gary@somedomain.com", + "-customRegexes", + "\\(\\d{3}\\) \\d{3}-\\d{4}" + }); + String output = bout.toString(); + + assertThat(output, containsString("CUSTOM_DICTIONARY_0")); + assertThat(output, containsString("CUSTOM_REGEX_0")); + } + @Test public void testTextFileInspectionReturnsInfoTypes() throws Exception { Inspect.main( new String[] { - "-f", "src/test/resources/test.txt", "-infoTypes", "PHONE_NUMBER", "EMAIL_ADDRESS" + "-f", "src/test/resources/test.txt", "-infoTypes", "PHONE_NUMBER", "EMAIL_ADDRESS" }); String output = bout.toString(); assertThat(output, containsString("PHONE_NUMBER")); assertThat(output, containsString("EMAIL_ADDRESS")); } + @Test + public void testTextFileInspectionReturnsCustomInfoTypes() throws Exception { + Inspect.main( + new String[] { + "-f", + "src/test/resources/test.txt", + "-customDictionaries", + "gary@somedomain.com", + "-customRegexes", + "\\(\\d{3}\\) \\d{3}-\\d{4}" + }); + String output = bout.toString(); + assertThat(output, containsString("CUSTOM_DICTIONARY_0")); + assertThat(output, containsString("CUSTOM_REGEX_0")); + } + @Test public void testImageFileInspectionReturnsInfoTypes() throws Exception { Inspect.main(