From 8bd2f74c942cbfeba221338342f2fa3ebde88e2a Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Sun, 8 Dec 2024 19:48:01 -0800 Subject: [PATCH 01/22] Allow masking without primary keys --- .../dataset/postgres_example_test_dataset.yml | 11 ++++ .../query_configs/bigquery_query_config.py | 12 ++--- .../query_configs/mongodb_query_config.py | 6 +-- .../connectors/query_configs/query_config.py | 37 +++++++++---- .../query_configs/snowflake_query_config.py | 4 +- tests/fixtures/application_fixtures.py | 53 +++++++++++++++++++ .../service/connectors/test_query_config.py | 42 +++++++++++++-- 7 files changed, 141 insertions(+), 24 deletions(-) diff --git a/data/dataset/postgres_example_test_dataset.yml b/data/dataset/postgres_example_test_dataset.yml index d62eb38d46..e8d58f626c 100644 --- a/data/dataset/postgres_example_test_dataset.yml +++ b/data/dataset/postgres_example_test_dataset.yml @@ -7,18 +7,29 @@ dataset: fields: - name: city data_categories: [user.contact.address.city] + fides_meta: + data_type: string - name: house data_categories: [user.contact.address.street] + fides_meta: + data_type: string - name: id data_categories: [system.operations] fides_meta: + data_type: string primary_key: True - name: state data_categories: [user.contact.address.state] + fides_meta: + data_type: string - name: street data_categories: [user.contact.address.street] + fides_meta: + data_type: string - name: zip data_categories: [user.contact.address.postal_code] + fides_meta: + data_type: string - name: customer fields: diff --git a/src/fides/api/service/connectors/query_configs/bigquery_query_config.py b/src/fides/api/service/connectors/query_configs/bigquery_query_config.py index 681e2b9c60..74b28f3ada 100644 --- a/src/fides/api/service/connectors/query_configs/bigquery_query_config.py +++ b/src/fides/api/service/connectors/query_configs/bigquery_query_config.py @@ -140,7 +140,7 @@ def generate_update( return [] table = Table(self._generate_table_name(), MetaData(bind=client), autoload=True) - pk_clauses: List[ColumnElement] = [ + where_clauses: List[ColumnElement] = [ getattr(table.c, k) == v for k, v in non_empty_primary_keys.items() ] @@ -153,13 +153,13 @@ def generate_update( for partition_clause in partition_clauses: partitioned_queries.append( table.update() - .where(*(pk_clauses + [text(partition_clause)])) + .where(*(where_clauses + [text(partition_clause)])) .values(**update_value_map) ) return partitioned_queries - return [table.update().where(*pk_clauses).values(**update_value_map)] + return [table.update().where(*where_clauses).values(**update_value_map)] def generate_delete(self, row: Row, client: Engine) -> List[Delete]: """Returns a List of SQLAlchemy DELETE statements for BigQuery. Does not actually execute the delete statement. @@ -189,7 +189,7 @@ def generate_delete(self, row: Row, client: Engine) -> List[Delete]: return [] table = Table(self._generate_table_name(), MetaData(bind=client), autoload=True) - pk_clauses: List[ColumnElement] = [ + where_clauses: List[ColumnElement] = [ getattr(table.c, k) == v for k, v in non_empty_primary_keys.items() ] @@ -202,9 +202,9 @@ def generate_delete(self, row: Row, client: Engine) -> List[Delete]: for partition_clause in partition_clauses: partitioned_queries.append( - table.delete().where(*(pk_clauses + [text(partition_clause)])) + table.delete().where(*(where_clauses + [text(partition_clause)])) ) return partitioned_queries - return [table.delete().where(*pk_clauses)] + return [table.delete().where(*where_clauses)] diff --git a/src/fides/api/service/connectors/query_configs/mongodb_query_config.py b/src/fides/api/service/connectors/query_configs/mongodb_query_config.py index bd650723f4..1a6aa303f0 100644 --- a/src/fides/api/service/connectors/query_configs/mongodb_query_config.py +++ b/src/fides/api/service/connectors/query_configs/mongodb_query_config.py @@ -69,21 +69,21 @@ def generate_update_stmt( """Generate a SQL update statement in the form of Mongo update statement components""" update_clauses = self.update_value_map(row, policy, request) - pk_clauses: Dict[str, Any] = filter_nonempty_values( + where_clauses: Dict[str, Any] = filter_nonempty_values( { field_path.string_path: field.cast(row[field_path.string_path]) for field_path, field in self.primary_key_field_paths.items() } ) - valid = len(pk_clauses) > 0 and len(update_clauses) > 0 + valid = len(where_clauses) > 0 and len(update_clauses) > 0 if not valid: logger.warning( "There is not enough data to generate a valid update for {}", self.node.address, ) return None - return pk_clauses, {"$set": update_clauses} + return where_clauses, {"$set": update_clauses} def query_to_str(self, t: MongoStatement, input_data: Dict[str, List[Any]]) -> str: """string representation of a query for logging/dry-run""" diff --git a/src/fides/api/service/connectors/query_configs/query_config.py b/src/fides/api/service/connectors/query_configs/query_config.py index 6e868964af..ae62196bc1 100644 --- a/src/fides/api/service/connectors/query_configs/query_config.py +++ b/src/fides/api/service/connectors/query_configs/query_config.py @@ -15,6 +15,7 @@ ROOT_COLLECTION_ADDRESS, CollectionAddress, Field, + FieldAddress, FieldPath, MaskingTruncation, ) @@ -100,6 +101,15 @@ def primary_key_field_paths(self) -> Dict[FieldPath, Field]: if field.primary_key } + @property + def identity_or_reference_field_paths(self) -> Dict[FieldPath, Field]: + """Mapping of FieldPaths to Fields that have identity or dataset references""" + return { + field_path: field + for field_path, field in self.field_map().items() + if field_path in {edge.f2.field_path for edge in self.node.incoming_edges} + } + def query_sources(self) -> Dict[str, List[CollectionAddress]]: """Display the input collection(s) for each query key for display purposes. @@ -412,10 +422,10 @@ def generate_query_without_tuples( # pylint: disable=R0914 def get_update_stmt( self, update_clauses: List[str], - pk_clauses: List[str], + where_clauses: List[str], ) -> str: """Returns a SQL UPDATE statement to fit SQL syntax.""" - return f"UPDATE {self.node.address.collection} SET {', '.join(update_clauses)} WHERE {' AND '.join(pk_clauses)}" + return f"UPDATE {self.node.address.collection} SET {', '.join(update_clauses)} WHERE {' AND '.join(where_clauses)}" @abstractmethod def get_update_clauses( @@ -436,6 +446,7 @@ def generate_update_stmt( ) -> Optional[T]: """Returns an update statement in generic SQL-ish dialect.""" update_value_map: Dict[str, Any] = self.update_value_map(row, policy, request) + non_empty_primary_keys: Dict[str, Field] = filter_nonempty_values( { fpath.string_path: fld.cast(row[fpath.string_path]) @@ -444,17 +455,25 @@ def generate_update_stmt( } ) + non_empty_reference_fields: Dict[str, Field] = filter_nonempty_values( + { + fpath.string_path: fld.cast(row[fpath.string_path]) + for fpath, fld in self.identity_or_reference_field_paths.items() + if fpath.string_path in row + } + ) + update_clauses = self.get_update_clauses( - update_value_map, non_empty_primary_keys + update_value_map, non_empty_reference_fields ) - pk_clauses = self.format_key_map_for_update_stmt( - list(non_empty_primary_keys.keys()) + where_clauses = self.format_key_map_for_update_stmt( + list(non_empty_reference_fields.keys()) ) - for k, v in non_empty_primary_keys.items(): - update_value_map[k] = v + # for k, v in non_empty_reference_fields.items(): + # update_value_map[k] = v - valid = len(pk_clauses) > 0 and len(update_clauses) > 0 + valid = len(where_clauses) > 0 and len(update_clauses) > 0 if not valid: logger.warning( "There is not enough data to generate a valid update statement for {}", @@ -464,7 +483,7 @@ def generate_update_stmt( query_str = self.get_update_stmt( update_clauses, - pk_clauses, + where_clauses, ) logger.info("query = {}, params = {}", Pii(query_str), Pii(update_value_map)) return self.format_query_stmt(query_str, update_value_map) diff --git a/src/fides/api/service/connectors/query_configs/snowflake_query_config.py b/src/fides/api/service/connectors/query_configs/snowflake_query_config.py index 574e1ea1b1..443dd94051 100644 --- a/src/fides/api/service/connectors/query_configs/snowflake_query_config.py +++ b/src/fides/api/service/connectors/query_configs/snowflake_query_config.py @@ -67,7 +67,7 @@ def format_key_map_for_update_stmt(self, fields: List[str]) -> List[str]: def get_update_stmt( self, update_clauses: List[str], - pk_clauses: List[str], + where_clauses: List[str], ) -> str: """Returns a parameterized update statement in Snowflake dialect.""" - return f'UPDATE {self._generate_table_name()} SET {", ".join(update_clauses)} WHERE {" AND ".join(pk_clauses)}' + return f'UPDATE {self._generate_table_name()} SET {", ".join(update_clauses)} WHERE {" AND ".join(where_clauses)}' diff --git a/tests/fixtures/application_fixtures.py b/tests/fixtures/application_fixtures.py index eb28b35657..355985ddd0 100644 --- a/tests/fixtures/application_fixtures.py +++ b/tests/fixtures/application_fixtures.py @@ -864,6 +864,59 @@ def erasure_policy( "rule_id": erasure_rule.id, }, ) + + yield erasure_policy + try: + rule_target.delete(db) + except ObjectDeletedError: + pass + try: + erasure_rule.delete(db) + except ObjectDeletedError: + pass + try: + erasure_policy.delete(db) + except ObjectDeletedError: + pass + + +@pytest.fixture(scope="function") +def erasure_policy_address_city( + db: Session, + oauth_client: ClientDetail, +) -> Generator: + erasure_policy = Policy.create( + db=db, + data={ + "name": "example erasure policy", + "key": "example_erasure_policy", + "client_id": oauth_client.id, + }, + ) + + erasure_rule = Rule.create( + db=db, + data={ + "action_type": ActionType.erasure.value, + "client_id": oauth_client.id, + "name": "Erasure Rule", + "policy_id": erasure_policy.id, + "masking_strategy": { + "strategy": "null_rewrite", + "configuration": {}, + }, + }, + ) + + rule_target = RuleTarget.create( + db=db, + data={ + "client_id": oauth_client.id, + "data_category": DataCategory("user.contact.address.city").value, + "rule_id": erasure_rule.id, + }, + ) + yield erasure_policy try: rule_target.delete(db) diff --git a/tests/ops/service/connectors/test_query_config.py b/tests/ops/service/connectors/test_query_config.py index 01d7b9dbd2..451788ddf7 100644 --- a/tests/ops/service/connectors/test_query_config.py +++ b/tests/ops/service/connectors/test_query_config.py @@ -286,10 +286,41 @@ def test_generate_update_stmt_one_field( "id": 1, } text_clause = config.generate_update_stmt(row, erasure_policy, privacy_request) - assert text_clause.text == """UPDATE customer SET name = :name WHERE id = :id""" + assert ( + text_clause.text + == """UPDATE customer SET name = :name WHERE email = :email""" + ) assert text_clause._bindparams["name"].key == "name" assert text_clause._bindparams["name"].value is None # Null masking strategy + def test_generate_update_stmt_one_field_inbound_reference( + self, erasure_policy_address_city, example_datasets, connection_config + ): + dataset = Dataset(**example_datasets[0]) + graph = convert_dataset_to_graph(dataset, connection_config.key) + dataset_graph = DatasetGraph(*[graph]) + traversal = Traversal(dataset_graph, {"email": "customer-1@example.com"}) + + address_node = traversal.traversal_node_dict[ + CollectionAddress("postgres_example_test_dataset", "address") + ].to_mock_execution_node() + + config = SQLQueryConfig(address_node) + row = { + "id": 1, + "house": "123", + "street": "Main St", + "city": "San Francisco", + "state": "CA", + "zip": "94105", + } + text_clause = config.generate_update_stmt( + row, erasure_policy_address_city, privacy_request + ) + assert text_clause.text == """UPDATE address SET city = :city WHERE id = :id""" + assert text_clause._bindparams["city"].key == "city" + assert text_clause._bindparams["city"].value is None # Null masking strategy + def test_generate_update_stmt_length_truncation( self, erasure_policy_string_rewrite_long, @@ -316,7 +347,10 @@ def test_generate_update_stmt_length_truncation( text_clause = config.generate_update_stmt( row, erasure_policy_string_rewrite_long, privacy_request ) - assert text_clause.text == """UPDATE customer SET name = :name WHERE id = :id""" + assert ( + text_clause.text + == """UPDATE customer SET name = :name WHERE email = :email""" + ) assert text_clause._bindparams["name"].key == "name" # length truncation on name field assert ( @@ -365,7 +399,7 @@ def test_generate_update_stmt_multiple_fields_same_rule( text_clause = config.generate_update_stmt(row, erasure_policy, privacy_request) assert ( text_clause.text - == "UPDATE customer SET email = :email, name = :name WHERE id = :id" + == "UPDATE customer SET email = :email, name = :name WHERE email = :email" ) assert text_clause._bindparams["name"].key == "name" # since length is set to 40 in dataset.yml, we expect only first 40 chars of masked val @@ -409,7 +443,7 @@ def test_generate_update_stmts_from_multiple_rules( assert ( text_clause.text - == "UPDATE customer SET email = :email, name = :name WHERE id = :id" + == "UPDATE customer SET email = :email, name = :name WHERE email = :email" ) # Two different masking strategies used for name and email assert text_clause._bindparams["name"].value is None # Null masking strategy From cadcdb7ba7a59b2ed41da3f0a1eef4253fc4de46 Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Sun, 8 Dec 2024 22:06:43 -0800 Subject: [PATCH 02/22] Updating tests --- .../dataset/postgres_example_test_dataset.yml | 2 +- src/fides/api/models/connectionconfig.py | 7 +++++-- .../connectors/query_configs/query_config.py | 5 ++--- src/fides/api/task/graph_task.py | 20 ------------------- tests/ops/integration_tests/test_sql_task.py | 17 +++++----------- 5 files changed, 13 insertions(+), 38 deletions(-) diff --git a/data/dataset/postgres_example_test_dataset.yml b/data/dataset/postgres_example_test_dataset.yml index e8d58f626c..fddbfbb391 100644 --- a/data/dataset/postgres_example_test_dataset.yml +++ b/data/dataset/postgres_example_test_dataset.yml @@ -16,7 +16,7 @@ dataset: - name: id data_categories: [system.operations] fides_meta: - data_type: string + data_type: integer primary_key: True - name: state data_categories: [user.contact.address.state] diff --git a/src/fides/api/models/connectionconfig.py b/src/fides/api/models/connectionconfig.py index 1758222b88..2dc518cd44 100644 --- a/src/fides/api/models/connectionconfig.py +++ b/src/fides/api/models/connectionconfig.py @@ -220,10 +220,13 @@ def authorized(self) -> bool: return False # hard-coding to avoid cyclic dependency - if authentication.strategy not in ["oauth2_authorization_code", "oauth2_client_credentials"]: + if authentication.strategy not in [ + "oauth2_authorization_code", + "oauth2_client_credentials", + ]: return False - return bool(self.secrets and 'access_token' in self.secrets.keys()) + return bool(self.secrets and "access_token" in self.secrets.keys()) @property def name_or_key(self) -> str: diff --git a/src/fides/api/service/connectors/query_configs/query_config.py b/src/fides/api/service/connectors/query_configs/query_config.py index ae62196bc1..aa00189ac4 100644 --- a/src/fides/api/service/connectors/query_configs/query_config.py +++ b/src/fides/api/service/connectors/query_configs/query_config.py @@ -15,7 +15,6 @@ ROOT_COLLECTION_ADDRESS, CollectionAddress, Field, - FieldAddress, FieldPath, MaskingTruncation, ) @@ -470,8 +469,8 @@ def generate_update_stmt( list(non_empty_reference_fields.keys()) ) - # for k, v in non_empty_reference_fields.items(): - # update_value_map[k] = v + for k, v in non_empty_reference_fields.items(): + update_value_map[k] = v valid = len(where_clauses) > 0 and len(update_clauses) > 0 if not valid: diff --git a/src/fides/api/task/graph_task.py b/src/fides/api/task/graph_task.py index 6b78b57297..85576264f0 100644 --- a/src/fides/api/task/graph_task.py +++ b/src/fides/api/task/graph_task.py @@ -603,26 +603,6 @@ def erasure_request( *erasure_prereqs: int, # TODO Remove when we stop support for DSR 2.0. DSR 3.0 enforces with downstream_tasks. ) -> int: """Run erasure request""" - # if there is no primary key specified in the graph node configuration - # note this in the execution log and perform no erasures on this node - if not self.execution_node.collection.contains_field(lambda f: f.primary_key): - logger.warning( - "No erasures on {} as there is no primary_key defined.", - self.execution_node.address, - ) - if self.request_task.id: - # For DSR 3.0, largely for testing. DSR 3.0 uses Request Task status - # instead of presence of cached erasure data to know if we should rerun a node - self.request_task.rows_masked = 0 # Saved as part of update_status - # TODO Remove when we stop support for DSR 2.0 - self.resources.cache_erasure(self.key.value, 0) - self.update_status( - "No values were erased since no primary key was defined for this collection", - None, - ActionType.erasure, - ExecutionLogStatus.complete, - ) - return 0 if not self.can_write_data(): logger.warning( diff --git a/tests/ops/integration_tests/test_sql_task.py b/tests/ops/integration_tests/test_sql_task.py index 298d77229a..b349040988 100644 --- a/tests/ops/integration_tests/test_sql_task.py +++ b/tests/ops/integration_tests/test_sql_task.py @@ -8,13 +8,7 @@ from sqlalchemy import text from sqlalchemy.orm import Session -from fides.api.graph.config import ( - Collection, - CollectionAddress, - FieldAddress, - GraphDataset, - ScalarField, -) +from fides.api.graph.config import Collection, FieldAddress, GraphDataset, ScalarField from fides.api.graph.data_type import DataType, StringTypeConverter from fides.api.graph.graph import DatasetGraph, Edge, Node from fides.api.graph.traversal import TraversalNode @@ -25,7 +19,6 @@ ExecutionLog, ExecutionLogStatus, PrivacyRequest, - PrivacyRequestStatus, RequestTask, ) from fides.api.service.connectors import get_connector @@ -57,7 +50,7 @@ "dsr_version", ["use_dsr_3_0", "use_dsr_2_0"], ) -async def test_sql_erasure_ignores_collections_without_pk( +async def test_sql_erasure_does_not_ignore_collections_without_pk( db, postgres_inserts, integration_postgres_config, @@ -116,7 +109,7 @@ async def test_sql_erasure_ignores_collections_without_pk( .all() ) logs = [log.__dict__ for log in logs] - # since address has no primary_key=True field, it's erasure is skipped + # erasure is not skipped since primary_key is not required assert ( len( records_matching_fields( @@ -126,13 +119,13 @@ async def test_sql_erasure_ignores_collections_without_pk( message="No values were erased since no primary key was defined for this collection", ) ) - == 1 + == 0 ) assert v == { "postgres_example:customer": 1, "postgres_example:payment_card": 0, "postgres_example:orders": 0, - "postgres_example:address": 0, + "postgres_example:address": 2, } From de3ce24df7f420713867f3fadc0e831c29e585f0 Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Mon, 9 Dec 2024 00:27:14 -0800 Subject: [PATCH 03/22] Separating overlapping keys in update value map --- .../connectors/query_configs/query_config.py | 47 ++++++++++--------- .../query_configs/snowflake_query_config.py | 5 +- .../service/connectors/scylla_query_config.py | 13 +++-- .../v1/endpoints/test_dataset_endpoints.py | 16 +++---- .../service/connectors/test_query_config.py | 5 +- 5 files changed, 43 insertions(+), 43 deletions(-) diff --git a/src/fides/api/service/connectors/query_configs/query_config.py b/src/fides/api/service/connectors/query_configs/query_config.py index aa00189ac4..3eed947dff 100644 --- a/src/fides/api/service/connectors/query_configs/query_config.py +++ b/src/fides/api/service/connectors/query_configs/query_config.py @@ -437,7 +437,7 @@ def format_query_stmt(self, query_str: str, update_value_map: Dict[str, Any]) -> """Returns a formatted update statement in the appropriate dialect.""" @abstractmethod - def format_key_map_for_update_stmt(self, fields: List[str]) -> List[str]: + def format_key_map_for_update_stmt(self, param_map: Dict[str, Any]) -> List[str]: """Adds the appropriate formatting for update statements in this datastore.""" def generate_update_stmt( @@ -445,15 +445,6 @@ def generate_update_stmt( ) -> Optional[T]: """Returns an update statement in generic SQL-ish dialect.""" update_value_map: Dict[str, Any] = self.update_value_map(row, policy, request) - - non_empty_primary_keys: Dict[str, Field] = filter_nonempty_values( - { - fpath.string_path: fld.cast(row[fpath.string_path]) - for fpath, fld in self.primary_key_field_paths.items() - if fpath.string_path in row - } - ) - non_empty_reference_fields: Dict[str, Field] = filter_nonempty_values( { fpath.string_path: fld.cast(row[fpath.string_path]) @@ -462,11 +453,27 @@ def generate_update_stmt( } ) + # Identify overlapping fields and create parameter mappings + overlapping_keys = set(update_value_map.keys()) & set( + non_empty_reference_fields.keys() + ) + param_map = { + **{k: v for k, v in update_value_map.items()}, # SET values + **{ + f"where_{k}" if k in overlapping_keys else k: v + for k, v in non_empty_reference_fields.items() + }, # WHERE values + } + + # Generate SQL clauses using parameter names update_clauses = self.get_update_clauses( - update_value_map, non_empty_reference_fields + {k: k for k in update_value_map}, non_empty_reference_fields ) where_clauses = self.format_key_map_for_update_stmt( - list(non_empty_reference_fields.keys()) + { + k: f"where_{k}" if k in overlapping_keys else k + for k in non_empty_reference_fields + } ) for k, v in non_empty_reference_fields.items(): @@ -480,12 +487,9 @@ def generate_update_stmt( ) return None - query_str = self.get_update_stmt( - update_clauses, - where_clauses, - ) - logger.info("query = {}, params = {}", Pii(query_str), Pii(update_value_map)) - return self.format_query_stmt(query_str, update_value_map) + query_str = self.get_update_stmt(update_clauses, where_clauses) + logger.info("query = {}, params = {}", Pii(query_str), Pii(param_map)) + return self.format_query_stmt(query_str, param_map) class SQLQueryConfig(SQLLikeQueryConfig[Executable]): @@ -556,16 +560,15 @@ def generate_query( ) return None - def format_key_map_for_update_stmt(self, fields: List[str]) -> List[str]: + def format_key_map_for_update_stmt(self, param_map: Dict[str, Any]) -> List[str]: """Adds the appropriate formatting for update statements in this datastore.""" - fields.sort() - return [f"{k} = :{k}" for k in fields] + return [f"{k} = :{v}" for k, v in param_map.items()] def get_update_clauses( self, update_value_map: Dict[str, Any], non_empty_primary_keys: Dict[str, Field] ) -> List[str]: """Returns a list of update clauses for the update statement.""" - return self.format_key_map_for_update_stmt(list(update_value_map.keys())) + return self.format_key_map_for_update_stmt(update_value_map) def format_query_stmt( self, query_str: str, update_value_map: Dict[str, Any] diff --git a/src/fides/api/service/connectors/query_configs/snowflake_query_config.py b/src/fides/api/service/connectors/query_configs/snowflake_query_config.py index 443dd94051..279e601141 100644 --- a/src/fides/api/service/connectors/query_configs/snowflake_query_config.py +++ b/src/fides/api/service/connectors/query_configs/snowflake_query_config.py @@ -59,10 +59,9 @@ def get_formatted_query_string( """Returns a query string with double quotation mark formatting as required by Snowflake syntax.""" return f'SELECT {field_list} FROM {self._generate_table_name()} WHERE ({" OR ".join(clauses)})' - def format_key_map_for_update_stmt(self, fields: List[str]) -> List[str]: + def format_key_map_for_update_stmt(self, param_map: Dict[str, Any]) -> List[str]: """Adds the appropriate formatting for update statements in this datastore.""" - fields.sort() - return [f'"{k}" = :{k}' for k in fields] + return [f'"{k}" = :{v}' for k, v in param_map] def get_update_stmt( self, diff --git a/src/fides/api/service/connectors/scylla_query_config.py b/src/fides/api/service/connectors/scylla_query_config.py index 2a72270a40..ce8f60335c 100644 --- a/src/fides/api/service/connectors/scylla_query_config.py +++ b/src/fides/api/service/connectors/scylla_query_config.py @@ -70,21 +70,20 @@ def generate_query( ) -> Optional[ScyllaDBStatement]: return self.generate_query_without_tuples(input_data, policy) - def format_key_map_for_update_stmt(self, fields: List[str]) -> List[str]: + def format_key_map_for_update_stmt(self, param_map: Dict[str, Any]) -> List[str]: """Adds the appropriate formatting for update statements in this datastore.""" - fields.sort() - return [f"{k} = %({k})s" for k in fields] + return [f"{k} = %({v})s" for k, v in param_map.items()] def get_update_clauses( self, update_value_map: Dict[str, Any], non_empty_primary_keys: Dict[str, Field] ) -> List[str]: """Returns a list of update clauses for the update statement.""" return self.format_key_map_for_update_stmt( - [ - key - for key in update_value_map.keys() + { + key: value + for key, value in update_value_map.keys() if key not in non_empty_primary_keys - ] + } ) def format_query_data_name(self, query_data_name: str) -> str: diff --git a/tests/ops/api/v1/endpoints/test_dataset_endpoints.py b/tests/ops/api/v1/endpoints/test_dataset_endpoints.py index f744d59e47..1ae9ac28f8 100644 --- a/tests/ops/api/v1/endpoints/test_dataset_endpoints.py +++ b/tests/ops/api/v1/endpoints/test_dataset_endpoints.py @@ -232,9 +232,7 @@ def test_put_validate_dataset_invalid_length( invalid_dataset = example_datasets[0] # string is properly read: - invalid_dataset["collections"][0]["fields"][0]["fidesops_meta"] = { - "length": 123 - } + invalid_dataset["collections"][0]["fields"][0]["fides_meta"] = {"length": 123} response = api_client.put( validate_dataset_url, headers=auth_header, json=invalid_dataset ) @@ -247,7 +245,7 @@ def test_put_validate_dataset_invalid_length( ) # fails with an invalid value - invalid_dataset["collections"][0]["fields"][0]["fidesops_meta"] = {"length": -1} + invalid_dataset["collections"][0]["fields"][0]["fides_meta"] = {"length": -1} response = api_client.put( validate_dataset_url, headers=auth_header, json=invalid_dataset ) @@ -269,7 +267,7 @@ def test_put_validate_dataset_invalid_data_type( invalid_dataset = example_datasets[0] # string is properly read: - invalid_dataset["collections"][0]["fields"][0]["fidesops_meta"] = { + invalid_dataset["collections"][0]["fields"][0]["fides_meta"] = { "data_type": "string" } response = api_client.put( @@ -284,7 +282,7 @@ def test_put_validate_dataset_invalid_data_type( ) # fails with an invalid value - invalid_dataset["collections"][0]["fields"][0]["fidesops_meta"] = { + invalid_dataset["collections"][0]["fields"][0]["fides_meta"] = { "data_type": "stringsssssss" } @@ -298,7 +296,7 @@ def test_put_validate_dataset_invalid_data_type( == "Value error, The data type stringsssssss is not supported." ) - def test_put_validate_dataset_invalid_fidesops_meta( + def test_put_validate_dataset_invalid_fides_meta( self, example_datasets: List, validate_dataset_url, @@ -307,8 +305,8 @@ def test_put_validate_dataset_invalid_fidesops_meta( ) -> None: auth_header = generate_auth_header(scopes=[DATASET_READ]) invalid_dataset = example_datasets[0] - # Add an invalid fidesops_meta annotation to ensure our type-checking is comprehensive - invalid_dataset["collections"][0]["fields"][0]["fidesops_meta"] = { + # Add an invalid fides_meta annotation to ensure our type-checking is comprehensive + invalid_dataset["collections"][0]["fields"][0]["fides_meta"] = { "references": [ { "dataset": "postgres_example_test_dataset", diff --git a/tests/ops/service/connectors/test_query_config.py b/tests/ops/service/connectors/test_query_config.py index 451788ddf7..991c945081 100644 --- a/tests/ops/service/connectors/test_query_config.py +++ b/tests/ops/service/connectors/test_query_config.py @@ -399,7 +399,7 @@ def test_generate_update_stmt_multiple_fields_same_rule( text_clause = config.generate_update_stmt(row, erasure_policy, privacy_request) assert ( text_clause.text - == "UPDATE customer SET email = :email, name = :name WHERE email = :email" + == "UPDATE customer SET email = :email, name = :name WHERE email = :where_email" ) assert text_clause._bindparams["name"].key == "name" # since length is set to 40 in dataset.yml, we expect only first 40 chars of masked val @@ -415,6 +415,7 @@ def test_generate_update_stmt_multiple_fields_same_rule( ["customer-1@example.com"], request_id=privacy_request.id )[0] ) + assert text_clause._bindparams["where_email"].value == "customer-1@example.com" clear_cache_secrets(privacy_request.id) def test_generate_update_stmts_from_multiple_rules( @@ -443,7 +444,7 @@ def test_generate_update_stmts_from_multiple_rules( assert ( text_clause.text - == "UPDATE customer SET email = :email, name = :name WHERE email = :email" + == "UPDATE customer SET name = :name, email = :email WHERE email = :where_email" ) # Two different masking strategies used for name and email assert text_clause._bindparams["name"].value is None # Null masking strategy From 93974b88d970f68b140a47e6f819cd89a99d9be4 Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Mon, 9 Dec 2024 09:40:57 -0800 Subject: [PATCH 04/22] Fixing data type --- data/dataset/postgres_example_test_dataset.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/dataset/postgres_example_test_dataset.yml b/data/dataset/postgres_example_test_dataset.yml index fddbfbb391..a6b7080ec2 100644 --- a/data/dataset/postgres_example_test_dataset.yml +++ b/data/dataset/postgres_example_test_dataset.yml @@ -12,7 +12,7 @@ dataset: - name: house data_categories: [user.contact.address.street] fides_meta: - data_type: string + data_type: integer - name: id data_categories: [system.operations] fides_meta: From f274ae05e7d6e2ead07efc70be9233268e9957ed Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Mon, 9 Dec 2024 11:20:34 -0800 Subject: [PATCH 05/22] Sorting update map keys --- src/fides/api/service/connectors/query_configs/query_config.py | 2 +- .../service/connectors/query_configs/snowflake_query_config.py | 2 +- src/fides/api/service/connectors/scylla_query_config.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/fides/api/service/connectors/query_configs/query_config.py b/src/fides/api/service/connectors/query_configs/query_config.py index 3eed947dff..4a29e0ace5 100644 --- a/src/fides/api/service/connectors/query_configs/query_config.py +++ b/src/fides/api/service/connectors/query_configs/query_config.py @@ -562,7 +562,7 @@ def generate_query( def format_key_map_for_update_stmt(self, param_map: Dict[str, Any]) -> List[str]: """Adds the appropriate formatting for update statements in this datastore.""" - return [f"{k} = :{v}" for k, v in param_map.items()] + return [f"{k} = :{v}" for k, v in sorted(param_map.items())] def get_update_clauses( self, update_value_map: Dict[str, Any], non_empty_primary_keys: Dict[str, Field] diff --git a/src/fides/api/service/connectors/query_configs/snowflake_query_config.py b/src/fides/api/service/connectors/query_configs/snowflake_query_config.py index 279e601141..ec640191d8 100644 --- a/src/fides/api/service/connectors/query_configs/snowflake_query_config.py +++ b/src/fides/api/service/connectors/query_configs/snowflake_query_config.py @@ -61,7 +61,7 @@ def get_formatted_query_string( def format_key_map_for_update_stmt(self, param_map: Dict[str, Any]) -> List[str]: """Adds the appropriate formatting for update statements in this datastore.""" - return [f'"{k}" = :{v}' for k, v in param_map] + return [f'"{k}" = :{v}' for k, v in sorted(param_map.items())] def get_update_stmt( self, diff --git a/src/fides/api/service/connectors/scylla_query_config.py b/src/fides/api/service/connectors/scylla_query_config.py index ce8f60335c..dc619a72c7 100644 --- a/src/fides/api/service/connectors/scylla_query_config.py +++ b/src/fides/api/service/connectors/scylla_query_config.py @@ -72,7 +72,7 @@ def generate_query( def format_key_map_for_update_stmt(self, param_map: Dict[str, Any]) -> List[str]: """Adds the appropriate formatting for update statements in this datastore.""" - return [f"{k} = %({v})s" for k, v in param_map.items()] + return [f"{k} = %({v})s" for k, v in sorted(param_map.items())] def get_update_clauses( self, update_value_map: Dict[str, Any], non_empty_primary_keys: Dict[str, Field] From 357b6ec0625b1facdbbd447bb83c5865a8337112 Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Mon, 9 Dec 2024 11:54:51 -0800 Subject: [PATCH 06/22] Removing primary keys from sample and test datasets --- .../bigquery_enterprise_test_dataset.yml | 8 +++---- .../dataset/bigquery_example_test_dataset.yml | 18 --------------- .../dataset/dynamodb_example_test_dataset.yml | 8 ------- data/dataset/email_dataset.yml | 6 ----- ...le_field_masking_override_test_dataset.yml | 18 --------------- data/dataset/example_test_dataset.invalid | 18 --------------- data/dataset/example_test_datasets.yml | 16 -------------- ...e_cloud_sql_mysql_example_test_dataset.yml | 8 ------- ...loud_sql_postgres_example_test_dataset.yml | 8 ------- data/dataset/manual_dataset.yml | 4 ---- data/dataset/mariadb_example_test_dataset.yml | 8 ------- data/dataset/mongo_example_test_dataset.yml | 22 ------------------- data/dataset/mssql_example_test_dataset.yml | 8 ------- data/dataset/mysql_example_test_dataset.yml | 9 -------- ...s_example_custom_request_field_dataset.yml | 1 - ...alid_masking_strategy_override_dataset.yml | 4 ---- .../dataset/postgres_example_test_dataset.yml | 17 -------------- .../dataset/redshift_example_test_dataset.yml | 18 --------------- .../dataset/scylladb_example_test_dataset.yml | 5 ----- .../snowflake_example_test_dataset.yml | 18 --------------- .../dataset/timebase_example_test_dataset.yml | 18 --------------- data/saas/dataset/hubspot_dataset.yml | 3 --- data/saas/dataset/mailchimp_dataset.yml | 2 -- data/saas/dataset/stripe_dataset.yml | 16 -------------- .../mongo_example_test_dataset.yml | 22 ------------------- ...s_example_custom_request_field_dataset.yml | 1 - .../postgres_example_test_dataset.yml | 18 --------------- .../test_data/mailchimp_override_dataset.yml | 2 -- .../saas/test_data/saas_async_dataset.yml | 2 -- ..._custom_privacy_request_fields_dataset.yml | 1 - .../test_data/saas_erasure_order_dataset.yml | 5 ----- .../saas/test_data/saas_example_dataset.yml | 10 --------- tests/ops/generator/test_data_generator.py | 4 ---- .../example_datasets/multiple_identities.yml | 2 -- ...le_identities_with_external_dependency.yml | 2 -- .../example_datasets/no_identities.yml | 2 -- .../example_datasets/single_identity.yml | 2 -- ...ngle_identity_with_internal_dependency.yml | 2 -- tests/ops/util/test_dataset_yaml.py | 3 --- 39 files changed, 4 insertions(+), 335 deletions(-) diff --git a/data/dataset/bigquery_enterprise_test_dataset.yml b/data/dataset/bigquery_enterprise_test_dataset.yml index 59d27e68a2..52b20e7d03 100644 --- a/data/dataset/bigquery_enterprise_test_dataset.yml +++ b/data/dataset/bigquery_enterprise_test_dataset.yml @@ -30,7 +30,7 @@ dataset: fides_meta: references: null identity: null - primary_key: true + primary_key: null data_type: integer length: null return_all_elements: null @@ -102,7 +102,7 @@ dataset: fides_meta: references: null identity: null - primary_key: true + primary_key: null data_type: integer length: null return_all_elements: null @@ -204,7 +204,7 @@ dataset: fides_meta: references: null identity: null - primary_key: true + primary_key: null data_type: integer length: null return_all_elements: null @@ -347,7 +347,7 @@ dataset: fides_meta: references: null identity: stackoverflow_user_id - primary_key: true + primary_key: null data_type: integer length: null return_all_elements: null diff --git a/data/dataset/bigquery_example_test_dataset.yml b/data/dataset/bigquery_example_test_dataset.yml index 11fdac1aba..c4ea16cb44 100644 --- a/data/dataset/bigquery_example_test_dataset.yml +++ b/data/dataset/bigquery_example_test_dataset.yml @@ -13,8 +13,6 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -53,8 +51,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -80,8 +76,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -98,8 +92,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: time data_categories: [user.sensor] @@ -114,8 +106,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: @@ -166,8 +156,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: name data_categories: [user.financial] - name: preferred @@ -177,8 +165,6 @@ dataset: fields: - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: name data_categories: [system.operations] - name: price @@ -193,8 +179,6 @@ dataset: data_type: string - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: month data_categories: [system.operations] - name: name @@ -227,8 +211,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: opened data_categories: [system.operations] diff --git a/data/dataset/dynamodb_example_test_dataset.yml b/data/dataset/dynamodb_example_test_dataset.yml index d9ecbb8d1f..4aa3f8b2bf 100644 --- a/data/dataset/dynamodb_example_test_dataset.yml +++ b/data/dataset/dynamodb_example_test_dataset.yml @@ -19,8 +19,6 @@ dataset: data_categories: [system.operations] - name: email data_categories: [user.contact.email] - fides_meta: - primary_key: True identity: email data_type: string - name: name @@ -33,8 +31,6 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -59,16 +55,12 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] - name: login fields: - name: customer_id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: login_date data_categories: [system.operations] - name: name diff --git a/data/dataset/email_dataset.yml b/data/dataset/email_dataset.yml index c829e8a4ea..64b49f71a8 100644 --- a/data/dataset/email_dataset.yml +++ b/data/dataset/email_dataset.yml @@ -7,8 +7,6 @@ dataset: fields: - name: id data_categories: [system.operations] - fides_meta: - primary_key: true - name: customer_id data_categories: [user] fides_meta: @@ -22,8 +20,6 @@ dataset: fields: - name: id data_categories: [system.operations] - fides_meta: - primary_key: true - name: first_name data_categories: [user.childrens] - name: last_name @@ -54,8 +50,6 @@ dataset: fields: - name: id data_categories: [ system.operations ] - fides_meta: - primary_key: true - name: payer_email data_categories: [ user.contact.email ] fides_meta: diff --git a/data/dataset/example_field_masking_override_test_dataset.yml b/data/dataset/example_field_masking_override_test_dataset.yml index 24bdf84555..74e29ca84e 100644 --- a/data/dataset/example_field_masking_override_test_dataset.yml +++ b/data/dataset/example_field_masking_override_test_dataset.yml @@ -11,8 +11,6 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -38,8 +36,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -90,8 +86,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -108,8 +102,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: time data_categories: [user.sensor] @@ -124,8 +116,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: @@ -176,8 +166,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: name data_categories: [user.financial] - name: preferred @@ -187,8 +175,6 @@ dataset: fields: - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: name data_categories: [system.operations] - name: price @@ -203,8 +189,6 @@ dataset: data_type: string - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: month data_categories: [system.operations] - name: name @@ -237,8 +221,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: opened data_categories: [system.operations] - name: visit diff --git a/data/dataset/example_test_dataset.invalid b/data/dataset/example_test_dataset.invalid index 46e5235876..a3bfe261ff 100644 --- a/data/dataset/example_test_dataset.invalid +++ b/data/dataset/example_test_dataset.invalid @@ -11,8 +11,6 @@ dataset: data_categories: [user.contact.address.street] * name: id data_categories: [system.operations] - fides_meta: - primary_key: True * name: state data_categories: [user.contact.address.state] * name: street @@ -38,8 +36,6 @@ dataset: data_type: string * name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True * name: name data_categories: [user.name] fides_meta: @@ -62,8 +58,6 @@ dataset: data_type: string * name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True * name: name data_categories: [user.name] fides_meta: @@ -80,8 +74,6 @@ dataset: direction: from * name: id data_categories: [system.operations] - fides_meta: - primary_key: True * name: time data_categories: [user.sensor] @@ -96,8 +88,6 @@ dataset: direction: from * name: id data_categories: [system.operations] - fides_meta: - primary_key: True * name: shipping_address_id data_categories: [system.operations] fides_meta: @@ -148,8 +138,6 @@ dataset: direction: from * name: id data_categories: [system.operations] - fides_meta: - primary_key: True * name: name data_categories: [user.financial] * name: preferred @@ -159,8 +147,6 @@ dataset: fields: * name: id data_categories: [system.operations] - fides_meta: - primary_key: True * name: name data_categories: [system.operations] * name: price @@ -175,8 +161,6 @@ dataset: data_type: string * name: id data_categories: [system.operations] - fides_meta: - primary_key: True * name: month data_categories: [system.operations] * name: name @@ -209,8 +193,6 @@ dataset: direction: from * name: id data_categories: [system.operations] - fides_meta: - primary_key: True * name: opened data_categories: [system.operations] diff --git a/data/dataset/example_test_datasets.yml b/data/dataset/example_test_datasets.yml index 898d61bc71..e64e9fb1e8 100644 --- a/data/dataset/example_test_datasets.yml +++ b/data/dataset/example_test_datasets.yml @@ -11,8 +11,6 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -38,8 +36,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] @@ -59,8 +55,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] @@ -89,8 +83,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: @@ -220,8 +212,6 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -247,8 +237,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] @@ -268,8 +256,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] @@ -298,8 +284,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: diff --git a/data/dataset/google_cloud_sql_mysql_example_test_dataset.yml b/data/dataset/google_cloud_sql_mysql_example_test_dataset.yml index 7f090e0487..86b6ad2171 100644 --- a/data/dataset/google_cloud_sql_mysql_example_test_dataset.yml +++ b/data/dataset/google_cloud_sql_mysql_example_test_dataset.yml @@ -11,8 +11,6 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -38,8 +36,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] @@ -59,8 +55,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] @@ -89,8 +83,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: diff --git a/data/dataset/google_cloud_sql_postgres_example_test_dataset.yml b/data/dataset/google_cloud_sql_postgres_example_test_dataset.yml index 47989b4201..833361a300 100644 --- a/data/dataset/google_cloud_sql_postgres_example_test_dataset.yml +++ b/data/dataset/google_cloud_sql_postgres_example_test_dataset.yml @@ -11,8 +11,6 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -38,8 +36,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] @@ -59,8 +55,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] @@ -89,8 +83,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: diff --git a/data/dataset/manual_dataset.yml b/data/dataset/manual_dataset.yml index 66f5e4a0da..26d6acbe48 100644 --- a/data/dataset/manual_dataset.yml +++ b/data/dataset/manual_dataset.yml @@ -7,8 +7,6 @@ dataset: fields: - name: id data_categories: [system.operations] - fides_meta: - primary_key: true - name: authorized_user data_categories: [user] fides_meta: @@ -31,8 +29,6 @@ dataset: fields: - name: box_id data_categories: [user] - fides_meta: - primary_key: true - name: email data_categories: [user.contact.email] fides_meta: diff --git a/data/dataset/mariadb_example_test_dataset.yml b/data/dataset/mariadb_example_test_dataset.yml index 5e3c90f08f..204ad8a56d 100644 --- a/data/dataset/mariadb_example_test_dataset.yml +++ b/data/dataset/mariadb_example_test_dataset.yml @@ -11,8 +11,6 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -38,8 +36,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] @@ -59,8 +55,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] @@ -89,8 +83,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: diff --git a/data/dataset/mongo_example_test_dataset.yml b/data/dataset/mongo_example_test_dataset.yml index 0205f33049..4392c00bfc 100644 --- a/data/dataset/mongo_example_test_dataset.yml +++ b/data/dataset/mongo_example_test_dataset.yml @@ -7,8 +7,6 @@ dataset: fields: - name: _id data_categories: [system.operations] - fides_meta: - primary_key: True - name: customer_id data_categories: [user.unique_id] fides_meta: @@ -81,8 +79,6 @@ dataset: fields: - name: _id data_categories: [system.operations] - fides_meta: - primary_key: True data_type: object_id - name: customer_identifiers fields: @@ -112,8 +108,6 @@ dataset: fields: - name: _id data_categories: [system.operations] - fides_meta: - primary_key: True data_type: object_id - name: customer_information fields: @@ -145,8 +139,6 @@ dataset: fields: - name: _id data_categories: [system.operations] - fides_meta: - primary_key: True data_type: object_id - name: passenger_information fields: @@ -175,8 +167,6 @@ dataset: fields: - name: _id data_categories: [system.operations] - fides_meta: - primary_key: True data_type: object_id - name: thread fides_meta: @@ -200,8 +190,6 @@ dataset: fields: - name: _id data_categories: [system.operations] - fides_meta: - primary_key: True data_type: object_id - name: email data_categories: [user.contact.email] @@ -210,8 +198,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True references: - dataset: mongo_test field: flights.pilots @@ -224,8 +210,6 @@ dataset: fields: - name: _id data_categories: [system.operations] - fides_meta: - primary_key: True data_type: object_id - name: planes data_categories: [system.operations] @@ -243,8 +227,6 @@ dataset: fields: - name: _id data_categories: [system.operations] - fides_meta: - primary_key: True data_type: object_id - name: billing_address_id data_categories: [system.operations] @@ -261,8 +243,6 @@ dataset: data_categories: [user.unique_id] - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: name data_categories: [user.financial] - name: preferred @@ -270,8 +250,6 @@ dataset: - name: rewards fields: - name: _id - fides_meta: - primary_key: True data_type: object_id - name: owner fides_meta: diff --git a/data/dataset/mssql_example_test_dataset.yml b/data/dataset/mssql_example_test_dataset.yml index 661c600727..d58cf013d3 100644 --- a/data/dataset/mssql_example_test_dataset.yml +++ b/data/dataset/mssql_example_test_dataset.yml @@ -11,8 +11,6 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -38,8 +36,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] @@ -59,8 +55,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] @@ -89,8 +83,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: diff --git a/data/dataset/mysql_example_test_dataset.yml b/data/dataset/mysql_example_test_dataset.yml index f311ebf2c7..7d2b16541b 100644 --- a/data/dataset/mysql_example_test_dataset.yml +++ b/data/dataset/mysql_example_test_dataset.yml @@ -11,8 +11,6 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -38,8 +36,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] @@ -59,8 +55,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] @@ -89,8 +83,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: @@ -218,4 +210,3 @@ dataset: data_type: string - name: updated_at data_categories: [system.operations] - diff --git a/data/dataset/postgres_example_custom_request_field_dataset.yml b/data/dataset/postgres_example_custom_request_field_dataset.yml index 96b58645d4..0a878fad87 100644 --- a/data/dataset/postgres_example_custom_request_field_dataset.yml +++ b/data/dataset/postgres_example_custom_request_field_dataset.yml @@ -10,7 +10,6 @@ dataset: data_categories: [system.operations] fides_meta: data_type: string - primary_key: True - name: email_address data_categories: [system.operations] fides_meta: diff --git a/data/dataset/postgres_example_invalid_masking_strategy_override_dataset.yml b/data/dataset/postgres_example_invalid_masking_strategy_override_dataset.yml index 5195a3671a..e66c2cd140 100644 --- a/data/dataset/postgres_example_invalid_masking_strategy_override_dataset.yml +++ b/data/dataset/postgres_example_invalid_masking_strategy_override_dataset.yml @@ -14,8 +14,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -31,8 +29,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: diff --git a/data/dataset/postgres_example_test_dataset.yml b/data/dataset/postgres_example_test_dataset.yml index a6b7080ec2..1f01fe1f03 100644 --- a/data/dataset/postgres_example_test_dataset.yml +++ b/data/dataset/postgres_example_test_dataset.yml @@ -17,7 +17,6 @@ dataset: data_categories: [system.operations] fides_meta: data_type: integer - primary_key: True - name: state data_categories: [user.contact.address.state] fides_meta: @@ -49,8 +48,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -73,8 +70,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -97,8 +92,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: time data_categories: [user.sensor] @@ -113,8 +106,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: @@ -165,8 +156,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: name data_categories: [user.financial] - name: preferred @@ -176,8 +165,6 @@ dataset: fields: - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: name data_categories: [system.operations] - name: price @@ -192,8 +179,6 @@ dataset: data_type: string - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: month data_categories: [system.operations] - name: name @@ -226,8 +211,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: opened data_categories: [system.operations] - name: visit diff --git a/data/dataset/redshift_example_test_dataset.yml b/data/dataset/redshift_example_test_dataset.yml index 9794f86bb3..2b1858e99a 100644 --- a/data/dataset/redshift_example_test_dataset.yml +++ b/data/dataset/redshift_example_test_dataset.yml @@ -11,8 +11,6 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -38,8 +36,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -62,8 +58,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -80,8 +74,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: time data_categories: [user.sensor] @@ -96,8 +88,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: @@ -148,8 +138,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: name data_categories: [user.financial] - name: preferred @@ -159,8 +147,6 @@ dataset: fields: - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: name data_categories: [system.operations] - name: price @@ -175,8 +161,6 @@ dataset: data_type: string - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: month data_categories: [system.operations] - name: name @@ -209,8 +193,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: opened data_categories: [system.operations] diff --git a/data/dataset/scylladb_example_test_dataset.yml b/data/dataset/scylladb_example_test_dataset.yml index 8374540cc1..38c0ea7b51 100644 --- a/data/dataset/scylladb_example_test_dataset.yml +++ b/data/dataset/scylladb_example_test_dataset.yml @@ -47,7 +47,6 @@ dataset: data_categories: [user.unique_id] fides_meta: data_type: integer - primary_key: True - name: uuid data_categories: [user.government_id] - name: user_activity @@ -60,12 +59,10 @@ dataset: field: users.user_id direction: from data_type: integer - primary_key: True - name: timestamp data_categories: [user.behavior] fides_meta: data_type: string - primary_key: True - name: user_agent data_categories: [user.device] fides_meta: @@ -80,7 +77,6 @@ dataset: data_categories: [system.operations] fides_meta: data_type: integer - primary_key: True - name: user_id data_categories: [user.unique_id] fides_meta: @@ -101,7 +97,6 @@ dataset: data_categories: [system.operations] fides_meta: data_type: integer - primary_key: True - name: payment_method_id data_categories: [system.operations] fides_meta: diff --git a/data/dataset/snowflake_example_test_dataset.yml b/data/dataset/snowflake_example_test_dataset.yml index da13723693..9b1b79f125 100644 --- a/data/dataset/snowflake_example_test_dataset.yml +++ b/data/dataset/snowflake_example_test_dataset.yml @@ -11,8 +11,6 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -38,8 +36,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -66,8 +62,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -84,8 +78,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: time data_categories: [user.sensor] @@ -100,8 +92,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: @@ -152,8 +142,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: name data_categories: [user.financial] - name: preferred @@ -163,8 +151,6 @@ dataset: fields: - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: name data_categories: [system.operations] - name: price @@ -179,8 +165,6 @@ dataset: data_type: string - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: month data_categories: [system.operations] - name: name @@ -213,8 +197,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: opened data_categories: [system.operations] diff --git a/data/dataset/timebase_example_test_dataset.yml b/data/dataset/timebase_example_test_dataset.yml index ffd57a7c67..fe8a7e7d1d 100644 --- a/data/dataset/timebase_example_test_dataset.yml +++ b/data/dataset/timebase_example_test_dataset.yml @@ -11,8 +11,6 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -38,8 +36,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -62,8 +58,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -80,8 +74,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: time data_categories: [user.sensor] @@ -96,8 +88,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: @@ -148,8 +138,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: name data_categories: [user.financial] - name: preferred @@ -159,8 +147,6 @@ dataset: fields: - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: name data_categories: [system.operations] - name: price @@ -175,8 +161,6 @@ dataset: data_type: string - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: month data_categories: [system.operations] - name: name @@ -209,8 +193,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: opened data_categories: [system.operations] diff --git a/data/saas/dataset/hubspot_dataset.yml b/data/saas/dataset/hubspot_dataset.yml index 94f0bf43f9..406c800840 100644 --- a/data/saas/dataset/hubspot_dataset.yml +++ b/data/saas/dataset/hubspot_dataset.yml @@ -8,7 +8,6 @@ dataset: - name: id data_categories: [user.unique_id] fidesops_meta: - primary_key: True data_type: string - name: properties fidesops_meta: @@ -117,7 +116,6 @@ dataset: - name: id data_categories: [system.operations] fidesops_meta: - primary_key: True data_type: string - name: name data_categories: [system.operations] @@ -152,7 +150,6 @@ dataset: - name: id data_categories: [user.unique_id] fidesops_meta: - primary_key: True data_type: string - name: email data_categories: [user.contact.email] diff --git a/data/saas/dataset/mailchimp_dataset.yml b/data/saas/dataset/mailchimp_dataset.yml index 05e3e45a2e..46751b04a4 100644 --- a/data/saas/dataset/mailchimp_dataset.yml +++ b/data/saas/dataset/mailchimp_dataset.yml @@ -35,8 +35,6 @@ dataset: fields: - name: id data_categories: [user.unique_id] - fidesops_meta: - primary_key: True - name: list_id data_categories: [system.operations] - name: email_address diff --git a/data/saas/dataset/stripe_dataset.yml b/data/saas/dataset/stripe_dataset.yml index f8e7482ecf..5b26474a13 100644 --- a/data/saas/dataset/stripe_dataset.yml +++ b/data/saas/dataset/stripe_dataset.yml @@ -7,8 +7,6 @@ dataset: fields: - name: id data_categories: [system.operations] - fidesops_meta: - primary_key: True read_only: True data_type: string - name: object @@ -617,8 +615,6 @@ dataset: fields: - name: id data_categories: [system.operations] - fidesops_meta: - primary_key: True data_type: string - name: object data_categories: [system.operations] @@ -714,8 +710,6 @@ dataset: fields: - name: id data_categories: [system.operations] - fidesops_meta: - primary_key: True data_type: string - name: object data_categories: [system.operations] @@ -754,8 +748,6 @@ dataset: fields: - name: id data_categories: [system.operations] - fidesops_meta: - primary_key: True data_type: string - name: object data_categories: [system.operations] @@ -923,8 +915,6 @@ dataset: fields: - name: id data_categories: [system.operations] - fidesops_meta: - primary_key: True data_type: string - name: object data_categories: [system.operations] @@ -958,8 +948,6 @@ dataset: fields: - name: id data_categories: [system.operations] - fidesops_meta: - primary_key: True data_type: string - name: object data_categories: [system.operations] @@ -1235,8 +1223,6 @@ dataset: fields: - name: id data_categories: [system.operations] - fidesops_meta: - primary_key: True data_type: string - name: object data_categories: [system.operations] @@ -1324,8 +1310,6 @@ dataset: fields: - name: id data_categories: [system.operations] - fidesops_meta: - primary_key: true data_type: string - name: object data_categories: [system.operations] diff --git a/src/fides/data/sample_project/sample_resources/mongo_example_test_dataset.yml b/src/fides/data/sample_project/sample_resources/mongo_example_test_dataset.yml index 3971b6481d..542887d5c7 100644 --- a/src/fides/data/sample_project/sample_resources/mongo_example_test_dataset.yml +++ b/src/fides/data/sample_project/sample_resources/mongo_example_test_dataset.yml @@ -7,8 +7,6 @@ dataset: fields: - name: _id data_categories: [system.operations] - fides_meta: - primary_key: True - name: customer_id data_categories: [user.unique_id] fides_meta: @@ -77,8 +75,6 @@ dataset: fields: - name: _id data_categories: [system.operations] - fides_meta: - primary_key: True data_type: object_id - name: customer_identifiers fields: @@ -108,8 +104,6 @@ dataset: fields: - name: _id data_categories: [system.operations] - fides_meta: - primary_key: True data_type: object_id - name: customer_information fields: @@ -141,8 +135,6 @@ dataset: fields: - name: _id data_categories: [system.operations] - fides_meta: - primary_key: True data_type: object_id - name: passenger_information fields: @@ -171,8 +163,6 @@ dataset: fields: - name: _id data_categories: [system.operations] - fides_meta: - primary_key: True data_type: object_id - name: thread fides_meta: @@ -196,8 +186,6 @@ dataset: fields: - name: _id data_categories: [system.operations] - fides_meta: - primary_key: True data_type: object_id - name: email data_categories: [user.contact.email] @@ -206,8 +194,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True references: - dataset: mongo_test field: flights.pilots @@ -220,8 +206,6 @@ dataset: fields: - name: _id data_categories: [system.operations] - fides_meta: - primary_key: True data_type: object_id - name: planes data_categories: [system.operations] @@ -239,8 +223,6 @@ dataset: fields: - name: _id data_categories: [system.operations] - fides_meta: - primary_key: True data_type: object_id - name: billing_address_id data_categories: [system.operations] @@ -257,8 +239,6 @@ dataset: data_categories: [user.unique_id] - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: name data_categories: [user.financial] - name: preferred @@ -266,8 +246,6 @@ dataset: - name: rewards fields: - name: _id - fides_meta: - primary_key: True data_type: object_id - name: owner fides_meta: diff --git a/src/fides/data/sample_project/sample_resources/postgres_example_custom_request_field_dataset.yml b/src/fides/data/sample_project/sample_resources/postgres_example_custom_request_field_dataset.yml index 96b58645d4..0a878fad87 100644 --- a/src/fides/data/sample_project/sample_resources/postgres_example_custom_request_field_dataset.yml +++ b/src/fides/data/sample_project/sample_resources/postgres_example_custom_request_field_dataset.yml @@ -10,7 +10,6 @@ dataset: data_categories: [system.operations] fides_meta: data_type: string - primary_key: True - name: email_address data_categories: [system.operations] fides_meta: diff --git a/src/fides/data/sample_project/sample_resources/postgres_example_test_dataset.yml b/src/fides/data/sample_project/sample_resources/postgres_example_test_dataset.yml index e519a75008..768c972d99 100644 --- a/src/fides/data/sample_project/sample_resources/postgres_example_test_dataset.yml +++ b/src/fides/data/sample_project/sample_resources/postgres_example_test_dataset.yml @@ -11,8 +11,6 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -38,8 +36,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -62,8 +58,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -80,8 +74,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: time data_categories: [user.sensor] @@ -96,8 +88,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: @@ -148,8 +138,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: name data_categories: [user.financial] - name: preferred @@ -159,8 +147,6 @@ dataset: fields: - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: name data_categories: [system.operations] - name: price @@ -175,8 +161,6 @@ dataset: data_type: string - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: month data_categories: [system.operations] - name: name @@ -209,8 +193,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: opened data_categories: [system.operations] diff --git a/tests/fixtures/saas/test_data/mailchimp_override_dataset.yml b/tests/fixtures/saas/test_data/mailchimp_override_dataset.yml index b0d39058e3..d9f6e3214b 100644 --- a/tests/fixtures/saas/test_data/mailchimp_override_dataset.yml +++ b/tests/fixtures/saas/test_data/mailchimp_override_dataset.yml @@ -41,8 +41,6 @@ dataset: fields: - name: id data_categories: [user.unique_id] - fidesops_meta: - primary_key: True - name: list_id data_categories: [system.operations] - name: email_address diff --git a/tests/fixtures/saas/test_data/saas_async_dataset.yml b/tests/fixtures/saas/test_data/saas_async_dataset.yml index 42f6557ddb..2398a307a1 100644 --- a/tests/fixtures/saas/test_data/saas_async_dataset.yml +++ b/tests/fixtures/saas/test_data/saas_async_dataset.yml @@ -7,8 +7,6 @@ dataset: fields: - name: id data_categories: [user.unique_id] - fidesops_meta: - primary_key: True - name: system_id data_categories: [system] - name: state diff --git a/tests/fixtures/saas/test_data/saas_custom_privacy_request_fields_dataset.yml b/tests/fixtures/saas/test_data/saas_custom_privacy_request_fields_dataset.yml index 0009dce2e2..c0e9f1f094 100644 --- a/tests/fixtures/saas/test_data/saas_custom_privacy_request_fields_dataset.yml +++ b/tests/fixtures/saas/test_data/saas_custom_privacy_request_fields_dataset.yml @@ -9,4 +9,3 @@ dataset: data_categories: [system.operations] fidesops_meta: data_type: integer - primary_key: True diff --git a/tests/fixtures/saas/test_data/saas_erasure_order_dataset.yml b/tests/fixtures/saas/test_data/saas_erasure_order_dataset.yml index 513088b6e2..2dcd28806d 100644 --- a/tests/fixtures/saas/test_data/saas_erasure_order_dataset.yml +++ b/tests/fixtures/saas/test_data/saas_erasure_order_dataset.yml @@ -9,35 +9,30 @@ dataset: data_categories: [system.operations] fidesops_meta: data_type: integer - primary_key: True - name: refunds fields: - name: id data_categories: [system.operations] fidesops_meta: data_type: integer - primary_key: True - name: labels fields: - name: id data_categories: [system.operations] fidesops_meta: data_type: integer - primary_key: True - name: orders_to_refunds fields: - name: id data_categories: [system.operations] fidesops_meta: data_type: integer - primary_key: True - name: refunds_to_orders fields: - name: id data_categories: [system.operations] fidesops_meta: data_type: integer - primary_key: True - name: products fields: - name: id diff --git a/tests/fixtures/saas/test_data/saas_example_dataset.yml b/tests/fixtures/saas/test_data/saas_example_dataset.yml index c0c430eb80..8eaa4ce3a5 100644 --- a/tests/fixtures/saas/test_data/saas_example_dataset.yml +++ b/tests/fixtures/saas/test_data/saas_example_dataset.yml @@ -41,8 +41,6 @@ dataset: fields: - name: id data_categories: [user.unique_id] - fidesops_meta: - primary_key: True - name: list_id data_categories: [system.operations] - name: email_address @@ -187,8 +185,6 @@ dataset: fields: - name: id data_categories: [system.operations] - fidesops_meta: - primary_key: True read_only: True - name: name fields: @@ -233,37 +229,31 @@ dataset: - name: id fidesops_meta: data_type: integer - primary_key: True - name: skipped_collection fields: - name: id fides_meta: data_type: integer - primary_key: True - name: request_with_output_template fields: - name: id fides_meta: data_type: integer - primary_key: True - name: request_with_invalid_output_template fields: - name: id fides_meta: data_type: integer - primary_key: True - name: standalone_output_template fields: - name: id fides_meta: data_type: integer - primary_key: True - name: complex_template_example fields: - name: id fides_meta: data_type: integer - primary_key: True - fides_key: saas_connector_external_example name: An Example External SaaS Dataset diff --git a/tests/ops/generator/test_data_generator.py b/tests/ops/generator/test_data_generator.py index af9ab1cc62..659185d080 100644 --- a/tests/ops/generator/test_data_generator.py +++ b/tests/ops/generator/test_data_generator.py @@ -19,8 +19,6 @@ - name: user fields: - name: id - fides_meta: - primary_key: True data_type: integer references: - dataset: db @@ -33,8 +31,6 @@ - name: address fields: - name: id - fides_meta: - primary_key: True data_type: integer - name: user_id - name: street diff --git a/tests/ops/service/dataset/example_datasets/multiple_identities.yml b/tests/ops/service/dataset/example_datasets/multiple_identities.yml index 053afb3ced..dd76dbfa6d 100644 --- a/tests/ops/service/dataset/example_datasets/multiple_identities.yml +++ b/tests/ops/service/dataset/example_datasets/multiple_identities.yml @@ -16,8 +16,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: diff --git a/tests/ops/service/dataset/example_datasets/multiple_identities_with_external_dependency.yml b/tests/ops/service/dataset/example_datasets/multiple_identities_with_external_dependency.yml index fdfcd32bfc..db9e227a74 100644 --- a/tests/ops/service/dataset/example_datasets/multiple_identities_with_external_dependency.yml +++ b/tests/ops/service/dataset/example_datasets/multiple_identities_with_external_dependency.yml @@ -32,7 +32,5 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: shipping_address_id data_categories: [system.operations] diff --git a/tests/ops/service/dataset/example_datasets/no_identities.yml b/tests/ops/service/dataset/example_datasets/no_identities.yml index fac879de99..82b56f9c65 100644 --- a/tests/ops/service/dataset/example_datasets/no_identities.yml +++ b/tests/ops/service/dataset/example_datasets/no_identities.yml @@ -13,8 +13,6 @@ dataset: data_categories: [user.contact.email] - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: diff --git a/tests/ops/service/dataset/example_datasets/single_identity.yml b/tests/ops/service/dataset/example_datasets/single_identity.yml index 19cdc7df3e..ce1506886d 100644 --- a/tests/ops/service/dataset/example_datasets/single_identity.yml +++ b/tests/ops/service/dataset/example_datasets/single_identity.yml @@ -16,8 +16,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: diff --git a/tests/ops/service/dataset/example_datasets/single_identity_with_internal_dependency.yml b/tests/ops/service/dataset/example_datasets/single_identity_with_internal_dependency.yml index 708aefbaf0..af73f8bcb8 100644 --- a/tests/ops/service/dataset/example_datasets/single_identity_with_internal_dependency.yml +++ b/tests/ops/service/dataset/example_datasets/single_identity_with_internal_dependency.yml @@ -16,8 +16,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: diff --git a/tests/ops/util/test_dataset_yaml.py b/tests/ops/util/test_dataset_yaml.py index edaa26a7ca..a610ac7569 100644 --- a/tests/ops/util/test_dataset_yaml.py +++ b/tests/ops/util/test_dataset_yaml.py @@ -33,7 +33,6 @@ - name: id data_categories: [system.operations] fidesops_meta: - primary_key: True data_type: integer """ @@ -47,7 +46,6 @@ - name: _id data_categories: [system.operations] fidesops_meta: - primary_key: True data_type: object_id - name: photo_id data_categories: [user.unique_id] @@ -223,7 +221,6 @@ def test_invalid_datatype(): - name: id data_categories: [system.operations] fidesops_meta: - primary_key: True data_type: integer - name: users fields: From fc1aaccca78d7e2ce54c945c19a261c1157df09f Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Mon, 9 Dec 2024 12:28:15 -0800 Subject: [PATCH 07/22] Simplifying generate_update_stmt and fixing tests --- .../dataset/dynamodb_example_test_dataset.yml | 1 + data/dataset/mongo_example_test_dataset.yml | 16 +++++-- .../query_configs/mongodb_query_config.py | 2 +- .../connectors/query_configs/query_config.py | 24 +++------- .../service/connectors/test_query_config.py | 47 +++++++++++-------- 5 files changed, 48 insertions(+), 42 deletions(-) diff --git a/data/dataset/dynamodb_example_test_dataset.yml b/data/dataset/dynamodb_example_test_dataset.yml index 4aa3f8b2bf..a4e5a1291a 100644 --- a/data/dataset/dynamodb_example_test_dataset.yml +++ b/data/dataset/dynamodb_example_test_dataset.yml @@ -19,6 +19,7 @@ dataset: data_categories: [system.operations] - name: email data_categories: [user.contact.email] + fides_meta: identity: email data_type: string - name: name diff --git a/data/dataset/mongo_example_test_dataset.yml b/data/dataset/mongo_example_test_dataset.yml index 4392c00bfc..ece5817b1a 100644 --- a/data/dataset/mongo_example_test_dataset.yml +++ b/data/dataset/mongo_example_test_dataset.yml @@ -79,6 +79,7 @@ dataset: fields: - name: _id data_categories: [system.operations] + fides_meta: data_type: object_id - name: customer_identifiers fields: @@ -108,6 +109,7 @@ dataset: fields: - name: _id data_categories: [system.operations] + fides_meta: data_type: object_id - name: customer_information fields: @@ -139,6 +141,7 @@ dataset: fields: - name: _id data_categories: [system.operations] + fides_meta: data_type: object_id - name: passenger_information fields: @@ -167,6 +170,7 @@ dataset: fields: - name: _id data_categories: [system.operations] + fides_meta: data_type: object_id - name: thread fides_meta: @@ -190,6 +194,7 @@ dataset: fields: - name: _id data_categories: [system.operations] + fides_meta: data_type: object_id - name: email data_categories: [user.contact.email] @@ -198,10 +203,10 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - references: - - dataset: mongo_test - field: flights.pilots - direction: from + references: + - dataset: mongo_test + field: flights.pilots + direction: from - name: name data_categories: [user.name] fides_meta: @@ -210,6 +215,7 @@ dataset: fields: - name: _id data_categories: [system.operations] + fides_meta: data_type: object_id - name: planes data_categories: [system.operations] @@ -227,6 +233,7 @@ dataset: fields: - name: _id data_categories: [system.operations] + fides_meta: data_type: object_id - name: billing_address_id data_categories: [system.operations] @@ -250,6 +257,7 @@ dataset: - name: rewards fields: - name: _id + fides_meta: data_type: object_id - name: owner fides_meta: diff --git a/src/fides/api/service/connectors/query_configs/mongodb_query_config.py b/src/fides/api/service/connectors/query_configs/mongodb_query_config.py index 1a6aa303f0..a132f2dfc8 100644 --- a/src/fides/api/service/connectors/query_configs/mongodb_query_config.py +++ b/src/fides/api/service/connectors/query_configs/mongodb_query_config.py @@ -72,7 +72,7 @@ def generate_update_stmt( where_clauses: Dict[str, Any] = filter_nonempty_values( { field_path.string_path: field.cast(row[field_path.string_path]) - for field_path, field in self.primary_key_field_paths.items() + for field_path, field in self.identity_or_reference_field_paths.items() } ) diff --git a/src/fides/api/service/connectors/query_configs/query_config.py b/src/fides/api/service/connectors/query_configs/query_config.py index 4a29e0ace5..abc3da34fe 100644 --- a/src/fides/api/service/connectors/query_configs/query_config.py +++ b/src/fides/api/service/connectors/query_configs/query_config.py @@ -453,32 +453,20 @@ def generate_update_stmt( } ) - # Identify overlapping fields and create parameter mappings - overlapping_keys = set(update_value_map.keys()) & set( - non_empty_reference_fields.keys() - ) + # Create parameter mappings with masked_ prefix for SET values param_map = { - **{k: v for k, v in update_value_map.items()}, # SET values - **{ - f"where_{k}" if k in overlapping_keys else k: v - for k, v in non_empty_reference_fields.items() - }, # WHERE values + **{f"masked_{k}": v for k, v in update_value_map.items()}, + **non_empty_reference_fields, } - # Generate SQL clauses using parameter names update_clauses = self.get_update_clauses( - {k: k for k in update_value_map}, non_empty_reference_fields + {k: f"masked_{k}" for k in update_value_map}, + non_empty_reference_fields, ) where_clauses = self.format_key_map_for_update_stmt( - { - k: f"where_{k}" if k in overlapping_keys else k - for k in non_empty_reference_fields - } + {k: k for k in non_empty_reference_fields} ) - for k, v in non_empty_reference_fields.items(): - update_value_map[k] = v - valid = len(where_clauses) > 0 and len(update_clauses) > 0 if not valid: logger.warning( diff --git a/tests/ops/service/connectors/test_query_config.py b/tests/ops/service/connectors/test_query_config.py index 991c945081..75c9b26c1b 100644 --- a/tests/ops/service/connectors/test_query_config.py +++ b/tests/ops/service/connectors/test_query_config.py @@ -288,10 +288,12 @@ def test_generate_update_stmt_one_field( text_clause = config.generate_update_stmt(row, erasure_policy, privacy_request) assert ( text_clause.text - == """UPDATE customer SET name = :name WHERE email = :email""" + == """UPDATE customer SET name = :masked_name WHERE email = :email""" ) - assert text_clause._bindparams["name"].key == "name" - assert text_clause._bindparams["name"].value is None # Null masking strategy + assert text_clause._bindparams["masked_name"].key == "masked_name" + assert ( + text_clause._bindparams["masked_name"].value is None + ) # Null masking strategy def test_generate_update_stmt_one_field_inbound_reference( self, erasure_policy_address_city, example_datasets, connection_config @@ -317,9 +319,14 @@ def test_generate_update_stmt_one_field_inbound_reference( text_clause = config.generate_update_stmt( row, erasure_policy_address_city, privacy_request ) - assert text_clause.text == """UPDATE address SET city = :city WHERE id = :id""" - assert text_clause._bindparams["city"].key == "city" - assert text_clause._bindparams["city"].value is None # Null masking strategy + assert ( + text_clause.text + == """UPDATE address SET city = :masked_city WHERE id = :id""" + ) + assert text_clause._bindparams["masked_city"].key == "masked_city" + assert ( + text_clause._bindparams["masked_city"].value is None + ) # Null masking strategy def test_generate_update_stmt_length_truncation( self, @@ -349,12 +356,12 @@ def test_generate_update_stmt_length_truncation( ) assert ( text_clause.text - == """UPDATE customer SET name = :name WHERE email = :email""" + == """UPDATE customer SET name = :masked_name WHERE email = :email""" ) - assert text_clause._bindparams["name"].key == "name" + assert text_clause._bindparams["masked_name"].key == "masked_name" # length truncation on name field assert ( - text_clause._bindparams["name"].value + text_clause._bindparams["masked_name"].value == "some rewrite value that is very long and" ) @@ -399,23 +406,23 @@ def test_generate_update_stmt_multiple_fields_same_rule( text_clause = config.generate_update_stmt(row, erasure_policy, privacy_request) assert ( text_clause.text - == "UPDATE customer SET email = :email, name = :name WHERE email = :where_email" + == "UPDATE customer SET email = :masked_email, name = :masked_name WHERE email = :email" ) - assert text_clause._bindparams["name"].key == "name" + assert text_clause._bindparams["masked_name"].key == "masked_name" # since length is set to 40 in dataset.yml, we expect only first 40 chars of masked val assert ( - text_clause._bindparams["name"].value + text_clause._bindparams["masked_name"].value == HashMaskingStrategy(HashMaskingConfiguration(algorithm="SHA-512")).mask( ["John Customer"], request_id=privacy_request.id )[0][0:40] ) assert ( - text_clause._bindparams["email"].value + text_clause._bindparams["masked_email"].value == HashMaskingStrategy(HashMaskingConfiguration(algorithm="SHA-512")).mask( ["customer-1@example.com"], request_id=privacy_request.id )[0] ) - assert text_clause._bindparams["where_email"].value == "customer-1@example.com" + assert text_clause._bindparams["email"].value == "customer-1@example.com" clear_cache_secrets(privacy_request.id) def test_generate_update_stmts_from_multiple_rules( @@ -444,12 +451,14 @@ def test_generate_update_stmts_from_multiple_rules( assert ( text_clause.text - == "UPDATE customer SET name = :name, email = :email WHERE email = :where_email" + == "UPDATE customer SET email = :masked_email, name = :masked_name WHERE email = :email" ) # Two different masking strategies used for name and email - assert text_clause._bindparams["name"].value is None # Null masking strategy assert ( - text_clause._bindparams["email"].value == "*****" + text_clause._bindparams["masked_name"].value is None + ) # Null masking strategy + assert ( + text_clause._bindparams["masked_email"].value == "*****" ) # String rewrite masking strategy @@ -618,7 +627,7 @@ def test_generate_update_stmt_multiple_fields( row, erasure_policy, privacy_request ) - expected_result_0 = {"_id": 1} + expected_result_0 = {"customer_id": 1} expected_result_1 = { "$set": { "birthday": None, @@ -700,7 +709,7 @@ def test_generate_update_stmt_multiple_rules( mongo_statement = config.generate_update_stmt( row, erasure_policy_two_rules, privacy_request ) - assert mongo_statement[0] == {"_id": 1} + assert mongo_statement[0] == {"customer_id": 1} assert len(mongo_statement[1]["$set"]["gender"]) == 30 assert ( mongo_statement[1]["$set"]["birthday"] From 0e11551ccb961203af62fb8eda00be3e1484d63c Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Mon, 9 Dec 2024 13:11:04 -0800 Subject: [PATCH 08/22] More cleanup --- .../fixtures/connectors/datasetconfig.json | 18 ++--- tests/fixtures/email_fixtures.py | 27 ++++--- .../saas/test_data/saas_example_dataset.yml | 1 + tests/ops/models/test_datasetconfig.py | 7 +- tests/ops/task/test_create_request_tasks.py | 6 +- tests/ops/task/traversal_data.py | 70 +++++++------------ 6 files changed, 54 insertions(+), 75 deletions(-) diff --git a/clients/admin-ui/cypress/fixtures/connectors/datasetconfig.json b/clients/admin-ui/cypress/fixtures/connectors/datasetconfig.json index 6cf4d7d77c..c41d13993b 100644 --- a/clients/admin-ui/cypress/fixtures/connectors/datasetconfig.json +++ b/clients/admin-ui/cypress/fixtures/connectors/datasetconfig.json @@ -38,7 +38,7 @@ "fides_meta": { "references": null, "identity": null, - "primary_key": true, + "primary_key": null, "data_type": null, "length": null, "return_all_elements": null, @@ -125,7 +125,7 @@ "fides_meta": { "references": null, "identity": null, - "primary_key": true, + "primary_key": null, "data_type": null, "length": null, "return_all_elements": null, @@ -199,7 +199,7 @@ "fides_meta": { "references": null, "identity": null, - "primary_key": true, + "primary_key": null, "data_type": null, "length": null, "return_all_elements": null, @@ -258,7 +258,7 @@ "fides_meta": { "references": null, "identity": null, - "primary_key": true, + "primary_key": null, "data_type": null, "length": null, "return_all_elements": null, @@ -366,7 +366,7 @@ "fides_meta": { "references": null, "identity": null, - "primary_key": true, + "primary_key": null, "data_type": null, "length": null, "return_all_elements": null, @@ -466,7 +466,7 @@ "fides_meta": { "references": null, "identity": null, - "primary_key": true, + "primary_key": null, "data_type": null, "length": null, "return_all_elements": null, @@ -503,7 +503,7 @@ "fides_meta": { "references": null, "identity": null, - "primary_key": true, + "primary_key": null, "data_type": null, "length": null, "return_all_elements": null, @@ -555,7 +555,7 @@ "fides_meta": { "references": null, "identity": null, - "primary_key": true, + "primary_key": null, "data_type": null, "length": null, "return_all_elements": null, @@ -664,7 +664,7 @@ "fides_meta": { "references": null, "identity": null, - "primary_key": true, + "primary_key": null, "data_type": null, "length": null, "return_all_elements": null, diff --git a/tests/fixtures/email_fixtures.py b/tests/fixtures/email_fixtures.py index 0df1d61b84..4d9f6c5587 100644 --- a/tests/fixtures/email_fixtures.py +++ b/tests/fixtures/email_fixtures.py @@ -193,29 +193,28 @@ def dynamic_email_address_config_dataset( "name": "id", "data_categories": ["system.operations"], "fides_meta": { - "data_type": "string", - "primary_key": True, + "data_type": "string" }, }, { "name": "email_address", "data_categories": ["system.operations"], "fides_meta": { - "data_type": "string", + "data_type": "string" }, }, { "name": "vendor_name", "data_categories": ["system.operations"], "fides_meta": { - "data_type": "string", + "data_type": "string" }, }, { "name": "site_id", "data_categories": ["system.operations"], "fides_meta": { - "data_type": "string", + "data_type": "string" "custom_request_field": "tenant_id", }, }, @@ -246,29 +245,28 @@ def dynamic_email_address_config_second_dataset( "name": "id", "data_categories": ["system.operations"], "fides_meta": { - "data_type": "string", - "primary_key": True, + "data_type": "string" }, }, { "name": "email_address", "data_categories": ["system.operations"], "fides_meta": { - "data_type": "string", + "data_type": "string" }, }, { "name": "vendor_name", "data_categories": ["system.operations"], "fides_meta": { - "data_type": "string", + "data_type": "string" }, }, { "name": "custom_field", "data_categories": ["system.operations"], "fides_meta": { - "data_type": "string", + "data_type": "string" "custom_request_field": "custom_field", }, }, @@ -281,29 +279,28 @@ def dynamic_email_address_config_second_dataset( "name": "id2", "data_categories": ["system.operations"], "fides_meta": { - "data_type": "string", - "primary_key": True, + "data_type": "string" }, }, { "name": "email_address2", "data_categories": ["system.operations"], "fides_meta": { - "data_type": "string", + "data_type": "string" }, }, { "name": "vendor_name2", "data_categories": ["system.operations"], "fides_meta": { - "data_type": "string", + "data_type": "string" }, }, { "name": "site_id2", "data_categories": ["system.operations"], "fides_meta": { - "data_type": "string", + "data_type": "string" "custom_request_field": "tenant_id", }, }, diff --git a/tests/fixtures/saas/test_data/saas_example_dataset.yml b/tests/fixtures/saas/test_data/saas_example_dataset.yml index 8eaa4ce3a5..500c82df50 100644 --- a/tests/fixtures/saas/test_data/saas_example_dataset.yml +++ b/tests/fixtures/saas/test_data/saas_example_dataset.yml @@ -185,6 +185,7 @@ dataset: fields: - name: id data_categories: [system.operations] + fidesops_meta: read_only: True - name: name fields: diff --git a/tests/ops/models/test_datasetconfig.py b/tests/ops/models/test_datasetconfig.py index 969002baac..933c2fcb3f 100644 --- a/tests/ops/models/test_datasetconfig.py +++ b/tests/ops/models/test_datasetconfig.py @@ -194,18 +194,17 @@ def test_convert_dataset_to_graph(example_datasets): (FieldAddress("postgres_example_test_dataset", "customer", "id"), "from") ] - # check that primary key member has been set assert ( field([graph], "postgres_example_test_dataset", "address", "id").primary_key - is True + is False ) assert ( field([graph], "postgres_example_test_dataset", "customer", "id").primary_key - is True + is False ) assert ( field([graph], "postgres_example_test_dataset", "employee", "id").primary_key - is True + is False ) assert ( field([graph], "postgres_example_test_dataset", "visit", "email").primary_key diff --git a/tests/ops/task/test_create_request_tasks.py b/tests/ops/task/test_create_request_tasks.py index 290c2dc1be..3792fea0e3 100644 --- a/tests/ops/task/test_create_request_tasks.py +++ b/tests/ops/task/test_create_request_tasks.py @@ -105,7 +105,7 @@ "is_array": False, "read_only": None, "references": [], - "primary_key": True, + "primary_key": False, "data_categories": ["system.operations"], "data_type_converter": "None", "return_all_elements": None, @@ -307,7 +307,7 @@ def test_persist_access_tasks_with_object_fields_in_collection( "is_array": False, "read_only": None, "references": [], - "primary_key": True, + "primary_key": False, "data_categories": ["system.operations"], "data_type_converter": "object_id", "return_all_elements": None, @@ -927,7 +927,7 @@ def test_erase_after_saas_upstream_and_downstream_tasks( "is_array": False, "read_only": None, "references": [], - "primary_key": True, + "primary_key": False, "data_categories": ["system.operations"], "data_type_converter": "integer", "return_all_elements": None, diff --git a/tests/ops/task/traversal_data.py b/tests/ops/task/traversal_data.py index 20d3773e17..59032b337f 100644 --- a/tests/ops/task/traversal_data.py +++ b/tests/ops/task/traversal_data.py @@ -33,7 +33,7 @@ def postgres_dataset_dict(db_name: str) -> Dict[str, Any]: "fields": [ { "name": "id", - "fides_meta": {"primary_key": True, "data_type": "integer"}, + "fides_meta": {"data_type": "integer"}, }, {"name": "name", "fides_meta": {"data_type": "string"}}, { @@ -58,7 +58,7 @@ def postgres_dataset_dict(db_name: str) -> Dict[str, Any]: "name": "address", "after": [f"{db_name}.customer", f"{db_name}.orders"], "fields": [ - {"name": "id", "fides_meta": {"primary_key": True}}, + {"name": "id"}, {"name": "street", "fides_meta": {"data_type": "string"}}, {"name": "city", "fides_meta": {"data_type": "string"}}, {"name": "state", "fides_meta": {"data_type": "string"}}, @@ -68,7 +68,7 @@ def postgres_dataset_dict(db_name: str) -> Dict[str, Any]: { "name": "orders", "fields": [ - {"name": "id", "fides_meta": {"primary_key": True}}, + {"name": "id"}, { "name": "customer_id", "fides_meta": { @@ -113,7 +113,7 @@ def postgres_dataset_dict(db_name: str) -> Dict[str, Any]: "fields": [ { "name": "id", - "fides_meta": {"primary_key": True, "data_type": "string"}, + "fides_meta": {"data_type": "string"}, }, {"name": "name", "fides_meta": {"data_type": "string"}}, {"name": "ccn"}, @@ -156,7 +156,7 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase { "name": "address", "fields": [ - {"name": "_id", "fides_meta": {"primary_key": True}}, + {"name": "_id"}, { "name": "id", "fides_meta": { @@ -178,7 +178,7 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase { "name": "orders", "fields": [ - {"name": "_id", "fides_meta": {"primary_key": True}}, + {"name": "_id"}, { "name": "customer_id", "fides_meta": { @@ -200,7 +200,6 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase { "name": "_id", "fides_meta": { - "primary_key": True, "data_type": "object_id", }, }, @@ -229,7 +228,6 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase { "name": "_id", "fides_meta": { - "primary_key": True, "data_type": "object_id", }, }, @@ -240,25 +238,25 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase { "name": "comment", "fides_meta": { - "data_type": "string", + "data_type": "string" }, }, { "name": "message", "fides_meta": { - "data_type": "string", + "data_type": "string" }, }, { "name": "chat_name", "fides_meta": { - "data_type": "string", + "data_type": "string" }, }, { "name": "ccn", "fides_meta": { - "data_type": "string", + "data_type": "string" }, }, ], @@ -270,9 +268,6 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase "fields": [ { "name": "_id", - "fides_meta": { - "primary_key": True, - }, }, { "name": "birthday", @@ -320,19 +315,19 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase { "name": "name", "fides_meta": { - "data_type": "string", + "data_type": "string" }, }, { "name": "relationship", "fides_meta": { - "data_type": "string", + "data_type": "string" }, }, { "name": "phone", "fides_meta": { - "data_type": "string", + "data_type": "string" }, }, ], @@ -352,13 +347,13 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase { "name": "employer", "fides_meta": { - "data_type": "string", + "data_type": "string" }, }, { "name": "position", "fides_meta": { - "data_type": "string", + "data_type": "string" }, }, { @@ -375,7 +370,6 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase { "name": "_id", "fides_meta": { - "primary_key": True, "data_type": "object_id", }, }, @@ -386,20 +380,20 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase { "name": "email", "fides_meta": { - "data_type": "string", + "data_type": "string" "identity": "email", }, }, { "name": "phone", "fides_meta": { - "data_type": "string", + "data_type": "string" }, }, { "name": "internal_customer_id", "fides_meta": { - "data_type": "string", + "data_type": "string" }, }, ], @@ -424,7 +418,6 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase { "name": "_id", "fides_meta": { - "primary_key": True, "data_type": "object_id", }, }, @@ -438,7 +431,6 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase { "name": "id", "fides_meta": { - "primary_key": True, "references": [ { "dataset": mongo_db_name, @@ -459,10 +451,7 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase "fields": [ { "name": "_id", - "fides_meta": { - "primary_key": True, - "data_type": "object_id", - }, + "fides_meta": {"data_type": "object_id"}, }, { "name": "date", @@ -490,7 +479,7 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase { "name": "full_name", "fides_meta": { - "data_type": "string", + "data_type": "string" }, }, ], @@ -510,10 +499,7 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase "fields": [ { "name": "_id", - "fides_meta": { - "primary_key": True, - "data_type": "object_id", - }, + "fides_meta": {"data_type": "object_id"}, }, { "name": "customer_identifiers", @@ -560,10 +546,7 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase "fields": [ { "name": "_id", - "fides_meta": { - "primary_key": True, - "data_type": "object_id", - }, + "fides_meta": {"data_type": "object_id"}, }, { "name": "owner", @@ -657,7 +640,7 @@ def scylladb_dataset_dict(db_name: str) -> Dict[str, Any]: { "name": "user_id", "data_categories": ["user.unique_id"], - "fides_meta": {"data_type": "integer", "primary_key": True}, + "fides_meta": {"data_type": "integer"}, }, {"name": "uuid", "data_categories": ["user.government_id"]}, ], @@ -677,13 +660,12 @@ def scylladb_dataset_dict(db_name: str) -> Dict[str, Any]: } ], "data_type": "integer", - "primary_key": True, }, }, { "name": "timestamp", "data_categories": ["user.behavior"], - "fides_meta": {"data_type": "string", "primary_key": True}, + "fides_meta": {"data_type": "string"}, }, { "name": "user_agent", @@ -703,7 +685,7 @@ def scylladb_dataset_dict(db_name: str) -> Dict[str, Any]: { "name": "payment_method_id", "data_categories": ["system.operations"], - "fides_meta": {"data_type": "integer", "primary_key": True}, + "fides_meta": {"data_type": "integer"}, }, { "name": "user_id", @@ -733,7 +715,7 @@ def scylladb_dataset_dict(db_name: str) -> Dict[str, Any]: { "name": "order_id", "data_categories": ["system.operations"], - "fides_meta": {"data_type": "integer", "primary_key": True}, + "fides_meta": {"data_type": "integer"}, }, { "name": "payment_method_id", From 6a10d877ab202805e7979492cde710493fd11f3a Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Mon, 9 Dec 2024 13:34:13 -0800 Subject: [PATCH 09/22] Misc fixes --- .../connectors/query_configs/query_config.py | 8 ++- .../service/connectors/scylla_query_config.py | 8 +-- tests/fixtures/email_fixtures.py | 42 +++++----------- tests/ops/task/traversal_data.py | 50 +++++-------------- 4 files changed, 36 insertions(+), 72 deletions(-) diff --git a/src/fides/api/service/connectors/query_configs/query_config.py b/src/fides/api/service/connectors/query_configs/query_config.py index abc3da34fe..b201c2c3a4 100644 --- a/src/fides/api/service/connectors/query_configs/query_config.py +++ b/src/fides/api/service/connectors/query_configs/query_config.py @@ -428,7 +428,9 @@ def get_update_stmt( @abstractmethod def get_update_clauses( - self, update_value_map: Dict[str, Any], non_empty_primary_keys: Dict[str, Field] + self, + update_value_map: Dict[str, Any], + non_empty_reference_fields: Dict[str, Field], ) -> List[str]: """Returns a list of update clauses for the update statement.""" @@ -553,7 +555,9 @@ def format_key_map_for_update_stmt(self, param_map: Dict[str, Any]) -> List[str] return [f"{k} = :{v}" for k, v in sorted(param_map.items())] def get_update_clauses( - self, update_value_map: Dict[str, Any], non_empty_primary_keys: Dict[str, Field] + self, + update_value_map: Dict[str, Any], + non_empty_reference_fields: Dict[str, Field], ) -> List[str]: """Returns a list of update clauses for the update statement.""" return self.format_key_map_for_update_stmt(update_value_map) diff --git a/src/fides/api/service/connectors/scylla_query_config.py b/src/fides/api/service/connectors/scylla_query_config.py index dc619a72c7..5e93668459 100644 --- a/src/fides/api/service/connectors/scylla_query_config.py +++ b/src/fides/api/service/connectors/scylla_query_config.py @@ -75,14 +75,16 @@ def format_key_map_for_update_stmt(self, param_map: Dict[str, Any]) -> List[str] return [f"{k} = %({v})s" for k, v in sorted(param_map.items())] def get_update_clauses( - self, update_value_map: Dict[str, Any], non_empty_primary_keys: Dict[str, Field] + self, + update_value_map: Dict[str, Any], + non_empty_reference_fields: Dict[str, Field], ) -> List[str]: """Returns a list of update clauses for the update statement.""" return self.format_key_map_for_update_stmt( { key: value - for key, value in update_value_map.keys() - if key not in non_empty_primary_keys + for key, value in update_value_map.items() + if key not in non_empty_reference_fields } ) diff --git a/tests/fixtures/email_fixtures.py b/tests/fixtures/email_fixtures.py index 4d9f6c5587..e25f39e3f4 100644 --- a/tests/fixtures/email_fixtures.py +++ b/tests/fixtures/email_fixtures.py @@ -192,29 +192,23 @@ def dynamic_email_address_config_dataset( { "name": "id", "data_categories": ["system.operations"], - "fides_meta": { - "data_type": "string" - }, + "fides_meta": {"data_type": "string"}, }, { "name": "email_address", "data_categories": ["system.operations"], - "fides_meta": { - "data_type": "string" - }, + "fides_meta": {"data_type": "string"}, }, { "name": "vendor_name", "data_categories": ["system.operations"], - "fides_meta": { - "data_type": "string" - }, + "fides_meta": {"data_type": "string"}, }, { "name": "site_id", "data_categories": ["system.operations"], "fides_meta": { - "data_type": "string" + "data_type": "string", "custom_request_field": "tenant_id", }, }, @@ -244,29 +238,23 @@ def dynamic_email_address_config_second_dataset( { "name": "id", "data_categories": ["system.operations"], - "fides_meta": { - "data_type": "string" - }, + "fides_meta": {"data_type": "string"}, }, { "name": "email_address", "data_categories": ["system.operations"], - "fides_meta": { - "data_type": "string" - }, + "fides_meta": {"data_type": "string"}, }, { "name": "vendor_name", "data_categories": ["system.operations"], - "fides_meta": { - "data_type": "string" - }, + "fides_meta": {"data_type": "string"}, }, { "name": "custom_field", "data_categories": ["system.operations"], "fides_meta": { - "data_type": "string" + "data_type": "string", "custom_request_field": "custom_field", }, }, @@ -278,29 +266,23 @@ def dynamic_email_address_config_second_dataset( { "name": "id2", "data_categories": ["system.operations"], - "fides_meta": { - "data_type": "string" - }, + "fides_meta": {"data_type": "string"}, }, { "name": "email_address2", "data_categories": ["system.operations"], - "fides_meta": { - "data_type": "string" - }, + "fides_meta": {"data_type": "string"}, }, { "name": "vendor_name2", "data_categories": ["system.operations"], - "fides_meta": { - "data_type": "string" - }, + "fides_meta": {"data_type": "string"}, }, { "name": "site_id2", "data_categories": ["system.operations"], "fides_meta": { - "data_type": "string" + "data_type": "string", "custom_request_field": "tenant_id", }, }, diff --git a/tests/ops/task/traversal_data.py b/tests/ops/task/traversal_data.py index 59032b337f..07c6067d7b 100644 --- a/tests/ops/task/traversal_data.py +++ b/tests/ops/task/traversal_data.py @@ -237,27 +237,19 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase "fields": [ { "name": "comment", - "fides_meta": { - "data_type": "string" - }, + "fides_meta": {"data_type": "string"}, }, { "name": "message", - "fides_meta": { - "data_type": "string" - }, + "fides_meta": {"data_type": "string"}, }, { "name": "chat_name", - "fides_meta": { - "data_type": "string" - }, + "fides_meta": {"data_type": "string"}, }, { "name": "ccn", - "fides_meta": { - "data_type": "string" - }, + "fides_meta": {"data_type": "string"}, }, ], }, @@ -314,21 +306,15 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase "fields": [ { "name": "name", - "fides_meta": { - "data_type": "string" - }, + "fides_meta": {"data_type": "string"}, }, { "name": "relationship", - "fides_meta": { - "data_type": "string" - }, + "fides_meta": {"data_type": "string"}, }, { "name": "phone", - "fides_meta": { - "data_type": "string" - }, + "fides_meta": {"data_type": "string"}, }, ], }, @@ -346,15 +332,11 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase "fields": [ { "name": "employer", - "fides_meta": { - "data_type": "string" - }, + "fides_meta": {"data_type": "string"}, }, { "name": "position", - "fides_meta": { - "data_type": "string" - }, + "fides_meta": {"data_type": "string"}, }, { "name": "direct_reports", @@ -380,21 +362,17 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase { "name": "email", "fides_meta": { - "data_type": "string" + "data_type": "string", "identity": "email", }, }, { "name": "phone", - "fides_meta": { - "data_type": "string" - }, + "fides_meta": {"data_type": "string"}, }, { "name": "internal_customer_id", - "fides_meta": { - "data_type": "string" - }, + "fides_meta": {"data_type": "string"}, }, ], }, @@ -478,9 +456,7 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase }, { "name": "full_name", - "fides_meta": { - "data_type": "string" - }, + "fides_meta": {"data_type": "string"}, }, ], }, From 001b8ecfcc67eaa74f2e4321b44572140e6f2946 Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Mon, 9 Dec 2024 13:45:01 -0800 Subject: [PATCH 10/22] Renaming identity_or_reference_fields_paths to incoming_field_paths --- .../connectors/query_configs/mongodb_query_config.py | 2 +- .../api/service/connectors/query_configs/query_config.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/fides/api/service/connectors/query_configs/mongodb_query_config.py b/src/fides/api/service/connectors/query_configs/mongodb_query_config.py index a132f2dfc8..edb57599db 100644 --- a/src/fides/api/service/connectors/query_configs/mongodb_query_config.py +++ b/src/fides/api/service/connectors/query_configs/mongodb_query_config.py @@ -72,7 +72,7 @@ def generate_update_stmt( where_clauses: Dict[str, Any] = filter_nonempty_values( { field_path.string_path: field.cast(row[field_path.string_path]) - for field_path, field in self.identity_or_reference_field_paths.items() + for field_path, field in self.incoming_field_paths.items() } ) diff --git a/src/fides/api/service/connectors/query_configs/query_config.py b/src/fides/api/service/connectors/query_configs/query_config.py index b201c2c3a4..2026fe0b0f 100644 --- a/src/fides/api/service/connectors/query_configs/query_config.py +++ b/src/fides/api/service/connectors/query_configs/query_config.py @@ -101,8 +101,8 @@ def primary_key_field_paths(self) -> Dict[FieldPath, Field]: } @property - def identity_or_reference_field_paths(self) -> Dict[FieldPath, Field]: - """Mapping of FieldPaths to Fields that have identity or dataset references""" + def incoming_field_paths(self) -> Dict[FieldPath, Field]: + """Mapping of FieldPaths to Fields that have incoming identity or dataset references""" return { field_path: field for field_path, field in self.field_map().items() @@ -450,7 +450,7 @@ def generate_update_stmt( non_empty_reference_fields: Dict[str, Field] = filter_nonempty_values( { fpath.string_path: fld.cast(row[fpath.string_path]) - for fpath, fld in self.identity_or_reference_field_paths.items() + for fpath, fld in self.incoming_field_paths.items() if fpath.string_path in row } ) From 8816be7d70fdc865a0f644b01d97424a8956536f Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Mon, 9 Dec 2024 15:45:23 -0800 Subject: [PATCH 11/22] Re-adding continue on error --- .github/workflows/backend_checks.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/backend_checks.yml b/.github/workflows/backend_checks.yml index eb77b7e21c..e3c7a7c5e2 100644 --- a/.github/workflows/backend_checks.yml +++ b/.github/workflows/backend_checks.yml @@ -255,6 +255,7 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 45 + continue-on-error: true steps: - name: Download container uses: actions/download-artifact@v4 From cae84930ab676366d7b19c923caf8f762bc327da Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Mon, 9 Dec 2024 16:34:53 -0800 Subject: [PATCH 12/22] Adding individual timeouts to tests --- data/saas/dataset/stripe_dataset.yml | 8 ++++++++ dev-requirements.txt | 3 ++- pyproject.toml | 3 ++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/data/saas/dataset/stripe_dataset.yml b/data/saas/dataset/stripe_dataset.yml index 5b26474a13..5d1f101973 100644 --- a/data/saas/dataset/stripe_dataset.yml +++ b/data/saas/dataset/stripe_dataset.yml @@ -7,6 +7,7 @@ dataset: fields: - name: id data_categories: [system.operations] + fidesops_meta: read_only: True data_type: string - name: object @@ -615,6 +616,7 @@ dataset: fields: - name: id data_categories: [system.operations] + fidesops_meta: data_type: string - name: object data_categories: [system.operations] @@ -710,6 +712,7 @@ dataset: fields: - name: id data_categories: [system.operations] + fidesops_meta: data_type: string - name: object data_categories: [system.operations] @@ -748,6 +751,7 @@ dataset: fields: - name: id data_categories: [system.operations] + fidesops_meta: data_type: string - name: object data_categories: [system.operations] @@ -915,6 +919,7 @@ dataset: fields: - name: id data_categories: [system.operations] + fidesops_meta: data_type: string - name: object data_categories: [system.operations] @@ -948,6 +953,7 @@ dataset: fields: - name: id data_categories: [system.operations] + fidesops_meta: data_type: string - name: object data_categories: [system.operations] @@ -1223,6 +1229,7 @@ dataset: fields: - name: id data_categories: [system.operations] + fidesops_meta: data_type: string - name: object data_categories: [system.operations] @@ -1310,6 +1317,7 @@ dataset: fields: - name: id data_categories: [system.operations] + fidesops_meta: data_type: string - name: object data_categories: [system.operations] diff --git a/dev-requirements.txt b/dev-requirements.txt index 149cdcd658..467e7b05a3 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -7,12 +7,13 @@ mypy==1.10.0 nox==2022.8.7 pre-commit==2.20.0 pylint==3.2.5 +pytest==7.2.2 pytest-asyncio==0.19.0 pytest-cov==4.0.0 pytest-env==0.6.2 pytest-mock==3.14.0 pytest-rerunfailures==14.0 -pytest==7.2.2 +pytest-timeout==2.3.1 requests-mock==1.10.0 setuptools>=64.0.2 sqlalchemy-stubs==0.4 diff --git a/pyproject.toml b/pyproject.toml index 087d2b2033..29b929a5e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -175,7 +175,8 @@ addopts = [ "--no-cov-on-fail", "-ra", "-vv", - "--disable-pytest-warnings" + "--disable-pytest-warnings", + "--timeout=300" ] markers = [ "unit: only runs tests that don't require non-python dependencies (i.e. a database)", From bb7714a5f5199017d116182b130e6481cd776df7 Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Mon, 9 Dec 2024 18:09:56 -0800 Subject: [PATCH 13/22] Fixing datasets --- data/dataset/mongo_example_test_dataset.yml | 11 +++++++---- tests/ops/generator/test_data_generator.py | 1 + 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/data/dataset/mongo_example_test_dataset.yml b/data/dataset/mongo_example_test_dataset.yml index ece5817b1a..587e74b317 100644 --- a/data/dataset/mongo_example_test_dataset.yml +++ b/data/dataset/mongo_example_test_dataset.yml @@ -7,6 +7,8 @@ dataset: fields: - name: _id data_categories: [system.operations] + fides_meta: + primary_key: True - name: customer_id data_categories: [user.unique_id] fides_meta: @@ -203,10 +205,11 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - references: - - dataset: mongo_test - field: flights.pilots - direction: from + fides_meta: + references: + - dataset: mongo_test + field: flights.pilots + direction: from - name: name data_categories: [user.name] fides_meta: diff --git a/tests/ops/generator/test_data_generator.py b/tests/ops/generator/test_data_generator.py index 659185d080..d990169f28 100644 --- a/tests/ops/generator/test_data_generator.py +++ b/tests/ops/generator/test_data_generator.py @@ -31,6 +31,7 @@ - name: address fields: - name: id + fides_meta: data_type: integer - name: user_id - name: street From 0d4340139c8e8594c60e5a429bd56c82f5579c2a Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Mon, 9 Dec 2024 18:48:22 -0800 Subject: [PATCH 14/22] Fixing some tests --- dev-requirements.txt | 1 - pyproject.toml | 1 - .../sample_resources/mongo_example_test_dataset.yml | 10 ++++++++++ tests/ops/generator/test_data_generator.py | 1 + 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 467e7b05a3..c51b9369a5 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -13,7 +13,6 @@ pytest-cov==4.0.0 pytest-env==0.6.2 pytest-mock==3.14.0 pytest-rerunfailures==14.0 -pytest-timeout==2.3.1 requests-mock==1.10.0 setuptools>=64.0.2 sqlalchemy-stubs==0.4 diff --git a/pyproject.toml b/pyproject.toml index 29b929a5e2..f1f4963dd0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -176,7 +176,6 @@ addopts = [ "-ra", "-vv", "--disable-pytest-warnings", - "--timeout=300" ] markers = [ "unit: only runs tests that don't require non-python dependencies (i.e. a database)", diff --git a/src/fides/data/sample_project/sample_resources/mongo_example_test_dataset.yml b/src/fides/data/sample_project/sample_resources/mongo_example_test_dataset.yml index 542887d5c7..468b43bbae 100644 --- a/src/fides/data/sample_project/sample_resources/mongo_example_test_dataset.yml +++ b/src/fides/data/sample_project/sample_resources/mongo_example_test_dataset.yml @@ -7,6 +7,8 @@ dataset: fields: - name: _id data_categories: [system.operations] + fides_meta: + primary_key: True - name: customer_id data_categories: [user.unique_id] fides_meta: @@ -104,6 +106,7 @@ dataset: fields: - name: _id data_categories: [system.operations] + fides_meta: data_type: object_id - name: customer_information fields: @@ -135,6 +138,7 @@ dataset: fields: - name: _id data_categories: [system.operations] + fides_meta: data_type: object_id - name: passenger_information fields: @@ -163,6 +167,7 @@ dataset: fields: - name: _id data_categories: [system.operations] + fides_meta: data_type: object_id - name: thread fides_meta: @@ -186,6 +191,7 @@ dataset: fields: - name: _id data_categories: [system.operations] + fides_meta: data_type: object_id - name: email data_categories: [user.contact.email] @@ -194,6 +200,7 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: references: - dataset: mongo_test field: flights.pilots @@ -206,6 +213,7 @@ dataset: fields: - name: _id data_categories: [system.operations] + fides_meta: data_type: object_id - name: planes data_categories: [system.operations] @@ -223,6 +231,7 @@ dataset: fields: - name: _id data_categories: [system.operations] + fides_meta: data_type: object_id - name: billing_address_id data_categories: [system.operations] @@ -246,6 +255,7 @@ dataset: - name: rewards fields: - name: _id + fides_meta: data_type: object_id - name: owner fides_meta: diff --git a/tests/ops/generator/test_data_generator.py b/tests/ops/generator/test_data_generator.py index d990169f28..04441237b7 100644 --- a/tests/ops/generator/test_data_generator.py +++ b/tests/ops/generator/test_data_generator.py @@ -19,6 +19,7 @@ - name: user fields: - name: id + fides_meta: data_type: integer references: - dataset: db From b13632b48e05168ec642693cf83cb9787e8a819c Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Mon, 9 Dec 2024 18:52:00 -0800 Subject: [PATCH 15/22] Fixing MongoDB dataset --- .../sample_resources/mongo_example_test_dataset.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/fides/data/sample_project/sample_resources/mongo_example_test_dataset.yml b/src/fides/data/sample_project/sample_resources/mongo_example_test_dataset.yml index 468b43bbae..ccfbe853a9 100644 --- a/src/fides/data/sample_project/sample_resources/mongo_example_test_dataset.yml +++ b/src/fides/data/sample_project/sample_resources/mongo_example_test_dataset.yml @@ -77,6 +77,7 @@ dataset: fields: - name: _id data_categories: [system.operations] + fides_meta: data_type: object_id - name: customer_identifiers fields: From b0ef57d5020188b3fff15a57bd0852edfb8418ef Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Mon, 9 Dec 2024 19:58:49 -0800 Subject: [PATCH 16/22] Re-adding primary key to mongo_test.customer_details --- tests/ops/task/traversal_data.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/ops/task/traversal_data.py b/tests/ops/task/traversal_data.py index 07c6067d7b..d0ef50ae18 100644 --- a/tests/ops/task/traversal_data.py +++ b/tests/ops/task/traversal_data.py @@ -258,9 +258,7 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase { "name": "customer_details", "fields": [ - { - "name": "_id", - }, + {"name": "_id", "fides_meta": {"primary_key": True}}, { "name": "birthday", "fides_meta": {"data_type": "string"}, From 77a5770a897c5d647308c2449bb46c64bbd5e855 Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Tue, 10 Dec 2024 10:52:32 -0800 Subject: [PATCH 17/22] Splitting out query configs and tests --- .../query_configs/mongodb_query_config.py | 2 +- .../connectors/query_configs/query_config.py | 17 +- .../connectors/test_dynamodb_query_config.py | 129 ++++++ .../connectors/test_mongo_query_config.py | 283 ++++++++++++ .../service/connectors/test_query_config.py | 431 +----------------- .../connectors/test_scylladb_query_config.py | 47 ++ tests/ops/task/traversal_data.py | 79 +++- 7 files changed, 537 insertions(+), 451 deletions(-) create mode 100644 tests/ops/service/connectors/test_dynamodb_query_config.py create mode 100644 tests/ops/service/connectors/test_mongo_query_config.py create mode 100644 tests/ops/service/connectors/test_scylladb_query_config.py diff --git a/src/fides/api/service/connectors/query_configs/mongodb_query_config.py b/src/fides/api/service/connectors/query_configs/mongodb_query_config.py index edb57599db..1a6aa303f0 100644 --- a/src/fides/api/service/connectors/query_configs/mongodb_query_config.py +++ b/src/fides/api/service/connectors/query_configs/mongodb_query_config.py @@ -72,7 +72,7 @@ def generate_update_stmt( where_clauses: Dict[str, Any] = filter_nonempty_values( { field_path.string_path: field.cast(row[field_path.string_path]) - for field_path, field in self.incoming_field_paths.items() + for field_path, field in self.primary_key_field_paths.items() } ) diff --git a/src/fides/api/service/connectors/query_configs/query_config.py b/src/fides/api/service/connectors/query_configs/query_config.py index 2026fe0b0f..4ef115d910 100644 --- a/src/fides/api/service/connectors/query_configs/query_config.py +++ b/src/fides/api/service/connectors/query_configs/query_config.py @@ -101,7 +101,7 @@ def primary_key_field_paths(self) -> Dict[FieldPath, Field]: } @property - def incoming_field_paths(self) -> Dict[FieldPath, Field]: + def reference_field_paths(self) -> Dict[FieldPath, Field]: """Mapping of FieldPaths to Fields that have incoming identity or dataset references""" return { field_path: field @@ -447,10 +447,19 @@ def generate_update_stmt( ) -> Optional[T]: """Returns an update statement in generic SQL-ish dialect.""" update_value_map: Dict[str, Any] = self.update_value_map(row, policy, request) + + non_empty_primary_key_fields: Dict[str, Field] = filter_nonempty_values( + { + fpath.string_path: fld.cast(row[fpath.string_path]) + for fpath, fld in self.primary_key_field_paths.items() + if fpath.string_path in row + } + ) + non_empty_reference_fields: Dict[str, Field] = filter_nonempty_values( { fpath.string_path: fld.cast(row[fpath.string_path]) - for fpath, fld in self.incoming_field_paths.items() + for fpath, fld in self.reference_field_paths.items() if fpath.string_path in row } ) @@ -463,10 +472,10 @@ def generate_update_stmt( update_clauses = self.get_update_clauses( {k: f"masked_{k}" for k in update_value_map}, - non_empty_reference_fields, + non_empty_primary_key_fields or non_empty_reference_fields, ) where_clauses = self.format_key_map_for_update_stmt( - {k: k for k in non_empty_reference_fields} + {k: k for k in non_empty_primary_key_fields or non_empty_reference_fields} ) valid = len(where_clauses) > 0 and len(update_clauses) > 0 diff --git a/tests/ops/service/connectors/test_dynamodb_query_config.py b/tests/ops/service/connectors/test_dynamodb_query_config.py new file mode 100644 index 0000000000..4591ae9385 --- /dev/null +++ b/tests/ops/service/connectors/test_dynamodb_query_config.py @@ -0,0 +1,129 @@ +from datetime import datetime, timezone + +import pytest +from boto3.dynamodb.types import TypeDeserializer +from fideslang.models import Dataset + +from fides.api.graph.config import CollectionAddress +from fides.api.graph.graph import DatasetGraph +from fides.api.graph.traversal import Traversal +from fides.api.models.datasetconfig import convert_dataset_to_graph +from fides.api.models.privacy_request import PrivacyRequest +from fides.api.service.connectors.query_configs.dynamodb_query_config import ( + DynamoDBQueryConfig, +) + +privacy_request = PrivacyRequest(id="234544") + + +class TestDynamoDBQueryConfig: + @pytest.fixture(scope="function") + def identity(self): + identity = {"email": "customer-test_uuid@example.com"} + return identity + + @pytest.fixture(scope="function") + def dataset_graph(self, integration_dynamodb_config, example_datasets): + dataset = Dataset(**example_datasets[11]) + dataset_graph = convert_dataset_to_graph( + dataset, integration_dynamodb_config.key + ) + + return DatasetGraph(*[dataset_graph]) + + @pytest.fixture(scope="function") + def traversal(self, identity, dataset_graph): + dynamo_traversal = Traversal(dataset_graph, identity) + return dynamo_traversal + + @pytest.fixture(scope="function") + def customer_node(self, traversal): + return traversal.traversal_node_dict[ + CollectionAddress("dynamodb_example_test_dataset", "customer") + ].to_mock_execution_node() + + @pytest.fixture(scope="function") + def customer_identifier_node(self, traversal): + return traversal.traversal_node_dict[ + CollectionAddress("dynamodb_example_test_dataset", "customer_identifier") + ].to_mock_execution_node() + + @pytest.fixture(scope="function") + def customer_row(self): + row = { + "customer_email": {"S": "customer-1@example.com"}, + "name": {"S": "John Customer"}, + "address_id": {"L": [{"S": "1"}, {"S": "2"}]}, + "personal_info": {"M": {"gender": {"S": "male"}, "age": {"S": "99"}}}, + "id": {"S": "1"}, + } + return row + + @pytest.fixture(scope="function") + def deserialized_customer_row(self, customer_row): + deserialized_customer_row = {} + deserializer = TypeDeserializer() + for key, value in customer_row.items(): + deserialized_customer_row[key] = deserializer.deserialize(value) + return deserialized_customer_row + + @pytest.fixture(scope="function") + def customer_identifier_row(self): + row = { + "customer_id": {"S": "customer-1@example.com"}, + "email": {"S": "customer-1@example.com"}, + "name": {"S": "Customer 1"}, + "created": {"S": datetime.now(timezone.utc).isoformat()}, + } + return row + + @pytest.fixture(scope="function") + def deserialized_customer_identifier_row(self, customer_identifier_row): + deserialized_customer_identifier_row = {} + deserializer = TypeDeserializer() + for key, value in customer_identifier_row.items(): + deserialized_customer_identifier_row[key] = deserializer.deserialize(value) + return deserialized_customer_identifier_row + + def test_get_query_param_formatting_single_key( + self, + resources_dict, + customer_node, + ) -> None: + input_data = { + "fidesops_grouped_inputs": [], + "email": ["customer-test_uuid@example.com"], + } + attribute_definitions = [{"AttributeName": "email", "AttributeType": "S"}] + query_config = DynamoDBQueryConfig(customer_node, attribute_definitions) + item = query_config.generate_query( + input_data=input_data, policy=resources_dict["policy"] + ) + assert item["ExpressionAttributeValues"] == { + ":value": {"S": "customer-test_uuid@example.com"} + } + assert item["KeyConditionExpression"] == "email = :value" + + def test_put_query_param_formatting_single_key( + self, + erasure_policy, + customer_node, + deserialized_customer_row, + ) -> None: + input_data = { + "fidesops_grouped_inputs": [], + "email": ["customer-test_uuid@example.com"], + } + attribute_definitions = [{"AttributeName": "email", "AttributeType": "S"}] + query_config = DynamoDBQueryConfig(customer_node, attribute_definitions) + update_item = query_config.generate_update_stmt( + deserialized_customer_row, erasure_policy, privacy_request + ) + + assert update_item == { + "customer_email": {"S": "customer-1@example.com"}, + "name": {"NULL": True}, + "address_id": {"L": [{"S": "1"}, {"S": "2"}]}, + "personal_info": {"M": {"gender": {"S": "male"}, "age": {"S": "99"}}}, + "id": {"S": "1"}, + } diff --git a/tests/ops/service/connectors/test_mongo_query_config.py b/tests/ops/service/connectors/test_mongo_query_config.py new file mode 100644 index 0000000000..3912618801 --- /dev/null +++ b/tests/ops/service/connectors/test_mongo_query_config.py @@ -0,0 +1,283 @@ +import pytest +from fideslang.models import Dataset + +from fides.api.graph.config import ( + CollectionAddress, + FieldAddress, + FieldPath, + ObjectField, + ScalarField, +) +from fides.api.graph.graph import DatasetGraph, Edge +from fides.api.graph.traversal import Traversal +from fides.api.models.datasetconfig import convert_dataset_to_graph +from fides.api.models.privacy_request import PrivacyRequest +from fides.api.schemas.masking.masking_configuration import HashMaskingConfiguration +from fides.api.schemas.masking.masking_secrets import MaskingSecretCache, SecretType +from fides.api.service.connectors.query_configs.mongodb_query_config import ( + MongoQueryConfig, +) +from fides.api.service.masking.strategy.masking_strategy_hash import HashMaskingStrategy +from fides.api.util.data_category import DataCategory + +from ...task.traversal_data import combined_mongo_postgresql_graph +from ...test_helpers.cache_secrets_helper import cache_secret + +privacy_request = PrivacyRequest(id="234544") + + +class TestMongoQueryConfig: + @pytest.fixture(scope="function") + def combined_traversal(self, connection_config, integration_mongodb_config): + mongo_dataset, postgres_dataset = combined_mongo_postgresql_graph( + connection_config, integration_mongodb_config + ) + combined_dataset_graph = DatasetGraph(mongo_dataset, postgres_dataset) + combined_traversal = Traversal( + combined_dataset_graph, + {"email": "customer-1@examplecom"}, + ) + return combined_traversal + + @pytest.fixture(scope="function") + def customer_details_node(self, combined_traversal): + return combined_traversal.traversal_node_dict[ + CollectionAddress("mongo_test", "customer_details") + ].to_mock_execution_node() + + @pytest.fixture(scope="function") + def customer_feedback_node(self, combined_traversal): + return combined_traversal.traversal_node_dict[ + CollectionAddress("mongo_test", "customer_feedback") + ].to_mock_execution_node() + + def test_field_map_nested(self, customer_details_node): + config = MongoQueryConfig(customer_details_node) + + field_map = config.field_map() + assert isinstance(field_map[FieldPath("workplace_info")], ObjectField) + assert isinstance( + field_map[FieldPath("workplace_info", "employer")], ScalarField + ) + + def test_primary_key_field_paths(self, customer_details_node): + config = MongoQueryConfig(customer_details_node) + assert list(config.primary_key_field_paths.keys()) == [FieldPath("_id")] + assert isinstance(config.primary_key_field_paths[FieldPath("_id")], ScalarField) + + def test_nested_query_field_paths( + self, customer_details_node, customer_feedback_node + ): + assert customer_details_node.query_field_paths == { + FieldPath("customer_id"), + } + + assert customer_feedback_node.query_field_paths == { + FieldPath("customer_information", "email") + } + + def test_nested_typed_filtered_values(self, customer_feedback_node): + """Identity data is located on a nested object""" + input_data = { + "customer_information.email": ["test@example.com"], + "ignore": ["abcde"], + } + assert customer_feedback_node.typed_filtered_values(input_data) == { + "customer_information.email": ["test@example.com"] + } + + def test_generate_query( + self, + policy, + example_datasets, + integration_mongodb_config, + connection_config, + ): + dataset_postgres = Dataset(**example_datasets[0]) + graph = convert_dataset_to_graph(dataset_postgres, connection_config.key) + dataset_mongo = Dataset(**example_datasets[1]) + mongo_graph = convert_dataset_to_graph( + dataset_mongo, integration_mongodb_config.key + ) + dataset_graph = DatasetGraph(*[graph, mongo_graph]) + traversal = Traversal(dataset_graph, {"email": "customer-1@example.com"}) + # Edge created from Root to nested customer_information.email field + assert ( + Edge( + FieldAddress("__ROOT__", "__ROOT__", "email"), + FieldAddress( + "mongo_test", "customer_feedback", "customer_information", "email" + ), + ) + in traversal.edges + ) + + # Test query on nested field + customer_feedback = traversal.traversal_node_dict[ + CollectionAddress("mongo_test", "customer_feedback") + ].to_mock_execution_node() + config = MongoQueryConfig(customer_feedback) + input_data = {"customer_information.email": ["customer-1@example.com"]} + # Tuple of query, projection - Searching for documents with nested + # customer_information.email = customer-1@example.com + assert config.generate_query(input_data, policy) == ( + {"customer_information.email": "customer-1@example.com"}, + {"_id": 1, "customer_information": 1, "date": 1, "message": 1, "rating": 1}, + ) + + # Test query nested data + customer_details = traversal.traversal_node_dict[ + CollectionAddress("mongo_test", "customer_details") + ].to_mock_execution_node() + config = MongoQueryConfig(customer_details) + input_data = {"customer_id": [1]} + # Tuple of query, projection - Projection is specifying fields at the top-level. Nested data will + # be filtered later. + assert config.generate_query(input_data, policy) == ( + {"customer_id": 1}, + { + "_id": 1, + "birthday": 1, + "comments": 1, + "customer_id": 1, + "customer_uuid": 1, + "emergency_contacts": 1, + "children": 1, + "gender": 1, + "travel_identifiers": 1, + "workplace_info": 1, + }, + ) + + def test_generate_update_stmt_multiple_fields( + self, + erasure_policy, + example_datasets, + integration_mongodb_config, + connection_config, + ): + dataset_postgres = Dataset(**example_datasets[0]) + graph = convert_dataset_to_graph(dataset_postgres, connection_config.key) + dataset_mongo = Dataset(**example_datasets[1]) + mongo_graph = convert_dataset_to_graph( + dataset_mongo, integration_mongodb_config.key + ) + dataset_graph = DatasetGraph(*[graph, mongo_graph]) + + traversal = Traversal(dataset_graph, {"email": "customer-1@example.com"}) + customer_details = traversal.traversal_node_dict[ + CollectionAddress("mongo_test", "customer_details") + ].to_mock_execution_node() + config = MongoQueryConfig(customer_details) + row = { + "birthday": "1988-01-10", + "gender": "male", + "customer_id": 1, + "_id": 1, + "workplace_info": { + "position": "Chief Strategist", + "direct_reports": ["Robbie Margo", "Sully Hunter"], + }, + "emergency_contacts": [{"name": "June Customer", "phone": "444-444-4444"}], + "children": ["Christopher Customer", "Courtney Customer"], + } + + # Make target more broad + rule = erasure_policy.rules[0] + target = rule.targets[0] + target.data_category = DataCategory("user").value + + mongo_statement = config.generate_update_stmt( + row, erasure_policy, privacy_request + ) + + expected_result_0 = {"customer_id": 1} + expected_result_1 = { + "$set": { + "birthday": None, + "children.0": None, + "children.1": None, + "customer_id": None, + "emergency_contacts.0.name": None, + "workplace_info.direct_reports.0": None, # Both direct reports are masked. + "workplace_info.direct_reports.1": None, + "emergency_contacts.0.phone": None, + "gender": None, + "workplace_info.position": None, + } + } + + print(mongo_statement[1]) + print(expected_result_1) + assert mongo_statement[0] == expected_result_0 + assert mongo_statement[1] == expected_result_1 + + def test_generate_update_stmt_multiple_rules( + self, + erasure_policy_two_rules, + example_datasets, + integration_mongodb_config, + connection_config, + ): + dataset_postgres = Dataset(**example_datasets[0]) + graph = convert_dataset_to_graph(dataset_postgres, connection_config.key) + dataset_mongo = Dataset(**example_datasets[1]) + mongo_graph = convert_dataset_to_graph( + dataset_mongo, integration_mongodb_config.key + ) + dataset_graph = DatasetGraph(*[graph, mongo_graph]) + + traversal = Traversal(dataset_graph, {"email": "customer-1@example.com"}) + + customer_details = traversal.traversal_node_dict[ + CollectionAddress("mongo_test", "customer_details") + ].to_mock_execution_node() + + config = MongoQueryConfig(customer_details) + row = { + "birthday": "1988-01-10", + "gender": "male", + "customer_id": 1, + "_id": 1, + "workplace_info": { + "position": "Chief Strategist", + "direct_reports": ["Robbie Margo", "Sully Hunter"], + }, + "emergency_contacts": [{"name": "June Customer", "phone": "444-444-4444"}], + "children": ["Christopher Customer", "Courtney Customer"], + } + + rule = erasure_policy_two_rules.rules[0] + rule.masking_strategy = { + "strategy": "hash", + "configuration": {"algorithm": "SHA-512"}, + } + target = rule.targets[0] + target.data_category = DataCategory("user.demographic.date_of_birth").value + + rule_two = erasure_policy_two_rules.rules[1] + rule_two.masking_strategy = { + "strategy": "random_string_rewrite", + "configuration": {"length": 30}, + } + target = rule_two.targets[0] + target.data_category = DataCategory("user.demographic.gender").value + # cache secrets for hash strategy + secret = MaskingSecretCache[str]( + secret="adobo", + masking_strategy=HashMaskingStrategy.name, + secret_type=SecretType.salt, + ) + cache_secret(secret, privacy_request.id) + + mongo_statement = config.generate_update_stmt( + row, erasure_policy_two_rules, privacy_request + ) + assert mongo_statement[0] == {"customer_id": 1} + assert len(mongo_statement[1]["$set"]["gender"]) == 30 + assert ( + mongo_statement[1]["$set"]["birthday"] + == HashMaskingStrategy(HashMaskingConfiguration(algorithm="SHA-512")).mask( + ["1988-01-10"], request_id=privacy_request.id + )[0] + ) diff --git a/tests/ops/service/connectors/test_query_config.py b/tests/ops/service/connectors/test_query_config.py index 75c9b26c1b..2aa0871255 100644 --- a/tests/ops/service/connectors/test_query_config.py +++ b/tests/ops/service/connectors/test_query_config.py @@ -1,43 +1,28 @@ -from datetime import datetime, timezone from typing import Any, Dict, Set from unittest import mock import pytest -from boto3.dynamodb.types import TypeDeserializer from fideslang.models import Dataset from fides.api.common_exceptions import MissingNamespaceSchemaException -from fides.api.graph.config import ( - CollectionAddress, - FieldAddress, - FieldPath, - ObjectField, - ScalarField, -) +from fides.api.graph.config import CollectionAddress, FieldPath from fides.api.graph.execution import ExecutionNode -from fides.api.graph.graph import DatasetGraph, Edge +from fides.api.graph.graph import DatasetGraph from fides.api.graph.traversal import Traversal, TraversalNode from fides.api.models.datasetconfig import convert_dataset_to_graph from fides.api.models.privacy_request import PrivacyRequest from fides.api.schemas.masking.masking_configuration import HashMaskingConfiguration from fides.api.schemas.masking.masking_secrets import MaskingSecretCache, SecretType from fides.api.schemas.namespace_meta.namespace_meta import NamespaceMeta -from fides.api.service.connectors.query_configs.dynamodb_query_config import ( - DynamoDBQueryConfig, -) -from fides.api.service.connectors.query_configs.mongodb_query_config import ( - MongoQueryConfig, -) from fides.api.service.connectors.query_configs.query_config import ( QueryConfig, SQLQueryConfig, ) -from fides.api.service.connectors.scylla_query_config import ScyllaDBQueryConfig from fides.api.service.masking.strategy.masking_strategy_hash import HashMaskingStrategy from fides.api.util.data_category import DataCategory from tests.fixtures.application_fixtures import load_dataset -from ...task.traversal_data import combined_mongo_postgresql_graph, integration_db_graph +from ...task.traversal_data import integration_db_graph from ...test_helpers.cache_secrets_helper import cache_secret, clear_cache_secrets # customers -> address, order @@ -461,416 +446,6 @@ def test_generate_update_stmts_from_multiple_rules( text_clause._bindparams["masked_email"].value == "*****" ) # String rewrite masking strategy - -class TestMongoQueryConfig: - @pytest.fixture(scope="function") - def combined_traversal(self, connection_config, integration_mongodb_config): - mongo_dataset, postgres_dataset = combined_mongo_postgresql_graph( - connection_config, integration_mongodb_config - ) - combined_dataset_graph = DatasetGraph(mongo_dataset, postgres_dataset) - combined_traversal = Traversal( - combined_dataset_graph, - {"email": "customer-1@examplecom"}, - ) - return combined_traversal - - @pytest.fixture(scope="function") - def customer_details_node(self, combined_traversal): - return combined_traversal.traversal_node_dict[ - CollectionAddress("mongo_test", "customer_details") - ].to_mock_execution_node() - - @pytest.fixture(scope="function") - def customer_feedback_node(self, combined_traversal): - return combined_traversal.traversal_node_dict[ - CollectionAddress("mongo_test", "customer_feedback") - ].to_mock_execution_node() - - def test_field_map_nested(self, customer_details_node): - config = MongoQueryConfig(customer_details_node) - - field_map = config.field_map() - assert isinstance(field_map[FieldPath("workplace_info")], ObjectField) - assert isinstance( - field_map[FieldPath("workplace_info", "employer")], ScalarField - ) - - def test_primary_key_field_paths(self, customer_details_node): - config = MongoQueryConfig(customer_details_node) - assert list(config.primary_key_field_paths.keys()) == [FieldPath("_id")] - assert isinstance(config.primary_key_field_paths[FieldPath("_id")], ScalarField) - - def test_nested_query_field_paths( - self, customer_details_node, customer_feedback_node - ): - assert customer_details_node.query_field_paths == { - FieldPath("customer_id"), - } - - assert customer_feedback_node.query_field_paths == { - FieldPath("customer_information", "email") - } - - def test_nested_typed_filtered_values(self, customer_feedback_node): - """Identity data is located on a nested object""" - input_data = { - "customer_information.email": ["test@example.com"], - "ignore": ["abcde"], - } - assert customer_feedback_node.typed_filtered_values(input_data) == { - "customer_information.email": ["test@example.com"] - } - - def test_generate_query( - self, - policy, - example_datasets, - integration_mongodb_config, - connection_config, - ): - dataset_postgres = Dataset(**example_datasets[0]) - graph = convert_dataset_to_graph(dataset_postgres, connection_config.key) - dataset_mongo = Dataset(**example_datasets[1]) - mongo_graph = convert_dataset_to_graph( - dataset_mongo, integration_mongodb_config.key - ) - dataset_graph = DatasetGraph(*[graph, mongo_graph]) - traversal = Traversal(dataset_graph, {"email": "customer-1@example.com"}) - # Edge created from Root to nested customer_information.email field - assert ( - Edge( - FieldAddress("__ROOT__", "__ROOT__", "email"), - FieldAddress( - "mongo_test", "customer_feedback", "customer_information", "email" - ), - ) - in traversal.edges - ) - - # Test query on nested field - customer_feedback = traversal.traversal_node_dict[ - CollectionAddress("mongo_test", "customer_feedback") - ].to_mock_execution_node() - config = MongoQueryConfig(customer_feedback) - input_data = {"customer_information.email": ["customer-1@example.com"]} - # Tuple of query, projection - Searching for documents with nested - # customer_information.email = customer-1@example.com - assert config.generate_query(input_data, policy) == ( - {"customer_information.email": "customer-1@example.com"}, - {"_id": 1, "customer_information": 1, "date": 1, "message": 1, "rating": 1}, - ) - - # Test query nested data - customer_details = traversal.traversal_node_dict[ - CollectionAddress("mongo_test", "customer_details") - ].to_mock_execution_node() - config = MongoQueryConfig(customer_details) - input_data = {"customer_id": [1]} - # Tuple of query, projection - Projection is specifying fields at the top-level. Nested data will - # be filtered later. - assert config.generate_query(input_data, policy) == ( - {"customer_id": 1}, - { - "_id": 1, - "birthday": 1, - "comments": 1, - "customer_id": 1, - "customer_uuid": 1, - "emergency_contacts": 1, - "children": 1, - "gender": 1, - "travel_identifiers": 1, - "workplace_info": 1, - }, - ) - - def test_generate_update_stmt_multiple_fields( - self, - erasure_policy, - example_datasets, - integration_mongodb_config, - connection_config, - ): - dataset_postgres = Dataset(**example_datasets[0]) - graph = convert_dataset_to_graph(dataset_postgres, connection_config.key) - dataset_mongo = Dataset(**example_datasets[1]) - mongo_graph = convert_dataset_to_graph( - dataset_mongo, integration_mongodb_config.key - ) - dataset_graph = DatasetGraph(*[graph, mongo_graph]) - - traversal = Traversal(dataset_graph, {"email": "customer-1@example.com"}) - customer_details = traversal.traversal_node_dict[ - CollectionAddress("mongo_test", "customer_details") - ].to_mock_execution_node() - config = MongoQueryConfig(customer_details) - row = { - "birthday": "1988-01-10", - "gender": "male", - "customer_id": 1, - "_id": 1, - "workplace_info": { - "position": "Chief Strategist", - "direct_reports": ["Robbie Margo", "Sully Hunter"], - }, - "emergency_contacts": [{"name": "June Customer", "phone": "444-444-4444"}], - "children": ["Christopher Customer", "Courtney Customer"], - } - - # Make target more broad - rule = erasure_policy.rules[0] - target = rule.targets[0] - target.data_category = DataCategory("user").value - - mongo_statement = config.generate_update_stmt( - row, erasure_policy, privacy_request - ) - - expected_result_0 = {"customer_id": 1} - expected_result_1 = { - "$set": { - "birthday": None, - "children.0": None, - "children.1": None, - "customer_id": None, - "emergency_contacts.0.name": None, - "workplace_info.direct_reports.0": None, # Both direct reports are masked. - "workplace_info.direct_reports.1": None, - "emergency_contacts.0.phone": None, - "gender": None, - "workplace_info.position": None, - } - } - - print(mongo_statement[1]) - print(expected_result_1) - assert mongo_statement[0] == expected_result_0 - assert mongo_statement[1] == expected_result_1 - - def test_generate_update_stmt_multiple_rules( - self, - erasure_policy_two_rules, - example_datasets, - integration_mongodb_config, - connection_config, - ): - dataset_postgres = Dataset(**example_datasets[0]) - graph = convert_dataset_to_graph(dataset_postgres, connection_config.key) - dataset_mongo = Dataset(**example_datasets[1]) - mongo_graph = convert_dataset_to_graph( - dataset_mongo, integration_mongodb_config.key - ) - dataset_graph = DatasetGraph(*[graph, mongo_graph]) - - traversal = Traversal(dataset_graph, {"email": "customer-1@example.com"}) - - customer_details = traversal.traversal_node_dict[ - CollectionAddress("mongo_test", "customer_details") - ].to_mock_execution_node() - - config = MongoQueryConfig(customer_details) - row = { - "birthday": "1988-01-10", - "gender": "male", - "customer_id": 1, - "_id": 1, - "workplace_info": { - "position": "Chief Strategist", - "direct_reports": ["Robbie Margo", "Sully Hunter"], - }, - "emergency_contacts": [{"name": "June Customer", "phone": "444-444-4444"}], - "children": ["Christopher Customer", "Courtney Customer"], - } - - rule = erasure_policy_two_rules.rules[0] - rule.masking_strategy = { - "strategy": "hash", - "configuration": {"algorithm": "SHA-512"}, - } - target = rule.targets[0] - target.data_category = DataCategory("user.demographic.date_of_birth").value - - rule_two = erasure_policy_two_rules.rules[1] - rule_two.masking_strategy = { - "strategy": "random_string_rewrite", - "configuration": {"length": 30}, - } - target = rule_two.targets[0] - target.data_category = DataCategory("user.demographic.gender").value - # cache secrets for hash strategy - secret = MaskingSecretCache[str]( - secret="adobo", - masking_strategy=HashMaskingStrategy.name, - secret_type=SecretType.salt, - ) - cache_secret(secret, privacy_request.id) - - mongo_statement = config.generate_update_stmt( - row, erasure_policy_two_rules, privacy_request - ) - assert mongo_statement[0] == {"customer_id": 1} - assert len(mongo_statement[1]["$set"]["gender"]) == 30 - assert ( - mongo_statement[1]["$set"]["birthday"] - == HashMaskingStrategy(HashMaskingConfiguration(algorithm="SHA-512")).mask( - ["1988-01-10"], request_id=privacy_request.id - )[0] - ) - - -class TestDynamoDBQueryConfig: - @pytest.fixture(scope="function") - def identity(self): - identity = {"email": "customer-test_uuid@example.com"} - return identity - - @pytest.fixture(scope="function") - def dataset_graph(self, integration_dynamodb_config, example_datasets): - dataset = Dataset(**example_datasets[11]) - dataset_graph = convert_dataset_to_graph( - dataset, integration_dynamodb_config.key - ) - - return DatasetGraph(*[dataset_graph]) - - @pytest.fixture(scope="function") - def traversal(self, identity, dataset_graph): - dynamo_traversal = Traversal(dataset_graph, identity) - return dynamo_traversal - - @pytest.fixture(scope="function") - def customer_node(self, traversal): - return traversal.traversal_node_dict[ - CollectionAddress("dynamodb_example_test_dataset", "customer") - ].to_mock_execution_node() - - @pytest.fixture(scope="function") - def customer_identifier_node(self, traversal): - return traversal.traversal_node_dict[ - CollectionAddress("dynamodb_example_test_dataset", "customer_identifier") - ].to_mock_execution_node() - - @pytest.fixture(scope="function") - def customer_row(self): - row = { - "customer_email": {"S": "customer-1@example.com"}, - "name": {"S": "John Customer"}, - "address_id": {"L": [{"S": "1"}, {"S": "2"}]}, - "personal_info": {"M": {"gender": {"S": "male"}, "age": {"S": "99"}}}, - "id": {"S": "1"}, - } - return row - - @pytest.fixture(scope="function") - def deserialized_customer_row(self, customer_row): - deserialized_customer_row = {} - deserializer = TypeDeserializer() - for key, value in customer_row.items(): - deserialized_customer_row[key] = deserializer.deserialize(value) - return deserialized_customer_row - - @pytest.fixture(scope="function") - def customer_identifier_row(self): - row = { - "customer_id": {"S": "customer-1@example.com"}, - "email": {"S": "customer-1@example.com"}, - "name": {"S": "Customer 1"}, - "created": {"S": datetime.now(timezone.utc).isoformat()}, - } - return row - - @pytest.fixture(scope="function") - def deserialized_customer_identifier_row(self, customer_identifier_row): - deserialized_customer_identifier_row = {} - deserializer = TypeDeserializer() - for key, value in customer_identifier_row.items(): - deserialized_customer_identifier_row[key] = deserializer.deserialize(value) - return deserialized_customer_identifier_row - - def test_get_query_param_formatting_single_key( - self, - resources_dict, - customer_node, - ) -> None: - input_data = { - "fidesops_grouped_inputs": [], - "email": ["customer-test_uuid@example.com"], - } - attribute_definitions = [{"AttributeName": "email", "AttributeType": "S"}] - query_config = DynamoDBQueryConfig(customer_node, attribute_definitions) - item = query_config.generate_query( - input_data=input_data, policy=resources_dict["policy"] - ) - assert item["ExpressionAttributeValues"] == { - ":value": {"S": "customer-test_uuid@example.com"} - } - assert item["KeyConditionExpression"] == "email = :value" - - def test_put_query_param_formatting_single_key( - self, - erasure_policy, - customer_node, - deserialized_customer_row, - ) -> None: - input_data = { - "fidesops_grouped_inputs": [], - "email": ["customer-test_uuid@example.com"], - } - attribute_definitions = [{"AttributeName": "email", "AttributeType": "S"}] - query_config = DynamoDBQueryConfig(customer_node, attribute_definitions) - update_item = query_config.generate_update_stmt( - deserialized_customer_row, erasure_policy, privacy_request - ) - - assert update_item == { - "customer_email": {"S": "customer-1@example.com"}, - "name": {"NULL": True}, - "address_id": {"L": [{"S": "1"}, {"S": "2"}]}, - "personal_info": {"M": {"gender": {"S": "male"}, "age": {"S": "99"}}}, - "id": {"S": "1"}, - } - - -class TestScyllaDBQueryConfig: - @pytest.fixture(scope="function") - def complete_execution_node( - self, example_datasets, integration_scylladb_config_with_keyspace - ): - dataset = Dataset(**example_datasets[15]) - graph = convert_dataset_to_graph( - dataset, integration_scylladb_config_with_keyspace.key - ) - dataset_graph = DatasetGraph(*[graph]) - identity = {"email": "customer-1@example.com"} - scylla_traversal = Traversal(dataset_graph, identity) - return scylla_traversal.traversal_node_dict[ - CollectionAddress("scylladb_example_test_dataset", "users") - ].to_mock_execution_node() - - def test_dry_run_query_no_data(self, scylladb_execution_node): - query_config = ScyllaDBQueryConfig(scylladb_execution_node) - dry_run_query = query_config.dry_run_query() - assert dry_run_query is None - - def test_dry_run_query_with_data(self, complete_execution_node): - query_config = ScyllaDBQueryConfig(complete_execution_node) - dry_run_query = query_config.dry_run_query() - assert ( - dry_run_query - == "SELECT age, alternative_contacts, ascii_data, big_int_data, do_not_contact, double_data, duration, email, float_data, last_contacted, logins, name, states_lived, timestamp, user_id, uuid FROM users WHERE email = ? ALLOW FILTERING;" - ) - - def test_query_to_str(self, complete_execution_node): - query_config = ScyllaDBQueryConfig(complete_execution_node) - statement = ( - "SELECT name FROM users WHERE email = %(email)s", - {"email": "test@example.com"}, - ) - query_to_str = query_config.query_to_str(statement, {}) - assert query_to_str == "SELECT name FROM users WHERE email = 'test@example.com'" - - class TestSQLLikeQueryConfig: def test_missing_namespace_meta_schema(self): diff --git a/tests/ops/service/connectors/test_scylladb_query_config.py b/tests/ops/service/connectors/test_scylladb_query_config.py new file mode 100644 index 0000000000..3cbc6f493f --- /dev/null +++ b/tests/ops/service/connectors/test_scylladb_query_config.py @@ -0,0 +1,47 @@ +import pytest +from fideslang.models import Dataset + +from fides.api.graph.config import CollectionAddress +from fides.api.graph.graph import DatasetGraph +from fides.api.graph.traversal import Traversal +from fides.api.models.datasetconfig import convert_dataset_to_graph +from fides.api.service.connectors.scylla_query_config import ScyllaDBQueryConfig + + +class TestScyllaDBQueryConfig: + @pytest.fixture(scope="function") + def complete_execution_node( + self, example_datasets, integration_scylladb_config_with_keyspace + ): + dataset = Dataset(**example_datasets[15]) + graph = convert_dataset_to_graph( + dataset, integration_scylladb_config_with_keyspace.key + ) + dataset_graph = DatasetGraph(*[graph]) + identity = {"email": "customer-1@example.com"} + scylla_traversal = Traversal(dataset_graph, identity) + return scylla_traversal.traversal_node_dict[ + CollectionAddress("scylladb_example_test_dataset", "users") + ].to_mock_execution_node() + + def test_dry_run_query_no_data(self, scylladb_execution_node): + query_config = ScyllaDBQueryConfig(scylladb_execution_node) + dry_run_query = query_config.dry_run_query() + assert dry_run_query is None + + def test_dry_run_query_with_data(self, complete_execution_node): + query_config = ScyllaDBQueryConfig(complete_execution_node) + dry_run_query = query_config.dry_run_query() + assert ( + dry_run_query + == "SELECT age, alternative_contacts, ascii_data, big_int_data, do_not_contact, double_data, duration, email, float_data, last_contacted, logins, name, states_lived, timestamp, user_id, uuid FROM users WHERE email = ? ALLOW FILTERING;" + ) + + def test_query_to_str(self, complete_execution_node): + query_config = ScyllaDBQueryConfig(complete_execution_node) + statement = ( + "SELECT name FROM users WHERE email = %(email)s", + {"email": "test@example.com"}, + ) + query_to_str = query_config.query_to_str(statement, {}) + assert query_to_str == "SELECT name FROM users WHERE email = 'test@example.com'" diff --git a/tests/ops/task/traversal_data.py b/tests/ops/task/traversal_data.py index d0ef50ae18..07ff478e3e 100644 --- a/tests/ops/task/traversal_data.py +++ b/tests/ops/task/traversal_data.py @@ -156,7 +156,7 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase { "name": "address", "fields": [ - {"name": "_id"}, + {"name": "_id", "fides_meta": {"primary_key": True}}, { "name": "id", "fides_meta": { @@ -178,7 +178,7 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase { "name": "orders", "fields": [ - {"name": "_id"}, + {"name": "_id", "fides_meta": {"primary_key": True}}, { "name": "customer_id", "fides_meta": { @@ -200,6 +200,7 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase { "name": "_id", "fides_meta": { + "primary_key": True, "data_type": "object_id", }, }, @@ -228,6 +229,7 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase { "name": "_id", "fides_meta": { + "primary_key": True, "data_type": "object_id", }, }, @@ -237,19 +239,27 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase "fields": [ { "name": "comment", - "fides_meta": {"data_type": "string"}, + "fides_meta": { + "data_type": "string", + }, }, { "name": "message", - "fides_meta": {"data_type": "string"}, + "fides_meta": { + "data_type": "string", + }, }, { "name": "chat_name", - "fides_meta": {"data_type": "string"}, + "fides_meta": { + "data_type": "string", + }, }, { "name": "ccn", - "fides_meta": {"data_type": "string"}, + "fides_meta": { + "data_type": "string", + }, }, ], }, @@ -258,7 +268,12 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase { "name": "customer_details", "fields": [ - {"name": "_id", "fides_meta": {"primary_key": True}}, + { + "name": "_id", + "fides_meta": { + "primary_key": True, + }, + }, { "name": "birthday", "fides_meta": {"data_type": "string"}, @@ -304,15 +319,21 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase "fields": [ { "name": "name", - "fides_meta": {"data_type": "string"}, + "fides_meta": { + "data_type": "string", + }, }, { "name": "relationship", - "fides_meta": {"data_type": "string"}, + "fides_meta": { + "data_type": "string", + }, }, { "name": "phone", - "fides_meta": {"data_type": "string"}, + "fides_meta": { + "data_type": "string", + }, }, ], }, @@ -330,11 +351,15 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase "fields": [ { "name": "employer", - "fides_meta": {"data_type": "string"}, + "fides_meta": { + "data_type": "string", + }, }, { "name": "position", - "fides_meta": {"data_type": "string"}, + "fides_meta": { + "data_type": "string", + }, }, { "name": "direct_reports", @@ -350,6 +375,7 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase { "name": "_id", "fides_meta": { + "primary_key": True, "data_type": "object_id", }, }, @@ -366,11 +392,15 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase }, { "name": "phone", - "fides_meta": {"data_type": "string"}, + "fides_meta": { + "data_type": "string", + }, }, { "name": "internal_customer_id", - "fides_meta": {"data_type": "string"}, + "fides_meta": { + "data_type": "string", + }, }, ], }, @@ -394,6 +424,7 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase { "name": "_id", "fides_meta": { + "primary_key": True, "data_type": "object_id", }, }, @@ -407,6 +438,7 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase { "name": "id", "fides_meta": { + "primary_key": True, "references": [ { "dataset": mongo_db_name, @@ -427,7 +459,10 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase "fields": [ { "name": "_id", - "fides_meta": {"data_type": "object_id"}, + "fides_meta": { + "primary_key": True, + "data_type": "object_id", + }, }, { "name": "date", @@ -454,7 +489,9 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase }, { "name": "full_name", - "fides_meta": {"data_type": "string"}, + "fides_meta": { + "data_type": "string", + }, }, ], }, @@ -473,7 +510,10 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase "fields": [ { "name": "_id", - "fides_meta": {"data_type": "object_id"}, + "fides_meta": { + "primary_key": True, + "data_type": "object_id", + }, }, { "name": "customer_identifiers", @@ -520,7 +560,10 @@ def mongo_dataset_dict(mongo_db_name: str, postgres_db_name: str) -> GraphDatase "fields": [ { "name": "_id", - "fides_meta": {"data_type": "object_id"}, + "fides_meta": { + "primary_key": True, + "data_type": "object_id", + }, }, { "name": "owner", From 5d26b2f087b8ebcffd976d33f4ddd94e7507ea75 Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Tue, 10 Dec 2024 14:16:57 -0800 Subject: [PATCH 18/22] Splitting out tests --- .../api/service/connectors/base_connector.py | 5 + .../connectors/query_configs/query_config.py | 1 + .../api/service/connectors/saas_connector.py | 3 + .../integration_tests/test_mariadb_task.py | 101 +++ .../ops/integration_tests/test_mssql_task.py | 101 +++ .../ops/integration_tests/test_mysql_task.py | 101 +++ .../integration_tests/test_scylladb_task.py | 190 +++++ tests/ops/integration_tests/test_sql_task.py | 756 +----------------- .../integration_tests/test_timescale_task.py | 294 +++++++ .../connectors/test_mongo_query_config.py | 4 +- 10 files changed, 801 insertions(+), 755 deletions(-) create mode 100644 tests/ops/integration_tests/test_mariadb_task.py create mode 100644 tests/ops/integration_tests/test_mssql_task.py create mode 100644 tests/ops/integration_tests/test_mysql_task.py create mode 100644 tests/ops/integration_tests/test_scylladb_task.py create mode 100644 tests/ops/integration_tests/test_timescale_task.py diff --git a/src/fides/api/service/connectors/base_connector.py b/src/fides/api/service/connectors/base_connector.py index ca3439f523..4bf46e5eca 100644 --- a/src/fides/api/service/connectors/base_connector.py +++ b/src/fides/api/service/connectors/base_connector.py @@ -132,3 +132,8 @@ def execute_standalone_retrieval_query( raise NotImplementedError( "execute_standalone_retrieval_query must be implemented in a concrete subclass" ) + + @property + def requires_primary_keys(self) -> bool: + """Indicates if datasets linked to this connector require primary keys for erasures. Defaults to True.""" + return True diff --git a/src/fides/api/service/connectors/query_configs/query_config.py b/src/fides/api/service/connectors/query_configs/query_config.py index 4ef115d910..c54eecff85 100644 --- a/src/fides/api/service/connectors/query_configs/query_config.py +++ b/src/fides/api/service/connectors/query_configs/query_config.py @@ -467,6 +467,7 @@ def generate_update_stmt( # Create parameter mappings with masked_ prefix for SET values param_map = { **{f"masked_{k}": v for k, v in update_value_map.items()}, + **non_empty_primary_key_fields, **non_empty_reference_fields, } diff --git a/src/fides/api/service/connectors/saas_connector.py b/src/fides/api/service/connectors/saas_connector.py index b917b6cfda..40a4d8a7eb 100644 --- a/src/fides/api/service/connectors/saas_connector.py +++ b/src/fides/api/service/connectors/saas_connector.py @@ -72,6 +72,9 @@ class SaaSConnector(BaseConnector[AuthenticatedClient], Contextualizable): """A connector type to integrate with third-party SaaS APIs""" + def requires_primary_keys(self) -> bool: + return False + def get_log_context(self) -> Dict[LoggerContextKeys, Any]: return { LoggerContextKeys.system_key: ( diff --git a/tests/ops/integration_tests/test_mariadb_task.py b/tests/ops/integration_tests/test_mariadb_task.py new file mode 100644 index 0000000000..3951a2830a --- /dev/null +++ b/tests/ops/integration_tests/test_mariadb_task.py @@ -0,0 +1,101 @@ +import pytest + +from fides.api.models.privacy_request import ExecutionLog + +from ...conftest import access_runner_tester +from ..graph.graph_test_util import assert_rows_match, records_matching_fields +from ..task.traversal_data import integration_db_graph + + +@pytest.mark.integration_mariadb +@pytest.mark.integration +@pytest.mark.asyncio +@pytest.mark.parametrize( + "dsr_version", + ["use_dsr_3_0", "use_dsr_2_0"], +) +async def test_mariadb_access_request_task( + db, + policy, + connection_config_mariadb, + mariadb_integration_db, + dsr_version, + request, + privacy_request, +) -> None: + request.getfixturevalue(dsr_version) # REQUIRED to test both DSR 3.0 and 2.0 + + v = access_runner_tester( + privacy_request, + policy, + integration_db_graph("my_maria_db_1"), + [connection_config_mariadb], + {"email": "customer-1@example.com"}, + db, + ) + + assert_rows_match( + v["my_maria_db_1:address"], + min_size=2, + keys=["id", "street", "city", "state", "zip"], + ) + assert_rows_match( + v["my_maria_db_1:orders"], + min_size=3, + keys=["id", "customer_id", "shipping_address_id", "payment_card_id"], + ) + assert_rows_match( + v["my_maria_db_1:payment_card"], + min_size=2, + keys=["id", "name", "ccn", "customer_id", "billing_address_id"], + ) + assert_rows_match( + v["my_maria_db_1:customer"], + min_size=1, + keys=["id", "name", "email", "address_id"], + ) + + # links + assert v["my_maria_db_1:customer"][0]["email"] == "customer-1@example.com" + + logs = ( + ExecutionLog.query(db=db) + .filter(ExecutionLog.privacy_request_id == privacy_request.id) + .all() + ) + + logs = [log.__dict__ for log in logs] + assert ( + len( + records_matching_fields( + logs, dataset_name="my_maria_db_1", collection_name="customer" + ) + ) + > 0 + ) + assert ( + len( + records_matching_fields( + logs, dataset_name="my_maria_db_1", collection_name="address" + ) + ) + > 0 + ) + assert ( + len( + records_matching_fields( + logs, dataset_name="my_maria_db_1", collection_name="orders" + ) + ) + > 0 + ) + assert ( + len( + records_matching_fields( + logs, + dataset_name="my_maria_db_1", + collection_name="payment_card", + ) + ) + > 0 + ) diff --git a/tests/ops/integration_tests/test_mssql_task.py b/tests/ops/integration_tests/test_mssql_task.py new file mode 100644 index 0000000000..6bc23eeda0 --- /dev/null +++ b/tests/ops/integration_tests/test_mssql_task.py @@ -0,0 +1,101 @@ +import pytest + +from fides.api.models.privacy_request import ExecutionLog + +from ...conftest import access_runner_tester +from ..graph.graph_test_util import assert_rows_match, records_matching_fields +from ..task.traversal_data import integration_db_graph + + +@pytest.mark.integration_mssql +@pytest.mark.integration +@pytest.mark.asyncio +@pytest.mark.parametrize( + "dsr_version", + ["use_dsr_3_0", "use_dsr_2_0"], +) +async def test_mssql_access_request_task( + db, + policy, + connection_config_mssql, + mssql_integration_db, + privacy_request, + dsr_version, + request, +) -> None: + request.getfixturevalue(dsr_version) # REQUIRED to test both DSR 3.0 and 2.0 + + v = access_runner_tester( + privacy_request, + policy, + integration_db_graph("my_mssql_db_1"), + [connection_config_mssql], + {"email": "customer-1@example.com"}, + db, + ) + + assert_rows_match( + v["my_mssql_db_1:address"], + min_size=2, + keys=["id", "street", "city", "state", "zip"], + ) + assert_rows_match( + v["my_mssql_db_1:orders"], + min_size=3, + keys=["id", "customer_id", "shipping_address_id", "payment_card_id"], + ) + assert_rows_match( + v["my_mssql_db_1:payment_card"], + min_size=2, + keys=["id", "name", "ccn", "customer_id", "billing_address_id"], + ) + assert_rows_match( + v["my_mssql_db_1:customer"], + min_size=1, + keys=["id", "name", "email", "address_id"], + ) + + # links + assert v["my_mssql_db_1:customer"][0]["email"] == "customer-1@example.com" + + logs = ( + ExecutionLog.query(db=db) + .filter(ExecutionLog.privacy_request_id == privacy_request.id) + .all() + ) + + logs = [log.__dict__ for log in logs] + assert ( + len( + records_matching_fields( + logs, dataset_name="my_mssql_db_1", collection_name="customer" + ) + ) + > 0 + ) + assert ( + len( + records_matching_fields( + logs, dataset_name="my_mssql_db_1", collection_name="address" + ) + ) + > 0 + ) + assert ( + len( + records_matching_fields( + logs, dataset_name="my_mssql_db_1", collection_name="orders" + ) + ) + > 0 + ) + assert ( + len( + records_matching_fields( + logs, + dataset_name="my_mssql_db_1", + collection_name="payment_card", + ) + ) + > 0 + ) diff --git a/tests/ops/integration_tests/test_mysql_task.py b/tests/ops/integration_tests/test_mysql_task.py new file mode 100644 index 0000000000..40551dd4d9 --- /dev/null +++ b/tests/ops/integration_tests/test_mysql_task.py @@ -0,0 +1,101 @@ +import pytest + +from fides.api.models.privacy_request import ExecutionLog + +from ...conftest import access_runner_tester +from ..graph.graph_test_util import assert_rows_match, records_matching_fields +from ..task.traversal_data import integration_db_graph + + +@pytest.mark.integration +@pytest.mark.integration_mysql +@pytest.mark.asyncio +@pytest.mark.parametrize( + "dsr_version", + ["use_dsr_3_0", "use_dsr_2_0"], +) +async def test_mysql_access_request_task( + db, + policy, + connection_config_mysql, + mysql_integration_db, + privacy_request, + dsr_version, + request, +) -> None: + request.getfixturevalue(dsr_version) # REQUIRED to test both DSR 3.0 and 2.0 + + v = access_runner_tester( + privacy_request, + policy, + integration_db_graph("my_mysql_db_1"), + [connection_config_mysql], + {"email": "customer-1@example.com"}, + db, + ) + + assert_rows_match( + v["my_mysql_db_1:address"], + min_size=2, + keys=["id", "street", "city", "state", "zip"], + ) + assert_rows_match( + v["my_mysql_db_1:orders"], + min_size=3, + keys=["id", "customer_id", "shipping_address_id", "payment_card_id"], + ) + assert_rows_match( + v["my_mysql_db_1:payment_card"], + min_size=2, + keys=["id", "name", "ccn", "customer_id", "billing_address_id"], + ) + assert_rows_match( + v["my_mysql_db_1:customer"], + min_size=1, + keys=["id", "name", "email", "address_id"], + ) + + # links + assert v["my_mysql_db_1:customer"][0]["email"] == "customer-1@example.com" + + logs = ( + ExecutionLog.query(db=db) + .filter(ExecutionLog.privacy_request_id == privacy_request.id) + .all() + ) + + logs = [log.__dict__ for log in logs] + assert ( + len( + records_matching_fields( + logs, dataset_name="my_mysql_db_1", collection_name="customer" + ) + ) + > 0 + ) + assert ( + len( + records_matching_fields( + logs, dataset_name="my_mysql_db_1", collection_name="address" + ) + ) + > 0 + ) + assert ( + len( + records_matching_fields( + logs, dataset_name="my_mysql_db_1", collection_name="orders" + ) + ) + > 0 + ) + assert ( + len( + records_matching_fields( + logs, + dataset_name="my_mysql_db_1", + collection_name="payment_card", + ) + ) + > 0 + ) diff --git a/tests/ops/integration_tests/test_scylladb_task.py b/tests/ops/integration_tests/test_scylladb_task.py new file mode 100644 index 0000000000..8ced1317ad --- /dev/null +++ b/tests/ops/integration_tests/test_scylladb_task.py @@ -0,0 +1,190 @@ +import pytest +from sqlalchemy.orm import Session + +from fides.api.models.privacy_request import ExecutionLogStatus, PrivacyRequest +from fides.api.service.connectors.scylla_connector import ScyllaConnectorMissingKeyspace +from fides.api.task.graph_task import get_cached_data_for_erasures + +from ...conftest import access_runner_tester, erasure_runner_tester +from ..graph.graph_test_util import assert_rows_match, erasure_policy +from ..task.traversal_data import integration_scylladb_graph + + +@pytest.mark.integration +@pytest.mark.integration_scylladb +@pytest.mark.asyncio +class TestScyllaDSRs: + @pytest.mark.parametrize( + "dsr_version", + ["use_dsr_2_0"], + ) + async def test_scylladb_access_request_task_no_keyspace_dsr2( + self, + db: Session, + policy, + integration_scylladb_config, + scylladb_integration_no_keyspace, + privacy_request, + dsr_version, + request, + ) -> None: + request.getfixturevalue(dsr_version) + + with pytest.raises(ScyllaConnectorMissingKeyspace) as err: + v = access_runner_tester( + privacy_request, + policy, + integration_scylladb_graph("scylla_example"), + [integration_scylladb_config], + {"email": "customer-1@example.com"}, + db, + ) + + assert ( + "No keyspace provided in the ScyllaDB configuration for connector scylla_example" + in str(err.value) + ) + + @pytest.mark.parametrize( + "dsr_version", + ["use_dsr_3_0"], + ) + async def test_scylladb_access_request_task_no_keyspace_dsr3( + self, + db, + policy, + integration_scylladb_config, + scylladb_integration_no_keyspace, + privacy_request: PrivacyRequest, + dsr_version, + request, + ) -> None: + request.getfixturevalue(dsr_version) + v = access_runner_tester( + privacy_request, + policy, + integration_scylladb_graph("scylla_example"), + [integration_scylladb_config], + {"email": "customer-1@example.com"}, + db, + ) + + assert v == {} + assert ( + privacy_request.access_tasks.count() == 6 + ) # There's 4 tables plus the root and terminal "dummy" tasks + + # Root task should be completed + assert privacy_request.access_tasks.first().collection_name == "__ROOT__" + assert ( + privacy_request.access_tasks.first().status == ExecutionLogStatus.complete + ) + + # All other tasks should be error + for access_task in privacy_request.access_tasks.offset(1): + assert access_task.status == ExecutionLogStatus.error + + @pytest.mark.parametrize( + "dsr_version", + ["use_dsr_2_0", "use_dsr_3_0"], + ) + async def test_scylladb_access_request_task( + self, + db, + policy, + integration_scylladb_config_with_keyspace, + scylla_reset_db, + scylladb_integration_with_keyspace, + privacy_request, + dsr_version, + request, + ) -> None: + request.getfixturevalue(dsr_version) # REQUIRED to test both DSR 3.0 and 2.0 + + results = access_runner_tester( + privacy_request, + policy, + integration_scylladb_graph("scylla_example_with_keyspace"), + [integration_scylladb_config_with_keyspace], + {"email": "customer-1@example.com"}, + db, + ) + + assert_rows_match( + results["scylla_example_with_keyspace:users"], + min_size=1, + keys=[ + "age", + "alternative_contacts", + "do_not_contact", + "email", + "name", + "last_contacted", + "logins", + "states_lived", + ], + ) + assert_rows_match( + results["scylla_example_with_keyspace:user_activity"], + min_size=3, + keys=["timestamp", "user_agent", "activity_type"], + ) + assert_rows_match( + results["scylla_example_with_keyspace:payment_methods"], + min_size=2, + keys=["card_number", "expiration_date"], + ) + assert_rows_match( + results["scylla_example_with_keyspace:orders"], + min_size=2, + keys=["order_amount", "order_date", "order_description"], + ) + + @pytest.mark.parametrize( + "dsr_version", + ["use_dsr_2_0", "use_dsr_3_0"], + ) + async def test_scylladb_erasure_task( + self, + db, + integration_scylladb_config_with_keyspace, + scylladb_integration_with_keyspace, + scylla_reset_db, + privacy_request, + dsr_version, + request, + ): + request.getfixturevalue(dsr_version) # REQUIRED to test both DSR 3.0 and 2.0 + + seed_email = "customer-1@example.com" + + policy = erasure_policy( + db, "user.name", "user.behavior", "user.device", "user.payment" + ) + privacy_request.policy_id = policy.id + privacy_request.save(db) + + graph = integration_scylladb_graph("scylla_example_with_keyspace") + access_runner_tester( + privacy_request, + policy, + integration_scylladb_graph("scylla_example_with_keyspace"), + [integration_scylladb_config_with_keyspace], + {"email": seed_email}, + db, + ) + results = erasure_runner_tester( + privacy_request, + policy, + graph, + [integration_scylladb_config_with_keyspace], + {"email": seed_email}, + get_cached_data_for_erasures(privacy_request.id), + db, + ) + assert results == { + "scylla_example_with_keyspace:user_activity": 3, + "scylla_example_with_keyspace:users": 1, + "scylla_example_with_keyspace:payment_methods": 2, + "scylla_example_with_keyspace:orders": 2, + } diff --git a/tests/ops/integration_tests/test_sql_task.py b/tests/ops/integration_tests/test_sql_task.py index b349040988..cd4bb1551f 100644 --- a/tests/ops/integration_tests/test_sql_task.py +++ b/tests/ops/integration_tests/test_sql_task.py @@ -6,7 +6,6 @@ import pytest from fideslang import Dataset from sqlalchemy import text -from sqlalchemy.orm import Session from fides.api.graph.config import Collection, FieldAddress, GraphDataset, ScalarField from fides.api.graph.data_type import DataType, StringTypeConverter @@ -15,14 +14,8 @@ from fides.api.models.connectionconfig import ConnectionConfig from fides.api.models.datasetconfig import convert_dataset_to_graph from fides.api.models.policy import ActionType, Policy, Rule, RuleTarget -from fides.api.models.privacy_request import ( - ExecutionLog, - ExecutionLogStatus, - PrivacyRequest, - RequestTask, -) +from fides.api.models.privacy_request import ExecutionLog, RequestTask from fides.api.service.connectors import get_connector -from fides.api.service.connectors.scylla_connector import ScyllaConnectorMissingKeyspace from fides.api.task.filter_results import filter_data_categories from fides.api.task.graph_task import get_cached_data_for_erasures from fides.config import CONFIG @@ -35,12 +28,7 @@ field, records_matching_fields, ) -from ..task.traversal_data import ( - integration_db_graph, - integration_scylladb_graph, - postgres_db_graph_dataset, - str_converter, -) +from ..task.traversal_data import integration_db_graph, postgres_db_graph_dataset @pytest.mark.integration_postgres @@ -497,468 +485,7 @@ async def test_postgres_privacy_requests_against_non_default_schema( assert johanna_record.name is None # Masked by erasure request -@pytest.mark.integration_mssql -@pytest.mark.integration -@pytest.mark.asyncio -@pytest.mark.parametrize( - "dsr_version", - ["use_dsr_3_0", "use_dsr_2_0"], -) -async def test_mssql_access_request_task( - db, - policy, - connection_config_mssql, - mssql_integration_db, - privacy_request, - dsr_version, - request, -) -> None: - request.getfixturevalue(dsr_version) # REQUIRED to test both DSR 3.0 and 2.0 - - v = access_runner_tester( - privacy_request, - policy, - integration_db_graph("my_mssql_db_1"), - [connection_config_mssql], - {"email": "customer-1@example.com"}, - db, - ) - - assert_rows_match( - v["my_mssql_db_1:address"], - min_size=2, - keys=["id", "street", "city", "state", "zip"], - ) - assert_rows_match( - v["my_mssql_db_1:orders"], - min_size=3, - keys=["id", "customer_id", "shipping_address_id", "payment_card_id"], - ) - assert_rows_match( - v["my_mssql_db_1:payment_card"], - min_size=2, - keys=["id", "name", "ccn", "customer_id", "billing_address_id"], - ) - assert_rows_match( - v["my_mssql_db_1:customer"], - min_size=1, - keys=["id", "name", "email", "address_id"], - ) - - # links - assert v["my_mssql_db_1:customer"][0]["email"] == "customer-1@example.com" - - logs = ( - ExecutionLog.query(db=db) - .filter(ExecutionLog.privacy_request_id == privacy_request.id) - .all() - ) - - logs = [log.__dict__ for log in logs] - assert ( - len( - records_matching_fields( - logs, dataset_name="my_mssql_db_1", collection_name="customer" - ) - ) - > 0 - ) - assert ( - len( - records_matching_fields( - logs, dataset_name="my_mssql_db_1", collection_name="address" - ) - ) - > 0 - ) - assert ( - len( - records_matching_fields( - logs, dataset_name="my_mssql_db_1", collection_name="orders" - ) - ) - > 0 - ) - assert ( - len( - records_matching_fields( - logs, - dataset_name="my_mssql_db_1", - collection_name="payment_card", - ) - ) - > 0 - ) - - -@pytest.mark.integration -@pytest.mark.integration_mysql -@pytest.mark.asyncio -@pytest.mark.parametrize( - "dsr_version", - ["use_dsr_3_0", "use_dsr_2_0"], -) -async def test_mysql_access_request_task( - db, - policy, - connection_config_mysql, - mysql_integration_db, - privacy_request, - dsr_version, - request, -) -> None: - request.getfixturevalue(dsr_version) # REQUIRED to test both DSR 3.0 and 2.0 - - v = access_runner_tester( - privacy_request, - policy, - integration_db_graph("my_mysql_db_1"), - [connection_config_mysql], - {"email": "customer-1@example.com"}, - db, - ) - - assert_rows_match( - v["my_mysql_db_1:address"], - min_size=2, - keys=["id", "street", "city", "state", "zip"], - ) - assert_rows_match( - v["my_mysql_db_1:orders"], - min_size=3, - keys=["id", "customer_id", "shipping_address_id", "payment_card_id"], - ) - assert_rows_match( - v["my_mysql_db_1:payment_card"], - min_size=2, - keys=["id", "name", "ccn", "customer_id", "billing_address_id"], - ) - assert_rows_match( - v["my_mysql_db_1:customer"], - min_size=1, - keys=["id", "name", "email", "address_id"], - ) - - # links - assert v["my_mysql_db_1:customer"][0]["email"] == "customer-1@example.com" - - logs = ( - ExecutionLog.query(db=db) - .filter(ExecutionLog.privacy_request_id == privacy_request.id) - .all() - ) - - logs = [log.__dict__ for log in logs] - assert ( - len( - records_matching_fields( - logs, dataset_name="my_mysql_db_1", collection_name="customer" - ) - ) - > 0 - ) - assert ( - len( - records_matching_fields( - logs, dataset_name="my_mysql_db_1", collection_name="address" - ) - ) - > 0 - ) - assert ( - len( - records_matching_fields( - logs, dataset_name="my_mysql_db_1", collection_name="orders" - ) - ) - > 0 - ) - assert ( - len( - records_matching_fields( - logs, - dataset_name="my_mysql_db_1", - collection_name="payment_card", - ) - ) - > 0 - ) - - -@pytest.mark.integration_mariadb -@pytest.mark.integration -@pytest.mark.asyncio -@pytest.mark.parametrize( - "dsr_version", - ["use_dsr_3_0", "use_dsr_2_0"], -) -async def test_mariadb_access_request_task( - db, - policy, - connection_config_mariadb, - mariadb_integration_db, - dsr_version, - request, - privacy_request, -) -> None: - request.getfixturevalue(dsr_version) # REQUIRED to test both DSR 3.0 and 2.0 - - v = access_runner_tester( - privacy_request, - policy, - integration_db_graph("my_maria_db_1"), - [connection_config_mariadb], - {"email": "customer-1@example.com"}, - db, - ) - - assert_rows_match( - v["my_maria_db_1:address"], - min_size=2, - keys=["id", "street", "city", "state", "zip"], - ) - assert_rows_match( - v["my_maria_db_1:orders"], - min_size=3, - keys=["id", "customer_id", "shipping_address_id", "payment_card_id"], - ) - assert_rows_match( - v["my_maria_db_1:payment_card"], - min_size=2, - keys=["id", "name", "ccn", "customer_id", "billing_address_id"], - ) - assert_rows_match( - v["my_maria_db_1:customer"], - min_size=1, - keys=["id", "name", "email", "address_id"], - ) - - # links - assert v["my_maria_db_1:customer"][0]["email"] == "customer-1@example.com" - - logs = ( - ExecutionLog.query(db=db) - .filter(ExecutionLog.privacy_request_id == privacy_request.id) - .all() - ) - - logs = [log.__dict__ for log in logs] - assert ( - len( - records_matching_fields( - logs, dataset_name="my_maria_db_1", collection_name="customer" - ) - ) - > 0 - ) - assert ( - len( - records_matching_fields( - logs, dataset_name="my_maria_db_1", collection_name="address" - ) - ) - > 0 - ) - assert ( - len( - records_matching_fields( - logs, dataset_name="my_maria_db_1", collection_name="orders" - ) - ) - > 0 - ) - assert ( - len( - records_matching_fields( - logs, - dataset_name="my_maria_db_1", - collection_name="payment_card", - ) - ) - > 0 - ) - - -@pytest.mark.integration -@pytest.mark.integration_scylladb -@pytest.mark.asyncio -class TestScyllaDSRs: - @pytest.mark.parametrize( - "dsr_version", - ["use_dsr_2_0"], - ) - async def test_scylladb_access_request_task_no_keyspace_dsr2( - self, - db: Session, - policy, - integration_scylladb_config, - scylladb_integration_no_keyspace, - privacy_request, - dsr_version, - request, - ) -> None: - request.getfixturevalue(dsr_version) - - with pytest.raises(ScyllaConnectorMissingKeyspace) as err: - v = access_runner_tester( - privacy_request, - policy, - integration_scylladb_graph("scylla_example"), - [integration_scylladb_config], - {"email": "customer-1@example.com"}, - db, - ) - - assert ( - "No keyspace provided in the ScyllaDB configuration for connector scylla_example" - in str(err.value) - ) - - @pytest.mark.parametrize( - "dsr_version", - ["use_dsr_3_0"], - ) - async def test_scylladb_access_request_task_no_keyspace_dsr3( - self, - db, - policy, - integration_scylladb_config, - scylladb_integration_no_keyspace, - privacy_request: PrivacyRequest, - dsr_version, - request, - ) -> None: - request.getfixturevalue(dsr_version) - v = access_runner_tester( - privacy_request, - policy, - integration_scylladb_graph("scylla_example"), - [integration_scylladb_config], - {"email": "customer-1@example.com"}, - db, - ) - - assert v == {} - assert ( - privacy_request.access_tasks.count() == 6 - ) # There's 4 tables plus the root and terminal "dummy" tasks - - # Root task should be completed - assert privacy_request.access_tasks.first().collection_name == "__ROOT__" - assert ( - privacy_request.access_tasks.first().status == ExecutionLogStatus.complete - ) - - # All other tasks should be error - for access_task in privacy_request.access_tasks.offset(1): - assert access_task.status == ExecutionLogStatus.error - - @pytest.mark.parametrize( - "dsr_version", - ["use_dsr_2_0", "use_dsr_3_0"], - ) - async def test_scylladb_access_request_task( - self, - db, - policy, - integration_scylladb_config_with_keyspace, - scylla_reset_db, - scylladb_integration_with_keyspace, - privacy_request, - dsr_version, - request, - ) -> None: - request.getfixturevalue(dsr_version) # REQUIRED to test both DSR 3.0 and 2.0 - - results = access_runner_tester( - privacy_request, - policy, - integration_scylladb_graph("scylla_example_with_keyspace"), - [integration_scylladb_config_with_keyspace], - {"email": "customer-1@example.com"}, - db, - ) - - assert_rows_match( - results["scylla_example_with_keyspace:users"], - min_size=1, - keys=[ - "age", - "alternative_contacts", - "do_not_contact", - "email", - "name", - "last_contacted", - "logins", - "states_lived", - ], - ) - assert_rows_match( - results["scylla_example_with_keyspace:user_activity"], - min_size=3, - keys=["timestamp", "user_agent", "activity_type"], - ) - assert_rows_match( - results["scylla_example_with_keyspace:payment_methods"], - min_size=2, - keys=["card_number", "expiration_date"], - ) - assert_rows_match( - results["scylla_example_with_keyspace:orders"], - min_size=2, - keys=["order_amount", "order_date", "order_description"], - ) - - @pytest.mark.parametrize( - "dsr_version", - ["use_dsr_2_0", "use_dsr_3_0"], - ) - async def test_scylladb_erasure_task( - self, - db, - integration_scylladb_config_with_keyspace, - scylladb_integration_with_keyspace, - scylla_reset_db, - privacy_request, - dsr_version, - request, - ): - request.getfixturevalue(dsr_version) # REQUIRED to test both DSR 3.0 and 2.0 - - seed_email = "customer-1@example.com" - - policy = erasure_policy( - db, "user.name", "user.behavior", "user.device", "user.payment" - ) - privacy_request.policy_id = policy.id - privacy_request.save(db) - - graph = integration_scylladb_graph("scylla_example_with_keyspace") - access_runner_tester( - privacy_request, - policy, - integration_scylladb_graph("scylla_example_with_keyspace"), - [integration_scylladb_config_with_keyspace], - {"email": seed_email}, - db, - ) - results = erasure_runner_tester( - privacy_request, - policy, - graph, - [integration_scylladb_config_with_keyspace], - {"email": seed_email}, - get_cached_data_for_erasures(privacy_request.id), - db, - ) - assert results == { - "scylla_example_with_keyspace:user_activity": 3, - "scylla_example_with_keyspace:users": 1, - "scylla_example_with_keyspace:payment_methods": 2, - "scylla_example_with_keyspace:orders": 2, - } - - +@pytest.mark.integration_postgres @pytest.mark.integration @pytest.mark.asyncio @pytest.mark.parametrize( @@ -1595,280 +1122,3 @@ async def test_retry_erasure( "error", "error", } - - -@pytest.mark.integration_timescale -@pytest.mark.integration -@pytest.mark.asyncio -@pytest.mark.parametrize( - "dsr_version", - ["use_dsr_3_0", "use_dsr_2_0"], -) -async def test_timescale_access_request_task( - db, - policy, - timescale_connection_config, - timescale_integration_db, - privacy_request, - dsr_version, - request, -) -> None: - database_name = "my_timescale_db_1" - request.getfixturevalue(dsr_version) # REQUIRED to test both DSR 3.0 and 2.0 - - v = access_runner_tester( - privacy_request, - policy, - integration_db_graph(database_name), - [timescale_connection_config], - {"email": "customer-1@example.com"}, - db, - ) - - assert_rows_match( - v[f"{database_name}:address"], - min_size=2, - keys=["id", "street", "city", "state", "zip"], - ) - assert_rows_match( - v[f"{database_name}:orders"], - min_size=3, - keys=["id", "customer_id", "shipping_address_id", "payment_card_id"], - ) - assert_rows_match( - v[f"{database_name}:payment_card"], - min_size=2, - keys=["id", "name", "ccn", "customer_id", "billing_address_id"], - ) - assert_rows_match( - v[f"{database_name}:customer"], - min_size=1, - keys=["id", "name", "email", "address_id"], - ) - - # links - assert v[f"{database_name}:customer"][0]["email"] == "customer-1@example.com" - - logs = ( - ExecutionLog.query(db=db) - .filter(ExecutionLog.privacy_request_id == privacy_request.id) - .all() - ) - - logs = [log.__dict__ for log in logs] - - assert ( - len( - records_matching_fields( - logs, dataset_name=database_name, collection_name="customer" - ) - ) - > 0 - ) - - assert ( - len( - records_matching_fields( - logs, dataset_name=database_name, collection_name="address" - ) - ) - > 0 - ) - - assert ( - len( - records_matching_fields( - logs, dataset_name=database_name, collection_name="orders" - ) - ) - > 0 - ) - - assert ( - len( - records_matching_fields( - logs, - dataset_name=database_name, - collection_name="payment_card", - ) - ) - > 0 - ) - - -@pytest.mark.integration_timescale -@pytest.mark.integration -@pytest.mark.asyncio -@pytest.mark.parametrize( - "dsr_version", - ["use_dsr_3_0", "use_dsr_2_0"], -) -async def test_timescale_erasure_request_task( - db, - erasure_policy, - timescale_connection_config, - timescale_integration_db, - privacy_request_with_erasure_policy, - dsr_version, - request, -) -> None: - request.getfixturevalue(dsr_version) # REQUIRED to test both DSR 3.0 and 2.0 - - rule = erasure_policy.rules[0] - target = rule.targets[0] - target.data_category = "user" - target.save(db) - - database_name = "my_timescale_db_1" - - dataset = postgres_db_graph_dataset(database_name, timescale_connection_config.key) - - # Set some data categories on fields that will be targeted by the policy above - field([dataset], database_name, "customer", "name").data_categories = ["user.name"] - field([dataset], database_name, "address", "street").data_categories = ["user"] - field([dataset], database_name, "payment_card", "ccn").data_categories = ["user"] - - graph = DatasetGraph(dataset) - - v = access_runner_tester( - privacy_request_with_erasure_policy, - erasure_policy, - graph, - [timescale_connection_config], - {"email": "customer-1@example.com"}, - db, - ) - - v = erasure_runner_tester( - privacy_request_with_erasure_policy, - erasure_policy, - graph, - [timescale_connection_config], - {"email": "customer-1@example.com"}, - get_cached_data_for_erasures(privacy_request_with_erasure_policy.id), - db, - ) - assert v == { - f"{database_name}:customer": 1, - f"{database_name}:orders": 0, - f"{database_name}:payment_card": 2, - f"{database_name}:address": 2, - }, "No erasure on orders table - no data categories targeted" - - # Verify masking in appropriate tables - address_cursor = timescale_integration_db.execute( - text("select * from address where id in (1, 2)") - ) - for address in address_cursor: - assert address.street is None # Masked due to matching data category - assert address.state is not None - assert address.city is not None - assert address.zip is not None - - customer_cursor = timescale_integration_db.execute( - text("select * from customer where id = 1") - ) - customer = [customer for customer in customer_cursor][0] - assert customer.name is None # Masked due to matching data category - assert customer.email == "customer-1@example.com" - assert customer.address_id is not None - - payment_card_cursor = timescale_integration_db.execute( - text("select * from payment_card where id in ('pay_aaa-aaa', 'pay_bbb-bbb')") - ) - payment_cards = [card for card in payment_card_cursor] - assert all( - [card.ccn is None for card in payment_cards] - ) # Masked due to matching data category - assert not any([card.name is None for card in payment_cards]) is None - - -@pytest.mark.integration_timescale -@pytest.mark.integration -@pytest.mark.asyncio -@pytest.mark.parametrize( - "dsr_version", - ["use_dsr_3_0", "use_dsr_2_0"], -) -async def test_timescale_query_and_mask_hypertable( - db, - erasure_policy, - timescale_connection_config, - timescale_integration_db, - privacy_request_with_erasure_policy, - dsr_version, - request, -) -> None: - request.getfixturevalue(dsr_version) # REQUIRED to test both DSR 3.0 and 2.0 - - database_name = "my_timescale_db_1" - - dataset = postgres_db_graph_dataset(database_name, timescale_connection_config.key) - # For this test, add a new collection to our standard dataset corresponding to the - # "onsite_personnel" timescale hypertable - onsite_personnel_collection = Collection( - name="onsite_personnel", - fields=[ - ScalarField( - name="responsible", data_type_converter=str_converter, identity="email" - ), - ScalarField( - name="time", data_type_converter=str_converter, primary_key=True - ), - ], - ) - - dataset.collections.append(onsite_personnel_collection) - graph = DatasetGraph(dataset) - rule = erasure_policy.rules[0] - target = rule.targets[0] - target.data_category = "user" - target.save(db) - # Update data category on responsible field - field( - [dataset], database_name, "onsite_personnel", "responsible" - ).data_categories = ["user.contact.email"] - - access_results = access_runner_tester( - privacy_request_with_erasure_policy, - erasure_policy, - graph, - [timescale_connection_config], - {"email": "employee-1@example.com"}, - db, - ) - - # Demonstrate hypertable can be queried - assert access_results[f"{database_name}:onsite_personnel"] == [ - {"responsible": "employee-1@example.com", "time": datetime(2022, 1, 1, 9, 0)}, - {"responsible": "employee-1@example.com", "time": datetime(2022, 1, 2, 9, 0)}, - {"responsible": "employee-1@example.com", "time": datetime(2022, 1, 3, 9, 0)}, - {"responsible": "employee-1@example.com", "time": datetime(2022, 1, 5, 9, 0)}, - ] - - # Run an erasure on the hypertable targeting the responsible field - v = erasure_runner_tester( - privacy_request_with_erasure_policy, - erasure_policy, - graph, - [timescale_connection_config], - {"email": "employee-1@example.com"}, - get_cached_data_for_erasures(privacy_request_with_erasure_policy.id), - db, - ) - - assert v == { - f"{database_name}:customer": 0, - f"{database_name}:orders": 0, - f"{database_name}:payment_card": 0, - f"{database_name}:address": 0, - f"{database_name}:onsite_personnel": 4, - }, "onsite_personnel.responsible was the only targeted data category" - - personnel_records = timescale_integration_db.execute( - text("select * from onsite_personnel") - ) - for record in personnel_records: - assert ( - record.responsible != "employee-1@example.com" - ) # These emails have all been masked diff --git a/tests/ops/integration_tests/test_timescale_task.py b/tests/ops/integration_tests/test_timescale_task.py new file mode 100644 index 0000000000..97af65ce65 --- /dev/null +++ b/tests/ops/integration_tests/test_timescale_task.py @@ -0,0 +1,294 @@ +from datetime import datetime + +import pytest +from sqlalchemy import text + +from fides.api.graph.config import Collection, ScalarField +from fides.api.graph.graph import DatasetGraph +from fides.api.models.privacy_request import ExecutionLog +from fides.api.task.graph_task import get_cached_data_for_erasures + +from ...conftest import access_runner_tester, erasure_runner_tester +from ..graph.graph_test_util import assert_rows_match, field, records_matching_fields +from ..task.traversal_data import ( + integration_db_graph, + postgres_db_graph_dataset, + str_converter, +) + + +@pytest.mark.integration_timescale +@pytest.mark.integration +@pytest.mark.asyncio +@pytest.mark.parametrize( + "dsr_version", + ["use_dsr_3_0", "use_dsr_2_0"], +) +async def test_timescale_access_request_task( + db, + policy, + timescale_connection_config, + timescale_integration_db, + privacy_request, + dsr_version, + request, +) -> None: + database_name = "my_timescale_db_1" + request.getfixturevalue(dsr_version) # REQUIRED to test both DSR 3.0 and 2.0 + + v = access_runner_tester( + privacy_request, + policy, + integration_db_graph(database_name), + [timescale_connection_config], + {"email": "customer-1@example.com"}, + db, + ) + + assert_rows_match( + v[f"{database_name}:address"], + min_size=2, + keys=["id", "street", "city", "state", "zip"], + ) + assert_rows_match( + v[f"{database_name}:orders"], + min_size=3, + keys=["id", "customer_id", "shipping_address_id", "payment_card_id"], + ) + assert_rows_match( + v[f"{database_name}:payment_card"], + min_size=2, + keys=["id", "name", "ccn", "customer_id", "billing_address_id"], + ) + assert_rows_match( + v[f"{database_name}:customer"], + min_size=1, + keys=["id", "name", "email", "address_id"], + ) + + # links + assert v[f"{database_name}:customer"][0]["email"] == "customer-1@example.com" + + logs = ( + ExecutionLog.query(db=db) + .filter(ExecutionLog.privacy_request_id == privacy_request.id) + .all() + ) + + logs = [log.__dict__ for log in logs] + + assert ( + len( + records_matching_fields( + logs, dataset_name=database_name, collection_name="customer" + ) + ) + > 0 + ) + + assert ( + len( + records_matching_fields( + logs, dataset_name=database_name, collection_name="address" + ) + ) + > 0 + ) + + assert ( + len( + records_matching_fields( + logs, dataset_name=database_name, collection_name="orders" + ) + ) + > 0 + ) + + assert ( + len( + records_matching_fields( + logs, + dataset_name=database_name, + collection_name="payment_card", + ) + ) + > 0 + ) + + +@pytest.mark.integration_timescale +@pytest.mark.integration +@pytest.mark.asyncio +@pytest.mark.parametrize( + "dsr_version", + ["use_dsr_3_0", "use_dsr_2_0"], +) +async def test_timescale_erasure_request_task( + db, + erasure_policy, + timescale_connection_config, + timescale_integration_db, + privacy_request_with_erasure_policy, + dsr_version, + request, +) -> None: + request.getfixturevalue(dsr_version) # REQUIRED to test both DSR 3.0 and 2.0 + + rule = erasure_policy.rules[0] + target = rule.targets[0] + target.data_category = "user" + target.save(db) + + database_name = "my_timescale_db_1" + + dataset = postgres_db_graph_dataset(database_name, timescale_connection_config.key) + + # Set some data categories on fields that will be targeted by the policy above + field([dataset], database_name, "customer", "name").data_categories = ["user.name"] + field([dataset], database_name, "address", "street").data_categories = ["user"] + field([dataset], database_name, "payment_card", "ccn").data_categories = ["user"] + + graph = DatasetGraph(dataset) + + v = access_runner_tester( + privacy_request_with_erasure_policy, + erasure_policy, + graph, + [timescale_connection_config], + {"email": "customer-1@example.com"}, + db, + ) + + v = erasure_runner_tester( + privacy_request_with_erasure_policy, + erasure_policy, + graph, + [timescale_connection_config], + {"email": "customer-1@example.com"}, + get_cached_data_for_erasures(privacy_request_with_erasure_policy.id), + db, + ) + assert v == { + f"{database_name}:customer": 1, + f"{database_name}:orders": 0, + f"{database_name}:payment_card": 2, + f"{database_name}:address": 2, + }, "No erasure on orders table - no data categories targeted" + + # Verify masking in appropriate tables + address_cursor = timescale_integration_db.execute( + text("select * from address where id in (1, 2)") + ) + for address in address_cursor: + assert address.street is None # Masked due to matching data category + assert address.state is not None + assert address.city is not None + assert address.zip is not None + + customer_cursor = timescale_integration_db.execute( + text("select * from customer where id = 1") + ) + customer = [customer for customer in customer_cursor][0] + assert customer.name is None # Masked due to matching data category + assert customer.email == "customer-1@example.com" + assert customer.address_id is not None + + payment_card_cursor = timescale_integration_db.execute( + text("select * from payment_card where id in ('pay_aaa-aaa', 'pay_bbb-bbb')") + ) + payment_cards = [card for card in payment_card_cursor] + assert all( + [card.ccn is None for card in payment_cards] + ) # Masked due to matching data category + assert not any([card.name is None for card in payment_cards]) is None + + +@pytest.mark.integration_timescale +@pytest.mark.integration +@pytest.mark.asyncio +@pytest.mark.parametrize( + "dsr_version", + ["use_dsr_3_0", "use_dsr_2_0"], +) +async def test_timescale_query_and_mask_hypertable( + db, + erasure_policy, + timescale_connection_config, + timescale_integration_db, + privacy_request_with_erasure_policy, + dsr_version, + request, +) -> None: + request.getfixturevalue(dsr_version) # REQUIRED to test both DSR 3.0 and 2.0 + + database_name = "my_timescale_db_1" + + dataset = postgres_db_graph_dataset(database_name, timescale_connection_config.key) + # For this test, add a new collection to our standard dataset corresponding to the + # "onsite_personnel" timescale hypertable + onsite_personnel_collection = Collection( + name="onsite_personnel", + fields=[ + ScalarField( + name="responsible", data_type_converter=str_converter, identity="email" + ), + ScalarField( + name="time", data_type_converter=str_converter, primary_key=True + ), + ], + ) + + dataset.collections.append(onsite_personnel_collection) + graph = DatasetGraph(dataset) + rule = erasure_policy.rules[0] + target = rule.targets[0] + target.data_category = "user" + target.save(db) + # Update data category on responsible field + field( + [dataset], database_name, "onsite_personnel", "responsible" + ).data_categories = ["user.contact.email"] + + access_results = access_runner_tester( + privacy_request_with_erasure_policy, + erasure_policy, + graph, + [timescale_connection_config], + {"email": "employee-1@example.com"}, + db, + ) + + # Demonstrate hypertable can be queried + assert access_results[f"{database_name}:onsite_personnel"] == [ + {"responsible": "employee-1@example.com", "time": datetime(2022, 1, 1, 9, 0)}, + {"responsible": "employee-1@example.com", "time": datetime(2022, 1, 2, 9, 0)}, + {"responsible": "employee-1@example.com", "time": datetime(2022, 1, 3, 9, 0)}, + {"responsible": "employee-1@example.com", "time": datetime(2022, 1, 5, 9, 0)}, + ] + + # Run an erasure on the hypertable targeting the responsible field + v = erasure_runner_tester( + privacy_request_with_erasure_policy, + erasure_policy, + graph, + [timescale_connection_config], + {"email": "employee-1@example.com"}, + get_cached_data_for_erasures(privacy_request_with_erasure_policy.id), + db, + ) + + assert v == { + f"{database_name}:customer": 0, + f"{database_name}:orders": 0, + f"{database_name}:payment_card": 0, + f"{database_name}:address": 0, + f"{database_name}:onsite_personnel": 4, + }, "onsite_personnel.responsible was the only targeted data category" + + personnel_records = timescale_integration_db.execute( + text("select * from onsite_personnel") + ) + for record in personnel_records: + assert ( + record.responsible != "employee-1@example.com" + ) # These emails have all been masked diff --git a/tests/ops/service/connectors/test_mongo_query_config.py b/tests/ops/service/connectors/test_mongo_query_config.py index 3912618801..c0f6079df1 100644 --- a/tests/ops/service/connectors/test_mongo_query_config.py +++ b/tests/ops/service/connectors/test_mongo_query_config.py @@ -191,7 +191,7 @@ def test_generate_update_stmt_multiple_fields( row, erasure_policy, privacy_request ) - expected_result_0 = {"customer_id": 1} + expected_result_0 = {"_id": 1} expected_result_1 = { "$set": { "birthday": None, @@ -273,7 +273,7 @@ def test_generate_update_stmt_multiple_rules( mongo_statement = config.generate_update_stmt( row, erasure_policy_two_rules, privacy_request ) - assert mongo_statement[0] == {"customer_id": 1} + assert mongo_statement[0] == {"_id": 1} assert len(mongo_statement[1]["$set"]["gender"]) == 30 assert ( mongo_statement[1]["$set"]["birthday"] From 647586fa8f9d319d98514e38f0de644552146c99 Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Tue, 10 Dec 2024 14:51:07 -0800 Subject: [PATCH 19/22] Reverting most of the removal of primary keys + misc files --- .../fixtures/connectors/datasetconfig.json | 18 ++++----- .../bigquery_enterprise_test_dataset.yml | 8 ++-- .../dataset/bigquery_example_test_dataset.yml | 18 +++++++++ .../dataset/dynamodb_example_test_dataset.yml | 7 ++++ data/dataset/email_dataset.yml | 6 +++ ...le_field_masking_override_test_dataset.yml | 18 +++++++++ data/dataset/example_test_dataset.invalid | 18 +++++++++ data/dataset/example_test_datasets.yml | 16 ++++++++ ...e_cloud_sql_mysql_example_test_dataset.yml | 8 ++++ ...loud_sql_postgres_example_test_dataset.yml | 8 ++++ data/dataset/manual_dataset.yml | 4 ++ data/dataset/mariadb_example_test_dataset.yml | 8 ++++ data/dataset/mongo_example_test_dataset.yml | 11 ++++++ data/dataset/mssql_example_test_dataset.yml | 8 ++++ data/dataset/mysql_example_test_dataset.yml | 9 +++++ ...s_example_custom_request_field_dataset.yml | 1 + ...alid_masking_strategy_override_dataset.yml | 4 ++ .../dataset/postgres_example_test_dataset.yml | 28 +++++++------ .../dataset/redshift_example_test_dataset.yml | 18 +++++++++ .../dataset/scylladb_example_test_dataset.yml | 5 +++ .../snowflake_example_test_dataset.yml | 18 +++++++++ .../dataset/timebase_example_test_dataset.yml | 18 +++++++++ dev-requirements.txt | 2 +- pyproject.toml | 2 +- src/fides/api/task/graph_task.py | 27 +++++++++++++ tests/fixtures/email_fixtures.py | 39 ++++++++++++++----- .../v1/endpoints/test_dataset_endpoints.py | 16 ++++---- tests/ops/generator/test_data_generator.py | 2 + tests/ops/models/test_datasetconfig.py | 7 ++-- tests/ops/task/test_create_request_tasks.py | 6 +-- tests/ops/task/traversal_data.py | 17 ++++---- tests/ops/util/test_dataset_yaml.py | 3 ++ 32 files changed, 322 insertions(+), 56 deletions(-) diff --git a/clients/admin-ui/cypress/fixtures/connectors/datasetconfig.json b/clients/admin-ui/cypress/fixtures/connectors/datasetconfig.json index c41d13993b..6cf4d7d77c 100644 --- a/clients/admin-ui/cypress/fixtures/connectors/datasetconfig.json +++ b/clients/admin-ui/cypress/fixtures/connectors/datasetconfig.json @@ -38,7 +38,7 @@ "fides_meta": { "references": null, "identity": null, - "primary_key": null, + "primary_key": true, "data_type": null, "length": null, "return_all_elements": null, @@ -125,7 +125,7 @@ "fides_meta": { "references": null, "identity": null, - "primary_key": null, + "primary_key": true, "data_type": null, "length": null, "return_all_elements": null, @@ -199,7 +199,7 @@ "fides_meta": { "references": null, "identity": null, - "primary_key": null, + "primary_key": true, "data_type": null, "length": null, "return_all_elements": null, @@ -258,7 +258,7 @@ "fides_meta": { "references": null, "identity": null, - "primary_key": null, + "primary_key": true, "data_type": null, "length": null, "return_all_elements": null, @@ -366,7 +366,7 @@ "fides_meta": { "references": null, "identity": null, - "primary_key": null, + "primary_key": true, "data_type": null, "length": null, "return_all_elements": null, @@ -466,7 +466,7 @@ "fides_meta": { "references": null, "identity": null, - "primary_key": null, + "primary_key": true, "data_type": null, "length": null, "return_all_elements": null, @@ -503,7 +503,7 @@ "fides_meta": { "references": null, "identity": null, - "primary_key": null, + "primary_key": true, "data_type": null, "length": null, "return_all_elements": null, @@ -555,7 +555,7 @@ "fides_meta": { "references": null, "identity": null, - "primary_key": null, + "primary_key": true, "data_type": null, "length": null, "return_all_elements": null, @@ -664,7 +664,7 @@ "fides_meta": { "references": null, "identity": null, - "primary_key": null, + "primary_key": true, "data_type": null, "length": null, "return_all_elements": null, diff --git a/data/dataset/bigquery_enterprise_test_dataset.yml b/data/dataset/bigquery_enterprise_test_dataset.yml index 52b20e7d03..59d27e68a2 100644 --- a/data/dataset/bigquery_enterprise_test_dataset.yml +++ b/data/dataset/bigquery_enterprise_test_dataset.yml @@ -30,7 +30,7 @@ dataset: fides_meta: references: null identity: null - primary_key: null + primary_key: true data_type: integer length: null return_all_elements: null @@ -102,7 +102,7 @@ dataset: fides_meta: references: null identity: null - primary_key: null + primary_key: true data_type: integer length: null return_all_elements: null @@ -204,7 +204,7 @@ dataset: fides_meta: references: null identity: null - primary_key: null + primary_key: true data_type: integer length: null return_all_elements: null @@ -347,7 +347,7 @@ dataset: fides_meta: references: null identity: stackoverflow_user_id - primary_key: null + primary_key: true data_type: integer length: null return_all_elements: null diff --git a/data/dataset/bigquery_example_test_dataset.yml b/data/dataset/bigquery_example_test_dataset.yml index c4ea16cb44..11fdac1aba 100644 --- a/data/dataset/bigquery_example_test_dataset.yml +++ b/data/dataset/bigquery_example_test_dataset.yml @@ -13,6 +13,8 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -51,6 +53,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -76,6 +80,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -92,6 +98,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: time data_categories: [user.sensor] @@ -106,6 +114,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: @@ -156,6 +166,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: name data_categories: [user.financial] - name: preferred @@ -165,6 +177,8 @@ dataset: fields: - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: name data_categories: [system.operations] - name: price @@ -179,6 +193,8 @@ dataset: data_type: string - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: month data_categories: [system.operations] - name: name @@ -211,6 +227,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: opened data_categories: [system.operations] diff --git a/data/dataset/dynamodb_example_test_dataset.yml b/data/dataset/dynamodb_example_test_dataset.yml index a4e5a1291a..d9ecbb8d1f 100644 --- a/data/dataset/dynamodb_example_test_dataset.yml +++ b/data/dataset/dynamodb_example_test_dataset.yml @@ -20,6 +20,7 @@ dataset: - name: email data_categories: [user.contact.email] fides_meta: + primary_key: True identity: email data_type: string - name: name @@ -32,6 +33,8 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -56,12 +59,16 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] - name: login fields: - name: customer_id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: login_date data_categories: [system.operations] - name: name diff --git a/data/dataset/email_dataset.yml b/data/dataset/email_dataset.yml index 64b49f71a8..c829e8a4ea 100644 --- a/data/dataset/email_dataset.yml +++ b/data/dataset/email_dataset.yml @@ -7,6 +7,8 @@ dataset: fields: - name: id data_categories: [system.operations] + fides_meta: + primary_key: true - name: customer_id data_categories: [user] fides_meta: @@ -20,6 +22,8 @@ dataset: fields: - name: id data_categories: [system.operations] + fides_meta: + primary_key: true - name: first_name data_categories: [user.childrens] - name: last_name @@ -50,6 +54,8 @@ dataset: fields: - name: id data_categories: [ system.operations ] + fides_meta: + primary_key: true - name: payer_email data_categories: [ user.contact.email ] fides_meta: diff --git a/data/dataset/example_field_masking_override_test_dataset.yml b/data/dataset/example_field_masking_override_test_dataset.yml index 74e29ca84e..24bdf84555 100644 --- a/data/dataset/example_field_masking_override_test_dataset.yml +++ b/data/dataset/example_field_masking_override_test_dataset.yml @@ -11,6 +11,8 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -36,6 +38,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -86,6 +90,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -102,6 +108,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: time data_categories: [user.sensor] @@ -116,6 +124,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: @@ -166,6 +176,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: name data_categories: [user.financial] - name: preferred @@ -175,6 +187,8 @@ dataset: fields: - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: name data_categories: [system.operations] - name: price @@ -189,6 +203,8 @@ dataset: data_type: string - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: month data_categories: [system.operations] - name: name @@ -221,6 +237,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: opened data_categories: [system.operations] - name: visit diff --git a/data/dataset/example_test_dataset.invalid b/data/dataset/example_test_dataset.invalid index a3bfe261ff..46e5235876 100644 --- a/data/dataset/example_test_dataset.invalid +++ b/data/dataset/example_test_dataset.invalid @@ -11,6 +11,8 @@ dataset: data_categories: [user.contact.address.street] * name: id data_categories: [system.operations] + fides_meta: + primary_key: True * name: state data_categories: [user.contact.address.state] * name: street @@ -36,6 +38,8 @@ dataset: data_type: string * name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True * name: name data_categories: [user.name] fides_meta: @@ -58,6 +62,8 @@ dataset: data_type: string * name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True * name: name data_categories: [user.name] fides_meta: @@ -74,6 +80,8 @@ dataset: direction: from * name: id data_categories: [system.operations] + fides_meta: + primary_key: True * name: time data_categories: [user.sensor] @@ -88,6 +96,8 @@ dataset: direction: from * name: id data_categories: [system.operations] + fides_meta: + primary_key: True * name: shipping_address_id data_categories: [system.operations] fides_meta: @@ -138,6 +148,8 @@ dataset: direction: from * name: id data_categories: [system.operations] + fides_meta: + primary_key: True * name: name data_categories: [user.financial] * name: preferred @@ -147,6 +159,8 @@ dataset: fields: * name: id data_categories: [system.operations] + fides_meta: + primary_key: True * name: name data_categories: [system.operations] * name: price @@ -161,6 +175,8 @@ dataset: data_type: string * name: id data_categories: [system.operations] + fides_meta: + primary_key: True * name: month data_categories: [system.operations] * name: name @@ -193,6 +209,8 @@ dataset: direction: from * name: id data_categories: [system.operations] + fides_meta: + primary_key: True * name: opened data_categories: [system.operations] diff --git a/data/dataset/example_test_datasets.yml b/data/dataset/example_test_datasets.yml index e64e9fb1e8..898d61bc71 100644 --- a/data/dataset/example_test_datasets.yml +++ b/data/dataset/example_test_datasets.yml @@ -11,6 +11,8 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -36,6 +38,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] @@ -55,6 +59,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] @@ -83,6 +89,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: @@ -212,6 +220,8 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -237,6 +247,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] @@ -256,6 +268,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] @@ -284,6 +298,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: diff --git a/data/dataset/google_cloud_sql_mysql_example_test_dataset.yml b/data/dataset/google_cloud_sql_mysql_example_test_dataset.yml index 86b6ad2171..7f090e0487 100644 --- a/data/dataset/google_cloud_sql_mysql_example_test_dataset.yml +++ b/data/dataset/google_cloud_sql_mysql_example_test_dataset.yml @@ -11,6 +11,8 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -36,6 +38,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] @@ -55,6 +59,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] @@ -83,6 +89,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: diff --git a/data/dataset/google_cloud_sql_postgres_example_test_dataset.yml b/data/dataset/google_cloud_sql_postgres_example_test_dataset.yml index 833361a300..47989b4201 100644 --- a/data/dataset/google_cloud_sql_postgres_example_test_dataset.yml +++ b/data/dataset/google_cloud_sql_postgres_example_test_dataset.yml @@ -11,6 +11,8 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -36,6 +38,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] @@ -55,6 +59,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] @@ -83,6 +89,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: diff --git a/data/dataset/manual_dataset.yml b/data/dataset/manual_dataset.yml index 26d6acbe48..66f5e4a0da 100644 --- a/data/dataset/manual_dataset.yml +++ b/data/dataset/manual_dataset.yml @@ -7,6 +7,8 @@ dataset: fields: - name: id data_categories: [system.operations] + fides_meta: + primary_key: true - name: authorized_user data_categories: [user] fides_meta: @@ -29,6 +31,8 @@ dataset: fields: - name: box_id data_categories: [user] + fides_meta: + primary_key: true - name: email data_categories: [user.contact.email] fides_meta: diff --git a/data/dataset/mariadb_example_test_dataset.yml b/data/dataset/mariadb_example_test_dataset.yml index 204ad8a56d..5e3c90f08f 100644 --- a/data/dataset/mariadb_example_test_dataset.yml +++ b/data/dataset/mariadb_example_test_dataset.yml @@ -11,6 +11,8 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -36,6 +38,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] @@ -55,6 +59,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] @@ -83,6 +89,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: diff --git a/data/dataset/mongo_example_test_dataset.yml b/data/dataset/mongo_example_test_dataset.yml index 587e74b317..0205f33049 100644 --- a/data/dataset/mongo_example_test_dataset.yml +++ b/data/dataset/mongo_example_test_dataset.yml @@ -82,6 +82,7 @@ dataset: - name: _id data_categories: [system.operations] fides_meta: + primary_key: True data_type: object_id - name: customer_identifiers fields: @@ -112,6 +113,7 @@ dataset: - name: _id data_categories: [system.operations] fides_meta: + primary_key: True data_type: object_id - name: customer_information fields: @@ -144,6 +146,7 @@ dataset: - name: _id data_categories: [system.operations] fides_meta: + primary_key: True data_type: object_id - name: passenger_information fields: @@ -173,6 +176,7 @@ dataset: - name: _id data_categories: [system.operations] fides_meta: + primary_key: True data_type: object_id - name: thread fides_meta: @@ -197,6 +201,7 @@ dataset: - name: _id data_categories: [system.operations] fides_meta: + primary_key: True data_type: object_id - name: email data_categories: [user.contact.email] @@ -206,6 +211,7 @@ dataset: - name: id data_categories: [user.unique_id] fides_meta: + primary_key: True references: - dataset: mongo_test field: flights.pilots @@ -219,6 +225,7 @@ dataset: - name: _id data_categories: [system.operations] fides_meta: + primary_key: True data_type: object_id - name: planes data_categories: [system.operations] @@ -237,6 +244,7 @@ dataset: - name: _id data_categories: [system.operations] fides_meta: + primary_key: True data_type: object_id - name: billing_address_id data_categories: [system.operations] @@ -253,6 +261,8 @@ dataset: data_categories: [user.unique_id] - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: name data_categories: [user.financial] - name: preferred @@ -261,6 +271,7 @@ dataset: fields: - name: _id fides_meta: + primary_key: True data_type: object_id - name: owner fides_meta: diff --git a/data/dataset/mssql_example_test_dataset.yml b/data/dataset/mssql_example_test_dataset.yml index d58cf013d3..661c600727 100644 --- a/data/dataset/mssql_example_test_dataset.yml +++ b/data/dataset/mssql_example_test_dataset.yml @@ -11,6 +11,8 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -36,6 +38,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] @@ -55,6 +59,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] @@ -83,6 +89,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: diff --git a/data/dataset/mysql_example_test_dataset.yml b/data/dataset/mysql_example_test_dataset.yml index 7d2b16541b..f311ebf2c7 100644 --- a/data/dataset/mysql_example_test_dataset.yml +++ b/data/dataset/mysql_example_test_dataset.yml @@ -11,6 +11,8 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -36,6 +38,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] @@ -55,6 +59,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] @@ -83,6 +89,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: @@ -210,3 +218,4 @@ dataset: data_type: string - name: updated_at data_categories: [system.operations] + diff --git a/data/dataset/postgres_example_custom_request_field_dataset.yml b/data/dataset/postgres_example_custom_request_field_dataset.yml index 0a878fad87..96b58645d4 100644 --- a/data/dataset/postgres_example_custom_request_field_dataset.yml +++ b/data/dataset/postgres_example_custom_request_field_dataset.yml @@ -10,6 +10,7 @@ dataset: data_categories: [system.operations] fides_meta: data_type: string + primary_key: True - name: email_address data_categories: [system.operations] fides_meta: diff --git a/data/dataset/postgres_example_invalid_masking_strategy_override_dataset.yml b/data/dataset/postgres_example_invalid_masking_strategy_override_dataset.yml index e66c2cd140..5195a3671a 100644 --- a/data/dataset/postgres_example_invalid_masking_strategy_override_dataset.yml +++ b/data/dataset/postgres_example_invalid_masking_strategy_override_dataset.yml @@ -14,6 +14,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -29,6 +31,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] fides_meta: diff --git a/data/dataset/postgres_example_test_dataset.yml b/data/dataset/postgres_example_test_dataset.yml index 1f01fe1f03..d62eb38d46 100644 --- a/data/dataset/postgres_example_test_dataset.yml +++ b/data/dataset/postgres_example_test_dataset.yml @@ -7,28 +7,18 @@ dataset: fields: - name: city data_categories: [user.contact.address.city] - fides_meta: - data_type: string - name: house data_categories: [user.contact.address.street] - fides_meta: - data_type: integer - name: id data_categories: [system.operations] fides_meta: - data_type: integer + primary_key: True - name: state data_categories: [user.contact.address.state] - fides_meta: - data_type: string - name: street data_categories: [user.contact.address.street] - fides_meta: - data_type: string - name: zip data_categories: [user.contact.address.postal_code] - fides_meta: - data_type: string - name: customer fields: @@ -48,6 +38,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -70,6 +62,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -92,6 +86,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: time data_categories: [user.sensor] @@ -106,6 +102,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: @@ -156,6 +154,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: name data_categories: [user.financial] - name: preferred @@ -165,6 +165,8 @@ dataset: fields: - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: name data_categories: [system.operations] - name: price @@ -179,6 +181,8 @@ dataset: data_type: string - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: month data_categories: [system.operations] - name: name @@ -211,6 +215,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: opened data_categories: [system.operations] - name: visit diff --git a/data/dataset/redshift_example_test_dataset.yml b/data/dataset/redshift_example_test_dataset.yml index 2b1858e99a..9794f86bb3 100644 --- a/data/dataset/redshift_example_test_dataset.yml +++ b/data/dataset/redshift_example_test_dataset.yml @@ -11,6 +11,8 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -36,6 +38,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -58,6 +62,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -74,6 +80,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: time data_categories: [user.sensor] @@ -88,6 +96,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: @@ -138,6 +148,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: name data_categories: [user.financial] - name: preferred @@ -147,6 +159,8 @@ dataset: fields: - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: name data_categories: [system.operations] - name: price @@ -161,6 +175,8 @@ dataset: data_type: string - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: month data_categories: [system.operations] - name: name @@ -193,6 +209,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: opened data_categories: [system.operations] diff --git a/data/dataset/scylladb_example_test_dataset.yml b/data/dataset/scylladb_example_test_dataset.yml index 38c0ea7b51..8374540cc1 100644 --- a/data/dataset/scylladb_example_test_dataset.yml +++ b/data/dataset/scylladb_example_test_dataset.yml @@ -47,6 +47,7 @@ dataset: data_categories: [user.unique_id] fides_meta: data_type: integer + primary_key: True - name: uuid data_categories: [user.government_id] - name: user_activity @@ -59,10 +60,12 @@ dataset: field: users.user_id direction: from data_type: integer + primary_key: True - name: timestamp data_categories: [user.behavior] fides_meta: data_type: string + primary_key: True - name: user_agent data_categories: [user.device] fides_meta: @@ -77,6 +80,7 @@ dataset: data_categories: [system.operations] fides_meta: data_type: integer + primary_key: True - name: user_id data_categories: [user.unique_id] fides_meta: @@ -97,6 +101,7 @@ dataset: data_categories: [system.operations] fides_meta: data_type: integer + primary_key: True - name: payment_method_id data_categories: [system.operations] fides_meta: diff --git a/data/dataset/snowflake_example_test_dataset.yml b/data/dataset/snowflake_example_test_dataset.yml index 9b1b79f125..da13723693 100644 --- a/data/dataset/snowflake_example_test_dataset.yml +++ b/data/dataset/snowflake_example_test_dataset.yml @@ -11,6 +11,8 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -36,6 +38,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -62,6 +66,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -78,6 +84,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: time data_categories: [user.sensor] @@ -92,6 +100,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: @@ -142,6 +152,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: name data_categories: [user.financial] - name: preferred @@ -151,6 +163,8 @@ dataset: fields: - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: name data_categories: [system.operations] - name: price @@ -165,6 +179,8 @@ dataset: data_type: string - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: month data_categories: [system.operations] - name: name @@ -197,6 +213,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: opened data_categories: [system.operations] diff --git a/data/dataset/timebase_example_test_dataset.yml b/data/dataset/timebase_example_test_dataset.yml index fe8a7e7d1d..ffd57a7c67 100644 --- a/data/dataset/timebase_example_test_dataset.yml +++ b/data/dataset/timebase_example_test_dataset.yml @@ -11,6 +11,8 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -36,6 +38,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -58,6 +62,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -74,6 +80,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: time data_categories: [user.sensor] @@ -88,6 +96,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: @@ -138,6 +148,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: name data_categories: [user.financial] - name: preferred @@ -147,6 +159,8 @@ dataset: fields: - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: name data_categories: [system.operations] - name: price @@ -161,6 +175,8 @@ dataset: data_type: string - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: month data_categories: [system.operations] - name: name @@ -193,6 +209,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: opened data_categories: [system.operations] diff --git a/dev-requirements.txt b/dev-requirements.txt index c51b9369a5..149cdcd658 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -7,12 +7,12 @@ mypy==1.10.0 nox==2022.8.7 pre-commit==2.20.0 pylint==3.2.5 -pytest==7.2.2 pytest-asyncio==0.19.0 pytest-cov==4.0.0 pytest-env==0.6.2 pytest-mock==3.14.0 pytest-rerunfailures==14.0 +pytest==7.2.2 requests-mock==1.10.0 setuptools>=64.0.2 sqlalchemy-stubs==0.4 diff --git a/pyproject.toml b/pyproject.toml index f1f4963dd0..087d2b2033 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -175,7 +175,7 @@ addopts = [ "--no-cov-on-fail", "-ra", "-vv", - "--disable-pytest-warnings", + "--disable-pytest-warnings" ] markers = [ "unit: only runs tests that don't require non-python dependencies (i.e. a database)", diff --git a/src/fides/api/task/graph_task.py b/src/fides/api/task/graph_task.py index 85576264f0..145094ea25 100644 --- a/src/fides/api/task/graph_task.py +++ b/src/fides/api/task/graph_task.py @@ -604,6 +604,33 @@ def erasure_request( ) -> int: """Run erasure request""" + # if there is no primary key specified in the graph node configuration + # note this in the execution log and perform no erasures on this node + if ( + self.connector.requires_primary_keys + and not self.execution_node.collection.contains_field( + lambda f: f.primary_key + ) + ): + logger.warning( + 'Skipping erasures on "{}" as the "{}" connector requires a primary key to be defined in one of the collection fields, but none was found.', + self.execution_node.address, + self.connector.configuration.connection_type, + ) + if self.request_task.id: + # For DSR 3.0, largely for testing. DSR 3.0 uses Request Task status + # instead of presence of cached erasure data to know if we should rerun a node + self.request_task.rows_masked = 0 # Saved as part of update_status + # TODO Remove when we stop support for DSR 2.0 + self.resources.cache_erasure(self.key.value, 0) + self.update_status( + "No values were erased since no primary key was defined in any of the fields for this collection", + None, + ActionType.erasure, + ExecutionLogStatus.complete, + ) + return 0 + if not self.can_write_data(): logger.warning( "No erasures on {} as its ConnectionConfig does not have write access.", diff --git a/tests/fixtures/email_fixtures.py b/tests/fixtures/email_fixtures.py index e25f39e3f4..0df1d61b84 100644 --- a/tests/fixtures/email_fixtures.py +++ b/tests/fixtures/email_fixtures.py @@ -192,17 +192,24 @@ def dynamic_email_address_config_dataset( { "name": "id", "data_categories": ["system.operations"], - "fides_meta": {"data_type": "string"}, + "fides_meta": { + "data_type": "string", + "primary_key": True, + }, }, { "name": "email_address", "data_categories": ["system.operations"], - "fides_meta": {"data_type": "string"}, + "fides_meta": { + "data_type": "string", + }, }, { "name": "vendor_name", "data_categories": ["system.operations"], - "fides_meta": {"data_type": "string"}, + "fides_meta": { + "data_type": "string", + }, }, { "name": "site_id", @@ -238,17 +245,24 @@ def dynamic_email_address_config_second_dataset( { "name": "id", "data_categories": ["system.operations"], - "fides_meta": {"data_type": "string"}, + "fides_meta": { + "data_type": "string", + "primary_key": True, + }, }, { "name": "email_address", "data_categories": ["system.operations"], - "fides_meta": {"data_type": "string"}, + "fides_meta": { + "data_type": "string", + }, }, { "name": "vendor_name", "data_categories": ["system.operations"], - "fides_meta": {"data_type": "string"}, + "fides_meta": { + "data_type": "string", + }, }, { "name": "custom_field", @@ -266,17 +280,24 @@ def dynamic_email_address_config_second_dataset( { "name": "id2", "data_categories": ["system.operations"], - "fides_meta": {"data_type": "string"}, + "fides_meta": { + "data_type": "string", + "primary_key": True, + }, }, { "name": "email_address2", "data_categories": ["system.operations"], - "fides_meta": {"data_type": "string"}, + "fides_meta": { + "data_type": "string", + }, }, { "name": "vendor_name2", "data_categories": ["system.operations"], - "fides_meta": {"data_type": "string"}, + "fides_meta": { + "data_type": "string", + }, }, { "name": "site_id2", diff --git a/tests/ops/api/v1/endpoints/test_dataset_endpoints.py b/tests/ops/api/v1/endpoints/test_dataset_endpoints.py index 1ae9ac28f8..f744d59e47 100644 --- a/tests/ops/api/v1/endpoints/test_dataset_endpoints.py +++ b/tests/ops/api/v1/endpoints/test_dataset_endpoints.py @@ -232,7 +232,9 @@ def test_put_validate_dataset_invalid_length( invalid_dataset = example_datasets[0] # string is properly read: - invalid_dataset["collections"][0]["fields"][0]["fides_meta"] = {"length": 123} + invalid_dataset["collections"][0]["fields"][0]["fidesops_meta"] = { + "length": 123 + } response = api_client.put( validate_dataset_url, headers=auth_header, json=invalid_dataset ) @@ -245,7 +247,7 @@ def test_put_validate_dataset_invalid_length( ) # fails with an invalid value - invalid_dataset["collections"][0]["fields"][0]["fides_meta"] = {"length": -1} + invalid_dataset["collections"][0]["fields"][0]["fidesops_meta"] = {"length": -1} response = api_client.put( validate_dataset_url, headers=auth_header, json=invalid_dataset ) @@ -267,7 +269,7 @@ def test_put_validate_dataset_invalid_data_type( invalid_dataset = example_datasets[0] # string is properly read: - invalid_dataset["collections"][0]["fields"][0]["fides_meta"] = { + invalid_dataset["collections"][0]["fields"][0]["fidesops_meta"] = { "data_type": "string" } response = api_client.put( @@ -282,7 +284,7 @@ def test_put_validate_dataset_invalid_data_type( ) # fails with an invalid value - invalid_dataset["collections"][0]["fields"][0]["fides_meta"] = { + invalid_dataset["collections"][0]["fields"][0]["fidesops_meta"] = { "data_type": "stringsssssss" } @@ -296,7 +298,7 @@ def test_put_validate_dataset_invalid_data_type( == "Value error, The data type stringsssssss is not supported." ) - def test_put_validate_dataset_invalid_fides_meta( + def test_put_validate_dataset_invalid_fidesops_meta( self, example_datasets: List, validate_dataset_url, @@ -305,8 +307,8 @@ def test_put_validate_dataset_invalid_fides_meta( ) -> None: auth_header = generate_auth_header(scopes=[DATASET_READ]) invalid_dataset = example_datasets[0] - # Add an invalid fides_meta annotation to ensure our type-checking is comprehensive - invalid_dataset["collections"][0]["fields"][0]["fides_meta"] = { + # Add an invalid fidesops_meta annotation to ensure our type-checking is comprehensive + invalid_dataset["collections"][0]["fields"][0]["fidesops_meta"] = { "references": [ { "dataset": "postgres_example_test_dataset", diff --git a/tests/ops/generator/test_data_generator.py b/tests/ops/generator/test_data_generator.py index 04441237b7..af9ab1cc62 100644 --- a/tests/ops/generator/test_data_generator.py +++ b/tests/ops/generator/test_data_generator.py @@ -20,6 +20,7 @@ fields: - name: id fides_meta: + primary_key: True data_type: integer references: - dataset: db @@ -33,6 +34,7 @@ fields: - name: id fides_meta: + primary_key: True data_type: integer - name: user_id - name: street diff --git a/tests/ops/models/test_datasetconfig.py b/tests/ops/models/test_datasetconfig.py index 933c2fcb3f..969002baac 100644 --- a/tests/ops/models/test_datasetconfig.py +++ b/tests/ops/models/test_datasetconfig.py @@ -194,17 +194,18 @@ def test_convert_dataset_to_graph(example_datasets): (FieldAddress("postgres_example_test_dataset", "customer", "id"), "from") ] + # check that primary key member has been set assert ( field([graph], "postgres_example_test_dataset", "address", "id").primary_key - is False + is True ) assert ( field([graph], "postgres_example_test_dataset", "customer", "id").primary_key - is False + is True ) assert ( field([graph], "postgres_example_test_dataset", "employee", "id").primary_key - is False + is True ) assert ( field([graph], "postgres_example_test_dataset", "visit", "email").primary_key diff --git a/tests/ops/task/test_create_request_tasks.py b/tests/ops/task/test_create_request_tasks.py index 3792fea0e3..290c2dc1be 100644 --- a/tests/ops/task/test_create_request_tasks.py +++ b/tests/ops/task/test_create_request_tasks.py @@ -105,7 +105,7 @@ "is_array": False, "read_only": None, "references": [], - "primary_key": False, + "primary_key": True, "data_categories": ["system.operations"], "data_type_converter": "None", "return_all_elements": None, @@ -307,7 +307,7 @@ def test_persist_access_tasks_with_object_fields_in_collection( "is_array": False, "read_only": None, "references": [], - "primary_key": False, + "primary_key": True, "data_categories": ["system.operations"], "data_type_converter": "object_id", "return_all_elements": None, @@ -927,7 +927,7 @@ def test_erase_after_saas_upstream_and_downstream_tasks( "is_array": False, "read_only": None, "references": [], - "primary_key": False, + "primary_key": True, "data_categories": ["system.operations"], "data_type_converter": "integer", "return_all_elements": None, diff --git a/tests/ops/task/traversal_data.py b/tests/ops/task/traversal_data.py index 07ff478e3e..20d3773e17 100644 --- a/tests/ops/task/traversal_data.py +++ b/tests/ops/task/traversal_data.py @@ -33,7 +33,7 @@ def postgres_dataset_dict(db_name: str) -> Dict[str, Any]: "fields": [ { "name": "id", - "fides_meta": {"data_type": "integer"}, + "fides_meta": {"primary_key": True, "data_type": "integer"}, }, {"name": "name", "fides_meta": {"data_type": "string"}}, { @@ -58,7 +58,7 @@ def postgres_dataset_dict(db_name: str) -> Dict[str, Any]: "name": "address", "after": [f"{db_name}.customer", f"{db_name}.orders"], "fields": [ - {"name": "id"}, + {"name": "id", "fides_meta": {"primary_key": True}}, {"name": "street", "fides_meta": {"data_type": "string"}}, {"name": "city", "fides_meta": {"data_type": "string"}}, {"name": "state", "fides_meta": {"data_type": "string"}}, @@ -68,7 +68,7 @@ def postgres_dataset_dict(db_name: str) -> Dict[str, Any]: { "name": "orders", "fields": [ - {"name": "id"}, + {"name": "id", "fides_meta": {"primary_key": True}}, { "name": "customer_id", "fides_meta": { @@ -113,7 +113,7 @@ def postgres_dataset_dict(db_name: str) -> Dict[str, Any]: "fields": [ { "name": "id", - "fides_meta": {"data_type": "string"}, + "fides_meta": {"primary_key": True, "data_type": "string"}, }, {"name": "name", "fides_meta": {"data_type": "string"}}, {"name": "ccn"}, @@ -657,7 +657,7 @@ def scylladb_dataset_dict(db_name: str) -> Dict[str, Any]: { "name": "user_id", "data_categories": ["user.unique_id"], - "fides_meta": {"data_type": "integer"}, + "fides_meta": {"data_type": "integer", "primary_key": True}, }, {"name": "uuid", "data_categories": ["user.government_id"]}, ], @@ -677,12 +677,13 @@ def scylladb_dataset_dict(db_name: str) -> Dict[str, Any]: } ], "data_type": "integer", + "primary_key": True, }, }, { "name": "timestamp", "data_categories": ["user.behavior"], - "fides_meta": {"data_type": "string"}, + "fides_meta": {"data_type": "string", "primary_key": True}, }, { "name": "user_agent", @@ -702,7 +703,7 @@ def scylladb_dataset_dict(db_name: str) -> Dict[str, Any]: { "name": "payment_method_id", "data_categories": ["system.operations"], - "fides_meta": {"data_type": "integer"}, + "fides_meta": {"data_type": "integer", "primary_key": True}, }, { "name": "user_id", @@ -732,7 +733,7 @@ def scylladb_dataset_dict(db_name: str) -> Dict[str, Any]: { "name": "order_id", "data_categories": ["system.operations"], - "fides_meta": {"data_type": "integer"}, + "fides_meta": {"data_type": "integer", "primary_key": True}, }, { "name": "payment_method_id", diff --git a/tests/ops/util/test_dataset_yaml.py b/tests/ops/util/test_dataset_yaml.py index a610ac7569..edaa26a7ca 100644 --- a/tests/ops/util/test_dataset_yaml.py +++ b/tests/ops/util/test_dataset_yaml.py @@ -33,6 +33,7 @@ - name: id data_categories: [system.operations] fidesops_meta: + primary_key: True data_type: integer """ @@ -46,6 +47,7 @@ - name: _id data_categories: [system.operations] fidesops_meta: + primary_key: True data_type: object_id - name: photo_id data_categories: [user.unique_id] @@ -221,6 +223,7 @@ def test_invalid_datatype(): - name: id data_categories: [system.operations] fidesops_meta: + primary_key: True data_type: integer - name: users fields: From 7600ab4d4646ee1920e8ee62feee224942189ea7 Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Tue, 10 Dec 2024 15:26:07 -0800 Subject: [PATCH 20/22] Removing primary key requirement for BigQuery erasures --- .../bigquery_enterprise_test_dataset.yml | 552 +++++------------- .../dataset/bigquery_example_test_dataset.yml | 18 - .../service/connectors/bigquery_connector.py | 4 + .../query_configs/bigquery_query_config.py | 16 +- .../connectors/test_bigquery_connector.py | 4 +- .../connectors/test_bigquery_queryconfig.py | 4 +- ...est_bigquery_enterprise_privacy_request.py | 22 +- 7 files changed, 166 insertions(+), 454 deletions(-) diff --git a/data/dataset/bigquery_enterprise_test_dataset.yml b/data/dataset/bigquery_enterprise_test_dataset.yml index 59d27e68a2..64668192d0 100644 --- a/data/dataset/bigquery_enterprise_test_dataset.yml +++ b/data/dataset/bigquery_enterprise_test_dataset.yml @@ -1,405 +1,149 @@ dataset: - - fides_key: enterprise_dsr_testing - organization_fides_key: default_organization - tags: null - name: Bigquery Enterprise Test Dataset - description: BigQuery dataset containing real data - meta: null - data_categories: null - fides_meta: - resource_id: enterprise_dsr_testing.prj-sandbox-55855.enterprise_dsr_testing - after: null - namespace: - dataset_id: enterprise_dsr_testing - project_id: prj-sandbox-55855 - collections: - - name: comments - description: null - data_categories: null - fields: - - name: creation_date - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: id - description: null - data_categories: - - system.operations - fides_meta: - references: null - identity: null - primary_key: true - data_type: integer - length: null - return_all_elements: null - read_only: null - custom_request_field: null - fields: null - - name: post_id - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: score - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: text - description: null - data_categories: - - user.contact - fides_meta: null - fields: null - - name: user_display_name - description: null - data_categories: - - user.contact - fides_meta: null - fields: null - - name: user_id - description: null - data_categories: - - user.contact - fides_meta: - references: - - dataset: enterprise_dsr_testing - field: users.id - direction: from - identity: null - primary_key: null - data_type: null - length: null - return_all_elements: null - read_only: null - custom_request_field: null - fields: null - fides_meta: null - - name: post_history - description: null - data_categories: null - fields: - - name: comment - description: null - data_categories: - - user.contact - fides_meta: null - fields: null - - name: creation_date - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: id - description: null - data_categories: - - system.operations - fides_meta: - references: null - identity: null - primary_key: true - data_type: integer - length: null - return_all_elements: null - read_only: null - custom_request_field: null - fields: null - - name: post_history_type_id - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: post_id - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: revision_guid - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: text - description: null - data_categories: - - user.contact - fides_meta: null - fields: null - - name: user_id - description: null - data_categories: - - system.operations - fides_meta: - references: - - dataset: enterprise_dsr_testing - field: users.id - direction: from - identity: null - primary_key: null - data_type: null - length: null - return_all_elements: null - read_only: null - custom_request_field: null - fields: null - fides_meta: null - - name: stackoverflow_posts - description: null - data_categories: null - fields: - - name: accepted_answer_id - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: answer_count - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: body - description: null - data_categories: - - user.contact - fides_meta: null - fields: null - - name: comment_count - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: community_owned_date - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: creation_date - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: favorite_count - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: id - description: null - data_categories: - - system.operations - fides_meta: - references: null - identity: null - primary_key: true - data_type: integer - length: null - return_all_elements: null - read_only: null - custom_request_field: null - fields: null - - name: last_activity_date - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: last_edit_date - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: last_editor_display_name - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: last_editor_user_id - description: null - data_categories: - - system.operations - fides_meta: - references: - - dataset: enterprise_dsr_testing - field: users.id - direction: from - identity: null - primary_key: null - data_type: null - length: null - return_all_elements: null - read_only: null - custom_request_field: null - fields: null - - name: owner_display_name - description: null - data_categories: - - user.contact - fides_meta: null - fields: null - - name: owner_user_id - description: null - data_categories: - - system.operations - fides_meta: - references: - - dataset: enterprise_dsr_testing - field: users.id - direction: from - identity: null - primary_key: null - data_type: integer - length: null - return_all_elements: null - read_only: null - custom_request_field: null - fields: null - - name: parent_id - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: post_type_id - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: score - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: tags - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: title - description: null - data_categories: - - user.contact - fides_meta: null - fields: null - - name: view_count - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - fides_meta: null - - name: users - description: null - data_categories: null - fields: - - name: about_me - description: null - data_categories: - - user.contact - fides_meta: null - fields: null - - name: age - description: null - data_categories: - - user - fides_meta: null - fields: null - - name: creation_date - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: display_name - description: null - data_categories: - - user.contact - fides_meta: null - fields: null - - name: down_votes - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: id - description: null - data_categories: - - user.contact - fides_meta: - references: null - identity: stackoverflow_user_id - primary_key: true - data_type: integer - length: null - return_all_elements: null - read_only: null - custom_request_field: null - fields: null - - name: last_access_date - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: location - description: null - data_categories: - - user.contact - fides_meta: null - fields: null - - name: profile_image_url - description: null - data_categories: - - user.contact - fides_meta: null - fields: null - - name: reputation - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: up_votes - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: views - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: website_url - description: null - data_categories: - - user - fides_meta: null - fields: null - fides_meta: - after: null - erase_after: - - enterprise_dsr_testing.comments - skip_processing: false - masking_strategy_override: null - partitioning: null + - fides_key: enterprise_dsr_testing + organization_fides_key: default_organization + name: Bigquery Enterprise Test Dataset + description: BigQuery dataset containing real data + fides_meta: + resource_id: enterprise_dsr_testing.prj-sandbox-55855.enterprise_dsr_testing + namespace: + dataset_id: enterprise_dsr_testing + project_id: prj-sandbox-55855 + collections: + - name: comments + fields: + - name: creation_date + data_categories: [system.operations] + - name: id + data_categories: [system.operations] + fides_meta: + data_type: integer + - name: post_id + data_categories: [system.operations] + - name: score + data_categories: [system.operations] + - name: text + data_categories: [user.contact] + - name: user_display_name + data_categories: [user.contact] + - name: user_id + data_categories: [user.contact] + fides_meta: + references: + - dataset: enterprise_dsr_testing + field: users.id + direction: from + - name: post_history + fields: + - name: comment + data_categories: [user.contact] + - name: creation_date + data_categories: [system.operations] + - name: id + data_categories: [system.operations] + fides_meta: + data_type: integer + - name: post_history_type_id + data_categories: [system.operations] + - name: post_id + data_categories: [system.operations] + - name: revision_guid + data_categories: [system.operations] + - name: text + data_categories: [user.contact] + - name: user_id + data_categories: [system.operations] + fides_meta: + references: + - dataset: enterprise_dsr_testing + field: users.id + direction: from + - name: stackoverflow_posts + fields: + - name: accepted_answer_id + data_categories: [system.operations] + - name: answer_count + data_categories: [system.operations] + - name: body + data_categories: [user.contact] + - name: comment_count + data_categories: [system.operations] + - name: community_owned_date + data_categories: [system.operations] + - name: creation_date + data_categories: [system.operations] + - name: favorite_count + data_categories: [system.operations] + - name: id + data_categories: [system.operations] + fides_meta: + data_type: integer + - name: last_activity_date + data_categories: [system.operations] + - name: last_edit_date + data_categories: [system.operations] + - name: last_editor_display_name + data_categories: [system.operations] + - name: last_editor_user_id + data_categories: [system.operations] + fides_meta: + references: + - dataset: enterprise_dsr_testing + field: users.id + direction: from + - name: owner_display_name + data_categories: [user.contact] + - name: owner_user_id + data_categories: [system.operations] + fides_meta: + references: + - dataset: enterprise_dsr_testing + field: users.id + direction: from + data_type: integer + - name: parent_id + data_categories: [system.operations] + - name: post_type_id + data_categories: [system.operations] + - name: score + data_categories: [system.operations] + - name: tags + data_categories: [system.operations] + - name: title + data_categories: [user.contact] + - name: view_count + data_categories: [system.operations] + - name: users + fields: + - name: about_me + data_categories: [user.contact] + - name: age + data_categories: [user] + - name: creation_date + data_categories: [system.operations] + - name: display_name + data_categories: [user.contact] + - name: down_votes + data_categories: [system.operations] + - name: id + data_categories: [user.contact] + fides_meta: + identity: stackoverflow_user_id + data_type: integer + - name: last_access_date + data_categories: [system.operations] + - name: location + data_categories: [user.contact] + - name: profile_image_url + data_categories: [user.contact] + - name: reputation + data_categories: [system.operations] + - name: up_votes + data_categories: [system.operations] + - name: views + data_categories: [system.operations] + - name: website_url + data_categories: [user] + fides_meta: + erase_after: + - enterprise_dsr_testing.comments + skip_processing: false diff --git a/data/dataset/bigquery_example_test_dataset.yml b/data/dataset/bigquery_example_test_dataset.yml index 11fdac1aba..c4ea16cb44 100644 --- a/data/dataset/bigquery_example_test_dataset.yml +++ b/data/dataset/bigquery_example_test_dataset.yml @@ -13,8 +13,6 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -53,8 +51,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -80,8 +76,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -98,8 +92,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: time data_categories: [user.sensor] @@ -114,8 +106,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: @@ -166,8 +156,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: name data_categories: [user.financial] - name: preferred @@ -177,8 +165,6 @@ dataset: fields: - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: name data_categories: [system.operations] - name: price @@ -193,8 +179,6 @@ dataset: data_type: string - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: month data_categories: [system.operations] - name: name @@ -227,8 +211,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: opened data_categories: [system.operations] diff --git a/src/fides/api/service/connectors/bigquery_connector.py b/src/fides/api/service/connectors/bigquery_connector.py index 8b51f90842..4c52b3b3f6 100644 --- a/src/fides/api/service/connectors/bigquery_connector.py +++ b/src/fides/api/service/connectors/bigquery_connector.py @@ -33,6 +33,10 @@ class BigQueryConnector(SQLConnector): secrets_schema = BigQuerySchema + @property + def requires_primary_keys(self) -> bool: + return False + # Overrides BaseConnector.build_uri def build_uri(self) -> str: """Build URI of format""" diff --git a/src/fides/api/service/connectors/query_configs/bigquery_query_config.py b/src/fides/api/service/connectors/query_configs/bigquery_query_config.py index 74b28f3ada..6060ff5822 100644 --- a/src/fides/api/service/connectors/query_configs/bigquery_query_config.py +++ b/src/fides/api/service/connectors/query_configs/bigquery_query_config.py @@ -123,15 +123,15 @@ def generate_update( TODO: DRY up this method and `generate_delete` a bit """ update_value_map: Dict[str, Any] = self.update_value_map(row, policy, request) - non_empty_primary_keys: Dict[str, Field] = filter_nonempty_values( + non_empty_reference_field_keys: Dict[str, Field] = filter_nonempty_values( { fpath.string_path: fld.cast(row[fpath.string_path]) - for fpath, fld in self.primary_key_field_paths.items() + for fpath, fld in self.reference_field_paths.items() if fpath.string_path in row } ) - valid = len(non_empty_primary_keys) > 0 and update_value_map + valid = len(non_empty_reference_field_keys) > 0 and update_value_map if not valid: logger.warning( "There is not enough data to generate a valid update statement for {}", @@ -141,7 +141,7 @@ def generate_update( table = Table(self._generate_table_name(), MetaData(bind=client), autoload=True) where_clauses: List[ColumnElement] = [ - getattr(table.c, k) == v for k, v in non_empty_primary_keys.items() + getattr(table.c, k) == v for k, v in non_empty_reference_field_keys.items() ] if self.partitioning: @@ -172,15 +172,15 @@ def generate_delete(self, row: Row, client: Engine) -> List[Delete]: TODO: DRY up this method and `generate_update` a bit """ - non_empty_primary_keys: Dict[str, Field] = filter_nonempty_values( + non_empty_reference_field_keys: Dict[str, Field] = filter_nonempty_values( { fpath.string_path: fld.cast(row[fpath.string_path]) - for fpath, fld in self.primary_key_field_paths.items() + for fpath, fld in self.reference_field_paths.items() if fpath.string_path in row } ) - valid = len(non_empty_primary_keys) > 0 + valid = len(non_empty_reference_field_keys) > 0 if not valid: logger.warning( "There is not enough data to generate a valid DELETE statement for {}", @@ -190,7 +190,7 @@ def generate_delete(self, row: Row, client: Engine) -> List[Delete]: table = Table(self._generate_table_name(), MetaData(bind=client), autoload=True) where_clauses: List[ColumnElement] = [ - getattr(table.c, k) == v for k, v in non_empty_primary_keys.items() + getattr(table.c, k) == v for k, v in non_empty_reference_field_keys.items() ] if self.partitioning: diff --git a/tests/ops/service/connectors/test_bigquery_connector.py b/tests/ops/service/connectors/test_bigquery_connector.py index a9524777fe..2e7bc3b075 100644 --- a/tests/ops/service/connectors/test_bigquery_connector.py +++ b/tests/ops/service/connectors/test_bigquery_connector.py @@ -129,7 +129,7 @@ def test_generate_update_partitioned_table( assert len(updates) == 2 assert ( str(updates[0]) - == "UPDATE `silken-precinct-284918.fidesopstest.customer` SET `name`=%(name:STRING)s WHERE `silken-precinct-284918.fidesopstest.customer`.`id` = %(id_1:INT64)s AND `created` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY) AND `created` <= CURRENT_TIMESTAMP()" + == "UPDATE `silken-precinct-284918.fidesopstest.customer` SET `name`=%(name:STRING)s WHERE `silken-precinct-284918.fidesopstest.customer`.`email` = %(email_1:STRING)s AND `created` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY) AND `created` <= CURRENT_TIMESTAMP()" ) def test_generate_delete_partitioned_table( @@ -158,7 +158,7 @@ def test_generate_delete_partitioned_table( assert len(deletes) == 2 assert ( str(deletes[0]) - == "DELETE FROM `silken-precinct-284918.fidesopstest.customer` WHERE `silken-precinct-284918.fidesopstest.customer`.`id` = %(id_1:INT64)s AND `created` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY) AND `created` <= CURRENT_TIMESTAMP()" + == "DELETE FROM `silken-precinct-284918.fidesopstest.customer` WHERE `silken-precinct-284918.fidesopstest.customer`.`email` = %(email_1:STRING)s AND `created` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY) AND `created` <= CURRENT_TIMESTAMP()" ) def test_retrieve_partitioned_data( diff --git a/tests/ops/service/connectors/test_bigquery_queryconfig.py b/tests/ops/service/connectors/test_bigquery_queryconfig.py index 06c51c5105..24a16517b6 100644 --- a/tests/ops/service/connectors/test_bigquery_queryconfig.py +++ b/tests/ops/service/connectors/test_bigquery_queryconfig.py @@ -196,7 +196,7 @@ def test_generate_delete_stmt( ) stmts = set(str(stmt) for stmt in delete_stmts) expected_stmts = { - "DELETE FROM `employee` WHERE `employee`.`id` = %(id_1:STRING)s" + "DELETE FROM `employee` WHERE `employee`.`address_id` = %(address_id_1:STRING)s AND `employee`.`email` = %(email_1:STRING)s" } assert stmts == expected_stmts @@ -289,6 +289,6 @@ def test_generate_namespaced_delete_stmt( ) stmts = set(str(stmt) for stmt in delete_stmts) expected_stmts = { - "DELETE FROM `silken-precinct-284918.fidesopstest.employee` WHERE `silken-precinct-284918.fidesopstest.employee`.`id` = %(id_1:STRING)s" + "DELETE FROM `silken-precinct-284918.fidesopstest.employee` WHERE `silken-precinct-284918.fidesopstest.employee`.`address_id` = %(address_id_1:STRING)s AND `silken-precinct-284918.fidesopstest.employee`.`email` = %(email_1:STRING)s" } assert stmts == expected_stmts diff --git a/tests/ops/service/privacy_request/test_bigquery_enterprise_privacy_request.py b/tests/ops/service/privacy_request/test_bigquery_enterprise_privacy_request.py index 8fb7e29729..5a133c031f 100644 --- a/tests/ops/service/privacy_request/test_bigquery_enterprise_privacy_request.py +++ b/tests/ops/service/privacy_request/test_bigquery_enterprise_privacy_request.py @@ -1,27 +1,9 @@ -import time -from datetime import datetime, timezone -from typing import Any, Dict, List, Set from unittest import mock -from unittest.mock import ANY, Mock, call -from uuid import uuid4 -import pydash import pytest from fides.api.models.audit_log import AuditLog, AuditLogAction -from fides.api.models.privacy_request import ( - ActionType, - CheckpointActionRequired, - ExecutionLog, - ExecutionLogStatus, - PolicyPreWebhook, - PrivacyRequest, - PrivacyRequestStatus, -) -from fides.api.schemas.masking.masking_configuration import MaskingConfiguration -from fides.api.schemas.masking.masking_secrets import MaskingSecretCache -from fides.api.schemas.policy import Rule -from fides.api.service.masking.strategy.masking_strategy import MaskingStrategy +from fides.api.models.privacy_request import ExecutionLog from tests.ops.service.privacy_request.test_request_runner_service import ( get_privacy_request_results, ) @@ -54,7 +36,7 @@ def test_create_and_process_access_request_bigquery_enterprise( customer_email = "customer-1@example.com" user_id = ( - 1754 # this is a real (not generated) user id in the Stackoverflow dataset + 1754 # this is a real (not generated) user id in the Stack Overflow dataset ) data = { "requested_at": "2024-08-30T16:09:37.359Z", From dd8a3ad873e1c80e9f2692671fd96fab0c64db7f Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Tue, 10 Dec 2024 17:05:19 -0800 Subject: [PATCH 21/22] Setting requires_primary_keys for select connectors + updating tests --- .../api/service/connectors/base_connector.py | 8 +- .../service/connectors/bigquery_connector.py | 1 + .../service/connectors/postgres_connector.py | 5 + .../connectors/query_configs/query_config.py | 4 +- .../api/service/connectors/saas_connector.py | 2 + .../service/connectors/scylla_connector.py | 5 + .../service/connectors/scylla_query_config.py | 11 +- .../postgres_example_test_dataset.yml | 18 ++ .../service/connectors/test_query_config.py | 205 +++++++++++++++++- .../connectors/test_snowflake_query_config.py | 4 +- tests/ops/task/test_create_request_tasks.py | 2 +- tests/ops/test_helpers/dataset_utils.py | 30 ++- 12 files changed, 281 insertions(+), 14 deletions(-) diff --git a/src/fides/api/service/connectors/base_connector.py b/src/fides/api/service/connectors/base_connector.py index 4bf46e5eca..e1f735df1c 100644 --- a/src/fides/api/service/connectors/base_connector.py +++ b/src/fides/api/service/connectors/base_connector.py @@ -135,5 +135,11 @@ def execute_standalone_retrieval_query( @property def requires_primary_keys(self) -> bool: - """Indicates if datasets linked to this connector require primary keys for erasures. Defaults to True.""" + """ + Indicates if datasets linked to this connector require primary keys for erasures. + Defaults to True. + """ + + # Defaulting to true for now so we can keep the default behavior and + # incrementally determine the need for primary keys across all connectors return True diff --git a/src/fides/api/service/connectors/bigquery_connector.py b/src/fides/api/service/connectors/bigquery_connector.py index 4c52b3b3f6..ae6fe4b909 100644 --- a/src/fides/api/service/connectors/bigquery_connector.py +++ b/src/fides/api/service/connectors/bigquery_connector.py @@ -35,6 +35,7 @@ class BigQueryConnector(SQLConnector): @property def requires_primary_keys(self) -> bool: + """BigQuery does not have the concept of primary keys so they're not required for erasures.""" return False # Overrides BaseConnector.build_uri diff --git a/src/fides/api/service/connectors/postgres_connector.py b/src/fides/api/service/connectors/postgres_connector.py index 5354d4ec13..2abafc01c8 100644 --- a/src/fides/api/service/connectors/postgres_connector.py +++ b/src/fides/api/service/connectors/postgres_connector.py @@ -19,6 +19,11 @@ class PostgreSQLConnector(SQLConnector): secrets_schema = PostgreSQLSchema + @property + def requires_primary_keys(self) -> bool: + """Postgres allows arbitrary columns in the WHERE clause for updates so primary keys are not required.""" + return False + def build_uri(self) -> str: """Build URI of format postgresql://[user[:password]@][netloc][:port][/dbname]""" config = self.secrets_schema(**self.configuration.secrets or {}) diff --git a/src/fides/api/service/connectors/query_configs/query_config.py b/src/fides/api/service/connectors/query_configs/query_config.py index c54eecff85..9f5ddb0251 100644 --- a/src/fides/api/service/connectors/query_configs/query_config.py +++ b/src/fides/api/service/connectors/query_configs/query_config.py @@ -430,7 +430,7 @@ def get_update_stmt( def get_update_clauses( self, update_value_map: Dict[str, Any], - non_empty_reference_fields: Dict[str, Field], + where_clause_fields: Dict[str, Field], ) -> List[str]: """Returns a list of update clauses for the update statement.""" @@ -567,7 +567,7 @@ def format_key_map_for_update_stmt(self, param_map: Dict[str, Any]) -> List[str] def get_update_clauses( self, update_value_map: Dict[str, Any], - non_empty_reference_fields: Dict[str, Field], + where_clause_fields: Dict[str, Field], ) -> List[str]: """Returns a list of update clauses for the update statement.""" return self.format_key_map_for_update_stmt(update_value_map) diff --git a/src/fides/api/service/connectors/saas_connector.py b/src/fides/api/service/connectors/saas_connector.py index 40a4d8a7eb..b1101467bf 100644 --- a/src/fides/api/service/connectors/saas_connector.py +++ b/src/fides/api/service/connectors/saas_connector.py @@ -72,7 +72,9 @@ class SaaSConnector(BaseConnector[AuthenticatedClient], Contextualizable): """A connector type to integrate with third-party SaaS APIs""" + @property def requires_primary_keys(self) -> bool: + """SaaS connectors work with HTTP requests, so the database concept of primary keys does not apply.""" return False def get_log_context(self) -> Dict[LoggerContextKeys, Any]: diff --git a/src/fides/api/service/connectors/scylla_connector.py b/src/fides/api/service/connectors/scylla_connector.py index 43a821930c..ff17674b88 100644 --- a/src/fides/api/service/connectors/scylla_connector.py +++ b/src/fides/api/service/connectors/scylla_connector.py @@ -28,6 +28,11 @@ class ScyllaConnectorMissingKeyspace(Exception): class ScyllaConnector(BaseConnector[Cluster]): """Scylla Connector""" + @property + def requires_primary_keys(self) -> bool: + """ScyllaDB requires primary keys for erasures.""" + return True + def build_uri(self) -> str: """ Builds URI - Not yet implemented diff --git a/src/fides/api/service/connectors/scylla_query_config.py b/src/fides/api/service/connectors/scylla_query_config.py index 5e93668459..1fa52d573d 100644 --- a/src/fides/api/service/connectors/scylla_query_config.py +++ b/src/fides/api/service/connectors/scylla_query_config.py @@ -77,14 +77,19 @@ def format_key_map_for_update_stmt(self, param_map: Dict[str, Any]) -> List[str] def get_update_clauses( self, update_value_map: Dict[str, Any], - non_empty_reference_fields: Dict[str, Field], + where_clause_fields: Dict[str, Field], ) -> List[str]: - """Returns a list of update clauses for the update statement.""" + """Returns a list of update clauses for the update statement. + + Omits primary key fields from updates since ScyllaDB prohibits + updating primary key fields. + """ + return self.format_key_map_for_update_stmt( { key: value for key, value in update_value_map.items() - if key not in non_empty_reference_fields + if key not in where_clause_fields } ) diff --git a/src/fides/data/sample_project/sample_resources/postgres_example_test_dataset.yml b/src/fides/data/sample_project/sample_resources/postgres_example_test_dataset.yml index 768c972d99..e519a75008 100644 --- a/src/fides/data/sample_project/sample_resources/postgres_example_test_dataset.yml +++ b/src/fides/data/sample_project/sample_resources/postgres_example_test_dataset.yml @@ -11,6 +11,8 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -36,6 +38,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -58,6 +62,8 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] + fides_meta: + primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -74,6 +80,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: time data_categories: [user.sensor] @@ -88,6 +96,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: @@ -138,6 +148,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: name data_categories: [user.financial] - name: preferred @@ -147,6 +159,8 @@ dataset: fields: - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: name data_categories: [system.operations] - name: price @@ -161,6 +175,8 @@ dataset: data_type: string - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: month data_categories: [system.operations] - name: name @@ -193,6 +209,8 @@ dataset: direction: from - name: id data_categories: [system.operations] + fides_meta: + primary_key: True - name: opened data_categories: [system.operations] diff --git a/tests/ops/service/connectors/test_query_config.py b/tests/ops/service/connectors/test_query_config.py index 2aa0871255..eac650d587 100644 --- a/tests/ops/service/connectors/test_query_config.py +++ b/tests/ops/service/connectors/test_query_config.py @@ -21,6 +21,7 @@ from fides.api.service.masking.strategy.masking_strategy_hash import HashMaskingStrategy from fides.api.util.data_category import DataCategory from tests.fixtures.application_fixtures import load_dataset +from tests.ops.test_helpers.dataset_utils import remove_primary_keys from ...task.traversal_data import integration_db_graph from ...test_helpers.cache_secrets_helper import cache_secret, clear_cache_secrets @@ -273,7 +274,7 @@ def test_generate_update_stmt_one_field( text_clause = config.generate_update_stmt(row, erasure_policy, privacy_request) assert ( text_clause.text - == """UPDATE customer SET name = :masked_name WHERE email = :email""" + == """UPDATE customer SET name = :masked_name WHERE id = :id""" ) assert text_clause._bindparams["masked_name"].key == "masked_name" assert ( @@ -341,7 +342,7 @@ def test_generate_update_stmt_length_truncation( ) assert ( text_clause.text - == """UPDATE customer SET name = :masked_name WHERE email = :email""" + == """UPDATE customer SET name = :masked_name WHERE id = :id""" ) assert text_clause._bindparams["masked_name"].key == "masked_name" # length truncation on name field @@ -391,7 +392,7 @@ def test_generate_update_stmt_multiple_fields_same_rule( text_clause = config.generate_update_stmt(row, erasure_policy, privacy_request) assert ( text_clause.text - == "UPDATE customer SET email = :masked_email, name = :masked_name WHERE email = :email" + == "UPDATE customer SET email = :masked_email, name = :masked_name WHERE id = :id" ) assert text_clause._bindparams["masked_name"].key == "masked_name" # since length is set to 40 in dataset.yml, we expect only first 40 chars of masked val @@ -407,7 +408,7 @@ def test_generate_update_stmt_multiple_fields_same_rule( ["customer-1@example.com"], request_id=privacy_request.id )[0] ) - assert text_clause._bindparams["email"].value == "customer-1@example.com" + assert text_clause._bindparams["id"].value == 1 clear_cache_secrets(privacy_request.id) def test_generate_update_stmts_from_multiple_rules( @@ -434,6 +435,201 @@ def test_generate_update_stmts_from_multiple_rules( row, erasure_policy_two_rules, privacy_request ) + assert ( + text_clause.text + == "UPDATE customer SET email = :masked_email, name = :masked_name WHERE id = :id" + ) + # Two different masking strategies used for name and email + assert ( + text_clause._bindparams["masked_name"].value is None + ) # Null masking strategy + assert ( + text_clause._bindparams["masked_email"].value == "*****" + ) # String rewrite masking strategy + + def test_generate_update_stmt_one_field_without_primary_keys( + self, erasure_policy, example_datasets, connection_config + ): + dataset = remove_primary_keys(Dataset(**example_datasets[0])) + graph = convert_dataset_to_graph(dataset, connection_config.key) + dataset_graph = DatasetGraph(*[graph]) + traversal = Traversal(dataset_graph, {"email": "customer-1@example.com"}) + + customer_node = traversal.traversal_node_dict[ + CollectionAddress("postgres_example_test_dataset", "customer") + ].to_mock_execution_node() + + config = SQLQueryConfig(customer_node) + row = { + "email": "customer-1@example.com", + "name": "John Customer", + "address_id": 1, + "id": 1, + } + text_clause = config.generate_update_stmt(row, erasure_policy, privacy_request) + assert ( + text_clause.text + == """UPDATE customer SET name = :masked_name WHERE email = :email""" + ) + assert text_clause._bindparams["masked_name"].key == "masked_name" + assert ( + text_clause._bindparams["masked_name"].value is None + ) # Null masking strategy + + def test_generate_update_stmt_one_field_inbound_reference_without_primary_keys( + self, erasure_policy_address_city, example_datasets, connection_config + ): + dataset = remove_primary_keys(Dataset(**example_datasets[0])) + graph = convert_dataset_to_graph(dataset, connection_config.key) + dataset_graph = DatasetGraph(*[graph]) + traversal = Traversal(dataset_graph, {"email": "customer-1@example.com"}) + + address_node = traversal.traversal_node_dict[ + CollectionAddress("postgres_example_test_dataset", "address") + ].to_mock_execution_node() + + config = SQLQueryConfig(address_node) + row = { + "id": 1, + "house": "123", + "street": "Main St", + "city": "San Francisco", + "state": "CA", + "zip": "94105", + } + text_clause = config.generate_update_stmt( + row, erasure_policy_address_city, privacy_request + ) + assert ( + text_clause.text + == """UPDATE address SET city = :masked_city WHERE id = :id""" + ) + assert text_clause._bindparams["masked_city"].key == "masked_city" + assert ( + text_clause._bindparams["masked_city"].value is None + ) # Null masking strategy + + def test_generate_update_stmt_length_truncation_without_primary_keys( + self, + erasure_policy_string_rewrite_long, + example_datasets, + connection_config, + ): + dataset = remove_primary_keys(Dataset(**example_datasets[0])) + graph = convert_dataset_to_graph(dataset, connection_config.key) + dataset_graph = DatasetGraph(*[graph]) + traversal = Traversal(dataset_graph, {"email": "customer-1@example.com"}) + + customer_node = traversal.traversal_node_dict[ + CollectionAddress("postgres_example_test_dataset", "customer") + ].to_mock_execution_node() + + config = SQLQueryConfig(customer_node) + row = { + "email": "customer-1@example.com", + "name": "John Customer", + "address_id": 1, + "id": 1, + } + + text_clause = config.generate_update_stmt( + row, erasure_policy_string_rewrite_long, privacy_request + ) + assert ( + text_clause.text + == """UPDATE customer SET name = :masked_name WHERE email = :email""" + ) + assert text_clause._bindparams["masked_name"].key == "masked_name" + # length truncation on name field + assert ( + text_clause._bindparams["masked_name"].value + == "some rewrite value that is very long and" + ) + + def test_generate_update_stmt_multiple_fields_same_rule_without_primary_keys( + self, erasure_policy, example_datasets, connection_config + ): + dataset = remove_primary_keys(Dataset(**example_datasets[0])) + graph = convert_dataset_to_graph(dataset, connection_config.key) + dataset_graph = DatasetGraph(*[graph]) + traversal = Traversal(dataset_graph, {"email": "customer-1@example.com"}) + + customer_node = traversal.traversal_node_dict[ + CollectionAddress("postgres_example_test_dataset", "customer") + ].to_mock_execution_node() + + config = SQLQueryConfig(customer_node) + row = { + "email": "customer-1@example.com", + "name": "John Customer", + "address_id": 1, + "id": 1, + } + + # Make target more broad + rule = erasure_policy.rules[0] + target = rule.targets[0] + target.data_category = DataCategory("user").value + + # Update rule masking strategy + rule.masking_strategy = { + "strategy": "hash", + "configuration": {"algorithm": "SHA-512"}, + } + # cache secrets for hash strategy + secret = MaskingSecretCache[str]( + secret="adobo", + masking_strategy=HashMaskingStrategy.name, + secret_type=SecretType.salt, + ) + cache_secret(secret, privacy_request.id) + + text_clause = config.generate_update_stmt(row, erasure_policy, privacy_request) + assert ( + text_clause.text + == "UPDATE customer SET email = :masked_email, name = :masked_name WHERE email = :email" + ) + assert text_clause._bindparams["masked_name"].key == "masked_name" + # since length is set to 40 in dataset.yml, we expect only first 40 chars of masked val + assert ( + text_clause._bindparams["masked_name"].value + == HashMaskingStrategy(HashMaskingConfiguration(algorithm="SHA-512")).mask( + ["John Customer"], request_id=privacy_request.id + )[0][0:40] + ) + assert ( + text_clause._bindparams["masked_email"].value + == HashMaskingStrategy(HashMaskingConfiguration(algorithm="SHA-512")).mask( + ["customer-1@example.com"], request_id=privacy_request.id + )[0] + ) + assert text_clause._bindparams["email"].value == "customer-1@example.com" + clear_cache_secrets(privacy_request.id) + + def test_generate_update_stmts_from_multiple_rules_without_primary_keys( + self, erasure_policy_two_rules, example_datasets, connection_config + ): + dataset = remove_primary_keys(Dataset(**example_datasets[0])) + graph = convert_dataset_to_graph(dataset, connection_config.key) + dataset_graph = DatasetGraph(*[graph]) + traversal = Traversal(dataset_graph, {"email": "customer-1@example.com"}) + row = { + "email": "customer-1@example.com", + "name": "John Customer", + "address_id": 1, + "id": 1, + } + + customer_node = traversal.traversal_node_dict[ + CollectionAddress("postgres_example_test_dataset", "customer") + ].to_mock_execution_node() + + config = SQLQueryConfig(customer_node) + + text_clause = config.generate_update_stmt( + row, erasure_policy_two_rules, privacy_request + ) + assert ( text_clause.text == "UPDATE customer SET email = :masked_email, name = :masked_name WHERE email = :email" @@ -446,6 +642,7 @@ def test_generate_update_stmts_from_multiple_rules( text_clause._bindparams["masked_email"].value == "*****" ) # String rewrite masking strategy + class TestSQLLikeQueryConfig: def test_missing_namespace_meta_schema(self): diff --git a/tests/ops/service/connectors/test_snowflake_query_config.py b/tests/ops/service/connectors/test_snowflake_query_config.py index 5521a1a88a..4f4b23b8c4 100644 --- a/tests/ops/service/connectors/test_snowflake_query_config.py +++ b/tests/ops/service/connectors/test_snowflake_query_config.py @@ -150,7 +150,7 @@ def test_generate_update_stmt( ) assert ( str(update_stmt) - == 'UPDATE "address" SET "city" = :city, "house" = :house, "state" = :state, "street" = :street, "zip" = :zip WHERE "id" = :id' + == 'UPDATE "address" SET "city" = :masked_city, "house" = :masked_house, "state" = :masked_state, "street" = :masked_street, "zip" = :masked_zip WHERE "id" = :id' ) def test_generate_namespaced_update_stmt( @@ -191,5 +191,5 @@ def test_generate_namespaced_update_stmt( ) assert ( str(update_stmt) - == 'UPDATE "FIDESOPS_TEST"."TEST"."address" SET "city" = :city, "house" = :house, "state" = :state, "street" = :street, "zip" = :zip WHERE "id" = :id' + == 'UPDATE "FIDESOPS_TEST"."TEST"."address" SET "city" = :masked_city, "house" = :masked_house, "state" = :masked_state, "street" = :masked_street, "zip" = :masked_zip WHERE "id" = :id' ) diff --git a/tests/ops/task/test_create_request_tasks.py b/tests/ops/task/test_create_request_tasks.py index 290c2dc1be..ad118ee46c 100644 --- a/tests/ops/task/test_create_request_tasks.py +++ b/tests/ops/task/test_create_request_tasks.py @@ -927,7 +927,7 @@ def test_erase_after_saas_upstream_and_downstream_tasks( "is_array": False, "read_only": None, "references": [], - "primary_key": True, + "primary_key": False, "data_categories": ["system.operations"], "data_type_converter": "integer", "return_all_elements": None, diff --git a/tests/ops/test_helpers/dataset_utils.py b/tests/ops/test_helpers/dataset_utils.py index e60efb9892..d51e1f47ff 100644 --- a/tests/ops/test_helpers/dataset_utils.py +++ b/tests/ops/test_helpers/dataset_utils.py @@ -13,7 +13,11 @@ ) from fides.api.graph.data_type import DataType, get_data_type, to_data_type_string from fides.api.models.connectionconfig import ConnectionConfig -from fides.api.models.datasetconfig import DatasetConfig, convert_dataset_to_graph +from fides.api.models.datasetconfig import ( + DatasetConfig, + DatasetField, + convert_dataset_to_graph, +) from fides.api.util.collection_util import Row SAAS_DATASET_DIRECTORY = "data/saas/dataset/" @@ -231,3 +235,27 @@ def get_simple_fields(fields: Iterable[Field]) -> List[Dict[str, Any]]: object["fields"] = get_simple_fields(field.fields.values()) object_list.append(object) return object_list + + +def remove_primary_keys(dataset: Dataset) -> Dataset: + """Returns a copy of the dataset with primary key fields removed from fides_meta.""" + dataset_copy = dataset.model_copy(deep=True) + + for collection in dataset_copy.collections: + for field in collection.fields: + if field.fides_meta: + if field.fides_meta.primary_key: + field.fides_meta.primary_key = None + if field.fields: + _remove_nested_primary_keys(field.fields) + + return dataset_copy + + +def _remove_nested_primary_keys(fields: List[DatasetField]) -> None: + """Helper function to recursively remove primary keys from nested fields.""" + for field in fields: + if field.fides_meta and field.fides_meta.primary_key: + field.fides_meta.primary_key = None + if field.fields: + _remove_nested_primary_keys(field.fields) From 3c3c63c0d25798cea9ff5e9c4e5213d17aa7be9e Mon Sep 17 00:00:00 2001 From: Adrian Galvan Date: Tue, 10 Dec 2024 17:08:49 -0800 Subject: [PATCH 22/22] Revert setting requires_primary_keys to False for SaaS connectors --- src/fides/api/service/connectors/saas_connector.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/fides/api/service/connectors/saas_connector.py b/src/fides/api/service/connectors/saas_connector.py index b1101467bf..b917b6cfda 100644 --- a/src/fides/api/service/connectors/saas_connector.py +++ b/src/fides/api/service/connectors/saas_connector.py @@ -72,11 +72,6 @@ class SaaSConnector(BaseConnector[AuthenticatedClient], Contextualizable): """A connector type to integrate with third-party SaaS APIs""" - @property - def requires_primary_keys(self) -> bool: - """SaaS connectors work with HTTP requests, so the database concept of primary keys does not apply.""" - return False - def get_log_context(self) -> Dict[LoggerContextKeys, Any]: return { LoggerContextKeys.system_key: (