diff --git a/.github/workflows/backend_checks.yml b/.github/workflows/backend_checks.yml index e1cc215070..eeaf3dda2c 100644 --- a/.github/workflows/backend_checks.yml +++ b/.github/workflows/backend_checks.yml @@ -256,6 +256,7 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 45 + continue-on-error: true steps: - name: Download container uses: actions/download-artifact@v4 diff --git a/data/dataset/bigquery_enterprise_test_dataset.yml b/data/dataset/bigquery_enterprise_test_dataset.yml index 59d27e68a2..64668192d0 100644 --- a/data/dataset/bigquery_enterprise_test_dataset.yml +++ b/data/dataset/bigquery_enterprise_test_dataset.yml @@ -1,405 +1,149 @@ dataset: - - fides_key: enterprise_dsr_testing - organization_fides_key: default_organization - tags: null - name: Bigquery Enterprise Test Dataset - description: BigQuery dataset containing real data - meta: null - data_categories: null - fides_meta: - resource_id: enterprise_dsr_testing.prj-sandbox-55855.enterprise_dsr_testing - after: null - namespace: - dataset_id: enterprise_dsr_testing - project_id: prj-sandbox-55855 - collections: - - name: comments - description: null - data_categories: null - fields: - - name: creation_date - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: id - description: null - data_categories: - - system.operations - fides_meta: - references: null - identity: null - primary_key: true - data_type: integer - length: null - return_all_elements: null - read_only: null - custom_request_field: null - fields: null - - name: post_id - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: score - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: text - description: null - data_categories: - - user.contact - fides_meta: null - fields: null - - name: user_display_name - description: null - data_categories: - - user.contact - fides_meta: null - fields: null - - name: user_id - description: null - data_categories: - - user.contact - fides_meta: - references: - - dataset: enterprise_dsr_testing - field: users.id - direction: from - identity: null - primary_key: null - data_type: null - length: null - return_all_elements: null - read_only: null - custom_request_field: null - fields: null - fides_meta: null - - name: post_history - description: null - data_categories: null - fields: - - name: comment - description: null - data_categories: - - user.contact - fides_meta: null - fields: null - - name: creation_date - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: id - description: null - data_categories: - - system.operations - fides_meta: - references: null - identity: null - primary_key: true - data_type: integer - length: null - return_all_elements: null - read_only: null - custom_request_field: null - fields: null - - name: post_history_type_id - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: post_id - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: revision_guid - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: text - description: null - data_categories: - - user.contact - fides_meta: null - fields: null - - name: user_id - description: null - data_categories: - - system.operations - fides_meta: - references: - - dataset: enterprise_dsr_testing - field: users.id - direction: from - identity: null - primary_key: null - data_type: null - length: null - return_all_elements: null - read_only: null - custom_request_field: null - fields: null - fides_meta: null - - name: stackoverflow_posts - description: null - data_categories: null - fields: - - name: accepted_answer_id - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: answer_count - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: body - description: null - data_categories: - - user.contact - fides_meta: null - fields: null - - name: comment_count - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: community_owned_date - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: creation_date - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: favorite_count - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: id - description: null - data_categories: - - system.operations - fides_meta: - references: null - identity: null - primary_key: true - data_type: integer - length: null - return_all_elements: null - read_only: null - custom_request_field: null - fields: null - - name: last_activity_date - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: last_edit_date - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: last_editor_display_name - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: last_editor_user_id - description: null - data_categories: - - system.operations - fides_meta: - references: - - dataset: enterprise_dsr_testing - field: users.id - direction: from - identity: null - primary_key: null - data_type: null - length: null - return_all_elements: null - read_only: null - custom_request_field: null - fields: null - - name: owner_display_name - description: null - data_categories: - - user.contact - fides_meta: null - fields: null - - name: owner_user_id - description: null - data_categories: - - system.operations - fides_meta: - references: - - dataset: enterprise_dsr_testing - field: users.id - direction: from - identity: null - primary_key: null - data_type: integer - length: null - return_all_elements: null - read_only: null - custom_request_field: null - fields: null - - name: parent_id - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: post_type_id - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: score - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: tags - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: title - description: null - data_categories: - - user.contact - fides_meta: null - fields: null - - name: view_count - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - fides_meta: null - - name: users - description: null - data_categories: null - fields: - - name: about_me - description: null - data_categories: - - user.contact - fides_meta: null - fields: null - - name: age - description: null - data_categories: - - user - fides_meta: null - fields: null - - name: creation_date - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: display_name - description: null - data_categories: - - user.contact - fides_meta: null - fields: null - - name: down_votes - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: id - description: null - data_categories: - - user.contact - fides_meta: - references: null - identity: stackoverflow_user_id - primary_key: true - data_type: integer - length: null - return_all_elements: null - read_only: null - custom_request_field: null - fields: null - - name: last_access_date - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: location - description: null - data_categories: - - user.contact - fides_meta: null - fields: null - - name: profile_image_url - description: null - data_categories: - - user.contact - fides_meta: null - fields: null - - name: reputation - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: up_votes - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: views - description: null - data_categories: - - system.operations - fides_meta: null - fields: null - - name: website_url - description: null - data_categories: - - user - fides_meta: null - fields: null - fides_meta: - after: null - erase_after: - - enterprise_dsr_testing.comments - skip_processing: false - masking_strategy_override: null - partitioning: null + - fides_key: enterprise_dsr_testing + organization_fides_key: default_organization + name: Bigquery Enterprise Test Dataset + description: BigQuery dataset containing real data + fides_meta: + resource_id: enterprise_dsr_testing.prj-sandbox-55855.enterprise_dsr_testing + namespace: + dataset_id: enterprise_dsr_testing + project_id: prj-sandbox-55855 + collections: + - name: comments + fields: + - name: creation_date + data_categories: [system.operations] + - name: id + data_categories: [system.operations] + fides_meta: + data_type: integer + - name: post_id + data_categories: [system.operations] + - name: score + data_categories: [system.operations] + - name: text + data_categories: [user.contact] + - name: user_display_name + data_categories: [user.contact] + - name: user_id + data_categories: [user.contact] + fides_meta: + references: + - dataset: enterprise_dsr_testing + field: users.id + direction: from + - name: post_history + fields: + - name: comment + data_categories: [user.contact] + - name: creation_date + data_categories: [system.operations] + - name: id + data_categories: [system.operations] + fides_meta: + data_type: integer + - name: post_history_type_id + data_categories: [system.operations] + - name: post_id + data_categories: [system.operations] + - name: revision_guid + data_categories: [system.operations] + - name: text + data_categories: [user.contact] + - name: user_id + data_categories: [system.operations] + fides_meta: + references: + - dataset: enterprise_dsr_testing + field: users.id + direction: from + - name: stackoverflow_posts + fields: + - name: accepted_answer_id + data_categories: [system.operations] + - name: answer_count + data_categories: [system.operations] + - name: body + data_categories: [user.contact] + - name: comment_count + data_categories: [system.operations] + - name: community_owned_date + data_categories: [system.operations] + - name: creation_date + data_categories: [system.operations] + - name: favorite_count + data_categories: [system.operations] + - name: id + data_categories: [system.operations] + fides_meta: + data_type: integer + - name: last_activity_date + data_categories: [system.operations] + - name: last_edit_date + data_categories: [system.operations] + - name: last_editor_display_name + data_categories: [system.operations] + - name: last_editor_user_id + data_categories: [system.operations] + fides_meta: + references: + - dataset: enterprise_dsr_testing + field: users.id + direction: from + - name: owner_display_name + data_categories: [user.contact] + - name: owner_user_id + data_categories: [system.operations] + fides_meta: + references: + - dataset: enterprise_dsr_testing + field: users.id + direction: from + data_type: integer + - name: parent_id + data_categories: [system.operations] + - name: post_type_id + data_categories: [system.operations] + - name: score + data_categories: [system.operations] + - name: tags + data_categories: [system.operations] + - name: title + data_categories: [user.contact] + - name: view_count + data_categories: [system.operations] + - name: users + fields: + - name: about_me + data_categories: [user.contact] + - name: age + data_categories: [user] + - name: creation_date + data_categories: [system.operations] + - name: display_name + data_categories: [user.contact] + - name: down_votes + data_categories: [system.operations] + - name: id + data_categories: [user.contact] + fides_meta: + identity: stackoverflow_user_id + data_type: integer + - name: last_access_date + data_categories: [system.operations] + - name: location + data_categories: [user.contact] + - name: profile_image_url + data_categories: [user.contact] + - name: reputation + data_categories: [system.operations] + - name: up_votes + data_categories: [system.operations] + - name: views + data_categories: [system.operations] + - name: website_url + data_categories: [user] + fides_meta: + erase_after: + - enterprise_dsr_testing.comments + skip_processing: false diff --git a/data/dataset/bigquery_example_test_dataset.yml b/data/dataset/bigquery_example_test_dataset.yml index 11fdac1aba..c4ea16cb44 100644 --- a/data/dataset/bigquery_example_test_dataset.yml +++ b/data/dataset/bigquery_example_test_dataset.yml @@ -13,8 +13,6 @@ dataset: data_categories: [user.contact.address.street] - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: state data_categories: [user.contact.address.state] - name: street @@ -53,8 +51,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -80,8 +76,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: @@ -98,8 +92,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: time data_categories: [user.sensor] @@ -114,8 +106,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: shipping_address_id data_categories: [system.operations] fides_meta: @@ -166,8 +156,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: name data_categories: [user.financial] - name: preferred @@ -177,8 +165,6 @@ dataset: fields: - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: name data_categories: [system.operations] - name: price @@ -193,8 +179,6 @@ dataset: data_type: string - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: month data_categories: [system.operations] - name: name @@ -227,8 +211,6 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: opened data_categories: [system.operations] diff --git a/data/saas/dataset/hubspot_dataset.yml b/data/saas/dataset/hubspot_dataset.yml index 94f0bf43f9..406c800840 100644 --- a/data/saas/dataset/hubspot_dataset.yml +++ b/data/saas/dataset/hubspot_dataset.yml @@ -8,7 +8,6 @@ dataset: - name: id data_categories: [user.unique_id] fidesops_meta: - primary_key: True data_type: string - name: properties fidesops_meta: @@ -117,7 +116,6 @@ dataset: - name: id data_categories: [system.operations] fidesops_meta: - primary_key: True data_type: string - name: name data_categories: [system.operations] @@ -152,7 +150,6 @@ dataset: - name: id data_categories: [user.unique_id] fidesops_meta: - primary_key: True data_type: string - name: email data_categories: [user.contact.email] diff --git a/data/saas/dataset/mailchimp_dataset.yml b/data/saas/dataset/mailchimp_dataset.yml index 05e3e45a2e..46751b04a4 100644 --- a/data/saas/dataset/mailchimp_dataset.yml +++ b/data/saas/dataset/mailchimp_dataset.yml @@ -35,8 +35,6 @@ dataset: fields: - name: id data_categories: [user.unique_id] - fidesops_meta: - primary_key: True - name: list_id data_categories: [system.operations] - name: email_address diff --git a/data/saas/dataset/stripe_dataset.yml b/data/saas/dataset/stripe_dataset.yml index f8e7482ecf..5d1f101973 100644 --- a/data/saas/dataset/stripe_dataset.yml +++ b/data/saas/dataset/stripe_dataset.yml @@ -8,7 +8,6 @@ dataset: - name: id data_categories: [system.operations] fidesops_meta: - primary_key: True read_only: True data_type: string - name: object @@ -618,7 +617,6 @@ dataset: - name: id data_categories: [system.operations] fidesops_meta: - primary_key: True data_type: string - name: object data_categories: [system.operations] @@ -715,7 +713,6 @@ dataset: - name: id data_categories: [system.operations] fidesops_meta: - primary_key: True data_type: string - name: object data_categories: [system.operations] @@ -755,7 +752,6 @@ dataset: - name: id data_categories: [system.operations] fidesops_meta: - primary_key: True data_type: string - name: object data_categories: [system.operations] @@ -924,7 +920,6 @@ dataset: - name: id data_categories: [system.operations] fidesops_meta: - primary_key: True data_type: string - name: object data_categories: [system.operations] @@ -959,7 +954,6 @@ dataset: - name: id data_categories: [system.operations] fidesops_meta: - primary_key: True data_type: string - name: object data_categories: [system.operations] @@ -1236,7 +1230,6 @@ dataset: - name: id data_categories: [system.operations] fidesops_meta: - primary_key: True data_type: string - name: object data_categories: [system.operations] @@ -1325,7 +1318,6 @@ dataset: - name: id data_categories: [system.operations] fidesops_meta: - primary_key: true data_type: string - name: object data_categories: [system.operations] diff --git a/src/fides/api/service/connectors/base_connector.py b/src/fides/api/service/connectors/base_connector.py index ca3439f523..e1f735df1c 100644 --- a/src/fides/api/service/connectors/base_connector.py +++ b/src/fides/api/service/connectors/base_connector.py @@ -132,3 +132,14 @@ def execute_standalone_retrieval_query( raise NotImplementedError( "execute_standalone_retrieval_query must be implemented in a concrete subclass" ) + + @property + def requires_primary_keys(self) -> bool: + """ + Indicates if datasets linked to this connector require primary keys for erasures. + Defaults to True. + """ + + # Defaulting to true for now so we can keep the default behavior and + # incrementally determine the need for primary keys across all connectors + return True diff --git a/src/fides/api/service/connectors/bigquery_connector.py b/src/fides/api/service/connectors/bigquery_connector.py index 8b51f90842..ae6fe4b909 100644 --- a/src/fides/api/service/connectors/bigquery_connector.py +++ b/src/fides/api/service/connectors/bigquery_connector.py @@ -33,6 +33,11 @@ class BigQueryConnector(SQLConnector): secrets_schema = BigQuerySchema + @property + def requires_primary_keys(self) -> bool: + """BigQuery does not have the concept of primary keys so they're not required for erasures.""" + return False + # Overrides BaseConnector.build_uri def build_uri(self) -> str: """Build URI of format""" diff --git a/src/fides/api/service/connectors/postgres_connector.py b/src/fides/api/service/connectors/postgres_connector.py index 5354d4ec13..2abafc01c8 100644 --- a/src/fides/api/service/connectors/postgres_connector.py +++ b/src/fides/api/service/connectors/postgres_connector.py @@ -19,6 +19,11 @@ class PostgreSQLConnector(SQLConnector): secrets_schema = PostgreSQLSchema + @property + def requires_primary_keys(self) -> bool: + """Postgres allows arbitrary columns in the WHERE clause for updates so primary keys are not required.""" + return False + def build_uri(self) -> str: """Build URI of format postgresql://[user[:password]@][netloc][:port][/dbname]""" config = self.secrets_schema(**self.configuration.secrets or {}) diff --git a/src/fides/api/service/connectors/query_configs/bigquery_query_config.py b/src/fides/api/service/connectors/query_configs/bigquery_query_config.py index 681e2b9c60..6060ff5822 100644 --- a/src/fides/api/service/connectors/query_configs/bigquery_query_config.py +++ b/src/fides/api/service/connectors/query_configs/bigquery_query_config.py @@ -123,15 +123,15 @@ def generate_update( TODO: DRY up this method and `generate_delete` a bit """ update_value_map: Dict[str, Any] = self.update_value_map(row, policy, request) - non_empty_primary_keys: Dict[str, Field] = filter_nonempty_values( + non_empty_reference_field_keys: Dict[str, Field] = filter_nonempty_values( { fpath.string_path: fld.cast(row[fpath.string_path]) - for fpath, fld in self.primary_key_field_paths.items() + for fpath, fld in self.reference_field_paths.items() if fpath.string_path in row } ) - valid = len(non_empty_primary_keys) > 0 and update_value_map + valid = len(non_empty_reference_field_keys) > 0 and update_value_map if not valid: logger.warning( "There is not enough data to generate a valid update statement for {}", @@ -140,8 +140,8 @@ def generate_update( return [] table = Table(self._generate_table_name(), MetaData(bind=client), autoload=True) - pk_clauses: List[ColumnElement] = [ - getattr(table.c, k) == v for k, v in non_empty_primary_keys.items() + where_clauses: List[ColumnElement] = [ + getattr(table.c, k) == v for k, v in non_empty_reference_field_keys.items() ] if self.partitioning: @@ -153,13 +153,13 @@ def generate_update( for partition_clause in partition_clauses: partitioned_queries.append( table.update() - .where(*(pk_clauses + [text(partition_clause)])) + .where(*(where_clauses + [text(partition_clause)])) .values(**update_value_map) ) return partitioned_queries - return [table.update().where(*pk_clauses).values(**update_value_map)] + return [table.update().where(*where_clauses).values(**update_value_map)] def generate_delete(self, row: Row, client: Engine) -> List[Delete]: """Returns a List of SQLAlchemy DELETE statements for BigQuery. Does not actually execute the delete statement. @@ -172,15 +172,15 @@ def generate_delete(self, row: Row, client: Engine) -> List[Delete]: TODO: DRY up this method and `generate_update` a bit """ - non_empty_primary_keys: Dict[str, Field] = filter_nonempty_values( + non_empty_reference_field_keys: Dict[str, Field] = filter_nonempty_values( { fpath.string_path: fld.cast(row[fpath.string_path]) - for fpath, fld in self.primary_key_field_paths.items() + for fpath, fld in self.reference_field_paths.items() if fpath.string_path in row } ) - valid = len(non_empty_primary_keys) > 0 + valid = len(non_empty_reference_field_keys) > 0 if not valid: logger.warning( "There is not enough data to generate a valid DELETE statement for {}", @@ -189,8 +189,8 @@ def generate_delete(self, row: Row, client: Engine) -> List[Delete]: return [] table = Table(self._generate_table_name(), MetaData(bind=client), autoload=True) - pk_clauses: List[ColumnElement] = [ - getattr(table.c, k) == v for k, v in non_empty_primary_keys.items() + where_clauses: List[ColumnElement] = [ + getattr(table.c, k) == v for k, v in non_empty_reference_field_keys.items() ] if self.partitioning: @@ -202,9 +202,9 @@ def generate_delete(self, row: Row, client: Engine) -> List[Delete]: for partition_clause in partition_clauses: partitioned_queries.append( - table.delete().where(*(pk_clauses + [text(partition_clause)])) + table.delete().where(*(where_clauses + [text(partition_clause)])) ) return partitioned_queries - return [table.delete().where(*pk_clauses)] + return [table.delete().where(*where_clauses)] diff --git a/src/fides/api/service/connectors/query_configs/mongodb_query_config.py b/src/fides/api/service/connectors/query_configs/mongodb_query_config.py index bd650723f4..1a6aa303f0 100644 --- a/src/fides/api/service/connectors/query_configs/mongodb_query_config.py +++ b/src/fides/api/service/connectors/query_configs/mongodb_query_config.py @@ -69,21 +69,21 @@ def generate_update_stmt( """Generate a SQL update statement in the form of Mongo update statement components""" update_clauses = self.update_value_map(row, policy, request) - pk_clauses: Dict[str, Any] = filter_nonempty_values( + where_clauses: Dict[str, Any] = filter_nonempty_values( { field_path.string_path: field.cast(row[field_path.string_path]) for field_path, field in self.primary_key_field_paths.items() } ) - valid = len(pk_clauses) > 0 and len(update_clauses) > 0 + valid = len(where_clauses) > 0 and len(update_clauses) > 0 if not valid: logger.warning( "There is not enough data to generate a valid update for {}", self.node.address, ) return None - return pk_clauses, {"$set": update_clauses} + return where_clauses, {"$set": update_clauses} def query_to_str(self, t: MongoStatement, input_data: Dict[str, List[Any]]) -> str: """string representation of a query for logging/dry-run""" diff --git a/src/fides/api/service/connectors/query_configs/query_config.py b/src/fides/api/service/connectors/query_configs/query_config.py index 6e868964af..9f5ddb0251 100644 --- a/src/fides/api/service/connectors/query_configs/query_config.py +++ b/src/fides/api/service/connectors/query_configs/query_config.py @@ -100,6 +100,15 @@ def primary_key_field_paths(self) -> Dict[FieldPath, Field]: if field.primary_key } + @property + def reference_field_paths(self) -> Dict[FieldPath, Field]: + """Mapping of FieldPaths to Fields that have incoming identity or dataset references""" + return { + field_path: field + for field_path, field in self.field_map().items() + if field_path in {edge.f2.field_path for edge in self.node.incoming_edges} + } + def query_sources(self) -> Dict[str, List[CollectionAddress]]: """Display the input collection(s) for each query key for display purposes. @@ -412,14 +421,16 @@ def generate_query_without_tuples( # pylint: disable=R0914 def get_update_stmt( self, update_clauses: List[str], - pk_clauses: List[str], + where_clauses: List[str], ) -> str: """Returns a SQL UPDATE statement to fit SQL syntax.""" - return f"UPDATE {self.node.address.collection} SET {', '.join(update_clauses)} WHERE {' AND '.join(pk_clauses)}" + return f"UPDATE {self.node.address.collection} SET {', '.join(update_clauses)} WHERE {' AND '.join(where_clauses)}" @abstractmethod def get_update_clauses( - self, update_value_map: Dict[str, Any], non_empty_primary_keys: Dict[str, Field] + self, + update_value_map: Dict[str, Any], + where_clause_fields: Dict[str, Field], ) -> List[str]: """Returns a list of update clauses for the update statement.""" @@ -428,7 +439,7 @@ def format_query_stmt(self, query_str: str, update_value_map: Dict[str, Any]) -> """Returns a formatted update statement in the appropriate dialect.""" @abstractmethod - def format_key_map_for_update_stmt(self, fields: List[str]) -> List[str]: + def format_key_map_for_update_stmt(self, param_map: Dict[str, Any]) -> List[str]: """Adds the appropriate formatting for update statements in this datastore.""" def generate_update_stmt( @@ -436,7 +447,8 @@ def generate_update_stmt( ) -> Optional[T]: """Returns an update statement in generic SQL-ish dialect.""" update_value_map: Dict[str, Any] = self.update_value_map(row, policy, request) - non_empty_primary_keys: Dict[str, Field] = filter_nonempty_values( + + non_empty_primary_key_fields: Dict[str, Field] = filter_nonempty_values( { fpath.string_path: fld.cast(row[fpath.string_path]) for fpath, fld in self.primary_key_field_paths.items() @@ -444,17 +456,30 @@ def generate_update_stmt( } ) + non_empty_reference_fields: Dict[str, Field] = filter_nonempty_values( + { + fpath.string_path: fld.cast(row[fpath.string_path]) + for fpath, fld in self.reference_field_paths.items() + if fpath.string_path in row + } + ) + + # Create parameter mappings with masked_ prefix for SET values + param_map = { + **{f"masked_{k}": v for k, v in update_value_map.items()}, + **non_empty_primary_key_fields, + **non_empty_reference_fields, + } + update_clauses = self.get_update_clauses( - update_value_map, non_empty_primary_keys + {k: f"masked_{k}" for k in update_value_map}, + non_empty_primary_key_fields or non_empty_reference_fields, ) - pk_clauses = self.format_key_map_for_update_stmt( - list(non_empty_primary_keys.keys()) + where_clauses = self.format_key_map_for_update_stmt( + {k: k for k in non_empty_primary_key_fields or non_empty_reference_fields} ) - for k, v in non_empty_primary_keys.items(): - update_value_map[k] = v - - valid = len(pk_clauses) > 0 and len(update_clauses) > 0 + valid = len(where_clauses) > 0 and len(update_clauses) > 0 if not valid: logger.warning( "There is not enough data to generate a valid update statement for {}", @@ -462,12 +487,9 @@ def generate_update_stmt( ) return None - query_str = self.get_update_stmt( - update_clauses, - pk_clauses, - ) - logger.info("query = {}, params = {}", Pii(query_str), Pii(update_value_map)) - return self.format_query_stmt(query_str, update_value_map) + query_str = self.get_update_stmt(update_clauses, where_clauses) + logger.info("query = {}, params = {}", Pii(query_str), Pii(param_map)) + return self.format_query_stmt(query_str, param_map) class SQLQueryConfig(SQLLikeQueryConfig[Executable]): @@ -538,16 +560,17 @@ def generate_query( ) return None - def format_key_map_for_update_stmt(self, fields: List[str]) -> List[str]: + def format_key_map_for_update_stmt(self, param_map: Dict[str, Any]) -> List[str]: """Adds the appropriate formatting for update statements in this datastore.""" - fields.sort() - return [f"{k} = :{k}" for k in fields] + return [f"{k} = :{v}" for k, v in sorted(param_map.items())] def get_update_clauses( - self, update_value_map: Dict[str, Any], non_empty_primary_keys: Dict[str, Field] + self, + update_value_map: Dict[str, Any], + where_clause_fields: Dict[str, Field], ) -> List[str]: """Returns a list of update clauses for the update statement.""" - return self.format_key_map_for_update_stmt(list(update_value_map.keys())) + return self.format_key_map_for_update_stmt(update_value_map) def format_query_stmt( self, query_str: str, update_value_map: Dict[str, Any] diff --git a/src/fides/api/service/connectors/query_configs/snowflake_query_config.py b/src/fides/api/service/connectors/query_configs/snowflake_query_config.py index 574e1ea1b1..ec640191d8 100644 --- a/src/fides/api/service/connectors/query_configs/snowflake_query_config.py +++ b/src/fides/api/service/connectors/query_configs/snowflake_query_config.py @@ -59,15 +59,14 @@ def get_formatted_query_string( """Returns a query string with double quotation mark formatting as required by Snowflake syntax.""" return f'SELECT {field_list} FROM {self._generate_table_name()} WHERE ({" OR ".join(clauses)})' - def format_key_map_for_update_stmt(self, fields: List[str]) -> List[str]: + def format_key_map_for_update_stmt(self, param_map: Dict[str, Any]) -> List[str]: """Adds the appropriate formatting for update statements in this datastore.""" - fields.sort() - return [f'"{k}" = :{k}' for k in fields] + return [f'"{k}" = :{v}' for k, v in sorted(param_map.items())] def get_update_stmt( self, update_clauses: List[str], - pk_clauses: List[str], + where_clauses: List[str], ) -> str: """Returns a parameterized update statement in Snowflake dialect.""" - return f'UPDATE {self._generate_table_name()} SET {", ".join(update_clauses)} WHERE {" AND ".join(pk_clauses)}' + return f'UPDATE {self._generate_table_name()} SET {", ".join(update_clauses)} WHERE {" AND ".join(where_clauses)}' diff --git a/src/fides/api/service/connectors/scylla_connector.py b/src/fides/api/service/connectors/scylla_connector.py index 43a821930c..ff17674b88 100644 --- a/src/fides/api/service/connectors/scylla_connector.py +++ b/src/fides/api/service/connectors/scylla_connector.py @@ -28,6 +28,11 @@ class ScyllaConnectorMissingKeyspace(Exception): class ScyllaConnector(BaseConnector[Cluster]): """Scylla Connector""" + @property + def requires_primary_keys(self) -> bool: + """ScyllaDB requires primary keys for erasures.""" + return True + def build_uri(self) -> str: """ Builds URI - Not yet implemented diff --git a/src/fides/api/service/connectors/scylla_query_config.py b/src/fides/api/service/connectors/scylla_query_config.py index 2a72270a40..1fa52d573d 100644 --- a/src/fides/api/service/connectors/scylla_query_config.py +++ b/src/fides/api/service/connectors/scylla_query_config.py @@ -70,21 +70,27 @@ def generate_query( ) -> Optional[ScyllaDBStatement]: return self.generate_query_without_tuples(input_data, policy) - def format_key_map_for_update_stmt(self, fields: List[str]) -> List[str]: + def format_key_map_for_update_stmt(self, param_map: Dict[str, Any]) -> List[str]: """Adds the appropriate formatting for update statements in this datastore.""" - fields.sort() - return [f"{k} = %({k})s" for k in fields] + return [f"{k} = %({v})s" for k, v in sorted(param_map.items())] def get_update_clauses( - self, update_value_map: Dict[str, Any], non_empty_primary_keys: Dict[str, Field] + self, + update_value_map: Dict[str, Any], + where_clause_fields: Dict[str, Field], ) -> List[str]: - """Returns a list of update clauses for the update statement.""" + """Returns a list of update clauses for the update statement. + + Omits primary key fields from updates since ScyllaDB prohibits + updating primary key fields. + """ + return self.format_key_map_for_update_stmt( - [ - key - for key in update_value_map.keys() - if key not in non_empty_primary_keys - ] + { + key: value + for key, value in update_value_map.items() + if key not in where_clause_fields + } ) def format_query_data_name(self, query_data_name: str) -> str: diff --git a/src/fides/api/task/graph_task.py b/src/fides/api/task/graph_task.py index 6b78b57297..145094ea25 100644 --- a/src/fides/api/task/graph_task.py +++ b/src/fides/api/task/graph_task.py @@ -603,12 +603,19 @@ def erasure_request( *erasure_prereqs: int, # TODO Remove when we stop support for DSR 2.0. DSR 3.0 enforces with downstream_tasks. ) -> int: """Run erasure request""" + # if there is no primary key specified in the graph node configuration # note this in the execution log and perform no erasures on this node - if not self.execution_node.collection.contains_field(lambda f: f.primary_key): + if ( + self.connector.requires_primary_keys + and not self.execution_node.collection.contains_field( + lambda f: f.primary_key + ) + ): logger.warning( - "No erasures on {} as there is no primary_key defined.", + 'Skipping erasures on "{}" as the "{}" connector requires a primary key to be defined in one of the collection fields, but none was found.', self.execution_node.address, + self.connector.configuration.connection_type, ) if self.request_task.id: # For DSR 3.0, largely for testing. DSR 3.0 uses Request Task status @@ -617,7 +624,7 @@ def erasure_request( # TODO Remove when we stop support for DSR 2.0 self.resources.cache_erasure(self.key.value, 0) self.update_status( - "No values were erased since no primary key was defined for this collection", + "No values were erased since no primary key was defined in any of the fields for this collection", None, ActionType.erasure, ExecutionLogStatus.complete, diff --git a/src/fides/data/sample_project/sample_resources/mongo_example_test_dataset.yml b/src/fides/data/sample_project/sample_resources/mongo_example_test_dataset.yml index 3971b6481d..ccfbe853a9 100644 --- a/src/fides/data/sample_project/sample_resources/mongo_example_test_dataset.yml +++ b/src/fides/data/sample_project/sample_resources/mongo_example_test_dataset.yml @@ -78,7 +78,6 @@ dataset: - name: _id data_categories: [system.operations] fides_meta: - primary_key: True data_type: object_id - name: customer_identifiers fields: @@ -109,7 +108,6 @@ dataset: - name: _id data_categories: [system.operations] fides_meta: - primary_key: True data_type: object_id - name: customer_information fields: @@ -142,7 +140,6 @@ dataset: - name: _id data_categories: [system.operations] fides_meta: - primary_key: True data_type: object_id - name: passenger_information fields: @@ -172,7 +169,6 @@ dataset: - name: _id data_categories: [system.operations] fides_meta: - primary_key: True data_type: object_id - name: thread fides_meta: @@ -197,7 +193,6 @@ dataset: - name: _id data_categories: [system.operations] fides_meta: - primary_key: True data_type: object_id - name: email data_categories: [user.contact.email] @@ -207,7 +202,6 @@ dataset: - name: id data_categories: [user.unique_id] fides_meta: - primary_key: True references: - dataset: mongo_test field: flights.pilots @@ -221,7 +215,6 @@ dataset: - name: _id data_categories: [system.operations] fides_meta: - primary_key: True data_type: object_id - name: planes data_categories: [system.operations] @@ -240,7 +233,6 @@ dataset: - name: _id data_categories: [system.operations] fides_meta: - primary_key: True data_type: object_id - name: billing_address_id data_categories: [system.operations] @@ -257,8 +249,6 @@ dataset: data_categories: [user.unique_id] - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: name data_categories: [user.financial] - name: preferred @@ -267,7 +257,6 @@ dataset: fields: - name: _id fides_meta: - primary_key: True data_type: object_id - name: owner fides_meta: diff --git a/src/fides/data/sample_project/sample_resources/postgres_example_custom_request_field_dataset.yml b/src/fides/data/sample_project/sample_resources/postgres_example_custom_request_field_dataset.yml index 96b58645d4..0a878fad87 100644 --- a/src/fides/data/sample_project/sample_resources/postgres_example_custom_request_field_dataset.yml +++ b/src/fides/data/sample_project/sample_resources/postgres_example_custom_request_field_dataset.yml @@ -10,7 +10,6 @@ dataset: data_categories: [system.operations] fides_meta: data_type: string - primary_key: True - name: email_address data_categories: [system.operations] fides_meta: diff --git a/tests/fixtures/application_fixtures.py b/tests/fixtures/application_fixtures.py index 8356c42111..d030919aed 100644 --- a/tests/fixtures/application_fixtures.py +++ b/tests/fixtures/application_fixtures.py @@ -864,6 +864,59 @@ def erasure_policy( "rule_id": erasure_rule.id, }, ) + + yield erasure_policy + try: + rule_target.delete(db) + except ObjectDeletedError: + pass + try: + erasure_rule.delete(db) + except ObjectDeletedError: + pass + try: + erasure_policy.delete(db) + except ObjectDeletedError: + pass + + +@pytest.fixture(scope="function") +def erasure_policy_address_city( + db: Session, + oauth_client: ClientDetail, +) -> Generator: + erasure_policy = Policy.create( + db=db, + data={ + "name": "example erasure policy", + "key": "example_erasure_policy", + "client_id": oauth_client.id, + }, + ) + + erasure_rule = Rule.create( + db=db, + data={ + "action_type": ActionType.erasure.value, + "client_id": oauth_client.id, + "name": "Erasure Rule", + "policy_id": erasure_policy.id, + "masking_strategy": { + "strategy": "null_rewrite", + "configuration": {}, + }, + }, + ) + + rule_target = RuleTarget.create( + db=db, + data={ + "client_id": oauth_client.id, + "data_category": DataCategory("user.contact.address.city").value, + "rule_id": erasure_rule.id, + }, + ) + yield erasure_policy try: rule_target.delete(db) diff --git a/tests/fixtures/saas/test_data/mailchimp_override_dataset.yml b/tests/fixtures/saas/test_data/mailchimp_override_dataset.yml index b0d39058e3..d9f6e3214b 100644 --- a/tests/fixtures/saas/test_data/mailchimp_override_dataset.yml +++ b/tests/fixtures/saas/test_data/mailchimp_override_dataset.yml @@ -41,8 +41,6 @@ dataset: fields: - name: id data_categories: [user.unique_id] - fidesops_meta: - primary_key: True - name: list_id data_categories: [system.operations] - name: email_address diff --git a/tests/fixtures/saas/test_data/saas_async_dataset.yml b/tests/fixtures/saas/test_data/saas_async_dataset.yml index 42f6557ddb..2398a307a1 100644 --- a/tests/fixtures/saas/test_data/saas_async_dataset.yml +++ b/tests/fixtures/saas/test_data/saas_async_dataset.yml @@ -7,8 +7,6 @@ dataset: fields: - name: id data_categories: [user.unique_id] - fidesops_meta: - primary_key: True - name: system_id data_categories: [system] - name: state diff --git a/tests/fixtures/saas/test_data/saas_custom_privacy_request_fields_dataset.yml b/tests/fixtures/saas/test_data/saas_custom_privacy_request_fields_dataset.yml index 0009dce2e2..c0e9f1f094 100644 --- a/tests/fixtures/saas/test_data/saas_custom_privacy_request_fields_dataset.yml +++ b/tests/fixtures/saas/test_data/saas_custom_privacy_request_fields_dataset.yml @@ -9,4 +9,3 @@ dataset: data_categories: [system.operations] fidesops_meta: data_type: integer - primary_key: True diff --git a/tests/fixtures/saas/test_data/saas_erasure_order_dataset.yml b/tests/fixtures/saas/test_data/saas_erasure_order_dataset.yml index 513088b6e2..2dcd28806d 100644 --- a/tests/fixtures/saas/test_data/saas_erasure_order_dataset.yml +++ b/tests/fixtures/saas/test_data/saas_erasure_order_dataset.yml @@ -9,35 +9,30 @@ dataset: data_categories: [system.operations] fidesops_meta: data_type: integer - primary_key: True - name: refunds fields: - name: id data_categories: [system.operations] fidesops_meta: data_type: integer - primary_key: True - name: labels fields: - name: id data_categories: [system.operations] fidesops_meta: data_type: integer - primary_key: True - name: orders_to_refunds fields: - name: id data_categories: [system.operations] fidesops_meta: data_type: integer - primary_key: True - name: refunds_to_orders fields: - name: id data_categories: [system.operations] fidesops_meta: data_type: integer - primary_key: True - name: products fields: - name: id diff --git a/tests/fixtures/saas/test_data/saas_example_dataset.yml b/tests/fixtures/saas/test_data/saas_example_dataset.yml index c0c430eb80..500c82df50 100644 --- a/tests/fixtures/saas/test_data/saas_example_dataset.yml +++ b/tests/fixtures/saas/test_data/saas_example_dataset.yml @@ -41,8 +41,6 @@ dataset: fields: - name: id data_categories: [user.unique_id] - fidesops_meta: - primary_key: True - name: list_id data_categories: [system.operations] - name: email_address @@ -188,7 +186,6 @@ dataset: - name: id data_categories: [system.operations] fidesops_meta: - primary_key: True read_only: True - name: name fields: @@ -233,37 +230,31 @@ dataset: - name: id fidesops_meta: data_type: integer - primary_key: True - name: skipped_collection fields: - name: id fides_meta: data_type: integer - primary_key: True - name: request_with_output_template fields: - name: id fides_meta: data_type: integer - primary_key: True - name: request_with_invalid_output_template fields: - name: id fides_meta: data_type: integer - primary_key: True - name: standalone_output_template fields: - name: id fides_meta: data_type: integer - primary_key: True - name: complex_template_example fields: - name: id fides_meta: data_type: integer - primary_key: True - fides_key: saas_connector_external_example name: An Example External SaaS Dataset diff --git a/tests/ops/service/connectors/test_bigquery_connector.py b/tests/ops/service/connectors/test_bigquery_connector.py index a9524777fe..2e7bc3b075 100644 --- a/tests/ops/service/connectors/test_bigquery_connector.py +++ b/tests/ops/service/connectors/test_bigquery_connector.py @@ -129,7 +129,7 @@ def test_generate_update_partitioned_table( assert len(updates) == 2 assert ( str(updates[0]) - == "UPDATE `silken-precinct-284918.fidesopstest.customer` SET `name`=%(name:STRING)s WHERE `silken-precinct-284918.fidesopstest.customer`.`id` = %(id_1:INT64)s AND `created` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY) AND `created` <= CURRENT_TIMESTAMP()" + == "UPDATE `silken-precinct-284918.fidesopstest.customer` SET `name`=%(name:STRING)s WHERE `silken-precinct-284918.fidesopstest.customer`.`email` = %(email_1:STRING)s AND `created` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY) AND `created` <= CURRENT_TIMESTAMP()" ) def test_generate_delete_partitioned_table( @@ -158,7 +158,7 @@ def test_generate_delete_partitioned_table( assert len(deletes) == 2 assert ( str(deletes[0]) - == "DELETE FROM `silken-precinct-284918.fidesopstest.customer` WHERE `silken-precinct-284918.fidesopstest.customer`.`id` = %(id_1:INT64)s AND `created` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY) AND `created` <= CURRENT_TIMESTAMP()" + == "DELETE FROM `silken-precinct-284918.fidesopstest.customer` WHERE `silken-precinct-284918.fidesopstest.customer`.`email` = %(email_1:STRING)s AND `created` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY) AND `created` <= CURRENT_TIMESTAMP()" ) def test_retrieve_partitioned_data( diff --git a/tests/ops/service/connectors/test_bigquery_queryconfig.py b/tests/ops/service/connectors/test_bigquery_queryconfig.py index 06c51c5105..24a16517b6 100644 --- a/tests/ops/service/connectors/test_bigquery_queryconfig.py +++ b/tests/ops/service/connectors/test_bigquery_queryconfig.py @@ -196,7 +196,7 @@ def test_generate_delete_stmt( ) stmts = set(str(stmt) for stmt in delete_stmts) expected_stmts = { - "DELETE FROM `employee` WHERE `employee`.`id` = %(id_1:STRING)s" + "DELETE FROM `employee` WHERE `employee`.`address_id` = %(address_id_1:STRING)s AND `employee`.`email` = %(email_1:STRING)s" } assert stmts == expected_stmts @@ -289,6 +289,6 @@ def test_generate_namespaced_delete_stmt( ) stmts = set(str(stmt) for stmt in delete_stmts) expected_stmts = { - "DELETE FROM `silken-precinct-284918.fidesopstest.employee` WHERE `silken-precinct-284918.fidesopstest.employee`.`id` = %(id_1:STRING)s" + "DELETE FROM `silken-precinct-284918.fidesopstest.employee` WHERE `silken-precinct-284918.fidesopstest.employee`.`address_id` = %(address_id_1:STRING)s AND `silken-precinct-284918.fidesopstest.employee`.`email` = %(email_1:STRING)s" } assert stmts == expected_stmts diff --git a/tests/ops/service/connectors/test_query_config.py b/tests/ops/service/connectors/test_query_config.py index 2aa0871255..eac650d587 100644 --- a/tests/ops/service/connectors/test_query_config.py +++ b/tests/ops/service/connectors/test_query_config.py @@ -21,6 +21,7 @@ from fides.api.service.masking.strategy.masking_strategy_hash import HashMaskingStrategy from fides.api.util.data_category import DataCategory from tests.fixtures.application_fixtures import load_dataset +from tests.ops.test_helpers.dataset_utils import remove_primary_keys from ...task.traversal_data import integration_db_graph from ...test_helpers.cache_secrets_helper import cache_secret, clear_cache_secrets @@ -273,7 +274,7 @@ def test_generate_update_stmt_one_field( text_clause = config.generate_update_stmt(row, erasure_policy, privacy_request) assert ( text_clause.text - == """UPDATE customer SET name = :masked_name WHERE email = :email""" + == """UPDATE customer SET name = :masked_name WHERE id = :id""" ) assert text_clause._bindparams["masked_name"].key == "masked_name" assert ( @@ -341,7 +342,7 @@ def test_generate_update_stmt_length_truncation( ) assert ( text_clause.text - == """UPDATE customer SET name = :masked_name WHERE email = :email""" + == """UPDATE customer SET name = :masked_name WHERE id = :id""" ) assert text_clause._bindparams["masked_name"].key == "masked_name" # length truncation on name field @@ -391,7 +392,7 @@ def test_generate_update_stmt_multiple_fields_same_rule( text_clause = config.generate_update_stmt(row, erasure_policy, privacy_request) assert ( text_clause.text - == "UPDATE customer SET email = :masked_email, name = :masked_name WHERE email = :email" + == "UPDATE customer SET email = :masked_email, name = :masked_name WHERE id = :id" ) assert text_clause._bindparams["masked_name"].key == "masked_name" # since length is set to 40 in dataset.yml, we expect only first 40 chars of masked val @@ -407,7 +408,7 @@ def test_generate_update_stmt_multiple_fields_same_rule( ["customer-1@example.com"], request_id=privacy_request.id )[0] ) - assert text_clause._bindparams["email"].value == "customer-1@example.com" + assert text_clause._bindparams["id"].value == 1 clear_cache_secrets(privacy_request.id) def test_generate_update_stmts_from_multiple_rules( @@ -434,6 +435,201 @@ def test_generate_update_stmts_from_multiple_rules( row, erasure_policy_two_rules, privacy_request ) + assert ( + text_clause.text + == "UPDATE customer SET email = :masked_email, name = :masked_name WHERE id = :id" + ) + # Two different masking strategies used for name and email + assert ( + text_clause._bindparams["masked_name"].value is None + ) # Null masking strategy + assert ( + text_clause._bindparams["masked_email"].value == "*****" + ) # String rewrite masking strategy + + def test_generate_update_stmt_one_field_without_primary_keys( + self, erasure_policy, example_datasets, connection_config + ): + dataset = remove_primary_keys(Dataset(**example_datasets[0])) + graph = convert_dataset_to_graph(dataset, connection_config.key) + dataset_graph = DatasetGraph(*[graph]) + traversal = Traversal(dataset_graph, {"email": "customer-1@example.com"}) + + customer_node = traversal.traversal_node_dict[ + CollectionAddress("postgres_example_test_dataset", "customer") + ].to_mock_execution_node() + + config = SQLQueryConfig(customer_node) + row = { + "email": "customer-1@example.com", + "name": "John Customer", + "address_id": 1, + "id": 1, + } + text_clause = config.generate_update_stmt(row, erasure_policy, privacy_request) + assert ( + text_clause.text + == """UPDATE customer SET name = :masked_name WHERE email = :email""" + ) + assert text_clause._bindparams["masked_name"].key == "masked_name" + assert ( + text_clause._bindparams["masked_name"].value is None + ) # Null masking strategy + + def test_generate_update_stmt_one_field_inbound_reference_without_primary_keys( + self, erasure_policy_address_city, example_datasets, connection_config + ): + dataset = remove_primary_keys(Dataset(**example_datasets[0])) + graph = convert_dataset_to_graph(dataset, connection_config.key) + dataset_graph = DatasetGraph(*[graph]) + traversal = Traversal(dataset_graph, {"email": "customer-1@example.com"}) + + address_node = traversal.traversal_node_dict[ + CollectionAddress("postgres_example_test_dataset", "address") + ].to_mock_execution_node() + + config = SQLQueryConfig(address_node) + row = { + "id": 1, + "house": "123", + "street": "Main St", + "city": "San Francisco", + "state": "CA", + "zip": "94105", + } + text_clause = config.generate_update_stmt( + row, erasure_policy_address_city, privacy_request + ) + assert ( + text_clause.text + == """UPDATE address SET city = :masked_city WHERE id = :id""" + ) + assert text_clause._bindparams["masked_city"].key == "masked_city" + assert ( + text_clause._bindparams["masked_city"].value is None + ) # Null masking strategy + + def test_generate_update_stmt_length_truncation_without_primary_keys( + self, + erasure_policy_string_rewrite_long, + example_datasets, + connection_config, + ): + dataset = remove_primary_keys(Dataset(**example_datasets[0])) + graph = convert_dataset_to_graph(dataset, connection_config.key) + dataset_graph = DatasetGraph(*[graph]) + traversal = Traversal(dataset_graph, {"email": "customer-1@example.com"}) + + customer_node = traversal.traversal_node_dict[ + CollectionAddress("postgres_example_test_dataset", "customer") + ].to_mock_execution_node() + + config = SQLQueryConfig(customer_node) + row = { + "email": "customer-1@example.com", + "name": "John Customer", + "address_id": 1, + "id": 1, + } + + text_clause = config.generate_update_stmt( + row, erasure_policy_string_rewrite_long, privacy_request + ) + assert ( + text_clause.text + == """UPDATE customer SET name = :masked_name WHERE email = :email""" + ) + assert text_clause._bindparams["masked_name"].key == "masked_name" + # length truncation on name field + assert ( + text_clause._bindparams["masked_name"].value + == "some rewrite value that is very long and" + ) + + def test_generate_update_stmt_multiple_fields_same_rule_without_primary_keys( + self, erasure_policy, example_datasets, connection_config + ): + dataset = remove_primary_keys(Dataset(**example_datasets[0])) + graph = convert_dataset_to_graph(dataset, connection_config.key) + dataset_graph = DatasetGraph(*[graph]) + traversal = Traversal(dataset_graph, {"email": "customer-1@example.com"}) + + customer_node = traversal.traversal_node_dict[ + CollectionAddress("postgres_example_test_dataset", "customer") + ].to_mock_execution_node() + + config = SQLQueryConfig(customer_node) + row = { + "email": "customer-1@example.com", + "name": "John Customer", + "address_id": 1, + "id": 1, + } + + # Make target more broad + rule = erasure_policy.rules[0] + target = rule.targets[0] + target.data_category = DataCategory("user").value + + # Update rule masking strategy + rule.masking_strategy = { + "strategy": "hash", + "configuration": {"algorithm": "SHA-512"}, + } + # cache secrets for hash strategy + secret = MaskingSecretCache[str]( + secret="adobo", + masking_strategy=HashMaskingStrategy.name, + secret_type=SecretType.salt, + ) + cache_secret(secret, privacy_request.id) + + text_clause = config.generate_update_stmt(row, erasure_policy, privacy_request) + assert ( + text_clause.text + == "UPDATE customer SET email = :masked_email, name = :masked_name WHERE email = :email" + ) + assert text_clause._bindparams["masked_name"].key == "masked_name" + # since length is set to 40 in dataset.yml, we expect only first 40 chars of masked val + assert ( + text_clause._bindparams["masked_name"].value + == HashMaskingStrategy(HashMaskingConfiguration(algorithm="SHA-512")).mask( + ["John Customer"], request_id=privacy_request.id + )[0][0:40] + ) + assert ( + text_clause._bindparams["masked_email"].value + == HashMaskingStrategy(HashMaskingConfiguration(algorithm="SHA-512")).mask( + ["customer-1@example.com"], request_id=privacy_request.id + )[0] + ) + assert text_clause._bindparams["email"].value == "customer-1@example.com" + clear_cache_secrets(privacy_request.id) + + def test_generate_update_stmts_from_multiple_rules_without_primary_keys( + self, erasure_policy_two_rules, example_datasets, connection_config + ): + dataset = remove_primary_keys(Dataset(**example_datasets[0])) + graph = convert_dataset_to_graph(dataset, connection_config.key) + dataset_graph = DatasetGraph(*[graph]) + traversal = Traversal(dataset_graph, {"email": "customer-1@example.com"}) + row = { + "email": "customer-1@example.com", + "name": "John Customer", + "address_id": 1, + "id": 1, + } + + customer_node = traversal.traversal_node_dict[ + CollectionAddress("postgres_example_test_dataset", "customer") + ].to_mock_execution_node() + + config = SQLQueryConfig(customer_node) + + text_clause = config.generate_update_stmt( + row, erasure_policy_two_rules, privacy_request + ) + assert ( text_clause.text == "UPDATE customer SET email = :masked_email, name = :masked_name WHERE email = :email" @@ -446,6 +642,7 @@ def test_generate_update_stmts_from_multiple_rules( text_clause._bindparams["masked_email"].value == "*****" ) # String rewrite masking strategy + class TestSQLLikeQueryConfig: def test_missing_namespace_meta_schema(self): diff --git a/tests/ops/service/connectors/test_snowflake_query_config.py b/tests/ops/service/connectors/test_snowflake_query_config.py index 5521a1a88a..4f4b23b8c4 100644 --- a/tests/ops/service/connectors/test_snowflake_query_config.py +++ b/tests/ops/service/connectors/test_snowflake_query_config.py @@ -150,7 +150,7 @@ def test_generate_update_stmt( ) assert ( str(update_stmt) - == 'UPDATE "address" SET "city" = :city, "house" = :house, "state" = :state, "street" = :street, "zip" = :zip WHERE "id" = :id' + == 'UPDATE "address" SET "city" = :masked_city, "house" = :masked_house, "state" = :masked_state, "street" = :masked_street, "zip" = :masked_zip WHERE "id" = :id' ) def test_generate_namespaced_update_stmt( @@ -191,5 +191,5 @@ def test_generate_namespaced_update_stmt( ) assert ( str(update_stmt) - == 'UPDATE "FIDESOPS_TEST"."TEST"."address" SET "city" = :city, "house" = :house, "state" = :state, "street" = :street, "zip" = :zip WHERE "id" = :id' + == 'UPDATE "FIDESOPS_TEST"."TEST"."address" SET "city" = :masked_city, "house" = :masked_house, "state" = :masked_state, "street" = :masked_street, "zip" = :masked_zip WHERE "id" = :id' ) diff --git a/tests/ops/service/dataset/example_datasets/multiple_identities.yml b/tests/ops/service/dataset/example_datasets/multiple_identities.yml index 053afb3ced..dd76dbfa6d 100644 --- a/tests/ops/service/dataset/example_datasets/multiple_identities.yml +++ b/tests/ops/service/dataset/example_datasets/multiple_identities.yml @@ -16,8 +16,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: diff --git a/tests/ops/service/dataset/example_datasets/multiple_identities_with_external_dependency.yml b/tests/ops/service/dataset/example_datasets/multiple_identities_with_external_dependency.yml index fdfcd32bfc..db9e227a74 100644 --- a/tests/ops/service/dataset/example_datasets/multiple_identities_with_external_dependency.yml +++ b/tests/ops/service/dataset/example_datasets/multiple_identities_with_external_dependency.yml @@ -32,7 +32,5 @@ dataset: direction: from - name: id data_categories: [system.operations] - fides_meta: - primary_key: True - name: shipping_address_id data_categories: [system.operations] diff --git a/tests/ops/service/dataset/example_datasets/no_identities.yml b/tests/ops/service/dataset/example_datasets/no_identities.yml index fac879de99..82b56f9c65 100644 --- a/tests/ops/service/dataset/example_datasets/no_identities.yml +++ b/tests/ops/service/dataset/example_datasets/no_identities.yml @@ -13,8 +13,6 @@ dataset: data_categories: [user.contact.email] - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: diff --git a/tests/ops/service/dataset/example_datasets/single_identity.yml b/tests/ops/service/dataset/example_datasets/single_identity.yml index 19cdc7df3e..ce1506886d 100644 --- a/tests/ops/service/dataset/example_datasets/single_identity.yml +++ b/tests/ops/service/dataset/example_datasets/single_identity.yml @@ -16,8 +16,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: diff --git a/tests/ops/service/dataset/example_datasets/single_identity_with_internal_dependency.yml b/tests/ops/service/dataset/example_datasets/single_identity_with_internal_dependency.yml index 708aefbaf0..af73f8bcb8 100644 --- a/tests/ops/service/dataset/example_datasets/single_identity_with_internal_dependency.yml +++ b/tests/ops/service/dataset/example_datasets/single_identity_with_internal_dependency.yml @@ -16,8 +16,6 @@ dataset: data_type: string - name: id data_categories: [user.unique_id] - fides_meta: - primary_key: True - name: name data_categories: [user.name] fides_meta: diff --git a/tests/ops/service/privacy_request/test_bigquery_enterprise_privacy_request.py b/tests/ops/service/privacy_request/test_bigquery_enterprise_privacy_request.py index 8fb7e29729..5a133c031f 100644 --- a/tests/ops/service/privacy_request/test_bigquery_enterprise_privacy_request.py +++ b/tests/ops/service/privacy_request/test_bigquery_enterprise_privacy_request.py @@ -1,27 +1,9 @@ -import time -from datetime import datetime, timezone -from typing import Any, Dict, List, Set from unittest import mock -from unittest.mock import ANY, Mock, call -from uuid import uuid4 -import pydash import pytest from fides.api.models.audit_log import AuditLog, AuditLogAction -from fides.api.models.privacy_request import ( - ActionType, - CheckpointActionRequired, - ExecutionLog, - ExecutionLogStatus, - PolicyPreWebhook, - PrivacyRequest, - PrivacyRequestStatus, -) -from fides.api.schemas.masking.masking_configuration import MaskingConfiguration -from fides.api.schemas.masking.masking_secrets import MaskingSecretCache -from fides.api.schemas.policy import Rule -from fides.api.service.masking.strategy.masking_strategy import MaskingStrategy +from fides.api.models.privacy_request import ExecutionLog from tests.ops.service.privacy_request.test_request_runner_service import ( get_privacy_request_results, ) @@ -54,7 +36,7 @@ def test_create_and_process_access_request_bigquery_enterprise( customer_email = "customer-1@example.com" user_id = ( - 1754 # this is a real (not generated) user id in the Stackoverflow dataset + 1754 # this is a real (not generated) user id in the Stack Overflow dataset ) data = { "requested_at": "2024-08-30T16:09:37.359Z", diff --git a/tests/ops/task/test_create_request_tasks.py b/tests/ops/task/test_create_request_tasks.py index 290c2dc1be..ad118ee46c 100644 --- a/tests/ops/task/test_create_request_tasks.py +++ b/tests/ops/task/test_create_request_tasks.py @@ -927,7 +927,7 @@ def test_erase_after_saas_upstream_and_downstream_tasks( "is_array": False, "read_only": None, "references": [], - "primary_key": True, + "primary_key": False, "data_categories": ["system.operations"], "data_type_converter": "integer", "return_all_elements": None, diff --git a/tests/ops/test_helpers/dataset_utils.py b/tests/ops/test_helpers/dataset_utils.py index e60efb9892..d51e1f47ff 100644 --- a/tests/ops/test_helpers/dataset_utils.py +++ b/tests/ops/test_helpers/dataset_utils.py @@ -13,7 +13,11 @@ ) from fides.api.graph.data_type import DataType, get_data_type, to_data_type_string from fides.api.models.connectionconfig import ConnectionConfig -from fides.api.models.datasetconfig import DatasetConfig, convert_dataset_to_graph +from fides.api.models.datasetconfig import ( + DatasetConfig, + DatasetField, + convert_dataset_to_graph, +) from fides.api.util.collection_util import Row SAAS_DATASET_DIRECTORY = "data/saas/dataset/" @@ -231,3 +235,27 @@ def get_simple_fields(fields: Iterable[Field]) -> List[Dict[str, Any]]: object["fields"] = get_simple_fields(field.fields.values()) object_list.append(object) return object_list + + +def remove_primary_keys(dataset: Dataset) -> Dataset: + """Returns a copy of the dataset with primary key fields removed from fides_meta.""" + dataset_copy = dataset.model_copy(deep=True) + + for collection in dataset_copy.collections: + for field in collection.fields: + if field.fides_meta: + if field.fides_meta.primary_key: + field.fides_meta.primary_key = None + if field.fields: + _remove_nested_primary_keys(field.fields) + + return dataset_copy + + +def _remove_nested_primary_keys(fields: List[DatasetField]) -> None: + """Helper function to recursively remove primary keys from nested fields.""" + for field in fields: + if field.fides_meta and field.fides_meta.primary_key: + field.fides_meta.primary_key = None + if field.fields: + _remove_nested_primary_keys(field.fields)