diff --git a/dbt/adapters/bigquery/column.py b/dbt/adapters/bigquery/column.py index cfa298b82..02a3e8182 100644 --- a/dbt/adapters/bigquery/column.py +++ b/dbt/adapters/bigquery/column.py @@ -5,6 +5,8 @@ from google.cloud.bigquery import SchemaField +_PARENT_DATA_TYPE_KEY = "__parent_data_type" + Self = TypeVar("Self", bound="BigQueryColumn") @@ -215,15 +217,29 @@ def _update_nested_column_data_types( if len(column_name_parts) == 1: # Base case: column is not nested - store its data_type concatenated with constraint if provided. - nested_column_data_types[root_column_name] = ( + column_data_type_and_constraints = ( column_data_type if column_rendered_constraint is None else f"{column_data_type} {column_rendered_constraint}" ) + if root_column_name not in nested_column_data_types: + nested_column_data_types[root_column_name] = column_data_type_and_constraints + else: + # entry could already exist if this is a parent column -- preserve the parent data type under "_PARENT_DATA_TYPE_KEY" + existing_nested_column_data_type = nested_column_data_types[root_column_name] + assert isinstance(existing_nested_column_data_type, dict) # keeping mypy happy + existing_nested_column_data_type[ + _PARENT_DATA_TYPE_KEY + ] = column_data_type_and_constraints else: # Initialize nested dictionary if root_column_name not in nested_column_data_types: nested_column_data_types[root_column_name] = {} + elif not isinstance(nested_column_data_types[root_column_name], dict): + # a parent specified its base type -- preserve its data_type and potential rendered constraints + # this is used to specify a top-level 'struct' or 'array' field with its own description, constraints, etc + parent_data_type = nested_column_data_types[root_column_name] + nested_column_data_types[root_column_name] = {_PARENT_DATA_TYPE_KEY: parent_data_type} # Recursively process rest of remaining column name remaining_column_name = ".".join(column_name_parts[1:]) @@ -252,8 +268,23 @@ def _format_nested_data_type(unformatted_nested_data_type: Union[str, Dict[str, if isinstance(unformatted_nested_data_type, str): return unformatted_nested_data_type else: + parent_data_type = unformatted_nested_data_type.pop(_PARENT_DATA_TYPE_KEY, None) + parent_constraints = None + if parent_data_type: + parent_data_type_flat = parent_data_type.split() + if len(parent_data_type_flat) > 1: + parent_data_type = parent_data_type_flat[0] + parent_constraints = " ".join(parent_data_type_flat[1:]) + formatted_nested_types = [ f"{column_name} {_format_nested_data_type(column_type)}" for column_name, column_type in unformatted_nested_data_type.items() ] - return f"""struct<{", ".join(formatted_nested_types)}>""" + + formatted_nested_type = f"""struct<{", ".join(formatted_nested_types)}>""" + if parent_data_type and parent_data_type.lower() == "array": + formatted_nested_type = f"""array<{formatted_nested_type}>""" + if parent_constraints: + formatted_nested_type = f"""{formatted_nested_type} {parent_constraints}""" + + return formatted_nested_type