Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions docs/docs/configuration/databases.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,82 @@ For a connection to a SQL endpoint you need to use the HTTP path from the endpoi
{"connect_args": {"http_path": "/sql/1.0/endpoints/****", "driver_path": "/path/to/odbc/driver"}}
```

##### OAuth2 Authentication

Superset supports OAuth2 authentication for Databricks, allowing users to authenticate with their personal Databricks accounts instead of using shared access tokens. This provides better security and audit capabilities.

###### Prerequisites

1. Create an OAuth2 application in your Databricks account:
- Go to your Databricks account console
- Navigate to **Settings** → **Developer** → **OAuth apps**
- Create a new OAuth app with the redirect URI: `http://your-superset-host:port/api/v1/database/oauth2/`

2. Configure OAuth2 in your `superset_config.py`:

```python
from datetime import timedelta

# OAuth2 configuration for Databricks
# OAuth2 endpoints are automatically detected based on your Databricks cloud provider
DATABASE_OAUTH2_CLIENTS = {
"Databricks (legacy)": {
"id": "your-databricks-client-id",
"secret": "your-databricks-client-secret",
"scope": "sql",
# OAuth2 endpoints are auto-detected based on hostname, but can be overridden:
# AWS: "authorization_request_uri": "https://accounts.cloud.databricks.com/oidc/accounts/{account_id}/v1/authorize",
# Azure: "authorization_request_uri": "https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/authorize",
# GCP: "authorization_request_uri": "https://accounts.gcp.databricks.com/oidc/accounts/{account_id}/v1/authorize",
},
"Databricks": {
"id": "your-databricks-client-id",
"secret": "your-databricks-client-secret",
"scope": "sql",
# OAuth2 endpoints are auto-detected based on hostname
},
}

# OAuth2 redirect URI (adjust hostname/port for your setup)
DATABASE_OAUTH2_REDIRECT_URI = "http://your-superset-host:port/api/v1/database/oauth2/"

# Optional: OAuth2 timeout
DATABASE_OAUTH2_TIMEOUT = timedelta(seconds=30)
```

Replace the following placeholders:
- `your-databricks-client-id`: Your Databricks OAuth2 application client ID
- `your-databricks-client-secret`: Your Databricks OAuth2 application client secret
- `your-superset-host:port`: Your Superset instance hostname and port

**Multi-Cloud Provider Support**

Superset automatically detects your Databricks cloud provider and uses the appropriate OAuth2 endpoints:

- **AWS**: Detected from hostnames containing `cloud.databricks.com`
- **Azure**: Detected from hostnames containing `azure` or `azuredatabricks`
- **GCP**: Detected from hostnames containing `gcp` or `googleusercontent`

You can also explicitly specify the cloud provider in your database configuration under **Advanced** → **Other** → **ENGINE PARAMETERS**:

```json
{
"cloud_provider": "azure"
}
```

Valid cloud provider values are: `aws`, `azure`, `gcp`.

###### Usage

Once configured, users can:

1. Connect to Databricks databases normally using access tokens
2. When querying data, Superset will automatically redirect users to authenticate with Databricks if needed
3. User-specific OAuth2 tokens will be used for database connections, providing better security and audit trails

This feature works with both "Databricks (legacy)" and "Databricks" engine types and automatically supports all major cloud providers (AWS, Azure, GCP).

#### Denodo

The recommended connector library for Denodo is
Expand Down
197 changes: 197 additions & 0 deletions superset/db_engine_specs/databricks.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,18 @@
from superset.db_engine_specs.base import BaseEngineSpec, BasicParametersMixin
from superset.db_engine_specs.hive import HiveEngineSpec
from superset.errors import ErrorLevel, SupersetError, SupersetErrorType
from superset.exceptions import OAuth2RedirectError
from superset.utils import json
from superset.utils.core import get_user_agent, QuerySource
from superset.utils.network import is_hostname_valid, is_port_open

if TYPE_CHECKING:
from superset.models.core import Database
from superset.superset_typing import (
OAuth2ClientConfig,
OAuth2State,
OAuth2TokenResponse,
)


try:
Expand Down Expand Up @@ -244,6 +250,71 @@ class DatabricksDynamicBaseEngineSpec(BasicParametersMixin, DatabricksBaseEngine
"port": "port",
}

# OAuth2 endpoints for different cloud providers
_oauth2_endpoints = {
"aws": {
"authorization_request_uri": "https://accounts.cloud.databricks.com/oidc/accounts/{}/v1/authorize",
"token_request_uri": "https://accounts.cloud.databricks.com/oidc/accounts/{}/v1/token",
},
"azure": {
"authorization_request_uri": "https://login.microsoftonline.com/{}/oauth2/v2.0/authorize",
"token_request_uri": "https://login.microsoftonline.com/{}/oauth2/v2.0/token",
},
"gcp": {
"authorization_request_uri": "https://accounts.gcp.databricks.com/oidc/accounts/{}/v1/authorize",
"token_request_uri": "https://accounts.gcp.databricks.com/oidc/accounts/{}/v1/token",
},
}

@classmethod
def _detect_cloud_provider(cls, database: Database) -> str:
"""
Detect the cloud provider based on the database configuration.

Returns:
str: The cloud provider ('aws', 'azure', or 'gcp')
"""
# Check if cloud provider is explicitly configured in extra
if "cloud_provider" in (extra := cls.get_extra_params(database)):
provider = extra["cloud_provider"].lower()
if provider in cls._oauth2_endpoints:
return provider

# Try to detect from hostname
hostname = database.url_object.host or ""
hostname = hostname.lower()

if "azure" in hostname or "azuredatabricks" in hostname:
return "azure"
elif "gcp" in hostname or "googleusercontent" in hostname:
return "gcp"
else:
# Default to AWS for compatibility
return "aws"

@classmethod
def impersonate_user(
cls,
database: Database,
username: str | None,
user_token: str | None,
url: URL,
engine_kwargs: dict[str, Any],
) -> tuple[URL, dict[str, Any]]:
"""
Update connection with OAuth2 access token for user impersonation.
"""
if user_token:
# Replace the access token in the URL with the user's OAuth2 token
url = url.set(password=user_token)

# Also update connect_args if they contain access token
connect_args = engine_kwargs.setdefault("connect_args", {})
if "access_token" in connect_args:
connect_args["access_token"] = user_token

return url, engine_kwargs

@staticmethod
def get_extra_params(
database: Database, source: QuerySource | None = None
Expand Down Expand Up @@ -424,6 +495,69 @@ class DatabricksNativeEngineSpec(DatabricksDynamicBaseEngineSpec):
supports_dynamic_catalog = True
supports_cross_catalog_queries = True

# OAuth 2.0 support
supports_oauth2 = True
oauth2_exception = OAuth2RedirectError
oauth2_scope = "sql"

# OAuth2 endpoints are determined dynamically based on cloud provider
oauth2_authorization_request_uri = "" # Set dynamically
oauth2_token_request_uri = "" # Set dynamically

@classmethod
def get_oauth2_authorization_uri(
cls,
config: "OAuth2ClientConfig",
state: "OAuth2State",
) -> str:
"""
Return URI for initial OAuth2 request with dynamic endpoint detection.
"""
from superset.models.core import Database

# Get the database to detect cloud provider
database_id = state["database_id"]
if database := Database.query.get(database_id):
provider = cls._detect_cloud_provider(database)
# Update config with the correct authorization URI for the cloud provider
from typing import cast

config = cast(
"OAuth2ClientConfig",
dict(config)
| {
"authorization_request_uri": cls._oauth2_endpoints[provider][
"authorization_request_uri"
]
},
)

return super().get_oauth2_authorization_uri(config, state)

@classmethod
def get_oauth2_token(
cls,
config: "OAuth2ClientConfig",
code: str,
) -> "OAuth2TokenResponse":
"""
Exchange authorization code for refresh/access tokens with dynamic endpoint.

Note: For token exchange, we need the database context from the state.
This is a limitation of the current OAuth2 flow design.
"""
# For now, fall back to AWS endpoints for token exchange
# TODO: Improve OAuth2 flow to pass database context to token exchange
from typing import cast

config = cast(
"OAuth2ClientConfig",
dict(config)
| {"token_request_uri": cls._oauth2_endpoints["aws"]["token_request_uri"]},
)

return super().get_oauth2_token(config, code)

@classmethod
def build_sqlalchemy_uri( # type: ignore
cls, parameters: DatabricksNativeParametersType, *_
Expand Down Expand Up @@ -563,6 +697,69 @@ class DatabricksPythonConnectorEngineSpec(DatabricksDynamicBaseEngineSpec):

supports_dynamic_schema = supports_catalog = supports_dynamic_catalog = True

# OAuth 2.0 support
supports_oauth2 = True
oauth2_exception = OAuth2RedirectError
oauth2_scope = "sql"

# OAuth2 endpoints are determined dynamically based on cloud provider
oauth2_authorization_request_uri = "" # Set dynamically
oauth2_token_request_uri = "" # Set dynamically

@classmethod
def get_oauth2_authorization_uri(
cls,
config: "OAuth2ClientConfig",
state: "OAuth2State",
) -> str:
"""
Return URI for initial OAuth2 request with dynamic endpoint detection.
"""
from superset.models.core import Database

# Get the database to detect cloud provider
database_id = state["database_id"]
if database := Database.query.get(database_id):
provider = cls._detect_cloud_provider(database)
# Update config with the correct authorization URI for the cloud provider
from typing import cast

config = cast(
"OAuth2ClientConfig",
dict(config)
| {
"authorization_request_uri": cls._oauth2_endpoints[provider][
"authorization_request_uri"
]
},
)

return super().get_oauth2_authorization_uri(config, state)

@classmethod
def get_oauth2_token(
cls,
config: "OAuth2ClientConfig",
code: str,
) -> "OAuth2TokenResponse":
"""
Exchange authorization code for refresh/access tokens with dynamic endpoint.

Note: For token exchange, we need the database context from the state.
This is a limitation of the current OAuth2 flow design.
"""
# For now, fall back to AWS endpoints for token exchange
# TODO: Improve OAuth2 flow to pass database context to token exchange
from typing import cast

config = cast(
"OAuth2ClientConfig",
dict(config)
| {"token_request_uri": cls._oauth2_endpoints["aws"]["token_request_uri"]},
)

return super().get_oauth2_token(config, code)

@classmethod
def build_sqlalchemy_uri( # type: ignore
cls, parameters: DatabricksPythonConnectorParametersType, *_
Expand Down
Loading
Loading