Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add oauth flow for querybook github integration #1497

Merged
merged 9 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "querybook",
"version": "3.34.2",
"version": "3.35.0",
"description": "A Big Data Webapp",
"private": true,
"scripts": {
Expand Down
4 changes: 4 additions & 0 deletions querybook/config/querybook_default_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ OAUTH_AUTHORIZATION_URL: ~
OAUTH_TOKEN_URL: ~
OAUTH_USER_PROFILE: ~

# --------------- GitHub Integration ---------------
# Client ID and secret of the GitHub OAuth app used by the GitHub integration.
# Both default to unset (~).
GITHUB_CLIENT_ID: ~
GITHUB_CLIENT_SECRET: ~

# LDAP
LDAP_CONN: ~
LDAP_USER_DN: uid={},dc=example,dc=com
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""Add GitHub Datadoc Link

Revision ID: aa328ae9dced
Revises: f7b11b3e3a95
Create Date: 2024-10-23 21:04:55.052696

"""

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
# `revision` names this migration; `down_revision` names the migration it
# is applied on top of.
revision = "aa328ae9dced"
down_revision = "f7b11b3e3a95"
branch_labels = None
depends_on = None


def upgrade():
    """Create the ``github_link`` table.

    Each row ties a datadoc (``datadoc_id`` -> ``data_doc.id``) to a user
    (``user_id`` -> ``user.id``) and a ``directory`` that defaults to
    ``"datadocs"``. ``datadoc_id`` is unique, so a datadoc can have at most
    one link.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    columns = [
        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
        sa.Column("datadoc_id", sa.Integer(), nullable=False),
        sa.Column("user_id", sa.Integer(), nullable=False),
        sa.Column(
            "directory",
            sa.String(length=255),
            nullable=False,
            server_default="datadocs",
        ),
        sa.Column(
            "created_at",
            sa.DateTime(),
            server_default=sa.text("now()"),
            nullable=False,
        ),
        sa.Column(
            "updated_at",
            sa.DateTime(),
            server_default=sa.text("now()"),
            nullable=False,
        ),
    ]
    constraints = [
        sa.ForeignKeyConstraint(["datadoc_id"], ["data_doc.id"]),
        sa.ForeignKeyConstraint(["user_id"], ["user.id"]),
        sa.PrimaryKeyConstraint("id"),
        sa.UniqueConstraint("datadoc_id"),
    ]
    # Same positional order as before: table name, columns, then constraints.
    op.create_table("github_link", *columns, *constraints)
    # ### end Alembic commands ###


def downgrade():
    """Drop the ``github_link`` table created by this migration."""
    # ### commands auto generated by Alembic - please adjust! ###
    table_name = "github_link"
    op.drop_table(table_name)
    # ### end Alembic commands ###
3 changes: 3 additions & 0 deletions querybook/server/datasources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
from . import comment
from . import survey
from . import query_transform
from . import github


# Keep this at the end of imports to make sure the plugin APIs override the default ones
try:
Expand Down Expand Up @@ -47,3 +49,4 @@
survey
query_transform
api_plugin
github
26 changes: 26 additions & 0 deletions querybook/server/datasources/github.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from app.datasource import register
from lib.github.github import github_manager
from typing import Dict
from logic import github as logic
from flask_login import current_user


@register("/github/auth/", methods=["GET"])
def connect_github() -> Dict[str, str]:
    """Start the GitHub OAuth flow.

    Returns:
        Whatever ``github_manager.initiate_github_integration()`` produces:
        a dict whose ``"url"`` entry is the GitHub authorization URL.
    """
    auth_payload = github_manager.initiate_github_integration()
    return auth_payload


@register("/github/is_authenticated/", methods=["GET"])
def is_github_authenticated() -> Dict[str, bool]:
    """Report whether a GitHub access token is currently available.

    Returns:
        ``{"is_authenticated": True}`` when ``github_manager.get_github_token()``
        yields a token, ``{"is_authenticated": False}`` when it yields ``None``.
    """
    # The return annotation used to be ``str`` although a dict is returned.
    token = github_manager.get_github_token()
    return {"is_authenticated": token is not None}


@register("/github/datadocs/<int:datadoc_id>/link/", methods=["POST"])
def link_datadoc_to_github(
    datadoc_id: int,
    directory: str,
) -> Dict:
    """Link a datadoc to a directory on behalf of the current user.

    Args:
        datadoc_id: Id of the datadoc, taken from the URL.
        directory: Directory to associate with the datadoc.

    Returns:
        The result of ``logic.create_repo_link``.
    """
    # NOTE(review): no permission check on the datadoc is visible here;
    # confirm whether one is applied elsewhere (e.g. inside the logic layer).
    link_kwargs = {
        "datadoc_id": datadoc_id,
        "user_id": current_user.id,
        "directory": directory,
    }
    return logic.create_repo_link(**link_kwargs)
4 changes: 4 additions & 0 deletions querybook/server/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ class QuerybookSettings(object):
OAUTH_USER_PROFILE = get_env_config("OAUTH_USER_PROFILE")
AZURE_TENANT_ID = get_env_config("AZURE_TENANT_ID")

# GitHub App settings for feature integration.
# Resolved with get_env_config, like the surrounding settings, instead of a
# bare os.getenv, so the GITHUB_CLIENT_ID / GITHUB_CLIENT_SECRET entries in
# querybook_default_config.yaml are not ignored.
# NOTE(review): assumes get_env_config still gives environment variables
# precedence over the YAML config - confirm against its definition.
GITHUB_CLIENT_ID = get_env_config("GITHUB_CLIENT_ID")
GITHUB_CLIENT_SECRET = get_env_config("GITHUB_CLIENT_SECRET")

LDAP_CONN = get_env_config("LDAP_CONN")
LDAP_USE_TLS = str(get_env_config("LDAP_USE_TLS")).lower() == "true"
LDAP_USE_BIND_USER = str(get_env_config("LDAP_USE_BIND_USER")).lower() == "true"
Expand Down
Empty file.
93 changes: 93 additions & 0 deletions querybook/server/lib/github/github.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import certifi
from flask import session as flask_session, request
from app.auth.github_auth import GitHubLoginManager
from env import QuerybookSettings
from lib.logger import get_logger
from app.flask_app import flask_app
from typing import Optional, Dict, Any

# Module-level logger.
LOG = get_logger(__file__)

# URL path of the OAuth callback route; it is appended to
# QuerybookSettings.PUBLIC_URL to form the callback URL sent to GitHub.
GITHUB_OAUTH_CALLBACK = "/github/oauth2callback"
# Flask session key under which the GitHub access token is stored.
GITHUB_ACCESS_TOKEN = "github_access_token"


class GitHubManager(GitHubLoginManager):
    """OAuth helper for the Querybook GitHub integration.

    Extends ``GitHubLoginManager`` by overriding its OAuth configuration
    (scope, callback URL and, when provided, client credentials). The access
    token obtained from the flow is kept in the Flask session.
    """

    def __init__(
        self,
        additional_scopes: Optional[list] = None,
        client_id: Optional[str] = None,
        client_secret: Optional[str] = None,
    ):
        """Store the overrides, then run the parent initialiser.

        Args:
            additional_scopes: Extra OAuth scopes requested on top of
                ``user`` and ``email``.
            client_id: Overrides the parent's ``client_id`` when truthy.
            client_secret: Overrides the parent's ``client_secret`` when truthy.
        """
        # Attributes are set before super().__init__() as in the original
        # ordering, so they exist if the parent initialiser reads oauth_config.
        self.additional_scopes = additional_scopes or []
        self._client_id = client_id
        self._client_secret = client_secret
        super().__init__()

    @property
    def oauth_config(self) -> Dict[str, Any]:
        """Return the parent's OAuth config with integration-specific overrides."""
        config = super().oauth_config
        # Joining a single list avoids the trailing space the previous
        # string concatenation produced when there were no additional scopes.
        config["scope"] = " ".join(["user", "email", *self.additional_scopes])
        config["callback_url"] = (
            f"{QuerybookSettings.PUBLIC_URL}{GITHUB_OAUTH_CALLBACK}"
        )
        if self._client_id:
            config["client_id"] = self._client_id
        if self._client_secret:
            config["client_secret"] = self._client_secret
        return config

    def save_github_token(self, token: str) -> None:
        """Store the GitHub access token in the Flask session."""
        flask_session[GITHUB_ACCESS_TOKEN] = token
        LOG.debug("Saved GitHub token to session")

    def get_github_token(self) -> Optional[str]:
        """Return the token stored in the Flask session, or ``None`` if absent."""
        return flask_session.get(GITHUB_ACCESS_TOKEN)

    def initiate_github_integration(self) -> Dict[str, str]:
        """Build the GitHub authorization URL and remember the OAuth state.

        Returns:
            ``{"url": <authorization url>}``.
        """
        github = self.oauth_session
        authorization_url, state = github.authorization_url(
            self.oauth_config["authorization_url"]
        )
        # NOTE(review): the state is stored here, but nothing in this class
        # reads "oauth_state" back during the callback - confirm the parent's
        # oauth_session validates it, otherwise the flow lacks CSRF protection.
        flask_session["oauth_state"] = state
        return {"url": authorization_url}

    def github_integration_callback(self) -> str:
        """Exchange the callback request for a token and save it.

        Returns:
            The success page on success; the error page (with the escaped
            failure reason) if anything in the exchange raises.
        """
        try:
            github = self.oauth_session
            # NOTE(review): certifi.where() is a CA bundle path, which is
            # normally what `verify` expects rather than `cert` (client
            # certificate) - confirm this keyword is intended.
            access_token = github.fetch_token(
                self.oauth_config["token_url"],
                client_secret=self.oauth_config["client_secret"],
                authorization_response=request.url,
                cert=certifi.where(),
            )
            self.save_github_token(access_token["access_token"])
            return self.success_response()
        except Exception as e:
            # Boundary of the OAuth callback: log and show an error page
            # instead of propagating.
            LOG.error(f"Failed to obtain credentials: {e}")
            return self.error_response(str(e))

    def success_response(self) -> str:
        """Return the HTML page shown in the popup after a successful flow."""
        return """
            <p>Success! Please close the tab.</p>
            <script>
                window.opener.receiveChildMessage()
            </script>
        """

    def error_response(self, error_message: str) -> str:
        """Return the HTML error page for a failed flow.

        ``error_message`` is HTML-escaped: it comes from an exception raised
        while processing the callback request and so may carry
        request-derived text.
        """
        from html import escape

        return f"""
            <p>Failed to obtain credentials, reason: {escape(error_message)}</p>
        """


# Module-level manager used by the OAuth callback route in this module;
# it requests the extra "repo" scope and takes its client credentials from
# QuerybookSettings.
github_manager = GitHubManager(
    client_id=QuerybookSettings.GITHUB_CLIENT_ID,
    client_secret=QuerybookSettings.GITHUB_CLIENT_SECRET,
    additional_scopes=["repo"],
)


@flask_app.route(GITHUB_OAUTH_CALLBACK)
def github_callback() -> str:
    """Flask route for the OAuth callback; delegates to ``github_manager``."""
    response_html = github_manager.github_integration_callback()
    return response_html
145 changes: 145 additions & 0 deletions querybook/server/lib/github/serializers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import yaml
import re
from typing import List
from models.datadoc import DataDoc, DataCell
from const.data_doc import DataCellType
from datetime import datetime, timezone


def parse_datetime_as_utc(date_str: str) -> datetime:
    """Parse a timestamp into a timezone-aware ``datetime`` in UTC.

    Args:
        date_str: An ISO-8601 string, an existing ``datetime``, or a falsy
            value (``None`` / ``""``).

    Returns:
        * falsy input -> the current time in UTC;
        * aware value -> the same instant converted to UTC;
        * naive value -> the same wall-clock time labelled as UTC.

    Raises:
        ValueError: If a non-empty string is not valid ISO-8601.
    """
    if not date_str:
        # now(timezone.utc) is already aware; no extra replace() is needed.
        return datetime.now(timezone.utc)

    if isinstance(date_str, datetime):
        parsed = date_str
    else:
        parsed = datetime.fromisoformat(date_str)

    if parsed.utcoffset() is None:
        # astimezone() would read a naive value in the *local* timezone of
        # the server and shift it; treat offset-less timestamps as UTC so
        # the result does not depend on the host configuration.
        return parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)


def serialize_datadoc_to_markdown(datadoc: DataDoc) -> str:
    """Render a DataDoc as Markdown.

    The output is a YAML front matter block with the DataDoc metadata,
    a ``# <title>`` heading, and then the serialized cells.

    Raises:
        ValueError: If the metadata cannot be dumped to YAML.
    """
    created = datadoc.created_at
    updated = datadoc.updated_at
    # DataDoc metadata goes into YAML front matter for readability
    metadata = {
        "id": datadoc.id,
        "environment_id": datadoc.environment_id,
        "public": datadoc.public,
        "archived": datadoc.archived,
        "owner_uid": datadoc.owner_uid,
        "created_at": created.isoformat() if created else None,
        "updated_at": updated.isoformat() if updated else None,
        "meta": datadoc.meta,
        "title": datadoc.title,
    }
    try:
        metadata_yaml = yaml.dump(metadata, default_flow_style=False)
    except yaml.YAMLError as e:
        raise ValueError(f"Error serializing DataDoc metadata to YAML: {e}")

    sections = [
        f"---\n{metadata_yaml}---\n\n",
        f"# {datadoc.title}\n\n",
        serialize_datacells(datadoc.cells),
    ]
    return "".join(sections)


def serialize_datacells(cells: List[DataCell]) -> str:
    """Serialize cells to Markdown, separated by blank lines.

    Each cell becomes an HTML comment holding its YAML metadata, followed
    by the cell content from ``serialize_cell_content``.

    Raises:
        ValueError: If a cell's metadata cannot be dumped to YAML.
    """

    def _render(cell: DataCell) -> str:
        created = cell.created_at
        updated = cell.updated_at
        metadata = {
            "id": cell.id,
            "cell_type": cell.cell_type.name.lower(),
            "created_at": created.isoformat() if created else None,
            "updated_at": updated.isoformat() if updated else None,
            "meta": cell.meta,
        }
        try:
            metadata_yaml = yaml.dump(metadata, default_flow_style=False)
        except yaml.YAMLError as e:
            raise ValueError(f"Error serializing cell metadata to YAML: {e}")

        # GitHub's Markdown renderer does not treat repeated --- blocks as
        # separate YAML sections, so per-cell metadata is wrapped in an HTML
        # comment to keep it out of the rendered view.
        comment = f"<!--\n{metadata_yaml.strip()}\n-->\n"
        return comment + serialize_cell_content(cell)

    return "\n\n".join([_render(cell) for cell in cells])


def serialize_cell_content(cell: DataCell) -> str:
    """Render the body of one cell as a Markdown heading plus fenced block.

    * query cells -> ``## Query: <title>`` and a ``sql`` fence;
    * text cells  -> ``## Text`` and a ``text`` fence;
    * chart cells -> ``## Chart`` and a fixed placeholder, since charts are
      described by their metadata rather than their content.

    Raises:
        ValueError: If the cell type is none of the above. Previously the
            function fell through and returned ``None``, which surfaced
            later as a ``TypeError`` during string concatenation.
    """
    cell_meta = cell.meta or {}
    # A missing context is serialized as an empty body instead of raising
    # AttributeError on ``None.strip()``.
    context = (cell.context or "").strip()
    # NOTE(review): a context containing a line that starts with ``` ends
    # the fence early and will not round-trip through
    # deserialize_datadoc_content, whose pattern stops at the first "\n```".

    if cell.cell_type == DataCellType.query:
        query_title = cell_meta.get("title", "Query")
        return f"## Query: {query_title}\n\n```sql\n{context}\n```\n"
    if cell.cell_type == DataCellType.text:
        return f"## Text\n\n```text\n{context}\n```\n"
    if cell.cell_type == DataCellType.chart:
        return "## Chart\n\n```text\n*Chart generated from the metadata.*\n```\n"
    raise ValueError(f"Unsupported cell type: {cell.cell_type}")


def deserialize_datadoc_from_markdown(markdown_str: str) -> DataDoc:
    """Rebuild a DataDoc, including its cells, from serialized Markdown.

    Raises:
        ValueError: Propagated from the front matter / cell parsing helpers.
    """
    metadata, body = extract_front_matter(markdown_str)
    doc = create_datadoc_from_metadata(metadata)
    doc.cells = deserialize_datadoc_content(body)
    return doc


def extract_front_matter(markdown_str: str) -> tuple:
    """Split serialized Markdown into its YAML front matter and the rest.

    The document must start with ``---``, the YAML body, ``---`` and one
    blank line.

    Returns:
        ``(front_matter, content)``: the parsed front matter as a dict and
        the text following the front matter block.

    Raises:
        ValueError: If the front matter block is missing, is not valid
            YAML, or does not parse to a mapping.
    """
    match = re.match(r"^---\n(.*?)\n---\n\n", markdown_str, re.DOTALL)
    if match is None:
        raise ValueError("Invalid Markdown format: Missing front matter.")

    try:
        front_matter = yaml.safe_load(match.group(1))
    except yaml.YAMLError as e:
        # Chain the cause so the original YAML error stays in the traceback.
        raise ValueError(f"Error parsing front matter YAML: {e}") from e

    # Empty or scalar front matter parses to None / str / list; callers use
    # ``.get`` on the result, so reject it here with a clear error instead
    # of letting an AttributeError surface downstream.
    if not isinstance(front_matter, dict):
        raise ValueError(
            "Invalid Markdown format: front matter must be a YAML mapping."
        )

    content = markdown_str[match.end() :]
    return front_matter, content


def create_datadoc_from_metadata(metadata: dict) -> DataDoc:
    """Build a DataDoc from parsed front matter metadata.

    Missing keys fall back to: ``public=True``, ``archived=False``,
    ``title=""`` and ``meta={}``; timestamps go through
    ``parse_datetime_as_utc``.
    """
    fields = {
        "id": metadata.get("id"),
        "environment_id": metadata.get("environment_id"),
        "public": metadata.get("public", True),
        "archived": metadata.get("archived", False),
        "owner_uid": metadata.get("owner_uid"),
        "created_at": parse_datetime_as_utc(metadata.get("created_at")),
        "updated_at": parse_datetime_as_utc(metadata.get("updated_at")),
        "title": metadata.get("title", ""),
    }
    datadoc = DataDoc(**fields)
    # meta is assigned after construction rather than passed to the constructor.
    datadoc.meta = metadata.get("meta", {})
    return datadoc


def deserialize_datadoc_content(content_str: str) -> List[DataCell]:
    """Parse the cell section of serialized Markdown into DataCells.

    Every cell is expected as an HTML comment with YAML metadata, a ``## ``
    heading, a blank line and a fenced block holding the content.

    Raises:
        ValueError: If a cell's metadata is not valid YAML.
    """
    # Matches cell metadata in an HTML comment plus the fenced content after it
    cell_pattern = re.compile(
        r"<!--\n(.*?)\n-->\n## .*?\n\n```.*?\n(.*?)\n```", re.DOTALL
    )

    cells = []
    for found in cell_pattern.finditer(content_str):
        raw_metadata, raw_body = found.groups()
        try:
            metadata = yaml.safe_load(raw_metadata)
        except yaml.YAMLError as e:
            raise ValueError(f"Error parsing cell metadata YAML: {e}")

        type_name = metadata.get("cell_type", "query").lower()
        cell_type = DataCellType[type_name]

        # Charts are generated from the metadata, and not from content
        if cell_type != DataCellType.chart:
            context = raw_body.strip()
        else:
            context = None

        cells.append(
            DataCell(
                id=metadata.get("id"),
                cell_type=cell_type,
                context=context,
                created_at=parse_datetime_as_utc(metadata.get("created_at")),
                updated_at=parse_datetime_as_utc(metadata.get("updated_at")),
                meta=metadata.get("meta", {}),
            )
        )
    return cells
Loading
Loading