Basic integration with ClickHouse (#1844)
* db_engine_specs: added engine spec for clickhouse

* models: group by time without join on subquery

if this is enabled, the subquery is removed when grouping by time,
and the group by time is rewritten as a group by timeslot, ...

this in particular fixes the clickhouse backend

* docs: added clickhouse

* models: allow using secondary dttm columns as constraints

* superset/forms: configurable default slice since time

* superset/models: respect dialect quoting

* dttm handling, codeclimate tweaks
vavrusa authored and mistercrunch committed May 3, 2017
1 parent 59a6f44 commit 9ff3515
Showing 4 changed files with 62 additions and 7 deletions.

docs/installation.rst (3 additions, 0 deletions)

@@ -211,6 +211,9 @@ Here's a list of some of the recommended packages.
| Vertica | ``pip install | ``vertica+vertica_python://`` |
| | sqlalchemy-vertica-python`` | |
+---------------+-------------------------------------+-------------------------------------------------+
| ClickHouse | ``pip install | ``clickhouse://`` |
| | sqlalchemy-clickhouse`` | |
+---------------+-------------------------------------+-------------------------------------------------+

Note that many other databases are supported, the main criterion being the
existence of a functional SqlAlchemy dialect and Python driver. Googling
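
For context, a quick connectivity sketch using the new URI scheme. This is not part of the commit; it assumes pip install sqlalchemy-clickhouse and a reachable ClickHouse server, and the host, port, user and database below are placeholders to adjust.

# not part of this change -- a minimal SQLAlchemy connectivity check
from sqlalchemy import create_engine

engine = create_engine('clickhouse://default:@localhost:8123/default')
with engine.connect() as conn:
    print(conn.execute('SELECT 1').fetchone())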

superset/connectors/sqla/models.py (16 additions, 6 deletions)

@@ -370,6 +370,9 @@ def get_sqla_query( # sqla
if granularity not in self.dttm_cols:
granularity = self.main_dttm_col

# Database spec supports join-free timeslot grouping
time_groupby_inline = self.database.db_engine_spec.time_groupby_inline

cols = {col.column_name: col for col in self.columns}
metrics_dict = {m.metric_name: m for m in self.metrics}
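
To make the new time_groupby_inline flag concrete: when it is off, Superset may apply the timeseries limit by joining against a subquery; when it is on (as for ClickHouse), that join is skipped and the rewritten timeslot expression is grouped on directly. The query shapes below are illustrative only, with made-up table and column names, not actual Superset output.

# illustrative only -- rough shape of the generated SQL in the two modes
with_subquery_join = """
SELECT ds AS __timestamp, country, SUM(views)
FROM logs
JOIN (SELECT country FROM logs GROUP BY country
      ORDER BY SUM(views) DESC LIMIT 25) top USING (country)
GROUP BY __timestamp, country
"""

inline_timeslot = """
SELECT toStartOfDay(toDateTime(ds)) AS __timestamp, country, SUM(views)
FROM logs
GROUP BY __timestamp, country
"""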

@@ -386,7 +389,7 @@ def get_sqla_query( # sqla
if timeseries_limit_metric:
timeseries_limit_metric_expr = \
timeseries_limit_metric.sqla_col
if metrics:
if metrics_exprs:
main_metric_expr = metrics_exprs[0]
else:
main_metric_expr = literal_column("COUNT(*)").label("ccount")

@@ -436,13 +439,20 @@ def visit_column(element, compiler, **kw):

dttm_col = cols[granularity]
time_grain = extras.get('time_grain_sqla')
time_filters = []

if is_timeseries:
timestamp = dttm_col.get_timestamp_expression(time_grain)
select_exprs += [timestamp]
groupby_exprs += [timestamp]

time_filter = dttm_col.get_time_filter(from_dttm, to_dttm)
# Use main dttm column to support index with secondary dttm columns
if self.database.db_engine_spec.time_secondary_columns and \
self.main_dttm_col in self.dttm_cols and \
self.main_dttm_col != dttm_col.column_name:
time_filters.append(cols[self.main_dttm_col].
get_time_filter(from_dttm, to_dttm))
time_filters.append(dttm_col.get_time_filter(from_dttm, to_dttm))

select_exprs += metrics_exprs
qry = sa.select(select_exprs)
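
The time_secondary_columns branch above means that when the selected granularity column is not the table's main datetime column, a range filter on the main column is emitted in addition to the usual one, so engines that sort or partition on the main column (ClickHouse here) can still prune data. Illustratively, with main_dttm_col = 'ds' and a secondary column 'created_at' (both names made up):

# illustrative only -- the WHERE clause carries both time constraints
where_sql = """
WHERE ds >= toDate('2017-04-01') AND ds <= toDate('2017-05-03')
  AND created_at >= toDateTime('2017-04-01 00:00:00')
  AND created_at <= toDateTime('2017-05-03 00:00:00')
"""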

@@ -509,7 +519,7 @@ def visit_column(element, compiler, **kw):
having = template_processor.process_template(having)
having_clause_and += [sa.text('({})'.format(having))]
if granularity:
qry = qry.where(and_(*([time_filter] + where_clause_and)))
qry = qry.where(and_(*(time_filters + where_clause_and)))
else:
qry = qry.where(and_(*where_clause_and))
qry = qry.having(and_(*having_clause_and))
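
For readers less familiar with SQLAlchemy, a tiny standalone example of what and_(*(time_filters + where_clause_and)) produces from a list of filter expressions (column names and values are made up):

from sqlalchemy import and_, column

time_filters = [column('ds') >= '2017-01-01', column('ds') < '2017-02-01']
where_clause_and = [column('country') == 'US']

# all filters collapse into one conjunction
print(and_(*(time_filters + where_clause_and)))
# ds >= :ds_1 AND ds < :ds_2 AND country = :country_1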

@@ -522,7 +532,8 @@ def visit_column(element, compiler, **kw):

qry = qry.limit(row_limit)

if is_timeseries and timeseries_limit and groupby:
if is_timeseries and \
timeseries_limit and groupby and not time_groupby_inline:
# some sql dialects require for order by expressions
# to also be in the select clause -- others, e.g. vertica,
# require a unique inner alias

@@ -620,8 +631,7 @@ def fetch_metadata(self):
if not any_date_col and dbcol.is_time:
any_date_col = col.name

quoted = "{}".format(
column(dbcol.column_name).compile(dialect=db_dialect))
quoted = "{}".format(col.compile(dialect=db_dialect))
if dbcol.sum:
metrics.append(M(
metric_name='sum__' + dbcol.column_name,
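
The fetch_metadata change compiles the reflected column object against the database's own dialect, so identifier quoting follows that dialect's rules. A standalone illustration (dialects picked arbitrarily):

from sqlalchemy import column
from sqlalchemy.dialects import mysql, postgresql

c = column('select')  # a name that has to be quoted
print(c.compile(dialect=postgresql.dialect()))  # "select"
print(c.compile(dialect=mysql.dialect()))       # `select`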

superset/db_engine_specs.py (40 additions, 0 deletions)

@@ -50,7 +50,9 @@ class BaseEngineSpec(object):
engine = 'base' # str as defined in sqlalchemy.engine.engine
cursor_execute_kwargs = {}
time_grains = tuple()
time_groupby_inline = False
limit_method = LimitMethod.FETCH_MANY
time_secondary_columns = False

@classmethod
def fetch_data(cls, cursor, limit):

@@ -865,6 +867,44 @@ def convert_dttm(cls, target_type, dttm):
def epoch_to_dttm(cls):
return "from_unixtime({col})"


class ClickHouseEngineSpec(BaseEngineSpec):
"""Dialect for ClickHouse analytical DB."""

engine = 'clickhouse'

time_secondary_columns = True
time_groupby_inline = True
time_grains = (
Grain('Time Column', _('Time Column'), '{col}'),
Grain('minute', _('minute'),
"toStartOfMinute(toDateTime({col}))"),
Grain('5 minute', _('5 minute'),
"toDateTime(intDiv(toUInt32(toDateTime({col})), 300)*300)"),
Grain('10 minute', _('10 minute'),
"toDateTime(intDiv(toUInt32(toDateTime({col})), 600)*600)"),
Grain('hour', _('hour'),
"toStartOfHour(toDateTime({col}))"),
Grain('day', _('day'),
"toStartOfDay(toDateTime({col}))"),
Grain('month', _('month'),
"toStartOfMonth(toDateTime({col}))"),
Grain('quarter', _('quarter'),
"toStartOfQuarter(toDateTime({col}))"),
Grain('year', _('year'),
"toStartOfYear(toDateTime({col}))"),
)

@classmethod
def convert_dttm(cls, target_type, dttm):
tt = target_type.upper()
if tt == 'DATE':
return "toDate('{}')".format(dttm.strftime('%Y-%m-%d'))
if tt == 'DATETIME':
return "toDateTime('{}')".format(
dttm.strftime('%Y-%m-%d %H:%M:%S'))
return "'{}'".format(dttm.strftime('%Y-%m-%d %H:%M:%S'))

engines = {
o.engine: o for o in globals().values()
if inspect.isclass(o) and issubclass(o, BaseEngineSpec)}
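
A short sketch of exercising the new spec from a Python shell. It assumes a Superset checkout that includes this change and that the Grain namedtuple fields are (name, label, function); the expected values in the comments follow from the class definition above.

from datetime import datetime

from superset.db_engine_specs import ClickHouseEngineSpec

print(ClickHouseEngineSpec.time_groupby_inline)     # True
print(ClickHouseEngineSpec.time_secondary_columns)  # True

# grain expressions are plain templates with a {col} placeholder
day = next(g for g in ClickHouseEngineSpec.time_grains if g.name == 'day')
print(day.function.format(col='ds'))  # toStartOfDay(toDateTime(ds))

print(ClickHouseEngineSpec.convert_dttm('DATE', datetime(2017, 5, 3)))
# toDate('2017-05-03')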

superset/viz.py (3 additions, 1 deletion)

@@ -130,7 +130,9 @@ def query_obj(self):
# [column_name in list_of_values]. `__` prefix is there to avoid
# potential conflicts with column that would be named `from` or `to`
since = (
extra_filters.get('__from') or form_data.get("since", "1 year ago")
extra_filters.get('__from') or
form_data.get("since") or
config.get("SUPERSET_DEFAULT_SINCE", "1 year ago")
)

from_dttm = utils.parse_human_datetime(since)
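
The viz.py change reads an optional new config key before falling back to "1 year ago", so the default time range for new slices can be set from superset_config.py; for example (the value is arbitrary):

# superset_config.py -- optional; omit to keep the "1 year ago" default
SUPERSET_DEFAULT_SINCE = '7 days ago'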
