From 26dcf894bb040c9133fbab2c4f3df86092a3a997 Mon Sep 17 00:00:00 2001 From: Maxime Beauchemin Date: Mon, 12 Sep 2016 22:40:23 -0700 Subject: [PATCH 01/16] Handling timeouts --- .../javascripts/SqlLab/components/Timer.jsx | 4 +-- caravel/config.py | 3 ++ caravel/sql_lab.py | 1 + caravel/utils.py | 35 ++++++++++++++++++- caravel/views.py | 9 ++++- docs/installation.rst | 19 ++++++++++ 6 files changed, 67 insertions(+), 4 deletions(-) diff --git a/caravel/assets/javascripts/SqlLab/components/Timer.jsx b/caravel/assets/javascripts/SqlLab/components/Timer.jsx index 0773e7696dcc..6339348fc9ed 100644 --- a/caravel/assets/javascripts/SqlLab/components/Timer.jsx +++ b/caravel/assets/javascripts/SqlLab/components/Timer.jsx @@ -27,8 +27,8 @@ class Timer extends React.Component { } stopwatch() { if (this.props && this.props.query) { - const since = this.props.query.endDttm || now(); - const clockStr = fDuration(this.props.query.startDttm, since); + let endDttm = this.props.query.endDttm || now(); + const clockStr = fDuration(this.props.query.startDttm, endDttm); this.setState({ clockStr }); if (this.props.query.state !== 'running') { this.stopTimer(); diff --git a/caravel/config.py b/caravel/config.py index 233f99a6c782..73bd8e5bcb00 100644 --- a/caravel/config.py +++ b/caravel/config.py @@ -206,6 +206,9 @@ class CeleryConfig(object): # The db id here results in selecting this one as a default in SQL Lab DEFAULT_DB_ID = None +# Timeout duration for SQL Lab synchronous queries +SQLLAB_TIMEOUT = 30 + try: from caravel_config import * # noqa except ImportError: diff --git a/caravel/sql_lab.py b/caravel/sql_lab.py index b1d64a9c0907..4b61c8e72a58 100644 --- a/caravel/sql_lab.py +++ b/caravel/sql_lab.py @@ -49,6 +49,7 @@ def get_sql_results(query_id, return_results=True): database = query.database executed_sql = query.sql.strip().strip(';') + time.sleep(5) # Limit enforced only for retrieving the data, not for the CTA queries. if is_query_select(executed_sql): if query.select_as_cta: diff --git a/caravel/utils.py b/caravel/utils.py index 2fb43ada9b25..678aafba32bd 100644 --- a/caravel/utils.py +++ b/caravel/utils.py @@ -4,13 +4,14 @@ from __future__ import print_function from __future__ import unicode_literals +from builtins import object from datetime import date, datetime import decimal import functools import json import logging import numpy -import time +import signal import uuid import parsedatetime @@ -30,6 +31,10 @@ class CaravelException(Exception): pass +class CaravelTimeoutException(Exception): + pass + + class CaravelSecurityException(CaravelException): pass @@ -414,3 +419,31 @@ def generic_find_constraint_name(table, columns, referenced, db): fk.referred_table.name == referenced and set(fk.column_keys) == columns): return fk.name + + +class timeout(object): + """ + To be used in a ``with`` block and timeout its content. + """ + def __init__(self, seconds=1, error_message='Timeout'): + self.seconds = seconds + self.error_message = error_message + + def handle_timeout(self, signum, frame): + logging.error("Process timed out") + raise CaravelTimeoutException(self.error_message) + + def __enter__(self): + try: + signal.signal(signal.SIGALRM, self.handle_timeout) + signal.alarm(self.seconds) + except ValueError as e: + logging.warning("timeout can't be used in the current context") + logging.exception(e) + + def __exit__(self, type, value, traceback): + try: + signal.alarm(0) + except ValueError as e: + logging.warning("timeout can't be used in the current context") + logging.exception(e) diff --git a/caravel/views.py b/caravel/views.py index cdcec757fc78..513564fcdaaf 100755 --- a/caravel/views.py +++ b/caravel/views.py @@ -1587,7 +1587,14 @@ def sql_json(self): # Sync request. try: - data = sql_lab.get_sql_results(query_id) + SQLLAB_TIMEOUT = config.get("SQLLAB_TIMEOUT") + with utils.timeout( + seconds=SQLLAB_TIMEOUT, + error_message=( + "The query exceeded the {SQLLAB_TIMEOUT} seconds " + "timeout." + ).format(**locals())): + data = sql_lab.get_sql_results(query_id) except Exception as e: logging.exception(e) return Response( diff --git a/docs/installation.rst b/docs/installation.rst index c70cc0082ead..0d94baa7f5e3 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -294,6 +294,25 @@ Upgrading should be as straightforward as running:: pip install caravel --upgrade caravel db upgrade +SQL Lab +------- +SQL Lab is a powerful SQL IDE that works with all SQLAlchemy compatible +databases out there. By default, queries are run in a web request, and +may eventually timeout as queries exceed the maximum duration of a web +request in your environment, whether it'd be a reverse proxy or the Caravel +server itself. + +In the modern analytics world, it's not uncommon to run large queries that +run for minutes or hours. +To enable support for long running queries that +execute beyond the typical web request's timeout (30-60 seconds), it is +necessary to deploy an asynchronous backend, which consist of one or many +Caravel worker, which is implemented as a Celery worker, and a Celery +broker for which we recommend using Redis or RabbitMQ. + +It's also preferable to setup an async result backend as a key value store +that can hold the long-running query results for a period of time. More +details to come as to how to set this up here soon. Making your own build --------------------- From 5b88c1db607330ee087c99e6cfe4aec0f71b85f1 Mon Sep 17 00:00:00 2001 From: Maxime Beauchemin Date: Mon, 12 Sep 2016 23:16:02 -0700 Subject: [PATCH 02/16] Fixing timer on non-utc server --- caravel/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/caravel/utils.py b/caravel/utils.py index 678aafba32bd..336bd18373e8 100644 --- a/caravel/utils.py +++ b/caravel/utils.py @@ -350,7 +350,7 @@ def datetime_to_epoch(dttm): def now_as_float(): - return datetime_to_epoch(datetime.now()) + return datetime_to_epoch(datetime.utcnow()) def json_int_dttm_ser(obj): From 733a63d6e921828e97fbac6a8bcdd3a177fcfa9f Mon Sep 17 00:00:00 2001 From: Maxime Beauchemin Date: Tue, 13 Sep 2016 08:55:23 -0700 Subject: [PATCH 03/16] Allowing async with results --- .../SqlLab/components/SqlEditor.jsx | 32 +++++++++++++--- caravel/assets/javascripts/SqlLab/main.css | 3 -- .../4500485bde7d_allow_run_sync_async.py | 28 ++++++++++++++ caravel/models.py | 2 + caravel/sql_lab.py | 23 ++++++++---- caravel/views.py | 37 +++++++++++++++---- 6 files changed, 100 insertions(+), 25 deletions(-) create mode 100644 caravel/migrations/versions/4500485bde7d_allow_run_sync_async.py diff --git a/caravel/assets/javascripts/SqlLab/components/SqlEditor.jsx b/caravel/assets/javascripts/SqlLab/components/SqlEditor.jsx index e2c094666d09..1d50e1c30f6e 100644 --- a/caravel/assets/javascripts/SqlLab/components/SqlEditor.jsx +++ b/caravel/assets/javascripts/SqlLab/components/SqlEditor.jsx @@ -52,10 +52,11 @@ class SqlEditor extends React.Component { this.startQuery(); } } - runQuery() { - this.startQuery(); + runQuery(runAsync = false) { + this.startQuery(runAsync); } startQuery(runAsync = false, ctas = false) { + console.log(runAsync); const that = this; const query = { dbId: this.props.queryEditor.dbId, @@ -76,7 +77,7 @@ class SqlEditor extends React.Component { const sqlJsonUrl = '/caravel/sql_json/'; const sqlJsonRequest = { - async: runAsync, + runAsync: runAsync, client_id: query.id, database_id: this.props.queryEditor.dbId, json: true, @@ -149,17 +150,36 @@ class SqlEditor extends React.Component { } render() { - let runButtons = ( - + let runButtons = []; + if (this.props.database.allow_run_sync) { + runButtons.push( + ); + } + if (this.props.database.allow_run_async) { + runButtons.push( + + ); + } + runButtons = ( + + {runButtons} ); if (this.props.latestQuery && this.props.latestQuery.state === 'running') { diff --git a/caravel/assets/javascripts/SqlLab/main.css b/caravel/assets/javascripts/SqlLab/main.css index 7c46f18811d1..f130a162b43b 100644 --- a/caravel/assets/javascripts/SqlLab/main.css +++ b/caravel/assets/javascripts/SqlLab/main.css @@ -239,6 +239,3 @@ div.tablePopover:hover { .SouthPane .tab-content { padding-top: 10px; } -.SqlEditor textarea { - display: none; -} diff --git a/caravel/migrations/versions/4500485bde7d_allow_run_sync_async.py b/caravel/migrations/versions/4500485bde7d_allow_run_sync_async.py new file mode 100644 index 000000000000..0695e2cda59b --- /dev/null +++ b/caravel/migrations/versions/4500485bde7d_allow_run_sync_async.py @@ -0,0 +1,28 @@ +"""allow_run_sync_async + +Revision ID: 4500485bde7d +Revises: 41f6a59a61f2 +Create Date: 2016-09-12 23:33:14.789632 + +""" + +# revision identifiers, used by Alembic. +revision = '4500485bde7d' +down_revision = '41f6a59a61f2' + +from alembic import op +import sqlalchemy as sa + + +def upgrade(): + op.add_column('dbs', sa.Column('allow_run_async', sa.Boolean(), nullable=True)) + op.add_column('dbs', sa.Column('allow_run_sync', sa.Boolean(), nullable=True)) + + +def downgrade(): + try: + op.drop_column('dbs', 'allow_run_sync') + op.drop_column('dbs', 'allow_run_async') + except Exception: + pass + diff --git a/caravel/models.py b/caravel/models.py index 105de89495b6..c112b38eda86 100644 --- a/caravel/models.py +++ b/caravel/models.py @@ -412,6 +412,8 @@ class Database(Model, AuditMixinNullable): cache_timeout = Column(Integer) select_as_create_table_as = Column(Boolean, default=False) expose_in_sqllab = Column(Boolean, default=False) + allow_run_sync = Column(Boolean, default=True) + allow_run_async = Column(Boolean, default=False) allow_ctas = Column(Boolean, default=False) force_ctas_schema = Column(String(250)) extra = Column(Text, default=textwrap.dedent("""\ diff --git a/caravel/sql_lab.py b/caravel/sql_lab.py index 4b61c8e72a58..36f12f99b682 100644 --- a/caravel/sql_lab.py +++ b/caravel/sql_lab.py @@ -2,8 +2,9 @@ from datetime import datetime import pandas as pd import logging -from caravel import app, db, models, utils +from caravel import app, cache, db, models, utils import time +import json QueryStatus = models.QueryStatus @@ -44,12 +45,12 @@ def create_table_as(sql, table_name, schema=None, override=False): @celery_app.task def get_sql_results(query_id, return_results=True): """Executes the sql query returns the results.""" - db.session.commit() # HACK - query = db.session.query(models.Query).filter_by(id=query_id).one() + session = db.session() + session.commit() # HACK + query = session.query(models.Query).filter_by(id=query_id).one() database = query.database executed_sql = query.sql.strip().strip(';') - time.sleep(5) # Limit enforced only for retrieving the data, not for the CTA queries. if is_query_select(executed_sql): if query.select_as_cta: @@ -74,12 +75,12 @@ def get_sql_results(query_id, return_results=True): query.error_message = utils.error_msg_from_exception(e) query.status = QueryStatus.FAILED query.tmp_table_name = None - db.session.commit() + session.commit() raise Exception(query.error_message) cursor = result_proxy.cursor query.status = QueryStatus.RUNNING - db.session.flush() + session.flush() if database.backend == 'presto': polled = cursor.poll() # poll returns dict -- JSON status information or ``None`` @@ -96,7 +97,7 @@ def get_sql_results(query_id, return_results=True): progress = 100 * (completed_splits / total_splits) if progress > query.progress: query.progress = progress - db.session.commit() + session.commit() time.sleep(1) polled = cursor.poll() @@ -126,7 +127,7 @@ def get_sql_results(query_id, return_results=True): query.tmp_table_name, limit=query.limit)) query.end_time = utils.now_as_float() - db.session.commit() + session.commit() payload = { 'query_id': query.id, @@ -139,3 +140,9 @@ def get_sql_results(query_id, return_results=True): payload['error'] = query.error_message if return_results: return payload + ''' + # Hack testing using a kv store for results + key = "query_id={}".format(query.id) + logging.info("Storing results in key=[{}]".format(key)) + cache.set(key, json.dumps(payload, default=utils.json_iso_dttm_ser)) + ''' diff --git a/caravel/views.py b/caravel/views.py index 513564fcdaaf..3b187b0194f4 100755 --- a/caravel/views.py +++ b/caravel/views.py @@ -32,7 +32,7 @@ import caravel from caravel import ( - appbuilder, db, models, viz, utils, app, sm, ascii_art, sql_lab + appbuilder, cache, db, models, viz, utils, app, sm, ascii_art, sql_lab ) config = app.config @@ -447,7 +447,8 @@ class DatabaseView(CaravelModelView, DeleteMixin): # noqa list_columns = ['database_name', 'creator', 'changed_on_'] add_columns = [ 'database_name', 'sqlalchemy_uri', 'cache_timeout', 'extra', - 'expose_in_sqllab', 'allow_ctas', 'force_ctas_schema'] + 'expose_in_sqllab', 'allow_run_sync', 'allow_run_async', + 'allow_ctas', 'force_ctas_schema'] search_exclude_columns = ('password',) edit_columns = add_columns show_columns = [ @@ -470,6 +471,14 @@ class DatabaseView(CaravelModelView, DeleteMixin): # noqa "to structure your URI here: " "http://docs.sqlalchemy.org/en/rel_1_0/core/engines.html"), 'expose_in_sqllab': _("Expose this DB in SQL Lab"), + 'allow_run_sync': _( + "Allow users to run synchronous queries, this is the default " + "and should work well for queries that can be executed " + "within a web request scope (<~1 minute)"), + 'allow_run_async': _( + "Allow users to run queries, against an async backend. " + "This assumes that you have a Celery worker setup as well " + "as a results backend."), 'allow_ctas': _("Allow CREATE TABLE AS option in SQL Lab"), 'force_ctas_schema': _( "When allowing CREATE TABLE AS option in SQL Lab, " @@ -520,7 +529,8 @@ def pre_update(self, db): class DatabaseAsync(DatabaseView): list_columns = [ 'id', 'database_name', - 'expose_in_sqllab', 'allow_ctas', 'force_ctas_schema' + 'expose_in_sqllab', 'allow_ctas', 'force_ctas_schema', + 'allow_run_async', 'allow_run_sync', ] appbuilder.add_view_no_menu(DatabaseAsync) @@ -1535,12 +1545,22 @@ def select_star(self, database_id, table_name): def theme(self): return self.render_template('caravel/theme.html') - @has_access + @has_access_api + @expose("/cached_key//") + @log_this + def cached_key(self, key): + """Returns a key from the cache""" + resp = cache.get(key) + if resp: + return resp + return "nope" + + @has_access_api @expose("/sql_json/", methods=['POST', 'GET']) @log_this def sql_json(self): """Runs arbitrary sql and returns and json""" - async = request.form.get('async') == 'true' + async = request.form.get('runAsync') == 'true' sql = request.form.get('sql') database_id = request.form.get('database_id') @@ -1577,7 +1597,7 @@ def sql_json(self): # Async request. if async: # Ignore the celery future object and the request may time out. - sql_lab.get_sql_results.delay(query_id) + sql_lab.get_sql_results.delay(query_id, return_results=False) return Response( json.dumps({'query': query.to_dict()}, default=utils.json_int_dttm_ser, @@ -1592,9 +1612,10 @@ def sql_json(self): seconds=SQLLAB_TIMEOUT, error_message=( "The query exceeded the {SQLLAB_TIMEOUT} seconds " - "timeout." + "timeout. You may want to run your query as a " + "`CREATE TABLE AS` to prevent timeouts." ).format(**locals())): - data = sql_lab.get_sql_results(query_id) + data = sql_lab.get_sql_results(query_id, return_results=True) except Exception as e: logging.exception(e) return Response( From 66ad2da51c196b6eb46086c78087e0666c1b6b9b Mon Sep 17 00:00:00 2001 From: Maxime Beauchemin Date: Tue, 13 Sep 2016 13:17:01 -0700 Subject: [PATCH 04/16] [bugfix] database is not selected --- caravel/assets/javascripts/SqlLab/components/SqlEditor.jsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/caravel/assets/javascripts/SqlLab/components/SqlEditor.jsx b/caravel/assets/javascripts/SqlLab/components/SqlEditor.jsx index 1d50e1c30f6e..e52c9336cd2a 100644 --- a/caravel/assets/javascripts/SqlLab/components/SqlEditor.jsx +++ b/caravel/assets/javascripts/SqlLab/components/SqlEditor.jsx @@ -151,7 +151,7 @@ class SqlEditor extends React.Component { render() { let runButtons = []; - if (this.props.database.allow_run_sync) { + if (this.props.database && this.props.database.allow_run_sync) { runButtons.push(