From b1661ae4785ddfd7db5ebd8cf10f334cf4d9579b Mon Sep 17 00:00:00 2001 From: Raghwendra Singh Date: Sun, 31 Jul 2022 02:15:29 +0530 Subject: [PATCH 01/28] DI-679 added CSP HTTP response header --- conf/zeppelin-site.xml.template | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/conf/zeppelin-site.xml.template b/conf/zeppelin-site.xml.template index 4ee200b9956..5ab01b5ceb1 100755 --- a/conf/zeppelin-site.xml.template +++ b/conf/zeppelin-site.xml.template @@ -644,6 +644,14 @@ The X-Frame-Options HTTP response header can be used to indicate whether or not a browser should be allowed to render a page in a frame/iframe/object. + + + diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java index 350e72b35be..b67e92c68b3 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java @@ -908,7 +908,6 @@ private InterpreterResult executeSql(String dbPrefix, String sql, } } finalOutput.append(userName); - context.out.write(finalOutput.toString()); context.getLocalProperties().put(CANCEL_REASON, finalOutput.toString()); cancel(context); return new InterpreterResult(Code.ERROR, finalOutput.toString()); @@ -916,16 +915,8 @@ private InterpreterResult executeSql(String dbPrefix, String sql, } catch (Exception e) { context.out.write("Error occurred while sending request"); - System.err.println("Error occurred while sending request: " + e.getMessage()); - e.printStackTrace(); } -// if (sqlToExecute.contains("fail_fast_kill")) { -// context.getLocalProperties().put(CANCEL_REASON, "Fail Fast custom error"); -// cancel(context); -// } - - boolean isResultSetAvailable = statement.execute(sqlToExecute); getJDBCConfiguration(user).setConnectionInDBDriverPoolSuccessful(dbPrefix); if (isResultSetAvailable) { diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java 
b/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java index 7dea7f564b0..94de605e3d3 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java @@ -3,12 +3,10 @@ public class ValidationRequest { private String queryText; - // Constructor public ValidationRequest(String queryText) { this.queryText = queryText; } - // Getter and Setter public String getQueryText() { return queryText; } diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java index 801a5aa82c8..027f91b7300 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java @@ -1,5 +1,8 @@ package org.apache.zeppelin.jdbc; +import com.google.gson.Gson; +import com.google.gson.JsonObject; + public class ValidationResponse { private boolean preSubmitFail; private boolean failFast; @@ -40,18 +43,20 @@ public void setMessage(String message) { } public static ValidationResponse fromJson(String jsonResponse) { + Gson gson = new Gson(); ValidationResponse response = new ValidationResponse(); - // Use simple JSON parsing (can replace with a library like Jackson or Gson) - response.setPreSubmitFail(jsonResponse.contains("\"pre_submit_fail\":true")); - response.setFailFast(jsonResponse.contains("\"fail_fast\":true")); - response.setFailedByDeprecatedTable(jsonResponse.contains("\"failed_by_deprecated_table\":true")); - - int messageIndex = jsonResponse.indexOf("\"message\":\""); - if (messageIndex != -1) { - int messageEnd = jsonResponse.indexOf("\"", messageIndex + 10); - String message = jsonResponse.substring(messageIndex + 10, messageEnd); - response.setMessage(message); + + JsonObject jsonObject = gson.fromJson(jsonResponse, JsonObject.class); + + response.setPreSubmitFail(jsonObject.get("pre_submit_fail").getAsBoolean()); 
+ response.setFailFast(jsonObject.get("fail_fast").getAsBoolean()); + response.setFailedByDeprecatedTable(jsonObject.get("failed_by_deprecated_table").getAsBoolean()); + + // Extract the "message" field + if (jsonObject.has("message")) { + response.setMessage(jsonObject.get("message").getAsString()); } + return response; } } From 39f0a7bab0dd5732c0f1fcd6e3b017ecbb2f837e Mon Sep 17 00:00:00 2001 From: Aman Singh Chauhan Date: Tue, 17 Sep 2024 16:58:58 +0530 Subject: [PATCH 11/28] added loggers --- .../java/org/apache/zeppelin/jdbc/JDBCInterpreter.java | 9 ++++++--- .../org/apache/zeppelin/jdbc/ValidationResponse.java | 2 -- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java index b67e92c68b3..a3eaa99ddc2 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java @@ -867,11 +867,14 @@ private InterpreterResult executeSql(String dbPrefix, String sql, ValidationRequest request = new ValidationRequest(sqlToExecute); try { + context.out.write("Sending request for validation"); ValidationResponse response = sendValidationRequest(request); + context.out.write("Response received for validation"); if (response.isPreSubmitFail()) { + context.out.write("Pre Submit custom error check"); String outputMessage = response.getMessage(); - String userName = getUser(context); - context.out.write(userName); +// String userName = getUser(context); +// context.out.write(userName); StringBuilder finalOutput = new StringBuilder(); if (response.isFailFast()) { @@ -907,7 +910,7 @@ private InterpreterResult executeSql(String dbPrefix, String sql, } } } - finalOutput.append(userName); +// finalOutput.append(userName); context.getLocalProperties().put(CANCEL_REASON, finalOutput.toString()); cancel(context); return new InterpreterResult(Code.ERROR, 
finalOutput.toString()); diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java index 027f91b7300..0f0a30cedf4 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java @@ -51,8 +51,6 @@ public static ValidationResponse fromJson(String jsonResponse) { response.setPreSubmitFail(jsonObject.get("pre_submit_fail").getAsBoolean()); response.setFailFast(jsonObject.get("fail_fast").getAsBoolean()); response.setFailedByDeprecatedTable(jsonObject.get("failed_by_deprecated_table").getAsBoolean()); - - // Extract the "message" field if (jsonObject.has("message")) { response.setMessage(jsonObject.get("message").getAsString()); } From b6225df40b710667e997fd4afde71fe07d1bddbf Mon Sep 17 00:00:00 2001 From: Aman Singh Chauhan Date: Tue, 17 Sep 2024 18:00:15 +0530 Subject: [PATCH 12/28] added stract trace --- .../org/apache/zeppelin/jdbc/JDBCInterpreter.java | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java index a3eaa99ddc2..6e7c90afdc1 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java @@ -867,14 +867,14 @@ private InterpreterResult executeSql(String dbPrefix, String sql, ValidationRequest request = new ValidationRequest(sqlToExecute); try { - context.out.write("Sending request for validation"); + context.out.write("Sending request for validation\n"); ValidationResponse response = sendValidationRequest(request); context.out.write("Response received for validation"); if (response.isPreSubmitFail()) { context.out.write("Pre Submit custom error check"); String outputMessage = response.getMessage(); -// String userName = 
getUser(context); -// context.out.write(userName); + String userName = getUser(context); + context.out.write(userName); StringBuilder finalOutput = new StringBuilder(); if (response.isFailFast()) { @@ -910,14 +910,19 @@ private InterpreterResult executeSql(String dbPrefix, String sql, } } } -// finalOutput.append(userName); + finalOutput.append(userName); context.getLocalProperties().put(CANCEL_REASON, finalOutput.toString()); cancel(context); return new InterpreterResult(Code.ERROR, finalOutput.toString()); } } catch (Exception e) { - context.out.write("Error occurred while sending request"); + String error = "Error occurred while sending request" + e.getMessage(); + String stackTrace = e.getStackTrace().toString(); + String mess = e.getLocalizedMessage(); + context.out.write(error); + context.out.write(stackTrace); + context.out.write(mess); } boolean isResultSetAvailable = statement.execute(sqlToExecute); From b06c1619f786226ee67d45a43e570a05fd8c694a Mon Sep 17 00:00:00 2001 From: Aman Singh Chauhan Date: Tue, 17 Sep 2024 20:13:45 +0530 Subject: [PATCH 13/28] removed test loggers --- jdbc/pom.xml | 2 +- .../org/apache/zeppelin/jdbc/JDBCInterpreter.java | 14 +++----------- .../apache/zeppelin/jdbc/ValidationRequest.java | 14 ++++---------- 3 files changed, 8 insertions(+), 22 deletions(-) diff --git a/jdbc/pom.xml b/jdbc/pom.xml index bfa0a600cc9..56467b4d074 100644 --- a/jdbc/pom.xml +++ b/jdbc/pom.xml @@ -59,7 +59,7 @@ com.google.code.gson gson - 2.8.9 + 2.8.9 diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java index 6e7c90afdc1..21c29b21121 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java @@ -865,20 +865,15 @@ private InterpreterResult executeSql(String dbPrefix, String sql, Boolean.parseBoolean(getProperty("hive.log.display", "true")), this); } - ValidationRequest 
request = new ValidationRequest(sqlToExecute); + String userName = getUser(context); + ValidationRequest request = new ValidationRequest(sqlToExecute, userName); try { - context.out.write("Sending request for validation\n"); ValidationResponse response = sendValidationRequest(request); - context.out.write("Response received for validation"); if (response.isPreSubmitFail()) { - context.out.write("Pre Submit custom error check"); String outputMessage = response.getMessage(); - String userName = getUser(context); - context.out.write(userName); StringBuilder finalOutput = new StringBuilder(); if (response.isFailFast()) { - context.out.write("Fail Fast custom error"); JSONObject jsonObject = new JSONObject(outputMessage); finalOutput.append("The following TABLE(s) used in the query are not using partition filter:\n"); @@ -910,18 +905,15 @@ private InterpreterResult executeSql(String dbPrefix, String sql, } } } - finalOutput.append(userName); context.getLocalProperties().put(CANCEL_REASON, finalOutput.toString()); cancel(context); - return new InterpreterResult(Code.ERROR, finalOutput.toString()); + return new InterpreterResult(Code.ERROR, "Failed by Fail Fast"); } } catch (Exception e) { String error = "Error occurred while sending request" + e.getMessage(); - String stackTrace = e.getStackTrace().toString(); String mess = e.getLocalizedMessage(); context.out.write(error); - context.out.write(stackTrace); context.out.write(mess); } diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java index 94de605e3d3..94e0a6edf40 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java @@ -2,21 +2,15 @@ public class ValidationRequest { private String queryText; + private String user; - public ValidationRequest(String queryText) { - this.queryText = queryText; - } - - public String getQueryText() { 
- return queryText; - } - - public void setQueryText(String queryText) { + public ValidationRequest(String queryText, String user) { this.queryText = queryText; + this.user = user; } public String toJson() { - return "{\"query_text\":\"" + queryText + "\"}"; + return "{\"queryText\":\"" + queryText + "\",\"user\":\"" + user + "\"}"; } } From 08cf063f0223123fe1188aae4cb7438a28599289 Mon Sep 17 00:00:00 2001 From: Aman Singh Chauhan Date: Tue, 17 Sep 2024 22:26:12 +0530 Subject: [PATCH 14/28] updated request payload --- .../src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java | 2 -- .../main/java/org/apache/zeppelin/jdbc/ValidationRequest.java | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java index 21c29b21121..0058f9f45dd 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java @@ -376,7 +376,6 @@ private static HttpURLConnection createConnection() throws Exception { private static void sendRequest(HttpURLConnection connection, ValidationRequest request) throws Exception { try (OutputStream os = connection.getOutputStream()) { - // Manually convert the request object to a JSON string String jsonRequest = request.toJson(); byte[] input = jsonRequest.getBytes("utf-8"); os.write(input, 0, input.length); @@ -909,7 +908,6 @@ private InterpreterResult executeSql(String dbPrefix, String sql, cancel(context); return new InterpreterResult(Code.ERROR, "Failed by Fail Fast"); } - } catch (Exception e) { String error = "Error occurred while sending request" + e.getMessage(); String mess = e.getLocalizedMessage(); diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java index 94e0a6edf40..71d8ad17418 100644 --- 
a/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java @@ -10,7 +10,7 @@ public ValidationRequest(String queryText, String user) { } public String toJson() { - return "{\"queryText\":\"" + queryText + "\",\"user\":\"" + user + "\"}"; + return "{\"query_text\":\"" + queryText + "\",\"user\":\"" + user + "\"}"; } } From 682c3dcb11bf62b6aaea7cfe4c412a9a1e0547cd Mon Sep 17 00:00:00 2001 From: Aman Singh Chauhan Date: Wed, 18 Sep 2024 13:06:42 +0530 Subject: [PATCH 15/28] updated sql --- .../org/apache/zeppelin/jdbc/JDBCInterpreter.java | 10 ++++++++-- .../apache/zeppelin/jdbc/ValidationResponse.java | 15 ++++++++++----- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java index 0058f9f45dd..099083b1c8d 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java @@ -865,7 +865,13 @@ private InterpreterResult executeSql(String dbPrefix, String sql, } String userName = getUser(context); - ValidationRequest request = new ValidationRequest(sqlToExecute, userName); + String sqlToValidate = sqlToExecute + .replace("\n", "\\n") // Newlines + .replace("\r", "\\r") // Carriage return + .replace("\t", "\\t") // Tabs + .replace("\"", "\\\"") // Double quotes + .replace("\\", "\\\\"); // Backslashes + ValidationRequest request = new ValidationRequest(sqlToValidate, userName); try { ValidationResponse response = sendValidationRequest(request); if (response.isPreSubmitFail()) { @@ -909,7 +915,7 @@ private InterpreterResult executeSql(String dbPrefix, String sql, return new InterpreterResult(Code.ERROR, "Failed by Fail Fast"); } } catch (Exception e) { - String error = "Error occurred while sending request" + e.getMessage(); + String error = "Error occurred while sending 
request " + e.getMessage(); String mess = e.getLocalizedMessage(); context.out.write(error); context.out.write(mess); diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java index 0f0a30cedf4..2128dfb86a1 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java @@ -48,13 +48,18 @@ public static ValidationResponse fromJson(String jsonResponse) { JsonObject jsonObject = gson.fromJson(jsonResponse, JsonObject.class); - response.setPreSubmitFail(jsonObject.get("pre_submit_fail").getAsBoolean()); - response.setFailFast(jsonObject.get("fail_fast").getAsBoolean()); - response.setFailedByDeprecatedTable(jsonObject.get("failed_by_deprecated_table").getAsBoolean()); - if (jsonObject.has("message")) { + if (jsonObject.has("pre_submit_fail") && !jsonObject.get("pre_submit_fail").isJsonNull()) { + response.setPreSubmitFail(jsonObject.get("pre_submit_fail").getAsBoolean()); + } + if (jsonObject.has("fail_fast") && !jsonObject.get("fail_fast").isJsonNull()) { + response.setFailFast(jsonObject.get("fail_fast").getAsBoolean()); + } + if (jsonObject.has("failed_by_deprecated_table") && !jsonObject.get("failed_by_deprecated_table").isJsonNull()) { + response.setFailedByDeprecatedTable(jsonObject.get("failed_by_deprecated_table").getAsBoolean()); + } + if (jsonObject.has("message") && !jsonObject.get("message").isJsonNull()) { response.setMessage(jsonObject.get("message").getAsString()); } - return response; } } From 0e70d81bf62f2aabca3b141abd89683aabfc4936 Mon Sep 17 00:00:00 2001 From: Aman Singh Chauhan Date: Wed, 18 Sep 2024 15:48:19 +0530 Subject: [PATCH 16/28] updated escapae character --- .../java/org/apache/zeppelin/jdbc/JDBCInterpreter.java | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java 
b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java index 099083b1c8d..8c1f6057012 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java @@ -866,11 +866,9 @@ private InterpreterResult executeSql(String dbPrefix, String sql, String userName = getUser(context); String sqlToValidate = sqlToExecute - .replace("\n", "\\n") // Newlines - .replace("\r", "\\r") // Carriage return - .replace("\t", "\\t") // Tabs - .replace("\"", "\\\"") // Double quotes - .replace("\\", "\\\\"); // Backslashes + .replace("\n", " ") + .replace("\r", " ") + .replace("\t", " "); ValidationRequest request = new ValidationRequest(sqlToValidate, userName); try { ValidationResponse response = sendValidationRequest(request); From e0a936dab8417559ba227ad7242cd07ba03e2dd1 Mon Sep 17 00:00:00 2001 From: Aman Singh Chauhan Date: Wed, 18 Sep 2024 17:28:16 +0530 Subject: [PATCH 17/28] updated url --- .../main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java index 8c1f6057012..37414f95dee 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java @@ -151,7 +151,7 @@ public class JDBCInterpreter extends KerberosInterpreter { "zeppelin.jdbc.concurrent.max_connection"; private static final String DBCP_STRING = "jdbc:apache:commons:dbcp:"; private static final String MAX_ROWS_KEY = "zeppelin.jdbc.maxRows"; - private static final String FAIL_FAST_VALIDATE_URL = "http://localhost:8080/api/validate"; + private static final String FAIL_FAST_VALIDATE_URL = "http://spark-event-listener.prd.meesho.int/api/validate"; private static final Set PRESTO_PROPERTIES = new HashSet<>(Arrays.asList( "user", "password", @@ -877,6 
+877,7 @@ private InterpreterResult executeSql(String dbPrefix, String sql, StringBuilder finalOutput = new StringBuilder(); if (response.isFailFast()) { + context.out.write("Query failed because partitions were not used in the query. Please ensure that partition filters are applied.\n"); JSONObject jsonObject = new JSONObject(outputMessage); finalOutput.append("The following TABLE(s) used in the query are not using partition filter:\n"); @@ -897,6 +898,7 @@ private InterpreterResult executeSql(String dbPrefix, String sql, } } } else if (response.isFailedByDeprecatedTable()) { + context.out.write("Query failed as Restricted table(s) are used\n"); JSONObject jsonObject = new JSONObject(outputMessage); finalOutput.append("The following TABLE(s) used in the query are restricted:\n"); @@ -910,7 +912,7 @@ private InterpreterResult executeSql(String dbPrefix, String sql, } context.getLocalProperties().put(CANCEL_REASON, finalOutput.toString()); cancel(context); - return new InterpreterResult(Code.ERROR, "Failed by Fail Fast"); + return new InterpreterResult(Code.ERROR); } } catch (Exception e) { String error = "Error occurred while sending request " + e.getMessage(); From 76db3df10248abcf6914e856ec9b28188159ac12 Mon Sep 17 00:00:00 2001 From: Aman Singh Chauhan Date: Tue, 1 Oct 2024 16:48:46 +0530 Subject: [PATCH 18/28] added timeout for rca cluster --- .../main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java index 37414f95dee..f9fe9dd4f87 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java @@ -839,6 +839,12 @@ private InterpreterResult executeSql(String dbPrefix, String sql, LOGGER.info("Execute sql: " + sqlToExecute); statement = connection.createStatement(); + String interpreterName = 
getProperty("zeppelin.jdbc.interpreter.name"); + + if (interpreterName != null && interpreterName.startsWith("spark_")) { + statement.setQueryTimeout(60); // 10800 seconds = 3 hours + } + // fetch n+1 rows in order to indicate there's more rows available (for large selects) statement.setFetchSize(context.getIntLocalProperty("limit", getMaxResult())); statement.setMaxRows(context.getIntLocalProperty("limit", maxRows)); From f0d6248c44250271bb5ed3060f06470fc2ccf05a Mon Sep 17 00:00:00 2001 From: Aman Singh Chauhan Date: Wed, 2 Oct 2024 20:30:30 +0530 Subject: [PATCH 19/28] added logging --- .../main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java index f9fe9dd4f87..3b8dd8b6c05 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java @@ -840,9 +840,11 @@ private InterpreterResult executeSql(String dbPrefix, String sql, statement = connection.createStatement(); String interpreterName = getProperty("zeppelin.jdbc.interpreter.name"); + context.out.write("Interpreter Name: " + interpreterName); if (interpreterName != null && interpreterName.startsWith("spark_")) { - statement.setQueryTimeout(60); // 10800 seconds = 3 hours + statement.setQueryTimeout(5); // 10800 seconds = 3 hours + context.out.write("Query Timeout: 5 seconds"); } // fetch n+1 rows in order to indicate there's more rows available (for large selects) From 761455984b258b1e629ac51d8b9bcc23e096d348 Mon Sep 17 00:00:00 2001 From: Aman Singh Chauhan Date: Tue, 8 Oct 2024 14:41:40 +0530 Subject: [PATCH 20/28] updated get interpreterName --- .../apache/zeppelin/jdbc/JDBCInterpreter.java | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git 
a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java index 3b8dd8b6c05..b34862723be 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java @@ -839,9 +839,13 @@ private InterpreterResult executeSql(String dbPrefix, String sql, LOGGER.info("Execute sql: " + sqlToExecute); statement = connection.createStatement(); - String interpreterName = getProperty("zeppelin.jdbc.interpreter.name"); + String interpreterName = getInterpreterGroup().getId(); context.out.write("Interpreter Name: " + interpreterName); + String className = getClassName(); + context.out.write("Class Name: " + className); + + if (interpreterName != null && interpreterName.startsWith("spark_")) { statement.setQueryTimeout(5); // 10800 seconds = 3 hours context.out.write("Query Timeout: 5 seconds"); @@ -885,16 +889,17 @@ private InterpreterResult executeSql(String dbPrefix, String sql, StringBuilder finalOutput = new StringBuilder(); if (response.isFailFast()) { - context.out.write("Query failed because partitions were not used in the query. Please ensure that partition filters are applied.\n"); + context.out.write("Query Error: Partition Filters Missing\n" + + "Your query failed because some tables are missing partition filters. 
To avoid this, please ensure partition filters are applied to improve performance."); JSONObject jsonObject = new JSONObject(outputMessage); - finalOutput.append("The following TABLE(s) used in the query are not using partition filter:\n"); + finalOutput.append("The following table(s) are missing partition filters:\n"); JSONArray tableNames = jsonObject.names(); if (tableNames != null) { for (int i = 0; i < tableNames.length(); i++) { String table = tableNames.getString(i); JSONArray partitions = jsonObject.getJSONArray(table); - finalOutput.append(table).append(" -> "); + finalOutput.append("Table: ").append(table).append(" Partition filters: "); for (int j = 0; j < partitions.length(); j++) { finalOutput.append(partitions.getString(j)); @@ -906,15 +911,15 @@ private InterpreterResult executeSql(String dbPrefix, String sql, } } } else if (response.isFailedByDeprecatedTable()) { - context.out.write("Query failed as Restricted table(s) are used\n"); + context.out.write("Query Error: Restricted Table Used\n"); JSONObject jsonObject = new JSONObject(outputMessage); - finalOutput.append("The following TABLE(s) used in the query are restricted:\n"); + finalOutput.append("It seems you're trying to use a restricted table:\n"); JSONArray tableNames = jsonObject.names(); if (tableNames != null) { for (int i = 0; i < tableNames.length(); i++) { String table = tableNames.getString(i); - finalOutput.append(table).append(" -> ").append(jsonObject.getString(table)).append("\n"); + finalOutput.append("Use: ").append(jsonObject.getString(table)).append(" in place of ").append(table).append("\n"); } } } From b4e1d0d12a27a787465dce904d1f74757056c6e1 Mon Sep 17 00:00:00 2001 From: Aman Singh Chauhan Date: Tue, 8 Oct 2024 17:16:52 +0530 Subject: [PATCH 21/28] removed logger --- .../org/apache/zeppelin/jdbc/JDBCInterpreter.java | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java 
b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java index b34862723be..48592b21914 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java @@ -840,15 +840,9 @@ private InterpreterResult executeSql(String dbPrefix, String sql, statement = connection.createStatement(); String interpreterName = getInterpreterGroup().getId(); - context.out.write("Interpreter Name: " + interpreterName); - String className = getClassName(); - context.out.write("Class Name: " + className); - - - if (interpreterName != null && interpreterName.startsWith("spark_")) { - statement.setQueryTimeout(5); // 10800 seconds = 3 hours - context.out.write("Query Timeout: 5 seconds"); + if (interpreterName != null && interpreterName.startsWith("spark_rca_")) { + statement.setQueryTimeout(10800); // 10800 seconds = 3 hours } // fetch n+1 rows in order to indicate there's more rows available (for large selects) @@ -890,7 +884,7 @@ private InterpreterResult executeSql(String dbPrefix, String sql, if (response.isFailFast()) { context.out.write("Query Error: Partition Filters Missing\n" + - "Your query failed because some tables are missing partition filters. To avoid this, please ensure partition filters are applied to improve performance."); + "Your query failed because some tables are missing partition filters. 
To avoid this, please ensure partition filters are applied to improve performance.\n"); JSONObject jsonObject = new JSONObject(outputMessage); finalOutput.append("The following table(s) are missing partition filters:\n"); @@ -899,7 +893,7 @@ private InterpreterResult executeSql(String dbPrefix, String sql, for (int i = 0; i < tableNames.length(); i++) { String table = tableNames.getString(i); JSONArray partitions = jsonObject.getJSONArray(table); - finalOutput.append("Table: ").append(table).append(" Partition filters: "); + finalOutput.append("Table: ").append(table).append(", Partition filter's: "); for (int j = 0; j < partitions.length(); j++) { finalOutput.append(partitions.getString(j)); From 12dfddcfc9c1f2d2088405535828fc0df0537c43 Mon Sep 17 00:00:00 2001 From: Aman Singh Chauhan Date: Thu, 10 Oct 2024 13:53:11 +0530 Subject: [PATCH 22/28] updated Gson object --- .../zeppelin/jdbc/ValidationResponse.java | 34 ++++++++++++------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java index 2128dfb86a1..05716cc2edb 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java @@ -1,6 +1,7 @@ package org.apache.zeppelin.jdbc; import com.google.gson.Gson; +import com.google.gson.JsonElement; import com.google.gson.JsonObject; public class ValidationResponse { @@ -46,19 +47,28 @@ public static ValidationResponse fromJson(String jsonResponse) { Gson gson = new Gson(); ValidationResponse response = new ValidationResponse(); - JsonObject jsonObject = gson.fromJson(jsonResponse, JsonObject.class); + JsonElement jsonElement = gson.fromJson(jsonResponse, JsonElement.class); - if (jsonObject.has("pre_submit_fail") && !jsonObject.get("pre_submit_fail").isJsonNull()) { - 
response.setPreSubmitFail(jsonObject.get("pre_submit_fail").getAsBoolean()); - } - if (jsonObject.has("fail_fast") && !jsonObject.get("fail_fast").isJsonNull()) { - response.setFailFast(jsonObject.get("fail_fast").getAsBoolean()); - } - if (jsonObject.has("failed_by_deprecated_table") && !jsonObject.get("failed_by_deprecated_table").isJsonNull()) { - response.setFailedByDeprecatedTable(jsonObject.get("failed_by_deprecated_table").getAsBoolean()); - } - if (jsonObject.has("message") && !jsonObject.get("message").isJsonNull()) { - response.setMessage(jsonObject.get("message").getAsString()); + if (jsonElement.isJsonObject()) { + JsonObject jsonObject = jsonElement.getAsJsonObject(); + + if (jsonObject.has("pre_submit_fail") && !jsonObject.get("pre_submit_fail").isJsonNull()) { + response.setPreSubmitFail(jsonObject.get("pre_submit_fail").getAsBoolean()); + } + if (jsonObject.has("fail_fast") && !jsonObject.get("fail_fast").isJsonNull()) { + response.setFailFast(jsonObject.get("fail_fast").getAsBoolean()); + } + if (jsonObject.has("failed_by_deprecated_table") && !jsonObject.get("failed_by_deprecated_table").isJsonNull()) { + response.setFailedByDeprecatedTable(jsonObject.get("failed_by_deprecated_table").getAsBoolean()); + } + if (jsonObject.has("message") && !jsonObject.get("message").isJsonNull()) { + response.setMessage(jsonObject.get("message").getAsString()); + } + } else { + response.setPreSubmitFail(false); + response.setFailFast(false); + response.setFailedByDeprecatedTable(false); + response.setMessage(""); // Default message } return response; } From e54907b5e3ea0520048b47f59b7eea1bae883e9f Mon Sep 17 00:00:00 2001 From: shagil-meesho Date: Thu, 17 Oct 2024 23:53:53 +0530 Subject: [PATCH 23/28] feat: suffixing the STATEMENT_TIMEOUT = 10800 before rca interpreter fired queries --- .../java/org/apache/zeppelin/jdbc/JDBCInterpreter.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git 
a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java index 48592b21914..930e42e0a27 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java @@ -1058,8 +1058,14 @@ public InterpreterResult internalInterpret(String cmd, InterpreterContext contex LOGGER.debug("Run SQL command '{}'", cmd); String dbPrefix = getDBPrefix(context); LOGGER.debug("DBPrefix: {}, SQL command: '{}'", dbPrefix, cmd); + String interpreterName = getInterpreterGroup().getId(); + if (interpreterName!=null && interpreterName.startsWith("spark_rca_")) { + cmd = "set STATEMENT_TIMEOUT=10800;\n"+cmd; + } + LOGGER.debug("InterpreterName: {}, SQL command: '{}'", interpreterName, cmd); + String finalCmd = cmd; if (!isRefreshMode(context)) { - return executeSql(dbPrefix, cmd, context); + return executeSql(dbPrefix, finalCmd, context); } else { int refreshInterval = Integer.parseInt(context.getLocalProperties().get("refreshInterval")); paragraphCancelMap.put(context.getParagraphId(), false); @@ -1070,7 +1076,7 @@ public InterpreterResult internalInterpret(String cmd, InterpreterContext contex refreshExecutor.scheduleAtFixedRate(() -> { context.out.clear(false); try { - InterpreterResult result = executeSql(dbPrefix, cmd, context); + InterpreterResult result = executeSql(dbPrefix, finalCmd, context); context.out.flush(); interpreterResultRef.set(result); if (result.code() != Code.SUCCESS) { From 9c94471c1884b1e86278a97fcabf0848b761a9d3 Mon Sep 17 00:00:00 2001 From: Vishvas-meesho Date: Thu, 17 Apr 2025 15:39:09 +0530 Subject: [PATCH 24/28] F/jdbc custom interpreter (#22) * added interceptor for Fail Fast in jdbc interpreter DI-2332 * feat: suffixing the STATEMENT_TIMEOUT = 10800 before rca interpreter fired queries --------- Co-authored-by: RaghwendraSingh Co-authored-by: Aman Singh Chauhan Co-authored-by: shagil-meesho --- jdbc/pom.xml 
| 19 ++- .../apache/zeppelin/jdbc/JDBCInterpreter.java | 129 +++++++++++++++++- .../zeppelin/jdbc/ValidationRequest.java | 16 +++ .../zeppelin/jdbc/ValidationResponse.java | 75 ++++++++++ 4 files changed, 236 insertions(+), 3 deletions(-) create mode 100644 jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java create mode 100644 jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java diff --git a/jdbc/pom.xml b/jdbc/pom.xml index c4c35388ca0..56467b4d074 100644 --- a/jdbc/pom.xml +++ b/jdbc/pom.xml @@ -46,7 +46,24 @@ - + + org.json + json + 20210307 + + + org.codehaus.jettison + jettison + 1.4.1 + + + com.google.code.gson + gson + 2.8.9 + + + + org.postgresql postgresql ${postgresql.version} diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java index f5302dc48ae..930e42e0a27 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java @@ -37,6 +37,8 @@ import org.apache.zeppelin.jdbc.hive.HiveUtils; import org.apache.zeppelin.tabledata.TableDataUtils; import org.apache.zeppelin.util.PropertiesUtil; +import org.codehaus.jettison.json.JSONArray; +import org.codehaus.jettison.json.JSONObject; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -75,6 +77,12 @@ import org.apache.zeppelin.user.UserCredentials; import org.apache.zeppelin.user.UsernamePassword; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.net.HttpURLConnection; +import java.net.URL; + /** * JDBC interpreter for Zeppelin. This interpreter can also be used for accessing HAWQ, * GreenplumDB, MariaDB, MySQL, Postgres and Redshift. 
@@ -143,6 +151,7 @@ public class JDBCInterpreter extends KerberosInterpreter { "zeppelin.jdbc.concurrent.max_connection"; private static final String DBCP_STRING = "jdbc:apache:commons:dbcp:"; private static final String MAX_ROWS_KEY = "zeppelin.jdbc.maxRows"; + private static final String FAIL_FAST_VALIDATE_URL = "http://spark-event-listener.prd.meesho.int/api/validate"; private static final Set PRESTO_PROPERTIES = new HashSet<>(Arrays.asList( "user", "password", @@ -350,6 +359,51 @@ public void close() { } } + public static ValidationResponse sendValidationRequest(ValidationRequest request) throws Exception { + HttpURLConnection connection = createConnection(); + sendRequest(connection, request); + return readResponse(connection); + } + + private static HttpURLConnection createConnection() throws Exception { + URL url = new URL(FAIL_FAST_VALIDATE_URL); + HttpURLConnection connection = (HttpURLConnection) url.openConnection(); + connection.setRequestMethod("POST"); + connection.setRequestProperty("Content-Type", "application/json"); + connection.setDoOutput(true); // Enable sending request body + return connection; + } + + private static void sendRequest(HttpURLConnection connection, ValidationRequest request) throws Exception { + try (OutputStream os = connection.getOutputStream()) { + String jsonRequest = request.toJson(); + byte[] input = jsonRequest.getBytes("utf-8"); + os.write(input, 0, input.length); + } + } + + private static ValidationResponse readResponse(HttpURLConnection connection) throws Exception { + int statusCode = connection.getResponseCode(); + BufferedReader reader; + + if (statusCode == HttpURLConnection.HTTP_OK) { + reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "utf-8")); + } else { + reader = new BufferedReader(new InputStreamReader(connection.getErrorStream(), "utf-8")); + } + + StringBuilder responseBuilder = new StringBuilder(); + String line; + while ((line = reader.readLine()) != null) { + 
responseBuilder.append(line.trim()); + } + + reader.close(); + connection.disconnect(); + + return ValidationResponse.fromJson(responseBuilder.toString()); + } + /* Get user of this sql. * 1. If shiro is enabled, use the login user * 2. Otherwise try to get it from interpreter setting, e.g. default.user @@ -785,6 +839,12 @@ private InterpreterResult executeSql(String dbPrefix, String sql, LOGGER.info("Execute sql: " + sqlToExecute); statement = connection.createStatement(); + String interpreterName = getInterpreterGroup().getId(); + + if (interpreterName != null && interpreterName.startsWith("spark_rca_")) { + statement.setQueryTimeout(10800); // 10800 seconds = 3 hours + } + // fetch n+1 rows in order to indicate there's more rows available (for large selects) statement.setFetchSize(context.getIntLocalProperty("limit", getMaxResult())); statement.setMaxRows(context.getIntLocalProperty("limit", maxRows)); @@ -809,6 +869,65 @@ private InterpreterResult executeSql(String dbPrefix, String sql, HiveUtils.startHiveMonitorThread(statement, context, Boolean.parseBoolean(getProperty("hive.log.display", "true")), this); } + + String userName = getUser(context); + String sqlToValidate = sqlToExecute + .replace("\n", " ") + .replace("\r", " ") + .replace("\t", " "); + ValidationRequest request = new ValidationRequest(sqlToValidate, userName); + try { + ValidationResponse response = sendValidationRequest(request); + if (response.isPreSubmitFail()) { + String outputMessage = response.getMessage(); + StringBuilder finalOutput = new StringBuilder(); + + if (response.isFailFast()) { + context.out.write("Query Error: Partition Filters Missing\n" + + "Your query failed because some tables are missing partition filters. 
To avoid this, please ensure partition filters are applied to improve performance.\n"); + JSONObject jsonObject = new JSONObject(outputMessage); + finalOutput.append("The following table(s) are missing partition filters:\n"); + + JSONArray tableNames = jsonObject.names(); + if (tableNames != null) { + for (int i = 0; i < tableNames.length(); i++) { + String table = tableNames.getString(i); + JSONArray partitions = jsonObject.getJSONArray(table); + finalOutput.append("Table: ").append(table).append(", Partition filter's: "); + + for (int j = 0; j < partitions.length(); j++) { + finalOutput.append(partitions.getString(j)); + if (j < partitions.length() - 1) { + finalOutput.append(", "); + } + } + finalOutput.append("\n"); + } + } + } else if (response.isFailedByDeprecatedTable()) { + context.out.write("Query Error: Restricted Table Used\n"); + JSONObject jsonObject = new JSONObject(outputMessage); + finalOutput.append("It seems you're trying to use a restricted table:\n"); + + JSONArray tableNames = jsonObject.names(); + if (tableNames != null) { + for (int i = 0; i < tableNames.length(); i++) { + String table = tableNames.getString(i); + finalOutput.append("Use: ").append(jsonObject.getString(table)).append(" in place of ").append(table).append("\n"); + } + } + } + context.getLocalProperties().put(CANCEL_REASON, finalOutput.toString()); + cancel(context); + return new InterpreterResult(Code.ERROR); + } + } catch (Exception e) { + String error = "Error occurred while sending request " + e.getMessage(); + String mess = e.getLocalizedMessage(); + context.out.write(error); + context.out.write(mess); + } + boolean isResultSetAvailable = statement.execute(sqlToExecute); getJDBCConfiguration(user).setConnectionInDBDriverPoolSuccessful(dbPrefix); if (isResultSetAvailable) { @@ -939,8 +1058,14 @@ public InterpreterResult internalInterpret(String cmd, InterpreterContext contex LOGGER.debug("Run SQL command '{}'", cmd); String dbPrefix = getDBPrefix(context); 
LOGGER.debug("DBPrefix: {}, SQL command: '{}'", dbPrefix, cmd); + String interpreterName = getInterpreterGroup().getId(); + if (interpreterName!=null && interpreterName.startsWith("spark_rca_")) { + cmd = "set STATEMENT_TIMEOUT=10800;\n"+cmd; + } + LOGGER.debug("InterpreterName: {}, SQL command: '{}'", interpreterName, cmd); + String finalCmd = cmd; if (!isRefreshMode(context)) { - return executeSql(dbPrefix, cmd, context); + return executeSql(dbPrefix, finalCmd, context); } else { int refreshInterval = Integer.parseInt(context.getLocalProperties().get("refreshInterval")); paragraphCancelMap.put(context.getParagraphId(), false); @@ -951,7 +1076,7 @@ public InterpreterResult internalInterpret(String cmd, InterpreterContext contex refreshExecutor.scheduleAtFixedRate(() -> { context.out.clear(false); try { - InterpreterResult result = executeSql(dbPrefix, cmd, context); + InterpreterResult result = executeSql(dbPrefix, finalCmd, context); context.out.flush(); interpreterResultRef.set(result); if (result.code() != Code.SUCCESS) { diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java new file mode 100644 index 00000000000..71d8ad17418 --- /dev/null +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java @@ -0,0 +1,16 @@ +package org.apache.zeppelin.jdbc; + +public class ValidationRequest { + private String queryText; + private String user; + + public ValidationRequest(String queryText, String user) { + this.queryText = queryText; + this.user = user; + } + + public String toJson() { + return "{\"query_text\":\"" + queryText + "\",\"user\":\"" + user + "\"}"; + } +} + diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java new file mode 100644 index 00000000000..05716cc2edb --- /dev/null +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java @@ -0,0 
+1,75 @@ +package org.apache.zeppelin.jdbc; + +import com.google.gson.Gson; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; + +public class ValidationResponse { + private boolean preSubmitFail; + private boolean failFast; + private boolean failedByDeprecatedTable; + private String message; + + // Getters and Setters + public boolean isPreSubmitFail() { + return preSubmitFail; + } + + public void setPreSubmitFail(boolean preSubmitFail) { + this.preSubmitFail = preSubmitFail; + } + + public boolean isFailFast() { + return failFast; + } + + public void setFailFast(boolean failFast) { + this.failFast = failFast; + } + + public boolean isFailedByDeprecatedTable() { + return failedByDeprecatedTable; + } + + public void setFailedByDeprecatedTable(boolean failedByDeprecatedTable) { + this.failedByDeprecatedTable = failedByDeprecatedTable; + } + + public String getMessage() { + return message; + } + + public void setMessage(String message) { + this.message = message; + } + + public static ValidationResponse fromJson(String jsonResponse) { + Gson gson = new Gson(); + ValidationResponse response = new ValidationResponse(); + + JsonElement jsonElement = gson.fromJson(jsonResponse, JsonElement.class); + + if (jsonElement.isJsonObject()) { + JsonObject jsonObject = jsonElement.getAsJsonObject(); + + if (jsonObject.has("pre_submit_fail") && !jsonObject.get("pre_submit_fail").isJsonNull()) { + response.setPreSubmitFail(jsonObject.get("pre_submit_fail").getAsBoolean()); + } + if (jsonObject.has("fail_fast") && !jsonObject.get("fail_fast").isJsonNull()) { + response.setFailFast(jsonObject.get("fail_fast").getAsBoolean()); + } + if (jsonObject.has("failed_by_deprecated_table") && !jsonObject.get("failed_by_deprecated_table").isJsonNull()) { + response.setFailedByDeprecatedTable(jsonObject.get("failed_by_deprecated_table").getAsBoolean()); + } + if (jsonObject.has("message") && !jsonObject.get("message").isJsonNull()) { + 
response.setMessage(jsonObject.get("message").getAsString()); + } + } else { + response.setPreSubmitFail(false); + response.setFailFast(false); + response.setFailedByDeprecatedTable(false); + response.setMessage(""); // Default message + } + return response; + } +} From d8ac2f1013f6713fae78016d839954b7c2f3aba4 Mon Sep 17 00:00:00 2001 From: anuraags2719 Date: Mon, 30 Jun 2025 23:32:32 +0530 Subject: [PATCH 25/28] added 1 hour restriction --- .gitignore | 2 ++ .../src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java | 2 ++ 2 files changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index e52d5a63a2e..ec6747baad1 100644 --- a/.gitignore +++ b/.gitignore @@ -131,3 +131,5 @@ tramp # jEnv file .java-version +.pre-commit-config.yaml +trufflehog/ diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java index 930e42e0a27..33b5391a919 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java @@ -843,6 +843,8 @@ private InterpreterResult executeSql(String dbPrefix, String sql, if (interpreterName != null && interpreterName.startsWith("spark_rca_")) { statement.setQueryTimeout(10800); // 10800 seconds = 3 hours + } else if (interpreterName != null && interpreterName.startsWith("spark_")) { + statement.setQueryTimeout(3600); // 3600 seconds = 1 hour } // fetch n+1 rows in order to indicate there's more rows available (for large selects) From 03a629418c821de742f7b18c0efc993b5039a998 Mon Sep 17 00:00:00 2001 From: anuraags2719 <95436809+anuraags2719@users.noreply.github.com> Date: Tue, 15 Jul 2025 17:30:50 +0530 Subject: [PATCH 26/28] Revert "added 1 hour restriction" --- .gitignore | 2 -- .../src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java | 2 -- 2 files changed, 4 deletions(-) diff --git a/.gitignore b/.gitignore index ec6747baad1..e52d5a63a2e 100644 --- a/.gitignore +++ 
b/.gitignore @@ -131,5 +131,3 @@ tramp # jEnv file .java-version -.pre-commit-config.yaml -trufflehog/ diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java index 33b5391a919..930e42e0a27 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java @@ -843,8 +843,6 @@ private InterpreterResult executeSql(String dbPrefix, String sql, if (interpreterName != null && interpreterName.startsWith("spark_rca_")) { statement.setQueryTimeout(10800); // 10800 seconds = 3 hours - } else if (interpreterName != null && interpreterName.startsWith("spark_")) { - statement.setQueryTimeout(3600); // 3600 seconds = 1 hour } // fetch n+1 rows in order to indicate there's more rows available (for large selects) From f08494e9d97ef34eda80fa37db4b7f1a9a4de5be Mon Sep 17 00:00:00 2001 From: Vishvas-meesho Date: Fri, 8 Aug 2025 00:48:15 +0530 Subject: [PATCH 27/28] passing interpreter_name to fail_fast (#27) --- .../main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java | 5 ++++- .../java/org/apache/zeppelin/jdbc/ValidationRequest.java | 6 ++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java index 930e42e0a27..6893033816f 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java @@ -875,7 +875,7 @@ private InterpreterResult executeSql(String dbPrefix, String sql, .replace("\n", " ") .replace("\r", " ") .replace("\t", " "); - ValidationRequest request = new ValidationRequest(sqlToValidate, userName); + ValidationRequest request = new ValidationRequest(sqlToValidate, userName, interpreterName); try { ValidationResponse response = sendValidationRequest(request); if 
(response.isPreSubmitFail()) { @@ -916,6 +916,9 @@ private InterpreterResult executeSql(String dbPrefix, String sql, finalOutput.append("Use: ").append(jsonObject.getString(table)).append(" in place of ").append(table).append("\n"); } } + }else if (outputMessage.contains("UnAuthorized Query")) { + context.out.write("Query Error: UnAuthorized Query\n"); + finalOutput.append("You are not authorized to execute this query.\n"); } context.getLocalProperties().put(CANCEL_REASON, finalOutput.toString()); cancel(context); diff --git a/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java b/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java index 71d8ad17418..23cf6f67a60 100644 --- a/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java +++ b/jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java @@ -3,14 +3,16 @@ public class ValidationRequest { private String queryText; private String user; + private String interpreterName; - public ValidationRequest(String queryText, String user) { + public ValidationRequest(String queryText, String user, String interpreterName) { this.queryText = queryText; this.user = user; + this.interpreterName = interpreterName; } public String toJson() { - return "{\"query_text\":\"" + queryText + "\",\"user\":\"" + user + "\"}"; + return "{\"query_text\":\"" + queryText + "\",\"user\":\"" + user + "\",\"interpreter_name\":\"" + interpreterName + "\"}"; } } From 11ff9545241d7a6faf1d1df70c6495e8d66e651e Mon Sep 17 00:00:00 2001 From: keshavpandya06 Date: Tue, 23 Sep 2025 15:23:11 +0530 Subject: [PATCH 28/28] Add .cursor directory and update .gitignore for AI documentation --- .../architecture/architecture.md | 335 ++++++++++++ .../architecture/core_server_components.md | 357 ++++++++++++ .../architecture/interpreter_framework.md | 376 +++++++++++++ .../architecture/module_structure.md | 222 ++++++++ .../architecture/storage_and_persistence.md | 305 +++++++++++ .../build_ci_cd_and_documentation.md 
| 341 ++++++++++++ .../deployment_and_operations.md | 374 +++++++++++++ .../kubernetes_support.md | 367 +++++++++++++ .../interpreters/flink_interpreter.md | 316 +++++++++++ .../interpreters/interpreters.md | 475 ++++++++++++++++ .../interpreters/jdbc_interpreter.md | 366 +++++++++++++ .../interpreters/livy_interpreter.md | 314 +++++++++++ .../interpreters/python_interpreter.md | 258 +++++++++ .../remote_interpreter_infrastructure.md | 345 ++++++++++++ .../interpreters/spark_interpreters.md | 355 ++++++++++++ .cursor/documentation/overview/overview.md | 243 +++++++++ .../configuration_management.md | 274 ++++++++++ .../notebook_server_and_apis.md | 506 ++++++++++++++++++ .../process_lifecycle_management.md | 361 +++++++++++++ .../server_components/server_components.md | 416 ++++++++++++++ .../web_interface/frontend_build_system.md | 356 ++++++++++++ .../interpreter_management_ui.md | 248 +++++++++ .../web_interface/navigation_and_core_ui.md | 314 +++++++++++ .../notebook_and_paragraph_ui.md | 322 +++++++++++ .../web_interface/web_interface.md | 363 +++++++++++++ .gitignore | 2 + 26 files changed, 8511 insertions(+) create mode 100644 .cursor/documentation/architecture/architecture.md create mode 100644 .cursor/documentation/architecture/core_server_components.md create mode 100644 .cursor/documentation/architecture/interpreter_framework.md create mode 100644 .cursor/documentation/architecture/module_structure.md create mode 100644 .cursor/documentation/architecture/storage_and_persistence.md create mode 100644 .cursor/documentation/deployment_and_operations/build_ci_cd_and_documentation.md create mode 100644 .cursor/documentation/deployment_and_operations/deployment_and_operations.md create mode 100644 .cursor/documentation/deployment_and_operations/kubernetes_support.md create mode 100644 .cursor/documentation/interpreters/flink_interpreter.md create mode 100644 .cursor/documentation/interpreters/interpreters.md create mode 100644 
.cursor/documentation/interpreters/jdbc_interpreter.md create mode 100644 .cursor/documentation/interpreters/livy_interpreter.md create mode 100644 .cursor/documentation/interpreters/python_interpreter.md create mode 100644 .cursor/documentation/interpreters/remote_interpreter_infrastructure.md create mode 100644 .cursor/documentation/interpreters/spark_interpreters.md create mode 100644 .cursor/documentation/overview/overview.md create mode 100644 .cursor/documentation/server_components/configuration_management.md create mode 100644 .cursor/documentation/server_components/notebook_server_and_apis.md create mode 100644 .cursor/documentation/server_components/process_lifecycle_management.md create mode 100644 .cursor/documentation/server_components/server_components.md create mode 100644 .cursor/documentation/web_interface/frontend_build_system.md create mode 100644 .cursor/documentation/web_interface/interpreter_management_ui.md create mode 100644 .cursor/documentation/web_interface/navigation_and_core_ui.md create mode 100644 .cursor/documentation/web_interface/notebook_and_paragraph_ui.md create mode 100644 .cursor/documentation/web_interface/web_interface.md diff --git a/.cursor/documentation/architecture/architecture.md b/.cursor/documentation/architecture/architecture.md new file mode 100644 index 00000000000..185bb33dd99 --- /dev/null +++ b/.cursor/documentation/architecture/architecture.md @@ -0,0 +1,335 @@ +# Architecture + +
+Relevant source files + +The following files were used as context for generating this wiki page: + +- [.gitignore](.gitignore) +- [README.md](README.md) +- [conf/zeppelin-site.xml.template](conf/zeppelin-site.xml.template) +- [docs/setup/operation/configuration.md](docs/setup/operation/configuration.md) +- [pom.xml](pom.xml) +- [spark/pom.xml](spark/pom.xml) +- [zeppelin-display/pom.xml](zeppelin-display/pom.xml) +- [zeppelin-distribution/pom.xml](zeppelin-distribution/pom.xml) +- [zeppelin-interpreter/pom.xml](zeppelin-interpreter/pom.xml) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/launcher/InterpreterLauncher.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/launcher/InterpreterLauncher.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/lifecycle/TimeoutLifecycleManager.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/lifecycle/TimeoutLifecycleManager.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/scheduler/Job.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/scheduler/Job.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/scheduler/JobWithProgressPoller.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/scheduler/JobWithProgressPoller.java) +- [zeppelin-interpreter/src/test/java/org/apache/zeppelin/conf/ZeppelinConfigurationTest.java](zeppelin-interpreter/src/test/java/org/apache/zeppelin/conf/ZeppelinConfigurationTest.java) +- [zeppelin-interpreter/src/test/java/org/apache/zeppelin/scheduler/JobTest.java](zeppelin-interpreter/src/test/java/org/apache/zeppelin/scheduler/JobTest.java) +- [zeppelin-server/pom.xml](zeppelin-server/pom.xml) +- 
[zeppelin-server/src/main/java/org/apache/zeppelin/rest/NotebookRestApi.java](zeppelin-server/src/main/java/org/apache/zeppelin/rest/NotebookRestApi.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/server/ZeppelinServer.java](zeppelin-server/src/main/java/org/apache/zeppelin/server/ZeppelinServer.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/service/NotebookService.java](zeppelin-server/src/main/java/org/apache/zeppelin/service/NotebookService.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java](zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/rest/AbstractTestRestApi.java](zeppelin-server/src/test/java/org/apache/zeppelin/rest/AbstractTestRestApi.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/rest/NotebookRestApiTest.java](zeppelin-server/src/test/java/org/apache/zeppelin/rest/NotebookRestApiTest.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/rest/ZeppelinRestApiTest.java](zeppelin-server/src/test/java/org/apache/zeppelin/rest/ZeppelinRestApiTest.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/service/NotebookServiceTest.java](zeppelin-server/src/test/java/org/apache/zeppelin/service/NotebookServiceTest.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/socket/NotebookServerTest.java](zeppelin-server/src/test/java/org/apache/zeppelin/socket/NotebookServerTest.java) +- [zeppelin-web/pom.xml](zeppelin-web/pom.xml) +- [zeppelin-zengine/pom.xml](zeppelin-zengine/pom.xml) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/recovery/RecoveryUtils.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/recovery/RecoveryUtils.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreter.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreter.java) +- 
[zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Note.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Note.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Notebook.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Notebook.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Paragraph.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Paragraph.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/conf/ZeppelinConfigurationTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/conf/ZeppelinConfigurationTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/helium/HeliumApplicationFactoryTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/helium/HeliumApplicationFactoryTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/launcher/StandardInterpreterLauncherTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/launcher/StandardInterpreterLauncherTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/lifecycle/TimeoutLifecycleManagerTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/lifecycle/TimeoutLifecycleManagerTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/NoteTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/NoteTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/NotebookTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/NotebookTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/ParagraphTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/ParagraphTest.java) + +
+ + + +This document describes the high-level architecture of Apache Zeppelin, including the relationship between major components, module structure, and system design patterns. For detailed information about specific interpreter implementations, see [Interpreters](#5). For deployment and operational concerns, see [Deployment and Operations](#6). + +## Overview + +Apache Zeppelin is a web-based notebook platform that enables interactive data analytics. The system follows a multi-tier architecture with a clear separation between the frontend web interface, backend services, and pluggable interpreter framework for executing code in various languages. + +```mermaid +graph TB + subgraph "Frontend Layer" + WebUI["zeppelin-web
Angular.js Application"] + end + + subgraph "Communication Layer" + WebSocket["NotebookServer
WebSocket Endpoint"] + RestAPI["REST APIs
Jersey Framework"] + end + + subgraph "Core Engine" + Notebook["Notebook
High-level API"] + NoteManager["NoteManager
Note Operations"] + ZEngine["zeppelin-zengine
Execution Engine"] + end + + subgraph "Interpreter System" + InterpFactory["InterpreterFactory"] + InterpManager["InterpreterSettingManager"] + RemoteInterp["Remote Interpreters
Thrift RPC"] + end + + subgraph "Storage Layer" + NotebookRepo["NotebookRepo
Pluggable Storage"] + Storage["Git, S3, Azure, Local FS"] + end + + WebUI --> WebSocket + WebUI --> RestAPI + WebSocket --> Notebook + RestAPI --> Notebook + Notebook --> NoteManager + Notebook --> InterpFactory + InterpFactory --> InterpManager + InterpManager --> RemoteInterp + Notebook --> NotebookRepo + NotebookRepo --> Storage +``` + +Sources: [pom.xml:54-103](), [zeppelin-server/src/main/java/org/apache/zeppelin/server/ZeppelinServer.java:166-224](), [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java:112-123]() + +## Maven Module Structure + +Zeppelin is organized as a multi-module Maven project with clear separation of concerns: + +| Module | Purpose | Key Dependencies | +|--------|---------|------------------| +| `zeppelin-server` | Web server and REST APIs | zeppelin-zengine, Jersey, Jetty | +| `zeppelin-web` | Frontend Angular.js application | Node.js build tools | +| `zeppelin-zengine` | Core notebook execution engine | zeppelin-interpreter, zeppelin-common | +| `zeppelin-interpreter` | Interpreter framework and base classes | Apache Thrift | +| `zeppelin-common` | Shared utilities and configurations | Commons libraries | +| `spark/*` | Spark interpreter family | Multiple Scala versions | +| `python` | Python interpreter | IPython, Conda support | +| `jdbc` | JDBC interpreter | Database drivers | + +```mermaid +graph TD + Root["zeppelin
(root pom)"] + + subgraph "Core Modules" + Common["zeppelin-common"] + Interpreter["zeppelin-interpreter"] + ZEngine["zeppelin-zengine"] + Server["zeppelin-server"] + Web["zeppelin-web"] + end + + subgraph "Language Interpreters" + SparkParent["spark-parent"] + Python["python"] + JDBC["jdbc"] + Flink["flink"] + end + + subgraph "Distribution" + Distribution["zeppelin-distribution"] + end + + Root --> Common + Root --> Interpreter + Root --> ZEngine + Root --> Server + Root --> Web + Root --> SparkParent + Root --> Python + Root --> JDBC + Root --> Flink + Root --> Distribution + + ZEngine --> Interpreter + ZEngine --> Common + Server --> ZEngine + SparkParent --> Interpreter + Python --> Interpreter + JDBC --> Interpreter + Distribution --> Server +``` + +Sources: [pom.xml:54-103](), [zeppelin-server/pom.xml:50-95](), [zeppelin-zengine/pom.xml:45-72](), [spark/pom.xml:58-67]() + +## Core Server Components + +The `zeppelin-server` module provides the main web server functionality using embedded Jetty with dependency injection via HK2. + +### Web Server Architecture + +```mermaid +graph TB + subgraph "Jetty Server" + JettyServer["Server
Jetty 9.4"] + WebAppContext["WebAppContext<br/>Servlet Container"] + SessionHandler["SessionHandler"] + end + + subgraph "REST Layer" + NotebookRestApi["NotebookRestApi<br/>/notebook/*"] + InterpreterRestApi["InterpreterRestApi<br/>/interpreter/*"] + ConfigRestApi["ConfigurationRestApi<br/>/config/*"] + Jersey["Jersey JAX-RS"] + end + + subgraph "WebSocket Layer" + NotebookServer["NotebookServer
/ws"] + ConnectionManager["ConnectionManager"] + end + + subgraph "Service Layer" + NotebookService["NotebookService"] + ConfigurationService["ConfigurationService"] + AuthenticationService["AuthenticationService"] + end + + JettyServer --> WebAppContext + WebAppContext --> Jersey + WebAppContext --> NotebookServer + Jersey --> NotebookRestApi + Jersey --> InterpreterRestApi + Jersey --> ConfigRestApi + NotebookRestApi --> NotebookService + NotebookServer --> ConnectionManager + NotebookServer --> NotebookService +``` + +Sources: [zeppelin-server/src/main/java/org/apache/zeppelin/server/ZeppelinServer.java:148-280](), [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java:117-118](), [zeppelin-server/src/main/java/org/apache/zeppelin/rest/NotebookRestApi.java:75-113]() + +### Dependency Injection Configuration + +The server uses HK2 for dependency injection with a shared `ServiceLocator`: + +```mermaid +graph LR + ServiceLocator["ServiceLocatorFactory
shared-locator"] + + subgraph "Core Services" + NotebookRepo["NotebookRepoSync"] + InterpreterFactory["InterpreterFactory"] + Notebook["Notebook"] + NotebookService["NotebookService"] + end + + subgraph "Supporting Services" + AuthzService["AuthorizationService"] + AuthnService["AuthenticationService"] + ConfigService["ConfigurationService"] + end + + ServiceLocator --> NotebookRepo + ServiceLocator --> InterpreterFactory + ServiceLocator --> Notebook + ServiceLocator --> NotebookService + ServiceLocator --> AuthzService + ServiceLocator --> AuthnService + ServiceLocator --> ConfigService +``` + +Sources: [zeppelin-server/src/main/java/org/apache/zeppelin/server/ZeppelinServer.java:166-224]() + +## Data Model and Core Entities + +The core data model revolves around `Notebook`, `Note`, and `Paragraph` entities: + +### Core Entity Relationships + +```mermaid +graph TB + subgraph "Notebook Management" + Notebook["Notebook
High-level operations"] + NoteManager["NoteManager<br/>CRUD operations"] + AuthorizationService["AuthorizationService<br/>Access control"] + end + + subgraph "Note Structure" + Note["Note<br/>Container for paragraphs"] + Paragraph["Paragraph<br/>Execution unit"] + ParagraphJobListener["ParagraphJobListener<br/>Execution callbacks"] + end + + subgraph "Execution Context" + InterpreterFactory["InterpreterFactory<br/>Interpreter selection"] + InterpreterSetting["InterpreterSetting<br/>Configuration"] + RemoteInterpreter["RemoteInterpreter
Process communication"] + end + + Notebook --> NoteManager + Notebook --> AuthorizationService + NoteManager --> Note + Note --> Paragraph + Paragraph --> ParagraphJobListener + Paragraph --> InterpreterFactory + InterpreterFactory --> InterpreterSetting + InterpreterSetting --> RemoteInterpreter +``` + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Notebook.java:73-114](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Note.java:70-185](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Paragraph.java:67-116]() + +## Communication Architecture + +Zeppelin supports both synchronous REST API calls and real-time WebSocket communication: + +### WebSocket Message Flow + +```mermaid +sequenceDiagram + participant Frontend as "Angular Frontend" + participant NotebookServer as "NotebookServer" + participant NotebookService as "NotebookService" + participant Paragraph as "Paragraph" + participant RemoteInterpreter as "RemoteInterpreter" + + Frontend->>NotebookServer: RUN_PARAGRAPH (WebSocket) + NotebookServer->>NotebookService: runParagraph() + NotebookService->>Paragraph: execute() + Paragraph->>RemoteInterpreter: interpret() + RemoteInterpreter-->>Paragraph: InterpreterResult + Paragraph-->>NotebookService: Job completion + NotebookService-->>NotebookServer: Broadcast result + NotebookServer-->>Frontend: PARAGRAPH update (WebSocket) +``` + +Sources: [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java:274-500](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Paragraph.java:322-373]() + +### Configuration System + +Zeppelin uses a hierarchical configuration system with multiple sources: + +```mermaid +graph TB + subgraph "Configuration Sources (Priority Order)" + EnvVars["Environment Variables
Highest Priority"] + SysProps["System Properties"] + ZeppelinSite["zeppelin-site.xml<br/>Lowest Priority"] + end + + subgraph "Configuration Classes" + ZeppelinConfiguration["ZeppelinConfiguration<br/>Central config access"] + ConfVars["ConfVars enum<br/>Property definitions"] + end + + subgraph "Configuration Categories" + ServerConfig["Server Configuration<br/>Ports, SSL, Context"] + NotebookConfig["Notebook Configuration<br/>Storage, Permissions"] + InterpreterConfig["Interpreter Configuration
Timeouts, Limits"] + end + + EnvVars --> ZeppelinConfiguration + SysProps --> ZeppelinConfiguration + ZeppelinSite --> ZeppelinConfiguration + ZeppelinConfiguration --> ConfVars + ZeppelinConfiguration --> ServerConfig + ZeppelinConfiguration --> NotebookConfig + ZeppelinConfiguration --> InterpreterConfig +``` + +Sources: [zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java:62-148](), [conf/zeppelin-site.xml.template:20-700]() diff --git a/.cursor/documentation/architecture/core_server_components.md b/.cursor/documentation/architecture/core_server_components.md new file mode 100644 index 00000000000..a183af3ab91 --- /dev/null +++ b/.cursor/documentation/architecture/core_server_components.md @@ -0,0 +1,357 @@ +# Core Server Components + +
+Relevant source files + +The following files were used as context for generating this wiki page: + +- [bin/common.cmd](bin/common.cmd) +- [bin/common.sh](bin/common.sh) +- [bin/functions.sh](bin/functions.sh) +- [bin/interpreter.sh](bin/interpreter.sh) +- [bin/zeppelin-daemon.sh](bin/zeppelin-daemon.sh) +- [bin/zeppelin.sh](bin/zeppelin.sh) +- [conf/zeppelin-env.cmd.template](conf/zeppelin-env.cmd.template) +- [conf/zeppelin-env.sh.template](conf/zeppelin-env.sh.template) +- [conf/zeppelin-site.xml.template](conf/zeppelin-site.xml.template) +- [docs/setup/operation/configuration.md](docs/setup/operation/configuration.md) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/launcher/InterpreterLauncher.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/launcher/InterpreterLauncher.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/lifecycle/TimeoutLifecycleManager.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/lifecycle/TimeoutLifecycleManager.java) +- [zeppelin-interpreter/src/test/java/org/apache/zeppelin/conf/ZeppelinConfigurationTest.java](zeppelin-interpreter/src/test/java/org/apache/zeppelin/conf/ZeppelinConfigurationTest.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/server/ImmediateErrorHandlerImpl.java](zeppelin-server/src/main/java/org/apache/zeppelin/server/ImmediateErrorHandlerImpl.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/server/ZeppelinServer.java](zeppelin-server/src/main/java/org/apache/zeppelin/server/ZeppelinServer.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/socket/SessionConfigurator.java](zeppelin-server/src/main/java/org/apache/zeppelin/socket/SessionConfigurator.java) +- 
[zeppelin-server/src/main/java/org/apache/zeppelin/utils/TestUtils.java](zeppelin-server/src/main/java/org/apache/zeppelin/utils/TestUtils.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/recovery/RecoveryTest.java](zeppelin-server/src/test/java/org/apache/zeppelin/recovery/RecoveryTest.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/recovery/RecoveryUtils.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/recovery/RecoveryUtils.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreter.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreter.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/conf/ZeppelinConfigurationTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/conf/ZeppelinConfigurationTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/launcher/StandardInterpreterLauncherTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/launcher/StandardInterpreterLauncherTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/lifecycle/TimeoutLifecycleManagerTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/lifecycle/TimeoutLifecycleManagerTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterTest.java) + +
+ + + +This document explains the core server infrastructure of Apache Zeppelin, including the Jetty web server configuration, HK2 dependency injection framework, configuration management, and core service bindings that form the foundation of the Zeppelin server architecture. + +For information about the interpreter framework and execution, see [Interpreter Framework](#2.3). For details about storage and persistence mechanisms, see [Storage and Persistence](#2.4). + +## Server Bootstrap Process + +The Zeppelin server bootstrap begins with the `ZeppelinServer.main()` method, which orchestrates the initialization of all core components in a specific sequence. + +```mermaid +flowchart TD + Start["main()"] --> LoadConfig["ZeppelinConfiguration.create()"] + LoadConfig --> SetupJetty["setupJettyServer()"] + SetupJetty --> InitMetrics["initMetrics()"] + InitMetrics --> CreateServiceLocator["ServiceLocatorFactory.getInstance().create()"] + CreateServiceLocator --> BindServices["ServiceLocatorUtilities.bind()"] + BindServices --> SetupWebApps["setupWebAppContext()"] + SetupWebApps --> InitWebApps["initWebApp()"] + InitWebApps --> SetupCluster["setupClusterManagerServer()"] + SetupCluster --> StartServer["jettyWebServer.start()"] + StartServer --> RecoveryInit["notebook.recoveryIfNecessary()"] + RecoveryInit --> Running["Server Running"] +``` + +**Server Bootstrap Flow** + +The bootstrap process loads configuration from multiple sources with the following priority order: environment variables, system properties, and finally the `zeppelin-site.xml` configuration file. The server then sets up the Jetty web server with appropriate connectors, initializes the HK2 dependency injection framework, and binds all core services before starting the web server. 
+ +Sources: [zeppelin-server/src/main/java/org/apache/zeppelin/server/ZeppelinServer.java:148-310]() + +## Jetty Web Server Architecture + +Zeppelin uses an embedded Jetty server as its web container, configured with customizable thread pools, SSL support, and multiple web application contexts. + +```mermaid +graph TB + subgraph "Jetty Server" + ThreadPool["InstrumentedQueuedThreadPool"] + ServerConnector["ServerConnector"] + TimedHandler["TimedHandler"] + ContextCollection["ContextHandlerCollection"] + end + + subgraph "Web Application Contexts" + DefaultWebApp["Default WebApp Context
zeppelin-web"] + NextWebApp["Next WebApp Context<br/>zeppelin-web-angular"] + RestAPI["REST API Servlet<br/>'/api/*'"] + WebSocket["WebSocket Endpoint<br/>NotebookServer"] + Prometheus["Prometheus Metrics<br/>'/metrics'"] + HealthCheck["Health Check
'/health/*'"] + end + + subgraph "SSL Configuration" + SslContextFactory["SslContextFactory.Server"] + KeyStore["KeyStore/TrustStore"] + PEMSupport["PEM Certificate Support"] + end + + ThreadPool --> ServerConnector + ServerConnector --> TimedHandler + TimedHandler --> ContextCollection + ContextCollection --> DefaultWebApp + ContextCollection --> NextWebApp + DefaultWebApp --> RestAPI + DefaultWebApp --> WebSocket + DefaultWebApp --> Prometheus + DefaultWebApp --> HealthCheck + ServerConnector --> SslContextFactory + SslContextFactory --> KeyStore + SslContextFactory --> PEMSupport +``` + +**Jetty Server Component Architecture** + +The server supports both HTTP and HTTPS configurations, with SSL certificates configurable via either Java KeyStore/TrustStore files or PEM certificate files. Thread pool sizing is configurable through properties like `zeppelin.server.jetty.thread.pool.max`. + +Sources: [zeppelin-server/src/main/java/org/apache/zeppelin/server/ZeppelinServer.java:343-391](), [zeppelin-server/src/main/java/org/apache/zeppelin/server/ZeppelinServer.java:476-546]() + +## HK2 Dependency Injection Framework + +Zeppelin uses the HK2 dependency injection framework to manage service lifecycles and dependencies. A shared `ServiceLocator` instance manages all core services as singletons. + +```mermaid +graph TB + ServiceLocatorFactory["ServiceLocatorFactory.getInstance()"] + SharedServiceLocator["ServiceLocator
SERVICE_LOCATOR_NAME"] + + subgraph "Core Service Bindings" + ZeppelinConfig["ZeppelinConfiguration"] + InterpreterFactory["InterpreterFactory"] + InterpreterSettingManager["InterpreterSettingManager"] + NotebookRepoSync["NotebookRepoSync"] + Notebook["Notebook"] + NoteManager["NoteManager"] + NotebookService["NotebookService"] + NotebookServer["NotebookServer"] + AuthService["AuthenticationService"] + AuthzService["AuthorizationService"] + ConnectionManager["ConnectionManager"] + SearchService["SearchService"] + SchedulerService["SchedulerService"] + Helium["Helium"] + HeliumBundleFactory["HeliumBundleFactory"] + Credentials["Credentials"] + end + + subgraph "Service Interfaces" + AuthInterface["AuthenticationService Interface"] + SearchInterface["SearchService Interface"] + SchedulerInterface["SchedulerService Interface"] + NotebookRepoInterface["NotebookRepo Interface"] + end + + ServiceLocatorFactory --> SharedServiceLocator + SharedServiceLocator --> ZeppelinConfig + SharedServiceLocator --> InterpreterFactory + SharedServiceLocator --> InterpreterSettingManager + SharedServiceLocator --> NotebookRepoSync + SharedServiceLocator --> Notebook + SharedServiceLocator --> NoteManager + SharedServiceLocator --> NotebookService + SharedServiceLocator --> NotebookServer + SharedServiceLocator --> AuthService + SharedServiceLocator --> AuthzService + SharedServiceLocator --> ConnectionManager + SharedServiceLocator --> SearchService + SharedServiceLocator --> SchedulerService + SharedServiceLocator --> Helium + SharedServiceLocator --> HeliumBundleFactory + SharedServiceLocator --> Credentials + + AuthService --> AuthInterface + SearchService --> SearchInterface + SchedulerService --> SchedulerInterface + NotebookRepoSync --> NotebookRepoInterface +``` + +**HK2 Service Dependency Graph** + +The service bindings are configured in the `main()` method using an `AbstractBinder` implementation. 
Services are bound both to their concrete implementation classes and to their interface types, allowing for polymorphic dependency injection. + +| Service | Implementation | Lifecycle | Selection / Notes | +|---------|---------------|-----------|-----------| +| `AuthenticationService` | `ShiroAuthenticationService` or `NoAuthenticationService` | Singleton | Based on Shiro configuration | +| `SearchService` | `LuceneSearch` or `NoSearchService` | Singleton | Based on search enable flag | +| `SchedulerService` | `QuartzSchedulerService` or `NoSchedulerService` | Singleton | Based on cron enable flag | +| `NotebookRepo` | `NotebookRepoSync` | Immediate | Multi-backend support | + +Sources: [zeppelin-server/src/main/java/org/apache/zeppelin/server/ZeppelinServer.java:166-223](), [zeppelin-server/src/main/java/org/apache/zeppelin/socket/SessionConfigurator.java:47-49]() + +## Configuration Management + +The `ZeppelinConfiguration` class implements a hierarchical configuration system that loads settings from multiple sources in priority order. + +```mermaid +graph LR + subgraph "Configuration Sources (Priority Order)" + EnvVars["Environment Variables
ZEPPELIN_*"] + SysProps["System Properties<br/>-D flags"] + ConfigFile["Configuration File<br/>zeppelin-site.xml"] + end + + subgraph "Configuration Loading" + ZeppelinLocationStrategy["ZeppelinLocationStrategy"] + ClasspathLocationStrategy["ClasspathLocationStrategy"] + XMLConfiguration["XMLConfiguration"] + ConfigBuilder["FileBasedConfigurationBuilder"] + end + + subgraph "Configuration Categories" + ServerConfig["Server Configuration<br/>Port, Address, SSL"] + StorageConfig["Storage Configuration<br/>Notebook, Recovery"] + InterpreterConfig["Interpreter Configuration<br/>Directories, Timeouts"] + SecurityConfig["Security Configuration<br/>Authentication, Authorization"] + ClusterConfig["Cluster Configuration
Addresses, Heartbeat"] + end + + EnvVars --> XMLConfiguration + SysProps --> XMLConfiguration + ConfigFile --> ZeppelinLocationStrategy + ZeppelinLocationStrategy --> ConfigBuilder + ClasspathLocationStrategy --> ConfigBuilder + ConfigBuilder --> XMLConfiguration + + XMLConfiguration --> ServerConfig + XMLConfiguration --> StorageConfig + XMLConfiguration --> InterpreterConfig + XMLConfiguration --> SecurityConfig + XMLConfiguration --> ClusterConfig +``` + +**Configuration Hierarchy and Loading Process** + +The configuration system uses Apache Commons Configuration2 to load settings from XML files, with support for both filesystem and classpath-based configuration file discovery. Configuration values are strongly typed with automatic conversion and validation. + +| Configuration Category | Key Properties | Default Values | +|------------------------|---------------|----------------| +| Server | `zeppelin.server.addr`, `zeppelin.server.port` | `127.0.0.1:8080` | +| SSL | `zeppelin.ssl`, `zeppelin.ssl.keystore.path` | Disabled | +| Interpreters | `zeppelin.interpreter.dir`, `zeppelin.interpreter.connect.timeout` | `interpreter`, `600s` | +| Notebook Storage | `zeppelin.notebook.storage`, `zeppelin.notebook.dir` | `GitNotebookRepo`, `notebook` | + +Sources: [zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java:83-118](), [conf/zeppelin-site.xml.template:22-830]() + +## Core Service Implementations + +The server binds multiple service implementations based on configuration flags, providing pluggable architectures for authentication, search, scheduling, and storage. 
+ +```mermaid +classDiagram + class NotebookServer { + +WebSocketServlet + +AngularObjectRegistryListener + +RemoteInterpreterProcessListener + +ApplicationEventListener + +NoteEventListener + } + + class InterpreterSettingManager { + +createInterpreterSetting() + +getInterpreterSettingByName() + +restart() + +close() + } + + class NotebookService { + +createNote() + +runParagraph() + +runAllParagraphs() + +deleteNote() + } + + class AuthenticationService { + <<interface>> + } + + class ShiroAuthenticationService { + +getPrincipal() + +getAssociatedUser() + } + + class NoAuthenticationService { + +getPrincipal() + +getAssociatedUser() + } + + class SearchService { + <<interface>> + } + + class LuceneSearch { + +indexNote() + +deleteNote() + +query() + } + + class NoSearchService { + +indexNote() + +deleteNote() + +query() + } + + AuthenticationService <|-- ShiroAuthenticationService + AuthenticationService <|-- NoAuthenticationService + SearchService <|-- LuceneSearch + SearchService <|-- NoSearchService + + NotebookServer --> InterpreterSettingManager + NotebookServer --> NotebookService + NotebookService --> AuthenticationService +``` + +**Core Service Class Relationships** + +The `NotebookServer` serves as the primary WebSocket endpoint and implements multiple listener interfaces for different types of events. Service selection is determined by configuration flags - for example, authentication services are selected based on whether a Shiro configuration file exists. + +Sources: [zeppelin-server/src/main/java/org/apache/zeppelin/server/ZeppelinServer.java:193-221]() + +## Server Lifecycle Management + +Zeppelin provides shell scripts for managing the server lifecycle, with support for daemon mode operation and graceful shutdown handling. 
+ +```mermaid +stateDiagram-v2 + [*] --> Stopped + + Stopped --> Starting : zeppelin-daemon.sh start + Stopped --> Running : zeppelin.sh + + Starting --> Running : Successful startup + Starting --> Failed : Startup error + + Running --> Stopping : zeppelin-daemon.sh stop + Running --> Stopping : SIGTERM/SIGINT + + Stopping --> Stopped : Graceful shutdown + + Failed --> Stopped : Error cleanup + + Running --> Running : zeppelin-daemon.sh reload + + state Running { + [*] --> ServerStarted + ServerStarted --> NotebookInitialized : notebook.initNotebook() + NotebookInitialized --> RecoveryComplete : notebook.recoveryIfNecessary() + RecoveryComplete --> FullyOperational + } +``` + +**Server Lifecycle State Machine** + +The server supports both foreground execution via `zeppelin.sh` and daemon mode via `zeppelin-daemon.sh`. The daemon script manages PID files, handles process monitoring, and provides graceful shutdown with configurable timeouts. + +| Script | Purpose | Key Features | +|--------|---------|--------------| +| `zeppelin.sh` | Direct execution | Foreground mode, direct console output | +| `zeppelin-daemon.sh` | Daemon management | Background mode, PID management, log redirection | +| `common.sh` | Shared utilities | Environment setup, classpath construction | +| `interpreter.sh` | Interpreter processes | Remote interpreter launching | + +Sources: [bin/zeppelin-daemon.sh:188-235](), [bin/zeppelin.sh:88-143](), [bin/common.sh:25-177]() diff --git a/.cursor/documentation/architecture/interpreter_framework.md b/.cursor/documentation/architecture/interpreter_framework.md new file mode 100644 index 00000000000..23032e18d26 --- /dev/null +++ b/.cursor/documentation/architecture/interpreter_framework.md @@ -0,0 +1,376 @@ +# Interpreter Framework + +
+Relevant source files + +The following files were used as context for generating this wiki page: + +- [docs/assets/themes/zeppelin/img/screenshots/interpreter_setting_with_context_parameters.png](docs/assets/themes/zeppelin/img/screenshots/interpreter_setting_with_context_parameters.png) +- [helium-dev/src/main/java/org/apache/zeppelin/helium/ZeppelinDevServer.java](helium-dev/src/main/java/org/apache/zeppelin/helium/ZeppelinDevServer.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/AbstractInterpreter.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/AbstractInterpreter.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/Interpreter.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/Interpreter.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterGroup.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterGroup.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterOption.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterOption.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterServer.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterServer.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterUtils.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterUtils.java) +- [zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/InterpreterTest.java](zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/InterpreterTest.java) +- [zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/ZeppCtxtVariableTest.java](zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/ZeppCtxtVariableTest.java) +- 
[zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterServerTest.java](zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterServerTest.java) +- [zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterUtilsTest.java](zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterUtilsTest.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/rest/InterpreterRestApi.java](zeppelin-server/src/main/java/org/apache/zeppelin/rest/InterpreterRestApi.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/rest/InterpreterRestApiTest.java](zeppelin-server/src/test/java/org/apache/zeppelin/rest/InterpreterRestApiTest.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterFactory.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterFactory.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSetting.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSetting.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSettingManager.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSettingManager.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/ManagedInterpreterGroup.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/ManagedInterpreterGroup.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/RemoteInterpreterEventServer.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/RemoteInterpreterEventServer.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/InterpreterFactoryTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/InterpreterFactoryTest.java) +- 
[zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/InterpreterSettingManagerTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/InterpreterSettingManagerTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/InterpreterSettingTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/InterpreterSettingTest.java) + +
+ + + +The Interpreter Framework is the core system in Apache Zeppelin that manages interpreter settings, creates interpreter instances, and handles their lifecycle. This framework enables Zeppelin to support multiple programming languages and execution backends through a pluggable architecture where interpreters can run locally or in remote processes. + +For information about specific interpreter implementations like Spark or Python, see [Interpreters](#5). For details about the web interface for managing interpreters, see [Interpreter Management UI](#3.2). + +## Architecture Overview + +The Interpreter Framework consists of several key components that work together to provide interpreter functionality: + +```mermaid +graph TB + subgraph "Zeppelin Server" + ISM["InterpreterSettingManager
Central Registry"] + IF["InterpreterFactory<br/>Instance Provider"] + IS["InterpreterSetting<br/>Configuration & Lifecycle"] + MIG["ManagedInterpreterGroup<br/>Session Management"] + end + + subgraph "Remote Process" + RIS["RemoteInterpreterServer<br/>Thrift RPC Server"] + IG["InterpreterGroup<br/>Session Container"] + INTP["Interpreter<br/>Language Runtime"] + end + + subgraph "Communication" + RIES["RemoteInterpreterEventServer<br/>Event Handling"] + Thrift["Thrift RPC
Protocol"] + end + + ISM --> IS + ISM --> IF + IS --> MIG + MIG --> RIS + RIS --> IG + IG --> INTP + ISM --> RIES + RIES --> Thrift + RIS --> Thrift +``` + +**Sources:** [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSettingManager.java:1-760](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterFactory.java:1-84](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSetting.java:1-1200]() + +## Interpreter Settings Management + +The `InterpreterSettingManager` serves as the central registry for all interpreter configurations and is responsible for loading, creating, updating, and removing interpreter settings. + +### Setting Templates and Instances + +```mermaid +graph LR + subgraph "Template Layer" + Templates["interpreterSettingTemplates
Map"] + JSON["interpreter-setting.json<br/>Configuration Files"] + end + + subgraph "Instance Layer" + Settings["interpreterSettings<br/>Map"] + Storage["interpreter.json
Persistent Storage"] + end + + JSON --> Templates + Templates --> Settings + Settings --> Storage + + subgraph "Operations" + Load["loadFromFile()"] + Create["createNewSetting()"] + Update["setPropertyAndRestart()"] + Remove["remove()"] + end + + Load --> Settings + Create --> Settings + Update --> Settings + Remove --> Settings +``` + +The system maintains two levels of interpreter configurations: + +- **Templates** (`interpreterSettingTemplates`): Default configurations loaded from `interpreter-setting.json` files in interpreter directories +- **Instances** (`interpreterSettings`): User-customized configurations persisted in `interpreter.json` + +**Sources:** [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSettingManager.java:122-133](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSettingManager.java:239-321]() + +### Configuration Loading Process + +The `InterpreterSettingManager` follows this initialization sequence: + +1. **Load Templates**: Scan interpreter directories for `interpreter-setting.json` files via `loadInterpreterSettingFromDefaultDir()` +2. **Load Instances**: Read user configurations from `interpreter.json` via `loadFromFile()` +3. **Merge Configurations**: Combine template properties with user customizations +4. **Initialize Dependencies**: Set up dependency resolvers and repositories + +**Sources:** [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSettingManager.java:414-443](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSettingManager.java:499-542]() + +## Interpreter Lifecycle Management + +Each `InterpreterSetting` manages the complete lifecycle of interpreter instances, from creation to destruction. 
+ +### Interpreter Creation Flow + +```mermaid +sequenceDiagram + participant Client + participant IS as "InterpreterSetting" + participant MIG as "ManagedInterpreterGroup" + participant RIP as "RemoteInterpreterProcess" + participant RIS as "RemoteInterpreterServer" + + Client->>IS: getDefaultInterpreter(user, noteId) + IS->>IS: getOrCreateInterpreterGroup(executionContext) + IS->>MIG: getOrCreateSession(user, sessionId) + MIG->>MIG: getOrCreateInterpreterProcess(userName, properties) + MIG->>RIP: createInterpreterProcess() + RIP->>RIP: start() + RIP->>RIS: Remote Process Started + MIG->>IS: createInterpreters(user, groupId, sessionId) + IS->>Client: Return Interpreter List +``` + +The creation process involves: + +1. **Execution Context**: Determine interpreter group ID based on isolation settings +2. **Session Management**: Create or reuse sessions based on `InterpreterOption` configuration +3. **Process Management**: Launch remote interpreter processes when needed +4. **Interpreter Instantiation**: Create actual interpreter instances in the remote process + +**Sources:** [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSetting.java:448-462](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSetting.java:820-846](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/ManagedInterpreterGroup.java:60-75]() + +### Isolation Modes + +The framework supports multiple isolation modes controlled by `InterpreterOption`: + +| Mode | Scope | Behavior | +|------|-------|----------| +| `SHARED` | Global | Single process/session shared across all users and notes | +| `SCOPED` | Per User/Note | Separate sessions within shared process | +| `ISOLATED` | Per User/Note | Separate processes for each user/note | + +**Sources:** [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterOption.java:30-32](), 
[zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSetting.java:400-442]() + +## Remote Interpreter Infrastructure + +Zeppelin runs interpreters in separate processes to provide isolation and fault tolerance. The remote interpreter infrastructure handles process management and communication. + +### Process Architecture + +```mermaid +graph TB + subgraph "Zeppelin Server Process" + ISM["InterpreterSettingManager"] + RIES["RemoteInterpreterEventServer
<br/>Port: configurable"] + MIG["ManagedInterpreterGroup"] + RIP["RemoteInterpreterProcess<br/>Process Wrapper"] + end + + subgraph "Remote Interpreter Process" + RIS["RemoteInterpreterServer<br/>Port: dynamic"] + IG["InterpreterGroup"] + Interpreters["Interpreter Instances<br/>
(Spark, Python, etc)"] + end + + subgraph "Communication" + Thrift["Thrift RPC"] + Registration["Process Registration"] + Events["Event Streaming"] + end + + ISM --> RIES + MIG --> RIP + RIP --> RIS + RIS --> IG + IG --> Interpreters + + RIS -.->|registerInterpreterProcess| RIES + RIES -.->|onInterpreterOutputAppend| RIS + RIP -.->|Thrift Calls| RIS +``` + +### Process Registration + +When a remote interpreter process starts, it follows this registration sequence: + +1. **Server Startup**: `RemoteInterpreterServer` starts and binds to available port +2. **Registration**: Calls `registerInterpreterProcess()` on `RemoteInterpreterEventServer` +3. **Process Tracking**: Server updates `RemoteInterpreterProcess` with connection details +4. **Event Setup**: Establishes bidirectional communication for events and output + +**Sources:** [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterServer.java:616-658](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/RemoteInterpreterEventServer.java:167-186]() + +### Launcher Plugins + +The framework supports multiple launcher implementations for different deployment scenarios: + +| Launcher | Use Case | Configuration | +|----------|----------|---------------| +| `StandardInterpreterLauncher` | Local processes | Default | +| `SparkInterpreterLauncher` | Spark-specific setup | Spark interpreter | +| `K8sStandardInterpreterLauncher` | Kubernetes pods | `RUN_MODE.K8S` | +| `YarnInterpreterLauncher` | YARN containers | `zeppelin.interpreter.launcher=yarn` | +| `DockerInterpreterLauncher` | Docker containers | `RUN_MODE.DOCKER` | + +**Sources:** [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSetting.java:763-791]() + +## Session and Group Management + +The framework organizes interpreters into groups and sessions to manage sharing and isolation. 
+ +### Hierarchy Structure + +```mermaid +graph TB + subgraph "InterpreterSetting Level" + IS["InterpreterSetting
<br/>spark, python, etc"] + end + + subgraph "Group Level" + IG1["ManagedInterpreterGroup<br/>spark-user1-note1"] + IG2["ManagedInterpreterGroup<br/>spark-user2-note2"] + end + + subgraph "Session Level" + S1["Session: user1"] + S2["Session: user2"] + S3["Session: shared_session"] + end + + subgraph "Interpreter Level" + I1["RemoteInterpreter<br/>SparkInterpreter"] + I2["RemoteInterpreter<br/>PySparkInterpreter"] + I3["RemoteInterpreter<br/>
SparkSQLInterpreter"] + end + + IS --> IG1 + IS --> IG2 + IG1 --> S1 + IG2 --> S2 + IG2 --> S3 + S1 --> I1 + S1 --> I2 + S2 --> I3 +``` + +### Session ID Generation + +Session IDs are generated based on the `InterpreterOption` configuration: + +- **Shared**: `"shared_session"` +- **Per User**: `user` +- **Per Note**: `noteId` +- **Per User & Note**: `user + ":" + noteId` +- **Existing Process**: `"existing_process"` + +**Sources:** [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSetting.java:428-442]() + +### Group ID Generation + +Interpreter group IDs encode the isolation scope: + +``` +{settingId}-{isolationKeys} +``` + +Where isolation keys depend on the interpreter option: +- **Shared**: `"shared_process"` +- **Isolated**: `{user}` and/or `{noteId}` +- **Existing**: `"existing_process"` + +**Sources:** [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSetting.java:400-425]() + +## Factory Pattern Implementation + +The `InterpreterFactory` provides a clean interface for creating interpreter instances without exposing the complexity of the underlying management system. + +### Resolution Logic + +```mermaid +flowchart TD + Start["getInterpreter(replName, executionContext)"] + Check{{"replName blank?"}} + Default["Get default interpreter
<br/>from defaultInterpreterGroup"] + Split["Split replName by '.'"] + + TwoParts{{"2 parts?"}} + Group["Extract group"] + Name["Extract name"] + Setting1["getByName(group)"] + GetInterp1["setting.getInterpreter(context, name)"] + + OnePart{{"1 part?"}} + AsName["Try as interpreter name<br/>
in default group"] + Found1{{"Found?"}} + AsGroup["Try as group name"] + GetDefault["getDefaultInterpreter()"] + + NotFound["Throw InterpreterNotFoundException"] + + Start --> Check + Check -->|Yes| Default + Check -->|No| Split + Split --> TwoParts + TwoParts -->|Yes| Group + Group --> Name + Name --> Setting1 + Setting1 --> GetInterp1 + + TwoParts -->|No| OnePart + OnePart -->|Yes| AsName + AsName --> Found1 + Found1 -->|Yes| GetInterp1 + Found1 -->|No| AsGroup + AsGroup --> GetDefault + + OnePart -->|No| NotFound +``` + +**Sources:** [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterFactory.java:38-83]() + +## Error Handling and Recovery + +The framework includes comprehensive error handling and recovery mechanisms: + +### Status Management + +Each `InterpreterSetting` maintains a status: + +- `READY`: Normal operational state +- `DOWNLOADING_DEPENDENCIES`: Resolving artifact dependencies +- `ERROR`: Failed state with error reason + +**Sources:** [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSetting.java:728-748]() + +### Recovery Storage + +The framework integrates with recovery storage systems to persist interpreter state and enable recovery after failures: + +```java +// Recovery operations +recoveryStorage.onInterpreterClientStart(process); +recoveryStorage.onInterpreterClientStop(process); +``` + +**Sources:** [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSetting.java:857-858](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/ManagedInterpreterGroup.java:112-115]() diff --git a/.cursor/documentation/architecture/module_structure.md b/.cursor/documentation/architecture/module_structure.md new file mode 100644 index 00000000000..fb488f66c12 --- /dev/null +++ b/.cursor/documentation/architecture/module_structure.md @@ -0,0 +1,222 @@ +# Module Structure + +
<details> +<summary>Relevant source files</summary> + +The following files were used as context for generating this wiki page: + +- [.gitignore](.gitignore) +- [README.md](README.md) +- [pom.xml](pom.xml) +- [spark/pom.xml](spark/pom.xml) +- [zeppelin-display/pom.xml](zeppelin-display/pom.xml) +- [zeppelin-distribution/pom.xml](zeppelin-distribution/pom.xml) +- [zeppelin-interpreter/pom.xml](zeppelin-interpreter/pom.xml) +- [zeppelin-server/pom.xml](zeppelin-server/pom.xml) +- [zeppelin-web/pom.xml](zeppelin-web/pom.xml) +- [zeppelin-zengine/pom.xml](zeppelin-zengine/pom.xml) + +</details>
+ + + +This document describes the Maven module structure of Apache Zeppelin, including the organization of core components, interpreter modules, and their dependencies. This covers the build system architecture and module relationships that define how Zeppelin is assembled and packaged. + +For information about the runtime architecture and component interactions, see [Core Server Components](#2.2). For details about individual interpreter implementations, see [Interpreters](#5). + +## Module Organization + +Zeppelin is organized as a multi-module Maven project with over 50 modules grouped into several logical categories. The root project defines the overall structure and shared configuration. + +```mermaid +graph TB + subgraph "Core Infrastructure" + Common["zeppelin-common
<br/>Shared utilities"] + Interpreter["zeppelin-interpreter<br/>Base interpreter framework"] + Zengine["zeppelin-zengine<br/>Core engine"] + Display["zeppelin-display<br/>Display APIs"] + Server["zeppelin-server<br/>REST APIs & WebSocket"] + Web["zeppelin-web<br/>Angular frontend"] + Distribution["zeppelin-distribution<br/>Packaging"] + end + + subgraph "Language Interpreters" + Spark["spark/*<br/>Scala interpreters"] + Python["python<br/>Python interpreter"] + Java["java<br/>Java interpreter"] + Kotlin["kotlin<br/>Kotlin interpreter"] + RLang["rlang<br/>R interpreter"] + Groovy["groovy<br/>Groovy interpreter"] + end + + subgraph "Data Platform Interpreters" + JDBC["jdbc<br/>Database connectivity"] + MongoDB["mongodb<br/>MongoDB connector"] + Cassandra["cassandra<br/>Cassandra connector"] + Elasticsearch["elasticsearch<br/>Search engine"] + BigQuery["bigquery<br/>Google BigQuery"] + HBase["hbase<br/>HBase connector"] + end + + subgraph "Stream Processing" + Flink["flink<br/>Stream processing"] + Livy["livy<br/>Remote Spark"] + Beam["beam<br/>Apache Beam"] + FlinkCmd["flink-cmd<br/>Flink commands"] + end + + subgraph "Utility Modules" + Markdown["markdown<br/>Markdown interpreter"] + Shell["shell<br/>Shell interpreter"] + File["file<br/>File system"] + Angular["angular<br/>AngularJS support"] + Jupyter["zeppelin-jupyter<br/>Jupyter integration"] + Plugins["zeppelin-plugins<br/>
Plugin system"] + end +``` + +Sources: [pom.xml:54-103]() + +## Core Module Dependencies + +The core infrastructure modules have a layered dependency structure where higher-level components build upon lower-level foundations. + +```mermaid +graph TD + Common["zeppelin-common"] + Interpreter["zeppelin-interpreter"] + Display["zeppelin-display"] + Jupyter["zeppelin-jupyter"] + Zengine["zeppelin-zengine"] + Server["zeppelin-server"] + Web["zeppelin-web"] + Distribution["zeppelin-distribution"] + + Interpreter --> Common + Display --> Interpreter + Jupyter --> Common + Zengine --> Common + Zengine --> Interpreter + Zengine --> Jupyter + Server --> Zengine + Distribution --> Server + Distribution --> Web +``` + +Sources: [zeppelin-server/pom.xml:54](), [zeppelin-zengine/pom.xml:48-72](), [zeppelin-distribution/pom.xml:77-87](), [zeppelin-display/pom.xml:67-71]() + +## Core Infrastructure Modules + +| Module | Artifact ID | Purpose | Key Dependencies | +|--------|-------------|---------|------------------| +| Common | `zeppelin-common` | Shared utilities and base classes | None (foundation) | +| Interpreter | `zeppelin-interpreter` | Base interpreter framework and API | `zeppelin-common` | +| Display | `zeppelin-display` | Display system APIs for visualizations | `zeppelin-interpreter` | +| Zengine | `zeppelin-zengine` | Core notebook engine and business logic | `zeppelin-common`, `zeppelin-interpreter`, `zeppelin-jupyter` | +| Server | `zeppelin-server` | REST APIs, WebSocket server, web container | `zeppelin-zengine` | +| Web | `zeppelin-web` | Angular.js frontend application | None (frontend only) | +| Distribution | `zeppelin-distribution` | Final packaging and assembly | `zeppelin-server`, `zeppelin-web` | + +The `zeppelin-common` module serves as the foundation, providing shared utilities used across all other components. The `zeppelin-interpreter` module defines the core interpreter framework that all language-specific interpreters extend. 
+ +Sources: [zeppelin-server/pom.xml:50-95](), [zeppelin-zengine/pom.xml:45-72](), [zeppelin-distribution/pom.xml:76-88]() + +## Interpreter Module Hierarchy + +Interpreters are organized under a parent module structure that provides shared configuration and dependencies. + +```mermaid +graph TD + Root["zeppelin
<br/>Root POM"] + InterpParent["zeppelin-interpreter-parent<br/>Shared interpreter config"] + + subgraph "Spark Ecosystem" + SparkParent["spark-parent<br/>Spark configuration"] + SparkInterp["spark/interpreter<br/>Core Spark interpreter"] + SparkScala["spark-scala-parent<br/>Scala version support"] + Scala211["spark/scala-2.11"] + Scala212["spark/scala-2.12"] + Scala213["spark/scala-2.13"] + SparkShims["spark-shims<br/>
Version compatibility"] + Spark2Shims["spark2-shims"] + Spark3Shims["spark3-shims"] + end + + subgraph "Other Interpreters" + Python["python"] + JDBC["jdbc"] + Flink["flink"] + Shell["shell"] + end + + Root --> InterpParent + InterpParent --> SparkParent + InterpParent --> Python + InterpParent --> JDBC + InterpParent --> Flink + InterpParent --> Shell + + SparkParent --> SparkInterp + SparkParent --> SparkScala + SparkParent --> SparkShims + SparkParent --> Spark2Shims + SparkParent --> Spark3Shims + SparkScala --> Scala211 + SparkScala --> Scala212 + SparkScala --> Scala213 +``` + +Sources: [pom.xml:55](), [spark/pom.xml:24-67]() + +## Build Configuration and Profiles + +The build system uses Maven profiles to support different Hadoop versions and deployment configurations. Key properties are defined at the root level and inherited by child modules. + +| Property Category | Key Properties | Purpose | +|-------------------|----------------|---------| +| Language Versions | `java.version`, `scala.version`, `scala.binary.version` | Define language runtime versions | +| Hadoop Support | `hadoop.version`, `hadoop.deps.scope` | Configure Hadoop integration | +| Frontend Build | `node.version`, `npm.version`, `plugin.frontend.version` | Control web build process | +| Library Versions | `jetty.version`, `gson.version`, `libthrift.version` | Manage dependency versions | + +The root POM defines two main Hadoop profiles: +- `hadoop2`: Uses Hadoop 2.7.x (default) +- `hadoop3`: Uses Hadoop 3.2.x + +Sources: [pom.xml:105-216](), [zeppelin-server/pom.xml:482-519]() + +## Shaded and Integration Modules + +Several modules use Maven shading to resolve dependency conflicts and provide integration capabilities: + +```mermaid +graph LR + InterpShaded["zeppelin-interpreter-shaded
<br/>Shaded interpreter deps"] + JupyterShaded["zeppelin-jupyter-interpreter-shaded<br/>Shaded Jupyter deps"] + Client["zeppelin-client<br/>Java client API"] + ClientExamples["zeppelin-client-examples<br/>
Usage examples"] + + Interpreter --> InterpShaded + Jupyter --> JupyterShaded + Client --> ClientExamples +``` + +The shaded modules package dependencies with relocated class names to avoid conflicts when Zeppelin is embedded in other applications or when interpreters have conflicting dependency versions. + +Sources: [pom.xml:57,62,96-97]() + +## Web Application Build + +The `zeppelin-web` module uses the frontend-maven-plugin to build the Angular.js application: + +| Build Phase | Goal | Purpose | +|-------------|------|---------| +| Install | `install-node-and-npm` | Download Node.js and npm | +| Dependencies | `npm install` | Install JavaScript dependencies | +| Build | `npm run build:dist` | Build production assets | +| Test | `npm run karma-test` | Run unit tests | +| Integration Test | `npm run e2e` | Run end-to-end tests | + +The web module produces a WAR file that contains the compiled frontend assets and is included in the final distribution. + +Sources: [zeppelin-web/pom.xml:56-123](), [zeppelin-web/pom.xml:211-235]() diff --git a/.cursor/documentation/architecture/storage_and_persistence.md b/.cursor/documentation/architecture/storage_and_persistence.md new file mode 100644 index 00000000000..218240369b6 --- /dev/null +++ b/.cursor/documentation/architecture/storage_and_persistence.md @@ -0,0 +1,305 @@ +# Storage and Persistence + +
<details> +<summary>Relevant source files</summary> + +The following files were used as context for generating this wiki page: + +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/repo/GitNotebookRepo.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/repo/GitNotebookRepo.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/repo/NotebookRepo.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/repo/NotebookRepo.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/repo/NotebookRepoSync.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/repo/NotebookRepoSync.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/repo/VFSNotebookRepo.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/repo/VFSNotebookRepo.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/repo/GitNotebookRepoTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/repo/GitNotebookRepoTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/repo/NotebookRepoSyncTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/repo/NotebookRepoSyncTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/repo/VFSNotebookRepoTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/repo/VFSNotebookRepoTest.java) + +</details>
+ + + +This document covers Zeppelin's storage and persistence layer, which manages how notebooks are stored, synchronized, and versioned across different storage backends. The system supports multiple storage implementations including local file systems, Git repositories, and cloud storage providers, with built-in synchronization capabilities between multiple backends. + +For information about notebook structure and lifecycle management, see [Core Server Components](#2.2). For interpreter-specific storage and recovery mechanisms, see [Interpreter Framework](#2.3). + +## Storage Architecture Overview + +Zeppelin's storage system is built around a pluggable architecture that supports multiple storage backends through the `NotebookRepo` interface. The system can operate with up to two storage backends simultaneously, with automatic synchronization between them. + +```mermaid +graph TB + subgraph "Storage Layer Architecture" + NotebookService["NotebookService"] + NotebookRepoSync["NotebookRepoSync
<br/>Synchronization Layer"] + + subgraph "Storage Implementations" + VFSRepo["VFSNotebookRepo<br/>File System Storage"] + GitRepo["GitNotebookRepo<br/>Version Control Storage"] + S3Repo["S3NotebookRepo<br/>Cloud Storage"] + AzureRepo["AzureNotebookRepo<br/>Azure Storage"] + MongoRepo["MongoNotebookRepo<br/>Database Storage"] + end + + subgraph "Configuration" + ZeppelinConf["ZeppelinConfiguration<br/>ZEPPELIN_NOTEBOOK_STORAGE"] + PluginManager["PluginManager<br/>
Storage Loading"] + end + end + + NotebookService --> NotebookRepoSync + NotebookRepoSync --> VFSRepo + NotebookRepoSync --> GitRepo + NotebookRepoSync --> S3Repo + NotebookRepoSync --> AzureRepo + NotebookRepoSync --> MongoRepo + + ZeppelinConf --> NotebookRepoSync + PluginManager --> NotebookRepoSync +``` + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/repo/NotebookRepoSync.java:44-103](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/repo/NotebookRepo.java:30-42]() + +## NotebookRepo Interface + +The `NotebookRepo` interface defines the core contract for all storage implementations. It provides standardized methods for notebook lifecycle operations and includes utility methods for file naming conventions. + +| Method | Purpose | Parameters | +|--------|---------|------------| +| `list()` | List all notebooks | `AuthenticationInfo subject` | +| `get()` | Retrieve specific notebook | `noteId, notePath, subject` | +| `save()` | Persist notebook changes | `Note note, subject` | +| `move()` | Relocate notebook or folder | `noteId, notePath, newNotePath, subject` | +| `remove()` | Delete notebook or folder | `noteId, notePath, subject` | + +```mermaid +graph LR + subgraph "File Naming Convention" + NotePath["/my_project/my_note"] + NoteId["2A94M5J1Z"] + FileName["my_project/my_note_2A94M5J1Z.zpln"] + end + + subgraph "Utility Methods" + buildNoteFileName["buildNoteFileName()"] + getNoteId["getNoteId()"] + getNotePath["getNotePath()"] + end + + NotePath --> buildNoteFileName + NoteId --> buildNoteFileName + buildNoteFileName --> FileName + FileName --> getNoteId + FileName --> getNotePath +``` + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/repo/NotebookRepo.java:140-181]() + +## Synchronization Layer + +The `NotebookRepoSync` class manages synchronization between multiple storage backends, supporting both one-way and two-way synchronization modes. 
It operates as the primary interface between the notebook service and underlying storage implementations. + +```mermaid +graph TB + subgraph "NotebookRepoSync Operations" + Primary["Primary Storage
<br/>(repos[0])"] + Secondary["Secondary Storage<br/>
(repos[1])"] + + subgraph "Sync Logic" + Compare["notesCheckDiff()"] + Push["pushNotes()"] + Pull["pullNotes()"] + Delete["deleteNotes()"] + end + + subgraph "Conflict Resolution" + OneWaySync["oneWaySync mode"] + TwoWaySync["Timestamp comparison"] + VersionCheck["lastModificationDate()"] + end + end + + Primary --> Compare + Secondary --> Compare + Compare --> Push + Compare --> Pull + Compare --> Delete + + OneWaySync --> Push + TwoWaySync --> VersionCheck + VersionCheck --> Push + VersionCheck --> Pull +``` + +### Synchronization Process + +The synchronization algorithm compares notebooks between storage backends and determines required actions: + +1. **Note Comparison**: Uses `notesCheckDiff()` to identify differences between storages +2. **Timestamp Analysis**: Compares modification dates using paragraph timestamps +3. **Action Classification**: Categorizes notes into push, pull, or delete operations +4. **Conflict Resolution**: Applies one-way or timestamp-based resolution strategies + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/repo/NotebookRepoSync.java:315-376](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/repo/NotebookRepoSync.java:231-275]() + +## File System Storage + +The `VFSNotebookRepo` implementation provides file-based storage using Apache Commons VFS, supporting local file systems and various remote file system protocols. + +```mermaid +graph TB + subgraph "VFSNotebookRepo Components" + FileSystemManager["fsManager
<br/>FileSystemManager"] + RootFileObject["rootNotebookFileObject<br/>
FileObject"] + + subgraph "File Operations" + ListFolder["listFolder()"] + SaveNote["save()"] + MoveNote["move()"] + RemoveNote["remove()"] + end + + subgraph "VFS Backends" + LocalFS["Local File System"] + FTPFS["FTP"] + SFTPFS["SFTP"] + HTTPFS["HTTP/WebDAV"] + end + end + + FileSystemManager --> RootFileObject + RootFileObject --> ListFolder + RootFileObject --> SaveNote + RootFileObject --> MoveNote + RootFileObject --> RemoveNote + + FileSystemManager --> LocalFS + FileSystemManager --> FTPFS + FileSystemManager --> SFTPFS + FileSystemManager --> HTTPFS +``` + +### File Organization + +Notebooks are stored as JSON files with the `.zpln` extension, organized in a hierarchical folder structure that mirrors the notebook path hierarchy: + +- **Root Directory**: Configured via `ZEPPELIN_NOTEBOOK_DIR` +- **File Naming**: `{notePath}_{noteId}.zpln` (path separator becomes folder structure) +- **Temporary Files**: Uses `.tmp` extension during save operations for atomicity + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/repo/VFSNotebookRepo.java:96-128](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/repo/VFSNotebookRepo.java:142-159]() + +## Version Control Storage + +The `GitNotebookRepo` extends `VFSNotebookRepo` to provide Git-based version control capabilities, including checkpointing, revision history, and rollback functionality. + +```mermaid +graph TB + subgraph "GitNotebookRepo Features" + GitRepo["Git Repository"] + VFSBase["VFSNotebookRepo
Base Functionality"] + + subgraph "Version Control Operations" + Checkpoint["checkpoint()"] + RevisionHistory["revisionHistory()"] + GetRevision["get(noteId, notePath, revId)"] + SetRevision["setNoteRevision()"] + end + + subgraph "Git Operations" + GitAdd["git.add()"] + GitCommit["git.commit()"] + GitCheckout["git.checkout()"] + GitStash["git.stash()"] + GitLog["git.log()"] + end + end + + VFSBase --> GitRepo + Checkpoint --> GitAdd + Checkpoint --> GitCommit + RevisionHistory --> GitLog + GetRevision --> GitStash + GetRevision --> GitCheckout + SetRevision --> GitCheckout +``` + +### Revision Management + +The Git implementation provides sophisticated revision management: + +1. **Checkpointing**: Creates Git commits only when changes are detected via `git.diff()` +2. **Revision Retrieval**: Uses stash/checkout/unstash sequence to access historical versions +3. **History Tracking**: Leverages Git log to provide complete revision history +4. **Safe Rollback**: Implements atomic rollback operations using Git reset + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/repo/GitNotebookRepo.java:123-146](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/repo/GitNotebookRepo.java:155-191]() + +## Configuration and Initialization + +Storage backends are configured through `ZeppelinConfiguration` and loaded dynamically via the `PluginManager`. The system supports flexible configuration of multiple storage classes. + +```mermaid +graph LR + subgraph "Configuration Flow" + ConfVars["ConfVars.ZEPPELIN_NOTEBOOK_STORAGE"] + ClassNames["Storage Class Names
Comma-separated"] + PluginManager["PluginManager.loadNotebookRepo()"] + + subgraph "Default Storage" + DefaultStorage["org.apache.zeppelin.notebook.repo.GitNotebookRepo"] + end + + subgraph "Initialization" + InitRepo["repo.init(conf)"] + AddToList["repos.add(repo)"] + end + end + + ConfVars --> ClassNames + ClassNames --> PluginManager + PluginManager --> InitRepo + InitRepo --> AddToList + DefaultStorage --> PluginManager +``` + +### Configuration Properties + +| Property | Purpose | Default Value | +|----------|---------|---------------| +| `ZEPPELIN_NOTEBOOK_STORAGE` | Storage implementation classes | `GitNotebookRepo` | +| `ZEPPELIN_NOTEBOOK_ONE_WAY_SYNC` | Enable one-way synchronization | `false` | +| `ZEPPELIN_NOTEBOOK_DIR` | Root storage directory | `notebook/` | + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/repo/NotebookRepoSync.java:66-103](), [zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/repo/NotebookRepoSyncTest.java:82-87]() + +## Persistence Data Flow + +The following diagram illustrates how notebook data flows through the persistence layer during typical operations: + +```mermaid +sequenceDiagram + participant NS as "NotebookService" + participant NRS as "NotebookRepoSync" + participant PR as "Primary Repo" + participant SR as "Secondary Repo" + + Note over NS,SR: Save Operation + NS->>NRS: save(note, subject) + NRS->>PR: save(note, subject) + NRS->>SR: save(note, subject) + + Note over NS,SR: Sync Operation + NS->>NRS: sync(subject) + NRS->>PR: list(subject) + NRS->>SR: list(subject) + NRS->>NRS: notesCheckDiff() + + alt Notes need push + NRS->>PR: get(noteId, notePath, subject) + NRS->>SR: save(note, subject) + else Notes need pull + NRS->>SR: get(noteId, notePath, subject) + NRS->>PR: save(note, subject) + end +``` + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/repo/NotebookRepoSync.java:167-178](), 
[zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/repo/NotebookRepoSync.java:277-279]() diff --git a/.cursor/documentation/deployment_and_operations/build_ci_cd_and_documentation.md b/.cursor/documentation/deployment_and_operations/build_ci_cd_and_documentation.md new file mode 100644 index 00000000000..4866596fab5 --- /dev/null +++ b/.cursor/documentation/deployment_and_operations/build_ci_cd_and_documentation.md @@ -0,0 +1,341 @@ +# Build, CI/CD, and Documentation + +
<details> +<summary>Relevant source files</summary> + +The following files were used as context for generating this wiki page: + +- [.github/workflows/core.yml](.github/workflows/core.yml) +- [.github/workflows/frontend.yml](.github/workflows/frontend.yml) +- [.github/workflows/quick.yml](.github/workflows/quick.yml) +- [.mvn/wrapper/MavenWrapperDownloader.java](.mvn/wrapper/MavenWrapperDownloader.java) +- [.mvn/wrapper/maven-wrapper.jar](.mvn/wrapper/maven-wrapper.jar) +- [.mvn/wrapper/maven-wrapper.properties](.mvn/wrapper/maven-wrapper.properties) +- [Dockerfile](Dockerfile) +- [conf/interpreter-list](conf/interpreter-list) +- [dev/common_release.sh](dev/common_release.sh) +- [dev/create_release.sh](dev/create_release.sh) +- [dev/publish_release.sh](dev/publish_release.sh) +- [docs/_includes/themes/zeppelin/_navigation.html](docs/_includes/themes/zeppelin/_navigation.html) +- [docs/index.md](docs/index.md) +- [docs/setup/basics/how_to_build.md](docs/setup/basics/how_to_build.md) +- [docs/usage/interpreter/installation.md](docs/usage/interpreter/installation.md) +- [livy/README.md](livy/README.md) + +</details>
+ + + +This page covers Zeppelin's build system infrastructure, continuous integration and deployment pipelines, and documentation framework. For information about deploying Zeppelin in production environments, see [Kubernetes Support](#6.1). For details about configuring and operating Zeppelin servers, see section 4. + +## Build System Architecture + +Zeppelin uses a Maven-based multi-module build system with a Maven wrapper for consistent builds across environments. The build system supports multiple profiles for different deployment targets and interpreter configurations. + +### Maven Wrapper Infrastructure + +The build system centers around the Maven wrapper (`mvnw`) which ensures consistent Maven versions across development and CI environments. The wrapper configuration is defined in [.mvn/wrapper/maven-wrapper.properties:18]() and uses Maven 3.8.1. + +```mermaid +graph TB + subgraph "Build Entry Points" + mvnw["./mvnw
<br/>Maven Wrapper"] + mvnwcmd["./mvnw.cmd<br/>Windows Wrapper"] + end + + subgraph "Maven Wrapper Infrastructure" + props[".mvn/wrapper/maven-wrapper.properties<br/>Distribution Config"] + downloader[".mvn/wrapper/MavenWrapperDownloader.java<br/>Bootstrap Logic"] + jar[".mvn/wrapper/maven-wrapper.jar<br/>Wrapper Binary"] + end + + subgraph "Build Profiles" + spark["spark-2.4, spark-3.0, spark-3.1, spark-3.2, spark-3.3<br/>Spark Version Profiles"] + scala["spark-scala-2.11, spark-scala-2.12, spark-scala-2.13<br/>Scala Version Profiles"] + hadoop["hadoop2, hadoop3<br/>Hadoop Version Profiles"] + web["web-angular<br/>Frontend Build Profile"] + dist["build-distr<br/>
Distribution Build Profile"] + end + + mvnw --> props + mvnw --> downloader + downloader --> jar + mvnw --> spark + mvnw --> scala + mvnw --> hadoop + mvnw --> web + mvnw --> dist +``` + +**Sources:** [.mvn/wrapper/maven-wrapper.properties:1-20](), [.mvn/wrapper/MavenWrapperDownloader.java:1-117](), [docs/setup/basics/how_to_build.md:80-175]() + +### Build Command Patterns + +The build system supports various configuration combinations through Maven profiles: + +| Build Type | Command Example | Purpose | +|------------|-----------------|---------| +| Basic Build | `./mvnw clean package -DskipTests` | Standard compilation without tests | +| Distribution | `./mvnw clean package -Pbuild-distr` | Creates complete distribution package | +| Spark-specific | `./mvnw clean package -Pspark-3.2 -Pspark-scala-2.12` | Builds with specific Spark/Scala versions | +| Docker Build | `./mvnw -B package -Pbuild-distr -Pspark-3.2 -Pweb-angular` | Container-ready build | + +**Sources:** [docs/setup/basics/how_to_build.md:58-175](), [Dockerfile:24]() + +## Continuous Integration Pipeline + +Zeppelin employs a comprehensive GitHub Actions-based CI/CD system with multiple workflow files handling different aspects of testing and validation. + +### Primary CI Workflows + +```mermaid +graph TB + subgraph "GitHub Actions Workflows" + core[".github/workflows/core.yml
<br/>Core Module Testing"] + frontend[".github/workflows/frontend.yml<br/>Frontend Testing"] + quick[".github/workflows/quick.yml<br/>License & Validation"] + end + + subgraph "Core Workflow Jobs" + coremodules["core-modules<br/>zeppelin-interpreter, zeppelin-zengine, zeppelin-server"] + interptest["interpreter-test-non-core<br/>beam, hbase, pig, jdbc, etc."] + jupyter["interpreter-test-jupyter-python-rlang<br/>Python 3.7, 3.8 + R"] + integration["zeppelin-integration-test<br/>Full Integration Tests"] + flink["flink-test-and-flink-integration-test<br/>Flink 1.12-1.15"] + spark["spark-integration-test<br/>Spark Integration"] + sparktest["spark-test<br/>Multiple Spark Versions"] + livy["livy-0-5-with-spark-2-2-0<br/>Livy Integration"] + end + + subgraph "Frontend Workflow Jobs" + e2e["run-e2e-tests-in-zeppelin-web<br/>Selenium E2E Tests"] + angular["run-tests-in-zeppelin-web-angular<br/>Angular Unit Tests"] + selenium["test-selenium-with-spark-module<br/>Spark UI Integration"] + end + + subgraph "Quick Workflow Jobs" + license["license-check<br/>Apache RAT License Check"] + validate["maven-validate<br/>
Maven POM Validation"] + end + + core --> coremodules + core --> interptest + core --> jupyter + core --> integration + core --> flink + core --> spark + core --> sparktest + core --> livy + + frontend --> e2e + frontend --> angular + frontend --> selenium + + quick --> license + quick --> validate +``` + +**Sources:** [.github/workflows/core.yml:1-477](), [.github/workflows/frontend.yml:1-132](), [.github/workflows/quick.yml:1-58]() + +### Environment Configuration + +The CI system uses standardized environment variables and caching strategies across all workflows: + +```bash +# Standard Maven Options (from core.yml:13-17) +MAVEN_OPTS: "-Xms1024M -Xmx2048M -XX:MaxMetaspaceSize=1024m + -XX:-UseGCOverheadLimit + -Dhttp.keepAlive=false + -Dmaven.wagon.http.pool=false" + +# Zeppelin-specific Configuration (from core.yml:18-21) +ZEPPELIN_HELIUM_REGISTRY: helium +SPARK_PRINT_LAUNCH_COMMAND: "true" +SPARK_LOCAL_IP: 127.0.0.1 +ZEPPELIN_LOCAL_IP: 127.0.0.1 +``` + +**Sources:** [.github/workflows/core.yml:10-22](), [.github/workflows/frontend.yml:10-22]() + +### Test Matrix Strategy + +The CI system employs matrix builds to test across multiple configurations: + +| Matrix Dimension | Values | Workflow | +|------------------|--------|----------| +| Hadoop Versions | `hadoop2`, `hadoop3` | `core.yml:36` | +| Python Versions | `3.7`, `3.8` | `core.yml:129`, `spark-test:336` | +| Flink Versions | `112`, `113`, `114`, `115` | `core.yml:228` | +| Spark Versions | `2.4`, `3.0`, `3.1`, `3.2`, `3.3` | `spark-test` job | + +**Sources:** [.github/workflows/core.yml:33-39](), [.github/workflows/core.yml:126-130](), [.github/workflows/core.yml:225-229]() + +## Release Management System + +Zeppelin's release process is automated through shell scripts that handle both source and binary artifact creation and publishing. + +### Release Artifact Creation + +```mermaid +graph LR + subgraph "Release Scripts" + create["dev/create_release.sh
Artifact Creation"] + publish["dev/publish_release.sh<br/>Maven Publishing"] + common["dev/common_release.sh<br/>Shared Functions"] + end + + subgraph "Artifact Types" + source["zeppelin-${version}.tgz<br/>Source Package"] + netinst["zeppelin-${version}-bin-netinst.tgz<br/>Minimal Binary"] + all["zeppelin-${version}-bin-all.tgz<br/>Full Binary"] + end + + subgraph "Publishing Targets" + staging["Apache Staging Repository<br/>Nexus Repository"] + snapshot["Apache Snapshot Repository
SNAPSHOT Versions"] + end + + create --> source + create --> netinst + create --> all + publish --> staging + publish --> snapshot + + create -.-> common + publish -.-> common +``` + +**Sources:** [dev/create_release.sh:1-107](), [dev/publish_release.sh:1-183](), [dev/common_release.sh:1-66]() + +### Binary Distribution Profiles + +The release system creates different binary distributions based on included interpreters: + +```bash +# Minimal Distribution (from create_release.sh:100) +make_binary_release netinst "-Pweb-angular -Phadoop-2.6 -pl !beam,!hbase,!pig,!jdbc,!file,!flink,!ignite,!cassandra,!elasticsearch,!bigquery,!alluxio,!scio,!livy,!groovy,!sap,!java,!geode,!neo4j,!hazelcastjet,!submarine,!sparql,!mongodb,!ksql -am" + +# Full Distribution (from create_release.sh:101) +make_binary_release all "-Pweb-angular -Phadoop-2.6" +``` + +**Sources:** [dev/create_release.sh:99-101]() + +## Documentation Infrastructure + +Zeppelin's documentation is built with Jekyll, using a custom theme and a structured navigation system, and is hosted as a static site. + +### Documentation Site Structure + +```mermaid +graph TB + subgraph "Documentation Framework" + jekyll["Jekyll Static Site Generator"] + theme["docs/_includes/themes/zeppelin/<br/>Custom Zeppelin Theme"] + nav["docs/_includes/themes/zeppelin/_navigation.html<br/>Main Navigation Structure"] + end + + subgraph "Content Organization" + index["docs/index.md<br/>Documentation Homepage"] + quickstart["docs/quickstart/<br/>Getting Started Guides"] + usage["docs/usage/<br/>Feature Documentation"] + setup["docs/setup/<br/>Configuration Guides"] + interpreter["docs/interpreter/<br/>Interpreter Documentation"] + development["docs/development/<br/>Developer Guides"] + end + + subgraph "Navigation Categories" + quickmenu["Quick Start<br/>Install, UI, Tutorial"] + usagemenu["Usage<br/>Display System, Interpreters, REST API"] + setupmenu["Setup<br/>Deployment, Security, Storage"] + interpmenu["Interpreter<br/>Language-specific Docs"] + devmenu["More
Extending & Contributing"] + end + + jekyll --> theme + theme --> nav + nav --> quickmenu + nav --> usagemenu + nav --> setupmenu + nav --> interpmenu + nav --> devmenu + + index --> quickstart + index --> usage + index --> setup + index --> interpreter + index --> development +``` + +**Sources:** [docs/_includes/themes/zeppelin/_navigation.html:1-207](), [docs/index.md:1-177]() + +### Navigation System Implementation + +The documentation navigation is implemented as a Bootstrap-based dropdown menu system in [docs/_includes/themes/zeppelin/_navigation.html:22-202](). Key navigation sections include: + +| Section | Lines | Content | +|---------|-------|---------| +| Quick Start | 24-42 | Installation, UI exploration, tutorials | +| Usage | 45-86 | Dynamic forms, display system, interpreters, REST API | +| Setup | 90-126 | Deployment, security, storage, operations | +| Interpreter | 130-170 | Language-specific interpreter documentation | +| More | 173-196 | Development guides and contribution information | + +**Sources:** [docs/_includes/themes/zeppelin/_navigation.html:22-202]() + +### Interpreter Documentation Management + +Interpreter installation and documentation are managed through the `conf/interpreter-list` file, which defines available community-managed interpreters: + +```bash +# Format: [name] [maven artifact] [description] +# Examples from conf/interpreter-list: +jdbc org.apache.zeppelin:zeppelin-jdbc:0.10.0 Jdbc interpreter +python org.apache.zeppelin:zeppelin-python:0.10.0 Python interpreter +spark org.apache.zeppelin:zeppelin-spark:0.10.0 Spark interpreter +``` + +This file drives both the `install-interpreter.sh` script functionality and documentation organization. + +**Sources:** [conf/interpreter-list:1-46](), [docs/usage/interpreter/installation.md:103-235]() + +## Container Build Integration + +Zeppelin includes Docker containerization as part of its build and deployment strategy, with the `Dockerfile` implementing a multi-stage build process. 
+ +### Docker Build Process + +```mermaid +graph TB + subgraph "Multi-stage Docker Build" + builder["FROM openjdk:8 as builder<br/>Build Stage"] + runtime["FROM ubuntu:20.04<br/>Runtime Stage"] + end + + subgraph "Build Stage Steps" + workspace["ADD . /workspace/zeppelin<br/>Copy Source Code"] + maven["./mvnw -B package -DskipTests<br/>Maven Build with Profiles"] + extract["mv zeppelin-distribution/target/zeppelin-*<br/>Extract Distribution"] + cleanup["rm -rf ~/.m2 && rm -rf /workspace/zeppelin/*<br/>Clean Build Artifacts"] + end + + subgraph "Build Configuration" + profiles["-Pbuild-distr -Pspark-3.2<br/>-Pinclude-hadoop -Phadoop3<br/>-Pspark-scala-2.12 -Pweb-angular"] + mavenenvs["MAVEN_OPTS<br/>Memory Configuration"] + npmconfig["unsafe-perm=true
allow_root=true"] + end + + builder --> workspace + workspace --> maven + maven --> extract + extract --> cleanup + cleanup --> runtime + + maven --> profiles + maven --> mavenenvs + maven --> npmconfig +``` + +**Sources:** [Dockerfile:17-34]() + +The Docker build uses the same Maven profiles as the standard build system, ensuring consistency between local development, CI, and containerized deployments. The build produces a minimal runtime image by copying only the final distribution artifacts from the builder stage. diff --git a/.cursor/documentation/deployment_and_operations/deployment_and_operations.md b/.cursor/documentation/deployment_and_operations/deployment_and_operations.md new file mode 100644 index 00000000000..83287ced218 --- /dev/null +++ b/.cursor/documentation/deployment_and_operations/deployment_and_operations.md @@ -0,0 +1,374 @@ +# Deployment and Operations + +
+Relevant source files + +The following files were used as context for generating this wiki page: + +- [.github/workflows/core.yml](.github/workflows/core.yml) +- [.github/workflows/frontend.yml](.github/workflows/frontend.yml) +- [.github/workflows/quick.yml](.github/workflows/quick.yml) +- [.mvn/wrapper/MavenWrapperDownloader.java](.mvn/wrapper/MavenWrapperDownloader.java) +- [.mvn/wrapper/maven-wrapper.jar](.mvn/wrapper/maven-wrapper.jar) +- [.mvn/wrapper/maven-wrapper.properties](.mvn/wrapper/maven-wrapper.properties) +- [Dockerfile](Dockerfile) +- [bin/common.cmd](bin/common.cmd) +- [bin/common.sh](bin/common.sh) +- [bin/functions.sh](bin/functions.sh) +- [bin/interpreter.sh](bin/interpreter.sh) +- [bin/zeppelin-daemon.sh](bin/zeppelin-daemon.sh) +- [bin/zeppelin.sh](bin/zeppelin.sh) +- [conf/interpreter-list](conf/interpreter-list) +- [conf/zeppelin-env.cmd.template](conf/zeppelin-env.cmd.template) +- [conf/zeppelin-env.sh.template](conf/zeppelin-env.sh.template) +- [dev/common_release.sh](dev/common_release.sh) +- [dev/create_release.sh](dev/create_release.sh) +- [dev/publish_release.sh](dev/publish_release.sh) +- [docs/_includes/themes/zeppelin/_navigation.html](docs/_includes/themes/zeppelin/_navigation.html) +- [docs/index.md](docs/index.md) +- [docs/quickstart/kubernetes.md](docs/quickstart/kubernetes.md) +- [docs/setup/basics/how_to_build.md](docs/setup/basics/how_to_build.md) +- [docs/usage/interpreter/installation.md](docs/usage/interpreter/installation.md) +- [k8s/interpreter/100-interpreter-spec.yaml](k8s/interpreter/100-interpreter-spec.yaml) +- [k8s/zeppelin-server.yaml](k8s/zeppelin-server.yaml) +- [livy/README.md](livy/README.md) +- [scripts/docker/zeppelin-interpreter/Dockerfile](scripts/docker/zeppelin-interpreter/Dockerfile) +- [scripts/docker/zeppelin-server/Dockerfile](scripts/docker/zeppelin-server/Dockerfile) +- 
[zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterNotFoundException.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterNotFoundException.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterRunner.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterRunner.java) +- [zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/util/ByteBufferUtilTest.java](zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/util/ByteBufferUtilTest.java) +- [zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sRemoteInterpreterProcess.java](zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sRemoteInterpreterProcess.java) +- [zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sSpecTemplate.java](zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sSpecTemplate.java) +- [zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sStandardInterpreterLauncher.java](zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sStandardInterpreterLauncher.java) +- [zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sUtils.java](zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sUtils.java) +- [zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/PodPhaseWatcher.java](zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/PodPhaseWatcher.java) +- 
[zeppelin-plugins/launcher/k8s-standard/src/test/java/org/apache/zeppelin/interpreter/launcher/K8sRemoteInterpreterProcessTest.java](zeppelin-plugins/launcher/k8s-standard/src/test/java/org/apache/zeppelin/interpreter/launcher/K8sRemoteInterpreterProcessTest.java) +- [zeppelin-plugins/launcher/k8s-standard/src/test/java/org/apache/zeppelin/interpreter/launcher/K8sSpecTemplateTest.java](zeppelin-plugins/launcher/k8s-standard/src/test/java/org/apache/zeppelin/interpreter/launcher/K8sSpecTemplateTest.java) +- [zeppelin-plugins/launcher/k8s-standard/src/test/java/org/apache/zeppelin/interpreter/launcher/K8sStandardInterpreterLauncherTest.java](zeppelin-plugins/launcher/k8s-standard/src/test/java/org/apache/zeppelin/interpreter/launcher/K8sStandardInterpreterLauncherTest.java) +- [zeppelin-plugins/launcher/k8s-standard/src/test/java/org/apache/zeppelin/interpreter/launcher/K8sUtilsTest.java](zeppelin-plugins/launcher/k8s-standard/src/test/java/org/apache/zeppelin/interpreter/launcher/K8sUtilsTest.java) +- [zeppelin-plugins/launcher/k8s-standard/src/test/java/org/apache/zeppelin/interpreter/launcher/PodPhaseWatcherTest.java](zeppelin-plugins/launcher/k8s-standard/src/test/java/org/apache/zeppelin/interpreter/launcher/PodPhaseWatcherTest.java) +- [zeppelin-plugins/launcher/k8s-standard/src/test/resources/k8s-specs/interpreter-spec.yaml](zeppelin-plugins/launcher/k8s-standard/src/test/resources/k8s-specs/interpreter-spec.yaml) +- [zeppelin-plugins/launcher/k8s-standard/src/test/resources/log4j.properties](zeppelin-plugins/launcher/k8s-standard/src/test/resources/log4j.properties) +- [zeppelin-server/src/main/java/org/apache/zeppelin/server/ImmediateErrorHandlerImpl.java](zeppelin-server/src/main/java/org/apache/zeppelin/server/ImmediateErrorHandlerImpl.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/socket/SessionConfigurator.java](zeppelin-server/src/main/java/org/apache/zeppelin/socket/SessionConfigurator.java) +- 
[zeppelin-server/src/main/java/org/apache/zeppelin/utils/TestUtils.java](zeppelin-server/src/main/java/org/apache/zeppelin/utils/TestUtils.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/recovery/RecoveryTest.java](zeppelin-server/src/test/java/org/apache/zeppelin/recovery/RecoveryTest.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/recovery/StopInterpreter.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/recovery/StopInterpreter.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/SleepInterpreter.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/SleepInterpreter.java) + +
+ + + +This document covers the deployment, configuration, and operational aspects of Apache Zeppelin. It describes how to deploy Zeppelin in various environments, manage its lifecycle, configure runtime settings, and handle operational concerns like monitoring and recovery. + +For information about the core server architecture and interpreter framework, see [Server Components](#4) and [Interpreter Framework](#2.3). For details about specific interpreter implementations, see [Interpreters](#5). + +## Deployment Environments + +Apache Zeppelin supports multiple deployment modes, each with specific operational characteristics and management approaches. + +### Standalone Deployment + +Standalone deployment runs Zeppelin as a local process with interpreters launched as separate processes on the same machine or remote hosts. + +```mermaid +graph TB + subgraph "Standalone Deployment" + ZeppelinServer["ZeppelinServer
(org.apache.zeppelin.server.ZeppelinServer)"] + ZeppelinDaemon["zeppelin-daemon.sh<br/>Process Controller"] + + subgraph "Interpreter Processes" + RemoteInterpreterServer1["RemoteInterpreterServer<br/>(Spark)"] + RemoteInterpreterServer2["RemoteInterpreterServer<br/>(Python)"] + RemoteInterpreterServer3["RemoteInterpreterServer<br/>(JDBC)"] + end + + subgraph "Configuration" + ZeppelinEnv["zeppelin-env.sh<br/>Environment Variables"] + ZeppelinSiteXml["zeppelin-site.xml<br/>Server Configuration"] + InterpreterJson["interpreter.json
Interpreter Settings"] + end + end + + ZeppelinDaemon --> ZeppelinServer + ZeppelinServer --> RemoteInterpreterServer1 + ZeppelinServer --> RemoteInterpreterServer2 + ZeppelinServer --> RemoteInterpreterServer3 + ZeppelinEnv --> ZeppelinServer + ZeppelinSiteXml --> ZeppelinServer + InterpreterJson --> ZeppelinServer +``` + +The `zeppelin-daemon.sh` script provides standard daemon operations including start, stop, restart, and status checking. It uses the `ZeppelinServer` class as the main entry point and manages process lifecycle through PID files. + +**Sources:** [bin/zeppelin-daemon.sh:1-281](), [bin/zeppelin.sh:1-143](), [bin/common.sh:1-177]() + +### Kubernetes Deployment + +Kubernetes deployment enables cloud-native operation with automatic scaling, resource management, and fault tolerance. The `K8sStandardInterpreterLauncher` creates interpreter pods dynamically using YAML templates. + +```mermaid +graph TB + subgraph "Kubernetes Cluster" + subgraph "Zeppelin Server Pod" + ZeppelinServerContainer["zeppelin-server
Container"] + NginxGateway["nginx-gateway<br/>Reverse Proxy"] + end + + subgraph "Interpreter Pods" + SparkPod["spark-interpreter<br/>K8sRemoteInterpreterProcess"] + PythonPod["python-interpreter<br/>K8sRemoteInterpreterProcess"] + JdbcPod["jdbc-interpreter<br/>K8sRemoteInterpreterProcess"] + end + + subgraph "Kubernetes Resources" + ConfigMaps["ConfigMaps<br/>(zeppelin-server-conf)"] + Services["Services<br/>(Pod Discovery)"] + Roles["RBAC Roles<br/>(Pod Management)"] + end + + subgraph "Templates" + InterpreterSpec["100-interpreter-spec.yaml<br/>Pod Template"] + K8sSpecTemplate["K8sSpecTemplate
Jinjava Renderer"] + end + end + + ZeppelinServerContainer --> SparkPod + ZeppelinServerContainer --> PythonPod + ZeppelinServerContainer --> JdbcPod + NginxGateway --> ZeppelinServerContainer + NginxGateway --> SparkPod + ConfigMaps --> ZeppelinServerContainer + Services --> SparkPod + Services --> PythonPod + Services --> JdbcPod + K8sSpecTemplate --> InterpreterSpec + InterpreterSpec --> SparkPod +``` + +The `K8sStandardInterpreterLauncher` uses the Fabric8 Kubernetes client to create pods based on templates in the `k8s/interpreter/` directory. Templates are processed using the Jinjava templating engine with interpreter properties and environment variables. + +**Sources:** [zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sStandardInterpreterLauncher.java:1-147](), [k8s/zeppelin-server.yaml:1-217](), [k8s/interpreter/100-interpreter-spec.yaml:1-217]() + +### Docker Deployment + +Docker deployment packages Zeppelin and its dependencies into container images for consistent deployment across environments. + +```mermaid +graph TB + subgraph "Docker Images" + ZeppelinServer["apache/zeppelin-server
Server Image"] + ZeppelinInterpreter["apache/zeppelin-interpreter<br/>Interpreter Image"] + SparkImage["spark:2.4.5<br/>Spark Runtime"] + end + + subgraph "Build Process" + Dockerfile["Dockerfile<br/>Multi-stage Build"] + ZeppelinDistribution["zeppelin-distribution<br/>Build Artifacts"] + MavenBuild["Maven Build<br/>(-Pbuild-distr)"] + end + + subgraph "Runtime Containers" + ServerContainer["Zeppelin Server<br/>Container"] + InterpreterContainer["Interpreter<br/>Container"] + SparkContainer["Spark Executor
Container"] + end + + MavenBuild --> ZeppelinDistribution + ZeppelinDistribution --> Dockerfile + Dockerfile --> ZeppelinServer + Dockerfile --> ZeppelinInterpreter + ZeppelinServer --> ServerContainer + ZeppelinInterpreter --> InterpreterContainer + SparkImage --> SparkContainer +``` + +The build process uses a multi-stage Dockerfile that compiles Zeppelin from source and creates optimized runtime images. The `zeppelin-distribution` build target creates the deployable artifacts. + +**Sources:** [Dockerfile:1-34](), [scripts/docker/zeppelin-server/Dockerfile:1-71](), [scripts/docker/zeppelin-interpreter/Dockerfile:1-89]() + +## Configuration Management + +Zeppelin configuration is managed through multiple layers of configuration files and environment variables that control server behavior, interpreter settings, and runtime environment. + +### Environment Configuration + +The `zeppelin-env.sh` template defines environment variables for JVM settings, cluster integration, and interpreter configuration: + +| Variable | Purpose | Default | +|----------|---------|---------| +| `ZEPPELIN_MEM` | Server JVM memory | `-Xms1024m -Xmx1024m` | +| `ZEPPELIN_INTP_MEM` | Interpreter JVM memory | `-Xms1024m -Xmx1024m` | +| `SPARK_HOME` | Spark installation path | None | +| `HADOOP_CONF_DIR` | Hadoop configuration | None | +| `ZEPPELIN_INTERPRETER_LAUNCHER` | Launcher type | `StandardInterpreterLauncher` | + +The `common.sh` script provides utility functions for classpath construction, Java version checking, and environment setup used by both server and interpreter processes. 
+ +**Sources:** [conf/zeppelin-env.sh.template:1-113](), [bin/common.sh:68-83](), [bin/common.sh:140-172]() + +### Interpreter Installation and Management + +The interpreter installation system allows dynamic installation of community-managed and third-party interpreters: + +```bash +# Install all community interpreters +./bin/install-interpreter.sh --all + +# Install specific interpreters +./bin/install-interpreter.sh --name md,shell,jdbc,python + +# List available interpreters +./bin/install-interpreter.sh --list +``` + +Available interpreters are defined in `conf/interpreter-list` with Maven coordinates and descriptions. The installation process downloads and extracts interpreter JARs to the appropriate directories. + +**Sources:** [conf/interpreter-list:1-46](), [docs/usage/interpreter/installation.md:32-48]() + +## Process Lifecycle Management + +### Daemon Control + +The `zeppelin-daemon.sh` script provides comprehensive process lifecycle management with functions for starting, stopping, and monitoring the Zeppelin server: + +```mermaid +graph LR + subgraph "Daemon Operations" + Start["start()<br/>Initialize and Launch"] + Stop["stop()<br/>Graceful Shutdown"] + Restart["restart()<br/>Stop + Start"] + Status["find_zeppelin_process()<br/>Health Check"] + Upstart["upstart()<br/>Service Mode"] + end + + subgraph "Process Management" + PidFile["PID File<br/>Process Tracking"] + LogFiles["Log Files<br/>Output Capture"] + JavaOpts["JAVA_OPTS<br/>JVM Configuration"] + Classpath["ZEPPELIN_CLASSPATH
Dependency Loading"] + end + + Start --> PidFile + Start --> LogFiles + Start --> JavaOpts + Start --> Classpath + Stop --> PidFile + Status --> PidFile +``` + +The daemon script handles process initialization, environment setup, classpath construction, and graceful shutdown with configurable timeouts. + +**Sources:** [bin/zeppelin-daemon.sh:188-216](), [bin/zeppelin-daemon.sh:218-234](), [bin/zeppelin-daemon.sh:236-251]() + +### Interpreter Process Management + +The `interpreter.sh` script launches remote interpreter processes with environment-specific configuration for different interpreter types (Spark, Flink, HBase, etc.). + +The script performs several key functions: +- Downloads interpreter dependencies via `RemoteInterpreterDownloader` +- Configures interpreter-specific classpaths and environment variables +- Sets up user impersonation for security +- Launches interpreters using `spark-submit` for Spark or direct Java execution for others + +**Sources:** [bin/interpreter.sh:27-34](), [bin/interpreter.sh:104-185](), [bin/interpreter.sh:278-299]() + +## Build and Release Management + +### Build System + +Zeppelin uses a Maven-based build system with multiple profiles for different deployment scenarios: + +| Profile | Purpose | +|---------|---------| +| `-Pbuild-distr` | Creates distribution package | +| `-Pspark-3.2` | Spark 3.2 compatibility | +| `-Phadoop3` | Hadoop 3.x integration | +| `-Pweb-angular` | Angular frontend build | +| `-Pk8s` | Kubernetes support | + +The build process compiles all modules, runs tests, and packages distributable artifacts including Docker images. + +**Sources:** [docs/setup/basics/how_to_build.md:62-67](), [Dockerfile:24-30]() + +### Release Process + +The release creation process is automated through `create_release.sh`, which: +1. Creates source packages with GPG signatures +2. Builds binary distributions for different configurations +3. Generates checksums and digital signatures +4. Packages everything for distribution + +```mermaid +graph TB + subgraph "Release Pipeline" + SourcePackage["make_source_package()<br/>Source Code Archive"] + BinaryRelease["make_binary_release()<br/>Binary Distribution"] + NetinstBuild["netinst Build<br/>Minimal Distribution"] + AllBuild["all Build<br/>Full Distribution"] + end + + subgraph "Artifacts" + TarGz["*.tgz<br/>Archive Files"] + GpgSig["*.asc<br/>GPG Signatures"] + Sha512["*.sha512
Checksums"] + end + + SourcePackage --> TarGz + BinaryRelease --> TarGz + NetinstBuild --> TarGz + AllBuild --> TarGz + TarGz --> GpgSig + TarGz --> Sha512 +``` + +**Sources:** [dev/create_release.sh:47-96](), [dev/publish_release.sh:105-174]() + +## Continuous Integration and Testing + +### CI/CD Pipeline + +GitHub Actions workflows provide comprehensive testing across multiple environments and configurations: + +- **Core Tests**: Tests core modules (interpreter, zengine, server) with different Hadoop versions +- **Interpreter Tests**: Tests individual interpreter implementations +- **Integration Tests**: End-to-end testing with real cluster environments +- **Frontend Tests**: UI and Angular application testing + +The CI system uses conditional matrix builds to test different combinations of Spark versions, Scala versions, and Python versions. + +**Sources:** [.github/workflows/core.yml:28-80](), [.github/workflows/frontend.yml:25-55]() + +### Build Environment + +The CI environment is configured with: +- Java 8 (Temurin distribution) +- Maven with specific memory settings (`MAVEN_OPTS`) +- Conda environments for Python/R testing +- Docker for containerized testing +- Kubernetes (minikube) for K8s integration tests + +**Sources:** [.github/workflows/core.yml:10-22](), [.github/workflows/core.yml:62-78]() + +## Recovery and State Management + +### Recovery Storage + +Zeppelin provides pluggable recovery storage for interpreter state persistence. The `FileSystemRecoveryStorage` implementation stores interpreter metadata and state information to enable recovery after failures. + +The recovery system tracks: +- Interpreter process information +- Session state and variables +- Notebook execution context +- Interpreter group configurations + +Recovery storage is configured via `ZEPPELIN_RECOVERY_STORAGE_CLASS` and `ZEPPELIN_RECOVERY_DIR` environment variables. 
+ +**Sources:** [zeppelin-server/src/test/java/org/apache/zeppelin/recovery/RecoveryTest.java:63-67](), [zeppelin-server/src/test/java/org/apache/zeppelin/recovery/RecoveryTest.java:76-78]() + +### Operational Monitoring + +The daemon script includes health checking functionality through `find_zeppelin_process()` which verifies process status and provides operational feedback. For CI environments, additional monitoring includes log capture and automated health verification. + +**Sources:** [bin/zeppelin-daemon.sh:166-174](), [bin/zeppelin-daemon.sh:144-158]() diff --git a/.cursor/documentation/deployment_and_operations/kubernetes_support.md b/.cursor/documentation/deployment_and_operations/kubernetes_support.md new file mode 100644 index 00000000000..0c9ca27de2a --- /dev/null +++ b/.cursor/documentation/deployment_and_operations/kubernetes_support.md @@ -0,0 +1,367 @@ +# Kubernetes Support + +
+Relevant source files + +The following files were used as context for generating this wiki page: + +- [docs/quickstart/kubernetes.md](docs/quickstart/kubernetes.md) +- [k8s/interpreter/100-interpreter-spec.yaml](k8s/interpreter/100-interpreter-spec.yaml) +- [k8s/zeppelin-server.yaml](k8s/zeppelin-server.yaml) +- [scripts/docker/zeppelin-interpreter/Dockerfile](scripts/docker/zeppelin-interpreter/Dockerfile) +- [scripts/docker/zeppelin-server/Dockerfile](scripts/docker/zeppelin-server/Dockerfile) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterNotFoundException.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterNotFoundException.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterRunner.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterRunner.java) +- [zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/util/ByteBufferUtilTest.java](zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/util/ByteBufferUtilTest.java) +- [zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sRemoteInterpreterProcess.java](zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sRemoteInterpreterProcess.java) +- [zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sSpecTemplate.java](zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sSpecTemplate.java) +- [zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sStandardInterpreterLauncher.java](zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sStandardInterpreterLauncher.java) +- 
[zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sUtils.java](zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sUtils.java) +- [zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/PodPhaseWatcher.java](zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/PodPhaseWatcher.java) +- [zeppelin-plugins/launcher/k8s-standard/src/test/java/org/apache/zeppelin/interpreter/launcher/K8sRemoteInterpreterProcessTest.java](zeppelin-plugins/launcher/k8s-standard/src/test/java/org/apache/zeppelin/interpreter/launcher/K8sRemoteInterpreterProcessTest.java) +- [zeppelin-plugins/launcher/k8s-standard/src/test/java/org/apache/zeppelin/interpreter/launcher/K8sSpecTemplateTest.java](zeppelin-plugins/launcher/k8s-standard/src/test/java/org/apache/zeppelin/interpreter/launcher/K8sSpecTemplateTest.java) +- [zeppelin-plugins/launcher/k8s-standard/src/test/java/org/apache/zeppelin/interpreter/launcher/K8sStandardInterpreterLauncherTest.java](zeppelin-plugins/launcher/k8s-standard/src/test/java/org/apache/zeppelin/interpreter/launcher/K8sStandardInterpreterLauncherTest.java) +- [zeppelin-plugins/launcher/k8s-standard/src/test/java/org/apache/zeppelin/interpreter/launcher/K8sUtilsTest.java](zeppelin-plugins/launcher/k8s-standard/src/test/java/org/apache/zeppelin/interpreter/launcher/K8sUtilsTest.java) +- [zeppelin-plugins/launcher/k8s-standard/src/test/java/org/apache/zeppelin/interpreter/launcher/PodPhaseWatcherTest.java](zeppelin-plugins/launcher/k8s-standard/src/test/java/org/apache/zeppelin/interpreter/launcher/PodPhaseWatcherTest.java) +- [zeppelin-plugins/launcher/k8s-standard/src/test/resources/k8s-specs/interpreter-spec.yaml](zeppelin-plugins/launcher/k8s-standard/src/test/resources/k8s-specs/interpreter-spec.yaml) +- 
[zeppelin-plugins/launcher/k8s-standard/src/test/resources/log4j.properties](zeppelin-plugins/launcher/k8s-standard/src/test/resources/log4j.properties) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/recovery/StopInterpreter.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/recovery/StopInterpreter.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/SleepInterpreter.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/SleepInterpreter.java) + +
+ + + +## Purpose and Scope + +Zeppelin's Kubernetes support enables cloud-native deployment where the Zeppelin server runs in a Kubernetes cluster and launches interpreters as separate pods. This provides automatic scaling, resource isolation, and leverages Kubernetes' orchestration capabilities for interpreter lifecycle management. + +This document covers the Kubernetes-specific interpreter launcher implementation and deployment configurations. For general interpreter architecture concepts, see [Interpreter Framework](#2.3). For Spark-specific configuration when running on Kubernetes, see [Spark Interpreters](#5.2). + +## Architecture Overview + +The Kubernetes support is implemented through a specialized launcher that replaces local process execution with pod-based interpreter deployment: + +```mermaid +graph TB + subgraph "Zeppelin Server Pod" + ZeppelinServer["ZeppelinServer
Main Application"] + K8sLauncher["K8sStandardInterpreterLauncher
Pod Creator"] + K8sClient["KubernetesClient
fabric8 client"] + end + + subgraph "Kubernetes Cluster" + subgraph "Interpreter Pods" + InterpPod1["spark-abc123
Spark Interpreter Pod"] + InterpPod2["python-def456
Python Interpreter Pod"] + InterpPod3["jdbc-ghi789
JDBC Interpreter Pod"] + end + + subgraph "Spark Executor Pods" + SparkExec1["spark-exec-001"] + SparkExec2["spark-exec-002"] + end + + subgraph "Services & Networking" + Service1["spark-abc123-svc
ClusterIP None"] + Service2["python-def456-svc
ClusterIP None"] + Ingress["Spark UI Ingress
Optional"] + end + end + + subgraph "Configuration" + Templates["k8s/interpreter/
YAML Templates"] + ConfigMap["zeppelin-server-conf-map
Environment Variables"] + end + + ZeppelinServer --> K8sLauncher + K8sLauncher --> K8sClient + K8sClient --> InterpPod1 + K8sClient --> InterpPod2 + K8sClient --> InterpPod3 + K8sLauncher --> Templates + InterpPod1 --> SparkExec1 + InterpPod1 --> SparkExec2 + InterpPod1 --> Service1 + Service1 --> Ingress + ConfigMap --> ZeppelinServer +``` + +**Key Design Principles:** +- Each interpreter group runs in its own pod for isolation +- Templates in `k8s/interpreter/` define pod specifications +- Fabric8 Kubernetes client handles cluster communication +- Automatic resource cleanup via owner references + +Sources: [zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sStandardInterpreterLauncher.java:1-147](), [k8s/interpreter/100-interpreter-spec.yaml:1-217](), [k8s/zeppelin-server.yaml:1-228]() + +## Core Components + +The Kubernetes launcher implementation consists of several key classes that handle different aspects of pod management: + +```mermaid +classDiagram + class K8sStandardInterpreterLauncher { + -KubernetesClient client + -ZeppelinConfiguration zConf + +launchDirectly(context) K8sRemoteInterpreterProcess + +getZeppelinService(context) String + +getZeppelinServiceRpcPort(context) int + +buildEnvFromProperties(context) Map + } + + class K8sRemoteInterpreterProcess { + -String podName + -String interpreterNamespace + -File specTemplates + -Properties properties + -AtomicBoolean started + +start(userName) void + +stop() void + +isRunning() boolean + +getPodPhase() String + +apply(path, delete, templateProperties) void + +getTemplateBindings(userName) Properties + } + + class K8sUtils { + +calculateMemoryWithDefaultOverhead(memory) String + +calculateSparkMemory(memory, memoryOverhead) String + +getCurrentK8sNamespace() String + +getInterpreterNamespace(properties, zConf) String + +isRunningOnKubernetes() boolean + +generateK8sName(baseName, randomSuffix) String + } + + class K8sSpecTemplate { + +render(templateFile) 
String + +render(template) String + +loadProperties(properties) void + } + + class PodPhaseWatcher { + -CountDownLatch countDownLatch + -Predicate predicate + +eventReceived(action, pod) void + +onClose(cause) void + +getCountDownLatch() CountDownLatch + } + + K8sStandardInterpreterLauncher --> K8sRemoteInterpreterProcess : creates + K8sRemoteInterpreterProcess --> K8sUtils : uses + K8sRemoteInterpreterProcess --> K8sSpecTemplate : uses + K8sRemoteInterpreterProcess --> PodPhaseWatcher : uses + K8sStandardInterpreterLauncher --> KubernetesClient : uses + K8sRemoteInterpreterProcess --> KubernetesClient : uses +``` + +### Component Responsibilities + +| Component | Primary Responsibility | Key Methods | +|-----------|----------------------|-------------| +| `K8sStandardInterpreterLauncher` | Entry point for creating interpreter pods | `launchDirectly()`, `buildEnvFromProperties()` | +| `K8sRemoteInterpreterProcess` | Manages individual interpreter pod lifecycle | `start()`, `stop()`, `apply()`, `getTemplateBindings()` | +| `K8sUtils` | Kubernetes utility functions and namespace resolution | `calculateMemoryWithDefaultOverhead()`, `isRunningOnKubernetes()` | +| `K8sSpecTemplate` | YAML template rendering using Jinjava | `render()`, `loadProperties()` | +| `PodPhaseWatcher` | Monitors pod state transitions | `eventReceived()`, `onClose()` | + +Sources: [zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sStandardInterpreterLauncher.java:39-147](), [zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sRemoteInterpreterProcess.java:54-493](), [zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sUtils.java:36-177]() + +## Interpreter Pod Deployment Process + +The deployment of interpreter pods follows a structured sequence involving template rendering, pod creation, and state monitoring: + +```mermaid +sequenceDiagram + participant User + 
participant ZeppelinServer + participant K8sLauncher as "K8sStandardInterpreterLauncher" + participant K8sProcess as "K8sRemoteInterpreterProcess" + participant K8sClient as "KubernetesClient" + participant K8sCluster as "Kubernetes Cluster" + participant InterpPod as "Interpreter Pod" + + User->>ZeppelinServer: Execute paragraph + ZeppelinServer->>K8sLauncher: launchDirectly(context) + K8sLauncher->>K8sProcess: new K8sRemoteInterpreterProcess() + K8sLauncher->>K8sProcess: getZeppelinService(context) + K8sLauncher->>K8sProcess: buildEnvFromProperties(context) + + ZeppelinServer->>K8sProcess: start(userName) + K8sProcess->>K8sProcess: getTemplateBindings(userName) + K8sProcess->>K8sProcess: apply(specTemplates, false, templateProperties) + K8sProcess->>K8sClient: load(renderedTemplate) + K8sClient->>K8sCluster: createOrReplace(pod, service, rbac) + + K8sCluster->>InterpPod: Pod created (Pending) + K8sProcess->>K8sClient: watch(podName, PodPhaseWatcher) + InterpPod->>K8sProcess: Pod phase: Running + InterpPod->>K8sProcess: processStarted(port, host) + K8sProcess->>ZeppelinServer: Interpreter ready + + Note over User,InterpPod: Interpreter execution ready + + User->>ZeppelinServer: Stop interpreter + ZeppelinServer->>K8sProcess: stop() + K8sProcess->>K8sClient: watch(podName, termination) + K8sProcess->>K8sProcess: apply(specTemplates, true, templateProperties) + K8sClient->>K8sCluster: delete(pod, service, rbac) +``` + +### Template Processing Pipeline + +The pod specification is generated through a multi-step template processing pipeline: + +1. **Template Loading**: YAML files from `k8s/interpreter/` directory are loaded +2. **Property Binding**: Interpreter properties are converted to template variables via `getTemplateBindings()` +3. **Jinjava Rendering**: Templates are rendered with dynamic values using `K8sSpecTemplate.render()` +4. 
**Resource Creation**: Rendered YAML is applied to cluster via Fabric8 client + +Sources: [zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sRemoteInterpreterProcess.java:152-195](), [zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sRemoteInterpreterProcess.java:254-285](), [zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sSpecTemplate.java:32-81]() + +## Configuration and Customization + +### Server-Level Configuration + +The Zeppelin server requires specific environment variables and configurations to enable Kubernetes mode: + +| Environment Variable | Purpose | Default Value | +|---------------------|---------|---------------| +| `ZEPPELIN_RUN_MODE` | Activates Kubernetes launcher | `k8s` | +| `ZEPPELIN_K8S_CONTAINER_IMAGE` | Interpreter container image | `apache/zeppelin-interpreter:0.10.0` | +| `ZEPPELIN_K8S_SPARK_CONTAINER_IMAGE` | Spark container image | `spark:2.4.5` | +| `ZEPPELIN_K8S_NAMESPACE` | Default interpreter namespace | Current pod namespace | +| `ZEPPELIN_K8S_PORTFORWARD` | Enable port forwarding for development | `false` | + +### Interpreter-Level Properties + +Individual interpreters can be customized through properties in their settings: + +| Property | Description | Example Value | +|----------|-------------|---------------| +| `zeppelin.k8s.interpreter.namespace` | Interpreter pod namespace | `zeppelin-interpreters` | +| `zeppelin.k8s.interpreter.serviceAccount` | Service account for pod | `zeppelin-interpreter` | +| `zeppelin.k8s.interpreter.container.image` | Custom interpreter image | `my-registry/zeppelin:latest` | +| `zeppelin.k8s.interpreter.cores` | CPU request/limit | `2` | +| `zeppelin.k8s.interpreter.memory` | Memory request | `4Gi` | +| `zeppelin.k8s.interpreter.gpu.type` | GPU resource type | `nvidia.com/gpu` | +| `zeppelin.k8s.interpreter.gpu.nums` | Number of GPUs | `1` | + +### Template 
Customization + +The pod specifications are defined in YAML templates under `k8s/interpreter/`. The main template structure includes: + +- **Pod Specification**: Container image, resources, environment variables +- **Service Definition**: Headless service for pod discovery +- **RBAC Resources**: Role and RoleBinding for Spark interpreters +- **Ingress Resources**: Optional Spark UI access (when `zeppelin.k8s.spark.useIngress=true`) + +Key template variables populated by `getTemplateBindings()`: + +```yaml +metadata: + namespace: {{zeppelin.k8s.interpreter.namespace}} + name: {{zeppelin.k8s.interpreter.pod.name}} +spec: + serviceAccountName: {{zeppelin.k8s.interpreter.serviceAccount}} + containers: + - name: {{zeppelin.k8s.interpreter.container.name}} + image: {{zeppelin.k8s.interpreter.container.image}} + resources: + requests: + memory: "{{zeppelin.k8s.interpreter.memory}}" + cpu: "{{zeppelin.k8s.interpreter.cores}}" +``` + +Sources: [k8s/interpreter/100-interpreter-spec.yaml:17-217](), [zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sRemoteInterpreterProcess.java:288-375](), [k8s/zeppelin-server.yaml:17-39]() + +## Spark on Kubernetes Integration + +When the interpreter group is `spark`, additional Kubernetes-specific configurations are automatically applied: + +### Automatic Spark Configuration + +The `K8sRemoteInterpreterProcess.buildSparkSubmitOptions()` method generates Spark submit options for Kubernetes execution: + +```bash +--master k8s://https://kubernetes.default.svc +--deploy-mode client +--conf spark.kubernetes.namespace=default +--conf spark.executor.instances=1 +--conf spark.kubernetes.driver.pod.name=spark-abc123 +--conf spark.kubernetes.container.image=spark:2.4.5 +--conf spark.driver.bindAddress=0.0.0.0 +--conf spark.driver.host=spark-abc123.default.svc +--conf spark.driver.port=22321 +--conf spark.blockManager.port=22322 +``` + +### Resource Calculation + +Spark driver resources are automatically 
calculated from interpreter properties: + +- **Memory**: Combines `spark.driver.memory` + `spark.driver.memoryOverhead` using `K8sUtils.calculateSparkMemory()` +- **CPU**: Maps `spark.driver.cores` to Kubernetes CPU requests/limits +- **Network**: Predefined ports for driver (22321) and block manager (22322) + +### Spark UI Access + +The Spark UI is accessible through multiple mechanisms: + +1. **Reverse Proxy**: Via nginx sidecar using pattern `4040-{podName}.{serviceDomain}` +2. **Ingress**: When `zeppelin.k8s.spark.useIngress=true`, creates Kubernetes Ingress resource +3. **Port Forward**: For development with `ZEPPELIN_K8S_PORTFORWARD=true` + +The UI URL template is configurable via `zeppelin.spark.uiWebUrl` property and rendered using `sparkUiWebUrlFromTemplate()`. + +### RBAC Permissions + +Spark interpreters require additional RBAC permissions to create executor pods: + +```yaml +rules: +- apiGroups: [""] + resources: ["pods", "services", "configmaps"] + verbs: ["create", "get", "update", "list", "delete", "watch"] +``` + +Sources: [zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sRemoteInterpreterProcess.java:406-428](), [zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sRemoteInterpreterProcess.java:354-367](), [k8s/interpreter/100-interpreter-spec.yaml:146-185]() + +## Development and Troubleshooting + +### Development Mode + +For development, Zeppelin server can run locally while launching interpreters in Kubernetes: + +| Environment Variable | Value | Purpose | +|---------------------|-------|---------| +| `ZEPPELIN_RUN_MODE` | `k8s` | Force Kubernetes mode | +| `ZEPPELIN_K8S_PORTFORWARD` | `true` | Enable local port forwarding | +| `KUBERNETES_AUTH_TOKEN` | `<token>` | Cluster authentication | + +This configuration uses `LocalPortForward` to tunnel interpreter connections through `kubectl port-forward`. 
+ +### Pod Lifecycle Monitoring + +The `PodPhaseWatcher` class monitors pod state transitions: + +- **Pending → Running**: Normal startup sequence +- **Failed**: Triggers cleanup and error reporting +- **Succeeded**: Natural termination + +Timeout behavior is controlled by `zeppelin.k8s.interpreter.timeout.during.pending` property. + +### Common Issues and Diagnostics + +| Issue | Diagnostic Method | Solution | +|-------|------------------|----------| +| Pod startup timeout | Check `getPodPhase()` and pod events | Increase timeout or check resource availability | +| Image pull failures | Verify `imagePullSecrets` configuration | Configure registry credentials | +| RBAC errors | Check service account permissions | Update ClusterRole/RoleBinding | +| Network connectivity | Test port forwarding with `processStarted()` | Verify service discovery and DNS | + +### Resource Cleanup + +Automatic cleanup is handled through Kubernetes owner references when server and interpreter pods are in the same namespace. The `ownerUID()` and `ownerName()` methods link interpreter pods to the Zeppelin server pod for garbage collection. + +Sources: [docs/quickstart/kubernetes.md:274-287](), [zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/PodPhaseWatcher.java:32-70](), [zeppelin-plugins/launcher/k8s-standard/src/main/java/org/apache/zeppelin/interpreter/launcher/K8sRemoteInterpreterProcess.java:459-465]() diff --git a/.cursor/documentation/interpreters/flink_interpreter.md b/.cursor/documentation/interpreters/flink_interpreter.md new file mode 100644 index 00000000000..7f521da760a --- /dev/null +++ b/.cursor/documentation/interpreters/flink_interpreter.md @@ -0,0 +1,316 @@ +# Flink Interpreter + +
+Relevant source files + +The following files were used as context for generating this wiki page: + +- [conf/log4j2.properties](conf/log4j2.properties) +- [docs/interpreter/flink.md](docs/interpreter/flink.md) +- [flink-cmd/src/main/java/org/apache/zeppelin/flink/cmd/FlinkCmdInterpreter.java](flink-cmd/src/main/java/org/apache/zeppelin/flink/cmd/FlinkCmdInterpreter.java) +- [flink-cmd/src/main/java/org/apache/zeppelin/flink/cmd/YarnUtils.java](flink-cmd/src/main/java/org/apache/zeppelin/flink/cmd/YarnUtils.java) +- [flink-cmd/src/main/resources/interpreter-setting.json](flink-cmd/src/main/resources/interpreter-setting.json) +- [flink-cmd/src/test/java/org/apache/zeppelin/flink/cmd/FlinkCmdInterpreterTest.java](flink-cmd/src/test/java/org/apache/zeppelin/flink/cmd/FlinkCmdInterpreterTest.java) +- [flink/pom.xml](flink/pom.xml) + +
+ + + +This document covers Apache Zeppelin's Flink interpreter system, which provides integration with Apache Flink for stream and batch data processing. The Flink interpreter enables users to execute Scala, Python, and SQL code against Flink clusters in various deployment modes. + +For general interpreter framework concepts, see [Interpreter Framework](#2.3). For information about other streaming interpreters, see [Remote Interpreter Infrastructure](#5.1). + +## Overview and Architecture + +The Flink interpreter system consists of multiple specialized interpreters that work together to provide comprehensive Flink support. The system acts as a Flink client responsible for compiling, submitting, and managing Flink job lifecycles. + +```mermaid +graph TB + subgraph "Zeppelin Server" + ISM["InterpreterSettingManager"] + FlinkGroup["Flink Interpreter Group"] + end + + subgraph "Flink Interpreter Process" + FlinkMain["FlinkInterpreter"] + PyFlink["PyFlinkInterpreter"] + IPyFlink["IPyFlinkInterpreter"] + StreamSQL["FlinkStreamSqlInterpreter"] + BatchSQL["FlinkBatchSqlInterpreter"] + + ScalaShell["Scala Shell
ExecutionEnvironment
StreamExecutionEnvironment
TableEnvironment"] + PythonShell["Python Shell
PyFlink Runtime"] + end + + subgraph "Flink Cluster Options" + Local["MiniCluster
(Local Mode)"] + Remote["Standalone Cluster
(Remote Mode)"] + Yarn["Yarn Session
(Yarn Mode)"] + YarnApp["Yarn Application
(Yarn-Application Mode)"] + end + + ISM --> FlinkGroup + FlinkGroup --> FlinkMain + FlinkGroup --> PyFlink + FlinkGroup --> IPyFlink + FlinkGroup --> StreamSQL + FlinkGroup --> BatchSQL + + FlinkMain --> ScalaShell + PyFlink --> PythonShell + IPyFlink --> PythonShell + StreamSQL --> ScalaShell + BatchSQL --> ScalaShell + + ScalaShell --> Local + ScalaShell --> Remote + ScalaShell --> Yarn + ScalaShell --> YarnApp +``` + +Sources: [docs/interpreter/flink.md:155-169]() + +## Interpreter Components + +The Flink interpreter group provides five distinct interpreters, each serving specific use cases: + +| Interpreter | Class | Description | +|-------------|-------|-------------| +| `%flink` | `FlinkInterpreter` | Scala environment with Flink execution contexts | +| `%flink.pyflink` | `PyFlinkInterpreter` | Python environment for PyFlink | +| `%flink.ipyflink` | `IPyFlinkInterpreter` | Enhanced IPython environment for PyFlink | +| `%flink.ssql` | `FlinkStreamSqlInterpreter` | Streaming SQL execution | +| `%flink.bsql` | `FlinkBatchSqlInterpreter` | Batch SQL execution | + +### Built-in Variables + +The Scala shell creates several built-in variables that are shared across interpreters: + +- `senv` - `StreamExecutionEnvironment` for streaming jobs +- `benv` - `ExecutionEnvironment` for batch jobs +- `stenv` - `StreamTableEnvironment` for streaming SQL +- `btenv` - `BatchTableEnvironment` for batch SQL +- `z` - `ZeppelinContext` for display and visualization + +Sources: [docs/interpreter/flink.md:33-64](), [docs/interpreter/flink.md:386-398]() + +## Multi-Version Support + +The Flink interpreter supports multiple Flink and Scala versions through a modular architecture: + +```mermaid +graph TB + FlinkParent["flink-parent"] + + subgraph "Version Modules" + FlinkShims["flink-shims
(Common Interface)"] + Flink112["flink1.12-shims"] + Flink113["flink1.13-shims"] + Flink114["flink1.14-shims"] + Flink115["flink1.15-shims"] + end + + subgraph "Scala Version Support" + ScalaParent["flink-scala-parent"] + Scala211["flink-scala-2.11"] + Scala212["flink-scala-2.12"] + end + + FlinkParent --> FlinkShims + FlinkParent --> Flink112 + FlinkParent --> Flink113 + FlinkParent --> Flink114 + FlinkParent --> Flink115 + + FlinkParent --> ScalaParent + ScalaParent --> Scala211 + ScalaParent --> Scala212 +``` + +The system uses Maven profiles to build specific version combinations: +- `flink-112` (default): Flink 1.12 with Scala 2.11/2.12 +- `flink-113`: Flink 1.13 with Scala 2.11/2.12 +- `flink-114`: Flink 1.14 with Scala 2.11/2.12 +- `flink-115`: Flink 1.15 with Scala 2.12 only + +Sources: [flink/pom.xml:36-90]() + +## Execution Modes + +The Flink interpreter supports four execution modes configured via `flink.execution.mode`: + +### Local Mode +Creates a `MiniCluster` within the interpreter JVM. Default configuration uses port 8081 with 4 TaskManagers and 1 slot each. + +### Remote Mode +Connects to an existing Flink cluster. Requires `flink.execution.remote.host` and `flink.execution.remote.port` configuration. + +### Yarn Mode +Launches a dedicated Flink session cluster on YARN. Requires `HADOOP_CONF_DIR` and `hadoop` command availability. + +### Yarn Application Mode +Runs the Flink interpreter within the JobManager on YARN (Flink 1.11+). Recommended for production to reduce resource usage on Zeppelin server. + +```mermaid +graph LR + User["User Code"] + + subgraph "Local Mode" + LocalInterp["Flink Interpreter"] + MiniCluster["MiniCluster
(port 8081)"] + LocalInterp --> MiniCluster + end + + subgraph "Remote Mode" + RemoteInterp["Flink Interpreter"] + StandaloneCluster["Standalone Cluster
(host:port)"] + RemoteInterp --> StandaloneCluster + end + + subgraph "Yarn Mode" + YarnInterp["Flink Interpreter"] + YarnSession["Yarn Session Cluster"] + YarnInterp --> YarnSession + end + + subgraph "Yarn Application Mode" + YarnAppCluster["Yarn Application Cluster"] + JobManager["JobManager
(contains interpreter)"] + YarnAppCluster --> JobManager + end + + User --> LocalInterp + User --> RemoteInterp + User --> YarnInterp + User --> JobManager +``` + +Sources: [docs/interpreter/flink.md:344-384]() + +## Configuration System + +The Flink interpreter provides extensive configuration options covering execution, resources, and integration settings: + +### Core Configuration + +| Property | Default | Description | +|----------|---------|-------------| +| `FLINK_HOME` | - | **Required** - Flink installation location | +| `flink.execution.mode` | `local` | Execution mode: local, remote, yarn, yarn-application | +| `jobmanager.memory.process.size` | `1024m` | JobManager total memory | +| `taskmanager.memory.process.size` | `1024m` | TaskManager total memory | +| `taskmanager.numberOfTaskSlots` | `1` | Slots per TaskManager | + +### Dependencies and UDFs + +The system supports multiple approaches for adding dependencies: + +- `flink.execution.packages` - Maven-style dependencies (recommended) +- `flink.execution.jars` - Direct JAR file paths +- `flink.udf.jars` - Auto-registered UDF JARs with class scanning + +### SQL-Specific Configuration + +- `zeppelin.flink.concurrentBatchSql.max` - Max concurrent batch SQL (default: 10) +- `zeppelin.flink.concurrentStreamSql.max` - Max concurrent stream SQL (default: 10) +- `zeppelin.flink.maxResult` - Max rows returned by SQL (default: 1000) + +Sources: [docs/interpreter/flink.md:170-335]() + +## SQL Interpreters and Visualization + +The Flink SQL interpreters provide enhanced capabilities beyond standard Flink SQL Client: + +### Stream SQL Visualization Modes + +The `%flink.ssql` interpreter supports three visualization modes for streaming results: + +1. **Single Mode** - Single row results with HTML template support +2. **Update Mode** - Multi-row results with continuous updates +3. 
**Append Mode** - Append-only results for windowed aggregations + +```mermaid +graph TB + SQLCode["SQL Statement"] + + subgraph "Streaming SQL Execution" + Parser["SQL Parser"] + JobSubmit["Job Submission"] + FlinkJob["Flink Streaming Job"] + end + + subgraph "Visualization Pipeline" + ResultCollector["Result Collector"] + + Single["Single Mode
HTML Template"] + Update["Update Mode
Table Updates"] + Append["Append Mode
Incremental Data"] + end + + SQLCode --> Parser + Parser --> JobSubmit + JobSubmit --> FlinkJob + FlinkJob --> ResultCollector + + ResultCollector --> Single + ResultCollector --> Update + ResultCollector --> Append +``` + +### Paragraph Properties + +SQL execution behavior is controlled via paragraph-local properties: + +- `type` - Visualization mode (single/update/append) +- `parallelism` - Job parallelism override +- `jobName` - Custom job name for insert statements +- `runAsOne` - Execute multiple inserts in a single job +- `template` - HTML template for single-mode visualization + +Sources: [docs/interpreter/flink.md:442-528](), [docs/interpreter/flink.md:731-792]() + +## Python Integration + +The Flink interpreter provides comprehensive Python support through PyFlink: + +### Python Interpreter Variants + +- `%flink.pyflink` - Basic Python environment +- `%flink.ipyflink` - Enhanced IPython environment with Jupyter-like features + +### Python Environment Variables + +- `s_env` - `StreamExecutionEnvironment` +- `b_env` - `ExecutionEnvironment` +- `st_env` - `StreamTableEnvironment` +- `bt_env` - `BatchTableEnvironment` + +The Python shell shares the same underlying Flink context with the Scala shell, enabling cross-language UDF usage. 
+ +Sources: [docs/interpreter/flink.md:529-583]() + +## Command-Line Integration + +A separate `flink-cmd` module provides direct Flink CLI integration: + +```mermaid +graph LR + ZeppelinUI["Zeppelin UI"] + FlinkCmdInterpreter["FlinkCmdInterpreter"] + ShellInterpreter["ShellInterpreter"] + FlinkCLI["flink CLI Command"] + YarnUtils["YarnUtils"] + + ZeppelinUI --> FlinkCmdInterpreter + FlinkCmdInterpreter --> ShellInterpreter + ShellInterpreter --> FlinkCLI + FlinkCmdInterpreter --> YarnUtils +``` + +The `FlinkCmdInterpreter` extends `ShellInterpreter` to: +- Execute Flink CLI commands directly +- Extract YARN application URLs for job monitoring +- Provide timeout-free execution for long-running jobs + +Sources: [flink-cmd/src/main/java/org/apache/zeppelin/flink/cmd/FlinkCmdInterpreter.java:32-52](), [flink-cmd/src/main/java/org/apache/zeppelin/flink/cmd/YarnUtils.java:34-58]() diff --git a/.cursor/documentation/interpreters/interpreters.md b/.cursor/documentation/interpreters/interpreters.md new file mode 100644 index 00000000000..ab937207468 --- /dev/null +++ b/.cursor/documentation/interpreters/interpreters.md @@ -0,0 +1,475 @@ +# Interpreters + +
+Relevant source files + +The following files were used as context for generating this wiki page: + +- [docs/assets/themes/zeppelin/img/docs-img/jdbc_refresh.gif](docs/assets/themes/zeppelin/img/docs-img/jdbc_refresh.gif) +- [docs/assets/themes/zeppelin/img/screenshots/interpreter_setting_with_context_parameters.png](docs/assets/themes/zeppelin/img/screenshots/interpreter_setting_with_context_parameters.png) +- [docs/interpreter/jdbc.md](docs/interpreter/jdbc.md) +- [docs/interpreter/spark.md](docs/interpreter/spark.md) +- [docs/quickstart/sql_with_zeppelin.md](docs/quickstart/sql_with_zeppelin.md) +- [jdbc/pom.xml](jdbc/pom.xml) +- [jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java](jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java) +- [jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCUserConfigurations.java](jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCUserConfigurations.java) +- [jdbc/src/main/java/org/apache/zeppelin/jdbc/SqlCompleter.java](jdbc/src/main/java/org/apache/zeppelin/jdbc/SqlCompleter.java) +- [jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java](jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java) +- [jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java](jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java) +- [jdbc/src/main/java/org/apache/zeppelin/jdbc/hive/BeelineInPlaceUpdateStream.java](jdbc/src/main/java/org/apache/zeppelin/jdbc/hive/BeelineInPlaceUpdateStream.java) +- [jdbc/src/main/java/org/apache/zeppelin/jdbc/hive/HiveUtils.java](jdbc/src/main/java/org/apache/zeppelin/jdbc/hive/HiveUtils.java) +- [jdbc/src/main/java/org/apache/zeppelin/jdbc/hive/ProgressBar.java](jdbc/src/main/java/org/apache/zeppelin/jdbc/hive/ProgressBar.java) +- [jdbc/src/main/java/org/apache/zeppelin/jdbc/hive/YarnUtil.java](jdbc/src/main/java/org/apache/zeppelin/jdbc/hive/YarnUtil.java) +- 
[jdbc/src/main/java/org/apache/zeppelin/jdbc/security/JDBCSecurityImpl.java](jdbc/src/main/java/org/apache/zeppelin/jdbc/security/JDBCSecurityImpl.java) +- [jdbc/src/main/resources/interpreter-setting.json](jdbc/src/main/resources/interpreter-setting.json) +- [jdbc/src/test/java/org/apache/zeppelin/jdbc/JDBCInterpreterInterpolationTest.java](jdbc/src/test/java/org/apache/zeppelin/jdbc/JDBCInterpreterInterpolationTest.java) +- [jdbc/src/test/java/org/apache/zeppelin/jdbc/JDBCInterpreterTest.java](jdbc/src/test/java/org/apache/zeppelin/jdbc/JDBCInterpreterTest.java) +- [jdbc/src/test/java/org/apache/zeppelin/jdbc/SqlCompleterTest.java](jdbc/src/test/java/org/apache/zeppelin/jdbc/SqlCompleterTest.java) +- [jdbc/src/test/java/org/apache/zeppelin/jdbc/hive/HiveUtilsTest.java](jdbc/src/test/java/org/apache/zeppelin/jdbc/hive/HiveUtilsTest.java) +- [jdbc/src/test/resources/log4j.properties](jdbc/src/test/resources/log4j.properties) +- [kotlin/src/test/java/org/apache/zeppelin/kotlin/KotlinInterpreterTest.java](kotlin/src/test/java/org/apache/zeppelin/kotlin/KotlinInterpreterTest.java) +- [livy/src/main/java/org/apache/zeppelin/livy/SessionDeadException.java](livy/src/main/java/org/apache/zeppelin/livy/SessionDeadException.java) +- [spark-submit/src/main/java/org/apache/zeppelin/spark/submit/SparkSubmitInterpreter.java](spark-submit/src/main/java/org/apache/zeppelin/spark/submit/SparkSubmitInterpreter.java) +- [spark-submit/src/main/resources/interpreter-setting.json](spark-submit/src/main/resources/interpreter-setting.json) +- [spark/interpreter/src/main/resources/interpreter-setting.json](spark/interpreter/src/main/resources/interpreter-setting.json) +- [spark/interpreter/src/test/java/org/apache/zeppelin/spark/KotlinSparkInterpreterTest.java](spark/interpreter/src/test/java/org/apache/zeppelin/spark/KotlinSparkInterpreterTest.java) +- [zeppelin-interpreter-integration/README.md](zeppelin-interpreter-integration/README.md) +- 
[zeppelin-interpreter-integration/src/test/java/org/apache/zeppelin/integration/SparkIntegrationTest.java](zeppelin-interpreter-integration/src/test/java/org/apache/zeppelin/integration/SparkIntegrationTest.java) +- [zeppelin-interpreter-integration/src/test/java/org/apache/zeppelin/integration/SparkSubmitIntegrationTest.java](zeppelin-interpreter-integration/src/test/java/org/apache/zeppelin/integration/SparkSubmitIntegrationTest.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/completer/CachedCompleter.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/completer/CachedCompleter.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/completer/StringsCompleter.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/completer/StringsCompleter.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/AbstractInterpreter.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/AbstractInterpreter.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/Interpreter.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/Interpreter.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterException.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterException.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterOption.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterOption.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/launcher/InterpreterClient.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/launcher/InterpreterClient.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/launcher/InterpreterLaunchContext.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/launcher/InterpreterLaunchContext.java) +- 
[zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/util/SqlSplitter.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/util/SqlSplitter.java) +- [zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/InterpreterTest.java](zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/InterpreterTest.java) +- [zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/ZeppCtxtVariableTest.java](zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/ZeppCtxtVariableTest.java) +- [zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/util/SqlSplitterTest.java](zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/util/SqlSplitterTest.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/rest/InterpreterRestApi.java](zeppelin-server/src/main/java/org/apache/zeppelin/rest/InterpreterRestApi.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/rest/InterpreterRestApiTest.java](zeppelin-server/src/test/java/org/apache/zeppelin/rest/InterpreterRestApiTest.java) +- [zeppelin-web/e2e/collaborativeMode.spec.js](zeppelin-web/e2e/collaborativeMode.spec.js) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterFactory.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterFactory.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSetting.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSetting.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSettingManager.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSettingManager.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/ManagedInterpreterGroup.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/ManagedInterpreterGroup.java) +- 
[zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/launcher/SparkInterpreterLauncher.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/launcher/SparkInterpreterLauncher.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/InterpreterFactoryTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/InterpreterFactoryTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/InterpreterSettingManagerTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/InterpreterSettingManagerTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/InterpreterSettingTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/InterpreterSettingTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/launcher/SparkInterpreterLauncherTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/launcher/SparkInterpreterLauncherTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/mock/MockInterpreter1.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/mock/MockInterpreter1.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/mock/MockInterpreter2.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/mock/MockInterpreter2.java) + +
+ + + +This document provides a comprehensive guide to Zeppelin's interpreter system, which enables execution of code in multiple programming languages and data processing frameworks within notebook paragraphs. The interpreter system provides a pluggable architecture for language support, configuration management, process isolation, and distributed execution. + +For information about specific interpreter implementations and deployment patterns, see [Remote Interpreter Infrastructure](#5.1), [Spark Interpreters](#5.2), [Python Interpreter](#5.3), [JDBC Interpreter](#5.4), [Flink Interpreter](#5.5), and [Livy Interpreter](#5.6). + +## Architecture Overview + +The interpreter system is built around a multi-layered architecture that separates configuration management, instance creation, and execution environments. The system supports both local and remote execution modes with comprehensive lifecycle management. + +```mermaid +graph TB + subgraph "Zeppelin Server Process" + ISM["InterpreterSettingManager
Central Registry"] + IF["InterpreterFactory
Instance Provider"] + IConfig["interpreter-setting.json
Configuration Templates"] + IS["InterpreterSetting
Configuration & Lifecycle"] + end + + subgraph "Interpreter Instances" + RemoteInterp["RemoteInterpreter
Proxy to Remote Process"] + MIG["ManagedInterpreterGroup
Session Management"] + LocalInterp["Local Interpreters
In-Process Execution"] + end + + subgraph "Remote Interpreter Process" + RIS["RemoteInterpreterServer
Thrift RPC Server"] + IG["InterpreterGroup
Session Container"] + ActualInterp["Actual Interpreter
Language Runtime"] + end + + subgraph "Language Implementations" + JDBC["JDBCInterpreter
SQL Execution"] + Spark["SparkInterpreter
Scala REPL"] + Python["PythonInterpreter
Python Execution"] + Shell["ShellInterpreter
Shell Commands"] + end + + ISM --> IF + ISM --> IS + IS --> IConfig + IF --> RemoteInterp + IS --> MIG + MIG --> RemoteInterp + RemoteInterp --> RIS + RIS --> IG + IG --> ActualInterp + ActualInterp --> JDBC + ActualInterp --> Spark + ActualInterp --> Python + ActualInterp --> Shell +``` + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSettingManager.java:104-109](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterFactory.java:29-36](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSetting.java:76-142]() + +## Core Components + +### InterpreterSettingManager + +The `InterpreterSettingManager` serves as the central registry for all interpreter configurations and provides the primary interface for interpreter lifecycle management. + +| Component | Responsibility | Key Methods | +|-----------|---------------|-------------| +| Template Management | Load interpreter definitions from `interpreter-setting.json` | `loadInterpreterSettingFromDefaultDir()` | +| Instance Registry | Maintain active interpreter settings | `interpreterSettings` Map | +| Configuration Persistence | Save/load interpreter configurations | `saveToFile()`, `loadFromFile()` | +| Factory Integration | Coordinate with InterpreterFactory | `getByName()`, `createNewSetting()` | + +The manager maintains two key data structures: +- `interpreterSettingTemplates`: Default configurations loaded from interpreter directories +- `interpreterSettings`: Active interpreter instances with user customizations + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSettingManager.java:121-134](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSettingManager.java:361-385]() + +### InterpreterSetting + +`InterpreterSetting` represents a configured interpreter group with its properties, dependencies, and runtime options. 
Each setting can spawn multiple interpreter groups based on isolation requirements. + +```mermaid +graph TB + IS["InterpreterSetting
(e.g., 'spark', 'jdbc')"] + + subgraph "Configuration" + Props["Properties
URL, credentials, etc."] + Deps["Dependencies
Maven artifacts"] + Option["InterpreterOption
Isolation settings"] + InfoList["List#lt;InterpreterInfo#gt;
Available interpreters"] + end + + subgraph "Runtime Groups" + MIG1["ManagedInterpreterGroup
user1-note1"] + MIG2["ManagedInterpreterGroup
user2-note2"] + MIG3["ManagedInterpreterGroup
shared"] + end + + subgraph "Group Sessions" + Session1["Session: user1:note1
Interpreter instances"] + Session2["Session: user2
Interpreter instances"] + SessionShared["Session: shared
Interpreter instances"] + end + + IS --> Props + IS --> Deps + IS --> Option + IS --> InfoList + IS --> MIG1 + IS --> MIG2 + IS --> MIG3 + MIG1 --> Session1 + MIG2 --> Session2 + MIG3 --> SessionShared +``` + +The interpreter group ID generation follows specific patterns based on isolation settings: +- **Per-user isolation**: `{settingId}-{userId}` +- **Per-note isolation**: `{settingId}-{userId}-{noteId}` +- **Shared mode**: `{settingId}-shared_process` + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSetting.java:400-426](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSetting.java:428-443]() + +### InterpreterFactory + +The `InterpreterFactory` provides the main entry point for obtaining interpreter instances, resolving interpreter names to actual implementations. + +The resolution process follows this hierarchy: +1. **Fully qualified**: `{group}.{interpreter}` (e.g., `spark.sql`) +2. **Group only**: `{group}` resolves to default interpreter in group +3. **Interpreter only**: `{interpreter}` within default group + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterFactory.java:38-83]() + +## Interpreter Configuration System + +### Configuration Templates + +Interpreter configuration is defined in `interpreter-setting.json` files located in each interpreter's directory. These templates define available interpreters, their properties, and default values. 
+ +```json +{ + "group": "jdbc", + "name": "sql", + "className": "org.apache.zeppelin.jdbc.JDBCInterpreter", + "properties": { + "default.url": { + "defaultValue": "jdbc:postgresql://localhost:5432/", + "description": "The URL for JDBC", + "type": "string" + } + } +} +``` + +### Property Types and Validation + +The system supports various property types with built-in validation: + +| Type | Description | Example | +|------|-------------|---------| +| `string` | Text values | Database URLs, usernames | +| `password` | Masked input | Database passwords | +| `number` | Numeric values | Connection limits, timeouts | +| `checkbox` | Boolean flags | Enable/disable features | +| `textarea` | Multi-line text | SQL precode, custom configurations | + +Sources: [jdbc/src/main/resources/interpreter-setting.json:6-69](), [spark/interpreter/src/main/resources/interpreter-setting.json:7-164]() + +### Runtime Property Resolution + +Properties are resolved with the following precedence: +1. User-configured values in interpreter settings +2. Default values from templates +3. Environment variables (for properties with `envName`) +4. 
System defaults + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSetting.java:626-651]() + +## Process Lifecycle Management + +### Interpreter Creation and Initialization + +The interpreter creation process involves multiple phases with clear separation of concerns: + +```mermaid +sequenceDiagram + participant User + participant IF as "InterpreterFactory" + participant IS as "InterpreterSetting" + participant MIG as "ManagedInterpreterGroup" + participant Launcher as "InterpreterLauncher" + participant RIP as "RemoteInterpreterProcess" + + User->>IF: "getInterpreter(replName, context)" + IF->>IS: "getInterpreter(context, name)" + IS->>MIG: "getOrCreateInterpreterGroup(context)" + IS->>MIG: "getOrCreateSession(user, sessionId)" + + alt "First access to group" + MIG->>IS: "createInterpreterProcess(groupId, user, props)" + IS->>Launcher: "launch(launchContext)" + Launcher->>RIP: "start remote process" + RIP-->>Launcher: "process started" + Launcher-->>IS: "RemoteInterpreterProcess" + IS->>IS: "createInterpreters(user, groupId, sessionId)" + end + + MIG-->>IS: "List#lt;Interpreter#gt;" + IS-->>IF: "Interpreter instance" + IF-->>User: "Ready interpreter" +``` + +### Process Isolation Strategies + +The system supports multiple isolation levels configured via `InterpreterOption`: + +| Mode | Scope | Process Sharing | Use Case | +|------|-------|-----------------|----------| +| `shared_process` | Global | All users, all notes | Development, testing | +| `scoped` per user | Per user | User's notes only | Multi-tenant deployment | +| `scoped` per note | Per note | Single note only | Production isolation | +| `isolated` | Per execution | No sharing | Maximum security | + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSetting.java:410-422](), [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterOption.java:1-200]() + +### Launcher System + +Different launcher implementations handle various 
deployment scenarios: + +```mermaid +graph TB + LauncherFactory["Launcher Selection Logic"] + + subgraph "Launcher Implementations" + Standard["StandardInterpreterLauncher
Local execution"] + Spark["SparkInterpreterLauncher
Spark-specific setup"] + K8s["K8sStandardInterpreterLauncher
Kubernetes pods"] + Docker["DockerInterpreterLauncher
Docker containers"] + Yarn["YarnInterpreterLauncher
YARN containers"] + end + + subgraph "Selection Criteria" + RunMode["conf.getRunMode()"] + Group["interpreterGroup"] + Props["launcher properties"] + end + + RunMode --> LauncherFactory + Group --> LauncherFactory + Props --> LauncherFactory + + LauncherFactory --> Standard + LauncherFactory --> Spark + LauncherFactory --> K8s + LauncherFactory --> Docker + LauncherFactory --> Yarn +``` + +The launcher selection logic in `InterpreterSetting.getLauncherPlugin()` determines the appropriate launcher based on deployment configuration and interpreter type. + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSetting.java:763-791](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/launcher/SparkInterpreterLauncher.java:47-60]() + +## JDBC Interpreter Deep Dive + +The `JDBCInterpreter` demonstrates the interpreter pattern with sophisticated connection management, SQL execution, and database-specific optimizations. + +### Connection Pool Architecture + +```mermaid +graph TB + subgraph "User Layer" + User1["User 1"] + User2["User 2"] + end + + subgraph "JDBC Interpreter" + JDBCInterp["JDBCInterpreter"] + UserConfigs["Map<String, JDBCUserConfigurations>
Per-user connection pools"] + end + + subgraph "Connection Pools" + Pool1["PoolingDriver
user1-dbprefix"] + Pool2["PoolingDriver
user2-dbprefix"] + Generic["GenericObjectPool
Connection pooling"] + end + + subgraph "Database Connections" + Conn1["Connection 1"] + Conn2["Connection 2"] + Conn3["Connection 3"] + end + + User1 --> JDBCInterp + User2 --> JDBCInterp + JDBCInterp --> UserConfigs + UserConfigs --> Pool1 + UserConfigs --> Pool2 + Pool1 --> Generic + Pool2 --> Generic + Generic --> Conn1 + Generic --> Conn2 + Generic --> Conn3 +``` + +### Multi-Database Configuration + +The JDBC interpreter supports multiple database connections through property prefixing: + +```properties +# Default database +default.driver=org.postgresql.Driver +default.url=jdbc:postgresql://localhost:5432/ +default.user=postgres + +# MySQL database +mysql.driver=com.mysql.cj.jdbc.Driver +mysql.url=jdbc:mysql://localhost:3306/ +mysql.user=mysql_user + +# Oracle database +oracle.driver=oracle.jdbc.driver.OracleDriver +oracle.url=jdbc:oracle:thin:@localhost:1521:xe +oracle.user=oracle_user +``` + +Users can specify the database prefix in paragraph local properties: +- `%jdbc(db=mysql)` - Use MySQL connection +- `%jdbc(db=oracle)` - Use Oracle connection +- `%jdbc` - Use default connection + +Sources: [jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java:219-234](), [jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java:466-472]() + +### SQL Processing Pipeline + +The JDBC interpreter implements a sophisticated SQL processing pipeline: + +1. **SQL Splitting**: Parse multiple statements separated by semicolons +2. **Validation**: Optional pre-execution validation via external service +3. **Execution**: Statement execution with timeout and monitoring +4. **Result Processing**: Format results for display with pagination + +```mermaid +graph LR + Input["SQL Input"] + Splitter["SqlSplitter
Parse statements"] + Validator["ValidationRequest
Pre-execution check"] + Executor["Statement.execute()"] + Formatter["ResultSet formatting"] + Output["Table output"] + + Input --> Splitter + Splitter --> Validator + Validator --> Executor + Executor --> Formatter + Formatter --> Output +``` + +The validation system integrates with external services for query analysis and security checks, supporting fail-fast mechanisms for queries missing partition filters or accessing deprecated tables. + +Sources: [jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java:827-839](), [jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java:872-924](), [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/util/SqlSplitter.java:1-200]() + +## Spark Interpreter Integration + +The Spark interpreter showcases advanced launcher integration with environment detection and configuration management. + +### Scala Version Detection + +The `SparkInterpreterLauncher` automatically detects the Spark Scala version to ensure compatibility: + +```mermaid +graph TB + Detection["detectSparkScalaVersion()"] + + subgraph "Detection Methods" + SparkSubmit["spark-submit --version
Parse output for Scala version"] + ClassPath["Examine JAR files
Check for repl classes"] + end + + subgraph "Version Mapping" + Scala211["2.11.x → '2.11'"] + Scala212["2.12.x → '2.12'"] + Scala213["2.13.x → '2.13'"] + end + + Detection --> SparkSubmit + SparkSubmit --> Scala211 + SparkSubmit --> Scala212 + SparkSubmit --> Scala213 + + Detection --> ClassPath + ClassPath --> Scala211 +``` + +This version information drives JAR selection and classpath construction for proper Spark integration. + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/launcher/SparkInterpreterLauncher.java:268-293](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/launcher/SparkInterpreterLauncher.java:295-328]() + +### Environment Configuration + +The Spark launcher builds comprehensive environment configurations: + +| Environment Variable | Purpose | Configuration Source | +|---------------------|---------|---------------------| +| `SPARK_HOME` | Spark installation directory | Interpreter properties or `zeppelin-env.sh` | +| `SPARK_CONF_DIR` | Spark configuration directory | Derived from `SPARK_HOME` or explicit setting | +| `HADOOP_CONF_DIR` | Hadoop configuration | Required for YARN mode | +| `ZEPPELIN_SPARK_CONF` | Spark properties | Generated from interpreter properties | +| `HADOOP_USER_NAME` | User impersonation | Set when `zeppelin.spark.run.asLoginUser=true` | + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/launcher/SparkInterpreterLauncher.java:215-264]() diff --git a/.cursor/documentation/interpreters/jdbc_interpreter.md b/.cursor/documentation/interpreters/jdbc_interpreter.md new file mode 100644 index 00000000000..ace65f63cd8 --- /dev/null +++ b/.cursor/documentation/interpreters/jdbc_interpreter.md @@ -0,0 +1,366 @@ +# JDBC Interpreter + +
+Relevant source files + +The following files were used as context for generating this wiki page: + +- [docs/assets/themes/zeppelin/img/docs-img/jdbc_refresh.gif](docs/assets/themes/zeppelin/img/docs-img/jdbc_refresh.gif) +- [docs/interpreter/jdbc.md](docs/interpreter/jdbc.md) +- [docs/quickstart/sql_with_zeppelin.md](docs/quickstart/sql_with_zeppelin.md) +- [jdbc/pom.xml](jdbc/pom.xml) +- [jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java](jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java) +- [jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCUserConfigurations.java](jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCUserConfigurations.java) +- [jdbc/src/main/java/org/apache/zeppelin/jdbc/SqlCompleter.java](jdbc/src/main/java/org/apache/zeppelin/jdbc/SqlCompleter.java) +- [jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java](jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationRequest.java) +- [jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java](jdbc/src/main/java/org/apache/zeppelin/jdbc/ValidationResponse.java) +- [jdbc/src/main/java/org/apache/zeppelin/jdbc/hive/BeelineInPlaceUpdateStream.java](jdbc/src/main/java/org/apache/zeppelin/jdbc/hive/BeelineInPlaceUpdateStream.java) +- [jdbc/src/main/java/org/apache/zeppelin/jdbc/hive/HiveUtils.java](jdbc/src/main/java/org/apache/zeppelin/jdbc/hive/HiveUtils.java) +- [jdbc/src/main/java/org/apache/zeppelin/jdbc/hive/ProgressBar.java](jdbc/src/main/java/org/apache/zeppelin/jdbc/hive/ProgressBar.java) +- [jdbc/src/main/java/org/apache/zeppelin/jdbc/hive/YarnUtil.java](jdbc/src/main/java/org/apache/zeppelin/jdbc/hive/YarnUtil.java) +- [jdbc/src/main/java/org/apache/zeppelin/jdbc/security/JDBCSecurityImpl.java](jdbc/src/main/java/org/apache/zeppelin/jdbc/security/JDBCSecurityImpl.java) +- [jdbc/src/main/resources/interpreter-setting.json](jdbc/src/main/resources/interpreter-setting.json) +- 
[jdbc/src/test/java/org/apache/zeppelin/jdbc/JDBCInterpreterInterpolationTest.java](jdbc/src/test/java/org/apache/zeppelin/jdbc/JDBCInterpreterInterpolationTest.java) +- [jdbc/src/test/java/org/apache/zeppelin/jdbc/JDBCInterpreterTest.java](jdbc/src/test/java/org/apache/zeppelin/jdbc/JDBCInterpreterTest.java) +- [jdbc/src/test/java/org/apache/zeppelin/jdbc/SqlCompleterTest.java](jdbc/src/test/java/org/apache/zeppelin/jdbc/SqlCompleterTest.java) +- [jdbc/src/test/java/org/apache/zeppelin/jdbc/hive/HiveUtilsTest.java](jdbc/src/test/java/org/apache/zeppelin/jdbc/hive/HiveUtilsTest.java) +- [jdbc/src/test/resources/log4j.properties](jdbc/src/test/resources/log4j.properties) +- [kotlin/src/test/java/org/apache/zeppelin/kotlin/KotlinInterpreterTest.java](kotlin/src/test/java/org/apache/zeppelin/kotlin/KotlinInterpreterTest.java) +- [livy/src/main/java/org/apache/zeppelin/livy/SessionDeadException.java](livy/src/main/java/org/apache/zeppelin/livy/SessionDeadException.java) +- [spark/interpreter/src/test/java/org/apache/zeppelin/spark/KotlinSparkInterpreterTest.java](spark/interpreter/src/test/java/org/apache/zeppelin/spark/KotlinSparkInterpreterTest.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/completer/CachedCompleter.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/completer/CachedCompleter.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/completer/StringsCompleter.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/completer/StringsCompleter.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterException.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterException.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/util/SqlSplitter.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/util/SqlSplitter.java) +- 
[zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/util/SqlSplitterTest.java](zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/util/SqlSplitterTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/mock/MockInterpreter1.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/mock/MockInterpreter1.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/mock/MockInterpreter2.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/mock/MockInterpreter2.java) + +
+ + + +The JDBC Interpreter provides database connectivity for Apache Zeppelin notebooks, enabling users to execute SQL queries against any JDBC-compatible database. This interpreter supports a wide range of databases including PostgreSQL, MySQL, Apache Hive, Presto/Trino, and many others through standard JDBC drivers. + +For information about other SQL interpreters in Zeppelin, see [Spark Interpreters](#5.2) for SparkSQL support, [Flink Interpreter](#5.5) for Flink SQL, and [Python Interpreter](#5.3) for pandasSQL. + +## Architecture Overview + +The JDBC Interpreter follows Zeppelin's standard interpreter architecture with specialized components for database connectivity, user authentication, and SQL processing. + +```mermaid +graph TB + User["User Interface"] --> WebSocket["NotebookServer WebSocket"] + WebSocket --> InterpreterFactory["InterpreterFactory"] + InterpreterFactory --> JDBCInterpreter["JDBCInterpreter"] + + JDBCInterpreter --> SqlSplitter["SqlSplitter"] + JDBCInterpreter --> JDBCUserConfigurations["JDBCUserConfigurations"] + JDBCInterpreter --> SqlCompleter["SqlCompleter"] + JDBCInterpreter --> JDBCSecurityImpl["JDBCSecurityImpl"] + + JDBCUserConfigurations --> PoolingDriver["Apache DBCP2 PoolingDriver"] + PoolingDriver --> Database["JDBC Database"] + + JDBCInterpreter --> HiveUtils["HiveUtils"] + HiveUtils --> ProgressBar["ProgressBar"] + HiveUtils --> YarnUtil["YarnUtil"] +``` + +Sources: [jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java:108](), [jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCUserConfigurations.java:30](), [jdbc/src/main/java/org/apache/zeppelin/jdbc/SqlCompleter.java:41]() + +## Core Components + +### JDBCInterpreter Class + +The `JDBCInterpreter` class extends `KerberosInterpreter` and serves as the main entry point for SQL execution. It manages database connections, user configurations, and SQL processing. 
+ +```mermaid +classDiagram + class JDBCInterpreter { + -HashMap~String,Properties~ basePropertiesMap + -HashMap~String,JDBCUserConfigurations~ jdbcUserConfigurationsMap + -HashMap~String,SqlCompleter~ sqlCompletersMap + -SqlSplitter sqlSplitter + +interpret(String, InterpreterContext) InterpreterResult + +getConnection(String, InterpreterContext) Connection + +executeSql(String, String, InterpreterContext) InterpreterResult + +completion(String, int, InterpreterContext) List~InterpreterCompletion~ + } + + class JDBCUserConfigurations { + -Map~String,Statement~ paragraphIdStatementMap + -Map~String,PoolingDriver~ poolingDriverMap + -HashMap~String,Properties~ propertiesMap + +saveStatement(String, Statement) + +cancelStatement(String) + +saveDBDriverPool(String, PoolingDriver) + } + + class SqlCompleter { + -CachedCompleter schemasCompleter + -Map~String,CachedCompleter~ tablesCompleters + -Map~String,CachedCompleter~ columnsCompleters + +complete(String, int, List~InterpreterCompletion~) + +createOrUpdateFromConnection(Connection, String, String, int) + } + + JDBCInterpreter --> JDBCUserConfigurations + JDBCInterpreter --> SqlCompleter +``` + +Sources: [jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java:166-178](), [jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCUserConfigurations.java:31-44](), [jdbc/src/main/java/org/apache/zeppelin/jdbc/SqlCompleter.java:60-78]() + +### Connection Management + +The interpreter uses Apache Commons DBCP2 for connection pooling with per-user connection isolation. Each user maintains separate connection pools for different database prefixes. 
+ +| Component | Purpose | Configuration | +|-----------|---------|---------------| +| `PoolingDriver` | JDBC connection pooling | `maxTotal`, `maxIdle`, `minIdle` | +| `JDBCUserConfigurations` | Per-user connection state | User credentials, properties | +| `ConnectionFactory` | Connection creation | Driver-specific properties | + +Sources: [jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java:524-563](), [jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCUserConfigurations.java:89-105]() + +## Configuration Properties + +The JDBC Interpreter supports extensive configuration through properties defined in `interpreter-setting.json`: + +### Database Connection Properties + +| Property | Default | Description | +|----------|---------|-------------| +| `default.url` | `jdbc:postgresql://localhost:5432/` | JDBC URL | +| `default.user` | `gpadmin` | Database username | +| `default.password` | (empty) | Database password | +| `default.driver` | `org.postgresql.Driver` | JDBC driver class | + +### Execution Control Properties + +| Property | Default | Description | +|----------|---------|-------------| +| `common.max_count` | `1000` | Maximum result rows to display | +| `zeppelin.jdbc.maxRows` | `1000` | Maximum rows fetched from query | +| `zeppelin.jdbc.concurrent.use` | `true` | Enable parallel execution | +| `zeppelin.jdbc.concurrent.max_connection` | `10` | Concurrent connection limit | + +### Security Properties + +| Property | Default | Description | +|----------|---------|-------------| +| `zeppelin.jdbc.auth.type` | (empty) | Authentication type (SIMPLE, KERBEROS) | +| `zeppelin.jdbc.principal` | (empty) | Kerberos principal | +| `zeppelin.jdbc.keytab.location` | (empty) | Kerberos keytab path | + +Sources: [jdbc/src/main/resources/interpreter-setting.json:7-153](), [jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java:111-154]() + +## SQL Execution Flow + +The SQL execution process involves statement splitting, validation, and result processing: + +```mermaid 
+sequenceDiagram + participant User + participant JDBCInterpreter + participant SqlSplitter + participant ValidationService + participant Database + participant HiveUtils + + User ->> JDBCInterpreter: interpret(sql, context) + JDBCInterpreter ->> SqlSplitter: splitSql(sql) + SqlSplitter -->> JDBCInterpreter: List#lt;String#gt; statements + + loop For each SQL statement + JDBCInterpreter ->> ValidationService: sendValidationRequest() + ValidationService -->> JDBCInterpreter: ValidationResponse + + alt Validation passed + JDBCInterpreter ->> Database: execute(statement) + + alt Hive connection + JDBCInterpreter ->> HiveUtils: startHiveMonitorThread() + HiveUtils ->> ProgressBar: monitor progress + end + + Database -->> JDBCInterpreter: ResultSet + JDBCInterpreter ->> JDBCInterpreter: getResults(resultSet) + else Validation failed + JDBCInterpreter -->> User: Error response + end + end + + JDBCInterpreter -->> User: InterpreterResult +``` + +Sources: [jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java:800-924](), [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/util/SqlSplitter.java:79-178](), [jdbc/src/main/java/org/apache/zeppelin/jdbc/hive/HiveUtils.java:58-121]() + +### SQL Statement Processing + +The `SqlSplitter` class handles multi-statement SQL parsing with support for comments and string literals: + +```mermaid +graph LR + Input["SQL Text"] --> SqlSplitter["SqlSplitter.splitSql()"] + SqlSplitter --> Parser["Parse characters"] + Parser --> Comments["Handle comments
-- and /* */"] + Parser --> Strings["Handle string literals
' and #quot;"] + Parser --> Semicolon["Split on semicolons"] + Semicolon --> Refine["Refine statements
preserve line numbers"] + Refine --> Output["List#lt;String#gt; statements"] +``` + +Sources: [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/util/SqlSplitter.java:79-178](), [zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/util/SqlSplitterTest.java]() + +## Auto-Completion System + +The `SqlCompleter` provides intelligent SQL auto-completion with database metadata caching: + +### Completion Components + +```mermaid +graph TB + SqlCompleter["SqlCompleter"] --> CachedCompleter["CachedCompleter"] + CachedCompleter --> StringsCompleter["StringsCompleter"] + + SqlCompleter --> SchemaCompleter["schemasCompleter"] + SqlCompleter --> TableCompleters["tablesCompleters Map"] + SqlCompleter --> ColumnCompleters["columnsCompleters Map"] + SqlCompleter --> KeywordCompleter["keywordCompleter"] + + SchemaCompleter --> DatabaseMeta["DatabaseMetaData.getSchemas()"] + TableCompleters --> TableMeta["DatabaseMetaData.getTables()"] + ColumnCompleters --> ColumnMeta["DatabaseMetaData.getColumns()"] + KeywordCompleter --> SQLKeywords["SQL Keywords + Driver Functions"] +``` + +### Completion Types + +| Completion Type | Trigger | Cache Key | TTL | +|----------------|---------|-----------|-----| +| Keywords | Any SQL context | Global | Permanent | +| Schemas | After `FROM`, `JOIN` | Global | `completer.ttlInSeconds` | +| Tables | After schema prefix | Schema name | `completer.ttlInSeconds` | +| Columns | After table reference | `schema.table` | `completer.ttlInSeconds` | + +Sources: [jdbc/src/main/java/org/apache/zeppelin/jdbc/SqlCompleter.java:277-343](), [jdbc/src/main/java/org/apache/zeppelin/jdbc/SqlCompleter.java:60-83]() + +## Authentication and Security + +### Kerberos Authentication + +The `JDBCSecurityImpl` class handles Kerberos authentication for secure database connections: + +```mermaid +graph TD + Auth["Authentication Request"] --> AuthType["Check auth.type property"] + AuthType --> Simple["SIMPLE"] + AuthType --> Kerberos["KERBEROS"] + + Simple --> 
DirectConnection["Direct JDBC Connection"] + + Kerberos --> KeytabCheck["Check keytab/principal"] + KeytabCheck --> Login["UserGroupInformation.loginUserFromKeytab()"] + Login --> ProxyCheck["Check proxy.user.property"] + ProxyCheck --> ProxyUser["Create proxy user"] + ProxyCheck --> DirectKerberos["Direct Kerberos connection"] + ProxyUser --> SecureConnection["Secure JDBC Connection"] + DirectKerberos --> SecureConnection +``` + +### User Impersonation + +For multi-user environments, the interpreter supports user impersonation through: + +- **Credential Management**: Per-user username/password storage +- **Proxy Users**: Kerberos proxy user creation +- **Connection Isolation**: Separate connection pools per user + +Sources: [jdbc/src/main/java/org/apache/zeppelin/jdbc/security/JDBCSecurityImpl.java:41-82](), [jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java:593-631]() + +## Hive-Specific Features + +### Progress Monitoring + +For Apache Hive connections, the interpreter provides enhanced monitoring through `HiveUtils`: + +```mermaid +graph LR + HiveStmt["HiveStatement"] --> Monitor["HiveMonitorThread"] + Monitor --> Logs["Query Logs"] + Monitor --> Progress["Progress Updates"] + + Logs --> JobURL["Extract Job URL"] + JobURL --> MRJob["MapReduce Job URL"] + JobURL --> TezApp["Tez Application ID"] + + Progress --> ProgressBar["InPlaceUpdateStream"] + ProgressBar --> Display["Real-time Progress Display"] +``` + +### Supported Hive Features + +| Feature | Implementation | Requirements | +|---------|----------------|--------------| +| Progress Bar | `ProgressBar` + `BeelineInPlaceUpdateStream` | Hive ≥ 2.3 | +| Job Monitoring | `HiveMonitorThread` | Hive JDBC driver | +| Query Logs | `HiveStatement.getQueryLog()` | Enabled in properties | +| Application Tags | URL parameter injection | `hive.engines.tag.enable` | + +Sources: [jdbc/src/main/java/org/apache/zeppelin/jdbc/hive/HiveUtils.java:58-121](), 
[jdbc/src/main/java/org/apache/zeppelin/jdbc/hive/ProgressBar.java:32-54](), [jdbc/src/main/java/org/apache/zeppelin/jdbc/hive/BeelineInPlaceUpdateStream.java:40-77]() + +## Connection Pooling Configuration + +The interpreter uses Apache Commons DBCP2 with extensive pooling configuration: + +### Pool Properties + +| Property | Type | Description | Default | +|----------|------|-------------|---------| +| `testOnBorrow` | Boolean | Validate connections before use | `false` | +| `testWhileIdle` | Boolean | Validate idle connections | `false` | +| `maxTotal` | Integer | Maximum active connections | `-1` (unlimited) | +| `maxIdle` | Integer | Maximum idle connections | `8` | +| `minIdle` | Integer | Minimum idle connections | `0` | +| `maxWaitMillis` | Long | Connection wait timeout | `-1` (indefinite) | + +### Pool Lifecycle + +```mermaid +stateDiagram-v2 + [*] --> Creating + Creating --> Active: getConnection() + Active --> Idle: return connection + Idle --> Active: reuse connection + Idle --> Destroyed: eviction/timeout + Active --> Destroyed: connection error + Destroyed --> [*] + + Idle --> Validated: testWhileIdle + Validated --> Active: validation success + Validated --> Destroyed: validation failure +``` + +Sources: [jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java:500-522](), [jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java:545-562]() + +## Dependencies and Build Configuration + +The JDBC interpreter includes core database drivers and security dependencies: + +### Core Dependencies + +- **PostgreSQL Driver**: `org.postgresql:postgresql:42.3.3` (included by default) +- **Connection Pooling**: `org.apache.commons:commons-dbcp2:2.0.1` +- **Hadoop Security**: `org.apache.hadoop:hadoop-common` (for Kerberos) +- **Hive Support**: `org.apache.hive:hive-jdbc:2.3.4` (provided scope) + +### Build Profiles + +| Profile | Purpose | Additional Dependencies | +|---------|---------|------------------------| +| `jdbc-phoenix` | Apache Phoenix 
support | `phoenix-core` | +| `jdbc-hadoop2` | Hadoop 2.x compatibility | `hadoop-common:2.7.x` | +| `jdbc-hadoop3` | Hadoop 3.x compatibility | `hadoop-common:3.0.x` | + +Sources: [jdbc/pom.xml:47-208](), [jdbc/pom.xml:233-374]() diff --git a/.cursor/documentation/interpreters/livy_interpreter.md b/.cursor/documentation/interpreters/livy_interpreter.md new file mode 100644 index 00000000000..258dcec85a3 --- /dev/null +++ b/.cursor/documentation/interpreters/livy_interpreter.md @@ -0,0 +1,314 @@ +# Livy Interpreter + +
+Relevant source files + +The following files were used as context for generating this wiki page: + +- [docs/interpreter/livy.md](docs/interpreter/livy.md) +- [livy/src/main/java/org/apache/zeppelin/livy/BaseLivyInterpreter.java](livy/src/main/java/org/apache/zeppelin/livy/BaseLivyInterpreter.java) +- [livy/src/main/java/org/apache/zeppelin/livy/LivyException.java](livy/src/main/java/org/apache/zeppelin/livy/LivyException.java) +- [livy/src/main/java/org/apache/zeppelin/livy/LivyPySpark3Interpreter.java](livy/src/main/java/org/apache/zeppelin/livy/LivyPySpark3Interpreter.java) +- [livy/src/main/java/org/apache/zeppelin/livy/LivyPySparkBaseInterpreter.java](livy/src/main/java/org/apache/zeppelin/livy/LivyPySparkBaseInterpreter.java) +- [livy/src/main/java/org/apache/zeppelin/livy/LivyPySparkInterpreter.java](livy/src/main/java/org/apache/zeppelin/livy/LivyPySparkInterpreter.java) +- [livy/src/main/java/org/apache/zeppelin/livy/LivySharedInterpreter.java](livy/src/main/java/org/apache/zeppelin/livy/LivySharedInterpreter.java) +- [livy/src/main/java/org/apache/zeppelin/livy/LivySparkInterpreter.java](livy/src/main/java/org/apache/zeppelin/livy/LivySparkInterpreter.java) +- [livy/src/main/java/org/apache/zeppelin/livy/LivySparkRInterpreter.java](livy/src/main/java/org/apache/zeppelin/livy/LivySparkRInterpreter.java) +- [livy/src/main/java/org/apache/zeppelin/livy/LivySparkSQLInterpreter.java](livy/src/main/java/org/apache/zeppelin/livy/LivySparkSQLInterpreter.java) +- [livy/src/main/java/org/apache/zeppelin/livy/LivyVersion.java](livy/src/main/java/org/apache/zeppelin/livy/LivyVersion.java) +- [livy/src/main/java/org/apache/zeppelin/livy/SessionNotFoundException.java](livy/src/main/java/org/apache/zeppelin/livy/SessionNotFoundException.java) +- [livy/src/main/resources/interpreter-setting.json](livy/src/main/resources/interpreter-setting.json) +- 
[livy/src/test/java/org/apache/zeppelin/livy/LivyInterpreterIT.java](livy/src/test/java/org/apache/zeppelin/livy/LivyInterpreterIT.java) +- [livy/src/test/java/org/apache/zeppelin/livy/LivySQLInterpreterTest.java](livy/src/test/java/org/apache/zeppelin/livy/LivySQLInterpreterTest.java) +- [livy/src/test/resources/livy_tutorial_1.scala](livy/src/test/resources/livy_tutorial_1.scala) + +
+ + + +The Livy Interpreter provides remote execution of Apache Spark code through Apache Livy's REST interface. It enables Zeppelin to execute Spark applications (Scala, Python, R, SQL) on remote clusters without requiring direct Spark dependencies in the Zeppelin server process. This interpreter supports multiple language runtimes within shared Spark sessions and provides features like session management, user impersonation, and resource configuration. + +For information about direct Spark interpreters that run in the same JVM as Zeppelin, see [Spark Interpreters](#5.2). + +## Architecture Overview + +The Livy interpreter consists of a hierarchy of interpreter classes that communicate with Apache Livy server via REST API to execute code remotely on Spark clusters. + +```mermaid +graph TB + subgraph "Zeppelin Server Process" + BaseLivy["BaseLivyInterpreter
Base Class"] + SparkInterp["LivySparkInterpreter
spark session kind"] + SQLInterp["LivySparkSQLInterpreter
sql execution"] + PySparkInterp["LivyPySparkInterpreter
pyspark session kind"] + PySpark3Interp["LivyPySpark3Interpreter
pyspark3 session kind"] + SparkRInterp["LivySparkRInterpreter
sparkr session kind"] + SharedInterp["LivySharedInterpreter
shared session kind"] + end + + subgraph "Apache Livy Server" + LivyREST["REST API
/sessions, /statements"] + SessionMgr["Session Management
Session Creation & Lifecycle"] + end + + subgraph "Remote Spark Cluster" + SparkContext["SparkContext
Shared Across Languages"] + ScalaREPL["Scala REPL"] + PythonKernel["Python Kernel"] + RKernel["R Kernel"] + end + + BaseLivy --> SparkInterp + BaseLivy --> SQLInterp + BaseLivy --> PySparkInterp + BaseLivy --> PySpark3Interp + BaseLivy --> SparkRInterp + BaseLivy --> SharedInterp + + SparkInterp --> LivyREST + SQLInterp --> LivyREST + PySparkInterp --> LivyREST + SharedInterp --> LivyREST + + LivyREST --> SessionMgr + SessionMgr --> SparkContext + SparkContext --> ScalaREPL + SparkContext --> PythonKernel + SparkContext --> RKernel +``` + +**Sources:** [livy/src/main/java/org/apache/zeppelin/livy/BaseLivyInterpreter.java:85-106](), [livy/src/main/resources/interpreter-setting.json:1-270]() + +## Interpreter Implementations + +### Base Interpreter Class + +`BaseLivyInterpreter` provides the core functionality for communicating with Livy server, including session management, REST API calls, and result processing. + +```mermaid +graph TB + subgraph "BaseLivyInterpreter Core Components" + SessionInfo["SessionInfo
sessionInfo field"] + RestTemplate["RestTemplate
restTemplate field"] + LivyVersion["LivyVersion
livyVersion field"] + Properties["Properties
configuration"] + end + + subgraph "Key Methods" + Open["open()
Initialize session"] + Interpret["interpret()
Execute code"] + CreateSession["createSession()
Session creation"] + CallRestAPI["callRestAPI()
HTTP communication"] + GetProgress["getProgress()
Execution progress"] + Cancel["cancel()
Job cancellation"] + end + + SessionInfo --> Open + RestTemplate --> CallRestAPI + LivyVersion --> Open + Properties --> CreateSession + + Open --> CreateSession + Interpret --> CallRestAPI + CreateSession --> CallRestAPI +``` + +**Sources:** [livy/src/main/java/org/apache/zeppelin/livy/BaseLivyInterpreter.java:85-134](), [livy/src/main/java/org/apache/zeppelin/livy/BaseLivyInterpreter.java:301-340]() + +### Language-Specific Implementations + +| Interpreter Class | Session Kind | Purpose | App ID Extraction | +|-------------------|--------------|---------|-------------------| +| `LivySparkInterpreter` | `spark` | Scala/Spark execution | `sc.applicationId` | +| `LivySparkSQLInterpreter` | `spark` | SQL query execution | Delegates to `LivySparkInterpreter` | +| `LivyPySparkInterpreter` | `pyspark` | Python/PySpark execution | `sc.applicationId` | +| `LivyPySpark3Interpreter` | `pyspark3` | Python 3/PySpark execution | `sc.applicationId` | +| `LivySparkRInterpreter` | `sparkr` | R/SparkR execution | Not implemented | +| `LivySharedInterpreter` | `shared` | Multi-language session sharing | Not applicable | + +**Sources:** [livy/src/main/java/org/apache/zeppelin/livy/LivySparkInterpreter.java:25-34](), [livy/src/main/java/org/apache/zeppelin/livy/LivySparkSQLInterpreter.java:49-75](), [livy/src/main/java/org/apache/zeppelin/livy/LivyPySparkInterpreter.java:26-35]() + +### SQL Interpreter Special Features + +`LivySparkSQLInterpreter` has unique functionality for SQL result processing: + +- **Result Parsing**: Converts Spark SQL tabular output to Zeppelin table format +- **UTF Character Support**: Handles international characters in table data when the `zeppelin.livy.tableWithUTFCharacter` property is enabled (read via `tableWithUTFCharacter()`) +- **Field Truncation**: Configurable via `ZEPPELIN_LIVY_SPARK_SQL_FIELD_TRUNCATE` +- **Concurrent Execution**: Optional parallel SQL execution when the `zeppelin.livy.concurrentSQL` property is enabled (read via `concurrentSQL()`) + +**Sources:** [livy/src/main/java/org/apache/zeppelin/livy/LivySparkSQLInterpreter.java:115-172](),
[livy/src/main/java/org/apache/zeppelin/livy/LivySparkSQLInterpreter.java:377-398]() + +## Session Management + +### Session Lifecycle + +```mermaid +sequenceDiagram + participant ZepUser as "Zeppelin User" + participant Interp as "BaseLivyInterpreter" + participant Livy as "Livy REST API" + participant Spark as "Spark Cluster" + + ZepUser->>Interp: "Execute Code" + Interp->>Interp: "open()" + Interp->>Livy: "POST /sessions
{kind, proxyUser, conf}" + Livy->>Spark: "Create Spark Session" + Spark-->>Livy: "Session Created" + Livy-->>Interp: "SessionInfo{id, state, appId}" + + loop "Wait for Ready State" + Interp->>Livy: "GET /sessions/{id}" + Livy-->>Interp: "SessionInfo{state}" + end + + Interp->>Livy: "POST /sessions/{id}/statements
{code, kind}" + Livy->>Spark: "Execute Statement" + + loop "Poll Statement Status" + Interp->>Livy: "GET /sessions/{id}/statements/{stmtId}" + Livy-->>Interp: "StatementInfo{state, progress, output}" + end + + Spark-->>Livy: "Statement Result" + Livy-->>Interp: "StatementInfo{output, data}" + Interp-->>ZepUser: "InterpreterResult" +``` + +**Sources:** [livy/src/main/java/org/apache/zeppelin/livy/BaseLivyInterpreter.java:301-340](), [livy/src/main/java/org/apache/zeppelin/livy/BaseLivyInterpreter.java:351-430]() + +### Session Recovery and Error Handling + +The interpreter handles session failures through multiple mechanisms: + +- **Session Expiration**: Detects expired sessions via `SessionNotFoundException` and recreates them automatically +- **Session Death**: Optionally restarts dead sessions when `zeppelin.livy.restart_dead_session` is enabled +- **Session Creation Timeout**: Configurable via `zeppelin.livy.session.create_timeout` + +**Sources:** [livy/src/main/java/org/apache/zeppelin/livy/BaseLivyInterpreter.java:372-399](), [livy/src/main/java/org/apache/zeppelin/livy/BaseLivyInterpreter.java:452-461]() + +### Shared Sessions (Livy 0.5+) + +`LivySharedInterpreter` enables sharing a single SparkContext across multiple language interpreters when the Livy server version supports it (0.5.0 or later): + +```mermaid +graph LR + subgraph "Shared Session Architecture" + SharedInterp["LivySharedInterpreter
Session Owner"] + SparkInterp["LivySparkInterpreter
Delegates to Shared"] + SQLInterp["LivySparkSQLInterpreter
Delegates to Shared"] + PySparkInterp["LivyPySparkInterpreter
Delegates to Shared"] + end + + subgraph "Single Livy Session" + LivySession["Livy Session
kind=shared"] + SparkCtx["Shared SparkContext
Accessible from all languages"] + end + + SparkInterp --> SharedInterp + SQLInterp --> SharedInterp + PySparkInterp --> SharedInterp + SharedInterp --> LivySession + LivySession --> SparkCtx +``` + +**Sources:** [livy/src/main/java/org/apache/zeppelin/livy/LivySharedInterpreter.java:34-74](), [livy/src/main/java/org/apache/zeppelin/livy/BaseLivyInterpreter.java:158-165]() + +## Configuration + +### Core Connection Properties + +| Property | Default | Purpose | +|----------|---------|---------| +| `zeppelin.livy.url` | `http://localhost:8998` | Livy server endpoint | +| `zeppelin.livy.session.create_timeout` | `120` | Session creation timeout (seconds) | +| `zeppelin.livy.pull_status.interval.millis` | `1000` | Status polling interval | +| `zeppelin.livy.maxLogLines` | `1000` | Maximum log lines to retrieve | + +### Spark Resource Configuration + +All Spark configuration properties can be passed with `livy.spark.` prefix: + +| Property | Purpose | +|----------|---------| +| `livy.spark.driver.cores` | Driver CPU cores | +| `livy.spark.driver.memory` | Driver memory allocation | +| `livy.spark.executor.instances` | Number of executors | +| `livy.spark.executor.cores` | Executor CPU cores | +| `livy.spark.executor.memory` | Executor memory allocation | +| `livy.spark.dynamicAllocation.enabled` | Enable dynamic resource allocation | + +**Sources:** [livy/src/main/resources/interpreter-setting.json:7-123](), [livy/src/main/java/org/apache/zeppelin/livy/BaseLivyInterpreter.java:304-311]() + +### Security Configuration + +The interpreter supports multiple authentication and security mechanisms: + +- **Kerberos Authentication**: Via `zeppelin.livy.principal` and `zeppelin.livy.keytab` +- **SSL/TLS**: Configurable truststore and keystore settings +- **Custom HTTP Headers**: Via `zeppelin.livy.http.headers` for additional authentication +- **User Impersonation**: Automatic when Zeppelin authentication is enabled + +**Sources:** 
[livy/src/main/java/org/apache/zeppelin/livy/BaseLivyInterpreter.java:619-677](), [livy/src/main/java/org/apache/zeppelin/livy/BaseLivyInterpreter.java:124-135]() + +## Code Execution Flow + +### Statement Execution Process + +```mermaid +graph TD + subgraph "Code Execution Pipeline" + Input["User Code Input"] + Validate["Code Validation"] + ExecuteReq["ExecuteRequest
{code, kind}"] + RestCall["REST API Call
POST /sessions/{id}/statements"] + PollStatus["Poll Statement Status
GET /sessions/{id}/statements/{stmtId}"] + ProcessResult["Process StatementInfo
Extract output data"] + FormatResult["Format InterpreterResult
Handle different output types"] + end + + subgraph "Result Types" + TextResult["Plain Text Output"] + TableResult["Table Data
application/livy-table-json"] + ImageResult["Image Data
image/png"] + ErrorResult["Error Output
evalue + traceback"] + end + + Input --> Validate + Validate --> ExecuteReq + ExecuteReq --> RestCall + RestCall --> PollStatus + PollStatus --> ProcessResult + ProcessResult --> FormatResult + + FormatResult --> TextResult + FormatResult --> TableResult + FormatResult --> ImageResult + FormatResult --> ErrorResult +``` + +**Sources:** [livy/src/main/java/org/apache/zeppelin/livy/BaseLivyInterpreter.java:351-430](), [livy/src/main/java/org/apache/zeppelin/livy/BaseLivyInterpreter.java:485-556]() + +### SQL Result Processing + +`LivySparkSQLInterpreter` implements specialized parsing for SQL output formats: + +- **Standard Table Format**: Parses ASCII table output with `parseSQLOutput()` +- **JSON Format**: Handles UTF characters via `parseSQLJsonOutput()` when `tableWithUTFCharacter` is enabled +- **Field Truncation**: Configurable truncation of long field values +- **Result Limits**: Enforced via `ZEPPELIN_LIVY_SPARK_SQL_MAX_RESULT` + +**Sources:** [livy/src/main/java/org/apache/zeppelin/livy/LivySparkSQLInterpreter.java:179-270](), [livy/src/main/java/org/apache/zeppelin/livy/LivySparkSQLInterpreter.java:124-158]() + +## Version Compatibility + +The interpreter adapts behavior based on Livy server version through `LivyVersion` class: + +| Feature | Minimum Version | Implementation | +|---------|----------------|----------------| +| Statement Cancellation | Livy 0.3.0 | `isCancelSupported()` | +| Progress Reporting | Livy 0.4.0 | `isGetProgressSupported()` | +| Shared Sessions | Livy 0.5.0 | `isSharedSupported()` | +| Code Completion | Livy 0.5.0 | `/completion` API endpoint | + +**Sources:** [livy/src/main/java/org/apache/zeppelin/livy/LivyVersion.java:75-85](), [livy/src/main/java/org/apache/zeppelin/livy/BaseLivyInterpreter.java:258-270]() diff --git a/.cursor/documentation/interpreters/python_interpreter.md b/.cursor/documentation/interpreters/python_interpreter.md new file mode 100644 index 00000000000..f97928cdede --- /dev/null +++ 
b/.cursor/documentation/interpreters/python_interpreter.md @@ -0,0 +1,258 @@ +# Python Interpreter + +
+Relevant source files + +The following files were used as context for generating this wiki page: + +- [docs/assets/themes/zeppelin/img/docs-img/python_pandas_sql.png](docs/assets/themes/zeppelin/img/docs-img/python_pandas_sql.png) +- [docs/assets/themes/zeppelin/img/docs-img/python_zshow_df.png](docs/assets/themes/zeppelin/img/docs-img/python_zshow_df.png) +- [docs/interpreter/python.md](docs/interpreter/python.md) +- [python/pom.xml](python/pom.xml) +- [python/src/main/java/org/apache/zeppelin/python/IPythonInterpreter.java](python/src/main/java/org/apache/zeppelin/python/IPythonInterpreter.java) +- [python/src/main/java/org/apache/zeppelin/python/PythonCondaInterpreter.java](python/src/main/java/org/apache/zeppelin/python/PythonCondaInterpreter.java) +- [python/src/main/java/org/apache/zeppelin/python/PythonDockerInterpreter.java](python/src/main/java/org/apache/zeppelin/python/PythonDockerInterpreter.java) +- [python/src/main/java/org/apache/zeppelin/python/PythonInterpreter.java](python/src/main/java/org/apache/zeppelin/python/PythonInterpreter.java) +- [python/src/main/java/org/apache/zeppelin/python/PythonInterpreterPandasSql.java](python/src/main/java/org/apache/zeppelin/python/PythonInterpreterPandasSql.java) +- [python/src/main/proto/ipython.proto](python/src/main/proto/ipython.proto) +- [python/src/main/resources/interpreter-setting.json](python/src/main/resources/interpreter-setting.json) +- [python/src/main/resources/output_templates/conda_usage.html](python/src/main/resources/output_templates/conda_usage.html) +- [python/src/main/resources/output_templates/docker_usage.html](python/src/main/resources/output_templates/docker_usage.html) +- [python/src/main/resources/python/zeppelin_python.py](python/src/main/resources/python/zeppelin_python.py) +- [python/src/test/java/org/apache/zeppelin/python/BasePythonInterpreterTest.java](python/src/test/java/org/apache/zeppelin/python/BasePythonInterpreterTest.java) +- 
[python/src/test/java/org/apache/zeppelin/python/IPythonInterpreterTest.java](python/src/test/java/org/apache/zeppelin/python/IPythonInterpreterTest.java) +- [python/src/test/java/org/apache/zeppelin/python/PythonCondaInterpreterTest.java](python/src/test/java/org/apache/zeppelin/python/PythonCondaInterpreterTest.java) +- [python/src/test/java/org/apache/zeppelin/python/PythonDockerInterpreterTest.java](python/src/test/java/org/apache/zeppelin/python/PythonDockerInterpreterTest.java) +- [python/src/test/java/org/apache/zeppelin/python/PythonInterpreterMatplotlibTest.java](python/src/test/java/org/apache/zeppelin/python/PythonInterpreterMatplotlibTest.java) +- [python/src/test/java/org/apache/zeppelin/python/PythonInterpreterPandasSqlTest.java](python/src/test/java/org/apache/zeppelin/python/PythonInterpreterPandasSqlTest.java) +- [python/src/test/java/org/apache/zeppelin/python/PythonInterpreterTest.java](python/src/test/java/org/apache/zeppelin/python/PythonInterpreterTest.java) +- [python/src/test/resources/log4j.properties](python/src/test/resources/log4j.properties) +- [spark/interpreter/src/test/resources/log4j.properties](spark/interpreter/src/test/resources/log4j.properties) +- [zeppelin-interpreter-integration/src/test/resources/log4j.properties](zeppelin-interpreter-integration/src/test/resources/log4j.properties) + +
+ + + +This document covers Zeppelin's Python interpreter system, which provides multiple Python execution environments and integration capabilities. The Python interpreter supports both vanilla Python and IPython execution, environment management through Conda and Docker, and advanced features like SQL over Pandas DataFrames. + +For information about the broader interpreter framework, see [Interpreter Framework](#2.3). For Spark-specific Python integration, see [Spark Interpreters](#5.2). + +## Architecture Overview + +Zeppelin's Python interpreter system consists of multiple interpreter implementations that can work together to provide different execution environments and capabilities. The system is designed around process separation and communication bridges between the JVM and Python processes. + +```mermaid +graph TB + subgraph "Zeppelin Server JVM" + PI["PythonInterpreter
Vanilla Python"] + IPI["IPythonInterpreter
Enhanced via Jupyter"] + PISP["PythonInterpreterPandasSql
SQL over DataFrames"] + PCI["PythonCondaInterpreter
Environment Management"] + PDI["PythonDockerInterpreter
Container Support"] + end + + subgraph "Python Process" + ZP["zeppelin_python.py
Main Script"] + ZC["zeppelin_context.py
ZeppelinContext Bridge"] + PY4J["Py4j Gateway
Java-Python Bridge"] + end + + subgraph "IPython/Jupyter Process" + JK["Jupyter Kernel
IPython Runtime"] + GRPC["gRPC Communication"] + end + + PI --> PY4J + IPI --> GRPC + PISP -.-> PI + PCI -.-> PI + PDI -.-> PI + + PY4J --> ZP + ZP --> ZC + GRPC --> JK + + PI -.->|"delegates to"| IPI +``` + +Sources: [python/src/main/java/org/apache/zeppelin/python/PythonInterpreter.java:52-77](), [python/src/main/java/org/apache/zeppelin/python/IPythonInterpreter.java:42-47](), [python/src/main/resources/interpreter-setting.json:1-97]() + +## Core Interpreter Implementations + +### PythonInterpreter - Vanilla Python Implementation + +The `PythonInterpreter` class provides the fundamental Python execution capability with minimal dependencies. It launches a separate Python process and communicates via Py4j gateway server. + +```mermaid +graph TB + subgraph "PythonInterpreter Lifecycle" + OPEN["open()
Check IPython availability"] + GATEWAY["createGatewayServerAndStartScript()
Start Py4j gateway"] + PYTHON["Launch Python process
zeppelin_python.py"] + EXECUTE["interpret()
Send code via gateway"] + CLOSE["close()
Shutdown processes"] + end + + subgraph "Process Communication" + JAVA["Java Side
GatewayServer"] + GATEWAY_COMM["Py4j Protocol
Method calls"] + PYTHON_SIDE["Python Side
Gateway client"] + end + + OPEN --> GATEWAY + GATEWAY --> PYTHON + PYTHON --> EXECUTE + EXECUTE --> CLOSE + + JAVA <--> GATEWAY_COMM + GATEWAY_COMM <--> PYTHON_SIDE +``` + +The interpreter can automatically delegate to `IPythonInterpreter` when IPython prerequisites are available and `zeppelin.python.useIPython` is true. + +Sources: [python/src/main/java/org/apache/zeppelin/python/PythonInterpreter.java:79-125](), [python/src/main/java/org/apache/zeppelin/python/PythonInterpreter.java:127-173]() + +### IPythonInterpreter - Enhanced Jupyter Integration + +The `IPythonInterpreter` extends `JupyterKernelInterpreter` to provide advanced features like magic commands, rich output formatting, and better code completion through Jupyter protocol. + +Key features: +- Inherits Jupyter kernel capabilities from `JupyterKernelInterpreter` +- Sets up Py4j gateway for ZeppelinContext integration +- Supports advanced IPython features like `%timeit`, `%matplotlib inline` +- Better error handling and output formatting + +Sources: [python/src/main/java/org/apache/zeppelin/python/IPythonInterpreter.java:47-62](), [python/src/main/java/org/apache/zeppelin/python/IPythonInterpreter.java:108-120]() + +## Communication Mechanisms + +### Py4j Gateway Communication + +The vanilla Python interpreter uses Py4j to enable bidirectional communication between Java and Python processes. The gateway allows Python to call Java methods and vice versa. + +```mermaid +graph LR + subgraph "Java Process" + GS["GatewayServer
Port binding"] + PI_METHODS["PythonInterpreter methods
getStatements()
setStatementsFinished()"] + ZC_JAVA["ZeppelinContext
Java implementation"] + end + + subgraph "Python Process" + GC["GatewayClient
Connection to Java"] + INTP["'intp' variable
Gateway entry point"] + ZC_PY["PyZeppelinContext
Python wrapper"] + end + + GS <--> GC + INTP --> PI_METHODS + ZC_JAVA <--> ZC_PY +``` + +Sources: [python/src/main/java/org/apache/zeppelin/python/PythonInterpreter.java:127-137](), [python/src/main/resources/python/zeppelin_python.py:89-114]() + +### Statement Execution Flow + +The execution flow involves synchronization between Java and Python processes using wait/notify mechanisms: + +1. Java calls `callPython()` with `PythonInterpretRequest` +2. Python calls `getStatements()` to retrieve code +3. Python executes code and calls `setStatementsFinished()` with results +4. Java thread is notified and returns results + +Sources: [python/src/main/java/org/apache/zeppelin/python/PythonInterpreter.java:360-377](), [python/src/main/java/org/apache/zeppelin/python/PythonInterpreter.java:314-347](), [python/src/main/resources/python/zeppelin_python.py:120-206]() + +## Environment Management + +### Conda Integration + +The `PythonCondaInterpreter` provides conda environment management capabilities: + +| Command Pattern | Functionality | +|-----------------|---------------| +| `env list` | List available conda environments | +| `activate [ENV_NAME]` | Switch to specified environment | +| `deactivate` | Return to default Python | +| `create [ARGS]` | Create new conda environment | +| `install [PACKAGE]` | Install packages in current environment | + +```mermaid +graph TB + CMD["Conda Command"] + PATTERN_MATCH["Pattern matching
PATTERN_COMMAND_*"] + ENV_CHANGE["changePythonEnvironment()
Update python path"] + RESTART["restartPythonProcess()
Close and reopen"] + + CMD --> PATTERN_MATCH + PATTERN_MATCH --> ENV_CHANGE + ENV_CHANGE --> RESTART +``` + +Sources: [python/src/main/java/org/apache/zeppelin/python/PythonCondaInterpreter.java:83-129](), [python/src/main/java/org/apache/zeppelin/python/PythonCondaInterpreter.java:142-171]() + +### Docker Support + +The `PythonDockerInterpreter` enables running Python in Docker containers by modifying the Python execution command to use `docker run`: + +- Mounts Python working directory into container +- Sets up PYTHONPATH for Zeppelin integration +- Supports image pulling and environment activation/deactivation + +Sources: [python/src/main/java/org/apache/zeppelin/python/PythonDockerInterpreter.java:69-113]() + +## Integration Features + +### SQL over Pandas DataFrames + +The `PythonInterpreterPandasSql` interpreter enables SQL queries on Pandas DataFrames using the `pandasql` library: + +```mermaid +graph LR + SQL_CODE["SQL Query"] + TRANSFORM["Wrap in pysqldf()"] + PYTHON_EXEC["Execute via PythonInterpreter"] + RESULT["z.show() output"] + + SQL_CODE --> TRANSFORM + TRANSFORM --> PYTHON_EXEC + PYTHON_EXEC --> RESULT +``` + +The interpreter transforms SQL like `SELECT * FROM df` into Python code: `z.show(pysqldf('SELECT * FROM df'))`. 
+ +Sources: [python/src/main/java/org/apache/zeppelin/python/PythonInterpreterPandasSql.java:67-72]() + +### ZeppelinContext Integration + +The `PyZeppelinContext` class provides Python access to Zeppelin's display system and notebook features: + +| Method | Purpose | +|--------|---------| +| `z.show(dataframe)` | Display Pandas DataFrames as tables | +| `z.input(name, default)` | Create dynamic form inputs | +| `z.select(name, options)` | Create dropdown forms | +| `z.put(key, value)` | Store data in distributed resource pool | +| `z.get(key)` | Retrieve shared data | + +Sources: [python/src/main/resources/python/zeppelin_context.py](), [python/src/main/java/org/apache/zeppelin/python/PythonZeppelinContext.java]() + +### Matplotlib Integration + +The Python interpreter includes built-in matplotlib backend support for inline plotting. The `backend_zinline.py` provides a custom matplotlib backend that integrates with Zeppelin's display system. + +Sources: [python/src/main/java/org/apache/zeppelin/python/PythonInterpreter.java:190-194](), [docs/interpreter/python.md:139-185]() + +## Configuration Properties + +The interpreter system supports several configuration properties defined in `interpreter-setting.json`: + +| Property | Default | Description | +|----------|---------|-------------| +| `zeppelin.python` | `python` | Path to Python executable | +| `zeppelin.python.maxResult` | `1000` | Maximum DataFrame rows to display | +| `zeppelin.python.useIPython` | `true` | Whether to use IPython when available | +| `zeppelin.ipython.launch.timeout` | `30000` | IPython launch timeout (ms) | +| `zeppelin.ipython.grpc.message_size` | `33554432` | gRPC message size limit | + +Sources: [python/src/main/resources/interpreter-setting.json:7-51]() diff --git a/.cursor/documentation/interpreters/remote_interpreter_infrastructure.md b/.cursor/documentation/interpreters/remote_interpreter_infrastructure.md new file mode 100644 index 00000000000..154552407b5 --- /dev/null +++ 
b/.cursor/documentation/interpreters/remote_interpreter_infrastructure.md @@ -0,0 +1,345 @@ +# Remote Interpreter Infrastructure + +
+Relevant source files + +The following files were used as context for generating this wiki page: + +- [helium-dev/src/main/java/org/apache/zeppelin/helium/ZeppelinDevServer.java](helium-dev/src/main/java/org/apache/zeppelin/helium/ZeppelinDevServer.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterGroup.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterGroup.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterServer.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterServer.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterUtils.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterUtils.java) +- [zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterServerTest.java](zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterServerTest.java) +- [zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterUtilsTest.java](zeppelin-interpreter/src/test/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterUtilsTest.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/RemoteInterpreterEventServer.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/RemoteInterpreterEventServer.java) + +
+ + + +## Purpose and Scope + +The Remote Interpreter Infrastructure enables Apache Zeppelin to execute interpreters in separate processes from the main Zeppelin server, providing process isolation, resource management, and distributed execution capabilities. This document covers the core components, communication protocols, and lifecycle management of remote interpreter processes. + +For information about interpreter configuration and management, see [Interpreter Framework](#2.3). For specific interpreter implementations, see [Spark Interpreters](#5.2), [Python Interpreter](#5.3), and other interpreter-specific sections. + +## Architecture Overview + +The remote interpreter infrastructure consists of separate processes communicating via Apache Thrift RPC. The main Zeppelin server manages interpreter processes through a client-server architecture where interpreter processes register themselves and handle execution requests. + +### System Architecture + +```mermaid +graph TB + subgraph "Zeppelin Server Process" + ISM["InterpreterSettingManager"] + RIEventServer["RemoteInterpreterEventServer
Thrift Server"] + MIG["ManagedInterpreterGroup"] + RIProcess["RemoteInterpreterProcess"] + end + + subgraph "Remote Interpreter Process" + RIServer["RemoteInterpreterServer
Thrift Server
Main Entry Point"] + IG["InterpreterGroup
Session Management"] + LazyInterp["LazyOpenInterpreter"] + ActualInterp["Actual Interpreter
(Spark, Python, etc.)"] + end + + subgraph "Communication Layer" + ThriftRPC["Apache Thrift RPC"] + RegisterInfo["RegisterInfo"] + RIEventClient["RemoteInterpreterEventClient"] + end + + ISM --> MIG + MIG --> RIProcess + RIProcess --> ThriftRPC + ThriftRPC --> RIServer + RIServer --> IG + IG --> LazyInterp + LazyInterp --> ActualInterp + + RIServer --> RIEventClient + RIEventClient --> ThriftRPC + ThriftRPC --> RIEventServer + RIEventServer --> ISM + + RIServer --> RegisterInfo + RegisterInfo --> RIEventServer +``` + +Sources: [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterServer.java:106-107](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/RemoteInterpreterEventServer.java:76](), [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterGroup.java:45]() + +## Core Components + +### RemoteInterpreterServer + +The `RemoteInterpreterServer` is the main entry point for interpreter processes, running as a separate thread and implementing the `RemoteInterpreterService.Iface` interface. It manages interpreter instances and handles RPC requests from the Zeppelin server. + +```mermaid +graph TB + subgraph "RemoteInterpreterServer Components" + Main["main(String[] args)
Entry Point"] + Server["RemoteInterpreterServer
extends Thread"] + ThriftServer["TThreadPoolServer
Thrift RPC Server"] + ServiceImpl["RemoteInterpreterService.Iface
Implementation"] + end + + subgraph "Key Operations" + Init["init(Map properties)"] + CreateInterp["createInterpreter(...)"] + Interpret["interpret(...)"] + Cancel["cancel(...)"] + Shutdown["shutdown()"] + end + + subgraph "Registration & Communication" + RegisterRunnable["RegisterRunnable
Registration Thread"] + EventClient["RemoteInterpreterEventClient"] + RegisterInfo["RegisterInfo
Host, Port, GroupId"] + end + + Main --> Server + Server --> ThriftServer + Server --> ServiceImpl + ServiceImpl --> Init + ServiceImpl --> CreateInterp + ServiceImpl --> Interpret + ServiceImpl --> Cancel + ServiceImpl --> Shutdown + + Server --> RegisterRunnable + RegisterRunnable --> EventClient + EventClient --> RegisterInfo +``` + +Sources: [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterServer.java:298-334](), [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterServer.java:616-659]() + +### InterpreterGroup and Session Management + +The `InterpreterGroup` manages collections of interpreter sessions, where each session can contain multiple interpreters sharing the same execution context. + +```mermaid +graph LR + subgraph "InterpreterGroup Structure" + IG["InterpreterGroup
id: String"] + Sessions["sessions: Map<String, List<Interpreter>>"] + AOR["AngularObjectRegistry"] + ResourcePool["ResourcePool"] + HookRegistry["InterpreterHookRegistry"] + end + + subgraph "Session Examples" + Session1["session_1"] + Session2["shared_session"] + SparkInterp["SparkInterpreter"] + PySparkInterp["PySparkInterpreter"] + SqlInterp["SparkSqlInterpreter"] + end + + IG --> Sessions + IG --> AOR + IG --> ResourcePool + IG --> HookRegistry + + Sessions --> Session1 + Sessions --> Session2 + Session1 --> SparkInterp + Session1 --> PySparkInterp + Session1 --> SqlInterp +``` + +Sources: [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterGroup.java:32-44](), [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/InterpreterGroup.java:90-107]() + +### RemoteInterpreterEventServer + +The `RemoteInterpreterEventServer` runs in the main Zeppelin server process and handles events and registration from remote interpreter processes. + +```mermaid +graph TB + subgraph "Event Server Operations" + RegProcess["registerInterpreterProcess
(RegisterInfo)"] + UnregProcess["unRegisterInterpreterProcess
(String intpGroupId)"] + AppendOutput["appendOutput
(OutputAppendEvent)"] + UpdateOutput["updateOutput
(OutputUpdateEvent)"] + AngularOps["Angular Object Operations
add/update/remove"] + ResourceOps["Resource Operations
get/invoke methods"] + end + + subgraph "Event Types" + OutputEvents["Output Events
Append, Update, Clear"] + AppEvents["Application Events
Status, Output Updates"] + ResourceEvents["Resource Events
Cross-interpreter sharing"] + ParagraphEvents["Paragraph Events
Run, Config updates"] + end + + RegProcess --> OutputEvents + UnregProcess --> AppEvents + AppendOutput --> ResourceEvents + UpdateOutput --> ParagraphEvents + AngularOps --> OutputEvents + ResourceOps --> ResourceEvents +``` + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/RemoteInterpreterEventServer.java:168-203](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/RemoteInterpreterEventServer.java:218-284]() + +## Communication Protocol + +### Thrift RPC Interface + +The communication between Zeppelin server and remote interpreter processes uses Apache Thrift with two main service interfaces: + +| Service Interface | Location | Purpose | +|-------------------|----------|---------| +| `RemoteInterpreterService.Iface` | Interpreter Process | Handles execution requests from server | +| `RemoteInterpreterEventService.Iface` | Zeppelin Server | Handles events from interpreter processes | + +### Communication Flow + +```mermaid +sequenceDiagram + participant ZepServer as "Zeppelin Server" + participant EventServer as "RemoteInterpreterEventServer" + participant IntpProcess as "RemoteInterpreterServer" + participant EventClient as "RemoteInterpreterEventClient" + + Note over ZepServer,EventClient: Process Startup & Registration + ZepServer->>IntpProcess: Launch interpreter process + IntpProcess->>IntpProcess: Start TThreadPoolServer + IntpProcess->>EventServer: registerInterpreterProcess(RegisterInfo) + EventServer->>ZepServer: Update process status + + Note over ZepServer,EventClient: Code Execution + ZepServer->>IntpProcess: interpret(sessionId, className, code, context) + IntpProcess->>IntpProcess: Execute in InterpretJob + IntpProcess->>EventClient: Output events (append/update) + EventClient->>EventServer: Forward events + EventServer->>ZepServer: Update UI + IntpProcess-->>ZepServer: Return InterpreterResult + + Note over ZepServer,EventClient: Process Shutdown + ZepServer->>IntpProcess: shutdown() + 
IntpProcess->>EventServer: unRegisterInterpreterProcess() + IntpProcess->>IntpProcess: Stop TThreadPoolServer +``` + +Sources: [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterServer.java:542-614](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/RemoteInterpreterEventServer.java:168-186]() + +## Process Lifecycle Management + +### Interpreter Process Lifecycle + +The remote interpreter process follows a specific lifecycle from startup to shutdown, with proper resource management and error handling. + +```mermaid +stateDiagram-v2 + [*] --> Starting + Starting --> Initializing: main() called + Initializing --> Registering: TThreadPoolServer started + Registering --> Ready: Registration successful + Ready --> Executing: interpret() request + Executing --> Ready: Execution complete + Ready --> Shutting: shutdown() called + Shutting --> Cleanup: Unregister process + Cleanup --> [*]: Process terminated + + Registering --> Error: Registration failed + Error --> Cleanup: Shutdown on error + Executing --> Cancelling: cancel() request + Cancelling --> Ready: Cancellation complete +``` + +### Key Lifecycle Components + +| Component | Class | Responsibility | +|-----------|-------|----------------| +| Startup | `RemoteInterpreterServer.main()` | Process entry point and argument parsing | +| Initialization | `RemoteInterpreterServer.init()` | Configuration setup and client creation | +| Registration | `RegisterRunnable` | Register with Zeppelin server | +| Execution | `InterpretJob` | Handle code interpretation requests | +| Shutdown | `ShutdownThread` | Resource cleanup and process termination | + +Sources: [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterServer.java:298-334](), [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterServer.java:661-753]() + +### Job Execution and Recovery + +The system supports job recovery for 
long-running operations through the `runningJobs` cache and result caching mechanisms. + +```mermaid +graph TB + subgraph "Job Execution Pipeline" + InterpretRequest["interpret() Request"] + JobCreation["Create InterpretJob"] + JobSubmit["Submit to Scheduler"] + JobExecution["jobRun() Execution"] + ResultCache["Cache Result"] + end + + subgraph "Recovery Mechanism" + RunningJobs["runningJobs
ConcurrentMap"] + IsRecover["isRecover flag"] + JobLookup["Lookup existing job"] + ResultReturn["Return cached result"] + end + + subgraph "Resource Management" + ProgressMap["progressMap
Progress tracking"] + ResultCleanup["resultCleanService
Scheduled cleanup"] + CacheTimeout["resultCacheInSeconds"] + end + + InterpretRequest --> IsRecover + IsRecover -->|true| JobLookup + IsRecover -->|false| JobCreation + JobLookup --> ResultReturn + JobCreation --> JobSubmit + JobSubmit --> JobExecution + JobExecution --> ResultCache + JobCreation --> RunningJobs + JobExecution --> ProgressMap + ResultCache --> ResultCleanup + ResultCleanup --> CacheTimeout +``` + +Sources: [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterServer.java:557-599](), [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterServer.java:769-932]() + +## Network and Port Management + +### Port Allocation and Host Discovery + +The remote interpreter infrastructure includes utilities for network configuration and port management to support various deployment scenarios. + +| Function | Purpose | Configuration | +|----------|---------|---------------| +| `findAvailablePort()` | Allocate ports within specified ranges | `portRange` parameter (e.g., "30000:40000") | +| `findAvailableHostAddress()` | Discover accessible host addresses | `ZEPPELIN_LOCAL_IP` environment variable | +| `checkIfRemoteEndpointAccessible()` | Verify connectivity to remote endpoints | Socket connection testing | + +### Cluster Mode Integration + +In cluster mode, the system integrates with cluster management for process discovery and metadata registration: + +```mermaid +graph LR + subgraph "Cluster Integration" + ClusterClient["ClusterManagerClient"] + ClusterMeta["Cluster Metadata"] + NodeDiscovery["Node Discovery"] + end + + subgraph "Process Metadata" + ProcessMeta["INTP_PROCESS_META"] + HostPort["Host:Port Info"] + Heartbeat["Heartbeat Status"] + StartTime["Start Time"] + end + + ClusterClient --> ClusterMeta + ClusterMeta --> NodeDiscovery + NodeDiscovery --> ProcessMeta + ProcessMeta --> HostPort + ProcessMeta --> Heartbeat + ProcessMeta --> StartTime +``` + +Sources: 
[zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterUtils.java:52-79](), [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterServer.java:337-354]() diff --git a/.cursor/documentation/interpreters/spark_interpreters.md b/.cursor/documentation/interpreters/spark_interpreters.md new file mode 100644 index 00000000000..abda031a47a --- /dev/null +++ b/.cursor/documentation/interpreters/spark_interpreters.md @@ -0,0 +1,355 @@ +# Spark Interpreters + +
+Relevant source files + +The following files were used as context for generating this wiki page: + +- [docs/interpreter/spark.md](docs/interpreter/spark.md) +- [spark-submit/src/main/java/org/apache/zeppelin/spark/submit/SparkSubmitInterpreter.java](spark-submit/src/main/java/org/apache/zeppelin/spark/submit/SparkSubmitInterpreter.java) +- [spark-submit/src/main/resources/interpreter-setting.json](spark-submit/src/main/resources/interpreter-setting.json) +- [spark/interpreter/src/main/java/org/apache/zeppelin/spark/AbstractSparkScalaInterpreter.java](spark/interpreter/src/main/java/org/apache/zeppelin/spark/AbstractSparkScalaInterpreter.java) +- [spark/interpreter/src/main/java/org/apache/zeppelin/spark/KotlinSparkInterpreter.java](spark/interpreter/src/main/java/org/apache/zeppelin/spark/KotlinSparkInterpreter.java) +- [spark/interpreter/src/main/java/org/apache/zeppelin/spark/SparkInterpreter.java](spark/interpreter/src/main/java/org/apache/zeppelin/spark/SparkInterpreter.java) +- [spark/interpreter/src/main/java/org/apache/zeppelin/spark/SparkSqlInterpreter.java](spark/interpreter/src/main/java/org/apache/zeppelin/spark/SparkSqlInterpreter.java) +- [spark/interpreter/src/main/resources/interpreter-setting.json](spark/interpreter/src/main/resources/interpreter-setting.json) +- [spark/interpreter/src/test/java/org/apache/zeppelin/spark/SparkInterpreterTest.java](spark/interpreter/src/test/java/org/apache/zeppelin/spark/SparkInterpreterTest.java) +- [spark/scala-2.11/src/main/scala/org/apache/zeppelin/spark/SparkScala211Interpreter.scala](spark/scala-2.11/src/main/scala/org/apache/zeppelin/spark/SparkScala211Interpreter.scala) +- [spark/scala-2.12/src/main/scala/org/apache/zeppelin/spark/SparkScala212Interpreter.scala](spark/scala-2.12/src/main/scala/org/apache/zeppelin/spark/SparkScala212Interpreter.scala) +- 
[spark/scala-2.13/src/main/scala/org/apache/zeppelin/spark/SparkScala213Interpreter.scala](spark/scala-2.13/src/main/scala/org/apache/zeppelin/spark/SparkScala213Interpreter.scala) +- [zeppelin-interpreter-integration/README.md](zeppelin-interpreter-integration/README.md) +- [zeppelin-interpreter-integration/src/test/java/org/apache/zeppelin/integration/SparkIntegrationTest.java](zeppelin-interpreter-integration/src/test/java/org/apache/zeppelin/integration/SparkIntegrationTest.java) +- [zeppelin-interpreter-integration/src/test/java/org/apache/zeppelin/integration/SparkSubmitIntegrationTest.java](zeppelin-interpreter-integration/src/test/java/org/apache/zeppelin/integration/SparkSubmitIntegrationTest.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/launcher/InterpreterClient.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/launcher/InterpreterClient.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/launcher/InterpreterLaunchContext.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/launcher/InterpreterLaunchContext.java) +- [zeppelin-web/e2e/collaborativeMode.spec.js](zeppelin-web/e2e/collaborativeMode.spec.js) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/launcher/SparkInterpreterLauncher.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/launcher/SparkInterpreterLauncher.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/launcher/SparkInterpreterLauncherTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/launcher/SparkInterpreterLauncherTest.java) + +
+ + + +This document covers the Spark interpreter system in Apache Zeppelin, which provides multi-language support for Apache Spark through a collection of specialized interpreters. The Spark interpreters enable users to execute Scala, Python, R, SQL, and Kotlin code against Spark clusters in various deployment modes. + +For information about the broader interpreter framework that manages these interpreters, see [Interpreter Framework](#2.3). For details about remote interpreter execution infrastructure, see [Remote Interpreter Infrastructure](#5.1). + +## Architecture Overview + +The Spark interpreter system is built around a delegation pattern where a main `SparkInterpreter` coordinates with language-specific sub-interpreters, all sharing the same `SparkContext` and `SparkSession`. + +```mermaid +graph TB + subgraph "Zeppelin Server Process" + ISM["InterpreterSettingManager"] + SIL["SparkInterpreterLauncher"] + ILC["InterpreterLaunchContext"] + end + + subgraph "Remote Interpreter Process" + SI["SparkInterpreter
(Main Coordinator)"] + + subgraph "Scala Implementations" + S211["SparkScala211Interpreter"] + S212["SparkScala212Interpreter"] + S213["SparkScala213Interpreter"] + end + + subgraph "Language Interpreters" + SQL["SparkSqlInterpreter"] + PSI["PySparkInterpreter"] + RSI["SparkRInterpreter"] + KSI["KotlinSparkInterpreter"] + end + + subgraph "Shared Spark Resources" + SC["SparkContext"] + SS["SparkSession"] + SQLC["SQLContext"] + ZC["ZeppelinContext"] + end + end + + ISM --> SIL + SIL --> ILC + ILC --> SI + + SI --> S211 + SI --> S212 + SI --> S213 + + SQL --> SI + PSI --> SI + RSI --> SI + KSI --> SI + + S211 --> SC + S212 --> SC + S213 --> SC + SQL --> SQLC + PSI --> SC + RSI --> SC + KSI --> SC + + SC --> SS + SS --> SQLC + SC --> ZC +``` + +**Sources:** [spark/interpreter/src/main/java/org/apache/zeppelin/spark/SparkInterpreter.java:54-96](), [spark/interpreter/src/main/java/org/apache/zeppelin/spark/AbstractSparkScalaInterpreter.java:47-72](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/launcher/SparkInterpreterLauncher.java:47-59]() + +## Interpreter Types and Language Support + +The Spark interpreter group consists of multiple specialized interpreters that share a common Spark runtime: + +| Interpreter Name | Class | Language | Description | +|------------------|-------|----------|-------------| +| `%spark` | `SparkInterpreter` | Scala | Main Scala environment with SparkContext/SparkSession | +| `%spark.sql` | `SparkSqlInterpreter` | SQL | SQL environment using shared SparkSession | +| `%spark.pyspark` | `PySparkInterpreter` | Python | Python environment with PySpark | +| `%spark.ipyspark` | `IPySparkInterpreter` | Python | IPython-enhanced Python environment | +| `%spark.r` | `SparkRInterpreter` | R | Vanilla R environment with SparkR | +| `%spark.ir` | `SparkIRInterpreter` | R | Jupyter IRKernel-based R environment | +| `%spark.shiny` | `SparkShinyInterpreter` | R | R Shiny app creation with SparkR | +| `%spark.kotlin` | 
`KotlinSparkInterpreter` | Kotlin | Kotlin environment with Spark support | + +**Sources:** [spark/interpreter/src/main/resources/interpreter-setting.json:1-368](), [docs/interpreter/spark.md:31-77]() + +## Scala Version Support Architecture + +The `SparkInterpreter` delegates to version-specific Scala implementations to support multiple Scala versions (2.11, 2.12, 2.13) within the same Zeppelin instance: + +```mermaid +graph TB + subgraph "SparkInterpreter Delegation" + SI["SparkInterpreter"] + SIL["SparkInterpreterLauncher"] + + subgraph "Version Detection" + SVD["extractScalaVersion()"] + SHD["detectSparkScalaVersion()"] + end + + subgraph "ClassLoader Management" + ICM["innerInterpreterClassMap"] + URLCl["URLClassLoader"] + ScalaJars["scala-{version} jars"] + end + end + + subgraph "Scala Interpreter Implementations" + ASI["AbstractSparkScalaInterpreter"] + S211["SparkScala211Interpreter
(Spark 2.x only)"] + S212["SparkScala212Interpreter
(Spark 2.x, 3.x)"] + S213["SparkScala213Interpreter
(Spark 3.x only)"] + end + + subgraph "Deployment Locations" + ZH["ZEPPELIN_HOME/interpreter/spark/"] + S211Dir["scala-2.11/"] + S212Dir["scala-2.12/"] + S213Dir["scala-2.13/"] + end + + SI --> SVD + SIL --> SHD + SVD --> ICM + ICM --> URLCl + URLCl --> ScalaJars + + SI --> ASI + ASI --> S211 + ASI --> S212 + ASI --> S213 + + URLCl --> ZH + ZH --> S211Dir + ZH --> S212Dir + ZH --> S213Dir + + S211Dir --> S211 + S212Dir --> S212 + S213Dir --> S213 +``` + +**Sources:** [spark/interpreter/src/main/java/org/apache/zeppelin/spark/SparkInterpreter.java:158-189](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/launcher/SparkInterpreterLauncher.java:268-293](), [spark/interpreter/src/main/java/org/apache/zeppelin/spark/SparkInterpreter.java:92-95]() + +The version detection process involves: + +1. **Launcher Detection**: `SparkInterpreterLauncher.detectSparkScalaVersion()` runs `spark-submit --version` to determine Scala version +2. **Runtime Confirmation**: `SparkInterpreter.extractScalaVersion()` uses `scala.util.Properties.versionString()` for final determination +3. 
**Dynamic Loading**: Version-specific jars are loaded from `ZEPPELIN_HOME/interpreter/spark/scala-{version}/` directories + +**Sources:** [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/launcher/SparkInterpreterLauncher.java:268-293](), [spark/interpreter/src/main/java/org/apache/zeppelin/spark/SparkInterpreter.java:268-292]() + +## Launcher and Process Management + +The `SparkInterpreterLauncher` handles Spark-specific process configuration and deployment across different execution modes: + +```mermaid +graph TB + subgraph "Launch Configuration" + SIL["SparkInterpreterLauncher"] + ILC["InterpreterLaunchContext"] + Props["Properties"] + EnvVars["Environment Variables"] + end + + subgraph "Deployment Mode Detection" + GM["getSparkMaster()"] + GDM["getDeployMode()"] + YM["isYarnMode()"] + YC["isYarnCluster()"] + end + + subgraph "Environment Building" + BEP["buildEnvFromProperties()"] + SparkConf["ZEPPELIN_SPARK_CONF"] + SPARK_HOME["SPARK_HOME"] + HADOOP_CONF_DIR["HADOOP_CONF_DIR"] + end + + subgraph "Execution Modes" + Local["local[*]"] + YarnClient["yarn-client / yarn + client"] + YarnCluster["yarn-cluster / yarn + cluster"] + K8s["k8s://"] + Standalone["spark://master:7077"] + end + + subgraph "Additional Jars (Yarn Cluster)" + LocalRepo["Local Repository Jars"] + ScalaJars["Scala Version Jars"] + ZepJars["zeppelin-interpreter-shaded"] + end + + SIL --> ILC + ILC --> Props + Props --> EnvVars + + SIL --> GM + GM --> GDM + GDM --> YM + YM --> YC + + SIL --> BEP + BEP --> SparkConf + BEP --> SPARK_HOME + BEP --> HADOOP_CONF_DIR + + GM --> Local + GM --> YarnClient + GM --> YarnCluster + GM --> K8s + GM --> Standalone + + YarnCluster --> LocalRepo + YarnCluster --> ScalaJars + YarnCluster --> ZepJars +``` + +**Sources:** [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/launcher/SparkInterpreterLauncher.java:62-266](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/launcher/SparkInterpreterLauncher.java:405-447]() + +### 
Yarn Cluster Mode Special Handling + +For `yarn-cluster` mode, the launcher automatically includes additional jars in `spark.jars`: +- Local repository jars from `zeppelin.interpreter.localRepo` +- Scala version-specific jars from `ZEPPELIN_HOME/interpreter/spark/scala-{version}/` +- `zeppelin-interpreter-shaded` jar for remote communication + +**Sources:** [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/launcher/SparkInterpreterLauncher.java:141-193]() + +## Configuration System + +Spark interpreters support extensive configuration through the `interpreter-setting.json` file and runtime properties: + +### Core Spark Properties + +| Property | Default | Description | +|----------|---------|-------------| +| `SPARK_HOME` | - | Location of Spark distribution | +| `spark.master` | `local[*]` | Spark master URI | +| `spark.submit.deployMode` | - | Deploy mode: `client` or `cluster` | +| `spark.app.name` | `Zeppelin` | Spark application name | +| `spark.driver.memory` | `1g` | Driver memory allocation | +| `spark.executor.memory` | `1g` | Executor memory allocation | + +### Zeppelin-Specific Properties + +| Property | Default | Description | +|----------|---------|-------------| +| `zeppelin.spark.useHiveContext` | `true` | Enable Hive support in SparkSession | +| `zeppelin.spark.printREPLOutput` | `true` | Print Scala REPL output | +| `zeppelin.spark.maxResult` | `1000` | Max rows in SQL results | +| `zeppelin.spark.concurrentSQL` | `false` | Enable concurrent SQL execution | +| `zeppelin.pyspark.useIPython` | `true` | Use IPython for PySpark | + +**Sources:** [spark/interpreter/src/main/resources/interpreter-setting.json:7-164](), [docs/interpreter/spark.md:145-297]() + +## SQL Interpreter Integration + +The `SparkSqlInterpreter` demonstrates shared resource management by reusing the main Spark interpreter's context: + +```mermaid +graph LR + subgraph "SparkSqlInterpreter Execution Flow" + SSI["SparkSqlInterpreter"] + SI["SparkInterpreter"] + 
SQLCtx["SQLContext"] + SparkCtx["SparkContext"] + SqlSplitter["SqlSplitter"] + ZepCtx["ZeppelinContext"] + end + + subgraph "Concurrent Execution" + CS["concurrentSQL()"] + PS["ParallelScheduler"] + FairSched["FAIR Scheduler"] + Pool["spark.scheduler.pool"] + end + + SSI --> SI + SI --> SQLCtx + SI --> SparkCtx + SSI --> SqlSplitter + SI --> ZepCtx + + SSI --> CS + CS --> PS + CS --> FairSched + SparkCtx --> Pool +``` + +**Sources:** [spark/interpreter/src/main/java/org/apache/zeppelin/spark/SparkSqlInterpreter.java:45-185](), [spark/interpreter/src/main/java/org/apache/zeppelin/spark/SparkSqlInterpreter.java:61-63](), [spark/interpreter/src/main/java/org/apache/zeppelin/spark/SparkSqlInterpreter.java:165-184]() + +The SQL interpreter supports: +- **Concurrent SQL execution** via `zeppelin.spark.concurrentSQL` property +- **Statement splitting** using `SqlSplitter` for multi-statement paragraphs +- **Shared Spark context** with job group and pool management +- **Error handling** with optional stack trace display + +**Sources:** [spark/interpreter/src/main/java/org/apache/zeppelin/spark/SparkSqlInterpreter.java:79-146]() + +## Deployment and Testing + +The integration tests demonstrate various deployment scenarios: + +### Test Coverage Matrix + +| Test Class | Deployment Modes | Key Features | +|------------|------------------|--------------| +| `SparkIntegrationTest` | Local, Yarn-Client, Yarn-Cluster | Multi-language, Scoped mode | +| `SparkSubmitIntegrationTest` | Yarn-Cluster via spark-submit | External submission, cancellation | +| `SparkInterpreterLauncherTest` | All modes | Launcher configuration testing | + +**Sources:** [zeppelin-interpreter-integration/src/test/java/org/apache/zeppelin/integration/SparkIntegrationTest.java:59-395](), [zeppelin-interpreter-integration/src/test/java/org/apache/zeppelin/integration/SparkSubmitIntegrationTest.java:52-191](), 
[zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/launcher/SparkInterpreterLauncherTest.java:42-318]() + +### SparkSubmit Integration + +The `SparkSubmitInterpreter` provides `%spark-submit` functionality for external Spark application submission: + +- **Command wrapping**: Prepends `SPARK_HOME/bin/spark-submit` to user commands +- **Yarn application tracking**: Extracts application IDs for monitoring and cancellation +- **UI integration**: Automatically detects and reports Spark UI URLs + +**Sources:** [spark-submit/src/main/java/org/apache/zeppelin/spark/submit/SparkSubmitInterpreter.java:44-157]() diff --git a/.cursor/documentation/overview/overview.md b/.cursor/documentation/overview/overview.md new file mode 100644 index 00000000000..15d2e9cf46d --- /dev/null +++ b/.cursor/documentation/overview/overview.md @@ -0,0 +1,243 @@ +# Overview + +
+Relevant source files + +The following files were used as context for generating this wiki page: + +- [.github/workflows/core.yml](.github/workflows/core.yml) +- [.github/workflows/frontend.yml](.github/workflows/frontend.yml) +- [.github/workflows/quick.yml](.github/workflows/quick.yml) +- [.gitignore](.gitignore) +- [.mvn/wrapper/MavenWrapperDownloader.java](.mvn/wrapper/MavenWrapperDownloader.java) +- [.mvn/wrapper/maven-wrapper.jar](.mvn/wrapper/maven-wrapper.jar) +- [.mvn/wrapper/maven-wrapper.properties](.mvn/wrapper/maven-wrapper.properties) +- [Dockerfile](Dockerfile) +- [README.md](README.md) +- [conf/interpreter-list](conf/interpreter-list) +- [dev/common_release.sh](dev/common_release.sh) +- [dev/create_release.sh](dev/create_release.sh) +- [dev/publish_release.sh](dev/publish_release.sh) +- [docs/_includes/themes/zeppelin/_navigation.html](docs/_includes/themes/zeppelin/_navigation.html) +- [docs/index.md](docs/index.md) +- [docs/setup/basics/how_to_build.md](docs/setup/basics/how_to_build.md) +- [docs/usage/interpreter/installation.md](docs/usage/interpreter/installation.md) +- [livy/README.md](livy/README.md) +- [pom.xml](pom.xml) +- [spark/pom.xml](spark/pom.xml) +- [zeppelin-display/pom.xml](zeppelin-display/pom.xml) +- [zeppelin-distribution/pom.xml](zeppelin-distribution/pom.xml) +- [zeppelin-interpreter/pom.xml](zeppelin-interpreter/pom.xml) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/scheduler/Job.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/scheduler/Job.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/scheduler/JobWithProgressPoller.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/scheduler/JobWithProgressPoller.java) +- [zeppelin-interpreter/src/test/java/org/apache/zeppelin/scheduler/JobTest.java](zeppelin-interpreter/src/test/java/org/apache/zeppelin/scheduler/JobTest.java) +- [zeppelin-server/pom.xml](zeppelin-server/pom.xml) +- 
[zeppelin-server/src/main/java/org/apache/zeppelin/rest/NotebookRestApi.java](zeppelin-server/src/main/java/org/apache/zeppelin/rest/NotebookRestApi.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/service/NotebookService.java](zeppelin-server/src/main/java/org/apache/zeppelin/service/NotebookService.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java](zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/rest/AbstractTestRestApi.java](zeppelin-server/src/test/java/org/apache/zeppelin/rest/AbstractTestRestApi.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/rest/NotebookRestApiTest.java](zeppelin-server/src/test/java/org/apache/zeppelin/rest/NotebookRestApiTest.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/rest/ZeppelinRestApiTest.java](zeppelin-server/src/test/java/org/apache/zeppelin/rest/ZeppelinRestApiTest.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/service/NotebookServiceTest.java](zeppelin-server/src/test/java/org/apache/zeppelin/service/NotebookServiceTest.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/socket/NotebookServerTest.java](zeppelin-server/src/test/java/org/apache/zeppelin/socket/NotebookServerTest.java) +- [zeppelin-web/pom.xml](zeppelin-web/pom.xml) +- [zeppelin-zengine/pom.xml](zeppelin-zengine/pom.xml) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Note.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Note.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Notebook.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Notebook.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Paragraph.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Paragraph.java) +- 
[zeppelin-zengine/src/test/java/org/apache/zeppelin/helium/HeliumApplicationFactoryTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/helium/HeliumApplicationFactoryTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/NoteTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/NoteTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/NotebookTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/NotebookTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/ParagraphTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/ParagraphTest.java) + +
+ + + +This document provides an overview of Apache Zeppelin, a web-based notebook platform for interactive data analytics. This page covers the high-level architecture, core components, and system relationships within the Zeppelin codebase. For detailed information about specific subsystems, see [Architecture](#2) for the overall system design, [Web Interface](#3) for frontend components, [Server Components](#4) for backend services, [Interpreters](#5) for language execution engines, and [Deployment and Operations](#6) for deployment workflows. + +Apache Zeppelin is a multi-purpose notebook that enables data ingestion, discovery, analytics, and visualization through support for 20+ language backends including Spark, Python, SQL, and R. The system follows a modular architecture with a web-based frontend, RESTful and WebSocket APIs, a core notebook engine, and a pluggable interpreter framework. + +## System Architecture + +```mermaid +graph TB + subgraph "Frontend Layer" + AngularJS["zeppelin-web
Angular.js Application"] + WebUI["Web Interface
Notebook Editor"] + end + + subgraph "Communication Layer" + NotebookServer["NotebookServer
WebSocket Endpoint"] + RestAPI["NotebookRestApi
REST Endpoints"] + WebSocket["WebSocket Protocol
Real-time Updates"] + end + + subgraph "Service Layer" + NotebookService["NotebookService
Business Logic"] + AuthService["AuthorizationService
Security"] + JobService["JobManagerService
Execution Management"] + end + + subgraph "Core Engine" + Notebook["Notebook
High-level Operations"] + Note["Note
Notebook Instance"] + Paragraph["Paragraph
Code Cells"] + NoteManager["NoteManager
Note Lifecycle"] + end + + subgraph "Interpreter Framework" + InterpreterFactory["InterpreterFactory
Interpreter Creation"] + InterpreterSetting["InterpreterSettingManager
Configuration"] + RemoteInterpreter["RemoteInterpreterServer
Process Management"] + end + + subgraph "Language Backends" + SparkInterp["Spark Interpreters
Scala 2.11/2.12/2.13"] + PythonInterp["Python Interpreter
PySpark, IPython"] + JDBCInterp["JDBC Interpreter
SQL Databases"] + OtherInterp["Other Interpreters
R, Flink, Shell, etc."] + end + + subgraph "Storage Layer" + NotebookRepo["NotebookRepo
Persistence Interface"] + VFSRepo["VFSNotebookRepo
File System"] + GitRepo["GitNotebookRepo
Version Control"] + S3Repo["S3NotebookRepo
Cloud Storage"] + end + + AngularJS --> WebSocket + AngularJS --> RestAPI + WebSocket --> NotebookServer + RestAPI --> NotebookService + NotebookServer --> NotebookService + NotebookService --> AuthService + NotebookService --> Notebook + Notebook --> Note + Note --> Paragraph + Notebook --> NoteManager + Paragraph --> InterpreterFactory + InterpreterFactory --> InterpreterSetting + InterpreterSetting --> RemoteInterpreter + RemoteInterpreter --> SparkInterp + RemoteInterpreter --> PythonInterp + RemoteInterpreter --> JDBCInterp + RemoteInterpreter --> OtherInterp + Notebook --> NotebookRepo + NotebookRepo --> VFSRepo + NotebookRepo --> GitRepo + NotebookRepo --> S3Repo +``` + +**System Architecture Overview** + +Sources: [pom.xml:54-103](), [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java:117-123](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Notebook.java:73]() + +## Core Components + +### Web-based Notebook Interface + +The frontend is built as an Angular.js single-page application in the `zeppelin-web` module. It provides an interactive notebook interface with real-time collaboration features, dynamic forms, and rich data visualization capabilities. + +**Key Frontend Components:** +- Notebook editor with paragraph-based code cells +- Real-time output rendering and progress tracking +- Interpreter management interface +- Navigation and home page components + +Sources: [zeppelin-web/pom.xml:29-31](), [docs/_includes/themes/zeppelin/_navigation.html:1-20]() + +### Server and API Layer + +The `zeppelin-server` module provides both REST and WebSocket APIs for notebook operations. The `NotebookServer` class handles WebSocket connections for real-time communication, while `NotebookRestApi` provides RESTful endpoints for CRUD operations. 
+ +**Key Server Classes:** +- `NotebookServer`: WebSocket endpoint for real-time updates at `/ws` +- `NotebookRestApi`: REST API endpoints under `/api/notebook` +- `NotebookService`: Business logic layer with permission checking +- `AuthorizationService`: Security and access control + +Sources: [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java:112-117](), [zeppelin-server/src/main/java/org/apache/zeppelin/rest/NotebookRestApi.java:75-77](), [zeppelin-server/src/main/java/org/apache/zeppelin/service/NotebookService.java:80]() + +### Notebook Engine (zeppelin-zengine) + +The core notebook engine manages notebook lifecycle, execution, and persistence. The `Notebook` class provides high-level operations, while `Note` and `Paragraph` represent the data model. + +**Core Engine Classes:** +- `Notebook`: High-level notebook operations and management +- `Note`: Individual notebook instance with metadata and paragraphs +- `Paragraph`: Executable code cell with interpreter binding +- `NoteManager`: Note lifecycle and folder management +- `ParagraphJobListener`: Execution event handling + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Notebook.java:73](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Note.java:74](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Paragraph.java:69]() + +### Interpreter Framework + +Zeppelin's interpreter framework provides pluggable language backends through a common interface. Interpreters can run locally or as remote processes for isolation and scalability. + +**Interpreter Components:** +- `InterpreterFactory`: Creates and manages interpreter instances +- `InterpreterSettingManager`: Configuration and interpreter settings +- `RemoteInterpreterServer`: Manages remote interpreter processes +- Language-specific interpreters in separate modules (spark, python, jdbc, etc.) 
+ +Sources: [zeppelin-zengine/pom.xml:52-56](), [spark/pom.xml:58-67]() + +## Communication Flow + +```mermaid +sequenceDiagram + participant "Frontend" as FE + participant "NotebookServer" as NS + participant "NotebookService" as NSvc + participant "Notebook" as NB + participant "Paragraph" as P + participant "InterpreterFactory" as IF + participant "RemoteInterpreter" as RI + + FE->>NS: "WebSocket: RUN_PARAGRAPH" + NS->>NSvc: "runParagraph()" + NSvc->>NB: "processNote(noteId)" + NB->>P: "execute()" + P->>IF: "getInterpreter()" + IF->>RI: "interpret(code)" + RI-->>IF: "InterpreterResult" + IF-->>P: "Result" + P-->>NB: "Paragraph Status" + NB-->>NSvc: "Execution Complete" + NSvc-->>NS: "broadcastParagraph()" + NS-->>FE: "WebSocket: PARAGRAPH" +``` + +**Paragraph Execution Flow** + +The diagram shows how code execution flows from the frontend through the server layers to the interpreter and back. The `NotebookServer` handles WebSocket messages, delegates to `NotebookService` for business logic, which uses the `Notebook` engine to execute paragraphs via the interpreter framework. 
+ +Sources: [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java:274-500](), [zeppelin-server/src/main/java/org/apache/zeppelin/service/NotebookService.java:1000-1100](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Paragraph.java:322-392]() + +## Module Structure + +The project follows a multi-module Maven structure with a clear separation of concerns: + +| Module | Purpose | Key Classes | +|--------|---------|-------------| +| `zeppelin-web` | Angular.js frontend application | Web UI components | +| `zeppelin-server` | REST and WebSocket APIs | `NotebookServer`, `NotebookRestApi` | +| `zeppelin-zengine` | Core notebook engine | `Notebook`, `Note`, `Paragraph` | +| `zeppelin-interpreter` | Base interpreter framework | `Interpreter`, `InterpreterContext` | +| `spark/*` | Spark interpreter with multiple Scala versions | Spark integration | +| `python` | Python and PySpark interpreter | Python execution | +| `jdbc` | SQL database connectivity | JDBC connections | +| `zeppelin-distribution` | Packaging and distribution | Assembly configuration | + +**Technology Stack:** +- **Frontend**: Angular.js, Node.js build system +- **Backend**: Java 8, Maven, Jetty web server +- **Communication**: WebSocket, REST APIs, Apache Thrift +- **Storage**: Pluggable backends (Git, S3, Azure, local filesystem) +- **Interpreters**: JVM and native process execution + +Sources: [pom.xml:54-103](), [zeppelin-web/pom.xml:29-31](), [zeppelin-server/pom.xml:29-31](), [zeppelin-zengine/pom.xml:30-33]() + +The modular architecture enables independent development and deployment of different components, while the interpreter framework provides extensibility for new language backends. The WebSocket-based communication ensures real-time collaboration and a responsive user experience. 
diff --git a/.cursor/documentation/server_components/configuration_management.md b/.cursor/documentation/server_components/configuration_management.md new file mode 100644 index 00000000000..6e259f83b16 --- /dev/null +++ b/.cursor/documentation/server_components/configuration_management.md @@ -0,0 +1,274 @@ +# Configuration Management + +
+Relevant source files + +The following files were used as context for generating this wiki page: + +- [conf/zeppelin-site.xml.template](conf/zeppelin-site.xml.template) +- [docs/setup/operation/configuration.md](docs/setup/operation/configuration.md) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/launcher/InterpreterLauncher.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/launcher/InterpreterLauncher.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/lifecycle/TimeoutLifecycleManager.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/lifecycle/TimeoutLifecycleManager.java) +- [zeppelin-interpreter/src/test/java/org/apache/zeppelin/conf/ZeppelinConfigurationTest.java](zeppelin-interpreter/src/test/java/org/apache/zeppelin/conf/ZeppelinConfigurationTest.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/server/ZeppelinServer.java](zeppelin-server/src/main/java/org/apache/zeppelin/server/ZeppelinServer.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/recovery/RecoveryUtils.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/recovery/RecoveryUtils.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreter.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreter.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/conf/ZeppelinConfigurationTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/conf/ZeppelinConfigurationTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/launcher/StandardInterpreterLauncherTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/launcher/StandardInterpreterLauncherTest.java) +- 
[zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/lifecycle/TimeoutLifecycleManagerTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/lifecycle/TimeoutLifecycleManagerTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterTest.java) + +
+ + + +## Purpose and Scope + +This document covers the server-side configuration management system in Apache Zeppelin, focusing on how configuration properties are loaded, processed, and used throughout the server components. It explains the architecture of the `ZeppelinConfiguration` class, configuration source precedence, and the integration with various server subsystems. + +For information about interpreter framework configuration, see [Interpreter Framework](#2.3). For details about process lifecycle management, see [Process Lifecycle Management](#4.3). + +## Configuration Architecture Overview + +Zeppelin's configuration system is built around a centralized `ZeppelinConfiguration` class that manages all server and interpreter settings. The system supports multiple configuration sources with a clear precedence hierarchy and provides type-safe access to configuration values. + +```mermaid +graph TB + subgraph "Configuration Sources" + EnvVars[Environment Variables
"ZEPPELIN_*"] + SysProps[System Properties
"-Dzeppelin.*"] + XmlFile[XML Configuration
"zeppelin-site.xml"] + end + + subgraph "ZeppelinConfiguration Class" + ConfVars[ConfVars Enum
"Configuration Definitions"] + Properties[Properties Map
"Loaded Values"] + Getters[Type-Safe Getters
"getString(), getInt(), etc."] + end + + subgraph "Server Components" + ZeppelinServer[ZeppelinServer
"Main Application"] + InterpreterMgr[InterpreterSettingManager
"Interpreter Management"] + NotebookServer[NotebookServer
"WebSocket/REST APIs"] + JettyServer[Jetty Configuration
"Web Server Setup"] + end + + EnvVars --> Properties + SysProps --> Properties + XmlFile --> Properties + ConfVars --> Getters + Properties --> Getters + + Getters --> ZeppelinServer + Getters --> InterpreterMgr + Getters --> NotebookServer + Getters --> JettyServer +``` + +Sources: [zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java:62-118](), [conf/zeppelin-site.xml.template:20-830]() + +## Configuration Source Precedence + +The configuration system follows a strict precedence hierarchy where higher-priority sources override lower-priority ones. This allows for flexible deployment scenarios while maintaining predictable behavior. + +```mermaid +graph TD + EnvConfig[EnvironmentConfiguration
"Highest Priority"] + SysConfig[SystemConfiguration
"Medium Priority"] + XmlConfig[XMLConfiguration
"Lowest Priority"] + + EnvConfig --> |"envConfig.containsKey()"| Getter[Configuration Getter Methods] + SysConfig --> |"sysConfig.containsKey()"| Getter + XmlConfig --> |"getStringValue()"| Getter + + Getter --> Result[Final Configuration Value] + + style EnvConfig fill:#f9f9f9 + style SysConfig fill:#f9f9f9 + style XmlConfig fill:#f9f9f9 +``` + +The precedence implementation is handled in methods like `getString()`, `getInt()`, and `getBoolean()`: + +Sources: [zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java:215-223](), [zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java:229-237]() + +## ZeppelinConfiguration Class Architecture + +The `ZeppelinConfiguration` class serves as the central configuration management component, implementing a singleton pattern with lazy initialization and comprehensive type conversion capabilities. + +| Component | Purpose | Key Methods | +|-----------|---------|-------------| +| `ConfVars` enum | Configuration variable definitions | `getVarName()`, `getStringValue()`, `getIntValue()` | +| Properties map | Runtime configuration storage | `setProperty()`, `getStringValue()` | +| Type-safe getters | Type conversion and validation | `getString()`, `getInt()`, `getBoolean()`, `getTime()` | +| File location strategies | Configuration file discovery | `ZeppelinLocationStrategy`, `ClasspathLocationStrategy` | + +```mermaid +graph TB + subgraph "ZeppelinConfiguration Singleton" + ConfVarsEnum[ConfVars Enum
"ZEPPELIN_PORT, ZEPPELIN_SSL, etc."] + PropertiesMap[properties Map<String,String>
"Runtime configuration storage"] + LoadXMLConfig[loadXMLConfig Method
"Parse zeppelin-site.xml"] + TypeGetters[Type-Safe Getters
"getString(), getInt(), getBoolean()"] + end + + subgraph "Configuration Loading" + ZeppelinLocationStrategy[ZeppelinLocationStrategy
"Local file system search"] + ClasspathLocationStrategy[ClasspathLocationStrategy
"Classpath resource search"] + XMLConfiguration[XMLConfiguration
"Apache Commons Config"] + end + + LoadXMLConfig --> ZeppelinLocationStrategy + LoadXMLConfig --> ClasspathLocationStrategy + ZeppelinLocationStrategy --> XMLConfiguration + ClasspathLocationStrategy --> XMLConfiguration + XMLConfiguration --> PropertiesMap + + ConfVarsEnum --> TypeGetters + PropertiesMap --> TypeGetters +``` + +Sources: [zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java:83-118](), [zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java:129-147]() + +## Configuration Categories + +Zeppelin configuration properties are organized into logical categories, each serving specific aspects of the system: + +### Server Configuration +- **Network Settings**: `zeppelin.server.addr`, `zeppelin.server.port`, `zeppelin.server.ssl.port` +- **Context Path**: `zeppelin.server.context.path` +- **Security Headers**: `zeppelin.server.xframe.options`, `zeppelin.server.xxss.protection` + +### Notebook Storage Configuration +- **Storage Backend**: `zeppelin.notebook.storage` (GitNotebookRepo, S3NotebookRepo, etc.) 
+- **Storage Paths**: `zeppelin.notebook.dir`, `zeppelin.notebook.s3.bucket` +- **Synchronization**: `zeppelin.notebook.one.way.sync` + +### Interpreter Configuration +- **Directory Paths**: `zeppelin.interpreter.dir`, `zeppelin.interpreter.localRepo` +- **Connection Settings**: `zeppelin.interpreter.connect.timeout`, `zeppelin.interpreter.output.limit` +- **Deployment Mode**: `zeppelin.run.mode` (auto, local, k8s, docker) + +### SSL and Security Configuration +- **SSL Enable**: `zeppelin.ssl`, `zeppelin.ssl.client.auth` +- **Certificate Paths**: `zeppelin.ssl.keystore.path`, `zeppelin.ssl.truststore.path` +- **PEM Support**: `zeppelin.ssl.pem.key`, `zeppelin.ssl.pem.cert` + +Sources: [conf/zeppelin-site.xml.template:22-830](), [zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java:911-1439]() + +## Configuration Loading Process + +The configuration loading process involves multiple stages, from file discovery to property resolution, with robust error handling and fallback mechanisms. 
+ +```mermaid +sequenceDiagram + participant Client as "Application Startup" + participant ZepConf as "ZeppelinConfiguration" + participant FileStrategy as "File Location Strategy" + participant XMLConfig as "XMLConfiguration" + participant PropsMap as "Properties Map" + + Client->>ZepConf: create() + ZepConf->>ZepConf: new ZeppelinConfiguration(filename) + ZepConf->>ZepConf: loadXMLConfig(filename) + ZepConf->>FileStrategy: Combined strategy search + FileStrategy->>FileStrategy: ZeppelinLocationStrategy + FileStrategy->>FileStrategy: ClasspathLocationStrategy + FileStrategy-->>ZepConf: File location found + ZepConf->>XMLConfig: Build configuration + XMLConfig->>XMLConfig: Parse XML nodes + XMLConfig-->>ZepConf: Property nodes + ZepConf->>PropsMap: setProperty(name, value) + PropsMap-->>ZepConf: Properties stored + ZepConf-->>Client: Configuration instance +``` + +The loading process handles various scenarios: +- Missing configuration files (uses defaults) +- Invalid XML syntax (logs warning, continues with defaults) +- Environment variable overrides +- System property overrides + +Sources: [zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java:92-118](), [zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java:129-147]() + +## Remote Interpreter Configuration + +Remote interpreter processes require special configuration handling, including connection timeouts, port ranges, and environment variable propagation. + +```mermaid +graph TB + subgraph "ZeppelinConfiguration" + InterpTimeout[getTime Method
"ZEPPELIN_INTERPRETER_CONNECT_TIMEOUT"] + InterpPortRange[getInterpreterPortRange
"Port allocation"] + RunMode[getRunMode
"LOCAL, K8S, DOCKER"] + end + + subgraph "InterpreterLauncher" + ConnTimeout[getConnectTimeout
"Timeout resolution"] + ConnPoolSize[getConnectPoolSize
"Connection pooling"] + LaunchContext[InterpreterLaunchContext
"Launch parameters"] + end + + subgraph "Remote Process" + RemoteInterpProcess[RemoteInterpreterProcess
"Process management"] + ThriftClient[Thrift RPC Client
"Communication"] + RecoveryStorage[RecoveryStorage
"State persistence"] + end + + InterpTimeout --> ConnTimeout + InterpPortRange --> LaunchContext + RunMode --> LaunchContext + ConnTimeout --> RemoteInterpProcess + ConnPoolSize --> ThriftClient + LaunchContext --> RemoteInterpProcess + RemoteInterpProcess --> RecoveryStorage +``` + +Key configuration properties for remote interpreters: +- `zeppelin.interpreter.connect.timeout`: Connection timeout with time unit support (e.g., "600s") +- `zeppelin.interpreter.rpc.portRange`: Port range for RPC communication +- `zeppelin.recovery.storage.class`: Recovery storage implementation +- `zeppelin.run.mode`: Deployment mode affecting interpreter launcher selection + +Sources: [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/launcher/InterpreterLauncher.java:56-65](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/recovery/RecoveryUtils.java:74-106]() + +## Environment and Runtime Configuration + +The configuration system integrates with environment variables and runtime settings to support different deployment scenarios, including development, testing, and production environments. + +### Time Unit Configuration +Zeppelin supports time unit suffixes for timeout configurations: +- `ms`: milliseconds +- `s`: seconds +- `m`: minutes +- `h`: hours + +Example: `zeppelin.interpreter.connect.timeout=600s` + +### Runtime Mode Detection +The `zeppelin.run.mode` property supports automatic detection: +- `auto`: Detects Kubernetes vs local environment +- `local`: Forces local interpreter processes +- `k8s`: Forces Kubernetes pod deployment +- `docker`: Forces Docker container deployment + +```mermaid +graph LR + AutoMode[run.mode = "auto"] + K8sCheck["/var/run/secrets/kubernetes.io/
serviceaccount/namespace exists?"] + K8sMode[RUN_MODE.K8S] + LocalMode[RUN_MODE.LOCAL] + + AutoMode --> K8sCheck + K8sCheck -->|Yes| K8sMode + K8sCheck -->|No| LocalMode +``` + +### Configuration Validation and Error Handling +The system includes comprehensive error handling for configuration parsing: +- Invalid numeric values fall back to defaults with warnings +- Missing configuration files proceed with built-in defaults +- Type conversion errors are logged but don't prevent startup + +Sources: [zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java:259-265](), [zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java:840-851](), [zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java:167-177]() diff --git a/.cursor/documentation/server_components/notebook_server_and_apis.md b/.cursor/documentation/server_components/notebook_server_and_apis.md new file mode 100644 index 00000000000..0b9642dd7f0 --- /dev/null +++ b/.cursor/documentation/server_components/notebook_server_and_apis.md @@ -0,0 +1,506 @@ +# Notebook Server and APIs + +
+Relevant source files + +The following files were used as context for generating this wiki page: + +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/scheduler/Job.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/scheduler/Job.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/scheduler/JobWithProgressPoller.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/scheduler/JobWithProgressPoller.java) +- [zeppelin-interpreter/src/test/java/org/apache/zeppelin/scheduler/JobTest.java](zeppelin-interpreter/src/test/java/org/apache/zeppelin/scheduler/JobTest.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/rest/NotebookRestApi.java](zeppelin-server/src/main/java/org/apache/zeppelin/rest/NotebookRestApi.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/service/NotebookService.java](zeppelin-server/src/main/java/org/apache/zeppelin/service/NotebookService.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java](zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/rest/AbstractTestRestApi.java](zeppelin-server/src/test/java/org/apache/zeppelin/rest/AbstractTestRestApi.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/rest/NotebookRestApiTest.java](zeppelin-server/src/test/java/org/apache/zeppelin/rest/NotebookRestApiTest.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/rest/ZeppelinRestApiTest.java](zeppelin-server/src/test/java/org/apache/zeppelin/rest/ZeppelinRestApiTest.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/service/NotebookServiceTest.java](zeppelin-server/src/test/java/org/apache/zeppelin/service/NotebookServiceTest.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/socket/NotebookServerTest.java](zeppelin-server/src/test/java/org/apache/zeppelin/socket/NotebookServerTest.java) +- 
[zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Note.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Note.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Notebook.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Notebook.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Paragraph.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Paragraph.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/helium/HeliumApplicationFactoryTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/helium/HeliumApplicationFactoryTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/NoteTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/NoteTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/NotebookTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/NotebookTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/ParagraphTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/ParagraphTest.java) + +
+ + + +This document covers Zeppelin's notebook server implementation and API layer, which provides both WebSocket and REST interfaces for notebook operations. The notebook server handles real-time communication between the frontend and backend, managing notebook execution, collaborative editing, and state synchronization. For information about the underlying notebook storage and persistence mechanisms, see [Storage and Persistence](#2.4). For details about interpreter execution and management, see [Interpreter Framework](#2.3). + +## Overview + +The notebook server consists of two primary communication interfaces: + +- **WebSocket Server** (`NotebookServer`) - Real-time bidirectional communication for interactive operations like paragraph execution, collaborative editing, and live updates +- **REST API** (`NotebookRestApi`) - HTTP endpoints for CRUD operations on notebooks, permissions management, and file operations + +Both interfaces operate on the same underlying data models (`Note`, `Paragraph`) and utilize shared services (`NotebookService`, `AuthorizationService`) for business logic and security. + +## WebSocket Server Architecture + +The `NotebookServer` class serves as the primary WebSocket endpoint, implementing multiple listener interfaces to handle different types of events and providing real-time communication capabilities. 
+ +```mermaid +graph TB + subgraph "WebSocket Layer" + WS[NotebookServer] + CM[ConnectionManager] + NS[NotebookSocket] + end + + subgraph "Listener Interfaces" + AOL[AngularObjectRegistryListener] + RIPL[RemoteInterpreterProcessListener] + AEL[ApplicationEventListener] + PJL[ParagraphJobListener] + NEL[NoteEventListener] + CEL[ClusterEventListener] + end + + subgraph "Service Layer" + NBS[NotebookService] + AUS[AuthorizationService] + CS[ConfigurationService] + JMS[JobManagerService] + end + + subgraph "Core Models" + NOTE[Note] + PARA[Paragraph] + end + + WS -.implements.-> AOL + WS -.implements.-> RIPL + WS -.implements.-> AEL + WS -.implements.-> PJL + WS -.implements.-> NEL + WS -.implements.-> CEL + + WS --> CM + CM --> NS + WS --> NBS + WS --> AUS + WS --> CS + WS --> JMS + + NBS --> NOTE + NOTE --> PARA +``` + +Sources: [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java:118-123](), [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java:156-164]() + +### Message Handling + +The WebSocket server processes messages through a centralized `onMessage` method that deserializes incoming messages and routes them to appropriate handlers based on the operation type. 
+ +```mermaid +graph LR + CLIENT[Client] -->|WebSocket Message| WS[NotebookServer.onMessage] + WS --> DM[deserializeMessage] + DM --> SWITCH[Switch on OP] + + SWITCH --> LN[LIST_NOTES] + SWITCH --> GN[GET_NOTE] + SWITCH --> RP[RUN_PARAGRAPH] + SWITCH --> UP[COMMIT_PARAGRAPH] + SWITCH --> CN[NEW_NOTE] + SWITCH --> DN[DEL_NOTE] + + LN --> LNI[listNotesInfo] + GN --> GNT[getNote] + RP --> RPG[runParagraph] + UP --> UPD[updateParagraph] + CN --> CNT[createNote] + DN --> DNT[deleteNote] +``` + +Sources: [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java:280-500]() + +Key message operations include: + +| Operation | Handler Method | Description | +|-----------|---------------|-------------| +| `LIST_NOTES` | `listNotesInfo` | Retrieve available notebooks | +| `GET_NOTE` | `getNote` | Load specific notebook content | +| `RUN_PARAGRAPH` | `runParagraph` | Execute paragraph code | +| `COMMIT_PARAGRAPH` | `updateParagraph` | Save paragraph changes | +| `NEW_NOTE` | `createNote` | Create new notebook | +| `DEL_NOTE` | `deleteNote` | Remove notebook | +| `ANGULAR_OBJECT_UPDATED` | `angularObjectUpdated` | Sync UI state | + +Sources: [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java:330-491]() + +### Connection Management + +The `ConnectionManager` handles WebSocket connections, user sessions, and broadcasting capabilities. Connections are tracked both globally and per-notebook for efficient message distribution. 
+ +```mermaid +graph TB + subgraph "Connection Tracking" + SESSMAP[sessionIdNotebookSocketMap] + CONNMGR[ConnectionManager] + USERCONN[User Connections] + NOTECONN[Note Connections] + end + + subgraph "Broadcasting" + BCAST[broadcast] + UCAST[unicast] + MCAST[multicast] + end + + WS[NotebookServer] --> SESSMAP + WS --> CONNMGR + CONNMGR --> USERCONN + CONNMGR --> NOTECONN + + CONNMGR --> BCAST + CONNMGR --> UCAST + CONNMGR --> MCAST +``` + +Sources: [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java:157](), [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java:202-208]() + +## REST API Structure + +The `NotebookRestApi` provides HTTP endpoints for notebook management operations, implementing standard REST patterns with proper authentication and authorization. + +```mermaid +graph TB + subgraph "REST Endpoints" + ROOT["/notebook"] + NOTEID["/notebook/{noteId}"] + PERMS["/notebook/{noteId}/permissions"] + REV["/notebook/{noteId}/revision"] + PARA["/notebook/{noteId}/paragraph/{paragraphId}"] + end + + subgraph "HTTP Methods" + GET[GET] + POST[POST] + PUT[PUT] + DELETE[DELETE] + end + + subgraph "Operations" + LIST[List Notes] + CREATE[Create Note] + READ[Read Note] + UPDATE[Update Note] + DEL[Delete Note] + RUNP[Run Paragraph] + SETPERM[Set Permissions] + end + + ROOT --> GET + ROOT --> POST + NOTEID --> GET + NOTEID --> PUT + NOTEID --> DELETE + PERMS --> GET + PERMS --> PUT + PARA --> POST + + GET --> LIST + GET --> READ + POST --> CREATE + POST --> RUNP + PUT --> UPDATE + PUT --> SETPERM + DELETE --> DEL +``` + +Sources: [zeppelin-server/src/main/java/org/apache/zeppelin/rest/NotebookRestApi.java:75-78](), [zeppelin-server/src/main/java/org/apache/zeppelin/rest/NotebookRestApi.java:311-317]() + +### Key REST Endpoints + +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/notebook` | GET | List all accessible notebooks | +| `/notebook` | POST | Create new notebook | +| 
`/notebook/{noteId}` | GET | Retrieve notebook content | +| `/notebook/{noteId}` | PUT | Update notebook metadata | +| `/notebook/{noteId}` | DELETE | Delete notebook | +| `/notebook/{noteId}/permissions` | GET/PUT | Manage permissions | +| `/notebook/{noteId}/paragraph/{paragraphId}` | POST | Execute paragraph | +| `/notebook/{noteId}/revision` | GET/POST | Version control operations | + +Sources: [zeppelin-server/src/main/java/org/apache/zeppelin/rest/NotebookRestApi.java:118-131](), [zeppelin-server/src/main/java/org/apache/zeppelin/rest/NotebookRestApi.java:327-334]() + +## Core Data Models + +### Note Model + +The `Note` class represents a complete notebook document containing metadata, configuration, and a collection of paragraphs. + +```mermaid +graph TB + subgraph "Note Structure" + NOTE[Note] + META[Metadata] + CONFIG[Configuration] + PARAS[Paragraphs] + FORMS[Note Forms] + PARAMS[Note Parameters] + end + + subgraph "Note Operations" + RUN[runAll] + SAVE[save] + CLONE[clone] + MOVE[move] + EXEC[execute] + end + + subgraph "Paragraph Management" + ADD[addNewParagraph] + REMOVE[removeParagraph] + MOVEPAR[moveParagraph] + CLEAR[clearParagraphOutput] + end + + NOTE --> META + NOTE --> CONFIG + NOTE --> PARAS + NOTE --> FORMS + NOTE --> PARAMS + + NOTE --> RUN + NOTE --> SAVE + NOTE --> CLONE + NOTE --> MOVE + NOTE --> EXEC + + NOTE --> ADD + NOTE --> REMOVE + NOTE --> MOVEPAR + NOTE --> CLEAR +``` + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Note.java:125-150](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Note.java:452-533]() + +Key Note properties: +- `id`: Unique identifier generated by `IdHashes.generateId()` +- `name`: Display name derived from path +- `path`: Hierarchical location in notebook tree +- `paragraphs`: List of executable code cells +- `defaultInterpreterGroup`: Default interpreter for new paragraphs +- `config`: Notebook-level configuration including cron settings +- `noteParams`/`noteForms`: 
Shared parameters and form inputs + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Note.java:125-150]() + +### Paragraph Model + +The `Paragraph` class represents individual executable code cells within notebooks, extending `JobWithProgressPoller` to support asynchronous execution with progress tracking. + +```mermaid +graph TB + subgraph "Paragraph Structure" + PARA[Paragraph] + JOB[JobWithProgressPoller] + TEXT[text] + TITLE[title] + CONFIG[config] + RESULTS[results] + STATUS[status] + end + + subgraph "Execution Flow" + PARSE[parseText] + BIND[getBindedInterpreter] + EXEC[execute] + JOBRUN[jobRun] + SETRES[setResult] + end + + subgraph "Text Parsing" + INTPTEXT[intpText] + SCRIPTTEXT[scriptText] + LOCALPROPS[localProperties] + end + + PARA --> JOB + PARA --> TEXT + PARA --> TITLE + PARA --> CONFIG + PARA --> RESULTS + PARA --> STATUS + + PARA --> PARSE + PARSE --> INTPTEXT + PARSE --> SCRIPTTEXT + PARSE --> LOCALPROPS + + PARA --> BIND + PARA --> EXEC + EXEC --> JOBRUN + JOBRUN --> SETRES +``` + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Paragraph.java:69-101](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Paragraph.java:180-195](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Paragraph.java:332-392]() + +Key Paragraph features: +- **Text Parsing**: Separates interpreter directive (`%interpreter`) from script content +- **Async Execution**: Implements job scheduling with status tracking and progress updates +- **Results Management**: Stores `InterpreterResult` with typed output messages +- **Personalized Mode**: Supports per-user paragraph instances for collaborative editing +- **Form Integration**: Handles dynamic form inputs and parameter substitution + +Sources: [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Paragraph.java:186-195](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Paragraph.java:332-392]() + +## Service Layer Architecture + +### 
NotebookService + +The `NotebookService` provides high-level business logic for notebook operations, abstracting the complexity of permission checking, validation, and coordination between different components. + +```mermaid +graph TB + subgraph "NotebookService Operations" + NS[NotebookService] + CREATE[createNote] + GET[getNote] + LIST[listNotesInfo] + RENAME[renameNote] + CLONE[cloneNote] + REMOVE[removeNote] + RUNPARA[runParagraph] + end + + subgraph "Permission Checking" + CHECKREAD[checkPermission READER] + CHECKWRITE[checkPermission WRITER] + CHECKOWNER[checkPermission OWNER] + CHECKRUN[checkPermission RUNNER] + end + + subgraph "Validation" + NORMALIZE[normalizeNotePath] + VALIDATE[validateInput] + SANITIZE[sanitizeContent] + end + + subgraph "Backend Integration" + NOTEBOOK[Notebook] + AUTHSVC[AuthorizationService] + SCHEDULER[SchedulerService] + end + + NS --> CREATE + NS --> GET + NS --> LIST + NS --> RENAME + NS --> CLONE + NS --> REMOVE + NS --> RUNPARA + + CREATE --> CHECKWRITE + GET --> CHECKREAD + RENAME --> CHECKOWNER + REMOVE --> CHECKOWNER + RUNPARA --> CHECKRUN + + NS --> NORMALIZE + NS --> VALIDATE + NS --> SANITIZE + + NS --> NOTEBOOK + NS --> AUTHSVC + NS --> SCHEDULER +``` + +Sources: [zeppelin-server/src/main/java/org/apache/zeppelin/service/NotebookService.java:80-101](), [zeppelin-server/src/main/java/org/apache/zeppelin/service/NotebookService.java:191-221]() + +The service layer implements the callback pattern for asynchronous operations, calling `ServiceCallback.onSuccess()` or `ServiceCallback.onFailure()` based on operation results. + +Sources: [zeppelin-server/src/main/java/org/apache/zeppelin/service/NotebookService.java:75-77]() + +## Message Flow and Broadcasting + +The notebook server implements sophisticated message broadcasting to keep all connected clients synchronized with notebook state changes. 
+ +```mermaid +sequenceDiagram + participant C1 as "Client 1" + participant C2 as "Client 2" + participant NS as "NotebookServer" + participant NBS as "NotebookService" + participant NOTE as "Note" + participant PARA as "Paragraph" + + C1->>NS: "RUN_PARAGRAPH message" + NS->>NBS: "runParagraph()" + NBS->>NOTE: "run(paragraphId)" + NOTE->>PARA: "execute()" + PARA->>PARA: "jobRun()" + PARA->>NOTE: "result ready" + NOTE->>NS: "onParagraphFinish()" + NS->>NS: "broadcastParagraph()" + NS->>C1: "PARAGRAPH update" + NS->>C2: "PARAGRAPH update" +``` + +Sources: [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java:385-387](), [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java:669-672]() + +### Broadcasting Patterns + +The system supports multiple broadcasting patterns: + +| Method | Scope | Description | +|--------|-------|-------------| +| `broadcast()` | All connections | Send to all connected clients | +| `unicast()` | Single connection | Send to specific client | +| `multicastToUser()` | User connections | Send to all sessions for a user | +| `broadcastNote()` | Note subscribers | Send to clients viewing a note | + +Sources: [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java:544-546](), [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java:648-656]() + +## Authentication and Authorization Integration + +Both WebSocket and REST interfaces integrate with Zeppelin's authentication and authorization system to enforce permissions at multiple levels. 
+ +```mermaid +graph TB + subgraph "Security Layers" + TICKET[TicketContainer] + AUTH[AuthenticationInfo] + AUTHZ[AuthorizationService] + PERMS[Permissions] + end + + subgraph "Permission Types" + OWNER[OWNER] + WRITER[WRITER] + READER[READER] + RUNNER[RUNNER] + end + + subgraph "Enforcement Points" + WSCHECK[WebSocket Message Check] + RESTCHECK[REST Endpoint Check] + SERVICECHECK[Service Layer Check] + end + + TICKET --> AUTH + AUTH --> AUTHZ + AUTHZ --> PERMS + + PERMS --> OWNER + PERMS --> WRITER + PERMS --> READER + PERMS --> RUNNER + + WSCHECK --> TICKET + RESTCHECK --> AUTH + SERVICECHECK --> AUTHZ +``` + +Sources: [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java:294-308](), [zeppelin-server/src/main/java/org/apache/zeppelin/rest/NotebookRestApi.java:154-196]() + +The authentication flow validates tickets for WebSocket connections and uses standard HTTP authentication for REST calls, while authorization checks are performed at the service layer using `AuthorizationService.checkPermission()`. + +Sources: [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java:294-308](), [zeppelin-server/src/main/java/org/apache/zeppelin/service/NotebookService.java:110-120]() diff --git a/.cursor/documentation/server_components/process_lifecycle_management.md b/.cursor/documentation/server_components/process_lifecycle_management.md new file mode 100644 index 00000000000..83e0327b3b9 --- /dev/null +++ b/.cursor/documentation/server_components/process_lifecycle_management.md @@ -0,0 +1,361 @@ +# Process Lifecycle Management + +
+Relevant source files + +The following files were used as context for generating this wiki page: + +- [bin/common.cmd](bin/common.cmd) +- [bin/common.sh](bin/common.sh) +- [bin/functions.sh](bin/functions.sh) +- [bin/interpreter.sh](bin/interpreter.sh) +- [bin/zeppelin-daemon.sh](bin/zeppelin-daemon.sh) +- [bin/zeppelin.sh](bin/zeppelin.sh) +- [conf/zeppelin-env.cmd.template](conf/zeppelin-env.cmd.template) +- [conf/zeppelin-env.sh.template](conf/zeppelin-env.sh.template) +- [zeppelin-server/src/main/java/org/apache/zeppelin/server/ImmediateErrorHandlerImpl.java](zeppelin-server/src/main/java/org/apache/zeppelin/server/ImmediateErrorHandlerImpl.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/socket/SessionConfigurator.java](zeppelin-server/src/main/java/org/apache/zeppelin/socket/SessionConfigurator.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/utils/TestUtils.java](zeppelin-server/src/main/java/org/apache/zeppelin/utils/TestUtils.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/recovery/RecoveryTest.java](zeppelin-server/src/test/java/org/apache/zeppelin/recovery/RecoveryTest.java) + +
+ + + +This document describes the shell scripts, daemon management, and interpreter process lifecycle in Apache Zeppelin. It covers how Zeppelin manages the lifecycle of both the main server process and remote interpreter processes through a collection of shell scripts and Java components. + +For information about the interpreter framework architecture, see [Interpreter Framework](#2.3). For details about remote interpreter infrastructure, see [Remote Interpreter Infrastructure](#5.1). + +## Overview + +Zeppelin's process lifecycle management consists of three main components: + +1. **Server Process Management** - Managing the main Zeppelin server daemon +2. **Interpreter Process Management** - Launching and managing remote interpreter processes +3. **Environment Configuration** - Setting up runtime environments for all processes + +The system uses a combination of shell scripts, Java classes, and configuration files to orchestrate process creation, monitoring, and termination across different deployment scenarios. + +```mermaid +graph TB + subgraph "Shell Script Layer" + ZeppelinDaemon["zeppelin-daemon.sh
Server Lifecycle"] + ZeppelinSh["zeppelin.sh
Direct Server Launch"] + InterpreterSh["interpreter.sh
Interpreter Launcher"] + CommonSh["common.sh
Shared Utilities"] + end + + subgraph "Java Process Layer" + ZeppelinServer["ZeppelinServer
Main Process"] + RemoteInterpreterServer["RemoteInterpreterServer
Interpreter Processes"] + end + + subgraph "Configuration Layer" + ZeppelinEnv["zeppelin-env.sh
Environment Config"] + ZeppelinSite["zeppelin-site.xml
Server Config"] + end + + subgraph "Process Management" + PidFiles["PID Files
/run/*.pid"] + LogFiles["Log Files
/logs/*.log"] + RecoveryStorage["Recovery Storage
Process State"] + end + + ZeppelinDaemon --> ZeppelinServer + ZeppelinSh --> ZeppelinServer + InterpreterSh --> RemoteInterpreterServer + + CommonSh --> ZeppelinDaemon + CommonSh --> ZeppelinSh + CommonSh --> InterpreterSh + + ZeppelinEnv --> CommonSh + ZeppelinSite --> ZeppelinServer + + ZeppelinServer --> PidFiles + ZeppelinServer --> LogFiles + RemoteInterpreterServer --> RecoveryStorage +``` + +Sources: [bin/zeppelin-daemon.sh:1-281](), [bin/zeppelin.sh:1-143](), [bin/interpreter.sh:1-300](), [bin/common.sh:1-177]() + +## Shell Script Infrastructure + +### Common Utilities (`common.sh`) + +The `common.sh` script provides shared functionality used by all other scripts. It establishes the foundational environment for Zeppelin processes. + +**Key Functions:** +- **Environment Setup**: Defines `ZEPPELIN_HOME`, `ZEPPELIN_CONF_DIR`, `ZEPPELIN_LOG_DIR`, and `ZEPPELIN_PID_DIR` +- **Java Configuration**: Sets up `JAVA_OPTS` and `ZEPPELIN_RUNNER` based on available Java installation +- **Classpath Management**: Provides functions like `addJarInDir()` and `addEachJarInDir()` for building classpaths +- **Version Checking**: Implements `check_java_version()` to ensure Java 8+ compatibility + +```mermaid +graph LR + subgraph "Environment Variables" + ZeppelinHome["ZEPPELIN_HOME
Base Directory"] + ZeppelinConfDir["ZEPPELIN_CONF_DIR
Configuration"] + ZeppelinLogDir["ZEPPELIN_LOG_DIR
Log Output"] + ZeppelinPidDir["ZEPPELIN_PID_DIR
Process IDs"] + end + + subgraph "Java Configuration" + JavaHome["JAVA_HOME
Java Installation"] + ZeppelinRunner["ZEPPELIN_RUNNER
Java Executable"] + JavaOpts["JAVA_OPTS
JVM Options"] + end + + subgraph "Classpath Functions" + AddJarInDir["addJarInDir()
Add Directory JARs"] + AddEachJarInDir["addEachJarInDir()
Add Individual JARs"] + CheckJavaVersion["check_java_version()
Validate Java"] + end + + ZeppelinHome --> ZeppelinConfDir + ZeppelinHome --> ZeppelinLogDir + ZeppelinHome --> ZeppelinPidDir + + JavaHome --> ZeppelinRunner + ZeppelinRunner --> JavaOpts +``` + +Sources: [bin/common.sh:25-42](), [bin/common.sh:68-83](), [bin/common.sh:85-119]() + +### Environment Configuration + +The environment is configured through `zeppelin-env.sh` (created from the template) and various environment variables. + +**Key Configuration Areas:** +- **Memory Settings**: `ZEPPELIN_MEM`, `ZEPPELIN_INTP_MEM` for server and interpreter memory +- **Java Options**: `ZEPPELIN_JAVA_OPTS`, `ZEPPELIN_INTP_JAVA_OPTS` for JVM tuning +- **Hadoop Integration**: `HADOOP_CONF_DIR`, `USE_HADOOP` for Hadoop ecosystem support +- **Spark Configuration**: `SPARK_HOME`, `SPARK_SUBMIT_OPTIONS` for Spark integration + +Sources: [conf/zeppelin-env.sh.template:19-113](), [bin/common.sh:140-164]() + +## Server Daemon Management + +### Daemon Operations (`zeppelin-daemon.sh`) + +The `zeppelin-daemon.sh` script manages the Zeppelin server as a system daemon with support for standard lifecycle operations. 
+ +**Supported Operations:** +- `start` - Launch server daemon with PID tracking +- `stop` - Gracefully shutdown server with timeout +- `restart` - Stop and start sequence +- `status` - Check if server process is running +- `upstart` - Run as managed service (no daemon fork) + +```mermaid +stateDiagram-v2 + [*] --> Stopped + + Stopped --> Starting : start command + Starting --> Running : process launched + Starting --> Failed : startup error + + Running --> Stopping : stop command + Running --> Failed : process died + + Stopping --> Stopped : graceful shutdown + Stopping --> Killed : force kill timeout + + Failed --> Stopped : cleanup + Killed --> Stopped : cleanup + + Running --> Running : status check + Stopped --> Stopped : status check +``` + +**Process Management Functions:** +- `start()` - Creates PID file, launches server with nohup, monitors startup +- `stop()` - Reads PID, sends TERM signal, waits for graceful shutdown, force kills if needed +- `wait_for_zeppelin_to_die()` - Implements timeout-based process termination +- `check_if_process_is_alive()` - Validates process health using PID + +Sources: [bin/zeppelin-daemon.sh:188-216](), [bin/zeppelin-daemon.sh:218-234](), [bin/zeppelin-daemon.sh:117-142](), [bin/zeppelin-daemon.sh:166-174]() + +### Direct Server Launch (`zeppelin.sh`) + +The `zeppelin.sh` script provides direct server execution without daemon functionality, useful for development and containerized environments. 
+ +**Key Features:** +- Supports `--config` for custom configuration directory +- Supports `--run <noteId>` for automated notebook execution +- Uses `exec` for direct process replacement (no PID management) +- Shares classpath construction logic with daemon script + +Sources: [bin/zeppelin.sh:45-87](), [bin/zeppelin.sh:94-142]() + +## Interpreter Process Lifecycle + +### Interpreter Launcher (`interpreter.sh`) + +The `interpreter.sh` script is responsible for launching remote interpreter processes with complex environment setup and configuration management. + +**Command Line Arguments:** +- `-p <port>` - Callback port for server communication +- `-r <intp_port>` - Interpreter process port +- `-d <interpreter_dir>` - Interpreter directory to load +- `-l <local_repo_dir>` - Local interpreter repository +- `-g <group_name>` - Interpreter group name +- `-u <user>` - User impersonation +- `-c <callback_host>` - Callback host for server + +```mermaid +sequenceDiagram + participant ISM as "InterpreterSettingManager" + participant Launcher as "ProcessLauncher" + participant Script as "interpreter.sh" + participant Process as "RemoteInterpreterServer" + + ISM->>Launcher: launch(interpreterDir, port, groupId) + Launcher->>Script: execute with args + + Script->>Script: parse arguments + Script->>Script: setup environment + Script->>Script: download dependencies + Script->>Script: construct classpath + Script->>Script: handle user impersonation + Script->>Script: build launch command + + alt Spark Interpreter + Script->>Process: spark-submit --class RemoteInterpreterServer + else Other Interpreters + Script->>Process: java -cp ... RemoteInterpreterServer + end + + Process->>ISM: callback connection established +``` + +**Environment Setup Process:** +1. **Container Detection**: Checks if running in container and sets up user entry +2. **Argument Parsing**: Processes command line options using `getopts` +3. **Java Validation**: Calls `check_java_version()` from `common.sh` +4. **Classpath Construction**: Builds interpreter-specific classpath +5. 
**Dependency Download**: Downloads interpreter libraries if needed +6. **Interpreter-Specific Setup**: Configures environment for Spark, Flink, HBase, etc. +7. **User Impersonation**: Sets up the user-switching command (SSH or a custom `ZEPPELIN_IMPERSONATE_CMD`) if impersonation is enabled +8. **Launch Command Construction**: Builds final execution command +9. **Process Execution**: Uses `exec` to replace shell with interpreter process + +Sources: [bin/interpreter.sh:36-57](), [bin/interpreter.sh:59-94](), [bin/interpreter.sh:97-135](), [bin/interpreter.sh:278-299]() + +### Interpreter-Specific Configuration + +Different interpreter types require specialized environment setup: + +**Spark Interpreter:** +- Sets up `SPARK_HOME` and `SPARK_SUBMIT` variables +- Configures `PYTHONPATH` for PySpark support +- Handles Kerberos authentication with `kinit` +- Manages Hadoop configuration directory +- Uses `spark-submit` for launching instead of direct Java execution + +**Flink Interpreter:** +- Adds Flink JAR files to classpath recursively +- Handles Flink Python JAR inclusion +- Supports application mode execution with `flink run-application` +- Manages Hadoop classpath integration + +**HBase Interpreter:** +- Configures `HBASE_CONF_DIR` or `HBASE_HOME` +- Adds HBase configuration to classpath + +Sources: [bin/interpreter.sh:165-184](), [bin/interpreter.sh:230-261](), [bin/interpreter.sh:200-207]() + +## Process Recovery and Monitoring + +### Recovery Storage + +Zeppelin implements process recovery through the recovery storage system to handle server restarts and interpreter process failures. + +**Recovery Components:** +- `FileSystemRecoveryStorage` - Stores interpreter state to filesystem +- Recovery directory (`ZEPPELIN_RECOVERY_DIR`) - Contains process state files +- `StopInterpreter` utility - Cleanly terminates orphaned interpreter processes + +```mermaid +graph TB + subgraph "Recovery System" + RecoveryStorage["FileSystemRecoveryStorage
State Persistence"] + RecoveryDir["Recovery Directory
Process State Files"] + StopInterpreter["StopInterpreter
Cleanup Utility"] + end + + subgraph "Process Monitoring" + PidTracking["PID File Tracking
zeppelin-*.pid"] + ProcessHealth["Process Health Checks
kill -0 PID"] + LogMonitoring["Log File Monitoring
*.log files"] + end + + subgraph "Recovery Scenarios" + ServerRestart["Server Restart
Restore Interpreter State"] + InterpreterCrash["Interpreter Crash
Clean Recovery"] + OrphanedProcess["Orphaned Processes
Cleanup on Startup"] + end + + RecoveryStorage --> RecoveryDir + RecoveryDir --> ServerRestart + StopInterpreter --> OrphanedProcess + PidTracking --> ProcessHealth + ProcessHealth --> InterpreterCrash +``` + +**Recovery Process:** +1. **State Persistence**: Interpreter groups save state to recovery storage +2. **Server Startup**: Recovery system checks for existing interpreter processes +3. **Process Validation**: Attempts to reconnect to running interpreters +4. **Cleanup**: Terminates orphaned or invalid processes +5. **State Restoration**: Restores interpreter group state where possible + +Sources: [zeppelin-server/src/test/java/org/apache/zeppelin/recovery/RecoveryTest.java:61-79](), [zeppelin-server/src/test/java/org/apache/zeppelin/recovery/RecoveryTest.java:240-242]() + +### PID and Log Management + +Process lifecycle management relies on PID files and log files for monitoring and control. + +**PID File Structure:** +- Server PID: `${ZEPPELIN_PID_DIR}/zeppelin-${ZEPPELIN_IDENT_STRING}-${HOSTNAME}.pid` +- Interpreter PID: `${ZEPPELIN_PID_DIR}/zeppelin-interpreter-${INTP_GROUP_ID}-${ZEPPELIN_IDENT_STRING}-${HOSTNAME}-${PORT}.pid` + +**Log File Structure:** +- Server Log: `${ZEPPELIN_LOG_DIR}/zeppelin-${ZEPPELIN_IDENT_STRING}-${HOSTNAME}.log` +- Interpreter Log: `${ZEPPELIN_LOG_DIR}/zeppelin-interpreter-${INTERPRETER_GROUP_ID}-${USER}-${HOSTNAME}.log` + +Sources: [bin/zeppelin-daemon.sh:53](), [bin/interpreter.sh:132](), [bin/zeppelin-daemon.sh:51](), [bin/interpreter.sh:142-156]() + +## Advanced Process Management + +### User Impersonation + +Zeppelin supports running interpreter processes as different users through the impersonation system. 
+ +**Configuration:** +- `ZEPPELIN_IMPERSONATE_USER` - Target user for impersonation +- `ZEPPELIN_IMPERSONATE_CMD` - Custom impersonation command +- `ZEPPELIN_IMPERSONATE_SPARK_PROXY_USER` - Spark-specific proxy user settings + +**Implementation:** +- Uses SSH or custom commands to switch user context +- Maintains environment variable inheritance +- Supports both individual interpreter and Spark-specific impersonation modes + +Sources: [bin/interpreter.sh:144-150](), [bin/interpreter.sh:266-276](), [conf/zeppelin-env.sh.template:110-113]() + +### Container Support + +The scripts include special handling for containerized environments: + +**Container Detection:** +- Checks `/proc/self/cgroup` for container indicators +- Uses `getent` to validate user entries +- Automatically creates user entries in `/etc/passwd` if missing + +**Container-Specific Adaptations:** +- Handles anonymous UID scenarios +- Manages container-specific logging configurations +- Adapts PID management for container lifecycle + +Sources: [bin/interpreter.sh:36-57](), [bin/zeppelin.sh:22-43]() diff --git a/.cursor/documentation/server_components/server_components.md b/.cursor/documentation/server_components/server_components.md new file mode 100644 index 00000000000..519ffb8c6c8 --- /dev/null +++ b/.cursor/documentation/server_components/server_components.md @@ -0,0 +1,416 @@ +# Server Components + +
+Relevant source files + +The following files were used as context for generating this wiki page: + +- [bin/common.cmd](bin/common.cmd) +- [bin/common.sh](bin/common.sh) +- [bin/functions.sh](bin/functions.sh) +- [bin/interpreter.sh](bin/interpreter.sh) +- [bin/zeppelin-daemon.sh](bin/zeppelin-daemon.sh) +- [bin/zeppelin.sh](bin/zeppelin.sh) +- [conf/zeppelin-env.cmd.template](conf/zeppelin-env.cmd.template) +- [conf/zeppelin-env.sh.template](conf/zeppelin-env.sh.template) +- [conf/zeppelin-site.xml.template](conf/zeppelin-site.xml.template) +- [docs/setup/operation/configuration.md](docs/setup/operation/configuration.md) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/launcher/InterpreterLauncher.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/launcher/InterpreterLauncher.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/lifecycle/TimeoutLifecycleManager.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/interpreter/lifecycle/TimeoutLifecycleManager.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/scheduler/Job.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/scheduler/Job.java) +- [zeppelin-interpreter/src/main/java/org/apache/zeppelin/scheduler/JobWithProgressPoller.java](zeppelin-interpreter/src/main/java/org/apache/zeppelin/scheduler/JobWithProgressPoller.java) +- [zeppelin-interpreter/src/test/java/org/apache/zeppelin/conf/ZeppelinConfigurationTest.java](zeppelin-interpreter/src/test/java/org/apache/zeppelin/conf/ZeppelinConfigurationTest.java) +- [zeppelin-interpreter/src/test/java/org/apache/zeppelin/scheduler/JobTest.java](zeppelin-interpreter/src/test/java/org/apache/zeppelin/scheduler/JobTest.java) +- 
[zeppelin-server/src/main/java/org/apache/zeppelin/rest/NotebookRestApi.java](zeppelin-server/src/main/java/org/apache/zeppelin/rest/NotebookRestApi.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/server/ImmediateErrorHandlerImpl.java](zeppelin-server/src/main/java/org/apache/zeppelin/server/ImmediateErrorHandlerImpl.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/server/ZeppelinServer.java](zeppelin-server/src/main/java/org/apache/zeppelin/server/ZeppelinServer.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/service/NotebookService.java](zeppelin-server/src/main/java/org/apache/zeppelin/service/NotebookService.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java](zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/socket/SessionConfigurator.java](zeppelin-server/src/main/java/org/apache/zeppelin/socket/SessionConfigurator.java) +- [zeppelin-server/src/main/java/org/apache/zeppelin/utils/TestUtils.java](zeppelin-server/src/main/java/org/apache/zeppelin/utils/TestUtils.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/recovery/RecoveryTest.java](zeppelin-server/src/test/java/org/apache/zeppelin/recovery/RecoveryTest.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/rest/AbstractTestRestApi.java](zeppelin-server/src/test/java/org/apache/zeppelin/rest/AbstractTestRestApi.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/rest/NotebookRestApiTest.java](zeppelin-server/src/test/java/org/apache/zeppelin/rest/NotebookRestApiTest.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/rest/ZeppelinRestApiTest.java](zeppelin-server/src/test/java/org/apache/zeppelin/rest/ZeppelinRestApiTest.java) +- [zeppelin-server/src/test/java/org/apache/zeppelin/service/NotebookServiceTest.java](zeppelin-server/src/test/java/org/apache/zeppelin/service/NotebookServiceTest.java) +- 
[zeppelin-server/src/test/java/org/apache/zeppelin/socket/NotebookServerTest.java](zeppelin-server/src/test/java/org/apache/zeppelin/socket/NotebookServerTest.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/recovery/RecoveryUtils.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/recovery/RecoveryUtils.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreter.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreter.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Note.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Note.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Notebook.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Notebook.java) +- [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Paragraph.java](zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Paragraph.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/conf/ZeppelinConfigurationTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/conf/ZeppelinConfigurationTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/helium/HeliumApplicationFactoryTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/helium/HeliumApplicationFactoryTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/launcher/StandardInterpreterLauncherTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/launcher/StandardInterpreterLauncherTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/lifecycle/TimeoutLifecycleManagerTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/lifecycle/TimeoutLifecycleManagerTest.java) +- 
[zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/interpreter/remote/RemoteInterpreterTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/NoteTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/NoteTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/NotebookTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/NotebookTest.java) +- [zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/ParagraphTest.java](zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/ParagraphTest.java) + +
+ + + +This document covers the server-side components of Apache Zeppelin that handle web requests, manage notebook operations, and coordinate with interpreter processes. These components form the backend infrastructure that serves the web UI and manages the execution environment. + +For information about the interpreter framework and remote process management, see [Interpreter Framework](#2.3). For details about the web frontend, see [Web Interface](#3). + +## Main Server Architecture + +The Zeppelin server is built around a Jetty-based web server with dependency injection provided by HK2. The main entry point is the `ZeppelinServer` class, which bootstraps all server components and manages the application lifecycle. + +```mermaid +graph TB + ZeppelinServer["ZeppelinServer
(main entry point)"] + JettyServer["Jetty Web Server
(HTTP/HTTPS/WebSocket)"] + ServiceLocator["HK2 ServiceLocator
(Dependency Injection)"] + + subgraph "Web Contexts" + DefaultWebApp["Default Web Context
(zeppelin-web)"] + NextWebApp["Next Web Context
(zeppelin-web-angular)"] + end + + subgraph "Core Services" + NotebookSvc["NotebookService"] + AuthSvc["AuthorizationService"] + InterpMgr["InterpreterSettingManager"] + ConfigSvc["ConfigurationService"] + end + + subgraph "Communication Endpoints" + NotebookServer["NotebookServer
(WebSocket /ws)"] + RestAPI["REST APIs
(/api/notebook, etc.)"] + end + + ZeppelinServer --> JettyServer + ZeppelinServer --> ServiceLocator + JettyServer --> DefaultWebApp + JettyServer --> NextWebApp + JettyServer --> NotebookServer + JettyServer --> RestAPI + + ServiceLocator --> NotebookSvc + ServiceLocator --> AuthSvc + ServiceLocator --> InterpMgr + ServiceLocator --> ConfigSvc + + RestAPI --> NotebookSvc + NotebookServer --> NotebookSvc +``` + +The server startup process initializes metrics collection, configures SSL if enabled, sets up cluster mode for distributed deployments, and registers all service components through dependency injection. + +**Sources:** [zeppelin-server/src/main/java/org/apache/zeppelin/server/ZeppelinServer.java:134-352]() + +## WebSocket Communication Layer + +The `NotebookServer` class provides real-time bidirectional communication between the web frontend and server using WebSockets. It handles connection management, message routing, and real-time updates for collaborative editing. + +```mermaid +graph LR + subgraph "Frontend" + WebUI["Web UI
(Angular.js)"] + end + + subgraph "WebSocket Layer" + WSEndpoint["@ServerEndpoint('/ws')
NotebookServer"] + ConnectionMgr["ConnectionManager
(session management)"] + MessageRouter["Message Router
(OP enum routing)"] + end + + subgraph "Message Types" + LIST_NOTES["LIST_NOTES"] + GET_NOTE["GET_NOTE"] + RUN_PARAGRAPH["RUN_PARAGRAPH"] + COMPLETION["COMPLETION"] + AngularOps["ANGULAR_OBJECT_*"] + end + + subgraph "Backend Services" + NotebookSvc["NotebookService"] + ConfigSvc["ConfigurationService"] + JobMgrSvc["JobManagerService"] + end + + WebUI <--> WSEndpoint + WSEndpoint --> ConnectionMgr + WSEndpoint --> MessageRouter + + MessageRouter --> LIST_NOTES + MessageRouter --> GET_NOTE + MessageRouter --> RUN_PARAGRAPH + MessageRouter --> COMPLETION + MessageRouter --> AngularOps + + MessageRouter --> NotebookSvc + MessageRouter --> ConfigSvc + MessageRouter --> JobMgrSvc +``` + +The WebSocket server supports authentication through ticket validation, handles connection multiplexing for multiple users, and provides broadcasting capabilities for collaborative features and cluster synchronization. + +**Sources:** [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java:112-123](), [zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java:330-491]() + +## REST API Layer + +The REST API provides HTTP endpoints for notebook operations, configuration management, and administrative tasks. The main notebook operations are handled by `NotebookRestApi` with comprehensive permission checking. + +```mermaid +graph TB + subgraph "REST Controllers" + NotebookAPI["NotebookRestApi
@Path('/notebook')"] + InterpreterAPI["InterpreterRestApi<br/>@Path('/interpreter')"] + ConfigAPI["ConfigurationRestApi<br/>@Path('/configurations')"] + SecurityAPI["SecurityRestApi<br/>@Path('/security')"] + end + + subgraph "HTTP Methods & Endpoints" + GET_Notes["GET /<br/>(list notebooks)"] + GET_Note["GET /{noteId}<br/>(get notebook)"] + POST_Note["POST /<br/>(create notebook)"] + PUT_Note["PUT /{noteId}<br/>(update notebook)"] + DELETE_Note["DELETE /{noteId}
(delete notebook)"] + GET_Perms["GET /{noteId}/permissions"] + PUT_Perms["PUT /{noteId}/permissions"] + end + + subgraph "Permission Checks" + checkIfUserCanRead["checkIfUserCanRead()"] + checkIfUserCanWrite["checkIfUserCanWrite()"] + checkIfUserIsOwner["checkIfUserIsOwner()"] + checkIfUserCanRun["checkIfUserCanRun()"] + end + + subgraph "Service Layer" + NotebookSvc["NotebookService"] + AuthSvc["AuthorizationService"] + AuthnSvc["AuthenticationService"] + end + + NotebookAPI --> GET_Notes + NotebookAPI --> GET_Note + NotebookAPI --> POST_Note + NotebookAPI --> PUT_Note + NotebookAPI --> DELETE_Note + NotebookAPI --> GET_Perms + NotebookAPI --> PUT_Perms + + GET_Notes --> checkIfUserCanRead + GET_Note --> checkIfUserCanRead + POST_Note --> checkIfUserCanWrite + PUT_Note --> checkIfUserCanWrite + DELETE_Note --> checkIfUserIsOwner + + checkIfUserCanRead --> AuthSvc + checkIfUserCanWrite --> AuthSvc + checkIfUserIsOwner --> AuthSvc + checkIfUserCanRun --> AuthSvc + + NotebookAPI --> NotebookSvc + NotebookAPI --> AuthnSvc +``` + +Each REST endpoint performs authentication and authorization checks before delegating to the appropriate service layer component. The API supports both JSON and form-encoded request bodies depending on the operation. + +**Sources:** [zeppelin-server/src/main/java/org/apache/zeppelin/rest/NotebookRestApi.java:75-113](), [zeppelin-server/src/main/java/org/apache/zeppelin/rest/NotebookRestApi.java:154-227]() + +## Service Layer Architecture + +The service layer provides business logic for notebook operations while maintaining separation from the presentation layer. `NotebookService` is the primary service class that orchestrates notebook management operations. + +```mermaid +graph TB + subgraph "Service Layer" + NotebookSvc["NotebookService"] + ConfigSvc["ConfigurationService"] + JobMgrSvc["JobManagerService"] + AuthnSvc["AuthenticationService"] + end + + subgraph "Core Domain Objects" + Notebook["Notebook
(orchestration)"] + Note["Note<br/>(business entity)"] + Paragraph["Paragraph<br/>(execution unit)"] + AuthzSvc["AuthorizationService<br/>(permissions)"] + end + + subgraph "Repository Layer" + NotebookRepo["NotebookRepo<br/>(persistence)"] + NoteManager["NoteManager<br/>(indexing)"] + SchedulerSvc["SchedulerService
(cron jobs)"] + end + + subgraph "Callback Pattern" + ServiceCallback["ServiceCallback"] + onSuccess["onSuccess(T result, ServiceContext)"] + onFailure["onFailure(Exception, ServiceContext)"] + end + + NotebookSvc --> Notebook + NotebookSvc --> AuthzSvc + NotebookSvc --> SchedulerSvc + NotebookSvc --> ServiceCallback + + Notebook --> Note + Notebook --> NoteManager + Notebook --> NotebookRepo + + Note --> Paragraph + + ServiceCallback --> onSuccess + ServiceCallback --> onFailure +``` + +The service layer uses an asynchronous callback pattern where operations are executed and results are provided through `ServiceCallback` implementations. This allows for consistent error handling and supports both synchronous and asynchronous execution patterns. + +**Sources:** [zeppelin-server/src/main/java/org/apache/zeppelin/service/NotebookService.java:80-101](), [zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Notebook.java:73-114]() + +## Configuration Management System + +Zeppelin's configuration system supports multiple sources with a clear precedence hierarchy. Configuration is centralized through the `ZeppelinConfiguration` class with XML file backing and environment variable overrides. + +```mermaid +graph TB + subgraph "Configuration Sources (by priority)" + EnvVars["Environment Variables
(highest priority)"] + SysProps["System Properties<br/>(-D JVM args)"] + XMLFile["zeppelin-site.xml<br/>(configuration file)"] + Defaults["Default Values<br/>(lowest priority)"] + end + + subgraph "Configuration Categories" + ServerConfig["Server Configuration<br/>(port, SSL, context)"] + NotebookConfig["Notebook Configuration<br/>(storage, directories)"] + InterpConfig["Interpreter Configuration<br/>(timeouts, memory)"] + SecurityConfig["Security Configuration<br/>(authentication, authorization)"] + ClusterConfig["Cluster Configuration<br/>(distributed mode)"] + end + + subgraph "Configuration Usage" + ZeppelinConf["ZeppelinConfiguration<br/>(singleton)"] + ConfVars["ConfVars enum<br/>(typed accessors)"] + EnvTemplate["zeppelin-env.sh
(environment setup)"] + end + + EnvVars --> ZeppelinConf + SysProps --> ZeppelinConf + XMLFile --> ZeppelinConf + Defaults --> ZeppelinConf + + ZeppelinConf --> ServerConfig + ZeppelinConf --> NotebookConfig + ZeppelinConf --> InterpConfig + ZeppelinConf --> SecurityConfig + ZeppelinConf --> ClusterConfig + + ZeppelinConf --> ConfVars + EnvVars --> EnvTemplate +``` + +The configuration system provides type-safe access through the `ConfVars` enum and supports runtime reconfiguration for certain properties. Configuration templates are provided for common deployment scenarios. + +**Sources:** [zeppelin-interpreter/src/main/java/org/apache/zeppelin/conf/ZeppelinConfiguration.java:62-150](), [conf/zeppelin-site.xml.template:20-829](), [conf/zeppelin-env.sh.template:19-185]() + +## Process Lifecycle Management + +Zeppelin uses shell scripts to manage the server daemon and interpreter processes. The process management system handles startup, shutdown, logging, and process monitoring across different deployment modes. + +```mermaid +graph TB + subgraph "Process Management Scripts" + ZeppelinDaemon["zeppelin-daemon.sh
(server lifecycle)"] + ZeppelinSh["zeppelin.sh<br/>(direct execution)"] + InterpreterSh["interpreter.sh<br/>(interpreter processes)"] + CommonSh["common.sh<br/>(shared utilities)"] + end + + subgraph "Daemon Operations" + Start["start<br/>(background process)"] + Stop["stop<br/>(graceful shutdown)"] + Restart["restart<br/>(stop + start)"] + Status["status<br/>(process health)"] + Reload["reload<br/>(configuration refresh)"] + end + + subgraph "Process Configuration" + PIDFiles["PID Files<br/>(${ZEPPELIN_PID_DIR})"] + LogFiles["Log Files<br/>(${ZEPPELIN_LOG_DIR})"] + ClassPath["CLASSPATH<br/>(jar aggregation)"] + JavaOpts["JAVA_OPTS<br/>(JVM configuration)"] + end + + subgraph "Runtime Environment" + ZeppelinHome["ZEPPELIN_HOME<br/>(installation directory)"] + ZeppelinConf["ZEPPELIN_CONF_DIR<br/>(configuration directory)"] + JavaHome["JAVA_HOME<br/>(JDK location)"] + ZeppelinRunner["ZEPPELIN_RUNNER
(java executable)"] + end + + ZeppelinDaemon --> Start + ZeppelinDaemon --> Stop + ZeppelinDaemon --> Restart + ZeppelinDaemon --> Status + ZeppelinDaemon --> Reload + + ZeppelinDaemon --> PIDFiles + ZeppelinDaemon --> LogFiles + ZeppelinDaemon --> ClassPath + ZeppelinDaemon --> JavaOpts + + CommonSh --> ZeppelinHome + CommonSh --> ZeppelinConf + CommonSh --> JavaHome + CommonSh --> ZeppelinRunner + + InterpreterSh --> CommonSh + ZeppelinDaemon --> CommonSh + ZeppelinSh --> CommonSh +``` + +The daemon script provides standard Unix daemon operations and integrates with system init scripts. It manages process IDs, log rotation, and graceful shutdown procedures while supporting different deployment environments including Docker containers. + +**Sources:** [bin/zeppelin-daemon.sh:22-273](), [bin/interpreter.sh:19-94](), [bin/common.sh:19-177]() + +## Server Startup and Initialization Sequence + +The server initialization follows a structured sequence that ensures all components are properly configured and started in the correct order. 
+ +```mermaid +sequenceDiagram + participant Main as "Main Thread" + participant ZeppelinServer as "ZeppelinServer" + participant ServiceLocator as "HK2 ServiceLocator" + participant JettyServer as "Jetty Server" + participant Services as "Core Services" + participant Notebook as "Notebook" + + Main->>ZeppelinServer: main(String[] args) + ZeppelinServer->>ZeppelinServer: setupJettyServer() + ZeppelinServer->>ServiceLocator: create(SERVICE_LOCATOR_NAME) + ZeppelinServer->>ServiceLocator: bind services with AbstractBinder + + ZeppelinServer->>JettyServer: setupWebAppContext() + ZeppelinServer->>JettyServer: initWebApp() + ZeppelinServer->>Services: initialize via ServiceLocator + + ZeppelinServer->>Notebook: getService(Notebook.class) + ZeppelinServer->>Notebook: initNotebook() + ZeppelinServer->>Notebook: recoveryIfNecessary() + + ZeppelinServer->>JettyServer: start() + Note over ZeppelinServer: Server ready to accept requests + + ZeppelinServer->>ZeppelinServer: waitForAtLeastOneConstructionError() + alt Construction errors found + ZeppelinServer->>Main: System.exit(-1) + else No errors + ZeppelinServer->>JettyServer: join() + Note over JettyServer: Server runs until shutdown + end +``` + +The initialization process includes metrics setup, cluster configuration, JMX enablement, and notebook recovery for fault tolerance. Error handling during startup ensures that construction failures result in clean server termination. + +**Sources:** [zeppelin-server/src/main/java/org/apache/zeppelin/server/ZeppelinServer.java:148-310]() diff --git a/.cursor/documentation/web_interface/frontend_build_system.md b/.cursor/documentation/web_interface/frontend_build_system.md new file mode 100644 index 00000000000..5900b9c4227 --- /dev/null +++ b/.cursor/documentation/web_interface/frontend_build_system.md @@ -0,0 +1,356 @@ +# Frontend Build System + +
+Relevant source files + +The following files were used as context for generating this wiki page: + +- [zeppelin-distribution/src/bin_license/licenses/LICENSE-patuaOne-font](zeppelin-distribution/src/bin_license/licenses/LICENSE-patuaOne-font) +- [zeppelin-distribution/src/bin_license/licenses/LICENSE-source_code_pro-font](zeppelin-distribution/src/bin_license/licenses/LICENSE-source_code_pro-font) +- [zeppelin-web/.eslintrc](zeppelin-web/.eslintrc) +- [zeppelin-web/Gruntfile.js](zeppelin-web/Gruntfile.js) +- [zeppelin-web/karma.conf.js](zeppelin-web/karma.conf.js) +- [zeppelin-web/package-lock.json](zeppelin-web/package-lock.json) +- [zeppelin-web/package.json](zeppelin-web/package.json) +- [zeppelin-web/src/app/app.controller.test.js](zeppelin-web/src/app/app.controller.test.js) +- [zeppelin-web/src/app/notebook/notebook.controller.test.js](zeppelin-web/src/app/notebook/notebook.controller.test.js) +- [zeppelin-web/src/app/notebook/paragraph/paragraph.controller.test.js](zeppelin-web/src/app/notebook/paragraph/paragraph.controller.test.js) +- [zeppelin-web/src/app/tabledata/tabledata.test.js](zeppelin-web/src/app/tabledata/tabledata.test.js) +- [zeppelin-web/src/components/navbar/navbar.controller.test.js](zeppelin-web/src/components/navbar/navbar.controller.test.js) + +
+ + + +This document describes the build system, development workflow, and testing infrastructure for Zeppelin's Angular.js frontend application located in the `zeppelin-web` module. It covers the compilation, bundling, testing, and deployment processes that transform the source code into a production-ready web application. + +For information about the UI components and controllers themselves, see [Notebook and Paragraph UI](#3.1), [Interpreter Management UI](#3.2), and [Navigation and Core UI](#3.3). + +## Overview and Architecture + +The Zeppelin frontend build system is a hybrid approach combining modern JavaScript tooling with traditional task runners. It uses Webpack for module bundling and hot reloading during development, while Grunt handles asset processing, minification, and production optimization tasks. + +### Build System Architecture + +```mermaid +graph TB + subgraph "Source Code" + AngularJS["Angular.js Source Files
src/app/**/*.js"] + Templates["HTML Templates<br/>src/app/**/*.html"] + Styles["CSS Stylesheets<br/>src/assets/styles/**/*.css"] + Assets["Static Assets<br/>src/assets/images/**/*"] + end + + subgraph "Build Tools" + Webpack["webpack<br/>Module Bundling"] + Grunt["grunt<br/>Task Automation"] + Babel["babel<br/>ES6 Transpilation"] + ESLint["eslint<br/>Code Quality"] + end + + subgraph "Development Workflow" + DevServer["webpack-dev-server<br/>Hot Reloading"] + Watch["grunt watch<br/>File Watching"] + LiveReload["LiveReload<br/>Browser Refresh"] + end + + subgraph "Testing Infrastructure" + Karma["karma<br/>Test Runner"] + Jasmine["jasmine<br/>Test Framework"] + Coverage["karma-coverage<br/>Code Coverage"] + end + + subgraph "Production Build" + Concat["concat<br/>File Concatenation"] + Uglify["uglify<br/>JS Minification"] + CSSMin["cssmin<br/>CSS Minification"] + CacheBust["cacheBust
Cache Breaking"] + end + + AngularJS --> Webpack + Templates --> Grunt + Styles --> Grunt + Assets --> Grunt + + Webpack --> DevServer + Grunt --> Watch + Watch --> LiveReload + + AngularJS --> Karma + Karma --> Jasmine + Karma --> Coverage + + Webpack --> Concat + Grunt --> Uglify + Grunt --> CSSMin + Grunt --> CacheBust +``` + +Sources: [zeppelin-web/package.json:1-108](), [zeppelin-web/Gruntfile.js:1-422]() + +## Build Configuration and Dependencies + +### Package Management and Dependencies + +The build system manages dependencies through multiple package managers: + +| Tool | Purpose | Configuration File | +|------|---------|-------------------| +| npm | Build tools and dev dependencies | `package.json` | +| bower | Frontend libraries and frameworks | `bower.json` | + +Key build dependencies include: + +- **webpack** (4.24.0): Module bundler and development server +- **grunt** (0.4.1): Task automation and asset processing +- **babel** (6.26.x): ES6+ transpilation to ES5 +- **karma** (3.1.3): Test runner with browser automation +- **eslint** (3.19.0): JavaScript linting and code quality + +Sources: [zeppelin-web/package.json:39-103](), [zeppelin-web/package.json:26-38]() + +### Build Scripts and Commands + +The `package.json` defines key build commands: + +```bash +# Development workflow +npm run dev # Start development server with hot reloading +npm run dev:helium # Development with Helium bundle support +npm run lint:watch # Continuous code linting + +# Production builds +npm run build:dist # Full production build +npm run build:ci # Continuous integration build + +# Testing +npm run karma-test # Run unit tests +npm run test:watch # Continuous testing +npm run e2e # End-to-end tests +``` + +Sources: [zeppelin-web/package.json:8-25]() + +## Development Workflow + +### Webpack Development Configuration + +During development, Webpack provides module bundling and hot reloading through `webpack-dev-server`. 
The development workflow supports: + +- **Hot Module Replacement**: Live code updates without page refresh +- **Source Maps**: Debug support with original source file mapping +- **Live Reloading**: Automatic browser refresh on file changes +- **Helium Bundle Development**: Special mode for plugin development + +The development server runs on a separate port and proxies API requests to the backend Zeppelin server. + +Sources: [zeppelin-web/package.json:16-20]() + +### Grunt Watch System + +Grunt provides file watching and automated task execution for non-JavaScript assets: + +```mermaid +graph LR + FileChange["File Changes"] --> Watch["grunt watch"] + + Watch --> HTMLLint["HTML Validation
htmlhint"] + Watch --> StylesTask["CSS Processing<br/>postcss, autoprefixer"] + Watch --> TestRun["Test Execution<br/>karma"] + Watch --> LiveReload["Browser Reload
livereload:35729"] + + HTMLLint --> Browser["Browser Update"] + StylesTask --> Browser + TestRun --> Browser + LiveReload --> Browser +``` + +The watch system monitors these file patterns: +- `src/**/*.html` - HTML templates and components +- `src/app/**/*.css` - Application stylesheets +- `test/spec/**/*.js` - Test specifications +- `Gruntfile.js` - Build configuration changes + +Sources: [zeppelin-web/Gruntfile.js:77-117]() + +## Testing Infrastructure + +### Karma Test Runner Configuration + +The testing system uses Karma with Jasmine for unit testing and Firefox for browser automation: + +```mermaid +graph TB + subgraph "Test Configuration" + KarmaConf["karma.conf.js
Test Configuration"] + WebpackConf["webpack.config<br/>Module Resolution"] + TestFiles["src/**/*.test.js<br/>Test Specifications"] + end + + subgraph "Test Execution" + Karma["karma<br/>Test Runner"] + Firefox["FirefoxHeadless<br/>Browser Engine"] + Jasmine["jasmine<br/>Test Framework"] + end + + subgraph "Test Features" + Coverage["karma-coverage<br/>Code Coverage Reports"] + SourceMaps["karma-sourcemap-loader<br/>Debug Support"] + Webpack["karma-webpack<br/>Module Bundling"] + end + + KarmaConf --> Karma + WebpackConf --> Webpack + TestFiles --> Karma + + Karma --> Firefox + Karma --> Jasmine + Karma --> Coverage + + Webpack --> SourceMaps + Coverage --> Reports["reports/coverage/
HTML Reports"] +``` + +Test file patterns and dependencies: +- Test files: `src/**/*.test.js` +- Polyfills: `babel-polyfill` for ES6+ features +- External libraries: jQuery, Angular, D3, Ace Editor + +Sources: [zeppelin-web/karma.conf.js:25-165](), [zeppelin-web/karma.conf.js:39-100]() + +### Code Quality and Linting + +ESLint enforces code standards with the Google JavaScript style guide: + +Key linting rules: +- **Style**: Single quotes, semicolons required, 120 character line limit +- **Quality**: No unused variables, camelCase naming, strict mode +- **Angular**: Global variables for `angular`, `jQuery`, `_`, framework libraries + +Sources: [zeppelin-web/.eslintrc:1-54]() + +### Test Structure and Examples + +Unit tests follow Jasmine conventions with Angular mocking: + +```javascript +// Controller testing pattern +describe('Controller: NotebookCtrl', function() { + beforeEach(angular.mock.module('zeppelinWebApp')); + // Test implementation... +}); +``` + +Common test categories: +- **Controller Tests**: `*.controller.test.js` - UI controller logic +- **Service Tests**: Component and data service testing +- **Utility Tests**: `tabledata.test.js` - Data transformation logic + +Sources: [zeppelin-web/src/app/notebook/notebook.controller.test.js:1-140](), [zeppelin-web/src/app/tabledata/tabledata.test.js:1-131]() + +## Production Build Process + +### Multi-Stage Build Pipeline + +The production build process combines Webpack bundling with Grunt post-processing: + +```mermaid +graph TB + subgraph "Pre-Build Stage" + Clean["rimraf dist .tmp
Clean Previous Build"] + Lint["eslint src<br/>Code Quality Check"] + WiredepDist["wiredep:dist<br/>Inject Bower Dependencies"] + end + + subgraph "Webpack Stage" + WebpackBuild["webpack<br/>Module Bundling"] + BabelTranspile["babel-loader<br/>ES6 Transpilation"] + AssetProcessing["Asset Processing<br/>file-loader, css-loader"] + end + + subgraph "Post-Webpack Stage" + UseminPrepare["useminPrepare<br/>Parse Build Blocks"] + Concat["concat<br/>File Concatenation"] + NgAnnotate["ngAnnotate<br/>Angular DI Annotation"] + Uglify["uglify<br/>JavaScript Minification"] + CSSMin["cssmin<br/>CSS Minification"] + HTMLMin["htmlmin<br/>HTML Minification"] + end + + subgraph "Finalization Stage" + CopyDist["copy:dist<br/>Asset Copying"] + Replace["replace<br/>Template Cache Busting"] + CacheBust["cacheBust<br/>File Versioning"] + UseMin["usemin
Asset Path Updates"] + end + + Clean --> Lint + Lint --> WiredepDist + WiredepDist --> WebpackBuild + + WebpackBuild --> BabelTranspile + BabelTranspile --> AssetProcessing + AssetProcessing --> UseminPrepare + + UseminPrepare --> Concat + Concat --> NgAnnotate + NgAnnotate --> Uglify + Uglify --> CSSMin + CSSMin --> HTMLMin + + HTMLMin --> CopyDist + CopyDist --> Replace + Replace --> CacheBust + CacheBust --> UseMin +``` + +Sources: [zeppelin-web/package.json:11-13](), [zeppelin-web/Gruntfile.js:392-417]() + +### Asset Processing and Optimization + +The build system handles various asset types: + +| Asset Type | Processing | Output Location | +|------------|------------|-----------------| +| JavaScript | Babel → Concat → Uglify → Cache bust | `dist/scripts/*.js` | +| CSS | PostCSS → Autoprefixer → CSSMin | `dist/styles/*.css` | +| HTML | HTMLHint → HTMLMin → Template cache | `dist/**/*.html` | +| Images | Copy → SVGMin | `dist/assets/images/` | +| Fonts | Copy from bower components | `dist/fonts/` | + +Special asset handling: +- **MathJax**: Mathematical rendering library with extensions and fonts +- **DataTables**: UI grid images and assets +- **Bootstrap**: Icon fonts and themes +- **jQuery UI**: Theme images and components + +Sources: [zeppelin-web/Gruntfile.js:266-323](), [zeppelin-web/Gruntfile.js:199-264]() + +### Cache Busting and Versioning + +The build system implements cache busting through multiple mechanisms: + +1. **File Revisioning**: `grunt-filerev` adds content hashes to filenames +2. **Template Versioning**: Timestamp-based cache busting for templates +3. 
**Asset Path Updates**: `usemin` updates references to versioned files + +Template cache busting pattern: +```javascript +// Before: templateUrl: "template.html" +// After: templateUrl: "template.html?v=1634567890123" +``` + +Sources: [zeppelin-web/Gruntfile.js:66-75](), [zeppelin-web/Gruntfile.js:351-370]() + +## Integration with Zeppelin Server + +The frontend build integrates with the larger Zeppelin server architecture through Maven: + +- **Maven Integration**: The `zeppelin-web` module builds during the main Maven lifecycle +- **Distribution**: Built assets are packaged into `zeppelin-distribution` +- **Development Proxy**: Webpack dev server proxies API calls to the backend +- **Production Deployment**: Built files are served by the Jetty web server + +The build output becomes part of the overall Zeppelin server deployment, providing the web interface for notebook management and code execution. + +Sources: [zeppelin-web/package.json:1-4](), [zeppelin-web/Gruntfile.js:28-32]() diff --git a/.cursor/documentation/web_interface/interpreter_management_ui.md b/.cursor/documentation/web_interface/interpreter_management_ui.md new file mode 100644 index 00000000000..86e13fad218 --- /dev/null +++ b/.cursor/documentation/web_interface/interpreter_management_ui.md @@ -0,0 +1,248 @@ +# Interpreter Management UI + +
+Relevant source files + +The following files were used as context for generating this wiki page: + +- [zeppelin-web/src/app/interpreter/interpreter.controller.js](zeppelin-web/src/app/interpreter/interpreter.controller.js) +- [zeppelin-web/src/app/interpreter/interpreter.css](zeppelin-web/src/app/interpreter/interpreter.css) +- [zeppelin-web/src/app/interpreter/interpreter.html](zeppelin-web/src/app/interpreter/interpreter.html) +- [zeppelin-web/src/assets/styles/looknfeel/default.css](zeppelin-web/src/assets/styles/looknfeel/default.css) +- [zeppelin-web/src/assets/styles/looknfeel/report.css](zeppelin-web/src/assets/styles/looknfeel/report.css) +- [zeppelin-web/src/assets/styles/looknfeel/simple.css](zeppelin-web/src/assets/styles/looknfeel/simple.css) +- [zeppelin-web/src/assets/styles/printMode.css](zeppelin-web/src/assets/styles/printMode.css) + +
+ + + +The Interpreter Management UI provides a web-based interface for configuring and managing interpreter settings in Apache Zeppelin. This interface allows administrators and users to create, edit, and manage interpreter configurations, dependencies, and runtime options through a centralized dashboard. + +For information about the underlying interpreter framework and backend services, see [Interpreter Framework](#2.3). For details about individual interpreter implementations, see [Interpreters](#5). + +## Overview + +The Interpreter Management UI is implemented as an Angular.js application located in the `zeppelin-web` module. It consists of three main components: the HTML template, the Angular controller, and associated CSS styling. The interface provides comprehensive management capabilities for interpreter settings, including configuration of properties, dependencies, repositories, and runtime options. + +**Main UI Components Architecture** + +```mermaid +graph TB + subgraph "Interpreter Management UI" + Header["interpreterHead
Header Section"] + Search["searchInterpreter<br/>Search Input"] + RepoInfo["showRepositoryInfo<br/>Repository Management"] + CreateForm["showAddNewSetting<br/>Create New Interpreter"] + SettingsList["interpreterSettings<br/>Settings List"] + end + + subgraph "Individual Interpreter Setting" + SettingHeader["Interpreter Title & Status"] + Options["Runtime Options
(Globally/Per Note/Per User)"] + Permissions["Permission Settings"] + Properties["Properties Table"] + Dependencies["Dependencies Table"] + Actions["Edit/Restart/Remove"] + end + + Header --> Search + Header --> RepoInfo + Header --> CreateForm + Header --> SettingsList + SettingsList --> SettingHeader + SettingsList --> Options + SettingsList --> Permissions + SettingsList --> Properties + SettingsList --> Dependencies + SettingsList --> Actions +``` + +Sources: [zeppelin-web/src/app/interpreter/interpreter.html:14-89](), [zeppelin-web/src/app/interpreter/interpreter.html:91-540]() + +## Core Controller Functions + +The `InterpreterCtrl` controller manages all interpreter management operations through a comprehensive set of functions that handle CRUD operations, configuration management, and user interactions. + +**Controller Function Categories** + +| Category | Functions | Purpose | +|----------|-----------|---------| +| Settings Management | `getInterpreterSettings`, `updateInterpreterSetting`, `removeInterpreterSetting` | Core CRUD operations for interpreter settings | +| Runtime Options | `setInterpreterRunningOption`, `setPerNoteOption`, `setPerUserOption` | Configure interpreter binding modes | +| Properties & Dependencies | `addNewInterpreterProperty`, `removeInterpreterProperty`, `addNewInterpreterDependency` | Manage interpreter configuration | +| Repository Management | `getRepositories`, `addNewRepository`, `removeRepository` | Handle Maven repository settings | +| Lifecycle Operations | `restartInterpreterSetting`, `checkDownloadingDependencies` | Control interpreter lifecycle | + +Sources: [zeppelin-web/src/app/interpreter/interpreter.controller.js:111-129](), [zeppelin-web/src/app/interpreter/interpreter.controller.js:343-416](), [zeppelin-web/src/app/interpreter/interpreter.controller.js:464-484]() + +## Interpreter Binding Modes + +The UI provides sophisticated options for configuring how interpreters are instantiated and shared across notes and 
users. These binding modes control the lifecycle and isolation of interpreter processes. + +**Binding Mode Configuration Flow** + +```mermaid +sequenceDiagram + participant User + participant UI["Interpreter UI"] + participant Controller["InterpreterCtrl"] + participant Backend["InterpreterRestApi"] + + User->>UI: "Select binding mode" + UI->>Controller: "setInterpreterRunningOption(settingId, perNote, perUser)" + Controller->>Controller: "Configure option object" + Note over Controller: "option.perNote = 'shared'|'scoped'|'isolated'" + Note over Controller: "option.perUser = 'shared'|'scoped'|'isolated'" + Controller->>Controller: "updateInterpreterSetting(form, settingId)" + Controller->>Backend: "PUT /interpreter/setting/{settingId}" + Backend-->>Controller: "Updated setting" + Controller-->>UI: "Update UI state" + UI-->>User: "Display updated configuration" +``` + +The binding modes include: +- **Globally**: Single interpreter instance shared across all notes and users (`perNote: 'shared', perUser: 'shared'`) +- **Per Note**: Separate interpreter instances per note (`perNote: 'scoped'` or `'isolated'`) +- **Per User**: Separate interpreter instances per user with additional per-note options + +Sources: [zeppelin-web/src/app/interpreter/interpreter.controller.js:288-341](), [zeppelin-web/src/app/interpreter/interpreter.html:147-249]() + +## Properties and Dependencies Management + +The UI provides dynamic tables for managing interpreter properties and dependencies with inline editing capabilities using the `editable-form` directive. + +**Properties Management Structure** + +```mermaid +graph LR + subgraph "Properties Table" + PropName["property.name"] + PropValue["property.value
(editable)"] + PropDesc["property.description"] + PropType["property.type<br/>(string|number|textarea|url|password|checkbox)"] + end + + subgraph "Dependencies Table" + DepArtifact["dep.groupArtifactVersion<br/>(Maven coordinates)"] + DepExclusions["dep.exclusions
(comma-separated)"] + end + + subgraph "Actions" + AddProp["addNewInterpreterProperty()"] + RemoveProp["removeInterpreterProperty()"] + AddDep["addNewInterpreterDependency()"] + RemoveDep["removeInterpreterDependency()"] + end + + PropName --> AddProp + PropValue --> AddProp + PropDesc --> AddProp + PropType --> AddProp + DepArtifact --> AddDep + DepExclusions --> AddDep +``` + +The properties system supports multiple data types with appropriate UI widgets: +- `string`: Standard text input +- `number`: Numeric input with validation +- `textarea`: Multi-line text input +- `url`: URL input with link display +- `password`: Masked password input +- `checkbox`: Boolean checkbox input + +Sources: [zeppelin-web/src/app/interpreter/interpreter.html:392-467](), [zeppelin-web/src/app/interpreter/interpreter.controller.js:607-632](), [zeppelin-web/src/app/interpreter/interpreter.controller.js:634-683]() + +## Repository Management + +The UI includes a collapsible repository management section for configuring Maven repositories used to resolve interpreter dependencies. + +**Repository Operations** + +```mermaid +graph TB + subgraph "Repository UI Controls" + ToggleRepo["showRepositoryInfo
Toggle Button"] + RepoList["repositories array<br/>Display List"] + AddRepoBtn["Add Repository Button"] + CreateModal["repoModal
Creation Dialog"] + end + + subgraph "Repository Data Model" + RepoId["repo.id"] + RepoUrl["repo.url"] + RepoSnapshot["repo.snapshot"] + RepoAuth["username/password"] + RepoProxy["proxy settings"] + end + + subgraph "API Operations" + GetRepos["GET /interpreter/repository"] + PostRepo["POST /interpreter/repository"] + DeleteRepo["DELETE /interpreter/repository/{id}"] + end + + ToggleRepo --> RepoList + AddRepoBtn --> CreateModal + CreateModal --> RepoId + CreateModal --> RepoUrl + RepoList --> DeleteRepo + CreateModal --> PostRepo + GetRepos --> RepoList +``` + +Default repositories (`central` and `local`) cannot be removed and are identified by the `isDefaultRepository()` function. + +Sources: [zeppelin-web/src/app/interpreter/interpreter.html:60-86](), [zeppelin-web/src/app/interpreter/interpreter.controller.js:700-747]() + +## Status Management and Error Handling + +The UI provides real-time status updates for interpreter settings, particularly during dependency download operations and error states. + +**Status Indicator System** + +| Status | Icon | Color | Meaning | +|--------|------|-------|---------| +| `READY` | `fa-circle` | Green | Interpreter is ready to use | +| `ERROR` | `fa-circle` | Red | Error occurred (clickable for details) | +| `DOWNLOADING_DEPENDENCIES` | `fa-spinner` | Blue | Dependencies being downloaded | + +The `checkDownloadingDependencies()` function polls the server every 2 seconds when downloads are in progress, automatically updating the UI to reflect current status. + +Sources: [zeppelin-web/src/app/interpreter/interpreter.html:108-125](), [zeppelin-web/src/app/interpreter/interpreter.controller.js:131-155](), [zeppelin-web/src/app/interpreter/interpreter.controller.js:749-754]() + +## Integration with Backend Services + +The Interpreter Management UI integrates with several REST API endpoints provided by the Zeppelin server to perform operations on interpreter settings. 
+ +**API Integration Pattern** + +```mermaid +sequenceDiagram + participant UI["interpreter.html"] + participant Ctrl["InterpreterCtrl"] + participant Http["$http service"] + participant API["REST Endpoints"] + participant ISM["InterpreterSettingManager"] + + UI->>Ctrl: "User action" + Ctrl->>Http: "HTTP request" + Http->>API: "/interpreter/setting/*" + API->>ISM: "Business logic" + ISM-->>API: "Response data" + API-->>Http: "JSON response" + Http-->>Ctrl: "Promise resolution" + Ctrl-->>UI: "Update scope variables" + UI-->>UI: "Re-render template" +``` + +Key API endpoints used: +- `GET /interpreter/setting` - Retrieve all interpreter settings +- `POST /interpreter/setting` - Create new interpreter setting +- `PUT /interpreter/setting/{id}` - Update existing interpreter setting +- `DELETE /interpreter/setting/{id}` - Remove interpreter setting +- `PUT /interpreter/setting/restart/{id}` - Restart interpreter +- `GET /interpreter/repository` - Get repository list +- `GET /interpreter/property/types` - Get available property types + +Sources: [zeppelin-web/src/app/interpreter/interpreter.controller.js:112-113](), [zeppelin-web/src/app/interpreter/interpreter.controller.js:396-397](), [zeppelin-web/src/app/interpreter/interpreter.controller.js:547-548]() diff --git a/.cursor/documentation/web_interface/navigation_and_core_ui.md b/.cursor/documentation/web_interface/navigation_and_core_ui.md new file mode 100644 index 00000000000..f8d243a5276 --- /dev/null +++ b/.cursor/documentation/web_interface/navigation_and_core_ui.md @@ -0,0 +1,314 @@ +# Navigation and Core UI + +
+Relevant source files + +The following files were used as context for generating this wiki page: + +- [zeppelin-web/src/app/home/home.controller.js](zeppelin-web/src/app/home/home.controller.js) +- [zeppelin-web/src/app/home/home.css](zeppelin-web/src/app/home/home.css) +- [zeppelin-web/src/app/home/home.html](zeppelin-web/src/app/home/home.html) +- [zeppelin-web/src/components/navbar/navbar.controller.js](zeppelin-web/src/components/navbar/navbar.controller.js) +- [zeppelin-web/src/components/navbar/navbar.css](zeppelin-web/src/components/navbar/navbar.css) +- [zeppelin-web/src/components/navbar/navbar.html](zeppelin-web/src/components/navbar/navbar.html) + +
+ + + +This document covers the core user interface components and navigation system of Apache Zeppelin's web frontend. It focuses on the navigation bar, home page, and foundational UI elements that provide the primary user interaction layer. + +For information about notebook-specific UI components, see [Notebook and Paragraph UI](#3.1). For interpreter management interfaces, see [Interpreter Management UI](#3.2). + +## Purpose and Scope + +The Navigation and Core UI system provides the primary interface layer for Apache Zeppelin, including: + +- Top-level navigation bar with menu systems +- Home page with notebook listing and welcome content +- Core styling framework and responsive design +- User authentication and session management UI +- Global search and connection status indicators + +## Navigation Bar Architecture + +The navigation bar is implemented as an Angular.js component using the `NavCtrl` controller and provides the primary navigation interface for Zeppelin. + +```mermaid +graph TB + subgraph "Navigation Bar Components" + NavCtrl["NavCtrl Controller
(navbar.controller.js)"] + NavHTML["Navigation Template<br/>(navbar.html)"] + NavCSS["Navigation Styles
(navbar.css)"] + end + + subgraph "Core Services" + noteListFactory["noteListFactory"] + websocketMsgSrv["websocketMsgSrv"] + searchService["searchService"] + baseUrlSrv["baseUrlSrv"] + end + + subgraph "UI Elements" + NotebookDropdown["Notebook Dropdown
with Note List"] + JobMenu["Job Manager Link"] + SearchBar["Global Search Input"] + UserDropdown["User Settings Dropdown"] + ConnStatus["Connection Status Indicator"] + end + + NavCtrl --> noteListFactory + NavCtrl --> websocketMsgSrv + NavCtrl --> searchService + NavCtrl --> baseUrlSrv + + NavHTML --> NotebookDropdown + NavHTML --> JobMenu + NavHTML --> SearchBar + NavHTML --> UserDropdown + NavHTML --> ConnStatus +``` + +**Sources:** [zeppelin-web/src/components/navbar/navbar.controller.js](), [zeppelin-web/src/components/navbar/navbar.html](), [zeppelin-web/src/components/navbar/navbar.css]() + +### Navigation Controller Structure + +The `NavCtrl` controller manages navigation state and user interactions: + +| Function | Purpose | Key Dependencies | +|----------|---------|------------------| +| `isActive()` | Determines active navigation state | `$routeParams.noteId` | +| `logout()` | Handles user logout process | `baseUrlSrv`, `$http` | +| `search()` | Redirects to search results | `$location` | +| `loadNotes()` | Fetches notebook list | `websocketMsgSrv.getNoteList()` | +| `getZeppelinVersion()` | Retrieves version information | `baseUrlSrv.getRestApiBase()` | + +The controller maintains several key state variables: +- `vm.connected`: WebSocket connection status [zeppelin-web/src/components/navbar/navbar.controller.js:24]() +- `vm.notes`: Reference to note list factory [zeppelin-web/src/components/navbar/navbar.controller.js:27]() +- `vm.numberOfNotesDisplayed`: Pagination control [zeppelin-web/src/components/navbar/navbar.controller.js:33]() + +**Sources:** [zeppelin-web/src/components/navbar/navbar.controller.js:15-276]() + +### Navigation Bar Template Structure + +The navigation bar template implements a Bootstrap-based responsive design: + +```mermaid +graph TB + subgraph "Navbar Structure" + Headroom[" Container"] + Container["Bootstrap Container"] + Header["Navbar Header"] + Collapse["Collapsible Navigation"] + end + + subgraph "Header Content" + 
ToggleBtn["Mobile Toggle Button"] + Logo["Zeppelin Logo Image"] + Brand["Zeppelin Brand Text"] + end + + subgraph "Main Navigation" + NotebookMenu["Notebook Dropdown Menu"] + JobLink["Job Manager Link"] + end + + subgraph "Right Navigation" + SearchForm["Search Input Form"] + ConnectionIndicator["WebSocket Status"] + UserMenu["User Dropdown Menu"] + LoginBtn["Login Button"] + end + + Headroom --> Container + Container --> Header + Container --> Collapse + + Header --> ToggleBtn + Header --> Logo + Header --> Brand + + Collapse --> NotebookMenu + Collapse --> JobLink + Collapse --> SearchForm + Collapse --> ConnectionIndicator + Collapse --> UserMenu + Collapse --> LoginBtn +``` + +**Sources:** [zeppelin-web/src/components/navbar/navbar.html:13-121]() + +## Home Page System + +The home page provides the main landing interface and is implemented through the `HomeCtrl` controller with dual display modes. + +### Home Page Controller Architecture + +```mermaid +graph LR + subgraph "HomeCtrl Controller" + HomeController["HomeCtrl
(home.controller.js)"] + HomeHTML["Home Template<br/>(home.html)"] + HomeCSS["Home Styles<br/>(home.css)"] + end + + subgraph "Display Modes" + StaticHome["staticHome<br/>Welcome Page"] + NotebookHome["notebookHome
Notebook Content"] + end + + subgraph "Services" + noteListFactory2["noteListFactory"] + websocketMsgSrv2["websocketMsgSrv"] + noteActionService["noteActionService"] + end + + HomeController --> StaticHome + HomeController --> NotebookHome + HomeController --> noteListFactory2 + HomeController --> websocketMsgSrv2 + HomeController --> noteActionService +``` + +**Sources:** [zeppelin-web/src/app/home/home.controller.js:15-160](), [zeppelin-web/src/app/home/home.html]() + +### Home Page Display Modes + +The home page operates in two distinct modes controlled by authentication state: + +1. **Static Home Mode** (`vm.staticHome = true`): Displays welcome content and community links when user is not authenticated [zeppelin-web/src/app/home/home.controller.js:32-35]() + +2. **Notebook Home Mode** (`vm.notebookHome = true`): Shows a designated home notebook as the landing page when authenticated [zeppelin-web/src/app/home/home.controller.js:81-82]() + +The mode switching is handled through event listeners: +- `setNoteContent` event toggles between modes [zeppelin-web/src/app/home/home.controller.js:67-87]() +- `initHome()` function initializes notebook home mode [zeppelin-web/src/app/home/home.html:14]() + +**Sources:** [zeppelin-web/src/app/home/home.controller.js:29-86]() + +### Notebook List Management + +The home page includes sophisticated notebook list management with filtering and infinite scroll: + +| Feature | Implementation | Location | +|---------|----------------|----------| +| Note Filtering | `isFilterNote()` function | [zeppelin-web/src/app/home/home.controller.js:137-147]() | +| Infinite Scroll | `loadMoreNotes()` increments display count | [zeppelin-web/src/app/home/home.controller.js:89-91]() | +| Note Actions | Rename, trash, restore operations | [zeppelin-web/src/app/home/home.controller.js:93-131]() | +| Folder Operations | Toggle, rename, trash folder actions | [zeppelin-web/src/app/home/home.controller.js:51-53]() | + +**Sources:** 
[zeppelin-web/src/app/home/home.controller.js:46-160](), [zeppelin-web/src/app/home/home.html:27-56]() + +## Core UI Components + +### Responsive Design System + +The core UI implements a Bootstrap-based responsive design with custom extensions: + +```mermaid +graph TB + subgraph "Responsive Breakpoints" + Mobile["Mobile
< 768px"] + Tablet["Tablet<br/>768px - 794px"] + Desktop["Desktop<br/>> 830px"] + end + + subgraph "Navigation Adaptations" + MobileNav["Collapsed Menu<br/>Toggle Button"] + TabletSearch["Reduced Search Width<br/>130-150px"] + DesktopSearch["Full Search Width<br/>170px"] + end + + subgraph "Layout Components" + FixedNavbar["Fixed Top Navbar<br/>50px padding"] + ContainerFluid["Fluid Container<br/>Auto width"] + ScrollableDropdown["Scrollable Dropdown
max-height calc()"] + end + + Mobile --> MobileNav + Tablet --> TabletSearch + Desktop --> DesktopSearch + + MobileNav --> FixedNavbar + TabletSearch --> ContainerFluid + DesktopSearch --> ScrollableDropdown +``` + +**Sources:** [zeppelin-web/src/app/home/home.css:218-234](), [zeppelin-web/src/components/navbar/navbar.css:61-73](), [zeppelin-web/src/components/navbar/navbar.css:137-174]() + +### Animation System + +The UI includes custom CSS animations for smooth transitions: + +- **Slide Animations**: Top, right, left, down slide transitions with cubic-bezier easing [zeppelin-web/src/app/home/home.css:370-697]() +- **Navbar Transitions**: Headroom.js integration for navbar pin/unpin effects [zeppelin-web/src/components/navbar/navbar.css:76-84]() +- **Modal Backdrop**: Z-index management for modal layering [zeppelin-web/src/app/home/home.css:359-365]() + +**Sources:** [zeppelin-web/src/app/home/home.css:370-697](), [zeppelin-web/src/components/navbar/navbar.css:76-84]() + +### Connection Status Indicator + +The navigation bar includes a real-time WebSocket connection status indicator: + +- **Connected State**: Green circle with "WebSocket Connected" tooltip [zeppelin-web/src/components/navbar/navbar.html:88-89]() +- **Disconnected State**: Red circle with "WebSocket Disconnected" tooltip [zeppelin-web/src/components/navbar/navbar.html:90-91]() +- **Status Updates**: Managed through `setConnectedStatus` event [zeppelin-web/src/components/navbar/navbar.controller.js:224-226]() + +The status styling is defined with specific colors and positioning [zeppelin-web/src/app/home/home.css:255-265](). 
+ +**Sources:** [zeppelin-web/src/components/navbar/navbar.html:88-91](), [zeppelin-web/src/components/navbar/navbar.controller.js:224-226](), [zeppelin-web/src/app/home/home.css:255-265]() + +## User Authentication Interface + +### Login/Logout Flow + +The navigation system manages user authentication state through several mechanisms: + +```mermaid +graph TB + subgraph "Authentication States" + Unauthenticated["ticket === undefined"] + Authenticated["ticket.principal !== 'anonymous'"] + end + + subgraph "UI Elements" + LoginButton["Login Button
(loginModal trigger)"] + UserDropdown["Username Display<br/>+ Settings Menu"] + LogoutAction["Logout Function
(logout API call)"] + end + + subgraph "Authentication Actions" + ShowLogin["showLoginWindow()"] + LoginSuccess["loginSuccess event"] + LogoutProcess["logout() function"] + end + + Unauthenticated --> LoginButton + Authenticated --> UserDropdown + + LoginButton --> ShowLogin + ShowLogin --> LoginSuccess + UserDropdown --> LogoutProcess + LogoutProcess --> Unauthenticated +``` + +**Sources:** [zeppelin-web/src/components/navbar/navbar.html:92-117](), [zeppelin-web/src/components/navbar/navbar.controller.js:101-177](), [zeppelin-web/src/components/navbar/navbar.controller.js:228-233]() + +The logout process includes browser-specific handling for authentication cache clearing, particularly for Internet Explorer [zeppelin-web/src/components/navbar/navbar.controller.js:125-151](). + +### User Settings Menu + +When authenticated, the user dropdown provides access to system configuration: + +| Menu Item | Route | Purpose | +|-----------|-------|---------| +| About Zeppelin | Modal trigger | Version and license information | +| Interpreter | `#/interpreter` | Interpreter configuration | +| Notebook Repos | `#/notebookRepos` | Repository settings | +| Credential | `#/credential` | Security credentials | +| Helium | `#/helium` | Plugin management | +| Configuration | `#/configuration` | System settings | +| Cluster | `#/cluster` | Cluster management (conditional) | + +**Sources:** [zeppelin-web/src/components/navbar/navbar.html:97-109]() diff --git a/.cursor/documentation/web_interface/notebook_and_paragraph_ui.md b/.cursor/documentation/web_interface/notebook_and_paragraph_ui.md new file mode 100644 index 00000000000..354d97f2f22 --- /dev/null +++ b/.cursor/documentation/web_interface/notebook_and_paragraph_ui.md @@ -0,0 +1,322 @@ +# Notebook and Paragraph UI + +
+Relevant source files + +The following files were used as context for generating this wiki page: + +- [zeppelin-distribution/src/bin_license/LICENSE](zeppelin-distribution/src/bin_license/LICENSE) +- [zeppelin-web/bower.json](zeppelin-web/bower.json) +- [zeppelin-web/src/app/app.js](zeppelin-web/src/app/app.js) +- [zeppelin-web/src/app/notebook/notebook-actionBar.html](zeppelin-web/src/app/notebook/notebook-actionBar.html) +- [zeppelin-web/src/app/notebook/notebook.controller.js](zeppelin-web/src/app/notebook/notebook.controller.js) +- [zeppelin-web/src/app/notebook/notebook.css](zeppelin-web/src/app/notebook/notebook.css) +- [zeppelin-web/src/app/notebook/notebook.html](zeppelin-web/src/app/notebook/notebook.html) +- [zeppelin-web/src/app/notebook/paragraph/paragraph-control.html](zeppelin-web/src/app/notebook/paragraph/paragraph-control.html) +- [zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js](zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js) +- [zeppelin-web/src/app/notebook/paragraph/paragraph.css](zeppelin-web/src/app/notebook/paragraph/paragraph.css) +- [zeppelin-web/src/app/notebook/paragraph/paragraph.html](zeppelin-web/src/app/notebook/paragraph/paragraph.html) +- [zeppelin-web/src/index.html](zeppelin-web/src/index.html) + +
+ + + +This document covers the frontend user interface components for Zeppelin's notebook and paragraph editing functionality. It describes the AngularJS-based web interface that allows users to create, edit, and execute code paragraphs within notebooks, including collaborative editing features, code completion, and real-time execution feedback. + +For information about the backend notebook server and WebSocket APIs, see [Notebook Server and APIs](#4.1). For details about interpreter management and configuration, see [Interpreter Management UI](#3.2). + +## Architecture Overview + +The notebook UI is built on AngularJS 1.5.7 and follows a hierarchical component structure where a single `NotebookCtrl` manages multiple `ParagraphCtrl` instances. The interface integrates the ACE code editor for syntax highlighting and provides real-time collaboration features through WebSocket connections. + +### Component Hierarchy + +```mermaid +graph TD + App["zeppelinWebApp"] --> Main["MainCtrl"] + Main --> Notebook["NotebookCtrl"] + Notebook --> ActionBar["notebook-actionBar.html"] + Notebook --> Settings["Interpreter Settings Panel"] + Notebook --> Permissions["Permissions Panel"] + Notebook --> ParagraphContainer["Paragraph Container"] + ParagraphContainer --> Para1["ParagraphCtrl (1)"] + ParagraphContainer --> Para2["ParagraphCtrl (2)"] + ParagraphContainer --> ParaN["ParagraphCtrl (n)"] + Para1 --> Editor["ACE Editor"] + Para1 --> Forms["Dynamic Forms"] + Para1 --> Results["ResultCtrl"] + Para1 --> Controls["paragraph-control.html"] +``` + +Sources: [zeppelin-web/src/app/app.js:86-109](), [zeppelin-web/src/app/notebook/notebook.html:153-158](), [zeppelin-web/src/app/notebook/paragraph/paragraph.html:1-89]() + +### Controller Communication Pattern + +```mermaid +graph LR + User["User Actions"] --> NotebookCtrl["NotebookCtrl"] + User --> ParagraphCtrl["ParagraphCtrl"] + NotebookCtrl --> WebSocket["websocketMsgSrv"] + ParagraphCtrl --> WebSocket + WebSocket --> 
Backend["Zeppelin Server"] + Backend --> WebSocket + WebSocket --> EventBroadcast["$rootScope.$broadcast"] + EventBroadcast --> NotebookCtrl + EventBroadcast --> ParagraphCtrl + NotebookCtrl --> ParagraphManagement["addParagraph<br/>removeParagraph<br/>moveParagraph"] + ParagraphCtrl --> CodeExecution["runParagraph<br/>cancelParagraph"] + ParagraphCtrl --> ACEEditor["aceLoaded
aceChanged"] +``` + +Sources: [zeppelin-web/src/app/notebook/notebook.controller.js:97-181](), [zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js:31-35](), [zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js:726-742]() + +## Core UI Components + +### NotebookCtrl Controller + +The `NotebookCtrl` serves as the main controller for notebook-level operations and manages the overall notebook state. It handles notebook metadata, paragraph lifecycle, and coordination between multiple paragraphs. + +**Key Responsibilities:** +- Notebook initialization via `initNotebook()` +- Paragraph management through `addPara()`, `removePara()`, and move operations +- Interpreter binding configuration +- Permissions and access control +- Search and replace functionality across paragraphs +- Version control and revision management + +```mermaid +graph TD + NotebookCtrl --> Init["initNotebook()"] + NotebookCtrl --> ParaMgmt["Paragraph Management"] + NotebookCtrl --> InterpSettings["Interpreter Settings"] + NotebookCtrl --> SearchReplace["Search & Replace"] + NotebookCtrl --> VersionControl["Version Control"] + + ParaMgmt --> AddPara["addPara(paragraph, index)"] + ParaMgmt --> RemovePara["removePara(paragraphId)"] + ParaMgmt --> MovePara["moveParagraph events"] + + InterpSettings --> GetBindings["getInterpreterBindings()"] + InterpSettings --> SaveBindings["saveSetting()"] + + SearchReplace --> MarkOccurrences["markAllOccurrences()"] + SearchReplace --> NextPrev["nextOccurrence()
prevOccurrence()"] + SearchReplace --> Replace["replace()
replaceAll()"] +``` + +Sources: [zeppelin-web/src/app/notebook/notebook.controller.js:172-199](), [zeppelin-web/src/app/notebook/notebook.controller.js:582-627](), [zeppelin-web/src/app/notebook/notebook.controller.js:717-738](), [zeppelin-web/src/app/notebook/notebook.controller.js:879-1015]() + +### ParagraphCtrl Controller + +The `ParagraphCtrl` manages individual paragraph execution, editing, and display. Each paragraph instance has its own controller that handles code editing, execution, and result rendering. + +**Key Features:** +- ACE editor integration with syntax highlighting and autocompletion +- Code execution via `runParagraph()` with support for both interpreter and spell execution +- Real-time collaboration through differential synchronization +- Dynamic forms for interactive parameters +- Progress tracking and status updates + +The controller initializes with paragraph data and maintains state for: +- `$scope.editor` - ACE editor instance +- `$scope.originalText` - Original paragraph text for dirty checking +- `$scope.dirtyText` - Current editor content +- `$scope.paragraphFocused` - Focus state for keyboard navigation + +Sources: [zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js:134-158](), [zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js:455-480](), [zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js:751-1018]() + +## Code Editor Integration + +### ACE Editor Configuration + +The paragraph UI integrates the ACE editor for code editing with extensive customization for Zeppelin's requirements. The editor configuration includes: + +**Editor Setup:** +- Syntax highlighting for multiple languages (Scala, Python, SQL, R, etc.) 
+- Custom autocompletion via `remoteCompleter` that communicates with interpreters +- Emacs key bindings on macOS, standard bindings on other platforms +- Dynamic height adjustment based on content + +**Autocompletion System:** +The autocompletion system provides intelligent code suggestions by communicating with backend interpreters: + +```mermaid +graph LR + UserTypes["User Types"] --> ACEEditor["ACE Editor"] + ACEEditor --> RemoteCompleter["remoteCompleter.getCompletions()"] + RemoteCompleter --> WSMessage["websocketMsgSrv.completion()"] + WSMessage --> Backend["Interpreter Backend"] + Backend --> WSResponse["completionList event"] + WSResponse --> FilterResults["filterCompletions()"] + FilterResults --> DisplaySuggestions["ACE Suggestions UI"] +``` + +**Key Editor Methods:** +- `aceLoaded(_editor)` - Initializes editor with custom configuration +- `aceChanged(_, editor)` - Handles content changes and dirty state tracking +- `setParagraphMode(session, text)` - Sets appropriate syntax highlighting mode +- `autoAdjustEditorHeight(editor)` - Dynamically resizes editor + +Sources: [zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js:751-1018](), [zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js:806-866](), [zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js:1104-1139]() + +### Keyboard Shortcuts and Navigation + +The editor supports extensive keyboard shortcuts for efficient notebook navigation and editing: + +**Execution Shortcuts:** +- `Shift+Enter` - Run current paragraph +- `Ctrl+Enter` - Run current paragraph (alternative) +- `Ctrl+Shift+Enter` - Run all paragraphs above/below current + +**Navigation Shortcuts:** +- `Ctrl+Alt+N` - Insert new paragraph below +- `Ctrl+Alt+K` - Move paragraph up +- `Ctrl+Alt+J` - Move paragraph down +- `Ctrl+Alt+D` - Delete paragraph + +**Editor Shortcuts:** +- `Ctrl+Alt+E` - Toggle editor visibility +- `Ctrl+Alt+O` - Toggle output visibility +- `Ctrl+Alt+T` - Toggle title +- `Ctrl+Alt+M` - 
Toggle line numbers + +Sources: [zeppelin-web/src/app/notebook/paragraph/paragraph-control.html:49-211](), [zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js:911-1016]() + +## Real-time Collaboration Features + +### Collaborative Editing + +Zeppelin supports real-time collaborative editing using a differential synchronization approach. When `collaborativeMode` is enabled, changes are synchronized between multiple users editing the same notebook. + +**Collaboration Workflow:** +1. User makes changes in ACE editor +2. `aceChanged()` detects modifications and calls `sendPatch()` +3. `diffMatchPatch.patch_make()` generates a patch between original and current text +4. Patch is sent via WebSocket to other collaborators +5. Remote users receive and apply patches to their editors + +```mermaid +sequenceDiagram + participant User1 as "User 1" + participant ACE1 as "ACE Editor 1" + participant WS as "WebSocket" + participant Server as "Zeppelin Server" + participant ACE2 as "ACE Editor 2" + participant User2 as "User 2" + + User1->>ACE1: "Types code" + ACE1->>ACE1: "aceChanged() triggered" + ACE1->>ACE1: "diffMatchPatch.patch_make()" + ACE1->>WS: "patchParagraph(id, patch)" + WS->>Server: "PATCH_PARAGRAPH" + Server->>WS: "Broadcast patch" + WS->>ACE2: "Apply patch" + ACE2->>User2: "Show updated content" +``` + +**Collaborative State Management:** +- `$scope.collaborativeMode` - Enables/disables collaborative features +- `$scope.collaborativeModeUsers` - List of active users viewing the notebook +- `$scope.diffMatchPatch` - Instance for generating and applying patches + +Sources: [zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js:744-749](), [zeppelin-web/src/app/notebook/notebook.controller.js:38-39](), [zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js:47]() + +## Paragraph Execution and Results + +### Execution Flow + +Paragraph execution supports both traditional interpreter execution and spell-based execution (for frontend-only 
computations). The execution flow varies based on the paragraph content and available execution engines. + +**Execution Decision Tree:** +```mermaid +graph TD + RunClick["User clicks Run"] --> CheckText["Check paragraph text"] + CheckText --> ExtractMagic["SpellResult.extractMagic()"] + ExtractMagic --> HasSpell{Spell available?} + HasSpell -->|Yes| SpellExec["runParagraphUsingSpell()"] + HasSpell -->|No| InterpExec["runParagraphUsingBackendInterpreter()"] + + SpellExec --> SpellProcess["heliumService.executeSpell()"] + SpellProcess --> SpellResults["Handle spell results"] + SpellResults --> SpellComplete["cleanupSpellTransaction()"] + + InterpExec --> WSMessage["websocketMsgSrv.runParagraph()"] + WSMessage --> Backend["Backend Interpreter"] + Backend --> WSResponse["updateParagraphOutput events"] + WSResponse --> DisplayResults["Update paragraph.results"] +``` + +**Key Execution Methods:** +- `runParagraph(paragraphText, digestRequired, propagated)` - Main execution entry point +- `runParagraphUsingSpell()` - Frontend spell execution +- `runParagraphUsingBackendInterpreter()` - Traditional interpreter execution +- `cancelParagraph()` - Cancels running paragraphs + +Sources: [zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js:455-480](), [zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js:362-398](), [zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js:400-403]() + +### Result Rendering + +Paragraph results are handled by the `ResultCtrl` which supports multiple output formats including text, HTML, tables, and visualizations. The result system is extensible and supports dynamic table displays with sorting and filtering. 
+ +**Result Types:** +- `TEXT` - Plain text output +- `HTML` - Rendered HTML content +- `TABLE` - Tabular data with interactive features +- `IMG` - Image outputs +- `SVG` - Vector graphics + +The result rendering pipeline processes interpreter output and applies appropriate formatting and interactive features based on the result type. + +Sources: [zeppelin-web/src/app/notebook/paragraph/paragraph.html:63-73](), [zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js:206-231]() + +## Action Bar and Controls + +### Notebook Action Bar + +The notebook action bar provides global notebook operations and is implemented as a fixed header that remains visible during scrolling. It includes notebook-level controls for execution, export, version control, and configuration. + +**Action Bar Features:** +- **Execution Controls**: Run all paragraphs, toggle editor/output visibility +- **Export Options**: Export as `.zpln` format or Jupyter `.ipynb` format +- **Version Control**: Commit changes, browse revisions, compare versions +- **Search**: Find and replace across all paragraphs +- **Settings**: Interpreter bindings, permissions, look and feel + +```mermaid +graph LR + ActionBar["notebook-actionBar.html"] --> ExecControls["Execution Controls"] + ActionBar --> ExportOpts["Export Options"] + ActionBar --> VersionCtrl["Version Control"] + ActionBar --> SearchFunc["Search Function"] + ActionBar --> Settings["Settings"] + + ExecControls --> RunAll["runAllParagraphs()"] + ExecControls --> ToggleEditor["toggleAllEditor()"] + ExecControls --> ToggleOutput["toggleAllTable()"] + ExecControls --> ClearOutput["clearAllParagraphOutput()"] + + ExportOpts --> ExportNote["exportNote()"] + ExportOpts --> ExportNbformat["exportNbformat()"] + + SearchFunc --> FindInput["Find Input Field"] + SearchFunc --> ReplaceInput["Replace Input Field"] + SearchFunc --> NavControls["Next/Previous Controls"] +``` + +Sources: [zeppelin-web/src/app/notebook/notebook-actionBar.html:30-242](), 
[zeppelin-web/src/app/notebook/notebook.controller.js:395-465]() + +### Paragraph Controls + +Each paragraph has an individual control panel that provides paragraph-specific operations accessible through a dropdown menu. The controls are contextually enabled/disabled based on notebook and paragraph state. + +**Control Categories:** +- **Execution**: Run, cancel, auto-run configuration +- **Layout**: Column width, font size, title visibility, line numbers +- **Navigation**: Move up/down, insert new paragraph +- **Management**: Clone, clear output, remove paragraph +- **Utility**: Copy paragraph ID, link to paragraph + +The control system respects the notebook's running state and revision view mode, disabling inappropriate actions when necessary. + +Sources: [zeppelin-web/src/app/notebook/paragraph/paragraph-control.html:70-214](), [zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js:529-612]() diff --git a/.cursor/documentation/web_interface/web_interface.md b/.cursor/documentation/web_interface/web_interface.md new file mode 100644 index 00000000000..3eee11c2750 --- /dev/null +++ b/.cursor/documentation/web_interface/web_interface.md @@ -0,0 +1,363 @@ +# Web Interface + +
+Relevant source files + +The following files were used as context for generating this wiki page: + +- [zeppelin-distribution/src/bin_license/LICENSE](zeppelin-distribution/src/bin_license/LICENSE) +- [zeppelin-distribution/src/bin_license/licenses/LICENSE-patuaOne-font](zeppelin-distribution/src/bin_license/licenses/LICENSE-patuaOne-font) +- [zeppelin-distribution/src/bin_license/licenses/LICENSE-source_code_pro-font](zeppelin-distribution/src/bin_license/licenses/LICENSE-source_code_pro-font) +- [zeppelin-web/.eslintrc](zeppelin-web/.eslintrc) +- [zeppelin-web/Gruntfile.js](zeppelin-web/Gruntfile.js) +- [zeppelin-web/bower.json](zeppelin-web/bower.json) +- [zeppelin-web/karma.conf.js](zeppelin-web/karma.conf.js) +- [zeppelin-web/package-lock.json](zeppelin-web/package-lock.json) +- [zeppelin-web/package.json](zeppelin-web/package.json) +- [zeppelin-web/src/app/app.controller.test.js](zeppelin-web/src/app/app.controller.test.js) +- [zeppelin-web/src/app/app.js](zeppelin-web/src/app/app.js) +- [zeppelin-web/src/app/home/home.controller.js](zeppelin-web/src/app/home/home.controller.js) +- [zeppelin-web/src/app/home/home.css](zeppelin-web/src/app/home/home.css) +- [zeppelin-web/src/app/home/home.html](zeppelin-web/src/app/home/home.html) +- [zeppelin-web/src/app/notebook/notebook-actionBar.html](zeppelin-web/src/app/notebook/notebook-actionBar.html) +- [zeppelin-web/src/app/notebook/notebook.controller.js](zeppelin-web/src/app/notebook/notebook.controller.js) +- [zeppelin-web/src/app/notebook/notebook.controller.test.js](zeppelin-web/src/app/notebook/notebook.controller.test.js) +- [zeppelin-web/src/app/notebook/notebook.css](zeppelin-web/src/app/notebook/notebook.css) +- [zeppelin-web/src/app/notebook/notebook.html](zeppelin-web/src/app/notebook/notebook.html) +- [zeppelin-web/src/app/notebook/paragraph/paragraph-control.html](zeppelin-web/src/app/notebook/paragraph/paragraph-control.html) +- 
[zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js](zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js) +- [zeppelin-web/src/app/notebook/paragraph/paragraph.controller.test.js](zeppelin-web/src/app/notebook/paragraph/paragraph.controller.test.js) +- [zeppelin-web/src/app/notebook/paragraph/paragraph.css](zeppelin-web/src/app/notebook/paragraph/paragraph.css) +- [zeppelin-web/src/app/notebook/paragraph/paragraph.html](zeppelin-web/src/app/notebook/paragraph/paragraph.html) +- [zeppelin-web/src/app/tabledata/tabledata.test.js](zeppelin-web/src/app/tabledata/tabledata.test.js) +- [zeppelin-web/src/components/navbar/navbar.controller.js](zeppelin-web/src/components/navbar/navbar.controller.js) +- [zeppelin-web/src/components/navbar/navbar.controller.test.js](zeppelin-web/src/components/navbar/navbar.controller.test.js) +- [zeppelin-web/src/components/navbar/navbar.css](zeppelin-web/src/components/navbar/navbar.css) +- [zeppelin-web/src/components/navbar/navbar.html](zeppelin-web/src/components/navbar/navbar.html) +- [zeppelin-web/src/index.html](zeppelin-web/src/index.html) + +
+ + + +The Apache Zeppelin web interface is a browser-based frontend application that provides an interactive notebook environment for data analytics. This document covers the architecture, components, and implementation details of the client-side web application. + +For information about server-side notebook management and APIs, see [Notebook Server and APIs](#4.1). For details about interpreter configuration interfaces, see [Interpreter Management UI](#3.2). + +## Frontend Architecture Overview + +The Zeppelin web interface is built as a single-page application (SPA) using AngularJS 1.5.7. The application follows a modular architecture with distinct controllers, services, and templates for different functional areas. + +### Application Structure + +```mermaid +graph TB + subgraph "zeppelin-web Module" + AppJS["app.js
Main Module"] + IndexHTML["index.html<br/>Entry Point"] + MainCtrl["MainCtrl<br/>Root Controller"] + end + + subgraph "Core Components" + NavCtrl["NavCtrl<br/>navbar.controller.js"] + HomeCtrl["HomeCtrl<br/>home.controller.js"] + NotebookCtrl["NotebookCtrl<br/>notebook.controller.js"] + ParagraphCtrl["ParagraphCtrl<br/>paragraph.controller.js"] + end + + subgraph "Services & Communication" + WebSocketSrv["websocketMsgSrv<br/>WebSocket Service"] + BaseUrlSrv["baseUrlSrv<br/>REST API Base"] + HeliumSrv["heliumService
Plugin System"] + end + + subgraph "External Libraries" + Angular["AngularJS 1.5.7"] + Bootstrap["Bootstrap 3.x"] + ACE["ACE Editor"] + D3["D3.js + NVD3"] + end + + AppJS --> NavCtrl + AppJS --> HomeCtrl + AppJS --> NotebookCtrl + AppJS --> ParagraphCtrl + + NotebookCtrl --> ParagraphCtrl + NavCtrl --> WebSocketSrv + NotebookCtrl --> WebSocketSrv + ParagraphCtrl --> WebSocketSrv + + AppJS --> Angular + AppJS --> Bootstrap + ParagraphCtrl --> ACE + AppJS --> D3 +``` + +Sources: [zeppelin-web/src/app/app.js:67-106](), [zeppelin-web/src/index.html:15-223](), [zeppelin-web/bower.json:4-43]() + +### Module Dependencies + +The main application module `zeppelinWebApp` integrates numerous Angular modules and third-party libraries: + +| Category | Modules | Purpose | +|----------|---------|---------| +| Core Angular | `ngCookies`, `ngAnimate`, `ngRoute`, `ngSanitize` | Basic Angular functionality | +| UI Components | `ui.bootstrap`, `ui.ace`, `as.sortable` | UI widgets and editor | +| Communication | `angular-websocket`, `ngResource` | Server communication | +| Data Processing | `angular.filter`, `ui.grid` | Data manipulation and display | +| Utilities | `ngToast`, `ngclipboard`, `xeditable` | User experience enhancements | + +Sources: [zeppelin-web/src/app/app.js:27-59](), [zeppelin-web/bower.json:5-39]() + +## Routing and Navigation + +The application uses Angular's `$routeProvider` to define client-side routes that map URLs to templates and controllers. + +### Route Configuration + +```mermaid +graph LR + subgraph "Main Routes" + Root["/"] + Notebook["/notebook/:noteId"] + Paragraph["/notebook/:noteId/paragraph/:paragraphId"] + Revision["/notebook/:noteId/revision/:revisionId"] + Interpreter["/interpreter"] + JobManager["/jobmanager"] + end + + subgraph "Templates & Controllers" + HomeHTML["home.html"] + NotebookHTML["notebook.html
NotebookCtrl"] + InterpreterHTML["interpreter.html
InterpreterCtrl"] + JobHTML["jobmanager.html
JobmanagerCtrl"] + end + + Root --> HomeHTML + Notebook --> NotebookHTML + Paragraph --> NotebookHTML + Revision --> NotebookHTML + Interpreter --> InterpreterHTML + JobManager --> JobHTML +``` + +Sources: [zeppelin-web/src/app/app.js:86-130]() + +The routing system supports: +- **Notebook Display**: `/notebook/:noteId` routes to the main notebook interface +- **Paragraph Focus**: `/notebook/:noteId/paragraph/:paragraphId` for direct paragraph access +- **Revision Viewing**: `/notebook/:noteId/revision/:revisionId` for version control +- **Management Interfaces**: Dedicated routes for interpreter and job management + +## Core User Interface Components + +### Navigation Bar Component + +The `NavCtrl` controller manages the top-level navigation, search functionality, and user authentication state. + +**Key Functions:** +- `search(searchTerm)` - Navigate to search results ([zeppelin-web/src/components/navbar/navbar.controller.js:205-207]()) +- `logout()` - Handle user logout process ([zeppelin-web/src/components/navbar/navbar.controller.js:101-177]()) +- `loadNotes()` - Fetch and display notebook list ([zeppelin-web/src/components/navbar/navbar.controller.js:93-95]()) + +```mermaid +graph TB + subgraph "NavCtrl Responsibilities" + UserAuth["User Authentication
Login/Logout"] + NotebookList["Notebook List
Dropdown Menu"] + SearchBox["Search Functionality
Global Search"] + ConnStatus["Connection Status
WebSocket State"] + end + + subgraph "Services Used" + WebSocketSrv["websocketMsgSrv"] + BaseUrlSrv["baseUrlSrv"] + NoteFactory["noteListFactory"] + end + + UserAuth --> BaseUrlSrv + NotebookList --> NoteFactory + SearchBox --> BaseUrlSrv + ConnStatus --> WebSocketSrv +``` + +Sources: [zeppelin-web/src/components/navbar/navbar.controller.js:17-276](), [zeppelin-web/src/components/navbar/navbar.html:13-154]() + +### Home Page Interface + +The `HomeCtrl` manages the landing page, displaying available notebooks and providing creation/import functionality. + +**Key Features:** +- Note list with filtering capabilities +- Import/export note functionality +- Folder-based organization +- Real-time updates via WebSocket + +Sources: [zeppelin-web/src/app/home/home.controller.js:17-139](), [zeppelin-web/src/app/home/home.html:14-128]() + +### Notebook Interface Architecture + +The notebook interface consists of the main `NotebookCtrl` controller and multiple `ParagraphCtrl` instances for individual code/text blocks. + +```mermaid +graph TB + subgraph "NotebookCtrl Scope" + NoteConfig["Note Configuration
Settings & Permissions"] + ActionBar["Action Bar
Run, Export, Clone"] + ParagraphMgmt["Paragraph Management
Add, Remove, Reorder"] + Search["Search & Replace
Content Search"] + end + + subgraph "ParagraphCtrl Instances" + Para1["ParagraphCtrl #1
Code Editor + Results"] + Para2["ParagraphCtrl #2
Code Editor + Results"] + ParaN["ParagraphCtrl #N
Code Editor + Results"] + end + + subgraph "Shared Services" + WebSocket["websocketMsgSrv
Real-time Communication"] + Helium["heliumService
Visualization Plugins"] + end + + NoteConfig --> WebSocket + ActionBar --> WebSocket + ParagraphMgmt --> Para1 + ParagraphMgmt --> Para2 + ParagraphMgmt --> ParaN + + Para1 --> WebSocket + Para2 --> WebSocket + ParaN --> WebSocket + + Para1 --> Helium + Para2 --> Helium + ParaN --> Helium +``` + +Sources: [zeppelin-web/src/app/notebook/notebook.controller.js:21-1319](), [zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js:31-1519]() + +## Communication Layer + +The frontend communicates with the Zeppelin server through two primary channels: + +### WebSocket Communication + +Real-time bidirectional communication is handled by `websocketMsgSrv` for: +- **Paragraph Execution**: `runParagraph()`, `cancelParagraphRun()` +- **Note Management**: `getNote()`, `updateNote()`, `cloneNote()` +- **Interpreter Control**: `getInterpreterBindings()`, `saveInterpreterBindings()` +- **Real-time Updates**: Paragraph results, status changes, collaborative editing + +### REST API Integration + +HTTP requests are made through `baseUrlSrv` for: +- **Authentication**: Login/logout operations +- **File Operations**: Note import/export +- **Configuration**: System settings and version info +- **Permissions**: Note access control + +```mermaid +graph LR + subgraph "Frontend Services" + WebSocketSrv["websocketMsgSrv"] + BaseUrlSrv["baseUrlSrv"] + HttpSrv["$http"] + end + + subgraph "Server Endpoints" + WSEndpoint["WebSocket
/ws"] + RestAPI["REST API
/api/..."] + AuthEndpoint["Auth
/login"] + end + + subgraph "Communication Types" + RealTime["Real-time Updates
Paragraph execution
Collaborative editing"] + ReqResp["Request/Response
Configuration
File operations"] + end + + WebSocketSrv --> WSEndpoint + BaseUrlSrv --> RestAPI + HttpSrv --> AuthEndpoint + + WSEndpoint --> RealTime + RestAPI --> ReqResp + AuthEndpoint --> ReqResp +``` + +Sources: [zeppelin-web/src/components/websocket/websocket-event.factory.js](), [zeppelin-web/src/components/baseUrl/baseurl.service.js]() + +## Code Editor Integration + +The paragraph interface integrates the ACE editor for code editing with extensive customization and language support. + +### Editor Configuration + +The `ParagraphCtrl` configures ACE editor instances with: +- **Syntax Highlighting**: Multiple language modes (Scala, Python, SQL, etc.) +- **Autocompletion**: Custom completion provider with backend integration +- **Key Bindings**: Emacs/default key bindings based on platform +- **Real-time Collaboration**: Patch-based synchronization + +**Key Editor Functions:** +- `aceLoaded(_editor)` - Initialize editor instance ([zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js:751-1018]()) +- `aceChanged(_, editor)` - Handle content changes ([zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js:726-742]()) +- `sendPatch()` - Collaborative editing synchronization ([zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js:744-749]()) + +Sources: [zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js:751-1018](), [zeppelin-web/bower.json:17-18]() + +## Build System and Development Workflow + +The frontend uses a JavaScript build pipeline that combines Grunt and Webpack, with npm and Bower managing dependencies.
+ +### Build Tools Configuration + +| Tool | Purpose | Key Files | +|------|---------|-----------| +| **Grunt** | Task automation | [zeppelin-web/Gruntfile.js]() | +| **Webpack** | Module bundling | [zeppelin-web/webpack.config.js]() | +| **npm** | Dependency management | [zeppelin-web/package.json]() | +| **Bower** | Frontend dependencies | [zeppelin-web/bower.json]() | + +### Development Scripts + +The `scripts` section of [zeppelin-web/package.json:8-24]() defines the key npm scripts: +- `npm run dev` - Development server with hot reload +- `npm run build:dist` - Production build +- `npm run lint:watch` - Continuous linting +- `npm run karma-test` - Unit test execution + +### Asset Processing Pipeline + +```mermaid +graph LR + subgraph "Source Files" + JSNG["AngularJS Files
Controllers, Services"] + HTML["HTML Templates"] + CSS["CSS/SCSS Styles"] + Deps["Bower Dependencies
Third-party libs"] + end + + subgraph "Build Process" + Webpack["Webpack
Module Bundling"] + Grunt["Grunt Tasks
Copy, Minify, Concat"] + PostCSS["PostCSS
Autoprefixer"] + end + + subgraph "Output" + DistJS["Bundled JS
dist/scripts/"] + DistCSS["Optimized CSS
dist/styles/"] + DistHTML["Processed HTML
dist/"] + end + + JSNG --> Webpack + HTML --> Grunt + CSS --> PostCSS + Deps --> Grunt + + Webpack --> DistJS + PostCSS --> DistCSS + Grunt --> DistHTML +``` + +Sources: [zeppelin-web/Gruntfile.js:18-342](), [zeppelin-web/package.json:8-24](), [zeppelin-web/webpack.config.js]() diff --git a/.gitignore b/.gitignore index e52d5a63a2e..ec6747baad1 100644 --- a/.gitignore +++ b/.gitignore @@ -131,3 +131,5 @@ tramp # jEnv file .java-version +.pre-commit-config.yaml +trufflehog/