From 5fb6ee84e64ff6714f3d308f6837e4c4c208e195 Mon Sep 17 00:00:00 2001 From: Alex Ott Date: Thu, 11 Jun 2020 15:09:38 +0200 Subject: [PATCH] [ZEPPELIN-4870] Improve parsing of the paragraph properties --- docs/usage/interpreter/overview.md | 8 +- .../notebook/ParagraphTextParser.java | 129 ++++++++++++++---- .../notebook/ParagraphTextParserTest.java | 105 +++++++++++++- 3 files changed, 205 insertions(+), 37 deletions(-) diff --git a/docs/usage/interpreter/overview.md b/docs/usage/interpreter/overview.md index 865c3a60f39..20989ee2ccf 100644 --- a/docs/usage/interpreter/overview.md +++ b/docs/usage/interpreter/overview.md @@ -38,7 +38,13 @@ When you click the ```+Create``` button on the interpreter page, the interpreter You can create multiple interpreters for the same engine with different interpreter setting. e.g. You can create `spark2` for Spark 2.x and create `spark1` for Spark 1.x. -For each paragraph you write in Zeppelin, you need to specify its interpreter first via `%interpreter_group.interpreter_name`. e.g. `%spark.pyspark`, `%spark.r` +For each paragraph you write in Zeppelin, you need to specify its interpreter first via `%interpreter_group.interpreter_name`. e.g. `%spark.pyspark`, `%spark.r`. + +If you specify interpreter, you can also pass local properties to it (if it needs them). This is done by providing a set of key/value pairs, separated by comma, inside the round brackets right after the interpreter name. If key or value contain characters like `=`, or `,`, then you can either escape them with `\` character, or wrap the whole value inside the double quotes For example: + +``` +%cassandra(outputFormat=cql, dateFormat="E, d MMM yy", timeFormat=E\, d MMM yy) +``` ## What are the Interpreter Settings? The interpreter settings are the configuration of a given interpreter on the Zeppelin server. For example, certain properties need to be set for the Apache Hive JDBC interpreter to connect to the Hive server. diff --git a/zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/ParagraphTextParser.java b/zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/ParagraphTextParser.java index 5ee7261cd51..f4742b414c0 100644 --- a/zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/ParagraphTextParser.java +++ b/zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/ParagraphTextParser.java @@ -17,13 +17,12 @@ package org.apache.zeppelin.notebook; -import org.apache.commons.lang3.StringUtils; - import java.util.HashMap; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; - +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Parser which is used for parsing the paragraph text. @@ -42,6 +41,7 @@ * localProperties: Map(pool->pool_1) */ public class ParagraphTextParser { + private static final Logger LOGGER = LoggerFactory.getLogger(ParagraphTextParser.class); public static class ParseResult { private String intpText; @@ -67,41 +67,112 @@ public Map getLocalProperties() { } } - private static Pattern REPL_PATTERN = - Pattern.compile("(\\s*)%([\\w\\.]+)(\\(.*?\\))?.*", Pattern.DOTALL); + private static Pattern REPL_PATTERN = Pattern.compile("^(\\s*)%(\\w+(?:\\.\\w+)*)"); + + private static int parseLocalProperties( + final String text, int startPos, + Map localProperties) throws RuntimeException{ + startPos++; + String propKey = null; + boolean insideQuotes = false, parseKey = true, finished = false; + StringBuilder sb = new StringBuilder(); + while(!finished && startPos < text.length()) { + char ch = text.charAt(startPos); + switch (ch) { + case ')': { + if (!insideQuotes) { + if (parseKey) { + propKey = sb.toString().trim(); + if (!propKey.isEmpty()) { + localProperties.put(propKey, propKey); + } + } else { + localProperties.put(propKey, sb.toString().trim()); + } + finished = true; + } else { + sb.append(ch); + } + break; + } + case '\\': { + if ((startPos + 1) == text.length()) { + throw new RuntimeException( + "Problems by parsing paragraph. Unfinished escape sequence"); + } + startPos++; + ch = text.charAt(startPos); + switch (ch) { + case 'n': + sb.append('\n'); + break; + case 't': + sb.append('\t'); + break; + default: + sb.append(ch); + } + break; + } + case '"': { + insideQuotes = !insideQuotes; + break; + } + case '=': { + if (insideQuotes) { + sb.append(ch); + } else { + propKey = sb.toString().trim(); + sb.delete(0, sb.length()); + parseKey = false; + } + break; + } + case ',': { + if (insideQuotes) { + sb.append(ch); + } else if (propKey == null || propKey.trim().isEmpty()) { + throw new RuntimeException( + "Problems by parsing paragraph. Local property key is empty"); + } else { + if (parseKey) { + propKey = sb.toString().trim(); + localProperties.put(propKey, propKey); + } else { + localProperties.put(propKey, sb.toString().trim()); + } + } + propKey = null; + parseKey = true; + sb.delete(0, sb.length()); + break; + } + default: + sb.append(ch); + } + startPos++; + } + if (!finished) { + throw new RuntimeException( + "Problems by parsing paragraph. Not finished interpreter configuration"); + } + return startPos; + } public static ParseResult parse(String text) { Map localProperties = new HashMap<>(); - String intpText = null; + String intpText = ""; String scriptText = null; Matcher matcher = REPL_PATTERN.matcher(text); - if (matcher.matches()) { + if (matcher.find()) { String headingSpace = matcher.group(1); intpText = matcher.group(2); - if (matcher.groupCount() == 3 && matcher.group(3) != null) { - String localPropertiesText = matcher.group(3); - String[] splits = localPropertiesText.substring(1, localPropertiesText.length() - 1) - .split(","); - for (String split : splits) { - String[] kv = split.split("="); - if (StringUtils.isBlank(split) || kv.length == 0) { - continue; - } - if (kv.length > 2) { - throw new RuntimeException("Invalid paragraph properties format: " + split); - } - if (kv.length == 1) { - localProperties.put(kv[0].trim(), kv[0].trim()); - } else { - localProperties.put(kv[0].trim(), kv[1].trim()); - } - } - scriptText = text.substring(headingSpace.length() + intpText.length() + - localPropertiesText.length() + 1).trim(); - } else { - scriptText = text.substring(headingSpace.length() + intpText.length() + 1).trim(); + int startPos = headingSpace.length() + intpText.length() + 1; + if (text.charAt(startPos) == '(') { + startPos = parseLocalProperties(text, startPos, localProperties); } + scriptText = text.substring(startPos).trim(); } else { intpText = ""; scriptText = text; diff --git a/zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/ParagraphTextParserTest.java b/zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/ParagraphTextParserTest.java index 6da2454f151..46361052a3a 100644 --- a/zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/ParagraphTextParserTest.java +++ b/zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/ParagraphTextParserTest.java @@ -17,7 +17,9 @@ package org.apache.zeppelin.notebook; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.ExpectedException; import static junit.framework.TestCase.assertEquals; @@ -33,30 +35,119 @@ public void testJupyter() { } @Test - public void testParagraphText() { + public void testParagraphTextLocalPropertiesAndText() { ParagraphTextParser.ParseResult parseResult = ParagraphTextParser.parse("%spark.pyspark(pool=pool_1) sc.version"); assertEquals("spark.pyspark", parseResult.getIntpText()); assertEquals(1, parseResult.getLocalProperties().size()); assertEquals("pool_1", parseResult.getLocalProperties().get("pool")); assertEquals("sc.version", parseResult.getScriptText()); + } - // no script text - parseResult = ParagraphTextParser.parse("%spark.pyspark(pool=pool_1)"); + @Test + public void testParagraphTextLocalPropertiesNoText() { + ParagraphTextParser.ParseResult parseResult = ParagraphTextParser.parse("%spark.pyspark(pool=pool_1)"); assertEquals("spark.pyspark", parseResult.getIntpText()); assertEquals(1, parseResult.getLocalProperties().size()); assertEquals("pool_1", parseResult.getLocalProperties().get("pool")); assertEquals("", parseResult.getScriptText()); + } + + @Test + public void testParagraphTextLocalPropertyNoValueNoText() { + ParagraphTextParser.ParseResult parseResult = ParagraphTextParser.parse("%spark.pyspark(pool)"); + assertEquals("spark.pyspark", parseResult.getIntpText()); + assertEquals(1, parseResult.getLocalProperties().size()); + assertEquals("pool", parseResult.getLocalProperties().get("pool")); + assertEquals("", parseResult.getScriptText()); + } - // no paragraph local properties - parseResult = ParagraphTextParser.parse("%spark.pyspark sc.version"); + @Test + public void testParagraphTextNoLocalProperties() { + ParagraphTextParser.ParseResult parseResult = ParagraphTextParser.parse("%spark.pyspark sc.version"); assertEquals("spark.pyspark", parseResult.getIntpText()); assertEquals(0, parseResult.getLocalProperties().size()); assertEquals("sc.version", parseResult.getScriptText()); + } - // no intp text and paragraph local properties - parseResult = ParagraphTextParser.parse("sc.version"); + @Test + public void testParagraphNoInterpreter() { + ParagraphTextParser.ParseResult parseResult = ParagraphTextParser.parse("sc.version"); assertEquals("", parseResult.getIntpText()); assertEquals(0, parseResult.getLocalProperties().size()); assertEquals("sc.version", parseResult.getScriptText()); } + + @Test + public void testParagraphInterpreterWithoutProperties() { + ParagraphTextParser.ParseResult parseResult = ParagraphTextParser.parse("%spark() sc.version"); + assertEquals("spark", parseResult.getIntpText()); + assertEquals(0, parseResult.getLocalProperties().size()); + assertEquals("sc.version", parseResult.getScriptText()); + } + + @Test + public void testParagraphTextQuotedPropertyValue1() { + ParagraphTextParser.ParseResult parseResult = ParagraphTextParser.parse( + "%spark.pyspark(pool=\"value with = inside\")"); + assertEquals("spark.pyspark", parseResult.getIntpText()); + assertEquals(1, parseResult.getLocalProperties().size()); + assertEquals("value with = inside", parseResult.getLocalProperties().get("pool")); + assertEquals("", parseResult.getScriptText()); + } + + @Test + public void testParagraphTextQuotedPropertyValue2() { + ParagraphTextParser.ParseResult parseResult = ParagraphTextParser.parse( + "%spark.pyspark(pool=\"value with \\\" inside\", p=\"eol\\ninside\" )"); + assertEquals("spark.pyspark", parseResult.getIntpText()); + assertEquals(2, parseResult.getLocalProperties().size()); + assertEquals("value with \" inside", parseResult.getLocalProperties().get("pool")); + assertEquals("eol\ninside", parseResult.getLocalProperties().get("p")); + assertEquals("", parseResult.getScriptText()); + } + + @Test + public void testParagraphTextQuotedPropertyKeyAndValue() { + ParagraphTextParser.ParseResult parseResult = ParagraphTextParser.parse( + "%spark.pyspark(\"po ol\"=\"value with \\\" inside\")"); + assertEquals("spark.pyspark", parseResult.getIntpText()); + assertEquals(1, parseResult.getLocalProperties().size()); + assertEquals("value with \" inside", parseResult.getLocalProperties().get("po ol")); + assertEquals("", parseResult.getScriptText()); + } + + @Rule + public ExpectedException exceptionRule = ExpectedException.none(); + + @Test + public void testParagraphTextUnfinishedConfig() { + exceptionRule.expect(RuntimeException.class); + exceptionRule.expectMessage( + "Problems by parsing paragraph. Not finished interpreter configuration"); + ParagraphTextParser.parse("%spark.pyspark(pool="); + } + + @Test + public void testParagraphTextUnfinishedQuote() { + exceptionRule.expect(RuntimeException.class); + exceptionRule.expectMessage( + "Problems by parsing paragraph. Not finished interpreter configuration"); + ParagraphTextParser.parse("%spark.pyspark(pool=\"2314234) sc.version"); + } + + @Test + public void testParagraphTextUnclosedBackslash() { + exceptionRule.expect(RuntimeException.class); + exceptionRule.expectMessage( + "Problems by parsing paragraph. Unfinished escape sequence"); + ParagraphTextParser.parse("%spark.pyspark(pool=\\"); + } + + @Test + public void testParagraphTextEmptyKey() { + exceptionRule.expect(RuntimeException.class); + exceptionRule.expectMessage( + "Problems by parsing paragraph. Local property key is empty"); + ParagraphTextParser.parse("%spark.pyspark(pool=123, ,)"); + } }