Add rudimentary form of error recovery to parser

When we encounter a parser error, we now attempt some basic error recovery by simply skipping tokens until we reach a token at the start of a line that is an identifier, documentation comment, or '@'. This should help tools such as language servers (LSP), which should still be able to offer jump to definition even with a broken or in-progress model.
smithy-lang · Apr 14, 2023 · a4c906d · a4c906d
1 parent 7080189
commit a4c906d
Show file tree

Hide file tree

Showing 4 changed files with 113 additions and 32 deletions.
diff --git a/smithy-model/src/main/java/software/amazon/smithy/model/loader/IdlModelLoader.java b/smithy-model/src/main/java/software/amazon/smithy/model/loader/IdlModelLoader.java
@@ -415,51 +415,85 @@ private void parseApplyStatement() {
 
     private void parseFirstShapeStatement(SourceLocation possibleDocCommentLocation) {
         if (tokenizer.getCurrentToken() != IdlToken.EOF) {
-            if (tokenizer.doesCurrentIdentifierStartWith('a')) {
-                parseApplyStatement();
-            } else {
-                List<IdlTraitParser.Result> traits;
-                boolean hasDocComment = tokenizer.getCurrentToken() == IdlToken.DOC_COMMENT;
-
-                if (possibleDocCommentLocation == null) {
-                    traits = IdlTraitParser.parseDocsAndTraitsBeforeShape(tokenizer, resolver);
+            try {
+                if (tokenizer.doesCurrentIdentifierStartWith('a')) {
+                    parseApplyStatement();
                 } else {
-                    // In this case, this is the first shape encountered for a model file that doesn't have any
-                    // use statements. We need to take the previously skipped documentation comments to parse
-                    // potential use statements and apply them to this first shape.
-                    String docLines = tokenizer.removePendingDocCommentLines();
-                    traits = IdlTraitParser.parseDocsAndTraitsBeforeShape(tokenizer, resolver);
-                    // Note that possibleDocCommentLocation is just a mark of where docs _could be_.
-                    if (docLines != null) {
-                        hasDocComment = true;
-                        traits.add(new IdlTraitParser.Result(DocumentationTrait.ID.toString(),
-                                                             new StringNode(docLines, possibleDocCommentLocation),
-                                                             IdlTraitParser.TraitType.DOC_COMMENT));
+                    List<IdlTraitParser.Result> traits;
+                    boolean hasDocComment = tokenizer.getCurrentToken() == IdlToken.DOC_COMMENT;
+
+                    if (possibleDocCommentLocation == null) {
+                        traits = IdlTraitParser.parseDocsAndTraitsBeforeShape(tokenizer, resolver);
+                    } else {
+                        // In this case, this is the first shape encountered for a model file that doesn't have any
+                        // use statements. We need to take the previously skipped documentation comments to parse
+                        // potential use statements and apply them to this first shape.
+                        String docLines = tokenizer.removePendingDocCommentLines();
+                        traits = IdlTraitParser.parseDocsAndTraitsBeforeShape(tokenizer, resolver);
+                        // Note that possibleDocCommentLocation is just a mark of where docs _could be_.
+                        if (docLines != null) {
+                            hasDocComment = true;
+                            traits.add(new IdlTraitParser.Result(DocumentationTrait.ID.toString(),
+                                                                 new StringNode(docLines, possibleDocCommentLocation),
+                                                                 IdlTraitParser.TraitType.DOC_COMMENT));
+                        }
                     }
-                }
 
-                if (parseShapeDefinition(traits, hasDocComment)) {
-                    parseShape(traits);
+                    if (parseShapeDefinition(traits, hasDocComment)) {
+                        parseShape(traits);
+                    }
                 }
+            } catch (ModelSyntaxException e) {
+                errorRecovery(e);
             }
         }
     }
 
     private void parseSubsequentShapeStatements() {
-        while (tokenizer.getCurrentToken() != IdlToken.EOF) {
-            if (tokenizer.doesCurrentIdentifierStartWith('a')) {
-                parseApplyStatement();
-            } else {
-                List<IdlTraitParser.Result> traits;
-                boolean hasDocComment = tokenizer.getCurrentToken() == IdlToken.DOC_COMMENT;
-                traits = IdlTraitParser.parseDocsAndTraitsBeforeShape(tokenizer, resolver);
-                if (parseShapeDefinition(traits, hasDocComment)) {
-                    parseShape(traits);
+        while (tokenizer.hasNext()) {
+            try {
+                if (tokenizer.doesCurrentIdentifierStartWith('a')) {
+                    parseApplyStatement();
+                } else {
+                    List<IdlTraitParser.Result> traits;
+                    boolean hasDocComment = tokenizer.getCurrentToken() == IdlToken.DOC_COMMENT;
+                    traits = IdlTraitParser.parseDocsAndTraitsBeforeShape(tokenizer, resolver);
+                    if (parseShapeDefinition(traits, hasDocComment)) {
+                        parseShape(traits);
+                    }
                 }
+            } catch (ModelSyntaxException e) {
+                errorRecovery(e);
             }
         }
     }
 
+    private void errorRecovery(ModelSyntaxException e) {
+        if (!tokenizer.hasNext()) {
+            throw e; // No tokens remain to recover into, so rethrow the original syntax error.
+        }
+
+        // Here we do rudimentary error recovery to attempt to make sense of the remaining model.
+        // We do this by skipping tokens until we find one at the start of a line (column 1) that is an
+        // identifier, documentation comment, or '@' (see isErrorRecoveryToken). This is only a heuristic.
+        // The model is still invalid and will fail to validate, but things like IDEs should still be able
+        // to do things like jump to definition.
+        emit(ValidationEvent.fromSourceException(e));
+
+        do {
+            // Always skip the current token since it was the one that failed.
+            IdlToken token = tokenizer.next();
+            if (tokenizer.getCurrentTokenColumn() == 1 && isErrorRecoveryToken(token)) {
+                break;
+            }
+        } while (tokenizer.hasNext());
+    }
+
+    // These tokens are good signals that the next shape (or its doc comments and traits) is starting.
+    private boolean isErrorRecoveryToken(IdlToken token) {
+        return token == IdlToken.IDENTIFIER || token == IdlToken.DOC_COMMENT || token == IdlToken.AT;
+    }
+
     private boolean parseShapeDefinition(List<IdlTraitParser.Result> traits, boolean hasDocComment) {
         if (tokenizer.getCurrentToken() != IdlToken.EOF) {
             // Continue to parse if not at the end of the file.

diff --git a/smithy-model/src/main/java/software/amazon/smithy/model/loader/IdlTokenizer.java b/smithy-model/src/main/java/software/amazon/smithy/model/loader/IdlTokenizer.java
@@ -321,7 +321,7 @@ public IdlToken next() {
 
         if (c == SimpleParser.EOF) {
             if (emittedEof) {
-                throw new NoSuchElementException("Expected another token but traversed beyond EOF");
+                throw new NoSuchElementException("Expected another token but reached EOF");
             } else {
                 emittedEof = true;
                 currentTokenEnd = parser.position();

diff --git a/smithy-model/src/test/java/software/amazon/smithy/model/loader/IdlModelLoaderTest.java b/smithy-model/src/test/java/software/amazon/smithy/model/loader/IdlModelLoaderTest.java
@@ -339,4 +339,38 @@ public void setsCorrectLocationForEnum() {
         assertThat(fooBarMember.getSourceLocation().getLine(), is(7));
         assertThat(fooBarMember.getSourceLocation().getColumn(), is(5));
     }
+
+    @Test
+    public void doesBasicErrorRecovery() {
+        ValidatedResult<Model> result = Model.assembler()
+                .addImport(getClass().getResource("error-recovery.smithy"))
+                .assemble();
+
+        assertThat(result.isBroken(), is(true)); // The syntax error must still be reported.
+        assertThat(result.getResult().isPresent(), is(true)); // A model is still produced despite the error.
+
+        Model model = result.getResult().get();
+
+        assertThat(model.getShape(ShapeId.from("smithy.example#MyString")).isPresent(), is(true)); // Before the error.
+        assertThat(model.getShape(ShapeId.from("smithy.example#MyFooIsBroken")).isPresent(), is(false)); // Broken shape.
+        assertThat(model.getShape(ShapeId.from("smithy.example#MyInteger")).isPresent(), is(false)); // Skipped tokens.
+        assertThat(model.getShape(ShapeId.from("smithy.example#MyInteger2")).isPresent(), is(true)); // After recovery.
+
+        System.out.println(result.getValidationEvents()); // NOTE(review): debug output left in; consider removing.
+
+        boolean foundSyntax = false;
+        boolean foundTrait = false;
+        for (ValidationEvent e : result.getValidationEvents()) {
+            if (e.getSeverity() == Severity.ERROR && e.getMessage().contains(
+                    "Syntax error at line 9, column 9: Expected COLON(':') but found IDENTIFIER('MyInteger')")) {
+                foundSyntax = true;
+            }
+            if (e.getSeverity() == Severity.ERROR && e.getMessage().contains("Unable to resolve trait")) {
+                foundTrait = true; // Recovery resumed at '@unknown', so the trait was parsed rather than skipped.
+            }
+        }
+
+        assertThat(foundSyntax, is(true));
+        assertThat(foundTrait, is(true));
+    }
 }
diff --git a/smithy-model/src/test/resources/software/amazon/smithy/model/loader/error-recovery.smithy b/smithy-model/src/test/resources/software/amazon/smithy/model/loader/error-recovery.smithy
@@ -0,0 +1,13 @@
+$version: "2.0"
+
+namespace smithy.example
+
+string MyString
+
+structure MyFooIsBroken {
+// The parser will keep trying to parse here, assuming integer is a key and needs to be followed by ":".
+integer MyInteger
+
+// When the above fails, error recovery kicks in, looking for the next token at the start of the line.
+@unknown
+integer MyInteger2