Add rudimentary form of error recovery to parser

When we encounter a parser error, we now attempt some basic error recovery by skipping tokens until we reach a token at the start of a line that is an identifier, a documentation comment, or '@'. This should help tools such as language servers (LSP), which should still be able to offer jump-to-definition even with a broken or in-progress model. This change also adds error recovery to control statements and metadata, and ensures that invalid identifiers are turned into ERROR tokens instead of throwing. Error recovery also treats "$" as a recovery token.
smithy-lang · Apr 19, 2023 · d1ebe68 · d1ebe68
1 parent 94bf11b
commit d1ebe68
Show file tree

Hide file tree

Showing 8 changed files with 317 additions and 80 deletions.
diff --git a/smithy-model/src/main/java/software/amazon/smithy/model/loader/IdlModelLoader.java b/smithy-model/src/main/java/software/amazon/smithy/model/loader/IdlModelLoader.java
@@ -232,44 +232,48 @@ private void parseControlSection() {
         Set<CharSequence> definedKeys = new HashSet<>();
 
         while (tokenizer.getCurrentToken() == IdlToken.DOLLAR) {
-            tokenizer.next();
-            tokenizer.expect(IdlToken.IDENTIFIER, IdlToken.STRING);
-            String key = tokenizer.internString(tokenizer.getCurrentTokenStringSlice());
+            try {
+                tokenizer.next();
+                tokenizer.expect(IdlToken.IDENTIFIER, IdlToken.STRING);
+                String key = tokenizer.internString(tokenizer.getCurrentTokenStringSlice());
 
-            tokenizer.next();
-            tokenizer.skipSpaces();
-            tokenizer.expect(IdlToken.COLON);
-            tokenizer.next();
-            tokenizer.skipSpaces();
+                tokenizer.next();
+                tokenizer.skipSpaces();
+                tokenizer.expect(IdlToken.COLON);
+                tokenizer.next();
+                tokenizer.skipSpaces();
 
-            if (!definedKeys.add(key)) {
-                throw syntax(format("Duplicate control statement `%s`", key));
-            }
+                if (!definedKeys.add(key)) {
+                    throw syntax(format("Duplicate control statement `%s`", key));
+                }
 
-            Node value = IdlNodeParser.expectAndSkipNode(tokenizer, resolver);
+                Node value = IdlNodeParser.expectAndSkipNode(tokenizer, resolver);
 
-            switch (key) {
-                case "version":
-                    onVersion(value);
-                    break;
-                case "operationInputSuffix":
-                    operationInputSuffix = value.expectStringNode().getValue();
-                    break;
-                case "operationOutputSuffix":
-                    operationOutputSuffix = value.expectStringNode().getValue();
-                    break;
-                default:
-                    emit(ValidationEvent.builder()
-                                 .id(Validator.MODEL_ERROR)
-                                 .sourceLocation(value)
-                                 .severity(Severity.WARNING)
-                                 .message(format("Unknown control statement `%s` with value `%s",
-                                                 key, Node.printJson(value)))
-                                 .build());
-                    break;
-            }
+                switch (key) {
+                    case "version":
+                        onVersion(value);
+                        break;
+                    case "operationInputSuffix":
+                        operationInputSuffix = value.expectStringNode().getValue();
+                        break;
+                    case "operationOutputSuffix":
+                        operationOutputSuffix = value.expectStringNode().getValue();
+                        break;
+                    default:
+                        emit(ValidationEvent.builder()
+                                     .id(Validator.MODEL_ERROR)
+                                     .sourceLocation(value)
+                                     .severity(Severity.WARNING)
+                                     .message(format("Unknown control statement `%s` with value `%s",
+                                                     key, Node.printJson(value)))
+                                     .build());
+                        break;
+                }
 
-            tokenizer.expectAndSkipBr();
+                tokenizer.expectAndSkipBr();
+            } catch (ModelSyntaxException e) {
+                errorRecovery(e);
+            }
         }
     }
 
@@ -293,19 +297,23 @@ private void onVersion(Node value) {
 
     private void parseMetadataSection() {
         while (tokenizer.doesCurrentIdentifierStartWith('m')) {
-            tokenizer.expectCurrentLexeme("metadata");
-            tokenizer.next(); // skip "metadata"
-            tokenizer.expectAndSkipSpaces();
-            tokenizer.expect(IdlToken.IDENTIFIER, IdlToken.STRING);
-            String key = tokenizer.internString(tokenizer.getCurrentTokenStringSlice());
-            tokenizer.next();
-            tokenizer.skipSpaces();
-            tokenizer.expect(IdlToken.EQUAL);
-            tokenizer.next();
-            tokenizer.skipSpaces();
-            Node value = IdlNodeParser.expectAndSkipNode(tokenizer, resolver);
-            operations.accept(new LoadOperation.PutMetadata(modelVersion, key, value));
-            tokenizer.expectAndSkipBr();
+            try {
+                tokenizer.expectCurrentLexeme("metadata");
+                tokenizer.next(); // skip "metadata"
+                tokenizer.expectAndSkipSpaces();
+                tokenizer.expect(IdlToken.IDENTIFIER, IdlToken.STRING);
+                String key = tokenizer.internString(tokenizer.getCurrentTokenStringSlice());
+                tokenizer.next();
+                tokenizer.skipSpaces();
+                tokenizer.expect(IdlToken.EQUAL);
+                tokenizer.next();
+                tokenizer.skipSpaces();
+                Node value = IdlNodeParser.expectAndSkipNode(tokenizer, resolver);
+                operations.accept(new LoadOperation.PutMetadata(modelVersion, key, value));
+                tokenizer.expectAndSkipBr();
+            } catch (ModelSyntaxException e) {
+                errorRecovery(e);
+            }
         }
     }
 
@@ -415,51 +423,92 @@ private void parseApplyStatement() {
 
     private void parseFirstShapeStatement(SourceLocation possibleDocCommentLocation) {
         if (tokenizer.getCurrentToken() != IdlToken.EOF) {
-            if (tokenizer.doesCurrentIdentifierStartWith('a')) {
-                parseApplyStatement();
-            } else {
-                List<IdlTraitParser.Result> traits;
-                boolean hasDocComment = tokenizer.getCurrentToken() == IdlToken.DOC_COMMENT;
-
-                if (possibleDocCommentLocation == null) {
-                    traits = IdlTraitParser.parseDocsAndTraitsBeforeShape(tokenizer, resolver);
+            try {
+                if (tokenizer.doesCurrentIdentifierStartWith('a')) {
+                    parseApplyStatement();
                 } else {
-                    // In this case, this is the first shape encountered for a model file that doesn't have any
-                    // use statements. We need to take the previously skipped documentation comments to parse
-                    // potential use statements and apply them to this first shape.
-                    String docLines = tokenizer.removePendingDocCommentLines();
-                    traits = IdlTraitParser.parseDocsAndTraitsBeforeShape(tokenizer, resolver);
-                    // Note that possibleDocCommentLocation is just a mark of where docs _could be_.
-                    if (docLines != null) {
-                        hasDocComment = true;
-                        traits.add(new IdlTraitParser.Result(DocumentationTrait.ID.toString(),
-                                                             new StringNode(docLines, possibleDocCommentLocation),
-                                                             IdlTraitParser.TraitType.DOC_COMMENT));
+                    List<IdlTraitParser.Result> traits;
+                    boolean hasDocComment = tokenizer.getCurrentToken() == IdlToken.DOC_COMMENT;
+
+                    if (possibleDocCommentLocation == null) {
+                        traits = IdlTraitParser.parseDocsAndTraitsBeforeShape(tokenizer, resolver);
+                    } else {
+                        // In this case, this is the first shape encountered for a model file that doesn't have any
+                        // use statements. We need to take the previously skipped documentation comments to parse
+                        // potential use statements and apply them to this first shape.
+                        String docLines = tokenizer.removePendingDocCommentLines();
+                        traits = IdlTraitParser.parseDocsAndTraitsBeforeShape(tokenizer, resolver);
+                        // Note that possibleDocCommentLocation is just a mark of where docs _could be_.
+                        if (docLines != null) {
+                            hasDocComment = true;
+                            traits.add(new IdlTraitParser.Result(DocumentationTrait.ID.toString(),
+                                                                 new StringNode(docLines, possibleDocCommentLocation),
+                                                                 IdlTraitParser.TraitType.DOC_COMMENT));
+                        }
                     }
-                }
 
-                if (parseShapeDefinition(traits, hasDocComment)) {
-                    parseShape(traits);
+                    if (parseShapeDefinition(traits, hasDocComment)) {
+                        parseShape(traits);
+                    }
                 }
+            } catch (ModelSyntaxException e) {
+                errorRecovery(e);
             }
         }
     }
 
     private void parseSubsequentShapeStatements() {
-        while (tokenizer.getCurrentToken() != IdlToken.EOF) {
-            if (tokenizer.doesCurrentIdentifierStartWith('a')) {
-                parseApplyStatement();
-            } else {
-                List<IdlTraitParser.Result> traits;
-                boolean hasDocComment = tokenizer.getCurrentToken() == IdlToken.DOC_COMMENT;
-                traits = IdlTraitParser.parseDocsAndTraitsBeforeShape(tokenizer, resolver);
-                if (parseShapeDefinition(traits, hasDocComment)) {
-                    parseShape(traits);
+        while (tokenizer.hasNext()) {
+            try {
+                if (tokenizer.doesCurrentIdentifierStartWith('a')) {
+                    parseApplyStatement();
+                } else {
+                    List<IdlTraitParser.Result> traits;
+                    boolean hasDocComment = tokenizer.getCurrentToken() == IdlToken.DOC_COMMENT;
+                    traits = IdlTraitParser.parseDocsAndTraitsBeforeShape(tokenizer, resolver);
+                    if (parseShapeDefinition(traits, hasDocComment)) {
+                        parseShape(traits);
+                    }
                 }
+            } catch (ModelSyntaxException e) {
+                errorRecovery(e);
             }
         }
     }
 
+    private void errorRecovery(ModelSyntaxException e) {
+        if (!tokenizer.hasNext()) {
+            throw e;
+        }
+
+        // Here we do rudimentary error recovery to attempt to make sense of the remaining model.
+        // We do this by scanning tokens until we find the next "$", identifier, docs, or trait at the start of a line.
+        // The model is still invalid and will fail to validate, but things like IDEs should still be able to do
+        // things like goto definition.
+        emit(ValidationEvent.fromSourceException(e));
+
+        do {
+            // Always skip the current token since it was the one that failed.
+            IdlToken token = tokenizer.next();
+            if (tokenizer.getCurrentTokenColumn() == 1 && isErrorRecoveryToken(token)) {
+                break;
+            }
+        } while (tokenizer.hasNext());
+    }
+
+    // These tokens are good signals that the next statement (control, metadata, trait, or shape) is starting.
+    private boolean isErrorRecoveryToken(IdlToken token) {
+        switch (token) {
+            case IDENTIFIER:
+            case DOC_COMMENT:
+            case AT:
+            case DOLLAR:
+                return true;
+            default:
+                return false;
+        }
+    }
+
     private boolean parseShapeDefinition(List<IdlTraitParser.Result> traits, boolean hasDocComment) {
         if (tokenizer.getCurrentToken() != IdlToken.EOF) {
             // Continue to parse if not at the end of the file.

diff --git a/smithy-model/src/main/java/software/amazon/smithy/model/loader/IdlTokenizer.java b/smithy-model/src/main/java/software/amazon/smithy/model/loader/IdlTokenizer.java
@@ -321,7 +321,7 @@ public IdlToken next() {
         switch (c) {
             case SimpleParser.EOF:
                 if (currentTokenType == IdlToken.EOF) {
-                    throw new NoSuchElementException("Expected another token but traversed beyond EOF");
+                    throw new NoSuchElementException("Expected another token but reached EOF");
                 }
                 currentTokenEnd = parser.position();
                 return currentTokenType = IdlToken.EOF;
@@ -750,9 +750,15 @@ private IdlToken parseNumber() {
     }
 
     private IdlToken parseIdentifier() {
-        ParserUtils.consumeIdentifier(parser);
+        try {
+            ParserUtils.consumeIdentifier(parser);
+            currentTokenType = IdlToken.IDENTIFIER;
+        } catch (RuntimeException e) {
+            currentTokenType = IdlToken.ERROR;
+            currentTokenError = e.getMessage();
+        }
         currentTokenEnd = parser.position();
-        return currentTokenType = IdlToken.IDENTIFIER;
+        return currentTokenType;
     }
 
     private IdlToken parseString() {