From 2a0022776f9339fef269805dc3a8b47035ccceff Mon Sep 17 00:00:00 2001
From: Michael Dowling <mtdowling@gmail.com>
Date: Fri, 14 Apr 2023 16:03:46 -0700
Subject: [PATCH] Add rudimentary form of error recovery to parser

When we encounter a parser error, we'll now attempt some basic
error recovery by simply skipping token until we reach a token at
the start of a line that is an identifier, documentation comment,
or '@'. This should help with things like LSPs that still should be
able to utilize jump to definition even with a broken or in-progress
model.
---
 .../smithy/model/loader/IdlModelLoader.java   | 95 +++++++++++++------
 .../smithy/model/loader/IdlTokenizer.java     |  2 +-
 .../model/loader/IdlModelLoaderTest.java      | 34 +++++++
 .../smithy/model/loader/error-recovery.smithy | 13 +++
 4 files changed, 112 insertions(+), 32 deletions(-)
 create mode 100644 smithy-model/src/test/resources/software/amazon/smithy/model/loader/error-recovery.smithy

diff --git a/smithy-model/src/main/java/software/amazon/smithy/model/loader/IdlModelLoader.java b/smithy-model/src/main/java/software/amazon/smithy/model/loader/IdlModelLoader.java
index 187edded716..56f847b3c2f 100644
--- a/smithy-model/src/main/java/software/amazon/smithy/model/loader/IdlModelLoader.java
+++ b/smithy-model/src/main/java/software/amazon/smithy/model/loader/IdlModelLoader.java
@@ -415,51 +415,84 @@ private void parseApplyStatement() {
 
     private void parseFirstShapeStatement(SourceLocation possibleDocCommentLocation) {
         if (tokenizer.getCurrentToken() != IdlToken.EOF) {
-            if (tokenizer.doesCurrentIdentifierStartWith('a')) {
-                parseApplyStatement();
-            } else {
-                List<IdlTraitParser.Result> traits;
-                boolean hasDocComment = tokenizer.getCurrentToken() == IdlToken.DOC_COMMENT;
-
-                if (possibleDocCommentLocation == null) {
-                    traits = IdlTraitParser.parseDocsAndTraitsBeforeShape(tokenizer, resolver);
+            try {
+                if (tokenizer.doesCurrentIdentifierStartWith('a')) {
+                    parseApplyStatement();
                 } else {
-                    // In this case, this is the first shape encountered for a model file that doesn't have any
-                    // use statements. We need to take the previously skipped documentation comments to parse
-                    // potential use statements and apply them to this first shape.
-                    String docLines = tokenizer.removePendingDocCommentLines();
-                    traits = IdlTraitParser.parseDocsAndTraitsBeforeShape(tokenizer, resolver);
-                    // Note that possibleDocCommentLocation is just a mark of where docs _could be_.
-                    if (docLines != null) {
-                        hasDocComment = true;
-                        traits.add(new IdlTraitParser.Result(DocumentationTrait.ID.toString(),
-                                                             new StringNode(docLines, possibleDocCommentLocation),
-                                                             IdlTraitParser.TraitType.DOC_COMMENT));
+                    List<IdlTraitParser.Result> traits;
+                    boolean hasDocComment = tokenizer.getCurrentToken() == IdlToken.DOC_COMMENT;
+
+                    if (possibleDocCommentLocation == null) {
+                        traits = IdlTraitParser.parseDocsAndTraitsBeforeShape(tokenizer, resolver);
+                    } else {
+                        // In this case, this is the first shape encountered for a model file that doesn't have any
+                        // use statements. We need to take the previously skipped documentation comments to parse
+                        // potential use statements and apply them to this first shape.
+                        String docLines = tokenizer.removePendingDocCommentLines();
+                        traits = IdlTraitParser.parseDocsAndTraitsBeforeShape(tokenizer, resolver);
+                        // Note that possibleDocCommentLocation is just a mark of where docs _could be_.
+                        if (docLines != null) {
+                            hasDocComment = true;
+                            traits.add(new IdlTraitParser.Result(DocumentationTrait.ID.toString(),
+                                                                 new StringNode(docLines, possibleDocCommentLocation),
+                                                                 IdlTraitParser.TraitType.DOC_COMMENT));
+                        }
                     }
-                }
 
-                if (parseShapeDefinition(traits, hasDocComment)) {
-                    parseShape(traits);
+                    if (parseShapeDefinition(traits, hasDocComment)) {
+                        parseShape(traits);
+                    }
                 }
+            } catch (ModelSyntaxException e) {
+                errorRecovery(e);
             }
         }
     }
 
     private void parseSubsequentShapeStatements() {
-        while (tokenizer.getCurrentToken() != IdlToken.EOF) {
-            if (tokenizer.doesCurrentIdentifierStartWith('a')) {
-                parseApplyStatement();
-            } else {
-                List<IdlTraitParser.Result> traits;
-                boolean hasDocComment = tokenizer.getCurrentToken() == IdlToken.DOC_COMMENT;
-                traits = IdlTraitParser.parseDocsAndTraitsBeforeShape(tokenizer, resolver);
-                if (parseShapeDefinition(traits, hasDocComment)) {
-                    parseShape(traits);
+        while (tokenizer.hasNext()) {
+            try {
+                if (tokenizer.doesCurrentIdentifierStartWith('a')) {
+                    parseApplyStatement();
+                } else {
+                    List<IdlTraitParser.Result> traits;
+                    boolean hasDocComment = tokenizer.getCurrentToken() == IdlToken.DOC_COMMENT;
+                    traits = IdlTraitParser.parseDocsAndTraitsBeforeShape(tokenizer, resolver);
+                    if (parseShapeDefinition(traits, hasDocComment)) {
+                        parseShape(traits);
+                    }
                 }
+            } catch (ModelSyntaxException e) {
+                errorRecovery(e);
             }
         }
     }
 
+    private void errorRecovery(ModelSyntaxException e) {
+        if (!tokenizer.hasNext()) {
+            throw e;
+        }
+
+        // Here we do rudimentary error recovery to attempt to make sense of the remaining model.
+        // We do this by scanning tokens until we find the next identifier, docs, or trait at the start of a line.
+        // The model is still invalid and will fail to validate, but things like IDEs should still be able to do
+        // things like goto definition.
+        emit(ValidationEvent.fromSourceException(e));
+
+        do {
+            // Always skip the current token since it was the one that failed.
+            IdlToken token = tokenizer.next();
+            if (tokenizer.getCurrentTokenColumn() == 1 && isErrorRecoveryToken(token)) {
+                break;
+            }
+        } while (tokenizer.hasNext());
+    }
+
+    // These tokens are good signals that the next shape is starting.
+    private boolean isErrorRecoveryToken(IdlToken token) {
+        return token == IdlToken.IDENTIFIER || token == IdlToken.DOC_COMMENT || token == IdlToken.AT;
+    }
+
     private boolean parseShapeDefinition(List<IdlTraitParser.Result> traits, boolean hasDocComment) {
         if (tokenizer.getCurrentToken() != IdlToken.EOF) {
             // Continue to parse if not at the end of the file.
diff --git a/smithy-model/src/main/java/software/amazon/smithy/model/loader/IdlTokenizer.java b/smithy-model/src/main/java/software/amazon/smithy/model/loader/IdlTokenizer.java
index 3cdc9071671..4b9b828397c 100644
--- a/smithy-model/src/main/java/software/amazon/smithy/model/loader/IdlTokenizer.java
+++ b/smithy-model/src/main/java/software/amazon/smithy/model/loader/IdlTokenizer.java
@@ -321,7 +321,7 @@ public IdlToken next() {
         switch (c) {
             case SimpleParser.EOF:
                 if (currentTokenType == IdlToken.EOF) {
-                    throw new NoSuchElementException("Expected another token but traversed beyond EOF");
+                    throw new NoSuchElementException("Expected another token but reached EOF");
                 }
                 currentTokenEnd = parser.position();
                 return currentTokenType = IdlToken.EOF;
diff --git a/smithy-model/src/test/java/software/amazon/smithy/model/loader/IdlModelLoaderTest.java b/smithy-model/src/test/java/software/amazon/smithy/model/loader/IdlModelLoaderTest.java
index cf4f5ff9470..0002a59e71f 100644
--- a/smithy-model/src/test/java/software/amazon/smithy/model/loader/IdlModelLoaderTest.java
+++ b/smithy-model/src/test/java/software/amazon/smithy/model/loader/IdlModelLoaderTest.java
@@ -339,4 +339,38 @@ public void setsCorrectLocationForEnum() {
         assertThat(fooBarMember.getSourceLocation().getLine(), is(7));
         assertThat(fooBarMember.getSourceLocation().getColumn(), is(5));
     }
+
+    @Test
+    public void doesBasicErrorRecovery() {
+        ValidatedResult<Model> result = Model.assembler()
+                .addImport(getClass().getResource("error-recovery.smithy"))
+                .assemble();
+
+        assertThat(result.isBroken(), is(true));
+        assertThat(result.getResult().isPresent(), is(true));
+
+        Model model = result.getResult().get();
+
+        assertThat(model.getShape(ShapeId.from("smithy.example#MyString")).isPresent(), is(true));
+        assertThat(model.getShape(ShapeId.from("smithy.example#MyFooIsBroken")).isPresent(), is(false));
+        assertThat(model.getShape(ShapeId.from("smithy.example#MyInteger")).isPresent(), is(false));
+        assertThat(model.getShape(ShapeId.from("smithy.example#MyInteger2")).isPresent(), is(true));
+
+        System.out.println(result.getValidationEvents());
+
+        boolean foundSyntax = false;
+        boolean foundTrait = false;
+        for (ValidationEvent e : result.getValidationEvents()) {
+            if (e.getSeverity() == Severity.ERROR && e.getMessage().contains(
+                    "Syntax error at line 9, column 9: Expected COLON(':') but found IDENTIFIER('MyInteger')")) {
+                foundSyntax = true;
+            }
+            if (e.getSeverity() == Severity.ERROR && e.getMessage().contains("Unable to resolve trait")) {
+                foundTrait = true;
+            }
+        }
+
+        assertThat(foundSyntax, is(true));
+        assertThat(foundTrait, is(true));
+    }
 }
diff --git a/smithy-model/src/test/resources/software/amazon/smithy/model/loader/error-recovery.smithy b/smithy-model/src/test/resources/software/amazon/smithy/model/loader/error-recovery.smithy
new file mode 100644
index 00000000000..5ac7655d45f
--- /dev/null
+++ b/smithy-model/src/test/resources/software/amazon/smithy/model/loader/error-recovery.smithy
@@ -0,0 +1,13 @@
+$version: "2.0"
+
+namespace smithy.example
+
+string MyString
+
+structure MyFooIsBroken {
+// The parser will keep trying to parse here, assuming integer is a key and needs to be followed by ":".
+integer MyInteger
+
+// When the above fails, error recovery kicks in, looking for the next token at the start of the line.
+@unknown
+integer MyInteger2