ftomassetti · ftomassetti · Oct 14, 2022 · Oct 6, 2022
diff --git a/.gitignore b/.gitignore
@@ -1,8 +1,10 @@
 .gradle
+.DS_Store
+bin
 build
 out
 *.iml
 *.ipr
 *.iws
 generated-src
-.idea
+.idea
diff --git a/README.md b/README.md
@@ -27,6 +27,103 @@ this.genericParser = antlrGrammar.genericParser()
 val ast = genericParser.parse("class A { }")
 ```
 
+## Push/Pop Mode Commands
+
+JavaCC has no built-in way for a token to change the token manager's lexical state while remembering the previous one, as
+ANTLR does with its `pushMode` and `popMode` commands. For example, to parse a balanced set of parentheses such as
+`((()) ())` as a single token, you might write the following JavaCC grammar, which keeps its own stack of lexical states:
+```
+TOKEN_MGR_DECLS : {
+    static List<Integer> lexicalStateStack = new ArrayList<Integer>();
+
+    static void openParen() {
+        lexicalStateStack.add(curLexState);
+    }
+
+    static void closeParen() {
+        SwitchTo(lexicalStateStack.remove(lexicalStateStack.size() - 1));
+    }
+}
+
+<DEFAULT, LEVEL1, LEVELN> SKIP : {
+    < " " >
+}
+
+<LEVELN> MORE : {
+    <LPAREN:    "("> { openParen(); }
+|   <RPAREN:    ")"> { closeParen(); }
+}
+
+MORE : {
+    < "(" > { openParen(); } : LEVEL1
+}
+
+<LEVEL1> MORE : {
+    < "(" > { openParen(); } : LEVELN
+}
+
+<LEVEL1> TOKEN : {
+    <BALANCED_PARENS: ")" > { closeParen(); } : DEFAULT
+}
+
+void Start(): {} { <BALANCED_PARENS> <EOF> }
+```
+
+However, the ANTLR lexer generated from it would not behave correctly. The state changes are made by `SwitchTo` calls inside
+the actions, so the converter cannot infer whether a rule needs a `mode`, `pushMode`, or `popMode` command:
+
+```
+lexer grammar Lexer;
+
+SKIP0 : ' ' -> skip ;
+MORE0 : '(' -> more, mode(LEVEL1) ;
+
+mode LEVEL1;
+LEVEL1_SKIP0 : SKIP0 -> skip ;
+MORE1 : '(' -> more, mode(LEVELN) ;
+BALANCED_PARENS : ')' -> mode(DEFAULT_MODE) ;
+
+mode LEVELN;
+LEVELN_SKIP0 : SKIP0 -> skip ;
+LPAREN : '(' -> more ;
+RPAREN : ')' -> more ;  // PROBLEM: Cannot escape this mode!
+
+
+parser grammar Parser;
+
+options { tokenVocab=Lexer; }
+
+start :  BALANCED_PARENS EOF  ;
+```
+
+To handle such actions, add the following two fields to your `TOKEN_MGR_DECLS`, setting their values to the names of the
+functions that should map to the `pushMode` and `popMode` commands, respectively:
+
+```
+TOKEN_MGR_DECLS : {
+    ...
+    final static String pushStateFunc = "openParen";
+    final static String popStateFunc = "closeParen";
+}
+```
+
+Now the lexer gets generated correctly:
+
+```
+SKIP0 : ' ' -> skip ;
+MORE0 : '(' -> more, pushMode(LEVEL1) ;
+
+mode LEVEL1;
+LEVEL1_SKIP0 : SKIP0 -> skip ;
+MORE1 : '(' -> more, pushMode(LEVELN) ;
+BALANCED_PARENS : ')' -> popMode ;
+
+mode LEVELN;
+LEVELN_SKIP0 : SKIP0 -> skip ;
+LPAREN : '(' -> more, pushMode(LEVELN) ;
+RPAREN : ')' -> more, popMode ;
+```
+
 ## Licensing
 
 The project is made available under the Apache Public License V2.0. Please see the file called [LICENSE](LICENSE).
diff --git a/build.gradle b/build.gradle
@@ -48,14 +48,22 @@ generateGrammarSource {
     arguments += ['-package', 'com.strumenta.javacc']
     outputDirectory = new File("generated-src/antlr/main/com/strumenta/javacc".toString())
 }
-compileJava.dependsOn generateGrammarSource
 sourceSets {
     generated {
         java.srcDir 'generated-src/antlr/main/'
     }
 }
-compileJava.source sourceSets.generated.java, sourceSets.main.java
-compileKotlin.source sourceSets.generated.java, sourceSets.main.java, sourceSets.main.kotlin
+compileGeneratedJava.enabled false // the "generated" source set's Java sources are added to compileJava/compileKotlin below
+compileGeneratedKotlin.enabled false
+compileJava {
+    dependsOn generateGrammarSource // run grammar generation before compiling
+    source sourceSets.generated.java, sourceSets.main.java
+}
+compileKotlin {
+    dependsOn generateGrammarSource // run grammar generation before compiling
+    dependsOn generateGeneratedGrammarSource
+    source sourceSets.generated.java, sourceSets.main.java, sourceSets.main.kotlin
+}
 
 clean {
     delete "generated-src"

diff --git a/src/main/kotlin/com/strumenta/javacc/AntlrModel.kt b/src/main/kotlin/com/strumenta/javacc/AntlrModel.kt
@@ -6,11 +6,10 @@ import java.io.PrintWriter
 import java.io.StringWriter
 import java.util.*
 
-data class RuleDefinition(val name: String, val body: String, val action: String?, val fragment: Boolean = false) {
+data class RuleDefinition(val name: String, val body: String, val commandStr: String, val fragment: Boolean = false) { // commandStr: text emitted after "->" in the generated rule; an empty string means no command
     fun generate() : String {
         val prefix = if (fragment) "fragment " else ""
-        val actionPostfix = if (action == null) "" else "-> $action"
-        val body= if(action!=null && body.contains("|")) "($body)" else body
+        val actionPostfix = if (commandStr.isEmpty()) "" else "-> $commandStr" // NOTE(review): body is no longer wrapped in parentheses when it contains '|'; an ANTLR lexer command presumably attaches to a single alternative only - confirm callers parenthesize multi-alternative bodies
         return "$prefix$name : $body $actionPostfix ;"
     }
 }
@@ -40,11 +39,12 @@ class ParserDefinitions(val name: String) {
     }
 }
 
-class LexerDefinitions(val name: String) {
+class LexerDefinitions(val name: String, private val generateLetterFragments: Boolean) {
 
-    private val rulesByMode  = HashMap<String, MutableList<RuleDefinition>>()
+    private val rulesByMode = HashMap<String, MutableList<RuleDefinition>>()
+    private val letterFragments: MutableList<RuleDefinition> = generateLetterFragments().toMutableList()
 
-    fun ruleForImage(image: String, mode: String = DEFAULT_MODE_NAME) : RuleDefinition? {
+    fun ruleForImage(image: String, mode: String = JAVACC_DEFAULT_MODE_NAME) : RuleDefinition? {
         return rulesByMode[mode]?.firstOrNull {
             it.body == "'$image'"
         }
@@ -56,15 +56,29 @@ class LexerDefinitions(val name: String) {
         }
         var ruleDefinitionCorrected = ruleDefinition
         if (ruleDefinition.name.isEmpty()) {
-            if (rulesByMode[mode]!!.any { it.body == ruleDefinition.body }) {
-                return
-            }
-            ruleDefinitionCorrected = ruleDefinition.copy(name = generateName(ruleDefinition.body, rulesByMode[mode]!!.map { it.name }.toSet()))
+            throw UnsupportedOperationException(ruleDefinition.body)
         }
         if (ruleDefinitionCorrected.name.startsWith("_")) {
-            ruleDefinitionCorrected = ruleDefinitionCorrected.copy(name = "US_${ruleDefinitionCorrected.name}")
+            // Antlr lexer rule names must begin with a capital letter
+            ruleDefinitionCorrected = ruleDefinitionCorrected.copy(name = "US${ruleDefinitionCorrected.name}")
+        }
+        if (generateLetterFragments
+                && ruleDefinitionCorrected.name.length == 1
+                && ruleDefinitionCorrected.name in "A".."Z") {
+            if (ruleDefinitionCorrected.body.uppercase() == ruleDefinitionCorrected.name.uppercase()) {
+                // If the user's letter rule can serve in place of the letter fragment we would generate, preserve the
+                // rule and skip generating the redundant letter fragment later
+                if (!ruleDefinitionCorrected.fragment) {
+                    val letterFragment = letterFragments.first { it.name == ruleDefinitionCorrected.name }
+                    ruleDefinitionCorrected = ruleDefinitionCorrected.copy(body = letterFragment.body)
+                    letterFragments.remove(letterFragment)
+                }
+            } else {
+                throw UnsupportedOperationException("Rule conflicts with automatically generated case insensitive character fragment: '${ruleDefinitionCorrected.name}' -> ${ruleDefinitionCorrected.body}")
+            }
         }
         if (ruleDefinitionCorrected.name == ruleDefinitionCorrected.body) {
+            // Such a lexer rule would infinitely recurse
             return
         }
         if (ruleDefinitionCorrected.body.contains("~[]")) {
@@ -73,37 +87,37 @@ class LexerDefinitions(val name: String) {
         rulesByMode[mode]!!.add(ruleDefinitionCorrected)
     }
 
-    private fun generateName(body: String, usedNames: Set<String>) : String {
-        throw UnsupportedOperationException(body)
+    private fun generateLetterFragments() : List<RuleDefinition> {
+        // One fragment per ASCII letter, e.g. "fragment A : [aA] ;"; literals are rewritten to use them for case insensitivity
+        return ('a'..'z').zip('A'..'Z') { lower, upper ->
+            RuleDefinition(upper.toString(), "[$lower$upper]", "", fragment = true)
+        }
     }
 
     fun generate() : String {
         val stringWriter = StringWriter()
         val printWriter = PrintWriter(stringWriter)
         printWriter.println("lexer grammar $name;")
-        printMode(DEFAULT_MODE_NAME, printWriter)
-        rulesByMode.keys.filter { it != DEFAULT_MODE_NAME }.forEach { printMode(it, printWriter) }
+        printMode(JAVACC_DEFAULT_MODE_NAME, printWriter) // default-mode rules come first, before any "mode" section
+        if (generateLetterFragments) { // fragments follow the default-mode rules, the position they had when appended to that mode
+            letterFragments.forEach { printWriter.println(it.generate()) } // printed directly, not stored in rulesByMode, so repeated generate() calls do not emit duplicate fragments
+        }
+        rulesByMode.keys.filter { it != JAVACC_DEFAULT_MODE_NAME }.forEach { printMode(it, printWriter) }
         return stringWriter.toString()
     }
 
     private fun printMode(mode: String, printWriter: PrintWriter) {
         printWriter.println()
-        if (mode != DEFAULT_MODE_NAME) {
+        if (mode != JAVACC_DEFAULT_MODE_NAME) { // rules of the default mode are printed without a "mode" header
             printWriter.println("mode $mode;")
         }
-        rulesByMode[mode]!!.forEach {
-            if (it.name.contains("COMMENT") && it.action == null) {
-                printWriter.println(it.copy(action = "skip").generate())
-            } else if (it.name.contains("COMMENT") && it.action != null && !it.action.contains("skip")) {
-                printWriter.println(it.copy(action = "skip, ${it.action}").generate())
-            } else {
-                printWriter.println(it.generate())
-            }
+        rulesByMode[mode].orEmpty().forEach { // a mode with no registered rules prints nothing instead of failing with a NullPointerException
+            printWriter.println(it.generate())
         }
     }
 }
 
-class AntlrGrammar(private val lexerDefinitions: LexerDefinitions, private val parserDefinitions: ParserDefinitions) {
+class AntlrGrammar(val lexerDefinitions: LexerDefinitions, private val parserDefinitions: ParserDefinitions) {
     private fun lexerCode() = lexerDefinitions.generate()
     private fun parserCode() = parserDefinitions.generate(lexerDefinitions.name)
     fun saveLexer(file: File) {

diff --git a/src/main/kotlin/com/strumenta/javacc/JavaCCLoader.kt b/src/main/kotlin/com/strumenta/javacc/JavaCCLoader.kt
@@ -4,11 +4,62 @@ import org.javacc.parser.*
 import java.io.File
 import java.io.FileInputStream
 
-data class JavaCCGrammar(val tokenRules: List<TokenProduction>, val parserRules: List<NormalProduction>)
+data class JavaCCGrammar(val tokenRules: List<TokenProduction>, val parserRules: List<NormalProduction>, val changeStateFunctions: ChangeStateFunctions) // changeStateFunctions: names of the user's push/pop state functions, as filled in by loadJavaCCGrammar
 
 fun loadJavaCCGrammar(javaCCGrammarFile: File) : JavaCCGrammar{
-    val javaccParser = JavaCCParser(FileInputStream(javaCCGrammarFile))
-    Options.init()
-    javaccParser.javacc_input()
-    return JavaCCGrammar(JavaCCGlobals.rexprlist, JavaCCGlobals.bnfproductions)
+    FileInputStream(javaCCGrammarFile).use { grammarStream -> // close the grammar file even when parsing throws
+        JavaCCParser(grammarStream).also { Options.init() }.javacc_input() // same order as before: construct parser, init options, parse
+    }
+    return JavaCCGrammar(JavaCCGlobals.rexprlist, JavaCCGlobals.bnfproductions, getChangeStateFunctions()) // results are read from JavaCCGlobals after parsing
+}
+
+const val PUSH_STATE_FUNC = "pushStateFunc" // name of the TOKEN_MGR_DECLS variable whose string value names the push function
+const val POP_STATE_FUNC = "popStateFunc" // name of the TOKEN_MGR_DECLS variable whose string value names the pop function
+data class ChangeStateFunctions(val pushState: String?, val popState: String?) // each is null when the grammar does not assign the corresponding variable
+
+/**
+ * Supports user-defined functions that, when they appear in lexer rule actions, tell us to generate "pushMode" or
+ * "popMode" commands rather than "mode" commands. Your grammar must declare the variables pushStateFunc and popStateFunc,
+ * each assigned the name of the corresponding function called by your actions, so that those actions can be identified.
+ * This is needed because it is otherwise impossible to tell reliably whether a rule requires pushMode or popMode.
+ *
+ * Example:
+ * TOKEN_MGR_DECLS : {
+ *   List<Integer> stateStack = new ArrayList<Integer>();
+ *
+ *   void push() {
+ *     stateStack.add(curLexState);
+ *   }
+ *
+ *   void pop() {
+ *     SwitchTo(stateStack.remove(stateStack.size() - 1));
+ *   }
+ *
+ *   private String pushStateFunc = "push";   <----- Your javacc grammar must define this variable
+ *   private String popStateFunc = "pop";     <----- Your javacc grammar must define this variable
+ * }
+ *
+ * <JAVA> MORE :
+ * {
+ *   "/*" { push(); } : IN_JAVA_COMMENT
+ * }
+ * <IN_JAVA_COMMENT> MORE :
+ * {
+ *   < ~[] >
+ * }
+ * <IN_JAVA_COMMENT> SPECIAL_TOKEN :
+ * {
+ *   <JAVA_COMMENT: "*/" > { pop(); }
+ * }
+ */
+private fun getChangeStateFunctions(): ChangeStateFunctions {
+    val tokenMgrDecls = JavaCCGlobals.token_mgr_decls
+    val findStateChangeFunction = { func: String -> // looks for `<func> = "<name>"` among the TOKEN_MGR_DECLS tokens and yields <name>, or null
+        (tokenMgrDecls?.find {
+            (it as Token).kind == JavaCCParserConstants.IDENTIFIER && it.image == func
+                    && it.next?.kind == JavaCCParserConstants.ASSIGN // next is a Java field and may be null; treat that as "no match" rather than throwing
+                    && it.next?.next?.kind == JavaCCParserConstants.STRING_LITERAL
+        } as Token?)?.next?.next?.image?.removePrefix("\"")?.removeSuffix("\"") // strip the quotes around the string literal
+    }
+    return ChangeStateFunctions(findStateChangeFunction(PUSH_STATE_FUNC), findStateChangeFunction(POP_STATE_FUNC))
 }