Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
.gradle
.DS_Store
bin
build
out
*.iml
*.ipr
*.iws
generated-src
.idea
.idea
97 changes: 97 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,103 @@ this.genericParser = antlrGrammar.genericParser()
val ast = genericParser.parse("class A { }")
```

## Push/Pop Mode Commands

By default, JavaCC has no built-in way for a token to change the token manager's lexical state while remembering the
previous state, as ANTLR does with its `pushMode` and `popMode` commands. For example, to parse a balanced set of
parentheses such as `((()) ())` as a single token, you might write the following JavaCC grammar:
```
TOKEN_MGR_DECLS : {
static List<Integer> lexicalStateStack = new ArrayList<Integer>();

static void openParen() {
lexicalStateStack.add(curLexState);
}

static void closeParen() {
SwitchTo(lexicalStateStack.remove(lexicalStateStack.size() - 1));
}
}

<DEFAULT, LEVEL1, LEVELN> SKIP : {
< " " >
}

<LEVELN> MORE : {
<LPAREN: "("> { openParen(); }
| <RPAREN: ")"> { closeParen(); }
}

MORE : {
< "(" > { openParen(); } : LEVEL1
}

<LEVEL1> MORE : {
< "(" > { openParen(); } : LEVELN
}

<LEVEL1> TOKEN : {
<BALANCED_PARENS: ")" > { closeParen(); } : DEFAULT
}

void Start(): {} { <BALANCED_PARENS> <EOF> }
```

However, the ANTLR lexer generated from this grammar would not behave correctly. The state changes are made by `SwitchTo`
statements executed inside the actions, so we cannot infer whether the corresponding ANTLR rule should use a `mode`,
`pushMode`, or `popMode` command:

```
lexer grammar Lexer;

SKIP0 : ' ' -> skip ;
MORE0 : '(' -> more, mode(LEVEL1) ;

mode LEVEL1;
LEVEL1_SKIP0 : SKIP0 -> skip ;
MORE1 : '(' -> more, mode(LEVELN) ;
BALANCED_PARENS : ')' -> mode(DEFAULT_MODE) ;

mode LEVELN;
LEVELN_SKIP0 : SKIP0 -> skip ;
LPAREN : '(' -> more ;
RPAREN : ')' -> more ; // PROBLEM: Cannot escape this mode!


parser grammar Parser;

options { tokenVocab=Lexer; }

start : BALANCED_PARENS EOF ;
```

To handle such actions, you must add the following fields to your `TOKEN_MGR_DECLS`, with their values set to the names
of the functions that should map to the `pushMode` and `popMode` commands respectively:

```
TOKEN_MGR_DECLS : {
...
final static String pushStateFunc = "openParen";
final static String popStateFunc = "closeParen";
}
```

Now the lexer gets generated correctly:

```
SKIP0 : ' ' -> skip ;
MORE0 : '(' -> more, pushMode(LEVEL1) ;

mode LEVEL1;
LEVEL1_SKIP0 : SKIP0 -> skip ;
MORE1 : '(' -> more, pushMode(LEVELN) ;
BALANCED_PARENS : ')' -> popMode ;

mode LEVELN;
LEVELN_SKIP0 : SKIP0 -> skip ;
LPAREN : '(' -> more, pushMode(LEVELN) ;
RPAREN : ')' -> more, popMode ;
```

## Licensing

The project is made available under the Apache License, Version 2.0. Please see the file called [LICENSE](LICENSE).
14 changes: 11 additions & 3 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,22 @@ generateGrammarSource {
arguments += ['-package', 'com.strumenta.javacc']
outputDirectory = new File("generated-src/antlr/main/com/strumenta/javacc".toString())
}
compileJava.dependsOn generateGrammarSource
sourceSets {
generated {
java.srcDir 'generated-src/antlr/main/'
}
}
compileJava.source sourceSets.generated.java, sourceSets.main.java
compileKotlin.source sourceSets.generated.java, sourceSets.main.java, sourceSets.main.kotlin
compileGeneratedJava.enabled false
compileGeneratedKotlin.enabled false
compileJava {
dependsOn generateGrammarSource
source sourceSets.generated.java, sourceSets.main.java
}
compileKotlin {
dependsOn generateGrammarSource
dependsOn generateGeneratedGrammarSource
source sourceSets.generated.java, sourceSets.main.java, sourceSets.main.kotlin
}

clean {
delete "generated-src"
Expand Down
62 changes: 38 additions & 24 deletions src/main/kotlin/com/strumenta/javacc/AntlrModel.kt
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,10 @@ import java.io.PrintWriter
import java.io.StringWriter
import java.util.*

/**
 * A single ANTLR rule ready to be printed into a grammar file.
 *
 * @property name the rule name, printed before the colon
 * @property body the rule body, printed verbatim after the colon
 * @property commandStr the command text printed after `->`; an empty string means the rule has no command
 * @property fragment whether the rule is printed with the `fragment ` prefix
 */
data class RuleDefinition(val name: String, val body: String, val commandStr: String, val fragment: Boolean = false) {
    /**
     * Renders the rule as one line of grammar text: `[fragment ]name : body [-> commandStr] ;`.
     *
     * The separator before `;` is always emitted, so a rule without a command ends in two spaces
     * followed by `;`.
     */
    fun generate(): String {
        val prefix = if (fragment) "fragment " else ""
        val actionPostfix = if (commandStr.isEmpty()) "" else "-> $commandStr"
        return "$prefix$name : $body $actionPostfix ;"
    }
}
Expand Down Expand Up @@ -40,11 +39,12 @@ class ParserDefinitions(val name: String) {
}
}

class LexerDefinitions(val name: String) {
class LexerDefinitions(val name: String, private val generateLetterFragments: Boolean) {

private val rulesByMode = HashMap<String, MutableList<RuleDefinition>>()
private val rulesByMode = HashMap<String, MutableList<RuleDefinition>>()
private val letterFragments: MutableList<RuleDefinition> = generateLetterFragments().toMutableList()

fun ruleForImage(image: String, mode: String = DEFAULT_MODE_NAME) : RuleDefinition? {
fun ruleForImage(image: String, mode: String = JAVACC_DEFAULT_MODE_NAME) : RuleDefinition? {
return rulesByMode[mode]?.firstOrNull {
it.body == "'$image'"
}
Expand All @@ -56,15 +56,29 @@ class LexerDefinitions(val name: String) {
}
var ruleDefinitionCorrected = ruleDefinition
if (ruleDefinition.name.isEmpty()) {
if (rulesByMode[mode]!!.any { it.body == ruleDefinition.body }) {
return
}
ruleDefinitionCorrected = ruleDefinition.copy(name = generateName(ruleDefinition.body, rulesByMode[mode]!!.map { it.name }.toSet()))
throw UnsupportedOperationException(ruleDefinition.body)
}
if (ruleDefinitionCorrected.name.startsWith("_")) {
ruleDefinitionCorrected = ruleDefinitionCorrected.copy(name = "US_${ruleDefinitionCorrected.name}")
// Antlr lexer rule names must begin with a capital letter
ruleDefinitionCorrected = ruleDefinitionCorrected.copy(name = "US${ruleDefinitionCorrected.name}")
}
if (generateLetterFragments
&& ruleDefinitionCorrected.name.length == 1
&& ruleDefinitionCorrected.name in "A".."Z") {
if (ruleDefinitionCorrected.body.uppercase() == ruleDefinitionCorrected.name.uppercase()) {
// If the user's letter rule can serve in place of the letter fragment we would generate, preserve the
// rule and skip generating the redundant letter fragment later
if (!ruleDefinitionCorrected.fragment) {
val letterFragment = letterFragments.first { it.name == ruleDefinitionCorrected.name }
ruleDefinitionCorrected = ruleDefinitionCorrected.copy(body = letterFragment.body)
letterFragments.remove(letterFragment)
}
} else {
throw UnsupportedOperationException("Rule conflicts with automatically generated case insensitive character fragment: '${ruleDefinitionCorrected.name}' -> ${ruleDefinitionCorrected.body}")
}
}
if (ruleDefinitionCorrected.name == ruleDefinitionCorrected.body) {
// Such a lexer rule would infinitely recurse
return
}
if (ruleDefinitionCorrected.body.contains("~[]")) {
Expand All @@ -73,37 +87,37 @@ class LexerDefinitions(val name: String) {
rulesByMode[mode]!!.add(ruleDefinitionCorrected)
}

private fun generateName(body: String, usedNames: Set<String>) : String {
throw UnsupportedOperationException(body)
/**
 * Builds the 26 case-insensitive letter fragments, e.g. `fragment A : [aA]`, one per letter from A to Z.
 * Literals are rewritten to use these fragments to support case insensitivity.
 */
private fun generateLetterFragments(): List<RuleDefinition> =
    ('A'..'Z').map { upper ->
        val lower = upper.lowercaseChar()
        RuleDefinition(upper.toString(), "[$lower$upper]", "", fragment = true)
    }

/**
 * Renders the complete lexer grammar: the `lexer grammar <name>;` header, the rules of the default
 * mode, the generated letter fragments (when enabled), and then every other mode.
 *
 * The letter fragments are printed directly after the default-mode rules instead of being appended to
 * the stored rule list, so calling this method more than once yields the same text each time rather
 * than duplicating the fragments.
 *
 * @return the text of the lexer grammar
 */
fun generate(): String {
    val stringWriter = StringWriter()
    val printWriter = PrintWriter(stringWriter)
    printWriter.println("lexer grammar $name;")
    printMode(JAVACC_DEFAULT_MODE_NAME, printWriter)
    if (generateLetterFragments) {
        // Fragments belong to the default mode, so they follow its rules and precede any `mode` header.
        letterFragments.forEach { fragmentRule -> printWriter.println(fragmentRule.generate()) }
    }
    rulesByMode.keys
        .filter { it != JAVACC_DEFAULT_MODE_NAME }
        .forEach { otherMode -> printMode(otherMode, printWriter) }
    return stringWriter.toString()
}

/**
 * Prints the rules registered for [mode], preceded by a blank line and, for any mode other than
 * [JAVACC_DEFAULT_MODE_NAME], a `mode <name>;` header.
 *
 * A mode with no registered rules prints only the blank line (and header) instead of failing.
 *
 * @param mode the mode whose rules are printed
 * @param printWriter the destination for the grammar text
 */
private fun printMode(mode: String, printWriter: PrintWriter) {
    printWriter.println()
    if (mode != JAVACC_DEFAULT_MODE_NAME) {
        printWriter.println("mode $mode;")
    }
    rulesByMode[mode]?.forEach { rule -> printWriter.println(rule.generate()) }
}
}

class AntlrGrammar(private val lexerDefinitions: LexerDefinitions, private val parserDefinitions: ParserDefinitions) {
class AntlrGrammar(val lexerDefinitions: LexerDefinitions, private val parserDefinitions: ParserDefinitions) {
private fun lexerCode() = lexerDefinitions.generate()
private fun parserCode() = parserDefinitions.generate(lexerDefinitions.name)
fun saveLexer(file: File) {
Expand Down
55 changes: 53 additions & 2 deletions src/main/kotlin/com/strumenta/javacc/JavaCCLoader.kt
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,62 @@ import org.javacc.parser.*
import java.io.File
import java.io.FileInputStream

/**
 * The parts of a parsed JavaCC grammar that the converter works from.
 *
 * @property tokenRules the token productions of the grammar
 * @property parserRules the parser productions of the grammar
 * @property changeStateFunctions the names of the user functions that push and pop the lexical state, if declared
 */
data class JavaCCGrammar(
    val tokenRules: List<TokenProduction>,
    val parserRules: List<NormalProduction>,
    val changeStateFunctions: ChangeStateFunctions
)

/**
 * Parses a JavaCC grammar file and collects the results the JavaCC parser leaves in [JavaCCGlobals].
 *
 * The input stream is closed once parsing finishes, including when parsing throws.
 *
 * @param javaCCGrammarFile the JavaCC grammar file to read
 * @return the token productions, parser productions and push/pop state function names of the grammar
 */
fun loadJavaCCGrammar(javaCCGrammarFile: File): JavaCCGrammar {
    FileInputStream(javaCCGrammarFile).use { grammarStream ->
        val javaccParser = JavaCCParser(grammarStream)
        Options.init()
        javaccParser.javacc_input()
    }
    return JavaCCGrammar(JavaCCGlobals.rexprlist, JavaCCGlobals.bnfproductions, getChangeStateFunctions())
}

/** Name of the variable in `TOKEN_MGR_DECLS` whose string value names the user's push-state function. */
const val PUSH_STATE_FUNC = "pushStateFunc"
/** Name of the variable in `TOKEN_MGR_DECLS` whose string value names the user's pop-state function. */
const val POP_STATE_FUNC = "popStateFunc"
/** Names of the user's push-state and pop-state functions; each is null when the grammar does not declare it. */
data class ChangeStateFunctions(val pushState: String?, val popState: String?)

/**
 * Reads the names of the user-defined push-state and pop-state functions from the grammar's TOKEN_MGR_DECLS.
 *
 * When these functions appear in lexer rule actions, they tell us that we need to generate "pushMode" or "popMode"
 * commands rather than "mode" commands. The grammar must declare variables pushStateFunc and popStateFunc and
 * assign to each a string literal naming the corresponding function, so that those actions can be identified.
 * We support this because it is otherwise impossible to accurately identify when a rule necessitates a pushMode
 * vs a popMode.
 *
 * A declaration is recognised as an identifier token with the expected variable name, immediately followed by an
 * assignment token and a string literal token. The surrounding double quotes are stripped from the literal.
 *
 * Example:
 *   TOKEN_MGR_DECLS : {
 *     List<Integer> stateStack = new ArrayList<Integer>();
 *
 *     void push() {
 *       stateStack.add(curLexState);
 *     }
 *
 *     void pop() {
 *       SwitchTo(stateStack.remove(stateStack.size() - 1));
 *     }
 *
 *     private String pushStateFunc = "push"; <----- Your javacc grammar must define this variable
 *     private String popStateFunc = "pop";   <----- Your javacc grammar must define this variable
 *   }
 *
 *   <JAVA> MORE :
 *   {
 *     "/*" { push(); } : IN_JAVA_COMMENT
 *   }
 *   <IN_JAVA_COMMENT> MORE :
 *   {
 *     < ~[] >
 *   }
 *   <IN_JAVA_COMMENT> SPECIAL_TOKEN :
 *   {
 *     <JAVA_COMMENT: "*/" > { pop(); }
 *   }
 *
 * @return the declared function names; a name is null when no matching declaration is found
 */
private fun getChangeStateFunctions(): ChangeStateFunctions {
    // Entries that are not tokens are skipped rather than failing a hard cast.
    val declarationTokens = JavaCCGlobals.token_mgr_decls?.filterIsInstance<Token>().orEmpty()

    fun functionNameAssignedTo(variable: String): String? {
        // `next` is a Java field and may be null, so every hop along the token chain uses a safe call.
        val declaration = declarationTokens.firstOrNull { token ->
            token.kind == JavaCCParserConstants.IDENTIFIER && token.image == variable
                && token.next?.kind == JavaCCParserConstants.ASSIGN
                && token.next?.next?.kind == JavaCCParserConstants.STRING_LITERAL
        }
        return declaration?.next?.next?.image?.removePrefix("\"")?.removeSuffix("\"")
    }

    return ChangeStateFunctions(functionNameAssignedTo(PUSH_STATE_FUNC), functionNameAssignedTo(POP_STATE_FUNC))
}
Loading