Skip to content

Commit

Permalink
String literals now support proper escaping.
Browse files Browse the repository at this point in the history
- Now these escape sequences are supported: \\, \", \', \n, \t
  - Previously "\"" would throw a syntax error. Now it is handled properly.
  - A literal "\n" (or others) can now be entered by double back-slashing
    the escape sequence like "\\n", which was previously not supported.
    - Unknown escape sequences, such as '\a' will now throw an error.
    - This is a BC break!
- Solution for PR #21.
- Handling escape sequences is now done outside of StringValue. Escape
  sequences are purely a matter of how a string literal in source code is
  interpreted; StringValue stores whatever it is given verbatim.
  This means that "\\" + "n" will NOT result in a "\n" sequence and thus in
  a literal newline. This is a BC break, as far as I know.
- New StringEscaping helper with methods: escapeString, unescapeString
- Modified and added tests for this BC breaking behavior.
  • Loading branch information
smuuf committed Feb 17, 2020
1 parent 744115a commit 0664605
Show file tree
Hide file tree
Showing 13 changed files with 154 additions and 46 deletions.
2 changes: 1 addition & 1 deletion example/bench_all.primi
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ xb_2 = b - i_2;
xc = c - j;
xc_k = c - k;
l = "xoxoxomilanobergamo,anno:domini. Anno, neštvi ma.".string_replace(r"ann?o", "FAIL");
m = "\ahoj\n\vo\le" - r"\\ahoj\s";
m = "\\ahoj\n\\vo\\le" - r"\\ahoj\s";
n = "a/b/c" - r"\/b"; // Test proper handling of escaped regex delimiters.
a = 4;
b = 5;
Expand Down
2 changes: 1 addition & 1 deletion src/extensions/psl/StringExtension.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ public static function string_shuffle(StringValue $str): StringValue {
}

public static function string_length(StringValue $str): NumberValue {
return new NumberValue((string) mb_strlen($str->getInternalValue()));
return new NumberValue((string) mb_strlen($str->value));
}

public static function string_format(StringValue $str, Value ...$items): StringValue {
Expand Down
5 changes: 4 additions & 1 deletion src/handlers/StringLiteral.php
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
<?php

declare(strict_types = 1);

namespace Smuuf\Primi\Handlers;

use \Smuuf\Primi\Structures\StringValue;
use \Smuuf\Primi\Structures\NumberValue;
use \Smuuf\Primi\Helpers\StringEscaping;
use \Smuuf\Primi\Context;

class StringLiteral extends \Smuuf\Primi\StrictObject implements IHandler {
Expand All @@ -16,6 +18,7 @@ public static function handle(array $node, Context $context) {
// Using trim("\"'", ...) would make "abc'" into abc instead of abc',
// so do this a little more directly.
$value = \mb_substr($content, 1, \mb_strlen($content) - 2);
$value = StringEscaping::unescapeString($value);

return new StringValue($value);

Expand Down
102 changes: 102 additions & 0 deletions src/helpers/StringEscaping.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
<?php

declare(strict_types = 1);

namespace Smuuf\Primi\Helpers;

use \Smuuf\Primi\ErrorException;

class StringEscaping extends \Smuuf\Primi\StrictObject {

	/**
	 * Escape sequences that are supported in string literals.
	 *
	 * Each key is a complete escape sequence exactly as it is written in
	 * source code (a backslash followed by one character). Each value is the
	 * literal character that the sequence stands for.
	 */
	private const ESCAPE_PAIRS = [
		'\\\\' => "\\",
		'\\n' => "\n",
		'\\t' => "\t",
		'\\"' => '"',
		"\\'" => "'",
	];

	/** Characters that can delimit a string literal. */
	private const QUOTE_CHARS = ['"', "'"];

	/**
	 * Return the provided string but with known escape sequences being expanded
	 * to their literal meaning. For example `\n` will be expanded to literal
	 * new-line.
	 *
	 * @param string $str Raw content of a string literal (without the
	 * surrounding quotes).
	 * @return string The string with every escape sequence expanded.
	 * @throws ErrorException If a backslash is followed by a character that
	 * does not form a known escape sequence.
	 * @throws \RuntimeException If the underlying PCRE call fails.
	 */
	public static function unescapeString(string $str): string {

		// The pattern matches a backslash followed by exactly one character:
		// - The `s` modifier makes `.` match a new-line too, so a backslash
		//   followed by a line break is validated like any other sequence
		//   instead of silently slipping through.
		// - The first alternative consumes a whole multi-byte UTF-8 character
		//   (lead byte plus continuation bytes), so that the error message
		//   never contains a truncated character. This is done byte-wise
		//   rather than via the `u` modifier, because `u` would make PCRE
		//   fail outright on input that is not valid UTF-8.
		$result = \preg_replace_callback(
			'#\\\\(?:[\xC0-\xFF][\x80-\xBF]*|.)#s',
			static function(array $m): string {

				$sequence = $m[0];
				$literal = self::ESCAPE_PAIRS[$sequence] ?? null;
				if ($literal !== null) {
					return $literal;
				}

				// The backslashed character doesn't represent any known
				// escape sequence, therefore error.
				throw new ErrorException(
					"Unrecognized string escape sequence '{$sequence}'."
				);

			},
			$str
		);

		// preg_replace_callback() returns null on PCRE failure. Without this
		// check the null would violate the declared return type and surface
		// as an unrelated TypeError.
		if ($result === null) {
			throw new \RuntimeException(
				"Failed to process escape sequences in string (PCRE error code "
				. \preg_last_error() . ")."
			);
		}

		return $result;

	}

	/**
	 * The string provided as an argument will be returned, but with added
	 * escaping for characters that represent a known escape sequence. For
	 * example a literal new-line will be replaced by `\n`.
	 *
	 * This is useful for converting an internal string value of StringValue
	 * to a source code representation of it - that is how a string literal
	 * would have to be written by hand for it to be - as a result -
	 * interpreted as that original string.
	 *
	 * If the second optional $quoteChar argument is passed, all other known
	 * quote characters will NOT be escaped - only the one specified by that
	 * argument. The only known quote characters are `'` and `"`. For
	 * example the string `hello'there"` without $quoteChar specified will
	 * be escaped as `hello\'there\"`. But with $quoteChar being `"` it would
	 * result in `hello'there\"`, since the caller used the second argument to
	 * specify that the single-quote does NOT have to be escaped.
	 *
	 * @param string $str Literal string value to be escaped.
	 * @param string|null $quoteChar The only quote character that needs to be
	 * escaped, or null to escape all known quote characters.
	 * @return string The escaped string.
	 */
	public static function escapeString(
		string $str,
		?string $quoteChar = null
	): string {

		// Build a map of [literal character => escape sequence], e.g.
		// [<new line> => '\n'].
		$replacements = [];
		foreach (self::ESCAPE_PAIRS as $sequence => $literal) {

			$isUnneededQuote = $quoteChar !== null
				&& \in_array($literal, self::QUOTE_CHARS, true)
				&& $literal !== $quoteChar;

			if ($isUnneededQuote) {
				// Do not escape quote characters that aren't necessary to be
				// escaped.
				continue;
			}

			$replacements[$literal] = $sequence;

		}

		// strtr() performs all replacements in a single pass and never
		// re-scans text it has already replaced. Backslashes introduced by one
		// replacement thus cannot be escaped again by another, regardless of
		// the order of items in ESCAPE_PAIRS.
		return \strtr($str, $replacements);

	}

}
4 changes: 2 additions & 2 deletions src/parser/CompiledParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ class CompiledParser extends Parser\Packrat {
/** @var string **/
public $string;

/* StringLiteral: / (?:".*?(?<!\\)")|(?:'.*?(?<!\\)') /s */
/* StringLiteral: / ("[^"\\]*(\\.[^"\\]*)*")|('[^'\\]*(\\.[^'\\]*)*') /s */
protected $match_StringLiteral_typestack = array('StringLiteral');
function match_StringLiteral ($stack = []) {
$matchrule = "StringLiteral"; $result = $this->construct($matchrule, $matchrule, \null);
if (( $subres = $this->rx( '/ (?:".*?(?<!\\\\)")|(?:\'.*?(?<!\\\\)\') /s' ) ) !== \false) {
if (( $subres = $this->rx( '/ ("[^"\\\\]*(\\\\.[^"\\\\]*)*")|(\'[^\'\\\\]*(\\\\.[^\'\\\\]*)*\') /s' ) ) !== \false) {
$result["text"] .= $subres;
return $this->finalise($result);
}
Expand Down
2 changes: 1 addition & 1 deletion src/parser/Grammar.peg
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class CompiledParser extends Parser\Packrat {

# Carefully crafted (or stolen somewhere?) string matching regex which supports
# both " and ' quotes, doesn't cause JIT stack overflow and also supports escaped quotes.
StringLiteral: / (?:".*?(?<!\\)")|(?:'.*?(?<!\\)') /s
StringLiteral: / ("[^"\\]*(\\.[^"\\]*)*")|('[^'\\]*(\\.[^'\\]*)*') /s
NumberLiteral: /-?\d+(\.\d+)?/
BoolLiteral: "true" | "false"
NullLiteral: "null"
Expand Down
25 changes: 8 additions & 17 deletions src/structures/StringValue.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
namespace Smuuf\Primi\Structures;

use \Smuuf\Primi\Helpers\Common;
use \Smuuf\Primi\Helpers\StringEscaping;
use \Smuuf\Primi\ISupportsComparison;
use \Smuuf\Primi\ISupportsAddition;
use \Smuuf\Primi\ISupportsSubtraction;
Expand All @@ -21,8 +22,6 @@ class StringValue extends Value implements

const TYPE = "string";

const NEWLINE = '__NEWLINE__';

public function __construct(string $value) {
$this->value = $value;
}
Expand All @@ -31,19 +30,11 @@ public function getStringValue(): string {

// We are about to put double-quotes around the return value,
// so let's "escape" double-quotes present in the string value.
$escaped = \str_replace('"', '\"', $this->value);

$escaped = self::expandSequences($escaped);

$escaped = StringEscaping::escapeString($this->value, '"');
return "\"$escaped\"";

}

public function getInternalValue()
{
return self::expandSequences($this->value);
}

public function doAddition(Value $rightOperand) {

Common::allowTypes($rightOperand, self::class);
Expand All @@ -58,11 +49,11 @@ public function doSubtraction(Value $rightOperand) {
Common::allowTypes($rightOperand, self::class, RegexValue::class);

if ($rightOperand instanceof RegexValue) {
$match = \preg_replace($rightOperand->getInternalValue(), \null, $this->getInternalValue());
$match = \preg_replace($rightOperand->value, \null, $this->value);
return new self($match);
}

$new = \str_replace($rightOperand->getInternalValue(), \null, $this->getInternalValue());
$new = \str_replace($rightOperand->value, \null, $this->value);
return new self($new);

}
Expand Down Expand Up @@ -169,10 +160,10 @@ public function getIterator(): \Iterator {
protected static function expandSequences(string $string) {

// Primi strings support some escape sequences.

$string = preg_replace(['#(?<!\\\)\\\n#', '/\\\\\\\n/'], ["\n", '\n'], $string);

return $string;
//return \str_replace('\n', "\n", $string);
//return \preg_replace('#(?<!\\\\)\\\\n#', "\n", $string);
$string = \preg_replace('#(?<!\\\\)\\\\n#', "\n", $string);
return \str_replace('\\\\', "\\", $string);

}

Expand Down
12 changes: 5 additions & 7 deletions tests/language/suites/extensions/ext.string.expect
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,8 @@ m:BoolValue:true
n:BoolValue:false
o:StringValue:"1,2,3"
p:StringValue:"1,nested,stuff-yeah,3"
q:StringValue:"
"
r:StringValue:"\n"
s:StringValue:"

"
t:StringValue:"\n\n"
esc_1:StringValue:"\n"
esc_2:StringValue:"\\n"
esc_3:StringValue:"\n\n"
esc_4:StringValue:"\\n\\n"
esc_5:StringValue:"\n\\n\t\\"
9 changes: 5 additions & 4 deletions tests/language/suites/extensions/ext.string.primi
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ n = "ahoj ahoj ahoj!".contains(r"(čau ){2}!?");
o = ",".join([1, 2, 3]);
p = ",".join([1, ['nested', "stuff-{}".format("yeah")], 3]);

q = "\n"
r = "\\n"
s = "\n" + "\n"
t = "\\n" + "\\n"
esc_1 = "\n"
esc_2 = "\\n"
esc_3 = "\n" + "\n"
esc_4 = "\\n" + "\\n"
esc_5 = "\n" + "\\n\t\\"
2 changes: 1 addition & 1 deletion tests/language/suites/structures/regexes.expect
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ xb_2:StringValue:"ab123dfg"
xc:StringValue:"xyzčaukomňauko"
xc_k:StringValue:"xyz456čako"
l:StringValue:"xoxoxomilFAILbergamo,FAIL:domini. Anno, neštvi ma."
m:StringValue:"\vo\le"
m:StringValue:"\\vo\\le"
n:StringValue:"a/c"
o:StringValue:"00011"
p:StringValue:"00011234455"
Expand Down
2 changes: 1 addition & 1 deletion tests/language/suites/structures/regexes.primi
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ xc = c - j;
xc_k = c - k;

l = "xoxoxomilanobergamo,anno:domini. Anno, neštvi ma.".string_replace(r"ann?o", "FAIL");
m = "\ahoj\n\vo\le" - r"\\ahoj\s";
m = "\\ahoj\n\\vo\\le" - r"\\ahoj\s";
n = "a/b/c" - r"/b"; // Test proper handling of escaped regex delimiters.

o = "[0-9]{5}".to_regex().match("abc00011234455");
Expand Down
2 changes: 0 additions & 2 deletions tests/unit/method.helper.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@ use \Tester\Assert;
use \Smuuf\Primi\Helpers\Common;
use \Smuuf\Primi\Structures\StringValue;
use \Smuuf\Primi\Structures\Value;
use \Smuuf\Primi\Structures\RegexValue;
use \Smuuf\Primi\ErrorException;

require __DIR__ . '/../bootstrap.php';

Expand Down
31 changes: 23 additions & 8 deletions tests/unit/value.string.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,33 @@ $fns = ExtensionHub::get();
$string = new StringValue("this is a string.");
$letterA = new StringValue("a");
$unicode = new StringValue("ťhiš íš á ŠTřing.");
$withNewline = new StringValue('a \n b');
$withNewlineLiteral = new StringValue('a \\\n b');

//
// Test sequence expanding...
// Escaping sequences are NOT handled inside StringValue, but instead during the
// handling of source code string literal.
//

Assert::same(2, count(explode("\n", get_val($withNewline))));
$stringsWithEscapeSequences = [
'a\nb',
'a \n b',
'a \\n b',
'a\\n\nb',
'a\\\n\t\\tb',
];
foreach ($stringsWithEscapeSequences as $s) {
Assert::same($s, get_val(new StringValue($s)));
}

// Get correct repr - things should be quoted and escaped properly.
// REMEMBER: Escape characters are NOT HANDLED when creating StringValue
// objects. Whatever is put into StringValue as argument will literally be
// what's inside.
Assert::same('"\""', (new StringValue('"'))->getStringValue());
Assert::same('"\\\\\""', (new StringValue('\\"'))->getStringValue());
Assert::same('"\\\\\'"', (new StringValue("\'"))->getStringValue());
Assert::same('"\\\\\\\\\'"', (new StringValue("\\\\'"))->getStringValue());
Assert::same('"\\\\n"', (new StringValue('\n'))->getStringValue());
Assert::same('"\\n"', (new StringValue("\n"))->getStringValue());

//
// Test adding and subtracting...
Expand Down Expand Up @@ -268,10 +287,6 @@ Assert::same(17, get_val($fns['string_length']->invoke([$string])));
Assert::same(1, get_val($fns['string_length']->invoke([$letterA])));
// Multibyte strings should report length correctly.
Assert::same(17, get_val($fns['string_length']->invoke([$unicode])));
// "\n" is expanded as newline - that's one character.
Assert::same(5, get_val($fns['string_length']->invoke([$withNewline])));
// "\\n" should not be expanded as newline
Assert::same(6, get_val($fns['string_length']->invoke([$withNewlineLiteral])));

//
// Test replacing.
Expand Down

0 comments on commit 0664605

Please sign in to comment.