-
-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add string parsing + fuzzed string test set
I followed mostly the same procedure outlined here: https://www.ryanliptak.com/blog/fuzzing-as-test-case-generator/ but used a combination of Zua and fuzzing-lua to ultimately create the sets of inputs/outputs. - First, a giant corpus (16k+) of fuzzed string literals was created by iterating through all the fuzzed lexer inputs and outputting the source of every <string> token to a separate file (see test/fuzz_strings_gen.zig). This step would be difficult to do with Lua's API because strings are parsed as they are lexed, meaning any relationship to the original source is lost once the token is parsed. - Then, I used libFuzzer and fuzzing-lua to minimize the string corpus (via the -merge=1 flag). - Then, I used Lua to generate corresponding output files containing the parsed version of each input string (this code will be committed to fuzzing-lua once I clean it up). Kind of convoluted, but it ended up working well--there were a lot of bugs in my initial string parsing implementation that the fuzzed set allowed me to find.
- Loading branch information
Showing
176 changed files
with
503 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,175 @@ | ||
const std = @import("std"); | ||
const lex = @import("lex.zig"); | ||
|
||
// Notes: | ||
// | ||
// Lua parser always parses into a function (called the 'main' function) which | ||
// is always varargs (the values in the varargs differ depending on Lua version) | ||
|
||
pub const Parser = struct {
    /// Converts the raw source text of a Lua string literal (delimiters
    /// included) into the string value it denotes. The result is written to
    /// the start of `dest_buf` and the written portion is returned.
    ///
    /// Quoted strings ('...' or "...") have their escape sequences decoded.
    /// Long strings ([[...]], [=[...]=], ...) do not interpret escape
    /// sequences: a line ending directly after the opening bracket is dropped,
    /// every other line ending (\r, \n, \r\n or \n\r) becomes a single \n, and
    /// all remaining bytes are copied unchanged.
    ///
    /// Because the lexer has already validated that strings don't contain
    /// any invalid characters, this function can be implemented without
    /// the possibility of failure. Any failures are a bug in the lexer.
    ///
    /// dest_buf must be at least as big as source_raw to ensure it is large
    /// enough to hold the parsed string
    /// TODO: should this function be part of lex.Token instead?
    pub fn parseStringLiteral(source_raw: []const u8, dest_buf: []u8) []u8 {
        std.debug.assert(dest_buf.len >= source_raw.len);
        var source: []const u8 = source_raw[0..];

        // trim the start/end delimiters
        var delim_len: usize = 1;
        var is_long_string: bool = false;
        switch (source[0]) {
            '\'', '"' => {},
            '[' => {
                // long bracket: '[' followed by zero or more '=' and another '['
                var num_sep: usize = 0;
                while (source[1 + num_sep] == '=') : (num_sep += 1) {}
                std.debug.assert(source[1 + num_sep] == '[');
                delim_len = 2 + num_sep;
                is_long_string = true;
            },
            else => unreachable,
        }
        source = source[delim_len .. source.len - delim_len];

        // like std.io.SliceOutStream but no need to check bounds of slice
        // and can only append 1 character at a time (also doesn't implement Stream)
        const SliceWriter = struct {
            const Self = @This();

            pos: usize = 0,
            slice: []u8,

            fn write(self: *Self, char: u8) void {
                self.slice[self.pos] = char;
                self.pos += 1;
            }

            fn getWritten(self: Self) []u8 {
                return self.slice[0..self.pos];
            }
        };

        var writer = SliceWriter{ .slice = dest_buf };

        if (is_long_string) {
            var cursor: usize = 0;
            // a line ending directly after the opening bracket is not part of the string
            if (source.len > 0 and (source[0] == '\r' or source[0] == '\n')) {
                cursor = lineEndingLen(source, 0);
            }
            while (cursor < source.len) {
                const c = source[cursor];
                if (c == '\r' or c == '\n') {
                    // any line ending (including \r\n and \n\r pairs) becomes one \n
                    writer.write('\n');
                    cursor += lineEndingLen(source, cursor);
                } else {
                    // everything else, backslashes included, is taken literally
                    writer.write(c);
                    cursor += 1;
                }
            }
            return writer.getWritten();
        }

        const State = enum {
            Normal,
            Escaped,
            EscapedNumerals,
            EscapedLineEndings,
        };

        // value accumulated so far for a \ddd escape, and how many digits were consumed
        var string_escape_n: u8 = 0;
        var string_escape_i: std.math.IntFittingRange(0, 3) = 0;
        var state: State = State.Normal;
        var index: usize = 0;
        while (index < source.len) : (index += 1) {
            const c = source[index];
            switch (state) {
                State.Normal => switch (c) {
                    // Lua's string parser transforms all \r to \n
                    '\r' => writer.write('\n'),
                    '\\' => state = State.Escaped,
                    else => writer.write(c),
                },
                State.Escaped => switch (c) {
                    '0'...'9' => {
                        string_escape_n = c - '0';
                        string_escape_i = 1;
                        state = State.EscapedNumerals;
                    },
                    '\r', '\n' => {
                        // escaped \r and \n get transformed to \n
                        writer.write('\n');
                        state = State.EscapedLineEndings;
                    },
                    else => {
                        switch (c) {
                            'a' => writer.write('\x07'),
                            'b' => writer.write('\x08'),
                            'f' => writer.write('\x0C'),
                            'n' => writer.write('\n'),
                            'r' => writer.write('\r'),
                            't' => writer.write('\t'),
                            'v' => writer.write('\x0B'),
                            // any other escaped character stands for itself (\\, \", \', ...)
                            else => writer.write(c),
                        }
                        state = State.Normal;
                    },
                },
                State.EscapedNumerals => switch (c) {
                    '0'...'9' => {
                        // the lexer is relied upon to reject values that don't fit in a byte
                        string_escape_n = 10 * string_escape_n + (c - '0');
                        string_escape_i += 1;
                        // a \ddd escape is at most 3 digits long
                        if (string_escape_i == 3) {
                            writer.write(string_escape_n);
                            state = State.Normal;
                        }
                    },
                    else => {
                        writer.write(string_escape_n);
                        // backtrack so that we handle the current char properly
                        index -= 1;
                        state = State.Normal;
                    },
                },
                State.EscapedLineEndings => switch (c) {
                    // second half of an escaped line ending pair; already emitted as one \n
                    '\r', '\n' => {
                        state = State.Normal;
                    },
                    else => {
                        // backtrack so that we handle the current char properly
                        index -= 1;
                        state = State.Normal;
                    },
                },
            }
        }
        // we could be in a state that still needs processing here,
        // since we could have hit the end of the string while unsure
        // if a \ddd pattern was finished
        switch (state) {
            State.EscapedNumerals => writer.write(string_escape_n),
            State.Normal, State.EscapedLineEndings => {},
            // a validated literal is not expected to end directly after a backslash
            State.Escaped => unreachable,
        }

        return writer.getWritten();
    }

    /// Returns how many bytes (1 or 2) make up the line ending that begins at
    /// `source[start]`, which must be '\r' or '\n'. A following newline
    /// character counts as part of the same line ending only when it differs
    /// from the first one (\r\n or \n\r).
    fn lineEndingLen(source: []const u8, start: usize) usize {
        const following = start + 1;
        if (following < source.len) {
            const second = source[following];
            if ((second == '\r' or second == '\n') and second != source[start]) return 2;
        }
        return 1;
    }
};
|
||
test "parseStringLiteral" {
    // Each case pairs the raw literal source with the string it should parse to.
    const Case = struct {
        source: []const u8,
        expected: []const u8,
    };
    const cases = [_]Case{
        // every delimiter style yields the same contents
        Case{ .source = "'hello'", .expected = "hello" },
        Case{ .source = "\"hello\"", .expected = "hello" },
        Case{ .source = "[[hello]]", .expected = "hello" },
        Case{ .source = "[=[hello]=]", .expected = "hello" },
        Case{ .source = "[===[hello]===]", .expected = "hello" },
        Case{ .source = "'\\\\ \\n \\v'", .expected = "\\ \n \x0B" },

        // long strings skip initial newline
        Case{ .source = "[[\nhello]]", .expected = "hello" },
        Case{ .source = "[[\r\rhello]]", .expected = "\nhello" },

        // escaped \r gets transformed into \n
        Case{ .source = "\"\\\r\"", .expected = "\n" },

        // escaped newlines and newline pairs
        Case{ .source = "\"\\\r\\\\ \"", .expected = "\n\\ " },
        Case{ .source = "\"\\\r\n\\\\ \"", .expected = "\n\\ " },
        Case{ .source = "\"\\\n\r\"", .expected = "\n" },

        // escaped numerals
        Case{ .source = "\"\\1-\\2\"", .expected = "\x01-\x02" },
    };

    // scratch space for the parsed output; comfortably larger than any case above
    var storage: [100]u8 = undefined;
    for (cases) |case| {
        const parsed = Parser.parseStringLiteral(case.source, storage[0..]);
        std.testing.expectEqualSlices(u8, case.expected, parsed);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,12 @@ | ||
const std = @import("std"); | ||
|
||
pub const lex = @import("lex.zig"); | ||
pub const parse = @import("parse.zig"); | ||
|
||
/// Executable entry point. The body is empty: running the program does nothing.
pub fn main() void {}
|
||
test "zua" {
    // Reference the lex.zig and parse.zig imports (already bound to the
    // file-level `lex` and `parse` declarations) from within a test block.
    _ = lex;
    _ = parse;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
const std = @import("std"); | ||
const zua = @import("zua"); | ||
const lex = zua.lex; | ||
const parse = zua.parse; | ||
|
||
// Tests for comparing parsed strings between Zua and Lua. | ||
// Expects @import("build_options").fuzz_strings_inputs_dir to be a path to | ||
// a directory containing a corpus of inputs to test and | ||
// @import("build_options").fuzz_strings_outputs_dir to be a path to a | ||
// directory containing the corresponding expected string after | ||
// parsing. | ||
// | ||
// A usable inputs/outputs pair can be obtained from | ||
// https://github.com/squeek502/fuzzing-lua | ||
|
||
const verboseTestPrinting = false; | ||
|
||
const build_options = @import("build_options"); | ||
const inputs_dir_opt = build_options.fuzz_strings_inputs_dir; | ||
const outputs_dir_opt = build_options.fuzz_strings_outputs_dir; | ||
|
||
// Walks every entry under the inputs directory, parses each string token
// found in it and compares the result against the file with the same
// basename in the outputs directory.
test "string input/output pairs" {
    var arena_allocator = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena_allocator.deinit();
    var allocator = &arena_allocator.allocator;

    // resolve these now since Zig's std lib on Windows rejects paths with / as the path sep
    const inputs_dir = try std.fs.path.resolve(allocator, &[_][]const u8{inputs_dir_opt});
    const outputs_dir = try std.fs.path.resolve(allocator, &[_][]const u8{outputs_dir_opt});

    var walker = try std.fs.walkPath(allocator, inputs_dir);
    defer walker.deinit();
    // starts out as outputs_dir; the part after it is rewritten for every entry
    var path_buffer = try std.Buffer.init(allocator, outputs_dir);
    defer path_buffer.deinit();

    // number of input entries visited
    var n: usize = 0;
    while (try walker.next()) |entry| {
        if (verboseTestPrinting) {
            std.debug.warn("\n{}\n", .{entry.basename});
        }
        const contents = try entry.dir.readFileAlloc(allocator, entry.basename, std.math.maxInt(usize));
        defer allocator.free(contents);

        // build "<outputs_dir><sep><basename>" to locate the expected output
        path_buffer.shrink(outputs_dir.len);
        try path_buffer.appendByte(std.fs.path.sep);
        try path_buffer.append(entry.basename);
        const expected_contents = try std.io.readFileAlloc(allocator, path_buffer.toSliceConst());
        defer allocator.free(expected_contents);

        var lexer = lex.DefaultLexer.init(contents);
        while (true) {
            // NOTE(review): a lexer error silently ends processing of this
            // input (the entry is still counted) -- confirm that is intended
            const token = lexer.next() catch break;
            if (token.id == lex.Token.Id.Eof) break;
            if (token.id != lex.Token.Id.String) continue;

            const string_source = contents[token.start..token.end];
            // the parsed string needs a buffer at least as large as its source
            var buf = try allocator.alloc(u8, string_source.len);
            defer allocator.free(buf);
            const parsed = parse.Parser.parseStringLiteral(string_source, buf);
            if (verboseTestPrinting) {
                std.debug.warn("got\n{x}\n", .{parsed});
                std.debug.warn("expected\n{x}\n", .{expected_contents});
            }
            std.testing.expectEqualSlices(u8, expected_contents, parsed);
        }
        n += 1;
    }
    std.debug.warn("{} input/output pairs checked...", .{n});
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
const std = @import("std"); | ||
const lex = @import("zua").lex; | ||
|
||
// Code for generating a potentially huge collection of | ||
// files containing the source of every string literal token | ||
// in the corpus provided in @import("build_options").fuzz_lex_inputs_dir | ||
// and outputting them to @import("build_options").fuzz_strings_gen_dir | ||
// | ||
// This is a building block for use later with fuzz_strings.zig, | ||
// after minimizing/generating outputs with https://github.com/squeek502/fuzzing-lua | ||
|
||
const build_options = @import("build_options"); | ||
const inputs_dir_opt = build_options.fuzz_lex_inputs_dir; | ||
const outputs_dir_opt = build_options.fuzz_strings_gen_dir; | ||
|
||
/// Lexes every entry under the inputs directory and writes the source text of
/// each string token to its own file in the (freshly recreated) outputs
/// directory. The files are named with an increasing counter starting at 0.
pub fn main() !void {
    var arena_allocator = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena_allocator.deinit();
    var allocator = &arena_allocator.allocator;

    // resolve these now since Zig's std lib on Windows rejects paths with / as the path sep
    const inputs_dir = try std.fs.path.resolve(allocator, &[_][]const u8{inputs_dir_opt});
    const outputs_dir = try std.fs.path.resolve(allocator, &[_][]const u8{outputs_dir_opt});

    // clean the outputs dir
    std.fs.deleteTree(outputs_dir) catch |err| switch (err) {
        // nothing to clean if it doesn't exist yet
        error.FileNotFound => {},
        else => |e| return e,
    };
    try std.fs.makePath(allocator, outputs_dir);

    var walker = try std.fs.walkPath(allocator, inputs_dir);
    defer walker.deinit();
    // starts out as outputs_dir; the part after it is rewritten for every token
    var path_buffer = try std.Buffer.init(allocator, outputs_dir);
    defer path_buffer.deinit();

    // number of string tokens written so far; doubles as the output file name
    var n: usize = 0;
    while (try walker.next()) |entry| {
        const contents = try entry.dir.readFileAlloc(allocator, entry.basename, std.math.maxInt(usize));
        defer allocator.free(contents);

        var lexer = lex.DefaultLexer.init(contents);
        while (true) {
            // a lexer error ends processing of the current input; tokens
            // found before the error have already been written out
            const token = lexer.next() catch break;
            if (token.id == lex.Token.Id.Eof) break;
            if (token.id != lex.Token.Id.String) continue;

            // build "<outputs_dir><sep><n>"
            path_buffer.shrink(outputs_dir.len);
            try path_buffer.appendByte(std.fs.path.sep);
            var buffer_out_stream = std.io.BufferOutStream.init(&path_buffer);
            try buffer_out_stream.stream.print("{}", .{n});

            try std.io.writeFile(path_buffer.toSliceConst(), contents[token.start..token.end]);

            n += 1;
            // progress indicator, rewritten in place via \r
            if (n % 100 == 0) {
                std.debug.warn("{}...\r", .{n});
            }
        }
    }
    std.debug.warn("{} files written to '{}'\n", .{ n, outputs_dir });
}
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
[[[�]Hk*[�]] |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"\\\tr\tr\tr\t" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"Ea*G" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
[[ | ||
|
||
|
||
]] |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"\1-\2" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
[[e�]] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"\"" |
Binary file not shown.
Oops, something went wrong.