Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
256 changes: 231 additions & 25 deletions src/wasm/wat-parser-internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

#include <cassert>
#include <cctype>
#include <cmath>
#include <iostream>
#include <optional>
#include <sstream>
Expand Down Expand Up @@ -107,6 +108,28 @@ struct LexCtx {

enum Signedness { Unsigned, Signed };

enum OverflowBehavior { DisallowOverflow, IgnoreOverflow };

// Map a decimal digit character to its numeric value. Returns nullopt for any
// character outside '0'..'9'.
std::optional<int> getDigit(char c) {
  const bool isDecimal = c >= '0' && c <= '9';
  if (!isDecimal) {
    return std::nullopt;
  }
  return std::optional<int>(c - '0');
}

std::optional<int> getHexDigit(char c) {
if ('0' <= c && c <= '9') {
return {c - '0'};
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't these be without the { }? iirc C++ will convert X to optional<X> for you.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll remove these and other unnecessary braces in a separate NFC PR.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good.

}
if ('A' <= c && c <= 'F') {
return {10 + c - 'A'};
}
if ('a' <= c && c <= 'f') {
return {10 + c - 'a'};
}
return std::nullopt;
}

// The result of lexing an integer token fragment.
struct LexIntResult : LexResult {
uint64_t n;
Expand All @@ -124,29 +147,17 @@ struct LexIntCtx : LexCtx {
bool negative = false;
bool overflow = false;

std::optional<int> getDigit(char c) {
if ('0' <= c && c <= '9') {
return {c - '0'};
}
return std::nullopt;
}
public:
explicit LexIntCtx(std::string_view in) : LexCtx(in) {}

std::optional<int> getHexDigit(char c) {
if ('0' <= c && c <= '9') {
return {c - '0'};
}
if ('A' <= c && c <= 'F') {
return {10 + c - 'A'};
}
if ('a' <= c && c <= 'f') {
return {10 + c - 'a'};
// Lex only the underlying span, ignoring the overflow and value.
std::optional<LexIntResult> lexedRaw() {
if (auto basic = LexCtx::lexed()) {
return LexIntResult{*basic, 0, Unsigned};
}
return std::nullopt;
return {};
}

public:
explicit LexIntCtx(std::string_view in) : LexCtx(in) {}

std::optional<LexIntResult> lexed() {
// Check most significant bit for overflow of signed numbers.
if (overflow) {
Expand Down Expand Up @@ -217,6 +228,54 @@ struct LexIntCtx : LexCtx {
}
};

// The result of lexing a float token: the base LexResult extended with the
// lexed value. Instances are built by aggregate initialization, so the member
// order below (nanPayload, then d) is significant.
struct LexFloatResult : LexResult {
// The payload if we lexed a nan with payload. We cannot store the payload
// directly in `d` because we do not know at this point whether we are parsing
// an f32 or f64 and therefore we do not know what the allowable payloads are.
std::optional<uint64_t> nanPayload;
// The lexed value at double precision. LexFloatCtx::lexed() sets this to NAN
// or -NAN for nan tokens with a payload and for "-nan"; every other token gets
// whatever std::strtod produces for its text with underscores removed.
double d;
};

struct LexFloatCtx : LexCtx {
std::optional<uint64_t> nanPayload;

LexFloatCtx(std::string_view in) : LexCtx(in) {}

std::optional<LexFloatResult> lexed() {
assert(!std::signbit(NAN) && "Expected NAN to be positive");
auto basic = LexCtx::lexed();
if (!basic) {
return {};
}
if (nanPayload) {
double nan = basic->span[0] == '-' ? -NAN : NAN;
return LexFloatResult{*basic, nanPayload, nan};
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this initialize base class members in order and then the child class members? Didn't know this was possible..

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, that's my understanding. I wouldn't have thought that this would work either, but apparently it does 🤷

}
// strtod does not return -NAN for "-nan" on all platforms.
if (basic->span == "-nan"sv) {
return LexFloatResult{*basic, nanPayload, -NAN};
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
return LexFloatResult{*basic, nanPayload, -NAN};
return LexFloatResult{*basic, {}, -NAN};

Can nanPayload ever be non-null here, given that the if above does an early return?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, it will always be nullopt. I can make this change in a follow-up.

}
// Do not try to implement fully general and precise float parsing
// ourselves. Instead, call out to std::strtod to do our parsing. This means
// we need to strip any underscores since `std::strtod` does not understand
// them.
std::stringstream ss;
for (const char *curr = basic->span.data(),
*end = curr + basic->span.size();
curr != end;
++curr) {
if (*curr != '_') {
ss << *curr;
}
}
std::string str = ss.str();
char* last;
double d = std::strtod(str.data(), &last);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The existing code uses strtof for an f32, but I'm not sure if that's necessary...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's no way to tell whether or not we are lexing an f32 or an f64 without higher level parser context, so we have to conservatively use double precision here. I don't think there are any problems that can arise from that.

assert(last == str.data() + str.size() && "could not parse float");
return LexFloatResult{*basic, {}, d};
}
};

struct LexStrResult : LexResult {
// Allocate a string only if there are escape sequences, otherwise just use
// the original string_view.
Expand Down Expand Up @@ -378,17 +437,22 @@ bool LexCtx::canFinish() const {
// num ::= d:digit => d
// | n:num '_'? d:digit => 10*n + d
// digit ::= '0' => 0 | ... | '9' => 9
std::optional<LexIntResult> num(std::string_view in) {
std::optional<LexIntResult> num(std::string_view in,
OverflowBehavior overflow = DisallowOverflow) {
LexIntCtx ctx(in);
if (ctx.empty()) {
return {};
}
if (!ctx.takeDigit()) {
return {};
}
while (true) {
bool under = ctx.takePrefix("_"sv);
if (!ctx.takeDigit()) {
if (!under) {
return ctx.lexed();
return overflow == DisallowOverflow ? ctx.lexed() : ctx.lexedRaw();
}
// TODO: Add error production for trailing underscore.
return {};
}
}
Expand All @@ -399,7 +463,8 @@ std::optional<LexIntResult> num(std::string_view in) {
// hexdigit ::= d:digit => d
// | 'A' => 10 | ... | 'F' => 15
// | 'a' => 10 | ... | 'f' => 15
std::optional<LexIntResult> hexnum(std::string_view in) {
std::optional<LexIntResult>
hexnum(std::string_view in, OverflowBehavior overflow = DisallowOverflow) {
LexIntCtx ctx(in);
if (!ctx.takeHexdigit()) {
return {};
Expand All @@ -408,8 +473,9 @@ std::optional<LexIntResult> hexnum(std::string_view in) {
bool under = ctx.takePrefix("_"sv);
if (!ctx.takeHexdigit()) {
if (!under) {
return ctx.lexed();
return overflow == DisallowOverflow ? ctx.lexed() : ctx.lexedRaw();
}
// TODO: Add error production for trailing underscore.
return {};
}
}
Expand Down Expand Up @@ -445,6 +511,114 @@ std::optional<LexIntResult> integer(std::string_view in) {
return {};
}

// float ::= p:num '.'? => p
//         | p:num '.' q:frac => p + q
//         | p:num '.'? ('E'|'e') s:sign e:num => p * 10^([s]e)
//         | p:num '.' q:frac ('E'|'e') s:sign e:num => (p + q) * 10^([s]e)
// frac ::= d:digit => d/10
//        | d:digit '_'? p:frac => (d + p/10) / 10
//
// Lexes only the span of a decimal float; no numeric value is computed here.
// Each component is lexed with `num` in IgnoreOverflow mode. Returns an empty
// optional if there is no leading number or if an exponent marker is not
// followed by a number.
std::optional<LexResult> decfloat(std::string_view in) {
  LexCtx ctx(in);

  // Mandatory integer part.
  auto whole = num(ctx.next(), IgnoreOverflow);
  if (!whole) {
    return {};
  }
  ctx.take(*whole);

  // Optional '.' followed by optional frac
  if (ctx.takePrefix("."sv)) {
    auto frac = num(ctx.next(), IgnoreOverflow);
    if (frac) {
      ctx.take(*frac);
    }
  }

  // Without an exponent marker the token ends here.
  const bool hasExponent = ctx.takePrefix("E"sv) || ctx.takePrefix("e"sv);
  if (!hasExponent) {
    return ctx.lexed();
  }

  // Optional sign
  if (!ctx.takePrefix("+"sv)) {
    ctx.takePrefix("-"sv);
  }

  auto exponent = num(ctx.next(), IgnoreOverflow);
  if (!exponent) {
    // TODO: Add error production for missing exponent.
    return {};
  }
  ctx.take(*exponent);
  return ctx.lexed();
}

// hexfloat ::= '0x' p:hexnum '.'? => p
//            | '0x' p:hexnum '.' q:hexfrac => p + q
//            | '0x' p:hexnum '.'? ('P'|'p') s:sign e:num => p * 2^([s]e)
//            | '0x' p:hexnum '.' q:hexfrac ('P'|'p') s:sign e:num
//                   => (p + q) * 2^([s]e)
// hexfrac ::= h:hexdigit => h/16
//           | h:hexdigit '_'? p:hexfrac => (h + p/16) / 16
//
// Lexes only the span of a hexadecimal float; no numeric value is computed
// here. The mantissa parts use `hexnum` and the exponent uses `num`, all in
// IgnoreOverflow mode. Returns an empty optional if the "0x" prefix or the
// leading hex number is missing, or if an exponent marker is not followed by a
// number.
std::optional<LexResult> hexfloat(std::string_view in) {
  LexCtx ctx(in);

  // Mandatory "0x" prefix.
  if (!ctx.takePrefix("0x"sv)) {
    return {};
  }

  // Mandatory integer part.
  auto whole = hexnum(ctx.next(), IgnoreOverflow);
  if (!whole) {
    return {};
  }
  ctx.take(*whole);

  // Optional '.' followed by optional hexfrac
  if (ctx.takePrefix("."sv)) {
    auto frac = hexnum(ctx.next(), IgnoreOverflow);
    if (frac) {
      ctx.take(*frac);
    }
  }

  // Without an exponent marker the token ends here.
  const bool hasExponent = ctx.takePrefix("P"sv) || ctx.takePrefix("p"sv);
  if (!hasExponent) {
    return ctx.lexed();
  }

  // Optional sign
  if (!ctx.takePrefix("+"sv)) {
    ctx.takePrefix("-"sv);
  }

  auto exponent = num(ctx.next(), IgnoreOverflow);
  if (!exponent) {
    // TODO: Add error production for missing exponent.
    return {};
  }
  ctx.take(*exponent);
  return ctx.lexed();
}

// fN ::= s:sign z:fNmag => [s]z
// fNmag ::= z:float => float_N(z) (if float_N(z) != +/-infinity)
//         | z:hexfloat => float_N(z) (if float_N(z) != +/-infinity)
//         | 'inf' => infinity
//         | 'nan' => nan(2^(signif(N)-1))
//         | 'nan:0x' n:hexnum => nan(n) (if 1 <= n < 2^signif(N))
//
// Lexes a complete float token: an optional sign followed by one magnitude
// form. The alternatives are tried in the order hexfloat, decfloat, "inf",
// "nan". A NaN payload is accepted only in the range [1, 2^52) and is recorded
// on the context. Returns an empty optional if no alternative matches, if the
// payload is malformed or out of range, or if the context cannot finish after
// the magnitude.
std::optional<LexFloatResult> float_(std::string_view in) {
  LexFloatCtx ctx(in);

  // Optional sign
  if (!ctx.takePrefix("+"sv)) {
    ctx.takePrefix("-"sv);
  }

  // Consume one magnitude; returns false if nothing valid could be lexed.
  const auto takeMagnitude = [&]() -> bool {
    if (auto hex = hexfloat(ctx.next())) {
      ctx.take(*hex);
      return true;
    }
    if (auto dec = decfloat(ctx.next())) {
      ctx.take(*dec);
      return true;
    }
    if (ctx.takePrefix("inf"sv)) {
      return true;
    }
    if (!ctx.takePrefix("nan"sv)) {
      return false;
    }
    if (!ctx.takePrefix(":0x"sv)) {
      // Plain "nan" without a payload.
      return true;
    }
    auto payload = hexnum(ctx.next());
    if (!payload) {
      // TODO: Add error production for malformed NaN payload.
      return false;
    }
    ctx.take(*payload);
    const bool inRange = 1 <= payload->n && payload->n < (1ull << 52);
    if (!inRange) {
      // TODO: Add error production for invalid NaN payload.
      return false;
    }
    ctx.nanPayload = payload->n;
    return true;
  };

  if (!takeMagnitude()) {
    return {};
  }
  if (!ctx.canFinish()) {
    return {};
  }
  return ctx.lexed();
}

// idchar ::= '0' | ... | '9'
// | 'A' | ... | 'Z'
// | 'a' | ... | 'z'
Expand Down Expand Up @@ -642,6 +816,31 @@ struct IntTok {
}
};

struct FloatTok {
// The payload if we lexed a nan with payload. We cannot store the payload
// directly in `d` because we do not know at this point whether we are parsing
// an f32 or f64 and therefore we do not know what the allowable payloads are.
Comment on lines +820 to +822
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For non-NaN numbers, you store as a double conservatively and fix it later. Can't we do the same for NaNs?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about that, but the problem is that the default payload depends on the size of the float we are parsing. It's (2^52)-1 for f64 and (2^23)-1 for f32. If we stored the payload directly in the double to be fixed up later, then we wouldn't be able to tell the difference between a custom payload of (2^52)-1 that is out-of-bounds for an f32 and a default payload that was conservatively set to (2^52)-1 and can be fixed up. We could fix that in different ways, like having a bool isDefaultPayload, but this seems simpler overall.

std::optional<uint64_t> nanPayload;
double d;

// Print a human-readable form of the token. NaNs are written as an explicit
// sign followed by "nan" or, when a payload was lexed, "nan:0x<payload in
// hex>". All other values use the stream's formatting for doubles.
friend std::ostream& operator<<(std::ostream& os, const FloatTok& tok) {
  if (std::isnan(tok.d)) {
    // std::signbit is true when the sign bit is set, i.e. for negative values,
    // so a set sign bit must print "-" and a clear one "+".
    os << (std::signbit(tok.d) ? "-" : "+");
    if (tok.nanPayload) {
      // Switch to hex only for the payload, then return the stream to decimal.
      return os << "nan:0x" << std::hex << *tok.nanPayload << std::dec;
    }
    return os << "nan";
  }
  return os << tok.d;
}

// Two float tokens are equal when their sign bits match and either their
// values compare equal or both are NaN with the same (possibly absent)
// payload. The explicit sign check also keeps +0 and -0 distinct.
friend bool operator==(const FloatTok& t1, const FloatTok& t2) {
  if (std::signbit(t1.d) != std::signbit(t2.d)) {
    return false;
  }
  if (t1.d == t2.d) {
    return true;
  }
  // NaN never compares equal to itself, so NaNs are compared by payload.
  if (!std::isnan(t1.d) || !std::isnan(t2.d)) {
    return false;
  }
  return t1.nanPayload == t2.nanPayload;
}
};

struct IdTok {
friend std::ostream& operator<<(std::ostream& os, const IdTok&) {
return os << "id";
Expand Down Expand Up @@ -676,8 +875,13 @@ struct KeywordTok {
};

struct Token {
using Data =
std::variant<LParenTok, RParenTok, IntTok, IdTok, StringTok, KeywordTok>;
using Data = std::variant<LParenTok,
RParenTok,
IntTok,
FloatTok,
IdTok,
StringTok,
KeywordTok>;

std::string_view span;
Data data;
Expand Down Expand Up @@ -765,6 +969,8 @@ struct Lexer {
tok = Token{t->span, IdTok{}};
} else if (auto t = integer(next())) {
tok = Token{t->span, IntTok{t->n, t->signedness}};
} else if (auto t = float_(next())) {
tok = Token{t->span, FloatTok{t->nanPayload, t->d}};
} else if (auto t = str(next())) {
tok = Token{t->span, StringTok{t->str}};
} else if (auto t = keyword(next())) {
Expand Down
Loading