WebAssembly · tlively · May 27, 2022 · May 26, 2022 · May 27, 2022 · May 27, 2022
diff --git a/src/wasm/CMakeLists.txt b/src/wasm/CMakeLists.txt
@@ -12,6 +12,7 @@ set(wasm_SOURCES
   wasm-stack.cpp
   wasm-type.cpp
   wasm-validator.cpp
+  wat-lexer.cpp
   ${wasm_HEADERS}
 )
 # wasm-debug.cpp includes LLVM header using std::iterator (deprecated in C++17)

diff --git a/src/wasm/wat-parser-internal.h → src/wasm/wat-lexer.cpp b/src/wasm/wat-parser-internal.h → src/wasm/wat-lexer.cpp
@@ -14,16 +14,6 @@
  * limitations under the License.
  */
 
-// Usage note
-// ----------
-//
-// This parser is a work in progress and this file should not yet be included
-// anywhere except for in its own tests. Once the parser is usable, we will add
-// wat-parser.h to declare the public parsing API and wat-parser.cpp to
-// implement the public parsing functions in terms of the private API in this
-// header. The private API will stay in this header rather than moving to
-// wat-parser.cpp so that we can continue to unit test it.
-
 #include <cassert>
 #include <cctype>
 #include <cmath>
@@ -32,6 +22,8 @@
 #include <sstream>
 #include <variant>
 
+#include "wat-lexer.h"
+
 using namespace std::string_view_literals;
 
 namespace wasm::WATParser {
@@ -106,8 +98,6 @@ struct LexCtx {
   void takeAll() { lexedSize = input.size(); }
 };
 
-enum Signedness { Unsigned, Signed };
-
 enum OverflowBehavior { DisallowOverflow, IgnoreOverflow };
 
 std::optional<int> getDigit(char c) {
@@ -786,258 +776,127 @@ std::optional<LexResult> keyword(std::string_view in) {
   return ctx.lexed();
 }
 
-// ======
-// Tokens
-// ======
-
-struct LParenTok {
-  friend std::ostream& operator<<(std::ostream& os, const LParenTok&) {
-    return os << "'('";
-  }
-
-  friend bool operator==(const LParenTok&, const LParenTok&) { return true; }
-};
-
-struct RParenTok {
-  friend std::ostream& operator<<(std::ostream& os, const RParenTok&) {
-    return os << "')'";
-  }
-
-  friend bool operator==(const RParenTok&, const RParenTok&) { return true; }
-};
-
-struct IntTok {
-  uint64_t n;
-  Signedness signedness;
-
-  friend std::ostream& operator<<(std::ostream& os, const IntTok& tok) {
-    return os << tok.n << (tok.signedness == Signed ? " signed" : " unsigned");
-  }
-
-  friend bool operator==(const IntTok& t1, const IntTok& t2) {
-    return t1.n == t2.n && t1.signedness == t2.signedness;
-  }
-};
-
-struct FloatTok {
-  // The payload if we lexed a nan with payload. We cannot store the payload
-  // directly in `d` because we do not know at this point whether we are parsing
-  // an f32 or f64 and therefore we do not know what the allowable payloads are.
-  std::optional<uint64_t> nanPayload;
-  double d;
-
-  friend std::ostream& operator<<(std::ostream& os, const FloatTok& tok) {
-    if (std::isnan(tok.d)) {
-      os << (std::signbit(tok.d) ? "+" : "-");
-      if (tok.nanPayload) {
-        return os << "nan:0x" << std::hex << *tok.nanPayload << std::dec;
-      }
-      return os << "nan";
-    }
-    return os << tok.d;
-  }
+} // anonymous namespace
 
-  friend bool operator==(const FloatTok& t1, const FloatTok& t2) {
-    return std::signbit(t1.d) == std::signbit(t2.d) &&
-           (t1.d == t2.d || (std::isnan(t1.d) && std::isnan(t2.d) &&
-                             t1.nanPayload == t2.nanPayload));
+void Lexer::skipSpace() {
+  if (auto ctx = space(next())) {
+    index += ctx->span.size();
   }
-};
+}
 
-struct IdTok {
-  friend std::ostream& operator<<(std::ostream& os, const IdTok&) {
-    return os << "id";
+void Lexer::lexToken() {
+  // TODO: Ensure we're getting the longest possible match.
+  Token tok;
+  if (auto t = lparen(next())) {
+    tok = Token{t->span, LParenTok{}};
+  } else if (auto t = rparen(next())) {
+    tok = Token{t->span, RParenTok{}};
+  } else if (auto t = ident(next())) {
+    tok = Token{t->span, IdTok{}};
+  } else if (auto t = integer(next())) {
+    tok = Token{t->span, IntTok{t->n, t->signedness}};
+  } else if (auto t = float_(next())) {
+    tok = Token{t->span, FloatTok{t->nanPayload, t->d}};
+  } else if (auto t = str(next())) {
+    tok = Token{t->span, StringTok{t->str}};
+  } else if (auto t = keyword(next())) {
+    tok = Token{t->span, KeywordTok{}};
+  } else {
+    // TODO: Do something about lexing errors.
+    curr = std::nullopt;
+    return;
   }
+  index += tok.span.size();
+  curr = {tok};
+}
 
-  friend bool operator==(const IdTok&, const IdTok&) { return true; }
-};
-
-struct StringTok {
-  std::optional<std::string> str;
-
-  friend std::ostream& operator<<(std::ostream& os, const StringTok& tok) {
-    if (tok.str) {
-      os << '"' << *tok.str << '"';
+TextPos Lexer::position(const char* c) {
+  assert(size_t(c - buffer.data()) < buffer.size());
+  TextPos pos{1, 0};
+  for (const char* p = buffer.data(); p != c; ++p) {
+    if (*p == '\n') {
+      pos.line++;
+      pos.col = 0;
     } else {
-      os << "(raw string)";
+      pos.col++;
     }
-    return os;
-  }
-
-  friend bool operator==(const StringTok& t1, const StringTok& t2) {
-    return t1.str == t2.str;
   }
-};
-
-struct KeywordTok {
-  friend std::ostream& operator<<(std::ostream& os, const KeywordTok&) {
-    return os << "keyword";
-  }
-
-  friend bool operator==(const KeywordTok&, const KeywordTok&) { return true; }
-};
-
-struct Token {
-  using Data = std::variant<LParenTok,
-                            RParenTok,
-                            IntTok,
-                            FloatTok,
-                            IdTok,
-                            StringTok,
-                            KeywordTok>;
-
-  std::string_view span;
-  Data data;
-
-  // Suppress clang-tidy false positive about unused functions.
-  [[maybe_unused]] friend std::ostream& operator<<(std::ostream& os,
-                                                   const Token& tok) {
-    std::visit([&](const auto& t) { os << t; }, tok.data);
-    return os << " \"" << tok.span << "\"";
-  }
-
-  [[maybe_unused]] friend bool operator==(const Token& t1, const Token& t2) {
-    return t1.span == t2.span &&
-           std::visit(
-             [](auto& d1, auto& d2) {
-               if constexpr (std::is_same_v<decltype(d1), decltype(d2)>) {
-                 return d1 == d2;
-               } else {
-                 return false;
-               }
-             },
-             t1.data,
-             t2.data);
-  }
-};
-
-struct TextPos {
-  size_t line;
-  size_t col;
+  return pos;
+}
 
-  bool operator==(const TextPos& other) const {
-    return line == other.line && col == other.col;
-  }
-  bool operator!=(const TextPos& other) const { return !(*this == other); }
+bool TextPos::operator==(const TextPos& other) const {
+  return line == other.line && col == other.col;
+}
 
-  // Suppress clang-tidy false positive about unused functions.
-  [[maybe_unused]] friend std::ostream& operator<<(std::ostream& os,
-                                                   const TextPos& pos) {
-    return os << pos.line << ":" << pos.col;
-  }
-};
+bool IntTok::operator==(const IntTok& other) const {
+  return n == other.n && signedness == other.signedness;
+}
 
-// Lexer's purpose is twofold. First, it wraps a buffer to provide a tokenizing
-// iterator over it. Second, it implements that iterator itself. Also provides
-// utilities for locating the text position of tokens within the buffer. Text
-// positions are computed on demand rather than eagerly because they are
-// typically only needed when there is an error to report.
-struct Lexer {
-  using iterator = Lexer;
-  using difference_type = std::ptrdiff_t;
-  using value_type = Token;
-  using pointer = const Token*;
-  using reference = const Token&;
-  using iterator_category = std::forward_iterator_tag;
-
-  std::string_view buffer;
-  size_t index = 0;
-  std::optional<Token> curr;
-
-  // The end sentinel.
-  Lexer() = default;
-
-  Lexer(std::string_view buffer) : buffer(buffer) {
-    skipSpace();
-    lexToken();
-    skipSpace();
-  }
+bool FloatTok::operator==(const FloatTok& other) const {
+  return std::signbit(d) == std::signbit(other.d) &&
+         (d == other.d || (std::isnan(d) && std::isnan(other.d) &&
+                           nanPayload == other.nanPayload));
+}
 
-  std::string_view next() const { return buffer.substr(index); }
+bool Token::operator==(const Token& other) const {
+  return span == other.span &&
+         std::visit(
+           [](auto& t1, auto& t2) {
+             if constexpr (std::is_same_v<decltype(t1), decltype(t2)>) {
+               return t1 == t2;
+             } else {
+               return false;
+             }
+           },
+           data,
+           other.data);
+}
 
-  void skipSpace() {
-    if (auto ctx = space(next())) {
-      index += ctx->span.size();
-    }
-  }
+std::ostream& operator<<(std::ostream& os, const TextPos& pos) {
+  return os << pos.line << ":" << pos.col;
+}
 
-  void lexToken() {
-    // TODO: Ensure we're getting the longest possible match.
-    Token tok;
-    if (auto t = lparen(next())) {
-      tok = Token{t->span, LParenTok{}};
-    } else if (auto t = rparen(next())) {
-      tok = Token{t->span, RParenTok{}};
-    } else if (auto t = ident(next())) {
-      tok = Token{t->span, IdTok{}};
-    } else if (auto t = integer(next())) {
-      tok = Token{t->span, IntTok{t->n, t->signedness}};
-    } else if (auto t = float_(next())) {
-      tok = Token{t->span, FloatTok{t->nanPayload, t->d}};
-    } else if (auto t = str(next())) {
-      tok = Token{t->span, StringTok{t->str}};
-    } else if (auto t = keyword(next())) {
-      tok = Token{t->span, KeywordTok{}};
-    } else {
-      // TODO: Do something about lexing errors.
-      curr = std::nullopt;
-      return;
-    }
-    index += tok.span.size();
-    curr = {tok};
-  }
+std::ostream& operator<<(std::ostream& os, const LParenTok&) {
+  return os << "'('";
+}
 
-  Lexer& operator++() {
-    // Preincrement
-    lexToken();
-    skipSpace();
-    return *this;
-  }
+std::ostream& operator<<(std::ostream& os, const RParenTok&) {
+  return os << "')'";
+}
 
-  Lexer operator++(int) {
-    // Postincrement
-    Lexer ret = *this;
-    ++(*this);
-    return ret;
-  }
+std::ostream& operator<<(std::ostream& os, const IdTok&) { return os << "id"; }
 
-  const Token& operator*() { return *curr; }
-  const Token* operator->() { return &*curr; }
+std::ostream& operator<<(std::ostream& os, const IntTok& tok) {
+  return os << tok.n << (tok.signedness == Signed ? " signed" : " unsigned");
+}
 
-  bool operator==(const Lexer& other) const {
-    // The iterator is equal to the end sentinel when there is no current token.
-    if (!curr && !other.curr) {
-      return true;
+std::ostream& operator<<(std::ostream& os, const FloatTok& tok) {
+  if (std::isnan(tok.d)) {
+    os << (std::signbit(tok.d) ? "-" : "+"); // signbit set means negative
+    if (tok.nanPayload) {
+      return os << "nan:0x" << std::hex << *tok.nanPayload << std::dec;
     }
-    // Otherwise they are equivalent when they are at the same position.
-    return index == other.index;
+    return os << "nan";
   }
+  return os << tok.d;
+}
 
-  bool operator!=(const Lexer& other) const { return !(*this == other); }
-
-  Lexer begin() { return *this; }
-
-  Lexer end() { return Lexer(); }
-
-  TextPos position(const char* c) {
-    assert(size_t(c - buffer.data()) < buffer.size());
-    TextPos pos{1, 0};
-    for (const char* p = buffer.data(); p != c; ++p) {
-      if (*p == '\n') {
-        pos.line++;
-        pos.col = 0;
-      } else {
-        pos.col++;
-      }
-    }
-    return pos;
+std::ostream& operator<<(std::ostream& os, const StringTok& tok) {
+  if (tok.str) {
+    os << '"' << *tok.str << '"';
+  } else {
+    os << "(raw string)";
   }
+  return os;
+}
 
-  TextPos position(std::string_view span) { return position(span.data()); }
-
-  TextPos position(Token tok) { return position(tok.span); }
-};
+std::ostream& operator<<(std::ostream& os, const KeywordTok&) {
+  return os << "keyword";
+}
 
-} // anonymous namespace
+std::ostream& operator<<(std::ostream& os, const Token& tok) {
+  std::visit([&](const auto& t) { os << t; }, tok.data);
+  return os << " \"" << tok.span << "\"";
+}
 
 } // namespace wasm::WATParser