Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
73f6d96
text iteration
matthewhammer Feb 27, 2019
b7a3eb7
clean and expand test file
matthewhammer Feb 27, 2019
fe4e324
Merge remote-tracking branch 'origin/master' into text-iter
ggreif Mar 19, 2019
a22e618
WIP: compile printChar provisionally
ggreif Mar 19, 2019
31fdbb9
add a unicode example and demonstrate total brokenness
ggreif Mar 19, 2019
6d45431
WIP: play with UTF-8 char decoding
ggreif Mar 20, 2019
052d5e2
fix character SR, accept
ggreif Mar 20, 2019
713d8aa
allow printing of Unicode characters
ggreif Mar 21, 2019
53e0bd2
Merge branch 'master' into text-iter
ggreif Mar 21, 2019
fb6e872
renix
ggreif Mar 21, 2019
12cf851
implement decodeUTF8 for the interpreter
ggreif Mar 21, 2019
e77e404
remove one kludge
ggreif Mar 21, 2019
3743c39
intro len method for Text
ggreif Mar 22, 2019
0670e2e
review Feedback
nomeata Mar 22, 2019
3e9fb8b
follow-up
ggreif Mar 22, 2019
8516768
text_len passes compiler test, but somewhat verbose
ggreif Mar 22, 2019
6a98e48
compiled iteration over code points, needs cleanup
ggreif Mar 22, 2019
9e7eb99
uninflict some damage
ggreif Mar 22, 2019
befcda1
eliminate some administrative cruft
ggreif Mar 22, 2019
1360c07
code point printing, needs cleanup
ggreif Mar 24, 2019
dde6353
fix bit pattern
ggreif Mar 25, 2019
7714ad4
refactor
ggreif Mar 25, 2019
d62e5e9
shift-related refactoring
ggreif Mar 25, 2019
2e41bb0
feedback
nomeata Mar 25, 2019
000ee13
renamings
ggreif Mar 25, 2019
fdf9204
define and use allocFixedLen
ggreif Mar 25, 2019
9de17c7
refactor
ggreif Mar 25, 2019
cd6f8ac
intro and use Text.unskewed_payload_offset
ggreif Mar 25, 2019
aa98b66
move prim_showChar to Text module
ggreif Mar 25, 2019
4c15850
remodel printChar to call new showChar primitive
ggreif Mar 25, 2019
823b6ab
better naming
ggreif Mar 25, 2019
cdecce8
refactoring, use helpers where possible
ggreif Mar 25, 2019
f127ff6
tweak of the char_length_of_UTF8 comment
ggreif Mar 25, 2019
910da2c
simplifications
ggreif Mar 25, 2019
ce32ccf
review feedback
ggreif Mar 25, 2019
8776e15
rewrite len_UTF8_head
ggreif Mar 25, 2019
f6cd812
keep the utf-8 byte local to len_UTF8_head
ggreif Mar 25, 2019
dbe1f4c
add summary
ggreif Mar 25, 2019
7b63466
review feedback
ggreif Mar 26, 2019
dca07fd
add Char <--> Text tests
ggreif Mar 26, 2019
fa32164
revert fixed allocation, not worth it
ggreif Mar 26, 2019
b31b81f
Merge remote-tracking branch 'origin/master' into text-iter
ggreif Mar 26, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
307 changes: 266 additions & 41 deletions src/compile.ml

Large diffs are not rendered by default.

21 changes: 21 additions & 0 deletions src/prelude.ml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,10 @@ class revrange(x : Nat, y : Nat) {
next() : ?Nat { if (i <= y) null else {i -= 1; ?i} };
};

func charToText(c : Char) : Text = (prim "Char->Text" : Char -> Text) c;

func printInt(x : Int) { (prim "printInt" : Int -> ()) x };
func printChar(x : Char) { print (charToText x) };
func print(x : Text) { (prim "print" : Text -> ()) x };

// Hashing
Expand Down Expand Up @@ -61,6 +64,7 @@ func word64ToInt(n : Word64) : Int = (prim "Word64->Int" : Word64 -> Int) n;

func charToWord32(c : Char) : Word32 = (prim "Char->Word32" : Char -> Word32) c;
func word32ToChar(w : Word32) : Char = (prim "Word32->Char" : Word32 -> Char) w;
func decodeUTF8(s : Text) : (Word32, Char) = (prim "decodeUTF8" : Text -> (Word32, Char)) s;

// Exotic bitwise operations
func shrsWord8(w : Word8, amount : Word8) : Word8 = (prim "shrs8" : (Word8, Word8) -> Word8) (w, amount);
Expand Down Expand Up @@ -254,8 +258,25 @@ let prim = function
| Word64 y -> Word64 (Word64.and_ y (Word64.shl 1L (as_word64 a)))
| _ -> failwith "btst")

(* "Char->Text": turns a single character (a code point, read with [as_char])
   into a text value and passes it to the continuation [k].
   Code points up to 0o177 (127) become a one-byte string directly; every
   other code point is encoded with [Wasm.Utf8.encode]. *)
| "Char->Text" -> fun v k -> let str = match as_char v with
| c when c <= 0o177 -> String.make 1 (Char.chr c)
| code -> Wasm.Utf8.encode [code]
in k (Text str)
| "print" -> fun v k -> Printf.printf "%s%!" (as_text v); k unit
| "printInt" -> fun v k -> Printf.printf "%d%!" (Int.to_int (as_int v)); k unit
(* "decodeUTF8": decodes the first UTF-8 sequence of the text argument and
   passes to [k] a pair (number of bytes in that sequence, as a Word32;
   decoded code point, as a Char).
   Fails with "decodeUTF8" when the first byte matches none of the four
   leader patterns below.
   NOTE(review): only the leading byte is classified. The following bytes are
   masked to their low 6 bits without checking the 10xxxxxx tag, and the
   string length is never checked, so an empty or truncated text fails in the
   [s.[...]] access rather than in the [failwith] branch. *)
| "decodeUTF8" -> fun v k ->
let s = as_text v in
(* [take_and_mask bits offset]: the low [bits] bits of the byte at [offset]. *)
let take_and_mask bits offset = Int32.(logand (sub (shift_left 1l bits) 1l) (of_int (Char.code s.[offset]))) in
(* Maps the leading byte to one extractor per byte of the sequence; each
   extractor is still waiting for the offset of the byte it reads. *)
let classify_utf8_leader =
Int32.(function
| ch when logand ch (lognot 0b01111111l) = 0b00000000l -> [take_and_mask 7]
| ch when logand ch (lognot 0b00011111l) = 0b11000000l -> [take_and_mask 5; take_and_mask 6]
| ch when logand ch (lognot 0b00001111l) = 0b11100000l -> [take_and_mask 4; take_and_mask 6; take_and_mask 6]
| ch when logand ch (lognot 0b00000111l) = 0b11110000l -> [take_and_mask 3; take_and_mask 6; take_and_mask 6; take_and_mask 6]
| _ -> failwith "decodeUTF8") in
(* Apply the i-th extractor to the byte at offset i. *)
let nobbles = List.mapi (fun i f -> f i) (classify_utf8_leader (Int32.of_int (Char.code s.[0]))) in
(* Concatenate the payload bits, most significant group first, 6 bits per step. *)
let code = List.fold_left Int32.(fun acc nobble -> logor (shift_left acc 6) nobble) 0l nobbles
in k (Tup [Word32 (Int32.of_int (List.length nobbles)); Char (Int32.to_int code)])
| "@serialize" -> fun v k -> k (Serialized v)
| "@deserialize" -> fun v k -> k (as_serialized v)
| "Array.init" -> fun v k ->
Expand Down
7 changes: 7 additions & 0 deletions src/type.ml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,12 @@ let array_obj t =
| Mut t' -> Obj (Object Local, List.sort compare_field (mut t'))
| t -> Obj (Object Local, List.sort compare_field (immut t))

let text_obj =
let immut =

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: no reason for this let or for sorting the single-element list, see iter_obj above.

[ {lab = "chars"; typ = Func (Local, Returns, [], [], [iter_obj (Prim Char)])};
{lab = "len"; typ = Func (Local, Returns, [], [], [Prim Nat])};
] in
Obj (Object Local, List.sort compare_field immut)

(* Shifting *)

Expand Down Expand Up @@ -278,6 +284,7 @@ let as_prim_sub p t = match promote t with
let rec as_obj_sub lab t = match promote t with
| Obj (s, tfs) -> s, tfs
| Array t -> as_obj_sub lab (array_obj t)
| Prim Text -> as_obj_sub lab text_obj

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this implicit coercion of things to objects is a wart, and I am not sure if we want to extend it. Is

for (c in t.chars)

really so much better than

for (c in chars(t))

@matthewhammer matthewhammer Feb 28, 2019

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Two reasons:

  • Consistency with how the language works now, and
  • The first version (with implicit object coercion) is "better" in the sense that the first version doesn't pollute the prelude namespace with some primitive function called chars, or charsOf, etc.

Having said that, I don't have a preference, I'm just following the conventions that I see for arrays; I'm trying to be consistent. Text is a special kind of restricted array, with no random access, just iteration. Once arrays act differently, I can follow that same pattern for Text. Why be inconsistent?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually would not mind changing it for arrays as well. This is motivated by the backend, that has to compile foo.bar differently from foo.length; in the latter case it must do dynamic dispatch on the kind of object on the heap. This is pretty ugly, and I wish we would not need it.

Also, it is odd to have a few built-in things work this way, without giving the user the ability to extend them likewise.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But this does not need to hold up this PR: You can do it this way first, and we can switch all over eventually, should we decide to do so.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, thanks for the explanation. I suspected that your motivation was about compilation, and that seems fairly compelling. If we can eschew the OO style in the future in favor of something with a simpler or more efficient compilation story, I agree that'd be preferable.

For this PR, what shall I do for the compilation part of this? (it's still missing here).

@nomeata nomeata Feb 28, 2019

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For this PR, what shall I do for the compilation part of this? (it's still missing here)

Hmm, that part needs a proper utf8-decoding iterator in the backend, right?

Well, let’s leave it unimplemented in the backend for now, and make an issue for it. I might tackle that then soon. It is fine if the backend lags behind the reference interpreter a bit. (You can run make accept to record the expected behavior with the feature not yet implemented in the backend.)

@crusso crusso Mar 18, 2019

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know this is a hack, but could we piggyback AS support of UTF-8 by extending the SystemAPI with the UTF-8 support provided by V8 (or worse, JS)? I guess that might make gas accounting hard for these operations...

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ssh, you said the g** word.

A utf8-decoder is not too bad, we can do that once we need it.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@crusso @nomeata I am working on the UTF-8 decoder. However, while testing, I found that both Text and Char are wrongly encoded when non-ASCII. Curiously, strings encoded by Text.lit env "<some Unicode>" work just fine. I'll submit a fix for both separately.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that we'll probably want to reconsider the object subsumption, but for now it makes sense to treat arrays and text consistently.

| Non -> Object Sharable, [{lab; typ = Non}]
| _ -> invalid "as_obj_sub"
let as_array_sub t = match promote t with
Expand Down
16 changes: 15 additions & 1 deletion src/value.ml
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,21 @@ let obj_of_array a =

Env.from_list ["get", get; "set", set; "len", len; "keys", keys; "vals", vals]

let as_obj = function Obj ve -> ve | Array a -> obj_of_array a | _ -> invalid "as_obj"
(* Object view of the text value [t], exposing two methods:
   - "chars": takes unit and returns a fresh iterator object whose "next"
     method yields [Opt (Char c)] for each decoded code point of [t], in
     order, and [Null] once the text is exhausted (and on every later call);
   - "len": takes unit and returns the number of decoded code points of [t]
     as a natural number. *)
let obj_of_text t =
  let chars = local_func 0 1 @@ fun v k ->
    as_unit v;
    (* Decode once per iterator and keep only the not-yet-consumed suffix.
       Each step is then constant time; indexing the list with [List.nth]
       and re-measuring it with [List.length] on every step made a full
       iteration quadratic in the length of the text. *)
    let remaining = ref (Wasm.Utf8.decode t) in
    let next = local_func 0 1 @@ fun v k' ->
      match !remaining with
      | [] -> k' Null
      | c :: rest ->
        (* Advance the cursor before handing the value to the continuation. *)
        remaining := rest;
        k' (Opt (Char c))
    in k (Obj (Env.singleton "next" next)) in
  let len = local_func 0 1 @@ fun v k ->
    as_unit v; k (Int (Nat.of_int (List.length (Wasm.Utf8.decode t)))) in

  Env.from_list ["chars", chars; "len", len]

let as_obj = function Obj ve -> ve | Array a -> obj_of_array a | Text t -> obj_of_text t | _ -> invalid "as_obj"
let as_func = function Func (cc, f) -> cc, f | _ -> invalid "as_func"
let as_async = function Async a -> a | _ -> invalid "as_async"
let as_mut = function Mut r -> r | _ -> invalid "as_mut"
Expand Down
2 changes: 1 addition & 1 deletion test/run-dfinity/ok/array-out-of-bounds.dvm-run.ok
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
W, hypervisor: calling func$92 failed with trap message: Uncaught RuntimeError: unreachable
W, hypervisor: calling func$98 failed with trap message: Uncaught RuntimeError: unreachable
W, hypervisor: calling func$104 failed with trap message: Uncaught RuntimeError: unreachable
5 changes: 4 additions & 1 deletion test/run-dfinity/ok/nary-async.wasm.stderr.ok
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
deserialize: T/77
prelude:103.1-128.2: internal error, File "compile.ml", line 2477, characters 21-27: Assertion failed
prelude:107.1-132.2: internal error, File "compile.ml", line 2693, characters 21-27: Assertion failed

Last environment:
@new_async = func
Expand All @@ -10,6 +10,7 @@ btstWord16 = func
btstWord32 = func
btstWord64 = func
btstWord8 = func
charToText = func
charToWord32 = func
clzWord16 = func
clzWord32 = func
Expand All @@ -19,6 +20,7 @@ ctzWord16 = func
ctzWord32 = func
ctzWord64 = func
ctzWord8 = func
decodeUTF8 = func
hashInt = func
ignore = func
intToWord16 = func
Expand All @@ -34,6 +36,7 @@ popcntWord32 = func
popcntWord64 = func
popcntWord8 = func
print = func
printChar = func
printInt = func
range = func
revrange = func
Expand Down
2 changes: 1 addition & 1 deletion test/run-dfinity/ok/overflow.dvm-run.ok
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
W, hypervisor: calling func$104 failed with trap message: Uncaught RuntimeError: unreachable
W, hypervisor: calling func$110 failed with trap message: Uncaught RuntimeError: unreachable
W, hypervisor: calling func$116 failed with trap message: Uncaught RuntimeError: unreachable
This is reachable.
This is reachable.
This is reachable.
Expand Down
9 changes: 8 additions & 1 deletion test/run/conversions.as
Original file line number Diff line number Diff line change
Expand Up @@ -157,4 +157,11 @@ assert(charToWord32 '\u{10ffff}' == (0x10FFFF : Word32));
roundtrip 100000;
roundtrip 1000000;
roundtrip 0x10FFFF; // largest code point
}
};


// Char <--> Text

assert(charToText 'П' == "П");
func snd((a : Word32, b : Char)) : Char = b;
assert(snd (decodeUTF8 "П") =='П');
20 changes: 20 additions & 0 deletions test/run/ok/text-iter.run-ir.ok
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
via `print`:
hello world!

via iteration and `printChar`: #1
hello world!

via iteration and `printChar`: #2
1:'h' 2:'e' 3:'l' 4:'l' 5:'o' 6:' ' 7:'w' 8:'o' 9:'r' 10:'l' 11:'d' 12:'!' 13:'
'
via iteration and `printChar` (Unicode): #3
1:'П' 2:'р' 3:'и' 4:'в' 5:'е' 6:'т' 7:'с' 8:'т' 9:'в' 10:'у' 11:'ю' 12:',' 13:' ' 14:'м' 15:'и' 16:'р' 17:'!' 18:'
'
via iteration and `printChar` (Unicode): #4
1:'🙈' 2:'🎸' 3:'😋'
Приветствую, мир!

2
П
4
🙈
20 changes: 20 additions & 0 deletions test/run/ok/text-iter.run-low.ok
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
via `print`:
hello world!

via iteration and `printChar`: #1
hello world!

via iteration and `printChar`: #2
1:'h' 2:'e' 3:'l' 4:'l' 5:'o' 6:' ' 7:'w' 8:'o' 9:'r' 10:'l' 11:'d' 12:'!' 13:'
'
via iteration and `printChar` (Unicode): #3
1:'П' 2:'р' 3:'и' 4:'в' 5:'е' 6:'т' 7:'с' 8:'т' 9:'в' 10:'у' 11:'ю' 12:',' 13:' ' 14:'м' 15:'и' 16:'р' 17:'!' 18:'
'
via iteration and `printChar` (Unicode): #4
1:'🙈' 2:'🎸' 3:'😋'
Приветствую, мир!

2
П
4
🙈
20 changes: 20 additions & 0 deletions test/run/ok/text-iter.run.ok
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
via `print`:
hello world!

via iteration and `printChar`: #1
hello world!

via iteration and `printChar`: #2
1:'h' 2:'e' 3:'l' 4:'l' 5:'o' 6:' ' 7:'w' 8:'o' 9:'r' 10:'l' 11:'d' 12:'!' 13:'
'
via iteration and `printChar` (Unicode): #3
1:'П' 2:'р' 3:'и' 4:'в' 5:'е' 6:'т' 7:'с' 8:'т' 9:'в' 10:'у' 11:'ю' 12:',' 13:' ' 14:'м' 15:'и' 16:'р' 17:'!' 18:'
'
via iteration and `printChar` (Unicode): #4
1:'🙈' 2:'🎸' 3:'😋'
Приветствую, мир!

2
П
4
🙈
1 change: 1 addition & 0 deletions test/run/ok/text-iter.wasm-run.ok
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
_out/text-iter.wasm:0x___: runtime trap: unreachable executed
72 changes: 72 additions & 0 deletions test/run/text-iter.as
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
// Exercises Text iteration (`chars()`), Text length (`len()`), `printChar`
// and `decodeUTF8` on ASCII, Cyrillic and emoji text. The printed output is
// compared against the recorded expected-output files, so the order of the
// print statements matters.
let s = "hello world!\n";

// Baseline: print the whole text at once.
print "via `print`:\n";
print s;
print "\n";

// Same text, printed one character at a time.
print "via iteration and `printChar`: #1\n";
for (a in s.chars()) {
printChar a;
};
print "\n";

// Print each character quoted and prefixed with its 1-based position.
print "via iteration and `printChar`: #2\n";
var x = 0;
for (a in s.chars()) {
x += 1;
printInt x;
print ":";
printChar '\'';
printChar a;
printChar '\'';
print " ";
};
print "\n";

// Non-ASCII text: `len` counts characters, not bytes
// (17 visible characters plus the trailing newline).
let russian = "Приветствую, мир!\n";
assert(russian.len() == 18);

print "via iteration and `printChar` (Unicode): #3\n";
x := 0;
for (a in russian.chars()) {
x += 1;
printInt x;
print ":";
printChar '\'';
printChar a;
printChar '\'';
print " ";
};
print "\n";
// The iterator must yield exactly as many characters as `len` reports.
assert(x == 18);

// Three emoji characters.
let emojis = "🙈🎸😋";
assert(emojis.len() == 3);

print "via iteration and `printChar` (Unicode): #4\n";
x := 0;
for (a in emojis.chars()) {
x += 1;
printInt x;
print ":";
printChar '\'';
printChar a;
printChar '\'';
print " ";
};
print "\n";
assert(x == 3);

// `decodeUTF8` decodes only the first character of the text and also returns
// the number of bytes that character occupies (expected output: 2 and 'П').
{
let (len, c) = decodeUTF8 russian;
print russian; print "\n";
printInt (word32ToInt len); print "\n";
printChar c; print "\n";
};

// The first emoji occupies 4 bytes and is code point U+1F648.
{
let (len, c) = decodeUTF8 emojis;
assert ((len == (4 : Word32)) and (c == '\u{1f648}'));
printInt (word32ToInt len); print "\n";
printChar c; print "\n";
};