From d0a4acfc698be54a5c574d78021977fd9e6344b1 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Tue, 14 Jan 2020 14:24:47 +0000
Subject: [PATCH 01/78] Per-board router

A 2D mesh of board routers (one per board) now connects together the
2D NoCs on each board.  This means that messages can pass over a board
without traversing the NoC.  The per-board router is not yet
programmable, but that is the direction we are heading.
---
 rtl/Network.bsv | 221 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 148 insertions(+), 73 deletions(-)

diff --git a/rtl/Network.bsv b/rtl/Network.bsv
index 3efbb480..bd435a11 100644
--- a/rtl/Network.bsv
+++ b/rtl/Network.bsv
@@ -146,11 +146,8 @@ module mkMeshRouter#(MailboxId m) (MeshRouter);
 
   // Routing function
   function Route route(NetAddr a);
-         if (a.addr.board.y < b.y) return Down;
-    else if (a.addr.board.y > b.y) return Up;
-    else if (a.addr.host.valid) return a.addr.host.value == 0 ? Left : Right;
-    else if (a.addr.board.x < b.x) return Left;
-    else if (a.addr.board.x > b.x) return Right;
+         if (a.addr.board != b)   return Down;
+    else if (a.addr.host.valid)   return Down;
     else if (a.addr.mbox.y < m.y) return Down;
     else if (a.addr.mbox.y > m.y) return Up;
     else if (a.addr.mbox.x < m.x) return Left;
@@ -221,6 +218,104 @@ module mkMeshRouter#(MailboxId m) (MeshRouter);
 
 endmodule
 
+// =============================================================================
+// Board router
+// =============================================================================
+
+// Similar to a mesh router, but: (1) the routing function routes
+// between boards rather than between mailboxes; (2) there is no loopback,
+// in the sense that flits arriving from the mailbox mesh are never routed
+// back onto the mailbox mesh.  This is a first step towards supporting
+// programmable board routers.
+module mkBoardRouter(MeshRouter);
+
+  // Board id, driven by setBoardId (mkDWire default is don't-care)
+  Wire#(BoardId) b <- mkDWire(?);
+
+  // Ports: an in/out pair per mesh direction, plus a pair for the mailbox mesh
+  InPort#(Flit)  leftInPort      <- mkInPort;
+  OutPort#(Flit) leftOutPort     <- mkOutPort;
+  InPort#(Flit)  rightInPort     <- mkInPort;
+  OutPort#(Flit) rightOutPort    <- mkOutPort;
+  InPort#(Flit)  topInPort       <- mkInPort;
+  OutPort#(Flit) topOutPort      <- mkOutPort;
+  InPort#(Flit)  bottomInPort    <- mkInPort;
+  OutPort#(Flit) bottomOutPort   <- mkOutPort;
+  InPort#(Flit)  fromMailboxPort <- mkInPort;
+  OutPort#(Flit) toMailboxPort   <- mkOutPort;
+
+  // Routing function: host-bound first, then board X, then board Y, else local
+  function Route route(NetAddr a);
+         if (a.addr.host.valid)    return a.addr.host.value == 0 ? Left : Right;
+    else if (a.addr.board.x < b.x) return Left;
+    else if (a.addr.board.x > b.x) return Right;
+    else if (a.addr.board.y < b.y) return Down;
+    else if (a.addr.board.y > b.y) return Up;
+    else return Mailbox;
+  endfunction
+
+  // Route to the mailbox (fromMailboxPort is not an input: no loopback)
+  mkRouterMux(
+    route,
+    Mailbox,
+    toMailboxPort,
+    vector(FromLeft, FromRight, FromTop, FromBottom),
+    vector(leftInPort, rightInPort, topInPort, bottomInPort)
+  );
+
+  // Route left (inputs: every in-port except leftInPort)
+  mkRouterMux(
+    route,
+    Left,
+    leftOutPort,
+    vector(FromRight,   FromTop,   FromBottom,   FromMailbox),
+    vector(rightInPort, topInPort, bottomInPort, fromMailboxPort)
+  );
+
+  // Route right (inputs: every in-port except rightInPort)
+  mkRouterMux(
+    route,
+    Right,
+    rightOutPort,
+    vector(FromLeft,   FromTop,   FromBottom,   FromMailbox),
+    vector(leftInPort, topInPort, bottomInPort, fromMailboxPort)
+  );
+
+  // Route up (inputs: every in-port except topInPort)
+  mkRouterMux(
+    route,
+    Up,
+    topOutPort,
+    vector(FromLeft,   FromRight,   FromBottom,   FromMailbox),
+    vector(leftInPort, rightInPort, bottomInPort, fromMailboxPort)
+  );
+
+  // Route down (inputs: every in-port except bottomInPort)
+  mkRouterMux(
+    route,
+    Down,
+    bottomOutPort,
+    vector(FromLeft,   FromRight,   FromTop,   FromMailbox),
+    vector(leftInPort, rightInPort, topInPort, fromMailboxPort)
+  );
+
+  method Action setBoardId(BoardId id);
+    b <= id;
+  endmethod
+
+  // Interface: expose each port's In/Out side under the MeshRouter names
+  interface In  leftIn      = leftInPort.in;
+  interface Out leftOut     = leftOutPort.out;
+  interface In  rightIn     = rightInPort.in;
+  interface Out rightOut    = rightOutPort.out;
+  interface In  topIn       = topInPort.in;
+  interface Out topOut      = topOutPort.out;
+  interface In  bottomIn    = bottomInPort.in;
+  interface Out bottomOut   = bottomOutPort.out;
+  interface In  fromMailbox = fromMailboxPort.in;
+  interface Out toMailbox   = toMailboxPort.out;
+endmodule
+
 // =============================================================================
 // Flit-sized reliable links
 // =============================================================================
@@ -362,79 +457,59 @@ module mkMailboxMesh#(
                      routers[y+1][x].bottomOut, routers[y][x].topIn);
   end
 
-  // Connect north links
-  // -------------------
-
-  // Extract mesh top inputs and outputs
-  List#(In#(Flit)) topInList = Nil;
-  List#(Out#(Flit)) topOutList = Nil;
-  for (Integer x = `MailboxMeshXLen-1; x >= 0; x=x-1) begin
-    topOutList = Cons(routers[`MailboxMeshYLen-1][x].topOut, topOutList);
-    topInList = Cons(routers[`MailboxMeshYLen-1][x].topIn, topInList);
-  end
+  // Board router
+  // ------------
 
-  // Connect the outgoing links
-  function In#(Flit) getFlitIn(BoardLink link) = link.flitIn;
-  reduceConnect(mkFlitMerger,
-    topOutList, List::map(getFlitIn, toList(northLink)));
-  
-  // Connect the incoming links
-  function Out#(Flit) getFlitOut(BoardLink link) = link.flitOut;
-  expandConnect(List::map(getFlitOut, toList(northLink)), topInList);
+  // For routing messages between boards
+  MeshRouter boardRouter <- mkBoardRouter;
 
-  // Connect south links
-  // -------------------
+  // Set board id for board router
+  rule setBoardRouterId;
+    boardRouter.setBoardId(boardId);
+  endrule
 
-  // Extract mesh bottom inputs and outputs
-  List#(In#(Flit)) botInList = Nil;
+  // Connect board router to north link
+  connectUsing(mkUGShiftQueue1(QueueOptFmax),
+    boardRouter.topOut, northLink[0].flitIn);
+  connectUsing(mkUGShiftQueue1(QueueOptFmax),
+    northLink[0].flitOut, boardRouter.topIn);
+
+  // Connect board router to south link
+  connectUsing(mkUGShiftQueue1(QueueOptFmax),
+    boardRouter.bottomOut, southLink[0].flitIn);
+  connectUsing(mkUGShiftQueue1(QueueOptFmax),
+    southLink[0].flitOut, boardRouter.bottomIn);
+
+  // Connect board router to east link
+  connectUsing(mkUGShiftQueue1(QueueOptFmax),
+    boardRouter.rightOut, eastLink[0].flitIn);
+  connectUsing(mkUGShiftQueue1(QueueOptFmax),
+    eastLink[0].flitOut, boardRouter.rightIn);
+
+  // Connect board router to west link
+  connectUsing(mkUGShiftQueue1(QueueOptFmax),
+    boardRouter.leftOut, westLink[0].flitIn);
+  connectUsing(mkUGShiftQueue1(QueueOptFmax),
+    westLink[0].flitOut, boardRouter.leftIn);
+
+  // Connect mailbox mesh south rim to board router
+  function List#(t) single(t elem) = List::cons(elem, Nil);
   List#(Out#(Flit)) botOutList = Nil;
-  for (Integer x = `MailboxMeshXLen-1; x >= 0; x=x-1) begin
+  for (Integer x = `MailboxMeshXLen-1; x >= 0; x=x-1)
     botOutList = Cons(routers[0][x].bottomOut, botOutList);
-    botInList = Cons(routers[0][x].bottomIn, botInList);
-  end
-
-  // Connect the outgoing links
-  reduceConnect(mkFlitMerger, botOutList,
-    List::map(getFlitIn, toList(southLink)));
-  
-  // Connect the incoming links
-  expandConnect(List::map(getFlitOut, toList(southLink)), botInList);
-
-  // Connect east links
-  // ------------------
-
-  // Extract mesh right inputs and outputs
-  List#(In#(Flit)) rightInList = Nil;
-  List#(Out#(Flit)) rightOutList = Nil;
-  for (Integer y = `MailboxMeshYLen-1; y >= 0; y=y-1) begin
-    rightOutList = Cons(routers[y][`MailboxMeshXLen-1].rightOut, rightOutList);
-    rightInList = Cons(routers[y][`MailboxMeshXLen-1].rightIn, rightInList);
-  end
-
-  // Connect the outgoing links
-  reduceConnect(mkFlitMerger,
-    rightOutList, List::map(getFlitIn, toList(eastLink)));
-  
-  // Connect the incoming links
-  expandConnect(List::map(getFlitOut, toList(eastLink)), rightInList);
-
-  // Connect west links
-  // ------------------
-
-   // Extract mesh right inputs and outputs
-  List#(In#(Flit)) leftInList = Nil;
-  List#(Out#(Flit)) leftOutList = Nil;
-  for (Integer y = `MailboxMeshYLen-1; y >= 0; y=y-1) begin
-    leftOutList = Cons(routers[y][0].leftOut, leftOutList);
-    leftInList = Cons(routers[y][0].leftIn, leftInList);
-  end
-
-  // Connect the outgoing links
-  reduceConnect(mkFlitMerger,
-    leftOutList, List::map(getFlitIn, toList(westLink)));
-  
-  // Connect the incoming links
-  expandConnect(List::map(getFlitOut, toList(westLink)), leftInList);
+  function In#(Flit) getFlitIn(BoardLink link) = link.flitIn;
+  reduceConnect(mkFlitMerger, botOutList, single(boardRouter.fromMailbox));
+
+  // Connect board router to mailbox mesh south rim
+  function In#(Flit) getBottomIn(MeshRouter r) = r.bottomIn;
+  Vector#(`MailboxMeshXLen, In#(Flit)) southRimInPorts =
+    map(getBottomIn, routers[0]);
+  function Bit#(`MailboxMeshXBits) flitGetX(Flit flit) =
+    flit.dest.addr.mbox.x;
+  let southRimDistributor <- mkResponseDistributor(flitGetX,
+    mkUGShiftQueue1(QueueOptFmax), southRimInPorts);
+  connectUsing(mkUGShiftQueue1(QueueOptFmax), boardRouter.toMailbox,
+    southRimDistributor);
 
   // Detect inter-board activity
   // ---------------------------

From af7228266b44ab9b0b0308ea64b25a861a092f5c Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Wed, 15 Jan 2020 10:48:07 +0000
Subject: [PATCH 02/78] New NoC diagram

---
 README.md            |   4 +--
 doc/figures/fpga.png | Bin 17842 -> 17084 bytes
 doc/figures/fpga.tex |  84 ++++++++++++++++---------------------------
 3 files changed, 33 insertions(+), 55 deletions(-)

diff --git a/README.md b/README.md
index 00f6a84b..01e744a0 100644
--- a/README.md
+++ b/README.md
@@ -136,8 +136,8 @@ accelerators](doc/custom) in tiles.
 Each FPGA contains two *Tinsel Slices*, with each slice typically
 comprising eight tiles connected to one 4GB DDR3 DIMM and two 8MB
 QDRII+ SRAMs.  All tiles are connected together via a routers to form
-a 2D NoC.  At the edges of the NoC are the inter-FPGA reliable
-links.
+a 2D NoC.  The NoC is connected to the inter-FPGA links using a
+per-board router.
 
 <img align="center" src="doc/figures/fpga.png">
 
diff --git a/doc/figures/fpga.png b/doc/figures/fpga.png
index f4d60fbbef0d5c072ad8fe7a4dd52857311c251e..c05a5e99609a3857048baae3a4ab2a67d6efd9b7 100644
GIT binary patch
literal 17084
zcmb`vb9iN6x9_=QR-9B-tcq=`V%w^?V%s(<m84?Zwr$&1#r9pl_dTcko~Q5a?$b}N
zCs=>1y)?%h<2yfNj2$K~EA|N<8y*A#eUcCtRs?~-z5x3tu;9So`|fcs;2)f=xP}7=
zgwFZTFIY^EDi{bv2$B#MRCZlC({a&OQhVWNPel|J@~!IqMXLp7mldi~O3ZOpzOYc@
zRbC&hr=_M<u4!J{U~_75sHJs^pOAxfx<M?J2tS0+giZ)Ybn`IS&k{^Zx($uiyct8l
z&z|Hk>2>awahP^EMo*eF3I>ls0S%3)2}vQJ6&(l;bs|n7-$v6_{2ELs{sPgPCpT}y
z`A10MPf#qGBE|q6Or403UyoU|Fj;&OolePPv-{o24=n+QgE2c>Ti?pJw>NY&G->&v
z1ZuS%+xBh|d;6N&+M7$|1(cO?cQG_r$ol&F(Ik2pkmY*I-9Qvh??T+n>@2(G(iX5C
z9yZ!;_edqtC1=GRPNc6^8^R!kNXg2Q#fdvPIrS=OeE->a{W<%!u&=N0_V#vaN}l#u
zPnE)lDNcN?)92%Sy;TL=E*@=WeEjV3YF8?eX1Zc?JFAkYS}P5UAR;`x#eQGX$jC@n
zH)Dq^bFxsexVI$3uyz?2DMXrVb$OYUmGyXjk$@#IB*fI%`03?k*Ontp*dRp+yOQx}
z(K@6|WTnl`L0lXbq(AbTax{?^r-3!Cvhw2oY?TO>I8NN-{%>_{Eps;(C9tuwVuH5L
z&odGegZm9a$~%HbtmnDWlBJPgVdY^N&F3gY`q|bn(DCsr9-Yy`AgPDva4DIX6gdYB
ztJQmF<ASHGi4sgu)5Vx_DX|^#haLZ7rHcvqfjg06P)O%V%x&xm#+v&FA$-Q?^Hit!
zwLxj#sn+ABzA?S=jcKbNpJqgyel!%7ZJVU5`U~?H4MocE#KgpznVF@frJa$9wwmak
zk%=|MiXWUIPwFt$w1xp+EU~_75S5mcFflQymTN+RhTQvxz5h9-#`F+7%?(`k7|3PC
z%%_Dn-As1`9E8i~$w5Xou)glKRUlP}D(bEgb>H~)vq33?PCFi)f7Z*lGd+B2c4^pp
z{Ccb6%;n>Pl9H2!@>N#CYja%$#Y*vTL)y%4eDV4m;yhtuxBUb)q}R*ShvYNp7dO2!
z$?HG(yZR&ZdU~z2G&I_EX4A-Gm0yP)@0|0fn`v9U=|x^gk*&fR?DP~V;ftl;?=wQ6
z7tb{{3+wj5V6kK6Np;$oKU2;s+&?v%y8VeG`MT<OcS_=P**Aa!1**C`6jMy*jG{MQ
za|>?z3<k27YF4w?LK(B2yebU!Km~1$jSuXM$TH6)Rm-UHi#@+7mdAC}=EP;*1X_q{
zJ{?F0#W8K(c~B}kNh*0D`^Oc+H{f|1h(;SJ&$ef5mSmFXOxm2Xe@YH@s`&;6GQ(=}
zk#W{YJ1Q|eSeD~B$Vq5PwAT4mjW@(W0|~lx%%e%(NkQqJWBgF1W3m#rQz_K7AAp?M
zOgq5ZlAYZzzNd?mCuRMd@O9qtBEk07mNZ+ugiZX?As_APV~?3S`tBI#9pqbKE%8Id
z)Zlk|^b<b~d#ZePo35Aq=N;qxrU8Sjk21Ju1QQU5D)Z`_qT8Ps)*7TKdi+Jl<8P+c
zl#9J_I|ioNhVM~Uye8T7yQ$V8_|bFS@kQ<Yv@3pDq~l>0cc=HCdlu`**IN#4BI!cn
z<hAn$1kNl&s|6AFr$h69x-DL7fq|I5nwg;^b);!KQOyz;%9C1bpKRkKnB*`SrdoR>
znPs8ZSi;yl66!ofs}p|?H%ghA9pWf~!fJEu`jPF1o~CSV-y&=~H23)n?2-mLw2yH^
zhdr9@m*8(;ZIJ>Cjxc-T3(B63gy}+k<B`Vskq&vWRYt_lqZTCIubopJVYHM@A6i>@
zIN~v%3@ra#SYgFiwUCg-o|{Qs+vns(Iz(RAgMs>>n7vmq7#XEyIw|<(oSkaOQ$;km
z7pdn~M~wEh+19-#AjS05P6|Z$oEPX5xQVP=n*2nNc6yUe6v>;@+Six99a-^qN>#2s
z$l7l9U5wq+xUO8OsWOAQ#xr;r)7-8EYb8@SS976tMOTwvw@#VRmU-kXhZQGie|yP)
z>y8SsqTw1|NO8S(hwwZ!5!Cvf8MP8_d#v2J=)6B<lvR~;)728Y=zM^f0(MMySjvdx
z%+`n3sR9NfWqR77v~G<_EY3rf3JS%z$VRR}ziP*;9jr%ge62wgH0B-D%}t?0vQv?m
zuaEkI7z&rC#*f~|U-?u<;Yur_&!E{96;gSw=1eMFqcUuMNR)0uYIGu6oeOcylkMm!
zZo}z#*-r{CDkaPaD^Gp+hfowo5AifYbRZ7a-2XFABFkmX+sRm>EML;k^;7ZWW%9V0
zB;w41`wS88nke&@P);;x@N=ZSa8bW!{!u?=s!8`g%JN+>P;9yb3#%))(#Xl9W?4hR
zYA%Hz5e@vAy`=GV4OfOxTWuq@o~jp-K?`izJJzjum}}&xsIEFPa1bFZ#q(6SMKZ4T
zs7=)68Q#Y?R0dn$cP2187?5ux^WD?o_i!&J6wuU-{K#r@NzvA>|JQ&ut-oRuhl@!L
zn#7{7xw&>k4fL4^L}o3p9E#6p1I_4ZCQ(23lU$}~Y$M28W5bp-4An1Lg91|tE+9cm
zA7Vl6pX?O;NBbBK{#HjrVd5Q>g4CRtX&)q4bpBw?=HhK}(6YszU&!JR7@V5hY#$3J
zV|3$g3*m78QrM(g^QyC4=edyB9?RoUjS*)XlFF?cDsH;B%hsK+`p1>B_cH@F=iL^+
z5@%fkH9>Qm&qgP*;CQE1mr`}!Kqi>4*sQ+NDY9jUf*P~!RGN&Mi88X*Gz1gJH`6My
zngD}?64sm$L41m8IvexNJbrqJL*%f2(=QjPL*44?LJJe4s{uDF)|RK>CuMxJy%0)O
z&Q>!|^|K&2X(G0dlgVfgoDFliMtJ7P2~JZD^q}LNMI6icID!b1jxZP~^qYNs4hZC^
zoooPaTf5j~CoV?s4-?IzVvr$=A-s1xlNJ+gKe8_n5BN)IpPPu9a-4a`JBPeJaqeBO
z6Cf@$oAqbN-s&qv{{F|og!zu8)mP#~$ucjz$|>TGM&?{eY)j{+RCq^Y5Dd)iEkpE$
z6!V!Vw}XNB^icI9Est4x$SYrn6L7E~yc*Wj?ddYR*+{%Zo|eB1JF0ziJ8CC@164U!
z6rM-WJ5{dk^#188(--Lno;uLdORf;LfeznK*#R<OoVZf*T3{`rLl@7|*+MDOsAbj=
z*B30;nG~GO%B>5yrg-t*%UyOs+oB=@_)_9#C9+^iMk>k(PZQ%<6(jW7AI@MWGbKpH
z>Z2Xu#a=CvF=|hSlH(rCaKGN9s~=;J3??vg+KV&meT+u5ys08T7(<5^I=;`)S!wUp
z+QrE!dB{1%`!3Q>v%>&GJf2}`*;-eK-l;`EoG`4_Ox$~KwzP_po{Xic%){USJenN@
z3kfw9pYJF6rP;6oJqbg`SRla4ndUqTtvOCRoXKYi;K4z`&&B=64n`44wtydnve?=4
zjN$FR2QJcnP9j;~TzC&nAZ|&{mpyv{EkGhf9*BPzueCP+!)zKI5CGwMc5#sz8>{ax
zRH5CB7IOND?pJ(#dU|?DVGSTO=DBC&XUbB){P%=dfoQF@zvGQCvE9e}TdT{(IgWE`
zbgZ?t^<WHvUX4+h^BWS3*Qc;`b1mY$M3eN%L#5)mA1ag8%a%wX-^t?7{{Br#NvV6d
z3xVI>-qz62cz=KAz?CJA`%@4*V1f8tze;2g7aVS~K4;FMr?+{w+V~wsm^kkK@3Nt(
zsi{A$BCwg8Q-QkH*Li7aKSlH-X?cB0SO-_i)gT~D;r#R>Yopo`Uqfq87!HSwtmerb
zChVVJWFC$2^=mQiZnZiMvwfadAv?PR_HQ3-ZH(_`jT|!R4Ldsq{gW>b7s#vK!D(qe
z(Sn%pK~GOlWE6V=Fvu8-^Yin)sv5wtM?7fo{~C%P*i#;cdMYXjt*zW3eL@dJ53$}c
z&v+U@UdmHIEIY&Z7Nr^?ARq+z`-4DabcNLl>gq>_hcI$lIK7T`D`IFb_;oOX@-8t(
zfc6lNi%%h2^xVwXpb}9~QE>tu_6V+)@3G<wYJ=gUFlzf)ogR^vrY4z8iX)(pcd)0E
z21iiin8ya)4NGE*&p0m@tvjC%QgvFL-(E*;M45EjzqgjP4Mw;e5Q|^Ej3<XTKJc*6
zOFDpjYinzLvELUyyNTNcH@bvjoehcJOM8*UyB$qsw-=VMzp4BGPHWCz`yEp=7BPbh
zsxqH1Y245?<(ZwIjLTzsib-lgEljO6+w)1%{iz#XY=w%~x)JW$07x?{bFp84>P7zA
zz7bv1EUqH)9Zz|Jm%<5A#S=7#ux1>e9EyK+$@m&}d<V!ZNtrfmvf>2f4r(^V11%r#
zKxI{-ILF0{`~j4jcB$Jlf&=9TK=?#Za|6ES<{{--#*ks~`K*V{VOo+}El+BFz?raj
ztt=z@_ctYNbYe%Yj>4R{7-eI<OXWh4G?A$^;oJC6YcBjAZ`R8GNbqa!{<8t*^)@Ci
z_3?bBX3b2|aKrW~&mL<TLp>th50oAQ#wni#$U!J(5GY~t5v^GJ!I1A%vjsJ-wl1+S
z33wyg$YS9UTr=1*nBmPuG>yM4lxOFA<5&`~6>(3^`R7@#nhHuUWVmq*EK$BUZ)l_>
zy3lrFf(YaNg4wdqpOB4=RKK#GjOazA3B{EKtukN2_s?Y&uZCqEBRL9TX+%3YfF%{W
z=TIRbjHhnp*~ia*^Z#DZ(IwC|k;3puU3-@h@7v2jqesQ(K$3j!iwkEL?Q|m|O}IXt
zxVJAX@%0c^iDcm48b<+-Q^2$6?8HcpEuzM`LanGVVR)q3vgIWWF1n*)PA0<hZ<!vI
z3*cq|WyLVqqD;o`(R#Ia2i+l0brffgJ>+fLaF}~i(OHwiT${rExD&QbKi}QVj^D7K
z<n(*-Mo!me$bAe9v_@m*arq~=fmoYs{_J#C?t*{k^&w{D;lMSyAgywMV`&-1=cD!9
zS>YvrEwNu|O_$6IkX1ttzDR+FUK95e5E~q13<_d3eOe;Pdg;psUDOzR$yI^be*;>0
z@)PmcxMC;JB?m)p6TidgNfYbW31b9+>g}W93;?}S)nR6oriraut!(yZ<i68EwcDu}
z4|=aMr|fVE7#NYUIFxO5cHh64WKr|HZ$J<oge1d$1<uZm{4+J}%DLe8{8hcW6?LPS
z0{9S}@f5<!{_l}JV<(;<S5q6-dzpa2Ux@O_9Zt#J7+v8osVg49JYg$2Ca!wM5sY@K
zh1Ny_fox1qZYD!8a7GD1TcxQ`RvQz_B2WaM$G+@s81UQkb#|o9qA~6e;6))+dp!-{
zps?sb;==v&>4ElN&YuhVZ_QvhWFtQ1y|m5_ge<kJXt{DJkclveHBAjf8_hMsyAaT7
zu}^v^axhay&y)Q6bX`_}#fiDdM>jXxObe_hE(%-+LRKbXTuN9bYJ9Vy{0mCO%uGvv
ztQ|B))Wb*L;a6L3h84cZ@&tY4dD3DG-)Fd%1R3JlX$tNBsOyy3tdF79p7z+pni=Bw
z!|)0W(^-y>$t?eChp=7xTVU#2poL7o%jG<JsuFE1TyuFbhf{ZzD5phAOM)?<T{)P)
zFD6uF*b-0I`V7sjvE}TYzQV0pT-6tGbT49J+*3hIDF-{!1`}+FB`zmijv2?uid5dO
zsGMFpKOufA(ZOhspC(w28%eJ%6>!gtDt1m;=Ov+9tq=BF%03w|Mm9A^$&wJ(yV0%v
ziE4w1Fc6LRyTqm-Au<Sz@{X?agvQapw}B!HWXgRVGkNqA?mBm!-4svuY2X1Bmz5Q{
z_EC;wE;Y<eMz(xMAmhXtcI@>m%YD6dB&C5Ck{=&&x?E|_IGC4<x~>%rpVa;ufv&Sm
z!{q-}HpY%Y^w2@&SBTpgCws=Rwe%erbmU64h7n}*!FI#Z)&PS8PxFsmVuHM;gC7Zh
zU1xo!jXw8%GCO4Qfn5E@6?OUcJe6#yZc<3k7U6jvPXSqqx#YO5^AH?~v^*R3vKU?V
z^Q6LZZ<CtQ_934Y^YBy>!M2fQwwFb}FNDz${1{4(Tgt+XbsYNK3)Z&KjZ=XqokFxk
zrs|8?!Iy<clvVQjHowv1Q&Hwkr(deduM@`a0}MK8-7P<Ahb*x;`7E0p#^Yg^DEM(^
zFSaxku1tsy3Hw}uLvFRXF(~B!lOz0V=U>2w5Pn{vm8ejI9ku9N3v+!A0|HS<2?caB
z?iAL4hz|y1G2-}P0L`epYPY-JRncZ~Ivj9taC9vKaT^W}4v3H=3DiG&&_DwJzayWN
zM=vl?SXkKc@o{!`w&qOdEzL3rL>$-T^YOmk>hf^bnex9D9u}W5F`f2D5{8C`+S+*f
zW%x4X@)bi=cn3{tIdPGar5kE%EzHel_{`XpVUdw1M@Ft69tO8<(Zc#zf>?A7rz+Qx
zG$QKUZVqB&Q9xb4DU}jwwb(2z{yTc4#D$b885iYplCDT0SCo>9O^LeOM7FnMHkI}5
z_vk+5Dlz6FJHkhc#$%H1|3;`NIsb=Hse7Hv>+p^+fIY)1ES^=>enpO3czbyWIn)0x
zD;vF|Nr7>2c$k_Mi;T}BudSx0hOA}$DSJKV1|WKbzJ3{*CJft@VZMy4CV%||sN!+P
zT8u4ZzSQv+3dPk0kTd?(sNEQ~w4Ck#NdP*Qszur?w#2{{uf*kGRaeBNcKQ_o=dS#J
zD3iy_ZMU1xxXsEgwOLlauXzA)nw!3duW-u?q@kga78h@_-S!uTe)H^jcK2QdEr$XV
z51r&p+E&Z4)4A1ZwIMV#v{<Q-*ti8<DA`8ZNlBT`y-}RN=XLm<pK4;wQh4uWa?O&*
zq4ug*nV@#HX2OW+3gabcwFZ@Fqyi8f4h1`}?FEx+kc8^xkWf(LX`IZRbxsdgn!<Zy
zwEG8%_RIt<Hqv*DmA(eR_^&qDRH8ir7gM{@tT?ErkXF0kLvz>zz>|VLAT=~%I1cv3
z)p?GMGQQO~-e5+(Hg*h&C2qS;^cWMo-(qNRI!HBh`_QwJkeM{?+(hUZ{!0Z>;Xs8(
zfbWhP?hD1uMPk$*a`Y^~<+e-Zg`JO7Wvnh_5`5jg0X_+co#eQVNYj>7Z<^z+FhIwU
zBt_MSmbObd78b2**Uijr%s!xa<=GyXkN`wUM#&{;TAj3mQ|Sjllz3*LK)?bR5M7Sd
z_)Ms$MpED)VD<Wu^0$U#T4iwT6Q8N&rNTPt)VHszeu(k79$bov(snYTQ}qN(OTmbx
zZr!s;?R$}Md7Yf&v|Z^1hVOVOlZ))EUjR*2+5*5Nnf!x8$!}D4Ws~8CUeCe&psZ3Y
zMfK_AFUEz2@5rO$!HmU#^11g8UF07dQAfPZWbM9OKKl=C#LO2*OhQ$0G9&d5Obj9E
z@1GDFd<FE<xRBle+j^-RtvkmrvW(fE(@2ote#&Z>>NR$vbX3eA%F7u#az;Ydha{{l
zqX6)6<SqWcP^6U5OU*rua3fl6UhG~n*%*zZ(WL?e=#Cn{Gl6KNJnBtUsXfi2pr1y#
z^F6Q8Pv@UGXjp9g<40s<lVFjO=YDfT{w;$G=w#B;<Amct?EMAO6(zK2fv_jFpyK0-
zl_lCS8K_Yl;8YY>XF_e=uSey7*E8GfOT<|FpuLR#ls5>?$Ts%Cwt15~Py9b2q(Dwl
z05Wu1h5sqC_NMuN$B!U1iTn1$ko8KW()aA|opb%c&RcIsye=RgWL_sMC;bcYe2iHk
zKx==8(R-6W6R?!#O~)a;M^T;@3Qyp<{^YChYT8dSZpl;rk%nwcpe$j#`vWIJwx*se
zqiqAU*{m^DaGzz>0O9uEW=LEme+!+G6OACF@fP=;s7BnQ+eKM*aCb_6l~~{~enCMO
zuk<F9FVhDoLOq6`v!{~@0&(=*%GKZXyfZ^^bSEFkMBBq`MIE%#d<|eoZUSenh}!q3
zx<ALmexXI}Cln}8GxYz0=gC+`kNveU>xUXNzSN9}F(C0~NYKCo%`zQX(Dr}Jobg6D
z8v!T*)qxZdat&_?GYWekXkUOT!IXFB_retj1d!RTQ3M-PNew)`i1`M44~%aNfg;&D
zvc=N6s)I_QF88Xtqb2mh(x3gTF#Pc;QDdsJQU5FmgFujAA_SUtwDAEJPvxi`?Yo_%
z;Lu1@dSqSuAkd|A0pEwGcy%=*ILIAe(v4dQ6Vp?h!7$b%;%(^Z_1RulsxQ>Xk%g}B
ze&2xzQbr9~OmVdd0yJa8K9Rn~Vu48A*%n~pr^T`yB;7ZG-RoaTNxJsxN)GA*9N1s0
z&<HebvS3Ej7$=${`E*t$wEjZIEa#DHU5QqNSp)Yv2;9Pc=$W_mpS#~Jb<Svk@E%Y6
zheX@VP>ktBD+s`R<*A3flxfKrgD0rN&~D=?|7*#o$X#!2V~CGKT~@?lm`W#0@N=&6
z{_V_wHq19pa^Il1O^jyMW#_m9U2dD7QV&6-rky-v=O4VJAXUpZGdiM;J3K1!>Bba6
z?RKLvh=mF1ylsQdytBOS=(zVn3S)6ut_k=7uq%vS**)nx$i--BSL|i=Hhp@L*i=TT
zv1W|Cf>q#Opb49d1;82rKSl!s(gDx1%EV?Z&q2;9M<GGP$?7Fa?No_F^WoqY>)N;y
z5nz<TzRyre<O`1Y^|V5U<Cf#fom?Z7Ne~qWP68%%@j^<hQNRCLC<38@dB$btCnR%t
zPkGT}9*1Ey>OX+T{8!XCF;2itTVs?+bf`OuzVd7oT;A!S_`M^s|LfAoqotd#o6Ph-
z5J|LfB0wY=f7e$(7$?f`{0HB{LH*}h*Me5-jL%`d^B!~7?vW5R;L|??@ob)JHH`FC
zy9}}}7DEWNHi6uTPY3ume1CX2e;p;bM&>_2u6c~$4-UYUE*;q>KoBXM!K^Ep-Y24O
zph*zyO*N8%Rzai-De9R0T{<RaW&_Wbr1fY!+61VPTn5%S`o57-5zj`B9Df7bZckC?
zhw-7n1Xb27EJ=Hr$p9B*745i!-bleqRht7LcRi2#?>OB=slpaY>CdKB)*Bi7SNAl?
z0do0Y$8sGa!vPfK`Nd(EI1?#j$rlPfz##<<3xYP~FSs6#vwJiqrfI=ly5=kt1jnO9
zIk37kh6wZhHm%n)zZT3YuWDi-QB<=v0@a-(fE+6P@4V9gDGZPPiu2$j(~>Y#!7J;1
zS3xStb$&x@8{WU(&VthkJr8c97!wjlr-V05g1}zM7Wly;hAouhq|}?n>41yt{<#y#
zospB0{>h4^R;%dSb`OZy+cPpXJ=`uXBdXK7i)Q)Fa!M8qWbi)WumKwK@#%?HyXkT!
zUkXr>%?^jof0z7vw*f*WL8_>uvkoK=<yXp!r^`|a*LnUQGksQ4fparuH&32a30!3-
zoNTG%+qOeSHR<U(N=os%UT=cmU0q#ORTDcq`H@26#Yqb&^RBQ1yw9A6H1o1^G}!C0
zCl(i7UhdCOAmhaIcs=fymY0+DBYp#0N(wy4*2P6mN~+hOx>r^2yK!UJoY{iYUXq$N
zz>(@+mA8{RH69nodpcR(luQ3yUfT9I0+&#iC^!(0i;Ej!#E<A_q7++kad%)Y6nw}`
znOl4)JYJ%eRVuV9GFy=O#L4-A8$g8te{*vqBR>R#j89QfT3Q-X{tu;_b_2NHKSR{R
z;ycR+>N&ij{QWuFk<0V(vL^^3pUG_k-TEi(<ktEqAgRF4S?I~bf1Ycdx8?}vQ#^Wl
z{|ixh0C>_)<q)vP-fJWXwvFDGMqoMzzNnwJsx<ryE$IVN8^a#Eff|29`|MOQ0xmCg
zOI1VXFB=h&->?z(u;a@zkyO5-im<edOjHKLX~0D|%n$-}Z7Bq($FpqQ-$FN^LQ_-I
zdcEbXDw;!pY97d5<xr@8a~s3$6_OXn!AY&Ft9uNKm)+d|6dT@@6Q?&l1d|mfOH!74
zn4j3yW0U|D(@g_d!;9V|aJ4(6v-707&N~r&|G)14fulmBFoIiKJ&5=rHH3C-CX|nc
z9I`Fi{Cd~hoL<&HR7u2-);GSb+!_vNN7NtWR{|N&@t+^e|Kd(Uakqe68+e>Z?@UTb
zGN{;VK-?3?r>)mnAR(0~M%J-rgDyPk9!nV!_4|4(Me*xgjvYRG-~*2$ab)kAb7u0e
zG^+15MQcQ4!8fuzK}Yn)6SA}fDdS6bgE$m{r<&!z1Y5b;-XYI6;Jg-ZR{w>o-0#tA
z?eYfL(q-!mxMP1<tICOu{GzOYL*>HLS_Ja7fBVWhhNjEMm508fi~BgFq-s`ts<$5}
zwN3rcS^#(vMF)lD-fLL^pzD?$DAw~uVGOp=WttNg&Dl2Sht5cx{ac#Zbt8fV_L)0`
zsTSYV7f#5Gata1nb=-%OwMOo2^O}}42(_{iIrEsBS9S?eS#)@DC1wR<#qG+F_x45w
z2innCqYBeMF|vhbm$m&DJ$918U#aCE`ah-i?h)IB7+KA@V9^9;@s0v%YH&{v5OA?l
z*t;e6$qJ<)L!`a@{Cv|{e_X1WoKlm6h5S%Q>l%eMl5r7;4&)eYdHOHHsu=Y@h1EI7
z#iRV{{BVXPmhqW(rMwMwe1gNW;{zu;u9r@6D%R7IMyK+5VAC<u(U3tO=K^b7=Y`K9
z&1{PC-<~s67y4(*v=#nhhKZ0G<Py@RtEwmw@?}kQ09N^40HjKZ+hW+6kUst7ex(98
z84*L(5nuiD^~yw=AT#0mz9;KI_e5~hZX*@QSC~wXr{C&fnkj+BpjbYB1j)-bJ(Qg*
zGjpY}l;V{09(&%=F4SjLfMbD`v2$`6KcAbjt=si-Z{XjQ$y@>=B73*4SkrmwNPtU@
z0*_!u3keWI!1`<J)(bLQb%Iea5~<_M!sn<+rBTAj!C!MH`>F~D0p#p;Nu<PI(EmJI
z&p4j}iFTIy2bz^QtN)}~KAc-;(v+pU7S$`N)-gGd!BrG>qm&85qbIKLR}&lNo*H*S
zq5oS#m35f?yOCK%p&KBa(9MlYF=>;P^oMoAk&j|xCc@HOqy-VaBt-qy)~Y%xth7)6
zUkH^Vj`~+r%{^u9mnM1o!v-V%?Y7jNuLS1Y{2LNy;_UO_rVG)5S;r>|C@Dimu{-LE
z!pYYDmrnU$S=c9awjvi^TH++TTN9^eIrpm~XW^aRk6vLrTrY6!FK~+~;TKz_<ZO+N
zg29c0D(iD4SpOFgwKl%N11l9})dB?i=X&v!g9FrU^+h}5a7w~1on3sYeiZr6t?Ry&
z=xHYtV&;wgT_L=zG<(i1bKO5SqG`sWtm}jg`QFjLzx*_9lfyfcWB**fq>0X>j<H*e
zTfDB(osl?@rOVz5+p|=FgVQ%P83&}IhOS+ytEh&1BTLw6pjQCiBo3@;?*yr+(ZB$e
zk;E97Hw$ZV=$-=9Axt{P@2CT%7KXklsFjiAflP&i9DmgK=QCtqQ8**i^fyZgjo1;y
zo|ca~!ZlpzS6W;Hm+hE=Tmi6vPMXw*fDk<^D|&ph5F_g+YM9vSRnClx@Eo)2m-qZ~
zGgSba=ZJn_=miib!!-SBvzpO|)9-QNR{$VGwnk(gk7sjaTvw66hfT3wxdtsce)`(c
zXOIO{NYG?<TknU5mm;pc2-J5qFUOCk|5ycX$=XZ&KeJu7a$x_;c8Qj6Ez?|A$B@YA
zO$de#n@0bTiLnz{0}@$r!x^%5h0NVmA()spBK<H`3Y_Bu=_u^~H`EDN6SBq7Z298%
z<kmcCm^Pi*d?42^2}3ihy2`^WzJOq7pcct-I{ZHTlRshI0driD;<vi*cLs{i5e?#G
z@Wv+gM(DX;Qut<Ux02W&bid-qh($<!<-wn<dln}Xdavj)wjxz^pd~&0KBQjJx|ocl
zEIH#^c0d(|DW84%{X4+IbQ7nLpmtDP8sIZ|&wkE}x+iUknzcL3RAr_J+(Z*k;Stf4
zIL^h5;qub5#@y$@n?DyGJ#!=i$z-kIw>c!$9s7s5vm);7?DXcGs~u!c3n9lV;@$AL
zC!PBf0XtLb5>(*B$o?z8MglHUB`t@o=r$5alX3zMib#?%_Y@_rOlxIEt1ZGab>0a|
zuz_RY)<5MNR8XOY=G)YruF>8O9S?ZW2jT5-xzt)ohp%)lq`#r5d^!;~!CJg)UslNo
z$W@<iBaT}Bivb2IdmS``zs%zWSLP1wiv1=LV?ltW!sn_SuP2|(Iu>Cgt-k|%m}Psx
zamHhoze#7&?oe{!Rj9VvuW&ps#>bQq=)C(GI9bMfljxaV@54TN@``MG=}$WP;t4NB
zHP=!K^!Yp3LqeZM1~qP0Q|l9U=oON4zSwGXS|uoJOydozk=Nr&`i(y|d_+87BN#7>
zs1Un_$WD6SBf8Nn)pHtJKAZF5`srIO&Z;6~uo4D#qtF=fdcsB|N%g<&6*K2C9as8X
zaA!Ip&ezmbfPs{9584x}L`4=k+S|<L*=w8I*y^NTb_zQ?Tou)df_P;O*#~%+e#|Js
z#Yo*moqIZ#p1o^4cI?jiK!6@uMUc>wV9g~pFVdkwO@-c6MYM5_E^ck_BI9A{uO;9)
z<q?OeFGgsL<PYE^*YgXlg{d~U1@3{%gY@T~o-p4@wP2!?E;*rHr(gKrOIwO@=PmWp
zCb?9#)8p?CY(_frLP+q2mcrcQ?y;%q%q0J)m3fm<AGtaon}P&R-<U~U*8l!2-d5RL
z9+)6%eN6IsumK5(P2Xk3Fb9n%w1xAu)#H8K+j*;iK?+tlu=78ATFY5jofWfuAj_$}
z+ISttd(`0bZLIw0O9?<`k50Jc0D;=_Y2&u@E*f9c9pLYXX^X6>KDZC(eh)RfBeTu1
zB;}&&bS^pV@p=(FnHnh59jfdxijsqY{umxyQ{7<xe4Gzzm>vfC*1S=NBATl1Xer0Y
z`zLr^I4~7A)ymANF6w;b71l@Nze~zl(Eb$xfr<q#ZiKm^(lXzl!RJjn8N~~Ib6p~o
zXJ3^cTw=q1OZ<=kfts~9mUMS_apy?RpSTa}t|MOj&ln8q^gO*R=A(KLsC;Vb?T=8M
zT%1lNuf=+_NCQ8ZZAjw6Q=2}CBp?f;u@>G5fi#(4X4?zdDL0L#bx1sSKp+v7<%G$M
zwkZd{DkWm2sxR6eYBy&)ohtSg+icjfU33XuyaccT0l#~oVFQAyI1s~5TJPm=?ly6l
zI~#AKs*%D!4k21K6VMFzYQPk@+3V(y7&W1?s=F(fRykbYxVRpzWE;-Wikph43IW~4
zpreICp{o2`d#`DDgbli^lqhAcm1A?j!Ek})Ffb5?2W3^av$fs^OFo-@NHiqggM9ff
z-ZmPuXolH<_`+%2=t#djUc9-Nuqn~$QPf^)Y;|4X@DQU&OS6vW`D}HY$MiGHLwJ+F
z{q5uc>dNM{LMOZF#eQRXscRTf>IFDb*Xs`v1{|OFS(jPoV@DXoGHUJ{Idn@W1LFFU
zu0tt>gXI*pjr4NXFTOr%cSUtHFu9hGN6pPOb1mxI8cp0YPHGHD4X8+~b{A;Sl=}n5
z_-Uqnmb+NP#!D|1VYGg~WxE$RNFa+F9G1BCQR_Lz&F>ilB&%BhHH!ZI@4A?8V>L{y
zcRa2Ny(;d?)$?yB4x0DJyNe6{jux(;9{aU3Cf0^kgA@TyysYLMAvX_SBo?tCC50aU
z!Vy(cZDL5c(;dETpKsmVN75SYAiU6*6-OTDVnJ4ud`$4>wr9m=irH7+Knm!tDuRJ1
zO6koH*qkp{k#*O!<n1~)k*v`Rb<Z;I(>GpE`@P)5h>o_Mt~u9<JEZ9_!y&)TH-C1J
zZBM<W49XipINHVatlYZ}M41m$K&rfZ-z{~-Rx)mim(00p;nw2LkjOJH`U@%a*6{l(
z<A95_?FagM_#uu*X^eMI<roZzMUT6$lF~lCbDoW?fA^05WcI-+VdIo){1+@2Ug7&H
z8<z$h=BV&G-~_|m&Z3o6QI2Hkh6(h=l|ce>7COs*jl8-a{`j2hfXAuXg+0x>id3j|
z{Cq5-lYJhx3tqTJ(F6jSU6@;6m{V5Lcj$Ot&5dUwE;v1}e<yfr=@D<iKO^vYf1keU
zo+Sdc|AFZ2sHdv@JJGcDd5ka=PeVCje7_z}1pEvts0&R$I9~^*QinL60$Vb!>Co*P
zf`su3mg5gX#Pk8nQ_IAF&jq!iuD`_Ba+`)v&wF++!?9w-t5CurA#oLGP>Xqr%1_YX
z7``APMW_|$K=xFWA~P>cO1#8fx-}j_jr>T~zIQoK9^S6?X}fg0(8umA&o0L<mMnaq
ztjuVXLX@3A5R~<L<^u`3rj#4_1m=~NAH}ONkd|l@|B@08ziOYc*-##C$h|Zd+g54I
zFK?=?-@k_g{P<;aC~R4d1yJWKhan|0Sq~>sSCT4Qdx~a@c|3ojNrQP88aOb;3i6dT
zF4*K{A#qng;jX!$F*BJMPMBrabG4QJk!IhlO}0oWiyuo8<SGi&6ojd4A`<mSpuUR7
zZC4Dd_ZyvAIK=wX{B$u0guOfxJmTu)crqLO>sJa-!eC*H=1`rpt9o1l?t`e?_pAN3
zS`Y|4<5qG{o37Hz@^TJAV-TLIcI|Jwn)K$}l;&GzEdgV>`e;GNWKQdQx<^Z^*p~Ul
z{CY39`<+0i-l>2~@)xd~)!IzQ%T<+<qShEXmupVXA8OJ9J^O9^%J^-+GCN%Qr@tq4
z`kj*66n33L9irF_eA77z{=RQ-XBsiWn>;~_&l5>>b}l5`)6p`p3mb+?sgB4-=?u5V
z5fA(OXYK)eoysQ7tJMCLzVlpQtlGtIZF{4p>LWTBCg-C0dMLk|>-llM2|wx%cT78-
z+M|j)D;0ly-G%tmT`}2<Q}KMKAjRaqO;(JCdHom#D9nd+%XKG8g5K}kFfF4yfvlZ+
zPYqSq-9a#e;*b%=|6Fc<jpj9C)Z*o4<HBd-<;{LWBctkc(xda?E~f}}7jQlA^bT^?
z%jG#Gmbe9r`R(c8?-QI}57YZ)<cnLfMovo0ddSfa&imZhavP6uF(hf7KDQTZv-~=b
zYYE4kR&(GoLM^K=kq8-y7_F{W*AuPZl3tvxtcn`6byKD3S`}<NY#d=gM_Y}bmH9#u
z(yrU8F_C&vsp?y>B;4QI9$J;jcr%?aR2${{fbiz1M{l6M($0Ju4}_te?)cMY<3<c3
zJ3Sqjh{w+N_DS)U=*zHCx^_5F(>(*ph)bKKOHrf32p=nFj?2#+%%Ba*srqm1Z6erb
zIC`6!`HvbK^1m=@AJ^VUQf3CBr3*A~XWDco-VCL;_I{mcKOUa#L`ag{8r!-OqFjf%
zD%!A?u^@hn9}Oh-BMuyLx94Rf&c6tiv~j5_#;_KcFvWYhPp_Tia0*WX3ZlPO&`BdV
zUG@FA;*%9LCuxjWCQp*p_;5KtX1G{tyXNklii0jXg}rD>j26{(fdOST<O`vamoC<?
zQM3euSbDxSHQsgU8{2DGcI|b1hh;~gr}<OW6D~xKO5;et2?;@h-D|+`=d+Kw7UB24
zcyY)YgFyIqd{)8ej8r2+*k7k8{r90kGHETg7mOo8IGhHrjmN7Pn0bY{ACGafNP5ea
z*On48_|O4&c1XmiLqc{B434Cbj^);SwhT_ZGY+=|jmU}Yw-xwgs5ApHX&Z}ifjj5i
z-;#E|o7lRRFXEdV(WP!@ufGkd>72D;K<%9}T|3*6z1D|MtHTO}Z^uzT9h+Xf!hghz
z_cw;qLsyev`b6EQ!t$4Lyh%M_3kbOV4id_t#E-oEy2_sTNDh-f_}D`ymq>sM0nm7L
za4<1te79CzTWjr0mj2Izpc>bW=Z|!U=O{`XDr|z{IIAxL@f9k{WmECt6%HfT%bL|k
zb-DnE+`Re%o2#%$Plbc30|5eG>&4WC1U0>1ro}kK+JA^K_PH<~DhoYYUfukpfIZ5y
z_&Z48Q)Z}8GB)fC-!#c_R~fA<PNpFC(l2HFONrg5q2B%vuSqe?XrmaZ*|95|&8t0f
zA_1yg*-tvPWWrLLa^kH}xb0dvjvn!t>HSaN5T#myXvf98oDNP0nu<IUsc_DC#m5Kf
z4;(3*YB@7EW>>c-%GO4FlI{}tg1yW|L{nV9yPV7oqmuUxbx5LGN{B~nb?T2WE@}qu
z9vnA$&^T4eNHi`ZQb2c^r&C0ul7NcRPYY9YP}jtYoxu@}3k-pnR--Imh%Y&9@~dv3
z9B!O|w{raAlqzeFv60EW1l6#;>fasx7z&&cTHLfEQ9<u3FEyA(-bW?*mmx^rKD!Zk
z-2M>;ol!p?*7r?aSuo`q6$z*F$fLP9{M_;(Hmh_QDyd;e-q-K}@|%Iqp#Ck<rbvw2
z%TP#!`WEYbzEu7n&E%tU6ljU;7$iBTWY%xt$2hrfWX|v$DwC4i2X>-kooi=aJn9{a
z$$J_eXFZQTF$A3J3nTA8dvi^!0~(dKnqNZt6|d@wo^s!I&dSm$uo7dQZ*Tsws#=v6
z>)gkaF)SIWz%q(kI>8WWI#z)=6kjtVi!*9`<*ofMRptj`7Vq4PDLqFRwvM>zfj=Y~
zoPW6US$XC;Yl2YQQtP~=^>a>aV}q?QAOaJ7w8e(U{r0k)af0F;KY^-9R~_M+T4D@R
zn<{`>AgV-45(c8xZX5O{qkVyjB%rnxmIIuJziYqoSf$2xTvI#Uu<aL~QI)QnN9-mC
z3SePeP`K;uLv4;$Cfr8Js%7GvSH^cDn!$qOP$*c)+hLbMzl5qFgKN!Stpt@?&hb9g
zMoADc={y^S6EUPEs3CSw3aRr>us9nuWw$IeW;x>3<80#^!IySwabW|Z$YSK2v4>N=
zu!u&V#$)*K40$4qBdUy&8sDe6sx`ar0-qE&;rwpF$Ba(_S=H5$)-1#3Z+?he`$G%K
zJjvJ$JGPu5hwNLU@{mo-HNs!bL~q8lv`+0`n>OtZHMvJEgE$_47%7cEbbjs=gw$ts
zzc4)qh2&;Z{?0o-)ojjkbRu;LK?MaNWofM(V~}8Onl-zPvAN&(BJV=h_w$z5r`!{&
z5WJDvo^pbG>sm{%7m<&)mB2u4;o2ZUmQ00Dhn+#pq^*yw6YIZP*${{F(&f&h*W5@R
z+NZ)Z7nlqWh15OIj#+-;X${KRR^JKS71;<LW3RYzL$xe>WE&Ua*+^?t4q7`jN9w77
zKu4aQYGfx1bubbI82h8f*e~R_muh+bNUPHl4UCj**<m{ss)%ujAYVeni=WYfgTQL*
z1Y1d?_BlE!>xg?Mt4&W-O{~YDO6#*%1Xd<i)3V36K=RYO&}+Lw^9RDaW3b{Y#hi5R
zg&sw^g!?-7x(!aa59+lPLJx1D*e0n9uW{-Eo4Nuf`E+xC)`V(<XHf_k96<Ez(eXzn
zR%$}l<~Gnht4{XH=`M-oIU@swFDtWA-KblsSQ8yvPw)`Mu_52oL7*c1Y>;n5nIn}4
z=^8v3=!*igdn1t`y{yS8H1USP;VFMEV`u@k1F(aBg+*V-!X-O&?;J;Y`!g3ypHZC#
zdbO`yY!t(KC*C0G)ndjx;g^c&CY*-U=9h&%b{gC^LYhRiI2h2D8fGjmkEIa#E^RVW
zd}6}}=kRzZEGRatg%(!~<*O`Y3rXB|U+ACUT*fA-q;EP_3*A?)Qd(^ZI%3nTaG4}U
zUVFb@`GWO-?6Zr`O%r?G&pExTmY8y6NM8IbS7`0hJ`1Elgb!n5bWDsd=f)SCvhcZ;
zsytmpU!+z=dsm`F3N)_`TIEQKNDwEabjG$2+wCWC<bA~PzhqhMXeDP5h7atQgi43R
zNemo}<JAj{&Q&k{%i5#065hJvYi%C&YD2j(gEfEG<;0uyCLT;6%P=P3guK97+`=mk
z0%^!yM1My;gbFxsHI$=Ifx~Nuop<UKh@rtpj9vGamZt(Zj4~^LRM4x0YbHB8*(u{L
z7o00@8nQ|R@rK9rapDayIVK$UuDl)y@1IZz!u<>d0zOgPikzm$w>T(cQ%^X4y3~4U
zy|2Ad$ci>PiM@w~oDXU%@vo>6+`sOcwfw2OO+mm6w8nvs)~%yT5YPT8!sY@>fd8G4
zQ5VRXrHmh*D)M3=N|2lu`of}L%|$EB)^<pR76#;ATwK^S7btYCx2{~l{RDESe<MG9
z?~2>A=S&(E(Shx^4Ty@03JQWU$kws2C{?Esjlk@;g$U^d0~OtUYlp#k^yE7cp$M*}
zAD%-cJ~<P|!^1QCne%1h=;Q?3%*@n(R|M#eGh#}XEyTH_ZM0r5DJ%Qy;WPG!(UR?D
zD2`<04k1Vg4i**`0YOQ^bGs+#`q~~BNi16)C?lSmo+gSP1_uZKJqsks8aKUPIzQen
zK$BR9wjBH)ueCs5T-6VVvX{g}EM(-UL5Oq?yPd1uL7>0Q+nYYE|87yw$Jo@AoQzCV
zRP^LUu~_-}Xgas6OMo`U%)OHGJ1N?yIuWMsEFc}^l<9S!OqanB+X%GReMh<Nh9T(d
z@Bc<lJ_*$BP*I({zr7^U>p_FYQdmnH8t&(cm4N;|x0`YH!6+P;yOa5+n?vdW!<Cg4
zELzRmzst2igCS7MZr|+>_4Vslpcl`=((*60si^1xC?5)>uB8=&&3dv_t$W>^>HYlk
zYNuawMlP9YALux|yDJ%liNxc+pUxHA-`|IBV<9IeXJo7{E*|!MNMV|D_w$Z6^1l%Q
zs=z;va$}yY>glwbfZJAAS9f)H2M4LsU;&k5b#=Im27N%s;dby)PfySEbevFjtT-ty
z(&5QT3-GwW1_6WA6;r8EO9o^(ffof7mVrP#_#X>eHpNy1_V)G#a+wuXA^~86C^QU_
znAECiJZ_3o?$}W#riBN>d`!t>2GxijKaGs`+C3iro}GPitz7Q@tH!3Tp+T8Lm@GZ8
zd$l#tSXd}FjdHQs*?AGLzP>)0!K)Ol`Q+JA5Ud3SbXv+~^3_*WUEbZz{|*+)?h4=m
zdNzS3$dMZ3QDwBTO#V)rODkhzR-27BgwJ$EpZXGV5whirh^;(4xRXjVeng9t{u(yg
z2h}t*U`+^G4VxifoSvS}J2$;NV+M4~JHkAAc4*TjjqZc{$sXF>JiP`7LxX&U5{gK^
zBJ}t7Gn<ULe!M;WVAgD~T#o$8ESt{d^#1w;RDXHB-7SKA$HvA+Mn-aSa{hT!aeuVw
zHaomt{2*Yiudh`rv^Dhf^t7~6Q&Rr9;qM_xw7PANfN`yCdYrFust?W<$S9=}a&x!l
z=Nli-<b!<6%4o16j16<$9YqZc3>+QnuG1qUBY(~oMB#BSjEqq6^6Jd40#C~Mb0!aX
zqwg0!AE^vKgn-Ti5096V`BG+PW_s<WDwA>Sgr(flQj4eSeRMKuU!V7@K?1;bHH?k-
zSL!WPQaWx9CkgnyUnluLv|F96BqSs@)iQx%$X;ZA&)cKv=etvzwWjR$-#}OB;n|sv
zmKFj?TuSQZ>S}mshyzG%hoO_}dYsjFR6IT%W{Qc5dSCT_2KiQYzCQz2eRp^FkG*!D
zFhKoz==kmH>kG^<yakGM+07QG(<ZxJ(P-S`{e4VqY%Z&ny7^MI&gYX-pl|ilr%%+X
zx4$Mno=+=*bJ(s0^=>;~Y_!{Kb~v3bjA!!uELP|!C@731G{Sp7Zii%KWB@M*7>VN7
z;_B)vz)RRSr}b`c*){>qxxEv!YincrMVyEz5&eo3^izM>*x7si<n8URs|^Pgm6Ya3
zP(FW7`t|EqM#lTYrq9dM6DK!!=fj4_^z<|W77YmWJ0T&R&&z$I&240RR*b-l#R2G<
zOgle6Hyn)qgoudCYOX9P8E$|nEiXT{*6aWVs;;gEUL-K1vlWZOJ@I+n>ns*4xLwYH
z8v@EEzmbxzwYtbfI4_iG<X-v#m;Bfpjt43fu&z^WIw)yrRjen0wg^`qU3YgbTnT(`
zXBrZc-hqLE)m0#ozY3Tm-iTCol{!tBFkoB$<zh;(cJHrl&;fSOw?9Fkq|u=GAJyv6
z0o!|fc|{G|U5KFAloUzATm0!J%Vj3wxG%W4Wo$!%<$0BjAHbk7B}-R#?e&GBi-?GD
za&qeG>XxdO`$N&u)B9ZS!TsGZsOBIiFKlbWeUD$ML_$OiBs2pI11*e{nR#|_5aylO
z)$%**rM%{94Pe<olK=H~R?dh?gAnEN$x~`{LumygAX8RW_6y<x;9r2++He@aHWN}M
zp#z9eVC$5*s8+i583Y9d;Wt$mWWBxLffdc`y2~czvmsC<i;9Zc-Q68eF6RySP6aXS
z+l%g>FFF4H{##r6G*~zo7%U$5mXBn(zTJu0YjW~tEf*-GfL%Fe!ih^uOG`;n)6uE^
znFb0ufclGN%X{vNwp`l;V5S071}#iD$O@0&+jDmyDy2q696=BTE8?Gs0GJOLDEwk3
zP>$~j=+UuUsS6V(F4ybSm69ky>;}51u_E$^IyXPxLE%?s=nTxv*bheI0q<;NXvkOV
z7;t?wIxV*w?H=T^X<N1&nj>ImBt#HRA0-PrnEaF&@N3m{6coM?g{O;^)W6bsJygFD
zfV%vl5Z>Q#tL%Dxf<DMmV5v@pmGRYrOG{75Gd*jp*SYp(YXt~nCG*4KwJ0#)gM@+r
zZGSM4?s2^*A#+~x%0+|44K%Hns#Q`wAB9(B*$;Ou0&x|vtT&H~b}{0(t!|fQ?f2an
z{+?lsC|ea36*;*{!2Kj-0#3|aP|z>O8<@{W)ESbTMG4s#PU7N2z*|H^NB=RHD4BoJ
z`O#5Z>mVsPLf`qSsH|)R9T%r7h9yOcCi9*5yFU;xE;?Q++S=9tFS)Yf{&2nym=a(h
z3=9mkwB>nufrGpK0|P6os{~w*;-qK_VP3&@0LfDQ=X(JtF!=fTeSAKEkj!D!>G^aG
z+!Yjfx0;@pilQP=6%r5-fa#Y8{+V8v$MsU*%j>PybP|Y0uWxS&AJ3!oooT!t4HXs9
zlZPH29;LUQAP`_K6%`d)4VI{3!ivSW+r1$`lt|&QBaY~|cXDdC+Z_O0&-Kj>1UUHC
zQcX+CGcer1eFL$xp`jt%7kvPz!3Ii>)fE(o;>3l%e+L5@4n#`G$<b3$nVXq4IiIbJ
zB{RG6Yyiegz~>1o7#kBa3G5@l!w(tRJ32bb$jAWoIPw%Ul$5|MRIAhlOe!Znes+H}
zNk>O#bYw(QO3LTutTCP2g^QC@2dFFtn)?AKx3RI|?d{#s(V^%4WbESd0Nh5hbiuVg
z``?i8q-Y@gclkj89%b9U+0M?6|MjxB-eM8xFf%hd0=yWVZrjqv20jMH@8sm<xVY)L
zISdSpz~1eS_vh1{ot>$vDH5@0@%)+l`+LAqF4vm2HZ}eEy*g#aGEXGgUJ#t8SbYD)
zswC>+(RvLIn=HMsv0-gt!TpDcg^f*BRkfw5=}RxumdW_aO3UVNgX*T%R-AKeO7_Uh
zOdw|7OKWOr9Xqxp@<Ks`025|(zr4I0xRR(>99Y-xl{A@yKr|%CH^fUJ>q|qHJOyw<
zh%=JAC@uM{FRJnxXbfv{Wo2d3<;5a?22=XR#`(@S%|zTr#!MPoS~xA#aQzIo#4@PK
z+oo)QNAVNzZ9xy;1_4cIR0$<N@ev8_XQ<qZK7}WJL4iD;NqX|3trUJr_Y3p^zK#MU
NAtEbWA*ApBKLBy8Yf=CJ

literal 17842
zcmciq1yo#Zu<r>sF2RBa4H5`0!5sp@B}n7$?(QK#a0~9zxVvkx5Zv9}-C=g#_ndpr
z{l0V8%&avty%x|bf!>eQQ}wHV)!rcra^k4S1jrx|2vt%-R1pM%;so9wBSHh;RbnAY
zzz-xl33W#h2#fpg7gSU?7zzY>50VrWR(4xB)^^vzGJWuklR<L;A(?(hT#x=N^tP#r
zM5Rgx?(5>Unzm}m&oa%D7VWZ<YYX2Q>$>35vc>Y54Y2vN2PqDY*jw~yrKT7PA;0ew
zu2lEWEFAgV@4v%RF|W$MBqg_gy8PqVdU`2v`Y6C8R|bY*D4e!{C>ANxLcycp!{cGV
z3S)flS$C?ijLcIkQuwG`q>Nv~tcNeMcJ79NfWXVkJB&j#k$?;f9U}UT1rY}Rr=JW}
zjT(KPb}tNwhnJTHtJe=oC3?_EB>PP4uvp=vDEZqyY!v)4_z+R@cqu55g1r1F^UKJ|
ziH$(!>E@>4bb~S&ywc(-|JvNq;VUEar%bcq_wV0qcp;)WQBm?DBG54sQ;b>)3ej<K
zFrci-eSLj>C@84;`S}`?;R*Mhk#t_waxHF;t37`VdJM$&#}nRosq2Q0XH*cV$4*^c
zT|(kV)kRZN6Q|7*KJaxkUsG3shli)zXy-q)&8X8l_w$xSz^k#U>O9?b#ooaI8uT+h
z{{G=1RwD1E-Bt7*ZJ<ckkWsb%Q_QN*<8des-2=VL?nt_SYwzMBFD>l>>FXUrh^UmL
zq<W=Z7hOdwIc5kNsQ?P-M^u!ntLti=rK*(F?-X{+*x1<bBI}kFZD19Zz@>7<B7vu~
zp3VL!ChcZ61_tZiP#iA--@(`q=p=k&6;6@Fyf>Rsf@skUk&%((<KxtFX+m<O{2o7(
zlIA8R3`)J++}un}cXP#~oXQd4;W0ygEjHMu-mU!n`7@4M-o@1w284~=yVT^!1k6y{
z-SCw7D;OEniGG@O((HWb^RQ2?ba3b23imC`&kqFhOHCz=mx7F&>FDSj&XjFBG!1k}
zO!ga8A;Nx(i&JuNaBy?G+Zj%cSwtu1QC3sC^L@Sb1qL<{pZv*`ii+x>WAEwyvP89F
z<j)_XtSLLLw2F#~9X3{0R!+{@;o-7G3Hab1mb>+pm6iOC#9jlIWSIzgd*Dj<m%Fzl
z!Yp?yZ{gyl0!deNC8*+xl#6$iDKU$tE$TEFaKqT~<TC}db#*HoDNuSMA|eP36-uTh
z-n>s7l65}?j3qk1Qw@qIoe={OWA`#nUO>IZ1eN5KSDjv`Rc#j;lm&qpFE=||TU)nr
zzBn`y?T?x%(Zxulq^3r^D~pT6Lg`7N8y+4`Pfu@~I%)Ik-z2j=#TpqIAwY&G(l}Td
z;K^hwOwG>rF!Qyh3ya<c)x-mfTcO?Jaxhgiu)osg*&qJS@#X0ri(1ae#N_2X$`?2?
z_xJaJ#Ss$|2cr@5xEv+**>KvfXo11?=9BqHM^<$j&o7Us@=46=6A)=$*AuVXWmivM
zTYdfY=f_(pkmJrD7PYLpy6c5HOF~W?SP-B4g=n?{EF9eaPryJQhIC(YydlNKzeh($
zySnseO4SKWRY)6bS2F~>A6}jxUmsUr<<fcTa&kn<z^4m!hhy2Ffe{rI6#>h~(?G;!
z*SWe%T2xF+OS=M?bV^D}Mf<}+0J^|f#+9O$me<xm44rDZ)qM3fU_TEPjI^{KkH-y{
zmzNz+7eiN9S1V0Uz=Y@K<|Yf|VL-r@aEOVYhdEb?_&u&3t`8+8B_D8p13U9AAb`_)
zVQOZ^W&3x000R1jTkD@ce`HfRP#<@&3Hnz1^qA{yS6?0uD<}$1J6>Pp1)rxTCoh8q
zpP)cz)2h1hiHX6%!6nlN2<RlGnhj_)U@-XRyq|Qn%@gli*52OUyH6%tJ3A-KEw{Uw
zzTl78r){@7+S=MoDXPFMZcmoaH~SA?J&)#CSd*8gr$59<L=tgt?Cu7$MM;$8=&p2i
z2|r#POqY~^%S2M<r%O~_ot@_w7Y|q2%_s5#-{58Nd)@$BAS^7Lk&&_L`|53IxS9Pq
z7=zS#wcQ&5jcBZWd)ak$`pWs<tG%tw+v@M@XsSpFSSf0mB)=zPTiXtYZ7EPEa5f6$
zGe_3#Y6X>O<6rJaeMbfc&TelrfeWN@JDQAUEQbm5xt=VQmeOyzb$x>atc9M5>2)KF
z2?Uz0(Cu*ApU8Z<qCQUe#L0R4c-oQ4Wls)zPeEb${BR8zUU+CI0n(*i*7~zMj~oR_
z6gn#E^ju~AYDrf-ohlI#(MSe=iTUI&1D1Fxm6GZ8^>upOuwH}hogItrdSL5|sWy0k
zi_<9OmufX#tiHZ9yu{(rt7X}$8umwuX)s54{$0$pv^3A#6Bhh%!RLK>rF<D-9%^(_
z0nxrjef+-7^S`EXu|0(9!|pGHh=kO&ZpXsH0`d#PW8&fFPUmq(2EE5d{^x&%&^9(U
zzC;i>Zw<Uf{HBM$b9uio+!7Q4xI#hCo6`DtTwL6X`={>i?wlNQkY8{xLR9}@+iWod
zF|<Dh5s{Sw8{jAJFSb9K{1KAD234Ak=WwoiP4@Rw3kV3Xv*XlEtiz|Dqgw-ZSFJ{m
z_-)SW<{R7_wtPrMzC)8%?w1G0jD4pEP1`nOa(-ZI{r&yj-Q5Z3BEtT@JU1ga8?_h>
za(;B&CU!iDDSe^DOy_f-NQ$?J3&j^QGb`Nr91TB@53J~8$LsUy5L1Vfi;IBcPI#38
zEVQs`GmXEH-}{drK8R)7<|C}5=-`F*KHeN_wz}DCX%S7Z{|F8BxI0@14g;fZd#lNC
zimuPSaS|s-Py24!a?OOmG|}X~6XnAB)*u$B69`pr-n?P|;`Z|7PFb4ceF{XE*c-xH
z1VeT>r!<wV=8<mr2nUL`#)AWkbh)~GZy3<l)|Od+4laYn)!q4KXehE$JI{v?a3LmS
zLQV_#robsLfk-EklejrL)@YQyjQYL={1ooimgvcdiwpQ}%FY=S6ckCoPRqn3KPf(a
zpQ9t1HA-}RNJd6Rgl2lW(q`Ls4OUgf1FZqhB1_>TumyKllk#MU1UGU_P$Bv#(m=fM
ze!RiK#Z@etLc*fTD=1iPbys?i@jW;gu-OgZumMgnJ3Cv8@fHXt$H&LOUS4bs%20h`
zW<Hyg7mSyn1We7*-v0dj{C+Pt3K<z74XEIrahrUMp9u**=e>Bw#>Qo3WvHmAKvY~f
zmH~rjVkpE94i6U>7Xc?eZ&g2X%*e<X5)#tldRksm(zAJfeSHlaKcDS*)ibxJUm;k8
z1OzWP4ISD50rVP>o$#Ra?C<Y?0QO8w3<K7jNDLMj0<XuFnX_}9Vv#5cFBezR=q`o#
z#o=s4|F2(b=Xbyq`Q0yIgL}lp#YuUbKGM-if5etcVMB`Q&*XKDl%NE{0w+EF5#T;I
z`yvQ=U5<bapR;nYx3{;nJe<|_ZS%UnP}OmV6c+9sR&<mX7TVd{BcY*Xup0jcJlV?l
z_*V^$g|V@$kVSL9R*Cl*qN1WpK-k_IjFXj>1po*LE`rZjN`{8H6&1F?VufH)11lgN
zMY23M_rohJF)=YJYIMCPxTd<gQOgFfa$vuO-Z#3QW&m+Hf0vJ5xM0%$;cZ6@lE2W}
z-d>@#HV_}_aix8^etGO29eF)m(Zmg2ou9Mpyd<N>OHGfD>odzhR+o@qp>e|krw|up
zc|cihZQ30f6Zgoh$v}uK?f(c-lzfk&p`no^LzOH;vXZjDvlFCF1CL_Xpvcz;hk{?Z
z_ZC1*W;y^6YZQSudf-qf82DV~PfJlf>+p_0pkd%OgQXy>TxGo%;Lx6Re3T$yLr3rl
zE)wTOQK;u&NtyLgk?8dB6C^14`T3vI>8H`@0H-cPrL-!*qSvX1j0g*zE>|$Q57^bS
z25uPVGaL}elR^wSneDi^xvwp4l8e9=zyGiT7f-S&#I0Av44E>e<7st#n}U|Q218X!
zW(wyo#Ow`UEje8Nk|Oh5Mt{X3vVrAY4UOtmi9|l$8Qf(A7bC$|<r>O4Dg|7VStx0R
zop>+F)WV{8$C!qO28a|CahuAtD!d5YKrH+!VFZ900I?2#^q4*57noc1H9dG0=-TYo
zyMgc$txSN849U*cH<Hq($DN&>1;7FqRB63f4^RR&W^!gG>AQFDNJ-@jCS6ZgB8)@`
zk}Db7*v-9*a&vQCz?#IbD?>w;Ks3T*(&2O18bCZfproS8hd``X+xZ0rN$Z{`hEq8a
zU}3H1Duv_7jE#^j2d=^NxRy7T2XHnvHdO9?Yy@SAz7*u-7Z(>mye31OkefD^l3$+5
zrWh|QEG#W89T<?Xro#ylb>boZAUd({#L4-W&)3%Uj5JeXDCleV%l2CeQEM4w<>c}e
zi-6$*kO!cKX1&#To&>4+^yN&M=4hs%u7U!y<qSPqaPIsxG`}@Xx*UMUvjDxrL3eqc
zwk4*f0?}nThK8PPuWa9XU#JYAN5z``Xuy`W7<bRzC=||=Y2$~s6Mp`Dj|a^D^5Vin
zl9-E9^x^)#uCC6UjRN3MPEJ=N4H|+gm$T9-CtEv8CufBr1glo;Ko862c!QErW<4R^
ze+=-L*%8MrQ%4vudu!{mhK7bp@$=I5me%%yb`^t_7K3l<S{%-`bn!w$hD6o!U?M>u
zF96M%OMO;o%hy93rFs8&#=u12Z}lSHm^9XW#HGI2=nea1I`aL!*_xAs)^VTR9u!d!
zMY@?|_r~Qt;8-MFF8oVuDjV7|5g5$a4E>5NO**Es`YVRN0dcQH*`oQjHlUz{p<9<K
zpwabMjWIRoB_k@5cFm4UxZx?6G-F%UNn;B~66whKvI*vGt3V8Zjj$kk-J0{~3TfD7
z<gl^igriSPGsNl6B`n|{qU3^t9ZyfxP4Sg=DzgXUxdI)yLDPgC9tsy)`sx`rIy@!|
zuy~8r6QK_y4d~dKHw-fH=X*PdysTY{qO|e9>23+p(yHx{of%UfPEj|<tqLDX_YMx`
zy<2A#Rf=8G<9~K?j7pVFGx57t%x_R3gYLL;wdQg`^)UEg-?QUSVS?bXvbjuk^DcxN
ziD_KUuOJ~*d9OXI2y18P8sruRn;eiIFyB<(yq#rB96FqvkM!FNdz-*S=c5OrbSoNu
z!W0D%-dp#wYbSVUJiN0UJp4nPfvQppR#~=Qjn5vCB29cFOVj6EAOcGSde8X1&LeVG
z%{GX}6<Db5%3+j~Wk##h8kBgMGrN2c2;XJwA~_8wBv;nU%_nf|8^@~YJ5{Rr4ESW>
zf{+J4c)pU;`iz5_vtK(!<tnAMr6Xqrk++^rCDc$+j~C08lKOH#u)V*GbCtVL31qI)
z)F8$5R5k~h>TMopjXSZPf7p$qs&akh#s`fte8p99{qUtJ<?boRVyG6<$Hzj<!^wD~
zd7YB@&WT_W0XwI)9U-t7nDj9=4a&@~^K2%^wocmX;<kIvHwz$;VpAy75yCa3uAa=<
zkUSadP;O%4<xgq^;bd6!rVre6*|_&X;7F8@TmHV$O_*H3gP$D!Q#qOHvT({Y=&hZt
z%=OqC{{b?JaY4-QuGQ53>xkvN>>S@A-eTFx(%WCKYFP_0mLvV(d0IaF*7?mlANOO7
zSxYSdemW3-S7}p3EjNt@8&iLM>+7f{rMo;_TzA?;taqeVaZB24v69=cmTi@c`Vf}<
zyHl?_FY6Ko0V~lH3F^L}N;9LqYqxr3G~MRa9B@E{?H!yuOTv*yq0(Fid|B!!L+ODs
z493pp+5DiC8ia(FXE|x>7WZ-b@`GoT>K^vZbrwU$hJ8!ruq(DUd9eHnKA3kSN7xym
zh+)`gLPud-eg5OaK3Hi*2?`VJex{tI>!6d?LKl~2Uq)8YxH;SZgj3U6Dymo@0V>hq
zKr8Fc7hVgf6t{nxg;#^Td<edsETf9S#vNL#hzM6{t?6tiEq`>-IgD?ek&D@w{?SGF
zai*Xqg<8NdxIi}+aowd9<xeBnx1exyFMbzTf;l<xJRA|226B;jQmD*0E<n6e{03E@
z0m3@PO8K3~ITU+f3JxA8CmLmn0vX~oUS8;rWp|eTsXQ__db5@(4*sR{kM9l{g2vV{
zr|>I^N~RkIUU#u2uBC}R*I1BScY0S4pWkQGg}5jFeV+^`kWh)x&W6xN-C<BJJ$c2o
zBrkQ~h)5(UU(0oGvULJ3F5)gpP4U4(c>IIs1TBGRp9)xed@SxdDJt#7pz$T&OwEb|
zEp5R<#wvkzb_Z?$f<G#}nT2cZSgwo%8x|e}V)xT`)=AE4%|-xa=?|>sy7K6&q4pdc
zcEp&5kN98KtIS=MwvQ!^Yy8p@*3CVKQPo!VYz2X;P`-3Oyksy<gaLkO*h6sm{961e
z1WIODJFXwSC+z`|E(s+zjN%>0uLo+lT70-l3e^U;Ru;ucv|LHzu25(PV-m@aN-rn7
zCg<xE?ax5FcPRq2tT((*TkPM9dEKndZOr>??Gl@+P7K}w0t&i5oad*dq^;Nm#>g;Y
z2d!z!=9i?=YU&r~Y}>CY1$1#<4Zk{nqokx;+p0Qp1UI-92g{T{tZVT1plNW#eg1$I
zi#az-dY(AEHTH2Vb&}8LkA;k&Qu+gr>pTith4w)odFF2$0udf>ezQsH=0={_+<+uV
z-SUEy&11dQwH6a&ixN@^8T850@{Ya<1Xo%6QZIh~roUX^^+l?!%K(LTiUE8hi;KoJ
z_M^`vf%x%#clfmC1IcNuxvk{`B+Nxj{nzI74h%|(>GN}WmjN~=NzgPhmPrl<8By|&
z)0oZ&-@r#+uWCjkKhc|$<WxE82$5(y6=aBiEa2yBjYz}XS_$xOUB-<;ot^wT#9);C
z1GICiM;4-7noF5`Igp<Qsz>q@w;dKS5e6a`<D%U$ReT9{zAUcN$7y(3Y!JqQq3v59
z&-H?Rr_j7gVJtWhtUHTeMCU^B;f4@L0Bwnyc~Vrs^&J=cQi4Zujp0(+?b&w)>eRJ^
z?<|q^<V+>SHjNzt?qPSO1`rpc?j49>>A4XQpY&rgQPuZb)m#mS_dzi~JL}g{?oh*?
zl2A&*ukt=TW-*)htd%f=^oFWImo~NFaMuOcL_6OhRp~`AY)adedtus2ddW9>8xRm<
zm%Yl3>l_zDg)D1HE)ACbW%Yr$Wa)<c_=;)aVr|?TlO#VAs8*?geCs-}^cJVP4xt|2
z6?Yp5Rd0|580Py4kT66VZ<2X)=+zp4sJ0X5O!jfK(4cgDv5jZoo5}%5Wwx1uWs<7#
zSEMn!tHkh#OJH8}g+uOw)UN?G={jTe&P7Q9R4+;6+``(KG#pcj1P)i+UXPXU6zO+^
z&Cxq(=GtL%N2OW;Lc8zZ^ZanW$;JO!$jF9>{M&f4(Td>rB@QYy(Y~lS56gGUinv$!
zz=#gsFihfK1nU<fQ3nS1ZY=+a9RHJfO&|dL;!7pH+hlL-NgL^X+cHmre6NQkiiSjL
z^$kZ&6X4>VbAwJleX}w)Afx@J(>VN}Xc?tU>pWeok}`RHIGY<iaCUa~{X6VdQkR|?
z7T<*4e`U+}h=3-Ut?&`Zl$ah`Fupr*x3%asm>_ZC+Zm8lscGrzcDNjCfI91~7prS(
zzTOQ25>ZS{43Ky^@puC$Rv?#NP1fSzOay7$@l@O^^{0p8%y0O3ESH*CBHYq*yuDp#
zyV?#U_$1s8M&A)AF|n{FCnf+vCnGa6idMi3tI7Rx$Is6Xm5>9-3S;O?$!0Yt9odtn
zHpb1`$_&GQ{1_M-dV_`*^j@M!nP6+cE-+FwHz%jMx|*DVqOJ^30LS;8equ-#OfD}i
z(bCZc1OzxbIDGk8IhNl*9y1V2QXxU9_)3z}5gQkGe0213UEvEUC^((32^M;9Mu(76
z^}36$0UMDr8OTd9?Dm5rvEF9c<7BA`1_6!F_r+^Cg?+(!Y0RgnKI8-tdHN%XfqW5=
zG$X^pYH;+6rVnt14PDkh44VVrjwfG$48Vie?Rf%7ki)}KKtcdEdU!OKl+fYFWGNN_
z=_TXGk06j%yH`_XB}QObTuaL%AmL(&-`0!@C>1GFaCjI2n=^(#3S@$8{m#zL6yO7c
zs>ZxLQAdy07jJA#b@ls};~qyxNAc2DD{0%Rb09?;i|oQ@^4DkYP%>E=i71z;E(l9U
zg)|%xW4;Tgf92d<(X$N-gPE5?k`g74aS2zL%Qah*)_`NJ9p~l}CPt-G$Z0mwDDqiV
zrG7jEw6G{rK>_QTZva0f7yRUrkiJRAJB$>WKdcl9Mo*zrkGc;YnL%tuXkKMZlwEFh
zlJTuUxRKmywBMYVoHR7p7+JOyaBKEhvs$k%;#GyqW<ZzA^w8AQJlx$?)zw|4P<ee;
zs1j4YjS5NoP~iZd(>!zGwz;w4@xy26malri6&i$=Q7*TvJ0>k)FtyPk9{K#NaNhrJ
z8|Gg+e46P%rZ<pSxVxU!B}`pZy45@yWam2vV%Lb8jd*UCe?!A<L8gz#wogyav3r8z
z@@d5sx`K0=F~p);+N3;?p*V>8<uTg;zrO1HmP*9PN36g&(|reRw7R9TN^PWL#*QA6
zxZr&FT#dFEt9mpwi=Jjt+j!7^UEx)uGD{0a?m4*aV65tkvvk{VytOi~$$B&<=Db)e
zJ>QD@^qXN%J*{Qd${tJXY(GRQ1f?FwU0vQ8ZaMVxm-;vo0y+lI73;UBBWl-HpTQ58
z40JX>U23vWpS5tD=#Wczt;KY}D%v&AzJSX$&XrWkY&=o^;Rgfy6<7%#JtsJ{KFAQ{
znBrycorT0A4P<1Z@S5GrjD-FgrxI9dZEZKXoQ#*OFAz2vQ`cEFq`RoDKj3hc7&n0Q
z)ta`h4z=#X@|_I1_D}_KLxD&V-~1=3cK9<@A;l14NsgKER9iB9%ac4>_C<pSIZ6K%
zo7N8bnm-ay^^=Jqz9)aI$-Zi}c((OV)@19^;%8{f5*zI>d<L*ZDOKd`0tE`VNcDJF
z+>4_~s~7La&}^jDp6^Oeu9R0&`tg%3hm8!ANVdq~*!?*AC>uW=AuFG9QMm+k?yM9L
zJR^!oJos_Bjf^ltVAs=+BzqLMv|KH$22LmX=zdxhE$#i(0Kf^ocV+j%$wk^@v2$`T
zL-c#ldN%=E;mP~5>Bu+w5f1qZ65j*k#QAMww)k;mkR7F@Z@3+bP8@Vf<ks08+j?1z
z8RYqlTQAoy>&is#E8jLzq~jBI=fq^sZq6%md+qy#X`phnuj6g?s#PCm%jtU()Y=F%
zR2D0t3MEDLm{QZ}$9oj+IT^;TY96VMCLZzYWGl_h2GqNS_8)h1tbTP0Arf%UwdYc&
zAy9UWE@gX^<V_W`ccFZ}()@GjqVK7Uq13U}%H^s`<vv=a0rFFX=&%5Jc~rASwt^Ji
zaVTp*_-m_cuYVsKWXoSk(^0DY=Jr12J37FFvmuj2uXXzSP9sd*e8Vdfx|wC%XL3m|
zlPl_LnrhP_h2Ihhucr|cOVBkC#C0B=z?Z*FPtM4poo?Syi8K<^!88ytdauwnDNUVk
z>-N~-)n6_+^KR`7T27bBR){fjK(;0ys9*W2O**@w*Ov?dGgQ_r5-vokLqW=xODBh`
z_C*Uo{we|Q<v=dVLIL#~7@jdQ5{OFOTyAjMe@jjH#QG$farLrrw7Zhb7B33Ho~*Rl
zq{@Vj3}FF7-h^W=8o_Q)#xu5qdU-T1@;izak;DVo0{)X`plpB5glw&1dvt?<d4W4O
zxYqn}FW3|qKx&nuue^p>z(C{gVmbm3HY`h@syZ5zXGEkmzXsmAg*h$g{{pCb^T}Uu
z^RR-ZnM>PW2;DzR>@tIwyTZ;xjhQiLZg0I}bO0gFJVDAbvO;h}9>r~bNtWc4@9BF)
z<+s82O<b{1ATFY@Qv6w>lBqQl;l*E_cqCMT-=mO?&{)zzX3$gwJahzNWg_Z`iP`(P
z<c1wR@;aXjn*(MOb`o4<-nb?c9x^6$f&A_;z?Gam5t58W#~Y1vWc@0#IfqgA^qw2K
zcn`QL&dscYuF@!fptq}wILNdO1rFwkRyK-ASep)lEw6#&JTHS5FC!tPr|bJm7huy|
zAzQi2JC+ibXBp>%!zJTST&*regkfH^E#}epS+WHc2-Xjxc&g$KP8c<{^_r<8)~g9+
z8Hc^buC)5>qMhk2i&ic0<_boi%bUSf6G?grFMhnbA6M-ve+UOgn~BQVi_4vdjk>Ln
zj}yUwSZz7h?K=IxuyIiLfp6d%D6o=Ha$dq~3{?+lDEUxIa4I$O6^I8)d7{;?b1qAr
z>ElaoOu=vep;Q*O5ss1BE^?=4u4q)9+5DQ~ueJB76N%|w3_Pg|5Z)+(+^>iZ5UFfs
z!YKsJs)_eZK`(smWUAUFZmn&~_$i$xrwSzza+-#$KR9Y}p0@0RmPv}VAk5Z*-$jZ}
zSXi;k&w~*Vp`lD=jPfUGV!PpGr-HmlK|+=1O+LUjfU6Z3rtB=Qsc*}PEkBb?O}rzb
z<4+#Smm@N)$~gE-q@t#c9iM$?OVP=e)UgkXPWm{X#&Gx-DhHwlE1w306`>b{MA1NY
zd01seARua~Gdsd1%1TS=X{Jd4C~K_lXYDLome;J*lb#!(wpXpl%etN_yT1N{hU;ab
zQ9asxua){nLMC;s+S5524sO&~Daso||9r$<qgr00I?7mc+?FG7QEP;juIS{^m${?5
z-XMH7M()W-d*Hb`GFU!PzL;lN$B2)8iF<BQfGWwHPME@56rn28CbID&tFE?w=qQk0
z2Hw6k^Ro=I<ptOJwJMNk{o4y*O1JGRC8gI7`^ixn4kkNpC_nPjISGHy%w^Ia9({nk
znDFEj18;-P4#?ewg{@UJ63VjUFy|zZtbTg>ozREMj&XB2(@81g4fTgX!7TXy6=Rd)
z0j~U4fBh@O_Ec6X28#IGPlE6KkcR5j^min}X;K{{J7w3t4k4tz$3*oKqNZbx!h;D~
zKRM6O&ledBgXjM6p?7l9(dZ&Z3R137C*|1e^X!3*`HqAnBRblT=o3DQ^*3<>0Adg<
zAb5;gb2X;uu)+Yq{@;)~y@BfLTT4hiq@l5|ul<shhldB~Q80XIOH7<?_kIM*_HICx
zGVxg(k${X0*&6)6S3Tcjgm3v23AYi;fPeHFWGl>>vT$;8#^s1Fw1^%8x+ggk6B8E~
z{<rs#i(V;ph@$!bCGm3&sD}9jssz15IXKp<7NNvU)@VYh{BJZ+e|<M2@Tg{k4L`t~
z!oppp>NQ+MBY-qNK9ex9e+AT$kC&Q&0!i!biYK7z11c|Za<@ppO3F$CTCFZn1Bi}^
zfdUnlmC4G>1LaAvvhP4*zd&on>QWpx)6}7!QKWpZKUwew8R$1jWYT4%rx(>f+&VdF
zH|$6IHU4~5GpbCBlfqD}GUv251wNi!6iZ|;Xb5Axb>K<6!bS1gUicqC%0WPe8zwEK
zb0X*YA4IBN<PFrxc@4IWH9f*25)C>*|A?nO3T~|r{8mJ01>P*w|GWeQR>;k1M7)no
zWt_xJzu*vq_s*a|l}3ZH0Cm3c#0HeDAllqe8d0B@`&~vx#$n<KOLWb!;ue|t-lg|r
zX<s4oZ9q|Hxyf-CNJl`Ry|3DICd&J`bgw>zb5{D6(YoWwy5slH@ej2|neJ65V5xgU
zl_jRbxrfYZC;`#2vc{NI7mAMKvH6rX%YF@x#Eoj>Av6*hQo-jkKxRa^D<sHh4EihY
z;GLXS{yp@@)R$LRt=^9=#<G)~syt)4e`%FkCj6eU-)3#KC<6E^cF(Vi7+~86uQjS-
z!*bi_lB}lB7G0)5>2J|^>B2v?Ffx*HA;XkDkWI>48j;8I<su;$6<2k+?UhbkyTi%X
z1oATS2Btrmx0N7cASjRV=&M)SSDOfG+sAU0AO?<V)gG+$s*xKPsuT7fBx;h*wp$I$
zOou`-@iS+lCb#VmzaDiH=b{Hbl}~H#|0Urb2Y*P038Wa-{Ht}L0T7in<edfizlPA#
zu@^NA|F25url!;0ohRlyD>}uF>pa^Sr9oIDPT?TpS}!V7u?;gU(wDQMF>=xF$_4D0
zCf30lwgD8L;mCFr0V&ItuYlXFnpjfoVR&@wNWAYxa$8C~4U$E6ga+|3;yr}t*lQij
zRGe9fYyqD?=88-pD}K`>8qF5c_BaXzN}G`C!e%;_-&5|)^2PL%t{bkuQFLar9XAa+
zR4Uv7!Ii~Zh-|E=Ie=5=;5TJ|Fi%@)hsuG9=cO%fEfgrv0Y~>^jhiO_)*kk|la)Uq
z&eZ1SzB|Kx5&L2T_TDA1{9a?(mw--ep5ywQGdaVuRm>{^M*dg$bOdch$IP)Ibpx_y
z5M-d~WNI#Y6_t|qe{$yH+c6>2hP;L{C;&x8&F~pZg#!-ICUn*Ss`)yCjT};GcTu>@
z5ZhYt4G?|J2L;3#r?p2cr)gSE${W=3OSeM`!d(Tt+Ur$5l5&u{=3yCrR%2tm^5D~?
z4Q5)l{aVrm#Gh}i_|h)?Q#4lc0W-t9xM1gED8x(qyCb6yrHbe{={7PKo?*`i_pxpL
zbWX^R9%H!j=Z)HPJG`NcF+J3_Y-rSUV!)tc<4pw3O=~`mL~do;w5bI-v{drb4s_n6
zUT!RDCvWlhTR?%N;yrrlE4&2{iUUPx*zJd<b_6f!MSOjA3sihMT%h!$Hts4=r3)N=
z2G`A9%AOyrhW%Z;uQG2&pjW$#<wS8>mkKMsG-`Q|@o-&C{D%OF_cp7gg5_=ZvD&-F
zZM*$T@L+mQB-1N_OG{<?EF*iAe+!rdZ7ZCR>lsj40d4bvi5_Ew-$Z3vtS+A5`)`Gz
zXpDm?8bu&ve&h1Qj)lrhL;Nbk((zI*P?Oa~5eP{BKstC(!)!IM*t;pLg;-Lq9$pCz
zS`#ANkmQ3Wya1u4o6BF^<~_PkK!M7jlfz&nSEXdvbxJDt-N4OarB~|S>s_uRjXaSD
z6-Kf|%j}xY)_qdX{N^~Q<eRi5<z{<>9)HB;e5*zjcnSRap=@0(>Z<I%5p-7aRLCf-
zjKP4$m|?AfA^}0r^sFz1E#f{=A0R%m==`thrlR?OsGCg7A%mOg!I_nB!OZU{@x)m{
zpkSC3oXr%&c!DK*vsC=M?w-8pv#<Vt$nvp3vYn!zO~rBxRX)C1XJzwNHac>>8?;1J
zaXnj@Sel45UuSeQT<&GX2VszQBLlS~mcb8o+C+G~TaCq%PR}fv#_~^xBC9#Ya;jAK
zHX;7!bU>LRAL8@>g}RxdzU~3!r$CMBg^jLVp?zKNZ2&Ua_>Z)W$g~Z-%=1>U9Bz1!
z(TY{~x*tr0nN?0bAV<hb%+7g0{u`1Ee1drYSCEXsPpV;MUt+Ce4;0;t8!7%WzGExJ
z?rT0dKgGE0e)aA@wNJFv{eYLxXqF}AiG(-|Bo&##<6<iRVx%yWC5@FCi!vZeb{#4<
zN$D?Wxd$p*kBPpGuSSe%JDYUk5mTJY3V{Xz<u^dT1U|$4jcl0#L^$`IOi&#P4WJhW
zr9plU=9b=`(Igz+t3{Q_z_a}6?*eZbt4zyE>E(X(5PiRP1F&WUwd4F05NSA!AmhUH
z()SRaxmKreIPXy4-?TWU37l*+-<pY2&4sp>7BmC=w(P|iT8a(oG@j&OA~v+gdN$Jj
zF}!UtS2-}6w8|T;q5dh-g60^&!XLe;><shN-z{x5FREh?d}?TxtfuP>{t-#_Z2N#*
zl%e`)*=4e5E(R&H^bKIkf==}&YeQj8R)2YyK2=U^lK1&G=zRBtU6axf29Ro+Xc5zk
zie<hoBN-A+jw)*!mF;)_BYU3E|E4|^>Fk_I`6>as_&eQD<_u6mC5$So0$KArd;P7Z
zJwk;#KYXYJsIi+QP#G;ZVMN%EA|1fu&IDs9GlgYg0;algo@s*PVp*pNK<5PgNNNRC
zB{cyZK%GMS^S1<EiVgE0SoxPP@q;i;`rJ@$Mv5HmVQrwJGz8{gV?K8yr0T)VBmjl@
zCH|GEDP=^D@%IIJd2Svakum~6tB6Vo*jx^1bczxo@9F8m!o-X=gb&D;J(CEq-ai&9
zyXp>n<9T!B|F^LwP$Zj;jV)ZJD6Dtm;o$+OAwz=zW=Kd+j}RdTI<F`wC^k1Yfes&_
zElfV0huxrTs{RUhc5cp9=Zm<wIBmSepZMF0iv^%vt~U%XiirE7y5o&7a-ad_fMzNV
z&^BUjZVt3k0<Cr8)*VilJHtS;DItdy1|lp)ldgq=EkYcQL6zFaBTw|V|8gXohOJ}E
ze<`3V((0D}RH2QB_dMu9;lYdu4F$^Oq2b|E^@=ePq6L#*MJUtdTpb-tN=s4D&_)Ia
z_xVd%TlN3>Wb&AzRIUHEmj3@VM@C6FU7V9})zzuY867DX*qs)@=ZgQI%9w@Z75{+b
zjE<Zwp7u=I3S;xqV^O`GwY2Ru%NRS$-`-WC+3D$sIkWF-^o%s_O;ra#OC!()VZ42M
zGx}$En7HbqnxU<?Pnty!^iM9L|B+AUnWP>t0u(G55TIj9Q09!AVZK{hA}d=D8D6wV
z*tFmb`h4q<o8v;<ghuCd=Xd4R+SUdI0wA&(*WTSb%01N=-gEKMeT+~0=#SdVDs>Du
zH#eYT51fPc!;wtv=apS?;Q|krAEz`Sit5VB*aSd#wouf82E8f_&|GZN#nkZx4YET|
zJGcMRB$h$?4<dpdluG_WZ0EK^uj2O0pWjea<Ks&LXM8*lq8WVdtb&5R^S|pzpU<rP
z+>6<qfZ9V}HO&;funx4p-p-%o`1ttS%n$}hGmJcxF+<cXAZm_oKB^z^LV@ma(!=U^
zab}(7(^_H^U!8jUqv&IIk-NGkr;8rn=*`8#z-RhA6umYM0>7qC8Bg>L&>#PTaJ7>L
zApp;+6*|j4dEMWNHG8m_RxkH_)DaF!uU<ac4>}Hw@-Mhw6^X(xMo;OhdEcp?cPb$q
zym=X$CwL>E0JYiP##nU+$hA%xEvANxeP=i)^2K7fzDF~!Kd)mieGp5P=W;9cRPs=T
z8w>z`MGbjcYTF?&^dv<?0YoPYH(aIs=j3?Y3Mn>*j`;=dyXXNux*|qqcgHE=>Iu{9
zWvctb|5ex=-cxPpr?O*dI8FF2MfJV~iS;OJ`x7@Erg-Yz8L!xJLe3<1IiQ{V1E4@j
z9}r3dr_*PYmY+i9E}sCobibz{2*SE@Y8>hcO>(>4q7PJKF=zk?O>X}*E2Nl*R^4J*
zNrw8rXHc=}gnbC%W&v6{A>Lz|$M6se&(J}$b+6RWp=W7v^OM{=jnXMS12a`BZh@(N
z2z<#@$<59xy%`KhVw<+Tl5}de?uWI=R-Y&<^-Ovm<T;s>kTV~TtpAzdsZuYmADc(;
zVLdtj(-IR8LhMJcX#KMm`od<QifC4)=q&`hM5x4n#gptXFklD$50-}W%TiWBK(XyL
z*o55=+FM_W5`IU>6&Wj#H4{Uj{-b~rjQytqI#gVq%!V>j&=*+wg<I((Wq0>rZbpN|
zMFuh;^+G;85;ZBia#yGnoTW6a7$jFGP-n2N+@IYqfU~{kek{evg`?_^4~!k2TNM$t
zJ9UVa!4s~Yy}2-;pTAZpsN_f0;LHRR#v=IlfBM&J;^Hf%_?4ZSSq4uTZc28gZ%8V}
zXpRnrHTilsUG!5=KQ@b^WwwrUrIli%0?2J`eiL`|Ep+7fmvQ-q6ktRkP{W%=Qa+r$
z#5BC!li!r4T9yMeCwJE@z5t=FQ#$<1qLot5(rBjznzH0H^a)+MgzuExP5o`vwQsju
zXXxC^(#d;E-ZFlcFlCqb+Ch`=p!^3zlkCf0<i7Tg(~VLGa#y7XP1{hIDQKG^cIX-x
z8cQHLA7Il&bJL;Fijuu66vvh-s)86Hta5K2aQ$BRN$BYbF!2@g(JAkM(f=gQ(&Y9{
z065)M|6AY;0XR3t@Z8X}9jKVq(FodESjr8mV10+{Zt4eYhQko@l2IE9A7dY{qRMt<
z#J@vP(1BmxK|_FR`PnltTV=e!hHML{HYT3B8%;%Y{fnUE%Kt^w{Qs-E%LQCm;BXL+
zN|B~j>_Hx8^2=_?hW$$zi}`gxeroU(`rf{yc>7{NXT3>L?ati;#mg^H{!TeSrd^dV
z-N;k{TN@Japfx+88UroIQpJE&IBP5z;+R8hOaa<x?q5#OSx8A{d!=UBb;!nKT0!|y
zq2bSHg6^o<2WUE)DVhr`3`@o~r7ddkFrbakYF#TRoK#JD^<kR^!9=R;A?1QQHAf-7
z+IyPH)BC?ER~h#4eZU_7FGDF+#a|gwt*DCx`#+`4=3Xg~(Em={tf8N)Eh71hhKEqy
zb=Y4%qa@*m52n+rlgS2zP3QMD2V&?_hibH{$D^g>B3!I*R}Ib1XqHyHimxmu)u%r$
zJjw14Wn|!{%N|dpi)W61gl$rUC=|<1AvkrfJ$?UI(iGPDA}QQnBVRb3l{lE$S4(RB
z9>Wgc!8P4~c+`j`P;uGN$ct;Le}&xXrn>O=e<0~iIl5U7J{s{qjX&wked8WpD7UVa
z*~@D&?nYq4z}`DRAq)lTLk9uIZ#%0D0=tcshL!%#o+3Rd;m8mozg$h<8Pv=st+UPi
z4t#!_AH*AMcg%2I+Ynd!p+b<#VXVW&*G}1WxA^_PHFurT7IqKW)WpatvKR`v)0H|y
zh8MMei}ygm{r2E}*yKAP&~2%pRnn-*I}gjv`ZZ*@__Ugo!j*iW3b-52Ki?Z2(;pbk
zHa_`LowWua;_IVnUT+Pu7@%GP8faI7AjACbCcB|^q31V^>A=#5I-qTVW-~o=+BG%(
z0Qo85*rJwhdS|O_S<$<iF)xUkm3JBk1nAHx7X~u;n`ZpB-RWZYtAZ4n4WS4k)Uok&
zem#{86$ryXYt6TBUl6Kt0@U<=1q|ie&$In(fq{h{q1q!#YsUFd9Jaj)XqU(Tpyxjf
z>i0jSP5qBh`t+4tL0-Fx#|$_qn3I!}0U<~=6ER_AtIC?I^x^{ot4V+5Q22lTQ16eR
zh1C%+6VOmd5ifN-Io{1j_wy>$`2muoHCEK{FX`E$0i-u{B^@vIK}dKC_7Ej?+_G0K
z*oU0K5P|`#k8^`j58ort#DcQuvwxg3RMwqH)9U2V3DA|wqi(uVdUsnU@E%7g-9J}l
zE^l~HK)&u%(?95B&Q#s^Klqt>f`>gJ^gLzvjP1-)R58rjrbyv8k{dLMEmw%Y8dVMk
zO^UCxT#G2Q=fcp>{<h3eV{+PL6Jao{U3c1%Hj<lTH*6b%GXws`F*cGL@#wus7{p4o
zNUunwc-<(opc0u%gfgNO3e=WnVfihx6afuSoFkQe!Ji|GZzq6wlf6gaHe>FpwGOj{
zN*5f+RxstRNqyI!8_l5x;q$n@lt!1+Gnn}DG?#q1<0C{iul^|Zc`N~##S_qyloTF;
z)ZWoTsNTW=T%5zZP4uHhf1Y8b&2r6w<|PNlr#1%SBOmjUN~BNYI?+lF&SN)utZy?j
zS3;8uSrEmJJ>{*wsYsQy8&{S&orcIJnR)i(nbu2gWB$arY$Puz+<~I{sdLs^cX5!b
z1orODx{>A;gPwfM;qvqRaKz4;Z`#mUqcnB0epL1yr?)POmLSZtDNKKtna+U)^#0Rh
z<%q5eWeQCC^wL*bp03Hh(Sol!l)S1%D#Y4l@`ss-9t$YAIs)F$@xg-TQA08B9X5Ui
zf<QXr&pj7b=@(U9lj+CHAkePEC#M(MGSBc>Gz*vU50h_NKJilr51%ABVV)_3RmL20
zd@28#KEl25sC{XWP<^EB+y1N_m8Y~@F<#y-Ywau*r%8)$dWz)3bWar@Mz~;+DZr!v
z10&Z?IBD4T9Zpc|Rq@f_hb!f*ceqB`2xIfqhnHn=x7<y+eb*-tXd6{*lmNNz_2K$F
zd6paxJ+A)e?D^7Fw{J?_TXGEfm4}^|Ny5W8*J-xI24r`Af$_H0X@-3-@T<}9M(zjS
z;Ybg|%hb&4ur=OZRV||VI1v-=#@Ed{@q&Xz3vkEH?T5*g876b_oZ%m8%+61I$6*eT
zBafGNwzn*{qpt;WGIKE}Y@841*-HbKQSyg^WudA;u|YX`H!&d}mXk<)B_Ga?+QP%!
zspsT!l`z4^Ca<Px^5Oh-Q9^rV-EV*;pN#6lIQt$2e&*ycOWB{|sGnY;YiYll?x;m1
zVA^?3(oERiSbOm;iSpp`JjBbu<S+$qk@I36-(GL#nh5t?AxS#stWMF8?qEIlzw<5I
z;h(mp&S(H@Oz#sb(;cf~_$m&R9(7BD{31_gB5iz(4JCp4Wu~p~;%<cOmxFILw?1<(
z&&@t>=~aI(v}iz(L!?lL;;Z3(u476b<2F?-ROb*4z7=S<zHa`kY~ng=x%V(gF03l!
z=-4>fs%%=A{oEAOTZbOVf2`OvmsOu#uxe~Dh~$H0?0k>ega}d?S5Tf;E><iN;7cf;
zB;U`Kray{t+Sd(MIcs^<@oL!8=s>hNv<gy@s*-^%S+kKebs2~3mc^g8zGSx-<STcP
zo4YQ}?Jvx#Sf~I(*Fvt6QvWE0S;UgU4;RH`ps$qNO1TKt3TUmK=J9%f^y$ePA52+L
zlB+`v=_-WuT%X)j;+Km;OU|ZJV~}+Z{TfDAV~p!qlFSgvfd-v%eRD|`@W;mqDQt1X
z4LfnTSwTl*Gx^A=_*zaE<&*NRYVE_My<44zwZdFIG+(4&<Ke<ZES)iv6)5<$&6-Tu
zh-z7V)nVY*>hbv4{A6|v8WQ<uH$+gDQXg!Lq(pRit4LFfeDR4fo=zH%O$J#(#f!V8
zit?KBaJ*ed<5RKc=icM&(do63jXxnCpE@qeiIZ^$-nm}PaIyxn*>8wIXG9ZqC?e5;
zK-iepa#yLc$IcJuqs%r_Q|Ff_2`@aq2g8Oph<G{>nqH*#EGv=G*1jV(FUsu_RZ)y9
z?phOXzN<dDbO-qfjy^1w4v2|YkSG-CBZvI9KYu>CXt!zei9y@LeiW4PeAKe}VxyjL
z<IG{{WawF@HWPPyH1pHm@r|XEYXOt($8RdX*T&Z&qnB@2iz*-5g*pFFF*BLafU@c#
zUSEEF1NMvi!z?h@uH2qMN?yv7?NgSSd4GN;ll|ucPA96d*!LHq$%GoPsIteSreEl}
zd??!Ln}~ouJpQ-AJ)dDWU|;Xr1Z|XIoMpypRYFpITHQaEtR00Ty-5JZX0u<n6A9lW
z!n4A3@~f;OhR{>|$E_$6=WE5t$yn3L>D!CT^ixU)d&MVY4$E2}-J^q^OjGeT`--dC
zMCWJ`9wX<CvEQkF)KA{$?T$_FqUanQq{??xS7g&~<(m{;7&&3UU3iy1e2J;{jGUio
z@w7diq*g`KkF-}M7tW8jefiy74()r1f8{}eM$k^;Wg*-jR7ZD7GVy9)LGNS2+B;l?
z95)*hZ)tvBd^=bIPP*Lky)AnMy><KeIhi<trNdm>uaGw}F@H8^(GRebxJ)K`E*611
zOXWwz;pAAj81{by<CW9;LBu<zrfrZNk+PRJrk<<qJ-TgzJ<WHw8sR${=`oo|tgL*Q
zb>;4(qC`Vp@vV5NIUy97&4c4oVtWcztV?>^ZBNg|(L#a)>s5|EB}P{1w;EN_=cDa|
z)BSEZ&{l%6c}fM~p;Xx81#HWBfXQx;k0#H*-uDw`?sMrb;GYtH;<g&O*gsSqSvq^J
zgPCqESa?+&tR*qWo2V%?qt9TqDSyEtNic1x;fqO`TBXRlP1<H|P$YGUrdXDw82X(*
z5n@R7b?}R;2Py$^2A4hAg`+bO`?xP=M_lkP_{00@*QP5ZMHzA|&bnE$WswV8eI#OZ
zCDf_qj%2BonpCVOtZ>-}tWX{s9G%9Cvx4RRSzZ>cIINev%>p>PZk;eKJ|>f*#MBO{
zvQSc9rB^bbSQltdL2O3PnJ?b4@yD!(F1x~<&Rh1f=6m!n9T6!4cQ}VWD9}bfRl>|R
z#Vg6E)GVwt9ig=E;oxm_NJoeaoaHS(`$h3iJX99i4KnA=3b3l0fq0S654|0u$1C-G
z=^{_sX0#k`JYnlRfGxKsLAo<;2K8&4bC3oO%XeSDT`7Ixc4w~>Bo)a(uj-fX!~#6Z
z4GmiGp>g|%7$MPzI1nfgpCp(Q+ig#`6BWVEVsb`%rVcC`E;&ZB^KO^Wk!sh>SZUvO
zS-XX1bLT+Z27)}eJfT|<k;$}Y-g3XydaOYMB|o;q4JGdvF%;%UZMFsXjLXIRk=3}v
zFpo+$r`WQrmf~wb8wH8#(zuq<MEa{M1u^#v5DI%vwUIAFd=%4598W8vMxQK?&mC`B
zAC8bwFY9ZIi%O;G@wc}E)8HdXUh`=2%n9v51u8x3piU?dvgL(O(<A?d<w>EhvVH%&
z|FA*poa;wgJ7kjJeyF8W+ez;q@F={&t6DRvB=M_XHsT)OOWXcvQ0|i)ABcBi?YW=<
zu^Hr7wepq#XsuoHaL^AoEp*9L`n2J3JwE!09QPv|S;DH|JlL+_uDHXraK|a!6o0~f
z(k9Zb5ifIQ>P3s9Vq*aT<d+s(vgu_A6bmRd%IT_CN7zQkV-nGM6#};|dO#pJnZM5k
zC<9ZZdr^=-4&Uhq%8*?fBS3>%m-|4V)}^OZIMkGG%%(jc{x5tsqx=#6X94f3Hhy}$
zBKbLrnu*9ZZNwB!XBg<?>l1kM^&>X<ICRSrzinCT>s8<BGxD9JvyH=(lqc<{&QEM^
zPIv3lnFH7pkjaTBzP^bfG>CK5Za*XOF4m@yalLCDqNfih^ZSI5V)xONqzhaqbU3@d
z5MzFAgLOis8Hag)SibD~`+3s*e2=F4#hAjnVH{gI_n|~$%6MtIR4EXMCJrpQMefPl
zf8JiTa$<<4yHtYo)gl@{29uPvXWITJaWZ&TkxcdXU3%;HXi;H#%|=}-NAb;r*%w^G
zD~m)mKT*~*uBWHQ?G+nCuQ~rpsrjX^9Yz`pfEQ0EUMf!ctq~~uLPCajLaN#s%Kg>0
z4_4OE`6_FEK5^fC(M)v}Kdv9i^=R&VJ(x7bb%FYdI(Ujr(6EN<?F$kb8yMg|*U$>S
z@;^u<#2NTac0VW0d`v$@bSGgwBgmUFB9SL@V(%~MXxwrc$aLe6Dj*P8rrFA@OcK`Q
zt`MCP_GIyZ*>L5FIhk13a6D;iQf{3}Re$|Ev=Vn?V37cqhsH+D%NoCV@X1i*Vir%p
z;?!c?y|LgECyh@aI`4pgUo1_YL<IslM*sbnSvJ8p1pF(u%4dMTT2{}HFFvL4FR#*h
z%hELo&AnM@<e}uL?Pof-p*r*3j#_Fv$HS*CtAH5fynpFs+rTo2L?_<zu%JZT@<H2L
zdzhyp1M4|IpNHcua-az9F)z7T!6brny*Jb4QISzICU|c#B&4!{wWU`iKuIQ?%<%HK
zp~H<hDBb~5s(bnYQ{sih5HL03WiPtv^yd^u+v)im9!c8TMLJJ{8YQD0k^)ia5X6{Y
z)jpJ`D;L2qoSA99YCfrmF%K;Z<LSNjS*$~Zp$`zU=Wd{ScjY?5zo1(FE=!Fg%j(b@
zE&Y(^feV^M{5|Fmlf7i<1j5U@FKAOj*7YIed{ZbvsffSc9^1x;W;Nxm<Wsm%u&MTJ
z3|cx2m5+vME$K4B^+fCaZfvsFX7C{)lUQ}g(^ptj1kxKRfye&T;4ylfD;YPtJnn4$
z`GsNX-LEA0oVFn}1dECsmTWWhNzeh4D8aCXZ4}J*C{3;K(^3(#j=fovUfKN>sW?rD
zaT1x#T<_+B9wEN6D-lR_u6ZMiJcvb3GNSKI)4G7iW7HB(NM_dr#mDoEB>}ZjmppEL
z;6`7S8jfrR;dx$3QUj7`mT;GA!tX)-dFP6ro}N7EHeC%ap|F4RAIQO&z0>fJBRf~J
zKvkLpAU`I*ee-8%6#3zkjzw(z!u&k&tb{OM%2Gw(@ClSK{W>t@%nzQ|Imz1Igab<q
zT83qP#m`_|@<(ungZ39CuS$yFICldcJW+ZgN)10L#iEA6z`($al4<~<x|UX<RgsrR
z?*pUY(+6f`kSNJqiG4CRXeuvf1e$<Y@L=HITwUummXr)#T|ECd2kyci8IcR`#b-vT
z#IZIv-=Af8ej^V&$0I}(=y(Bs!V2T}e}%46v$~-P>hhO@D1O8SZU%)MxPp0r2AjW4
z?zAXf0xZCrJ*NMk`#%5efoBEEDy?F%FhEaI7lF_pEP3)6W*xmwKe_iiA*RfgS6cYY
zS=c|^ff=>3gCY7rN4%dNYqA?3i5bUYv=l@ndn~+38u<9i*pP+!HZ_MQhq`Pj2rx9@
yPH(C>eZFo#eF-6|I7v#(h%cWtzQVquvr37sd|!g>0}o*VNs7sdmW$|p`#%7XRTMS=

diff --git a/doc/figures/fpga.tex b/doc/figures/fpga.tex
index 02922a0f..21901fdd 100644
--- a/doc/figures/fpga.tex
+++ b/doc/figures/fpga.tex
@@ -14,15 +14,6 @@
   \definecolor{myorange}{RGB}{197,90,17}
   \definecolor{mygreen}{RGB}{84,130,53}
 
-  \node[fill=gray!20,rounded corners,
-        minimum width=6.3cm,minimum height=4.8cm] (border0)
-    at (4.5,2.0) {};
-  \node[fill=white,rounded corners,
-        minimum width=5.8cm,minimum height=4.1cm] (border1)
-    at (4.5,1.8) {};
-  \node[fill=none,color=black] at (4.5,6.4)
-    {\footnotesize{inter-FPGA reliable links}};
-
   \node[fill=myblue,rounded corners] (tile00)
      at (0,0) {\footnotesize{tile}};
   \node[rectangle,sharp corners,fill=black] (router00)
@@ -123,16 +114,16 @@
   \draw[arrows=-,color=mygreen] (tile13) to (mem13);
 
   \node[rounded corners,fill=mygreen]
-    (ram0) at (1.7,-1.6) {\footnotesize{off-chip RAM}};
+    (ram0) at (1.3,-1.8) {\footnotesize{off-chip RAM}};
 
-  \draw[arrows=-,color=mygreen] (mem00) to ([xshift=-7mm]ram0.north);
-  \draw[arrows=-,color=mygreen] (mem01) to ([xshift=-5mm]ram0.north);
-  \draw[arrows=-,color=mygreen] (mem02) to ([xshift=-3mm]ram0.north);
-  \draw[arrows=-,color=mygreen] (mem03) to ([xshift=-1mm]ram0.north);
-  \draw[arrows=-,color=mygreen] (mem10) to ([xshift=7mm]ram0.north);
-  \draw[arrows=-,color=mygreen] (mem11) to ([xshift=5mm]ram0.north);
-  \draw[arrows=-,color=mygreen] (mem12) to ([xshift=3mm]ram0.north);
-  \draw[arrows=-,color=mygreen] (mem13) to ([xshift=1mm]ram0.north);
+  \draw[arrows=-,color=mygreen] (mem00) to ([xshift=-3mm]ram0.north);
+  \draw[arrows=-,color=mygreen] (mem01) to ([xshift=-1mm]ram0.north);
+  \draw[arrows=-,color=mygreen] (mem02) to ([xshift=1mm]ram0.north);
+  \draw[arrows=-,color=mygreen] (mem03) to ([xshift=3mm]ram0.north);
+  \draw[arrows=-,color=mygreen] (mem10) to ([xshift=11mm]ram0.north);
+  \draw[arrows=-,color=mygreen] (mem11) to ([xshift=9mm]ram0.north);
+  \draw[arrows=-,color=mygreen] (mem12) to ([xshift=7mm]ram0.north);
+  \draw[arrows=-,color=mygreen] (mem13) to ([xshift=5mm]ram0.north);
 
   \coordinate[] (south0b) at (4.3, -0.9) {};
   \coordinate[] (south0a) at (-0.83, -0.9) {};
@@ -282,16 +273,16 @@
   \draw[arrows=-,color=mygreen] (tile33) to (memb13);
 
   \node[rounded corners,fill=mygreen]
-    (ram1) at (7.57,-1.6) {\footnotesize{off-chip RAM}};
+    (ram1) at (7.97,-1.8) {\footnotesize{off-chip RAM}};
 
-  \draw[arrows=-,color=mygreen] (memb00) to ([xshift=-7mm]ram1.north);
-  \draw[arrows=-,color=mygreen] (memb01) to ([xshift=-5mm]ram1.north);
-  \draw[arrows=-,color=mygreen] (memb02) to ([xshift=-3mm]ram1.north);
-  \draw[arrows=-,color=mygreen] (memb03) to ([xshift=-1mm]ram1.north);
-  \draw[arrows=-,color=mygreen] (memb10) to ([xshift=7mm]ram1.north);
-  \draw[arrows=-,color=mygreen] (memb11) to ([xshift=5mm]ram1.north);
-  \draw[arrows=-,color=mygreen] (memb12) to ([xshift=3mm]ram1.north);
-  \draw[arrows=-,color=mygreen] (memb13) to ([xshift=1mm]ram1.north);
+  \draw[arrows=-,color=mygreen] (memb00) to ([xshift=-11mm]ram1.north);
+  \draw[arrows=-,color=mygreen] (memb01) to ([xshift=-9mm]ram1.north);
+  \draw[arrows=-,color=mygreen] (memb02) to ([xshift=-7mm]ram1.north);
+  \draw[arrows=-,color=mygreen] (memb03) to ([xshift=-5mm]ram1.north);
+  \draw[arrows=-,color=mygreen] (memb10) to ([xshift=3mm]ram1.north);
+  \draw[arrows=-,color=mygreen] (memb11) to ([xshift=1mm]ram1.north);
+  \draw[arrows=-,color=mygreen] (memb12) to ([xshift=-1mm]ram1.north);
+  \draw[arrows=-,color=mygreen] (memb13) to ([xshift=-3mm]ram1.north);
 
 
@@ -359,33 +350,20 @@
   \coordinate[] (south2c) at (4.7, -2.3) {};
   \draw[arrows=-,color=black] (south2b) to (south2c);
 
-  \draw[arrows=-,color=black] (router00.west) to
-    ([xshift=-2.3mm]router00.west);
-  \draw[arrows=-,color=black] (router01.west) to
-    ([xshift=-2.3mm]router01.west);
-  \draw[arrows=-,color=black] (router02.west) to
-    ([xshift=-2.3mm]router02.west);
-  \draw[arrows=-,color=black] (router03.west) to
-    ([xshift=-2.3mm]router03.west);
-
-  \draw[arrows=-,color=black] (router30.east) to
-    ([xshift=14.4mm]router30.east);
-  \draw[arrows=-,color=black] (router31.east) to
-    ([xshift=14.4mm]router31.east);
-  \draw[arrows=-,color=black] (router32.east) to
-    ([xshift=14.4mm]router32.east);
-  \draw[arrows=-,color=black] (router33.east) to
-    ([xshift=14.4mm]router33.east);
-
-  \draw[arrows=-,color=black] (router03.north) to
-    ([yshift=2mm]router03.north);
-  \draw[arrows=-,color=black] (router13.north) to
-    ([yshift=2mm]router13.north);
-  \draw[arrows=-,color=black] (router23.north) to
-    ([yshift=2mm]router23.north);
-  \draw[arrows=-,color=black] (router33.north) to
-    ([yshift=2mm]router33.north);
+  \node[rounded corners,fill=myorange,minimum height=0.5cm] (boardrouter)
+   at (4.63cm,-1.8cm) {\footnotesize{board}\\[-1mm]\footnotesize{router}};
+
+  \node[rounded corners,fill=gray!20, text=black,minimum width=5.25cm] (links)
+    at (4.63cm, -3.2cm) {\footnotesize{inter-FPGA reliable links}};
+
+  \draw[arrows=-,color=black] (links.north) to (boardrouter.south);
+
+  % Is the board router connected to off-chip RAM?
+  %\draw[arrows=-,color=black] (ram0.east) to (boardrouter.west);
+  %\draw[arrows=-,color=black] (ram1.west) to (boardrouter.east);
+
 
 \end{tikzpicture}
 
+
 \end{document}

From c1a492c9b411a79dcd99a0866844a73a46487637 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Mon, 20 Jan 2020 16:55:30 +0000
Subject: [PATCH 03/78] Board router now has interfaces to off-chip RAM

(Untested.)
---
 doc/figures/fpga.png | Bin 17084 -> 17166 bytes
 doc/figures/fpga.tex |   4 +--
 rtl/Connections.bsv  |  82 +++++++++++++++++++++++++++++++++++++++++++
 rtl/DCache.bsv       |  62 +-------------------------------
 rtl/DE5Top.bsv       |  30 ++++++++--------
 rtl/DRAM.bsv         |   5 ++-
 rtl/Interface.bsv    |  37 +++++--------------
 rtl/NarrowSRAM.bsv   |   2 +-
 rtl/Network.bsv      |  44 +++++++++++++++++------
 9 files changed, 149 insertions(+), 117 deletions(-)
 create mode 100644 rtl/Connections.bsv

diff --git a/doc/figures/fpga.png b/doc/figures/fpga.png
index c05a5e99609a3857048baae3a4ab2a67d6efd9b7..2ea48fdcc2aa8e3ffc60dd4168ddba0d49ff1cb7 100644
GIT binary patch
literal 17166
zcmb`v1ymhfx2C&scXtgg!JXg|+}$m>yAy&-fZ!H%<L(+DxCM8I;O>6w`_F&6@9EJ!
zy8Dh>BY`4e@2Xn0YR+f9@0>eQSy37Vkq{9C0-?ytNT`B95ct6U5j-UD^{!{!ANYpg
zB%|XB0%7s}^A{q%R}%sRA_vJzh^l)npBi}Tt7$!d;>kpo5(}#7OJvrAaQ+$LPQ}a6
zHMgwWSlu{(X?tu_Uaf0W-sE^}yRWy=)~HKwQ@forlvo5=7d{3pBR2H%>dZCpJ(PM?
zg!2CRa%caq%USEB(ByU2b=KpzT^Bx)V95SoL88$h7{u8G<)0!X2ryAvFuchd=+M@I
z#%VXDPt0&?bcprn(-Kj=i+e#$GIYxL;|W9NSJOpuIG?nt4TbEGzMQSLqN1W+G<J6f
ziiw3FAhx>feI`EBaZ1R_IypHJ6ins{^(bFYfQA$`Fj%|V8-)PrH9K?$ygt>q7v9|5
zpb&BM^71}BJ~HdKTmbJ&L$}-Ny6<ta7~Hp&n3N<-t?KXpdZ9kAyj=NWLHAD2#>OTj
zB*e+7F88E`Gm<M$jd_2*(d~QhkSwxV80zNgD&V~RJMn{D!?7ptfvwC5F1UX%KR=(v
zsEe3@AU!v?x@CC(x>WtBrfh%Fjyt@MB5Bye!()7W+~@Wd4rOa=YiVie?d^@1SdlUz
zt@!(ZE%H<23WaR~x8pi7CMF~(joo6T&SEso9%cXdxZVHR13pBCdOAnQ7r0pY8p+SV
zrl0@=YH4o=7ZsV*E}E2A=hf-m#!1j5K|w-di^ULfeO9SO;pq<z4`=MkmZ0qU%+4hp
z6%(T#cd5-Zf8kb0(V!<&q$9DPF!m)1Hd~sVq+N3V-?s-5%Nnzh#~Zyc!+5Lrn{*s8
zMpR*Qw7^-43#q!Bhg&qVS1nvL`}8QgX{f>jye-oJvQ&zCRDNS)V?_lMH+NHE8Dn=x
zeMuQ(b&Rd{go*IHsNLuyIc%n+l;+UU(NRoHj7B*y59`QWTu5Dkx76UiL>cuHqvQCM
z&ApuKphTOIS_c9F$Bm4PboTTlYdM^@4oxuBEO@ma__oL(ujMEmLVjRqJ*YLvh>Rsx
zDQ&NJWaZITT2<BMeXYqH(UG_jHOrW|tvza02d7$D0#BbHW9WO@2=(dhJbVoG>uU$A
z<_B;2c;}CMN7dD9xHve>x(x?|W?Hf4Z9W3idmZxz;iKs9SK^tJ@~f%pQDO{9-R=9q
zJ&o-t^Yr%Kpb%JUY+1bXG4T<%AO3G^OQe>&G6UQOuTL`Bg0*ZC5FoqPy*zbG#T_|<
z=3L4%Nl;J+``KQHCN#&Y>o&r+AvB2BwV<@HY&7P^!fs}6G6B`7jAPnUll#(@%ynFb
zaU-XcQ}u5<YAS^WnA7uy65qSbc%^3$IEob4nk|g|6W?@W$X&q9_egD8JxoxLW$^Q$
zd1`-T6~AT9b($HAZmHwx)CYo<;Fvx!5J*XSB$2)f0YjeT+Hr?+ChUj0=ivt{dGgm&
zYX^ukPJ^7I16%}2RacdC6@_~TdTBIi%&0<}uh5^g7%<)`bC;Ye0z3E6KnG_D*cu)m
zq?QiT-oGN&Bj7L)&ceA<7-~k@GQmFGXKc0@!-9Su(Hl`Fhd*jl9fMxCRqUG$UH&FJ
z3lY<^c;NE?-I!gs#eHJ$I`lZ&0){Gb`FSRv1pf1MR>yLULm-Bq!NPQb@7utE3qHb!
zIUD5Ji7<Sk-^!%1+8HT<jXzO9<VnE;9$TBE!x~_AW*0=sI?XE5E)@$J29BmfU|PcR
zQ;d3$OT$ekpGSOf$&)Fj7mAH3&^7L1gd@hy_|zl2M+4S_4wv^24i3uZGP%&QJZ7YY
z2shEj{8bKbuHY#WXRgo>yqsQQwL)8SgjX67L5{|>xr1#R{El>*xtJL8+`Z6g26b2~
z8iRt=F>Z>f>G>Oan#5k#Q1QnP=Qx!vZctE*@ql9{B~4iAAzpv@a+Ttpc|D!&zV|mD
zz4VH#(c<>W?9GGY&ua}sgA-55F0{WD@OWy5ME!PE{g0?q>zgy$U?z1Z&u7Z-I`LL{
zE`41Ek#gPMlGh+XPW7}($X13O8~KTrXLlvRsZ)0!K0UR?NN`Uk>D48>B3Y2B%R0S^
z3PdZ`HPYm?&BJfBsw|T3Srs%<>hsT?#u~J;6VR2N@Yk^H#%8yEwG?R^<)a$tnFwpy
zX|@3cNzfPx_S@grx+HU&mZB$Y3*Ag{QNHO6o~W#SW$tDXSJs)VT_`Dz7T4EMb+@O2
zg))Ua<WhpUoU|2YL^PDcz>m>%m`PxKP9{M}u+ykp_Lcz?Af&gcoE(VoE76gVT~!eC
zKbm4f6nx0Xhb~EWl}9f~Vhy{Mq}q{z=ITKrl_T_83h=j-!^<6Q2>2_L&IkTJol2Se
zH*Q^0z$M0}ErJgd7V->rcZ4SR>C<SiI>ThZ5PkVR1n7IVD<_wifZE9Mqi#i0>`FOZ
zFa;Chsmo`Jt1F)zv5tD4OhXNS3ez_DiZ|Sw&yij+D6zj9Xdpr4@N`dqqHNQN^_jm!
zU-)ys$x#|_f?hQs^-)0}N2{aj$#Cpvb8^tn{k$K|92zP;gD@Pht<8(^S(DZEE_zf_
z9e<`;&{wo3zoBy)KoiryYR+^ZDAo_wATsb<<F-g984O9BoUk!HWQg@LM|eVlPTmWq
z*TY)qdMr%wT_>~$dq)(nHG<09(o@=tNLb{f52sL_vT%uH?mfY=XtnN*%@=O@@=?Zd
zPI<8;!(YB&oe67pIdt6@I$5d`mupcI?G~x%?yBoQw<y<N>AI4-t}9)}pfwtxpBH)*
zX4(XEAo-WZ$|Z0ayy8~URJ0e#DwnFWq}(}Tco9qdHuDm)vHVRN%Fe@bC0ZDN6EN+u
z*8?kI>adqT6fm@OqgJ|X&dIJg<$db8S(3!6srT}%2?JBZ3`a*z9B-{VQdp3M4qZX#
z#iCRZ@EsIcG+>mwROi&&E^nK%4X%5Ft*Yo@GkDddR&c!*OOVG?0f9^-^(*N>AQG+O
zG|1HxE(hZ2sp!pMDuv`UFcu63Unl2&K|z)a-=AGc5NKxOCExB?3OIFQdba5auUvj0
zFR<v=g0Y0Ov8gOb|C%XKl5<#{9!nL>#BjYQVGptweDPD|ohd0U&!y@QXXEm|16?|o
z#S>^af`V8zq4BBo_+u}YGP`8<!umyWS$R^jfaEYPW26$mEycd3Axr9WE)5B)$lfn~
z-j-WCZhH7g^N~?o<r27R3cZaP>(I(*Y4SPZ9^?9<@<c)O`+P@DxXiU>Y^wCL8+<t9
zUnL}wK?-`L7tUp9stk`N{w4J1jk6)3raiP{@$Y5(G@<Nh4#*sKF`@#Nu_h$T*um;d
zZM;*rL-X+0h-AR@cKT+zd12v)&mkPMG>o;-e8=gEtRpvRsiKWcS=V^}y+Mu$@6k?w
zcV=};Z-!~}R_10r*e2Muo|b5Z<@E6P`9ipi#T9qeP>m4b9k+y4@i!bCv@v2Sc7s*W
z_)t_7nlTjiom>8)*0eUVxf1R2OnONdI*-APz5q6zvb0z*q&xM$cVLbH{vt=5Q)%zI
z!S{b84Hs`MDMDat%>Rf)5ne_I+e-Wg7Xb#bfNvUox+0VC??m?T<|fGK`T3canmX<0
zcSfzM-&;8_c*DcP6B84CYC7UynvY1V>8$4!kPi<JiMZ{YoSZJMuk~7-wu<CZ<&}qA
z_a{ylYoWsXKG4!8$rL_4J#Cd%<TdE273lDYws}(~v;@Att+jjJqqHkX%kc0tW6>%y
z>C{QI2ZVp^3j6KQ$c&*XS3mJT=dI=EC-H9=7Z)cdc7cI`#Hh3hGV*FNJ6y1@%ctKP
zWyB@x9c;L=s;kfMPFE=4DHCLT?@ns#>p6OG>4A-l3kS4$c9xTp64q}TUfC5k;t*Uq
zRe^@ioHgJN=3(5BUBcx<|DQuv3IX4A*G#4!Z|uk?6qYD}x3Y>A^<-ruxA!}J@54@w
z#U0L~&6K~1nTU!8BOrRezxnsOD=3WDXw0K8GxNuXLjLPkq`}^-FKQVWoSvPbg7Ak(
z`>YB~EW2b=Cim6oegxYId^4F<sFReG6crT(5I=Qhk&c1k>iYUCxH8tHjrWu)(jRU4
zD|rc6t8V`{R^gDWLXvIQ`7~2<eo0l8u<xDSP3RJtzflPM3Y%YkgrVhf_4mrk$|M?v
zHUK(hD-X(da3Y2X{+e~N%?-}pW!>90Zv;G_RvEMhJl^t9rB|)z8F;)E^b1_07e2h+
z$?vuMlBVGm@qvCVE$N7$1n!Ue74r58v`I#L?2`Ky3nvW=x*hyk->~I=Rge0S<!!P0
zL#b{&dYTwiV>4Igxo%)3_zw^jvOg-N`k;d|tL?S}a}6yFqe^XYNV@04yqW+k;_@Vy
z9b0DbXt@J<rySeEk$Kt(2hTxMBoD{0_Y+RK-MSXWtSH6)zVZ1hrIt)%h>|orBE6A{
zPk&w6Ehxdjm?zVA?3B(HdUAm4e;>{_=MP|2Vj0UT@!`wb-+&qXvD$$0k~RH?=>eQW
z)#h4~p^rS}U@D7Y?1fPH?6~t%yKO3sB|PCNdXU->0hUalZ$@TZc#YApBUeGti-&CZ
zr94l!KAEO#seIKuIAL3xl|8pcZ)Q;|VgX4^`+tO`<A5<Me_J$4mz257n?#Q4nBy%O
zXBBDPf>JRyyyU%P=UAIIAKt`bRq-e@gHvz@4e6Z6eW$gRcnpdu-GYGJ%vCjR0|dyN
z{m&xHtk8RK%osy~rop3af*M_7ql+(`VO|;`r@H_K=UTAh5anpC1Iy->x{`XoN!}w?
zmb!yWo21jwEFM1mq7D{pphZ)c3#Jo(*dAlQl>_w*$#*J>iCTxQ2!uj`<;0FGI{F=A
z6Pw@vW}GrAxCkKKWS<OMs?1MZBi=T#>+q#2c;*W9Of4k0StrD2-!kqgMr57^5hMK7
zP952`a^%Wxn;*ja4AZHq0JU^Je(9$@^Qz?`XXznFYb40_lFZM0|Mc}P<Z2=c_N~-5
zb}6106h!px^q$OlRx?x%+~)84G~5}U_5PxA+oR7q^TSmt=4&>}<;BKrbCdwfUL#A+
z_8r3^lJksY4wM@?@v&>OG{MCHTW}<Q;)ivniI=A_Tqwgl^2yK-$QER?d-RPbHB-TE
zIm-OPyE=SsSN+tGQgRY(@XCz)Y2;G0M##s}QUj$$COC56!b<JleXVfxQ<qId-WW!F
z;yQu!^qUv~BhDRa$k1e3du^sWx4~cDIKL+sL9XTcd($w1qt$0`A*1NICTCxq4?!oY
zu$?~*6iej|4JXwiJnt;kHaNYQhC#`PdZ`5R5EWiEtp%P_al{1~ERf~h7`xg$w!ij3
zpbL*;;deio+FE2tP*<DE;@PRZhWdqbPmJgp^BvXx^>M}Od;|uo8z;-}yE!69kw0!2
zPSntR^W%h`asv8kxHAPLo4m~2-_On!iO5WkZ5+1_CDO5bh_}Ru1QMtI(5(7b_^tV!
z%WjPo@@iJga1YgI{LiUE-6G2ZLL3QNK`Z{xgl1pMxFg;%Ht}(%=~7pc(p|P6<h$&n
zE?wfO(4z#>Apdql=s@3>z|1aNGQHM(v@XW{1m}+?50%?-di|`HGoRk_)Sa`h(=xBq
zY!&)Fuja5a)tJ8{v{sh#y7$yb@!4kj{;&}CZo~))!hxxdTonAZHqG?Ua_QDs<>pI5
z4ZaMPKP4scv8bJ#t21?zC86vhzdJGSv|CJ7rVs%JpTB`M)DJZlIQ{YCWV>;5`PIc@
zf$34z?mrHNsTlTagZ*}jkEU!fEv>PNROF4`EURg;9dOa6Qc1xVgmhFCrlGL`aSeVn
zL^`C7FhKGycp0;N2$uMD=C!gRlOEV52r8|pDDnDDKaR85G&>p7CSI-N!ku^+_NXoW
zcz7tIjU4(jBl&o<@mg@aJeP3K>MM3`D?2W2Tb+(MG)ETcj&aPyei=2|^|Gsj;LKXu
z5fn0hqgnkByvul>{Zvn^$zLy*z5z*L8RzjnhNa7<_xxd&2%<j|7H{xPa&ClM;kSA9
z12tocMs~Q5vsNnbnv6Avz3pcx1SEA}tov#bot0I!x!>J-Hk)U1whR*s$+(*i*2Ur0
zzar4g2B4>bMMd)5rHvl)>NEPL;<Zb@7r#o3dZsbkp9;2$ONyrWV=t)GnT1%(u4C%T
zl85Zkmw%GUSNA^4ge4{yzeT703|fpDfSrmrdg}AH$HgW@BQ9f1+74Ph=#+oP{X2F2
zYv(^iAMCOM4{^nM2TEl*H!B-S6I?M=4HaY;zMssPh}b1mc_Jnw|7aJfECyu{tv<I$
zCVC<O7Pvb(1y#SkzG7iw0-TV{sMQ<#0Ti^mYi?z=4=@6dQg7p^(cuA2A%)cl6|~sq
zetdhp5MD`D1O_`U*7N||<fJ2@VF2&TE=n<(EzoQ>h)Rr_nw2F_nQ(M`91gAvn|9#3
ze1-&JVqwk9%!rGNHx>r&>C{1hBq*KV-(K52&(Br@xkxD1R#z3Xcx%mvF;AaJP@f1f
z+vl3*Sr{0O0I?|{A>r}yF)GtfIQMfAYPnC`xTPyEF?+h5mDR%PYK8wFyApI#(wyw<
z)ARHAzAb9ZNNYsvj)9*>?Id-$*4KZ>DJUpFLG)_HOnOari}OxhU0o~9j^X4H5|pP)
z^@9LMLF*L)+w815D5$;NS67!1H=3mBiIAaP<6Op^osWF8pQS{lK#HD$TaB>?S8)db
z53biAqkHr#lsF4KQEzQ3H>rkE2xw?pegcsaA<5=8ao7X|rNle6+DvL5g#gPmI}tX%
z%ocV9M4Q5&;(|+OG*z!cWRBN6efvCWD0q0ZoSCr^kq{Aq8}WR9&RAt@V-r?c7xu?t
z${RpO^j{*AN*rkyKO}yo5pvv#kS7nLSgrvJQRL+uwLF>S;UxUK3Y~Kc?xw@-2DG4?
zBAxJabuI+dLM@amLMHmSWd7lPXrZ$I1|}}MLoYDZWS#1P1vUObFNLH`BePR?h*jQ&
zp-c=6pJikKdJK_(eRb*jt1J2pT8ilNtH7c<m3Pu}>-K23UulYnh$vM9Q(ClPiKRO#
zxT~qN_%zGp20SgjePWng{UWye41DExEjZj!C#zqnn=og;#D4y^;(|^wQUx%FZ}Ant
z0>Hw7T)Pq)24*~itNdq${e2|A#O@g54xnjpoHyIa-?CK)neL4wuQWMU<30ih)4I~D
z-fadHs&ipX*Imj&2^Hgot-6zN;%gUbrW`kh4NRT3#}S%00V7=UQ71vqt-AQYfRKuR
zI4C=ObSxs0pz7gedjbDn3{yq}v7P_x;mGToGBp=f{|R4kjeGCG0j_m^JA1``(`{<_
zz{5)iyTiaK*s_pyC;hBA_p^YvDS$CwQfhIX-QA)c@I2F^H8TW!o=NlLu_u7bE>kSE
zKP%-VjQ-th#V|gU5M$K_3M&6cQc7f*7pF1Jwvz>cGGNZgk_>=gb(GfHvmb*ZY?A@g
zMr|hW-6YZMKh>nRJE+X%1QRY+zv<G{EVkgAW^gM^%Z%Gtx7zjNQ!#8ohJ&noGih95
z>FkSssl<o4+y8}-WFpT{zEj#swtX9-Ll7PO)=7ge+}`;w9qBk=fL7I9;=|mU&v3fd
zz^zkZ%c9OfTA;seyw1VsVzKXsZVfc)>EEvW%*(+)smF)QQI7BMdKTQDh>xseSi%#2
z-qolI5SiMci?;^WaWcQ#h!TNBXrN+G#jr38qqD>kDw4~k%@)TFxsdz`R@{1iX>r$(
z3;{r}CjhKgks3e#qaUShux-3ThJX93<fPg<e2Cjak(A*Q{{M@DbpP_D@xQQD+|+-v
zRVG>V{~=qYz&m?yQ>7lM(Y9uF&J`L5Jqvf*Z20#0V|DWk)?di(8!a(|Ks>%zzn%W}
zKQl8}yX}Sq+KxQYCtSlCX2c(n+C93bv(Rhc?~5<NX<+IH{+Y;6va5^)S8?@vI$$@I
z3w_Bv6*2t3m5`LBlV3YnW)fq5)$X))C`$g%<)bJZWKHaj6v{tw)suFjzY~`EbG!b7
zqR!%fVV()97t5J8a*qCQ8IAtSVsh|cqMORnf>qN^R49mInU*A0g3rQL`aGY6?>|rt
z3c?f4#Ud%p*@W-==e;E~K8ssH&@Vuky*TP!Nmn(TTrh34Up16OKsHhNlB&j#l)7+@
zy`9~;bW?VOgfSy&O!V!i*5}`5c3+^2xiCN=QIV-|P5sf`O&d|Y(DV9BBTNV=xPxlU
zO&1Ud5TtSgg$oz8VIe?5Xhj{_rBR5&ENN^*o&5p3egWCZD}-#@-{Js0yUT|q^yeX<
zh}sZ&f<&Gwi@mY3EaOdZ)ZM0;oT6#kzeg@EnQ32eeGAn?KZgb_zDpN%qByIBjQ;-k
zH>)=8TXI>wlvMr$C+w`E&N}m{lZmc;yhqSZ!=gxAw}8E0L}hvBH!zlkDzs}TlKqK{
zlz67oeeIR|btmmNj-+kvPRb?nYMmKM@b>cHw~XY4jO(=PfG8`!lG6|`+D0|KJMO==
zwY^bMfX2FglOXkKv$4T4p8${lkCrsz+h6BTn|mVg2>q6H)u>Fp;(LWa3Wsrt-^h7P
z)mbstpU{yt`!TS|MUu=<OM+@=bK>|P0Q?-CBm6(`U%!pgG>VgSm{34Nx55U2KeNj`
z_dVUH-<1s&Gz24JPPvKLOuDrR?h_?OjT>R!xx|N%;&P)e;K%@Ny2QvL{<-ph=}R^9
z<NXN@_}@+8ON^YnO%?W4Rwy~=$L8g4%l{FVHf;>y%%ueAIW=Zv(yIdEZsyf_sQhWO
zr9KwWtwu#DMRDK~T>@BENVBxaRL4d)p4(VEgrR~42`Dsd!EUd_!#Ro)-^ug{a#_(n
zR?{!<aV!lS%zD?*&6p9Rk2w`Mj|q(pE_d9S0;|x4@q6H0ed3oD;+;h_$@ybzcI*SW
zqmjoDwUKZ7p3u#YpFffvvucv-=K(rXIJYGE@imaF1X3TNT)zU6S<b`&fw2fcc~Z|a
zf!*sLe{q0%{VKjLrqJ(@$U@ia`&&u&gPGA<vKD-re{lRdl?7f5I3zLZgDW$U(oIst
zo$5yFb6nr+5f3<fWmwB~YEdBoSe{L-I}|QCARJO;M#HrzOx>F<bwHfyC#cBuwDRo`
zSn1nTvxdDsATFR3jv*i$SZx^sC(&{n2lsAf6^HhV02A`pLRfZ=nPh*uwLf^?tp|h}
zplaI&Ww0bdX)JrA<;2x&g7PCdT^qBEsl`RKhVBm(rOLltzyD_cA!73jV8#!Mj}FB|
zLC!5GCx1OEG3bVNbZPbD{UU~!-%i2$-|fIKfG7bwFzmbK+U!TqGkv&ltLTlF{}&T(
zJ57c}|5qmD{K0ul6ZU^+`vI~}Ci$9Rg3M}n;QQHHy9T6l66W;y`04LREV&e>sj7{w
zpVbt#diHXmv(pvVkA4iv{1r!h+?nHBPDAE(+1UoJuJwlgucG2!US1v^P2Js}P{WgC
zsEg?fFA2W}oO%rD7Ut#a@H7%mEG&3F-<_gEC&&~E`Q9xqEu|Yr{{Xg(3`CHVr>Byf
zT%T!epQe$xMf0!OFZ1rZsapE~!ZY=))NYlVh1mRV&R6yFw2KR;{$}71+OoxeWdH^p
zVIz(1XQ!85_VjT@0;}BTX3Q?!gAW&(71hA@C12(hP+D7A<3w=~zrn!301oKu^F7W2
z7Z+Eb=076cf#83lIN>jAGSJrbBld`#G(QQSi#b5hs_BCFY|RyMRu^Y`2zg^5&x};8
ze$IR5djpkxRct@u?zXn&wYARHg66`uV+Z<uQAsca(fQWv3NR&m{VQ(Q7*_tJEY);@
zK!|)OD>!J5Rd?=1{m|12PURIW9+}_2|1xI>47k^m??tMm43x#i#ZpRXuT%FQpmt%t
zF3mxJ03*?|hpSo@8xW{?9M^r6m59lcZKRuJRImX_7LZjb=n7+F6?AoVUw+Gi&o5wQ
zB=whm0}2w<9|<tV{P?}9oUuvWNXVcuE{py{-}?mNu-h%_=~=Qz$F+rU$cfLt)u@PA
z>@WbbF+M@-h;2Jgsvixx=Gk@x_pP;fAHTlOlFJ;dO))Rus2%1-yZ<e!UR+!pw|?i!
zIXv2z5lgrM1havMiGOHLjxF_C7tVz;K4qEZnIf@pHoQ(Zrib3=V|`PZtg+M+V6#2X
z#EO+%2Eu(VWN!tc(dZ)~6<a4jmkC_tX0{foQ-WkYWo1dUQZ>w~J%EChmOw=?>BgA(
z&>G+RpC%xnR^?nT+7XwQ8i27X7g5dAGUsLz>dd2T#X=Pa=K%^t7GNu9b#b@lz;OYV
zej7P|i&JdHE3>{-Qn&tREr3|UtiJl)&-_t9KgTO?Rj*W{>aVRD1!usQab_{&=ewy2
z;6lCnDmdwXgaRqw{$NcQasQOkSWJ*!a`<0ez9CIIaa2I~y5p4rgaUxh+|a32iI1W9
z9$mgWH>)e`{9CzVbhOren+pLPGkI9Tm2YkPlgA0<l(bseBMn3J*u%Q5olw=v@+cXo
zArgPw;P@$`nFiY){|kkw85xrsTPfhbBriRU$o?|HPl3K*fYkZliB<v=0>ZN%?sc@8
zk~#vDjI%2v=6m|wYyM}@(neZG#gf7v^VL5DC>+|L`0MbU<J=W?&!0C0MjI>%+MRs+
zg`$PYz;{XvG1qw~Ul|apK7Fy8l5Lv*t+qh8CBTEYGB=@HM=}HmRh_krS~O6S$p&!q
z-#ThJJpUlec=B2O=3QJu_k7Vr0ev6ZjRv=z3Tsv+N-^Qss4nHjS7BUA?O&@sd*E0I
z4H^O}5bC8P5&U2q&ENahl>#@~Z#?+-WJZCXte?|<NibU*v$<t)YmA~2k;v>L6zz!T
zB#YA;$x|95^ZJvC{uhI7mK}blmb%fWRVT~){nGvPLnQnFG!4L62JL{)K2FjJh_m=}
zZWylA61?+9DF26En$&l*wF-z-DA2E!^rx3=iv9t1c+i12-QNE~pE8&FT__H?aJ-nn
ze6XC;(3nZw`2T_{jlsznd6J6#jqylajMZHGZ2Pa+lMl)(Fqeup(ru-<y!9T`|7cVp
ze<WzWVgtiB;)WGk&4>b;QwAq9K5e#B5)#098am>5t}>h{zufMSea2bfi*E(~%&Ys+
z8VS|5cA~I%2vr>wYc>u6LP`PC^>BIYzuYc`-+vwM{)%eb*v+6^*Unc<PIL0a1GgxS
zQB58K;*9WKnLaG&2_}_x9?+pUu+lGC$ncIpdp|}Yw61&oIJTKM$_`;`nE`?Nw~WA!
z|IwGgw$?BG^BJ2x`S?nEGN%)WcMY7dw{1(3=mXqqi{xPeaoA)75-Edv!GJ|&sorVA
z_NS>RKx)L#4DgwB65tmBj*Fq7&X&>^n4F0r9`-8E#2z%QAHs*4jq$P&r1-%nG|<!W
zRC?#OJvgM(0r`)e)vm)kGAVVhG1lfgHQ!G)7MHn%UrYo@{D9nr|Ir4(igP0Yc6!uW
zcUUs;6W}?S#W99$_oe-VC`Fk7v#8}t66`=n0|sY+Hd&N<3=hQ*Zzr=f>zszX=)<bd
zSAYaLlLGgv*mmgR11?mjz>3+Uya$n%x~l!=I$Rmd#CVR~8S)BnO83X&{NCG7^EsAr
z!M=!#Oo-8<oHj6%mI?8Jgw{_ksABTMw#c?VF;LsZ(L@pQfPp2HjSwaT2ycuG!a=bk
z*4$^IEr<F0%XuQf{Qu5g|JwNvPfKO<59J>c@zDUVA8a*^EKrM7Ml+irPbh9%;4cJs
zOTgJ69=x3#t=XVuhYp>`l^ZFM%#cS?hL=>Q*C5HRLdTIrm5JZ~=O!Txs=K50e&oVS
zP*;Cs^U|w$PMRI#$`gmhmZ~<xmV%E=Y`|r$bHeSXYT|$ymzHcr5TXGYA#hfgWuYE)
z=;1Bp3J*83j+xxk?a(}8E2nB<ud~n@p4h=>@Fprn1##wo59}-bTWB?uS3XHDfNOu$
zov*W<fIVt3Q`^A~(rXAPr(-w@c!6UESozU-1X?j;=dPM?QSHmhu|MO?1JGxaHA!Cn
zsGG_;$PHIfw_=lUN|Ov%kf(nBESzhm8)u+*#d+LaVxuh0PJ6?d64_cNL7vhzRdwC;
zhYCz|-ihrIaojQ9#71xPt{ZsAsLoYFm9C?|aBt};$r5KOc^$=RS+9l+s<|R7p#PP0
zhp2o?4k?~OEmY>KtndDAVlQf<arcq63sBjqvcC3q?lfHgO!jZCIK4afKNlGHB9ks!
zAb$Prg}EGSByk<(bu~lS7>Kf=C1P$Pa9h<>l;1#e?(9D`zAdl50KCD=TCe`;Y*uUO
z2L6}omVJ={#Rhqt79e!h^Iq?FcliNJ*U$@drh#7tWgnrD(o4{uSz+aBOd^+$jmjRK
zphFrB9DSjbXM@mT#cRIbUQsX^vf$h&fhWF?6uv+EZakjsl%yoPqkz%~x!z+#xft38
z4)-L>yR!TC8Vf9z@jkuZ?(IBLGM_jTwzHk}peCh!_R`OAbX(}lJiVXjy(a;`VS#>C
zGkc`9NGR(qd)#w^ehJ+gg|w;4Dm4#g8qMdd4p*~KFSmJhbt=(dxL&b*q_k>S<6Coi
zg`cAu6Ao}*C|JR#T;g}?2%Tj66)pU7NX9jeQ)2nN?RR&Xn6cW>9H@5pL5TPexboTP
zh6QbBQMKL<(_0Q5(ft{@ZV+hsESsS9Sjy3sxte{0+k<7PzkB;R5E8^6T@B-(L2p;O
zt?_rK^7Nru@uD1w0^TLN`Ed*H+GpkHC_w4@{lRT7aKTfki}84Gr;<dfg73k2M;jp?
z2;+Lom;{PX0izqO-35k)Fu+L#?!O6taLHN!iqgo8^F?BT)q~_!WJB6eK4xUEW5jRZ
zfEomH9TPOv2&P2oE*|3MCjcpT1;Qk3jeYhDKf?H2Qx|xbCC_kXgV(rl&nb9jUk(oM
zo?p>~^Thy(F#LHIoMM642kxmv<B}ut^oPxZHOjdr!Kqggb4m+UZA_2|ODd21>=Q5C
zZIti(giz(nc9*e04E`zJ#ve`p&V!JVT)RqQA4#D#p|-@AgbQ4RE!MyG)T+)L&9yB@
zT{8K98w}w^Z2!L4o*ycV@aVWLz~}OUM4a6f=Vk_3Z}rJfehgtV!7fa#xs>lZCa>zY
zDuiLx$bE2{gzMcTFRFp)HKt%f70-XF5}I{6byE9_meR#GFM7!;T+^;V&nCq}gx=_C
z=37ZxpaCKzddcXFb#4AxpR{pie$0{(6<zK+N)&}M(+Cr6Dp~aDw1gw93Ig<=*1*rP
zua4BL;v61CMI%)R$Zh;bE+bXth4FW}=C;;opoi<Ttu7b~*`}##$r0x6Qr}?*#aHg`
zZI7pHagw!4+a%BIcF!0alf$#k8OH+JWLc;+s;9TZHG)i|_&_bsQ^bBDokuG(C|VAt
z)5$dfk3G>aW$K5USg3;6?PnTPbLp77T>OK#>5pz|(x>CwW3{5yom>`QKR=;YAIO@>
zLJJ<Q7nJ+EmfM!kj@mdk>{lqGo8x?JIz1x*iL7LASR1mA1Rs$sdEBFw2l0mB)Su1+
znjsL%$Rkd+9S=YK{cxlP;(biVWl}=EToQ)th!qOD$=odoXx;u3#{;hDGr6R#dUZOV
zN}iKlDbGo1bjl17k|L-$E>*eyWLPsO)7C_;jM=*{51*drr$&7^KJwnT*0aBst~c~M
z_0C9JjbKK)lk|7oi@<rzU5Ho!>2Uu97RcMc_5<i=M_$XfmD%BQUN2`ylfBLWftK+s
z*v3F$L3I7jayBD}EXrCPJ$Z<6Y2i|;Oifog+@1y$F37&;e^>X0@Xz{so?@v0uab?w
zcd>jX)FuXI(+CqNr6#NfmsehwN|c3-D3Z@Q$`PE)=7{&|ac@<7cQrQuU9@^Bg9M3;
z&WXP)n=ZiU*_){fuD{e#_#1gHt)11`MgAOhKKf)|khG^-D?e+-6OfR$u*GWu6MVLt
z_<|880Ry^enoA|~TG$z9JxrJ(p<(_Y5LbgrIEl}t=b1pdfOnX~%i=V0+lvGWiaSe*
zJDZOROVs;-7ubDS-_xep?Rt>(UimSxO8R*)x7*Xs@4^jUgmwRy<CpRY)@~t|hYuX^
zrqZ}t`%d1cVhJL?q7*gIX(rYS!^hu;sH4JEsn7fkhs9LWn_vv^;l;shEA6_wn?;9A
zlsQ1rD}%$8PunGLM}9vQCQ~RS1cWrUoCE`!O?jIHBr@#RzHN;%y5O4Uk*_hgt}_9Y
zep6IFkuKUasju0dmM#xjxh#H{b{}K6<`)gWsu#^%>@Q5Ne1N6BFCqt1_>%>Z3yy2n
zeiEBrGkjUH#mCfO!5IEP+5cd;)a+PpzM|t+RDXcDKCQVrH}7*y9ojGVpf3TfImHDE
z3ZjiF0@8jcweFk(c*z~9CF;h;ojE1SM^en}fDj}CW}i7H&LHTdiy&F@#O36;pB*{O
zWL^DK^--}^pC#!Sn-*;-K}@g!YPA=!yskh>{->^XPTLz+^cE%_XMgaSpqU#3C`hWQ
zE|1+ZM$vy-TUR%;`s(3E1u1<V6^)@jX=lCc3)Aq(CnOO0rfjw3#Lm_7Agog7cmIwW
zhL!Pk-~JngSbMH_;b(-Is7AY^aHh874R5fo4@}jap-^XsnHK8Jb0ex-YGhu0m%y_Q
zZMS$)RNM8(^61Geq;@k%%(|<0W$z0Y(WiBsK~xKWqy^b^#oz_RNs9Ftxwbi!@R7ak
zD$QhH_G~>tFMRRknb?R>fA`oy3`{E38hks|6U-K88VtG_DdsPB)vbW{GqTfjQi4Fi
z;gD+RV%}1FzyinaQ8EXFraG4aOS1i6I7X8q`U^~Lw&K2xKT}g*AC?~yW}f*~b^H_5
zX)ZlczS(mvnyH)Y3DN3#8jK8H{Fw+otK&5p-x}S((OBztdgqNaW^<nn8Ms>}k_>eA
zJ`G$Kdf8v=0|JqE@7bqw(z9bp6{-R0XQda%zd0ROnf-Hoyki`xcOHlj0{1uB;W|t4
z)xFt68Go<3&vL!ylh5t=-CwN)<DA~sYrC%%-~G0bw2Jo|RH|(;r`Oi4^e-9;7yE%w
zxK2NQw9?p^x9sA5%HDe7v-Q|DMFRSnw4tKy{t0+$t_L)Ek)=y!k94UIU$4(hO67fo
zpYbak@*>_n&knl`f7dklI^J3Rll5N*7p_Je8N{}`+RiX8-|z2*sSf#jX<5K0u%OMO
zuQ7BRmkeupR<WuR=SfsK%nA+pf7+qjns&pQWeWUvh?dK6J~0x?C7I&pwtK3pJuL<e
zt4CXLI_h`_#vo+r>n6dycWAh%Sci5IqoP5Jkzy$MXreHph!5|w=gItpr>G+{l8cc!
z6v)4-5LncQ<ssj%j}N?GUjk?O^vgkj;*hoQis4qhwrjk|z~qeyEB#q&a^Q-0fpfPs
zPrHEr(|~2%$ilWZ-9|a`<n-e33z!fZuIbMSbMB#@`nFbB)nF`;@`7$@MAF=`t^>A#
zIEZiy^QGeUm$8M5j@_@_E^&CCD3%UklW#cR6e+b2gk9iar8vX-jX$!!&o+sKMbIX|
zoA<-xya?GKt4u1eqeBn7N0|Ed_gX6Yx7_3J^~Msjy|v%iM<GtnOa;CSSxd8=Yk4;{
zaFc!Q0Wvu7;=GnzNgmVMhdX->E2Qr-3Ns11D*`TLrTmG|iImm0?o{DlOOF$B3f^1X
zYw|Cbj!ZCnOkai${Yhl$&RPw8=!MN`!WTSlMF)&|r#q#!XhO$+iQu;&Ub+)_ms4ZC
z>GpqNf+p|37}^VQ+Z%LD(U*_(MS?_M{Tg$Z>9ucy3oRNo)BU3w0{Vxv4jpyIb`N}R
zt-o0$9v_|10G@|$-ODEa&$d?N_z&n#5_<{Fw%_*u#`;QQd=m%Nd96RaXZt=?(i3&b
zz01eiF+f$|;v(Udp(Z5zdwz3FjN3ii7Nlg7+WWg;MS4_Xi-W^>Q<#6edX?!q*a?JG
zfi^t|kV@B%E2+^!$#H6OYy?085NBKA^<DmX$8h_q+v_fM?ph5EQ50R#t=)kQ#^+lV
zf!z-xB#MQ70Z)fu{<heWEW6a8=N~g4&2GflBl(wE4H=SN0(S897{mF@QhaLsd?iEX
zC&1T(toO!`7?;17QxKC8>cX0AAmh@|g#p%xxxqJ>(jKX4$P~88<49Wjun7qok1^b}
z*&jaNg8*5^DY`dVG&>Px@*f>=wlDsIMh?wOsKn|`1U#9XR)1wlUCiJt>~`wkt_>)$
zvcEC5hDCE~Y*iOewvV!gFm^uTkBt!Mb!0=9WsWYlWk7&D=Hsx$t=a`1c$eGUFZrMa
zt%rBfDE+o0ps)ItUk2`uFj%NKToqMhQ|is|;Cv;Pl<l`RCAbi;qR-p;TVCICL)yY=
z4rJogG&kuq!wz*(cS|qdZ(I<6CRTp7y|R=7$KBxDOEy`b+i2>1ra9Qt;ugyJ^hUx2
z%hPL7_%l_STDxS<o#_5B1YuN2CqOKrCchRM!+^(?NK<|T|CFaz?pJAmE-X(xhuPbf
z!}(6nRq_b6lE4w}z`H9j>zXw0lnBY1oQOKJApwY@Oo;+ZG6Ur4Xmu)4TjRF~&ONu;
zFG{PNb|>lwET=EJl;)Coo=E%_F5>xk@FJXbjdwU`3>PlQ+FZv;T<_^t3MOs{y!>&B
zy*1Q&?H`45Ifc{(n1kTgm6nBRKGrHl7X}dZgG~0w7Ib{f$oWdvq)ey{pg>H++-s0k
zX9bHiRmZ^kD#~I6UVg~qN&uNfhwuCG9#AR^83&p~b9y}WNoLZoPnq2%tTu-us90Dh
z+2ui%SO5jFKuCMCva=dwE}`la<1kKp`gx1jrkE(ZLWK5<dr|u36{0f*Z&eTm>izjq
zl^YwjG*F11&f?GBkB&bv!Wgz`MNvU&MJT|ws%&tqr`(3h9pi+`{%vlfGFt(T^^D8n
z9)F9tLQNg_V@l_4blmg4L>A5b51jZe-NhO0c~3OVxX<@K`x~6OCC`i<wYZ#<&3Ds%
z;C26*gd+^gL!wwP8!`AzjbWF^=ncn%S*{yj@yv*2n5=0{cuvP}e5X=qEbYhDdqu-l
ze4p;0O@D&d-oB|xYAI|5!-Wx;qH?NMffx{-8)$7>5sY8D>vJ5t(=_Eu>PJjl!gy$3
zKGOhK3h-I5-XsL5P-so#{W*eE7zgyLd<+U*;+kO2>rH@Hye_)?xP#f-wl7G2`SxLA
zD0|^r7{2nwo0-FdG_-#qTajw1*8K$<PY+E!{t5R-RbF<p+Rw~RJYA3F%lRL-J}2C%
zqYrc6KtWZ~p62<a3claXKdW@<7uN=gp#>^195j$*n-e9OVr9^sIrO0qgFx`T7dgA1
zy@(Aha;n&pxysMI5Td%MP)43D+}x>|VB6iNJilC82WNI_iFwr-#?z5`mU*D5=8}FV
zWc{$*YR1g8r)upeZ3gHF7v1@l2zC3k(2?!8>kA#GPg1rFP;;d;03<UOJ90YJ^d#C{
z&s|Y{c`TFsC}4A(e|{tQvRmXYVRKIoj7X;tJ~2d>_0|bePW8v{h~C;EFrc6#3`EeE
zNkwp-bZ;d(C<ue=dWoC*U~kxNP2{)UiQhFIC+w8E*%`1yxxQVLA~lGxWMyEE=)YXD
zP>aum`_=w0_fHE%o=e$Uxi>cJ#?QkH1?Qm^MUKd}S9-XphUmDX<zsLlU}-9gO3Iy=
zbeSR_E-Sa<kh_hm4-FcU5W<b4ia@4?TuT^oGL?|;o5El1Ws>!5cXtw8pT>2xSkUp)
z=-r1uB5k7%jG)lgevHL%Z$iu!b~dO<J|c(%Py4Re%&~mb5}-i(DweN+oPBD395;w@
zvrP6na{M&@`!*^$l$D`BSlPbGB{R&m788NGU(xLJOWQ+16<nY=dXD9a=RI_oOmK~M
z>Cb56zQhaOy8Y-s&e*oeol;(Mdd{8#e-tVc-s$`Fmh;<4TdLq1$c8Wa5W)yb&U)Ht
zG@9WMK;*~%P1;Joy@!83pDoTrTZ&1uin*_`{-KJ7gEDU>$NCYK+(N8@awPK?nknJr
zdXh~3Gj&H}_VEkzzeF02#eqU?m^cm+Z@-OC5&puz!%a8w{}@9d`cHX0|2QK~9*E*F
zfO?rHoe%I323$3Hl4-->pT!Yc28#y5CHkaZ8O-C1#Cv8^K&u^SvPrLiLHrpg>i+NL
zSRZlG58Ot~**R^<<ZpyLQNLXV^&d5$FJ0u6%TRA$KLV|9Uvvt*8iNOS5TGGrrPPF1
z(IBJ~NB8tu@Z2e|`%t*Kxp8rEBffdq+uNJg;xlUk|3-`o2l_dmdP8crdHq;|mxn*m
zMXe?sQPNE1?%|PN6luy^0JthY^Rf}*@a7g4l7`LsvmIueS3A7x%gV~mU)jtCYL-q4
zq~jSDSg{d5kdu=$G0A_p{Dy?bz`)?dOZ-hNozq&?$_gw`g9Hz+tVoqKyzYO$_WrW}
zsV$4AvUB@oFV)DLeJKou$at-}smaIdiy-O2?r>5(snBOx*~@F+L#C?kw<ktgT3VpJ
zkDXn0BlUC9bT}FrEG(>zx5SLE0s&>hm!Hgu2(a$Q^Oesp*OTs-p}`#=1O<she#Q$w
z(Xp_w@bP)iSLmpysNA2ejgE{!g2r-$+N-M0?l%IB>dc2%9J_@aR$G7~cqZ){5jN79
z_4TeVQ~6+}+$w#bWpO0Ay8F#XN@@V;?aRo>cp6DhNMIAOh5*ga&lgC=@&~*SJg!cD
zdhhVMG-W;n%cr}%KHY}F&h0=A#F0E-Z1qJ&Awn5uWM*cjr7f<m+Wi8~*``zElQetJ
zV?JQ#1zru)zBcLNvFPIu=`k@e2?_~8fS7Qj<!SVEb>r~aOo6j@5;r$7**Z9o$y1i0
z#zYNwb$2&tb}#_?AA_ObHU}3fbg+ntU;bt~ZaeXUK&0`3``MneO=zvHtutUHt;PJF
zAaZzI{DCMOM#XIYq7Oo`QuXy}u9Rfy@|LyQVV#`p>{oz&dUtmR+p%zBe8)&sQBi@d
z@?D;0=hDk*XJuuj{7VGTOu2F2larGJ3=D&GrJt~oT3=;ITU#5gQjWH^_QTVY&1dmE
z<xSBXIiPf}%g=kM&Vm5}mr45%&qGsPUB;)tH(*}O!fuaDeN(1WPiQJA_&TaF`B9o0
zbJ)Dj1*D^;1-JjR$-E)_9t8!(Mqt(dHLAy$t_=z}5=-jno=dO@_`2oMFTl)<3M7*E
zc{U;DTVi5jI)_!j!&Z0=0W(mC_jfG4`~B_t@p`}6e&ueQX9EOc=in$WD~pPbMkD40
zNt*(GHQ?jIp%M>GP8zkkSOCR?1_lN;Hb*1LjA$hM4PKYqW`l9of#*Q0sWFLCCbxop
zNLE%BaIb;J2S`v|eZ3A-N^I=W6amoaI5jmj;>pg=UaXMG?Qx>b$XI@J<EG%)?YuKk
zD4TS5w)VX7{^r>mim<o0M?ykUu2nrTHrC;^)tkX(t8N;9c6PQmnyQrc5#sG}kSx&e
z?Qvg2L&J8VO2hhTBKK3^_V36~?=P;W%Z)`vMgB(qz}3ZndVARF3qM|{?hbfuyHj0m
zcFb*V_B1dc0g*E^j}H#gQc>+ZT<!q(#BV2-=c#QQ&&UshfZ!2$&VW+n$G@5NR+HSE
zoG-^l??+lB#h~l;R7`aA3t;N>5*E!I0u8t%JPxZv2~?dO9b)3*V1>*kptO^K!*X_h
z9*7`S+LtkIZ-MX6M&ja7#GYGDEkKjUIZ*hx*5;0c%e2^NtEH~~!)32Bge>r-wz-*y
zipp%I$(|w5#>R%<=cYuTW#5UHmzaf-5y=DS(7=TiA2$ioW=awBjvQzxDuO``1v-1<
z$v)*|WRz)!wY0PV7CTTA0Nm7-$;rv3rN^&m!Vz(C0k5|U5FlVwfw`FTDX`1s;bxdN
zcPxWTjso1?{=8Ug1_cEr9ZQ&;n3%5(gN2RF>9{Tk60x(hQ&pV?#=KO$j4~I9B4^8u
z$3PPaFR{aHi3$P&g6H}ATz)o}?L3{|W*^!oLH8r<_`$E=UhY;Lfxz)&<$8WzlQcdx
zCkItJaWI~2a%iZgw)XPwuFL%x_ee%6{7RLM30ICXp`)Y2_;?CiEbCKXPLFYxus<PC
zv*fa64qR!EF)tq<eVIKk;K~*j7h}Rt#(mLmvLi^4K}JQ@Br@;SoL*RYf4NplmoL^f
zcDdRe-rn9`UDao0tp+-9dmw@F6%2U6_i(J$&dSWxQ&E9<mp#>jgM}4C=k&Sx5-E|G
zo_@Hq0~LrN*px@;MOSIB0|c>u;>_El6?-}pI{brQSJ7U*+A{XnTt!7id}N><NC3F@
zC^!xdj^xbGuwN<A;TzQX8J2sDKLQ;ph#Q*oilU-DV4xE7^?rl#$qbnJ7#J8pp>7hb
z63|%EtRhW#bKYb9oZs8qySZu1gi8brxYY#l0}XLdPfFgZlJb|fbM(;`=N-3P1Q~gG
zc{w>odU}<#snu53q@<)JyF1_WPOww**w`4*WrG<h@y$M)*V%MuAU2~;Lk39{9XA^I
zc^jCN5TK~@>Cj2xXP^|=Zn+^+g0j-6+fYun47q2=i5EAzXsCMw?-qu%I>%sOdfH_$
zj-*_(a%5;INbeAbQIpA_?FI;Qz_YRI#H%|3aY{u2)$(38zm4;W9vgABmW7Ea6bcMH
zfAopjLcW?G$Uwi2f!IH}9~4XU#`y9nSVUQeHvg)HUEjdq)ALC^ka6feGhiTr#zqbG
zMX@|(I&6jxc+f}3^$s<#qP2_G-U86(ty6Dl(CQ*NC9UNKlPm|@b#`*{vK3Ay{ICgy
z83~QnHM(~Nj4Xm;@t4r@@^W8aU+H+#%9oSbQZ*zbB%E70DJdzVOZ5@aN@V|PaWS!h
zSfbRlG<bm~cGxm<U@1}Z&*uWDGrV5(p=D=htEsB~^K5Kyn{q_N-0;_F1KE-AhyW8C
z2gm>Y^&STY2grv&FasW1U?)X}IygAk+S(e3FM_13+}!i)>u-PZ#3>UZ#PWc~`^V>J
zASw4!_-bm-A50a%K=v#c1sdq;j_hB5{rVLrI13Vw)lksu!dP6q4@fla?d?F}ck=ra
zFls>CdsA0e&g4FDLFG4oAP_KJs;a7h7>yArp;~&=^A!%r5*fVCl+pbz?(Y9|{R5TY
zh=_=gkdT{;b;ZShfdhW{@BujErlzK-AgqCG0Z&v?L2Vr!pfenZOb{Tmff!jOB~}&|
z2RpkCAV^i{)O!i812>pV&<iB`JwARC*ayaN*xbd<%?+R)pu$I)j)|GsqE=h0N<W>~
zIX@{$O;a<ayxi8?+gnFx$+7#j$#q|mmKKbJ$C3s7e_mb>ko<tiv#@YqUfp$ccnG8g
z)TCi`(vz@P<;!WHe4B*VX%i?X*Jt5jVL1b$IB+Hm;e7}wC_(`*5B<@20{r|TAt68(
z;$UY7AmD3Ah!gKB&xRjxr!WZ#eQpj^O4XOArhrl)HsF!L!^2B*AtH<|=9eg4nEkHK
zFelt?IdX7tplk>NK97K(pWn{j9!@O`sI)07V*q~JOdEE!LPt;UK2wzm_x|SQmjTsY
z7{(h`?&Ly_tezfGw&Fq^*m(2147E#ZM`h*s`uchaD?7+bFlX2tbtnO7RjFY`#O}kz
zM(iIMvH9s=*yW4_{ERrF1{o%hn4Fwk%GF4xj7$gs!K{NYk9l~7JdLfRqqOITJp3@j
zLV8M$moZr3Tvl{=Wlu3SVq+5%(IQo@@vjgPin@|$jiQ%`P$A?7Pwek##3ercXMy{v
Rzz1A_WF-|Ps>F;#{vV?@oK*k-

literal 17084
zcmb`vb9iN6x9_=QR-9B-tcq=`V%w^?V%s(<m84?Zwr$&1#r9pl_dTcko~Q5a?$b}N
zCs=>1y)?%h<2yfNj2$K~EA|N<8y*A#eUcCtRs?~-z5x3tu;9So`|fcs;2)f=xP}7=
zgwFZTFIY^EDi{bv2$B#MRCZlC({a&OQhVWNPel|J@~!IqMXLp7mldi~O3ZOpzOYc@
zRbC&hr=_M<u4!J{U~_75sHJs^pOAxfx<M?J2tS0+giZ)Ybn`IS&k{^Zx($uiyct8l
z&z|Hk>2>awahP^EMo*eF3I>ls0S%3)2}vQJ6&(l;bs|n7-$v6_{2ELs{sPgPCpT}y
z`A10MPf#qGBE|q6Or403UyoU|Fj;&OolePPv-{o24=n+QgE2c>Ti?pJw>NY&G->&v
z1ZuS%+xBh|d;6N&+M7$|1(cO?cQG_r$ol&F(Ik2pkmY*I-9Qvh??T+n>@2(G(iX5C
z9yZ!;_edqtC1=GRPNc6^8^R!kNXg2Q#fdvPIrS=OeE->a{W<%!u&=N0_V#vaN}l#u
zPnE)lDNcN?)92%Sy;TL=E*@=WeEjV3YF8?eX1Zc?JFAkYS}P5UAR;`x#eQGX$jC@n
zH)Dq^bFxsexVI$3uyz?2DMXrVb$OYUmGyXjk$@#IB*fI%`03?k*Ontp*dRp+yOQx}
z(K@6|WTnl`L0lXbq(AbTax{?^r-3!Cvhw2oY?TO>I8NN-{%>_{Eps;(C9tuwVuH5L
z&odGegZm9a$~%HbtmnDWlBJPgVdY^N&F3gY`q|bn(DCsr9-Yy`AgPDva4DIX6gdYB
ztJQmF<ASHGi4sgu)5Vx_DX|^#haLZ7rHcvqfjg06P)O%V%x&xm#+v&FA$-Q?^Hit!
zwLxj#sn+ABzA?S=jcKbNpJqgyel!%7ZJVU5`U~?H4MocE#KgpznVF@frJa$9wwmak
zk%=|MiXWUIPwFt$w1xp+EU~_75S5mcFflQymTN+RhTQvxz5h9-#`F+7%?(`k7|3PC
z%%_Dn-As1`9E8i~$w5Xou)glKRUlP}D(bEgb>H~)vq33?PCFi)f7Z*lGd+B2c4^pp
z{Ccb6%;n>Pl9H2!@>N#CYja%$#Y*vTL)y%4eDV4m;yhtuxBUb)q}R*ShvYNp7dO2!
z$?HG(yZR&ZdU~z2G&I_EX4A-Gm0yP)@0|0fn`v9U=|x^gk*&fR?DP~V;ftl;?=wQ6
z7tb{{3+wj5V6kK6Np;$oKU2;s+&?v%y8VeG`MT<OcS_=P**Aa!1**C`6jMy*jG{MQ
za|>?z3<k27YF4w?LK(B2yebU!Km~1$jSuXM$TH6)Rm-UHi#@+7mdAC}=EP;*1X_q{
zJ{?F0#W8K(c~B}kNh*0D`^Oc+H{f|1h(;SJ&$ef5mSmFXOxm2Xe@YH@s`&;6GQ(=}
zk#W{YJ1Q|eSeD~B$Vq5PwAT4mjW@(W0|~lx%%e%(NkQqJWBgF1W3m#rQz_K7AAp?M
zOgq5ZlAYZzzNd?mCuRMd@O9qtBEk07mNZ+ugiZX?As_APV~?3S`tBI#9pqbKE%8Id
z)Zlk|^b<b~d#ZePo35Aq=N;qxrU8Sjk21Ju1QQU5D)Z`_qT8Ps)*7TKdi+Jl<8P+c
zl#9J_I|ioNhVM~Uye8T7yQ$V8_|bFS@kQ<Yv@3pDq~l>0cc=HCdlu`**IN#4BI!cn
z<hAn$1kNl&s|6AFr$h69x-DL7fq|I5nwg;^b);!KQOyz;%9C1bpKRkKnB*`SrdoR>
znPs8ZSi;yl66!ofs}p|?H%ghA9pWf~!fJEu`jPF1o~CSV-y&=~H23)n?2-mLw2yH^
zhdr9@m*8(;ZIJ>Cjxc-T3(B63gy}+k<B`Vskq&vWRYt_lqZTCIubopJVYHM@A6i>@
zIN~v%3@ra#SYgFiwUCg-o|{Qs+vns(Iz(RAgMs>>n7vmq7#XEyIw|<(oSkaOQ$;km
z7pdn~M~wEh+19-#AjS05P6|Z$oEPX5xQVP=n*2nNc6yUe6v>;@+Six99a-^qN>#2s
z$l7l9U5wq+xUO8OsWOAQ#xr;r)7-8EYb8@SS976tMOTwvw@#VRmU-kXhZQGie|yP)
z>y8SsqTw1|NO8S(hwwZ!5!Cvf8MP8_d#v2J=)6B<lvR~;)728Y=zM^f0(MMySjvdx
z%+`n3sR9NfWqR77v~G<_EY3rf3JS%z$VRR}ziP*;9jr%ge62wgH0B-D%}t?0vQv?m
zuaEkI7z&rC#*f~|U-?u<;Yur_&!E{96;gSw=1eMFqcUuMNR)0uYIGu6oeOcylkMm!
zZo}z#*-r{CDkaPaD^Gp+hfowo5AifYbRZ7a-2XFABFkmX+sRm>EML;k^;7ZWW%9V0
zB;w41`wS88nke&@P);;x@N=ZSa8bW!{!u?=s!8`g%JN+>P;9yb3#%))(#Xl9W?4hR
zYA%Hz5e@vAy`=GV4OfOxTWuq@o~jp-K?`izJJzjum}}&xsIEFPa1bFZ#q(6SMKZ4T
zs7=)68Q#Y?R0dn$cP2187?5ux^WD?o_i!&J6wuU-{K#r@NzvA>|JQ&ut-oRuhl@!L
zn#7{7xw&>k4fL4^L}o3p9E#6p1I_4ZCQ(23lU$}~Y$M28W5bp-4An1Lg91|tE+9cm
zA7Vl6pX?O;NBbBK{#HjrVd5Q>g4CRtX&)q4bpBw?=HhK}(6YszU&!JR7@V5hY#$3J
zV|3$g3*m78QrM(g^QyC4=edyB9?RoUjS*)XlFF?cDsH;B%hsK+`p1>B_cH@F=iL^+
z5@%fkH9>Qm&qgP*;CQE1mr`}!Kqi>4*sQ+NDY9jUf*P~!RGN&Mi88X*Gz1gJH`6My
zngD}?64sm$L41m8IvexNJbrqJL*%f2(=QjPL*44?LJJe4s{uDF)|RK>CuMxJy%0)O
z&Q>!|^|K&2X(G0dlgVfgoDFliMtJ7P2~JZD^q}LNMI6icID!b1jxZP~^qYNs4hZC^
zoooPaTf5j~CoV?s4-?IzVvr$=A-s1xlNJ+gKe8_n5BN)IpPPu9a-4a`JBPeJaqeBO
z6Cf@$oAqbN-s&qv{{F|og!zu8)mP#~$ucjz$|>TGM&?{eY)j{+RCq^Y5Dd)iEkpE$
z6!V!Vw}XNB^icI9Est4x$SYrn6L7E~yc*Wj?ddYR*+{%Zo|eB1JF0ziJ8CC@164U!
z6rM-WJ5{dk^#188(--Lno;uLdORf;LfeznK*#R<OoVZf*T3{`rLl@7|*+MDOsAbj=
z*B30;nG~GO%B>5yrg-t*%UyOs+oB=@_)_9#C9+^iMk>k(PZQ%<6(jW7AI@MWGbKpH
z>Z2Xu#a=CvF=|hSlH(rCaKGN9s~=;J3??vg+KV&meT+u5ys08T7(<5^I=;`)S!wUp
z+QrE!dB{1%`!3Q>v%>&GJf2}`*;-eK-l;`EoG`4_Ox$~KwzP_po{Xic%){USJenN@
z3kfw9pYJF6rP;6oJqbg`SRla4ndUqTtvOCRoXKYi;K4z`&&B=64n`44wtydnve?=4
zjN$FR2QJcnP9j;~TzC&nAZ|&{mpyv{EkGhf9*BPzueCP+!)zKI5CGwMc5#sz8>{ax
zRH5CB7IOND?pJ(#dU|?DVGSTO=DBC&XUbB){P%=dfoQF@zvGQCvE9e}TdT{(IgWE`
zbgZ?t^<WHvUX4+h^BWS3*Qc;`b1mY$M3eN%L#5)mA1ag8%a%wX-^t?7{{Br#NvV6d
z3xVI>-qz62cz=KAz?CJA`%@4*V1f8tze;2g7aVS~K4;FMr?+{w+V~wsm^kkK@3Nt(
zsi{A$BCwg8Q-QkH*Li7aKSlH-X?cB0SO-_i)gT~D;r#R>Yopo`Uqfq87!HSwtmerb
zChVVJWFC$2^=mQiZnZiMvwfadAv?PR_HQ3-ZH(_`jT|!R4Ldsq{gW>b7s#vK!D(qe
z(Sn%pK~GOlWE6V=Fvu8-^Yin)sv5wtM?7fo{~C%P*i#;cdMYXjt*zW3eL@dJ53$}c
z&v+U@UdmHIEIY&Z7Nr^?ARq+z`-4DabcNLl>gq>_hcI$lIK7T`D`IFb_;oOX@-8t(
zfc6lNi%%h2^xVwXpb}9~QE>tu_6V+)@3G<wYJ=gUFlzf)ogR^vrY4z8iX)(pcd)0E
z21iiin8ya)4NGE*&p0m@tvjC%QgvFL-(E*;M45EjzqgjP4Mw;e5Q|^Ej3<XTKJc*6
zOFDpjYinzLvELUyyNTNcH@bvjoehcJOM8*UyB$qsw-=VMzp4BGPHWCz`yEp=7BPbh
zsxqH1Y245?<(ZwIjLTzsib-lgEljO6+w)1%{iz#XY=w%~x)JW$07x?{bFp84>P7zA
zz7bv1EUqH)9Zz|Jm%<5A#S=7#ux1>e9EyK+$@m&}d<V!ZNtrfmvf>2f4r(^V11%r#
zKxI{-ILF0{`~j4jcB$Jlf&=9TK=?#Za|6ES<{{--#*ks~`K*V{VOo+}El+BFz?raj
ztt=z@_ctYNbYe%Yj>4R{7-eI<OXWh4G?A$^;oJC6YcBjAZ`R8GNbqa!{<8t*^)@Ci
z_3?bBX3b2|aKrW~&mL<TLp>th50oAQ#wni#$U!J(5GY~t5v^GJ!I1A%vjsJ-wl1+S
z33wyg$YS9UTr=1*nBmPuG>yM4lxOFA<5&`~6>(3^`R7@#nhHuUWVmq*EK$BUZ)l_>
zy3lrFf(YaNg4wdqpOB4=RKK#GjOazA3B{EKtukN2_s?Y&uZCqEBRL9TX+%3YfF%{W
z=TIRbjHhnp*~ia*^Z#DZ(IwC|k;3puU3-@h@7v2jqesQ(K$3j!iwkEL?Q|m|O}IXt
zxVJAX@%0c^iDcm48b<+-Q^2$6?8HcpEuzM`LanGVVR)q3vgIWWF1n*)PA0<hZ<!vI
z3*cq|WyLVqqD;o`(R#Ia2i+l0brffgJ>+fLaF}~i(OHwiT${rExD&QbKi}QVj^D7K
z<n(*-Mo!me$bAe9v_@m*arq~=fmoYs{_J#C?t*{k^&w{D;lMSyAgywMV`&-1=cD!9
zS>YvrEwNu|O_$6IkX1ttzDR+FUK95e5E~q13<_d3eOe;Pdg;psUDOzR$yI^be*;>0
z@)PmcxMC;JB?m)p6TidgNfYbW31b9+>g}W93;?}S)nR6oriraut!(yZ<i68EwcDu}
z4|=aMr|fVE7#NYUIFxO5cHh64WKr|HZ$J<oge1d$1<uZm{4+J}%DLe8{8hcW6?LPS
z0{9S}@f5<!{_l}JV<(;<S5q6-dzpa2Ux@O_9Zt#J7+v8osVg49JYg$2Ca!wM5sY@K
zh1Ny_fox1qZYD!8a7GD1TcxQ`RvQz_B2WaM$G+@s81UQkb#|o9qA~6e;6))+dp!-{
zps?sb;==v&>4ElN&YuhVZ_QvhWFtQ1y|m5_ge<kJXt{DJkclveHBAjf8_hMsyAaT7
zu}^v^axhay&y)Q6bX`_}#fiDdM>jXxObe_hE(%-+LRKbXTuN9bYJ9Vy{0mCO%uGvv
ztQ|B))Wb*L;a6L3h84cZ@&tY4dD3DG-)Fd%1R3JlX$tNBsOyy3tdF79p7z+pni=Bw
z!|)0W(^-y>$t?eChp=7xTVU#2poL7o%jG<JsuFE1TyuFbhf{ZzD5phAOM)?<T{)P)
zFD6uF*b-0I`V7sjvE}TYzQV0pT-6tGbT49J+*3hIDF-{!1`}+FB`zmijv2?uid5dO
zsGMFpKOufA(ZOhspC(w28%eJ%6>!gtDt1m;=Ov+9tq=BF%03w|Mm9A^$&wJ(yV0%v
ziE4w1Fc6LRyTqm-Au<Sz@{X?agvQapw}B!HWXgRVGkNqA?mBm!-4svuY2X1Bmz5Q{
z_EC;wE;Y<eMz(xMAmhXtcI@>m%YD6dB&C5Ck{=&&x?E|_IGC4<x~>%rpVa;ufv&Sm
z!{q-}HpY%Y^w2@&SBTpgCws=Rwe%erbmU64h7n}*!FI#Z)&PS8PxFsmVuHM;gC7Zh
zU1xo!jXw8%GCO4Qfn5E@6?OUcJe6#yZc<3k7U6jvPXSqqx#YO5^AH?~v^*R3vKU?V
z^Q6LZZ<CtQ_934Y^YBy>!M2fQwwFb}FNDz${1{4(Tgt+XbsYNK3)Z&KjZ=XqokFxk
zrs|8?!Iy<clvVQjHowv1Q&Hwkr(deduM@`a0}MK8-7P<Ahb*x;`7E0p#^Yg^DEM(^
zFSaxku1tsy3Hw}uLvFRXF(~B!lOz0V=U>2w5Pn{vm8ejI9ku9N3v+!A0|HS<2?caB
z?iAL4hz|y1G2-}P0L`epYPY-JRncZ~Ivj9taC9vKaT^W}4v3H=3DiG&&_DwJzayWN
zM=vl?SXkKc@o{!`w&qOdEzL3rL>$-T^YOmk>hf^bnex9D9u}W5F`f2D5{8C`+S+*f
zW%x4X@)bi=cn3{tIdPGar5kE%EzHel_{`XpVUdw1M@Ft69tO8<(Zc#zf>?A7rz+Qx
zG$QKUZVqB&Q9xb4DU}jwwb(2z{yTc4#D$b885iYplCDT0SCo>9O^LeOM7FnMHkI}5
z_vk+5Dlz6FJHkhc#$%H1|3;`NIsb=Hse7Hv>+p^+fIY)1ES^=>enpO3czbyWIn)0x
zD;vF|Nr7>2c$k_Mi;T}BudSx0hOA}$DSJKV1|WKbzJ3{*CJft@VZMy4CV%||sN!+P
zT8u4ZzSQv+3dPk0kTd?(sNEQ~w4Ck#NdP*Qszur?w#2{{uf*kGRaeBNcKQ_o=dS#J
zD3iy_ZMU1xxXsEgwOLlauXzA)nw!3duW-u?q@kga78h@_-S!uTe)H^jcK2QdEr$XV
z51r&p+E&Z4)4A1ZwIMV#v{<Q-*ti8<DA`8ZNlBT`y-}RN=XLm<pK4;wQh4uWa?O&*
zq4ug*nV@#HX2OW+3gabcwFZ@Fqyi8f4h1`}?FEx+kc8^xkWf(LX`IZRbxsdgn!<Zy
zwEG8%_RIt<Hqv*DmA(eR_^&qDRH8ir7gM{@tT?ErkXF0kLvz>zz>|VLAT=~%I1cv3
z)p?GMGQQO~-e5+(Hg*h&C2qS;^cWMo-(qNRI!HBh`_QwJkeM{?+(hUZ{!0Z>;Xs8(
zfbWhP?hD1uMPk$*a`Y^~<+e-Zg`JO7Wvnh_5`5jg0X_+co#eQVNYj>7Z<^z+FhIwU
zBt_MSmbObd78b2**Uijr%s!xa<=GyXkN`wUM#&{;TAj3mQ|Sjllz3*LK)?bR5M7Sd
z_)Ms$MpED)VD<Wu^0$U#T4iwT6Q8N&rNTPt)VHszeu(k79$bov(snYTQ}qN(OTmbx
zZr!s;?R$}Md7Yf&v|Z^1hVOVOlZ))EUjR*2+5*5Nnf!x8$!}D4Ws~8CUeCe&psZ3Y
zMfK_AFUEz2@5rO$!HmU#^11g8UF07dQAfPZWbM9OKKl=C#LO2*OhQ$0G9&d5Obj9E
z@1GDFd<FE<xRBle+j^-RtvkmrvW(fE(@2ote#&Z>>NR$vbX3eA%F7u#az;Ydha{{l
zqX6)6<SqWcP^6U5OU*rua3fl6UhG~n*%*zZ(WL?e=#Cn{Gl6KNJnBtUsXfi2pr1y#
z^F6Q8Pv@UGXjp9g<40s<lVFjO=YDfT{w;$G=w#B;<Amct?EMAO6(zK2fv_jFpyK0-
zl_lCS8K_Yl;8YY>XF_e=uSey7*E8GfOT<|FpuLR#ls5>?$Ts%Cwt15~Py9b2q(Dwl
z05Wu1h5sqC_NMuN$B!U1iTn1$ko8KW()aA|opb%c&RcIsye=RgWL_sMC;bcYe2iHk
zKx==8(R-6W6R?!#O~)a;M^T;@3Qyp<{^YChYT8dSZpl;rk%nwcpe$j#`vWIJwx*se
zqiqAU*{m^DaGzz>0O9uEW=LEme+!+G6OACF@fP=;s7BnQ+eKM*aCb_6l~~{~enCMO
zuk<F9FVhDoLOq6`v!{~@0&(=*%GKZXyfZ^^bSEFkMBBq`MIE%#d<|eoZUSenh}!q3
zx<ALmexXI}Cln}8GxYz0=gC+`kNveU>xUXNzSN9}F(C0~NYKCo%`zQX(Dr}Jobg6D
z8v!T*)qxZdat&_?GYWekXkUOT!IXFB_retj1d!RTQ3M-PNew)`i1`M44~%aNfg;&D
zvc=N6s)I_QF88Xtqb2mh(x3gTF#Pc;QDdsJQU5FmgFujAA_SUtwDAEJPvxi`?Yo_%
z;Lu1@dSqSuAkd|A0pEwGcy%=*ILIAe(v4dQ6Vp?h!7$b%;%(^Z_1RulsxQ>Xk%g}B
ze&2xzQbr9~OmVdd0yJa8K9Rn~Vu48A*%n~pr^T`yB;7ZG-RoaTNxJsxN)GA*9N1s0
z&<HebvS3Ej7$=${`E*t$wEjZIEa#DHU5QqNSp)Yv2;9Pc=$W_mpS#~Jb<Svk@E%Y6
zheX@VP>ktBD+s`R<*A3flxfKrgD0rN&~D=?|7*#o$X#!2V~CGKT~@?lm`W#0@N=&6
z{_V_wHq19pa^Il1O^jyMW#_m9U2dD7QV&6-rky-v=O4VJAXUpZGdiM;J3K1!>Bba6
z?RKLvh=mF1ylsQdytBOS=(zVn3S)6ut_k=7uq%vS**)nx$i--BSL|i=Hhp@L*i=TT
zv1W|Cf>q#Opb49d1;82rKSl!s(gDx1%EV?Z&q2;9M<GGP$?7Fa?No_F^WoqY>)N;y
z5nz<TzRyre<O`1Y^|V5U<Cf#fom?Z7Ne~qWP68%%@j^<hQNRCLC<38@dB$btCnR%t
zPkGT}9*1Ey>OX+T{8!XCF;2itTVs?+bf`OuzVd7oT;A!S_`M^s|LfAoqotd#o6Ph-
z5J|LfB0wY=f7e$(7$?f`{0HB{LH*}h*Me5-jL%`d^B!~7?vW5R;L|??@ob)JHH`FC
zy9}}}7DEWNHi6uTPY3ume1CX2e;p;bM&>_2u6c~$4-UYUE*;q>KoBXM!K^Ep-Y24O
zph*zyO*N8%Rzai-De9R0T{<RaW&_Wbr1fY!+61VPTn5%S`o57-5zj`B9Df7bZckC?
zhw-7n1Xb27EJ=Hr$p9B*745i!-bleqRht7LcRi2#?>OB=slpaY>CdKB)*Bi7SNAl?
z0do0Y$8sGa!vPfK`Nd(EI1?#j$rlPfz##<<3xYP~FSs6#vwJiqrfI=ly5=kt1jnO9
zIk37kh6wZhHm%n)zZT3YuWDi-QB<=v0@a-(fE+6P@4V9gDGZPPiu2$j(~>Y#!7J;1
zS3xStb$&x@8{WU(&VthkJr8c97!wjlr-V05g1}zM7Wly;hAouhq|}?n>41yt{<#y#
zospB0{>h4^R;%dSb`OZy+cPpXJ=`uXBdXK7i)Q)Fa!M8qWbi)WumKwK@#%?HyXkT!
zUkXr>%?^jof0z7vw*f*WL8_>uvkoK=<yXp!r^`|a*LnUQGksQ4fparuH&32a30!3-
zoNTG%+qOeSHR<U(N=os%UT=cmU0q#ORTDcq`H@26#Yqb&^RBQ1yw9A6H1o1^G}!C0
zCl(i7UhdCOAmhaIcs=fymY0+DBYp#0N(wy4*2P6mN~+hOx>r^2yK!UJoY{iYUXq$N
zz>(@+mA8{RH69nodpcR(luQ3yUfT9I0+&#iC^!(0i;Ej!#E<A_q7++kad%)Y6nw}`
znOl4)JYJ%eRVuV9GFy=O#L4-A8$g8te{*vqBR>R#j89QfT3Q-X{tu;_b_2NHKSR{R
z;ycR+>N&ij{QWuFk<0V(vL^^3pUG_k-TEi(<ktEqAgRF4S?I~bf1Ycdx8?}vQ#^Wl
z{|ixh0C>_)<q)vP-fJWXwvFDGMqoMzzNnwJsx<ryE$IVN8^a#Eff|29`|MOQ0xmCg
zOI1VXFB=h&->?z(u;a@zkyO5-im<edOjHKLX~0D|%n$-}Z7Bq($FpqQ-$FN^LQ_-I
zdcEbXDw;!pY97d5<xr@8a~s3$6_OXn!AY&Ft9uNKm)+d|6dT@@6Q?&l1d|mfOH!74
zn4j3yW0U|D(@g_d!;9V|aJ4(6v-707&N~r&|G)14fulmBFoIiKJ&5=rHH3C-CX|nc
z9I`Fi{Cd~hoL<&HR7u2-);GSb+!_vNN7NtWR{|N&@t+^e|Kd(Uakqe68+e>Z?@UTb
zGN{;VK-?3?r>)mnAR(0~M%J-rgDyPk9!nV!_4|4(Me*xgjvYRG-~*2$ab)kAb7u0e
zG^+15MQcQ4!8fuzK}Yn)6SA}fDdS6bgE$m{r<&!z1Y5b;-XYI6;Jg-ZR{w>o-0#tA
z?eYfL(q-!mxMP1<tICOu{GzOYL*>HLS_Ja7fBVWhhNjEMm508fi~BgFq-s`ts<$5}
zwN3rcS^#(vMF)lD-fLL^pzD?$DAw~uVGOp=WttNg&Dl2Sht5cx{ac#Zbt8fV_L)0`
zsTSYV7f#5Gata1nb=-%OwMOo2^O}}42(_{iIrEsBS9S?eS#)@DC1wR<#qG+F_x45w
z2innCqYBeMF|vhbm$m&DJ$918U#aCE`ah-i?h)IB7+KA@V9^9;@s0v%YH&{v5OA?l
z*t;e6$qJ<)L!`a@{Cv|{e_X1WoKlm6h5S%Q>l%eMl5r7;4&)eYdHOHHsu=Y@h1EI7
z#iRV{{BVXPmhqW(rMwMwe1gNW;{zu;u9r@6D%R7IMyK+5VAC<u(U3tO=K^b7=Y`K9
z&1{PC-<~s67y4(*v=#nhhKZ0G<Py@RtEwmw@?}kQ09N^40HjKZ+hW+6kUst7ex(98
z84*L(5nuiD^~yw=AT#0mz9;KI_e5~hZX*@QSC~wXr{C&fnkj+BpjbYB1j)-bJ(Qg*
zGjpY}l;V{09(&%=F4SjLfMbD`v2$`6KcAbjt=si-Z{XjQ$y@>=B73*4SkrmwNPtU@
z0*_!u3keWI!1`<J)(bLQb%Iea5~<_M!sn<+rBTAj!C!MH`>F~D0p#p;Nu<PI(EmJI
z&p4j}iFTIy2bz^QtN)}~KAc-;(v+pU7S$`N)-gGd!BrG>qm&85qbIKLR}&lNo*H*S
zq5oS#m35f?yOCK%p&KBa(9MlYF=>;P^oMoAk&j|xCc@HOqy-VaBt-qy)~Y%xth7)6
zUkH^Vj`~+r%{^u9mnM1o!v-V%?Y7jNuLS1Y{2LNy;_UO_rVG)5S;r>|C@Dimu{-LE
z!pYYDmrnU$S=c9awjvi^TH++TTN9^eIrpm~XW^aRk6vLrTrY6!FK~+~;TKz_<ZO+N
zg29c0D(iD4SpOFgwKl%N11l9})dB?i=X&v!g9FrU^+h}5a7w~1on3sYeiZr6t?Ry&
z=xHYtV&;wgT_L=zG<(i1bKO5SqG`sWtm}jg`QFjLzx*_9lfyfcWB**fq>0X>j<H*e
zTfDB(osl?@rOVz5+p|=FgVQ%P83&}IhOS+ytEh&1BTLw6pjQCiBo3@;?*yr+(ZB$e
zk;E97Hw$ZV=$-=9Axt{P@2CT%7KXklsFjiAflP&i9DmgK=QCtqQ8**i^fyZgjo1;y
zo|ca~!ZlpzS6W;Hm+hE=Tmi6vPMXw*fDk<^D|&ph5F_g+YM9vSRnClx@Eo)2m-qZ~
zGgSba=ZJn_=miib!!-SBvzpO|)9-QNR{$VGwnk(gk7sjaTvw66hfT3wxdtsce)`(c
zXOIO{NYG?<TknU5mm;pc2-J5qFUOCk|5ycX$=XZ&KeJu7a$x_;c8Qj6Ez?|A$B@YA
zO$de#n@0bTiLnz{0}@$r!x^%5h0NVmA()spBK<H`3Y_Bu=_u^~H`EDN6SBq7Z298%
z<kmcCm^Pi*d?42^2}3ihy2`^WzJOq7pcct-I{ZHTlRshI0driD;<vi*cLs{i5e?#G
z@Wv+gM(DX;Qut<Ux02W&bid-qh($<!<-wn<dln}Xdavj)wjxz^pd~&0KBQjJx|ocl
zEIH#^c0d(|DW84%{X4+IbQ7nLpmtDP8sIZ|&wkE}x+iUknzcL3RAr_J+(Z*k;Stf4
zIL^h5;qub5#@y$@n?DyGJ#!=i$z-kIw>c!$9s7s5vm);7?DXcGs~u!c3n9lV;@$AL
zC!PBf0XtLb5>(*B$o?z8MglHUB`t@o=r$5alX3zMib#?%_Y@_rOlxIEt1ZGab>0a|
zuz_RY)<5MNR8XOY=G)YruF>8O9S?ZW2jT5-xzt)ohp%)lq`#r5d^!;~!CJg)UslNo
z$W@<iBaT}Bivb2IdmS``zs%zWSLP1wiv1=LV?ltW!sn_SuP2|(Iu>Cgt-k|%m}Psx
zamHhoze#7&?oe{!Rj9VvuW&ps#>bQq=)C(GI9bMfljxaV@54TN@``MG=}$WP;t4NB
zHP=!K^!Yp3LqeZM1~qP0Q|l9U=oON4zSwGXS|uoJOydozk=Nr&`i(y|d_+87BN#7>
zs1Un_$WD6SBf8Nn)pHtJKAZF5`srIO&Z;6~uo4D#qtF=fdcsB|N%g<&6*K2C9as8X
zaA!Ip&ezmbfPs{9584x}L`4=k+S|<L*=w8I*y^NTb_zQ?Tou)df_P;O*#~%+e#|Js
z#Yo*moqIZ#p1o^4cI?jiK!6@uMUc>wV9g~pFVdkwO@-c6MYM5_E^ck_BI9A{uO;9)
z<q?OeFGgsL<PYE^*YgXlg{d~U1@3{%gY@T~o-p4@wP2!?E;*rHr(gKrOIwO@=PmWp
zCb?9#)8p?CY(_frLP+q2mcrcQ?y;%q%q0J)m3fm<AGtaon}P&R-<U~U*8l!2-d5RL
z9+)6%eN6IsumK5(P2Xk3Fb9n%w1xAu)#H8K+j*;iK?+tlu=78ATFY5jofWfuAj_$}
z+ISttd(`0bZLIw0O9?<`k50Jc0D;=_Y2&u@E*f9c9pLYXX^X6>KDZC(eh)RfBeTu1
zB;}&&bS^pV@p=(FnHnh59jfdxijsqY{umxyQ{7<xe4Gzzm>vfC*1S=NBATl1Xer0Y
z`zLr^I4~7A)ymANF6w;b71l@Nze~zl(Eb$xfr<q#ZiKm^(lXzl!RJjn8N~~Ib6p~o
zXJ3^cTw=q1OZ<=kfts~9mUMS_apy?RpSTa}t|MOj&ln8q^gO*R=A(KLsC;Vb?T=8M
zT%1lNuf=+_NCQ8ZZAjw6Q=2}CBp?f;u@>G5fi#(4X4?zdDL0L#bx1sSKp+v7<%G$M
zwkZd{DkWm2sxR6eYBy&)ohtSg+icjfU33XuyaccT0l#~oVFQAyI1s~5TJPm=?ly6l
zI~#AKs*%D!4k21K6VMFzYQPk@+3V(y7&W1?s=F(fRykbYxVRpzWE;-Wikph43IW~4
zpreICp{o2`d#`DDgbli^lqhAcm1A?j!Ek})Ffb5?2W3^av$fs^OFo-@NHiqggM9ff
z-ZmPuXolH<_`+%2=t#djUc9-Nuqn~$QPf^)Y;|4X@DQU&OS6vW`D}HY$MiGHLwJ+F
z{q5uc>dNM{LMOZF#eQRXscRTf>IFDb*Xs`v1{|OFS(jPoV@DXoGHUJ{Idn@W1LFFU
zu0tt>gXI*pjr4NXFTOr%cSUtHFu9hGN6pPOb1mxI8cp0YPHGHD4X8+~b{A;Sl=}n5
z_-Uqnmb+NP#!D|1VYGg~WxE$RNFa+F9G1BCQR_Lz&F>ilB&%BhHH!ZI@4A?8V>L{y
zcRa2Ny(;d?)$?yB4x0DJyNe6{jux(;9{aU3Cf0^kgA@TyysYLMAvX_SBo?tCC50aU
z!Vy(cZDL5c(;dETpKsmVN75SYAiU6*6-OTDVnJ4ud`$4>wr9m=irH7+Knm!tDuRJ1
zO6koH*qkp{k#*O!<n1~)k*v`Rb<Z;I(>GpE`@P)5h>o_Mt~u9<JEZ9_!y&)TH-C1J
zZBM<W49XipINHVatlYZ}M41m$K&rfZ-z{~-Rx)mim(00p;nw2LkjOJH`U@%a*6{l(
z<A95_?FagM_#uu*X^eMI<roZzMUT6$lF~lCbDoW?fA^05WcI-+VdIo){1+@2Ug7&H
z8<z$h=BV&G-~_|m&Z3o6QI2Hkh6(h=l|ce>7COs*jl8-a{`j2hfXAuXg+0x>id3j|
z{Cq5-lYJhx3tqTJ(F6jSU6@;6m{V5Lcj$Ot&5dUwE;v1}e<yfr=@D<iKO^vYf1keU
zo+Sdc|AFZ2sHdv@JJGcDd5ka=PeVCje7_z}1pEvts0&R$I9~^*QinL60$Vb!>Co*P
zf`su3mg5gX#Pk8nQ_IAF&jq!iuD`_Ba+`)v&wF++!?9w-t5CurA#oLGP>Xqr%1_YX
z7``APMW_|$K=xFWA~P>cO1#8fx-}j_jr>T~zIQoK9^S6?X}fg0(8umA&o0L<mMnaq
ztjuVXLX@3A5R~<L<^u`3rj#4_1m=~NAH}ONkd|l@|B@08ziOYc*-##C$h|Zd+g54I
zFK?=?-@k_g{P<;aC~R4d1yJWKhan|0Sq~>sSCT4Qdx~a@c|3ojNrQP88aOb;3i6dT
zF4*K{A#qng;jX!$F*BJMPMBrabG4QJk!IhlO}0oWiyuo8<SGi&6ojd4A`<mSpuUR7
zZC4Dd_ZyvAIK=wX{B$u0guOfxJmTu)crqLO>sJa-!eC*H=1`rpt9o1l?t`e?_pAN3
zS`Y|4<5qG{o37Hz@^TJAV-TLIcI|Jwn)K$}l;&GzEdgV>`e;GNWKQdQx<^Z^*p~Ul
z{CY39`<+0i-l>2~@)xd~)!IzQ%T<+<qShEXmupVXA8OJ9J^O9^%J^-+GCN%Qr@tq4
z`kj*66n33L9irF_eA77z{=RQ-XBsiWn>;~_&l5>>b}l5`)6p`p3mb+?sgB4-=?u5V
z5fA(OXYK)eoysQ7tJMCLzVlpQtlGtIZF{4p>LWTBCg-C0dMLk|>-llM2|wx%cT78-
z+M|j)D;0ly-G%tmT`}2<Q}KMKAjRaqO;(JCdHom#D9nd+%XKG8g5K}kFfF4yfvlZ+
zPYqSq-9a#e;*b%=|6Fc<jpj9C)Z*o4<HBd-<;{LWBctkc(xda?E~f}}7jQlA^bT^?
z%jG#Gmbe9r`R(c8?-QI}57YZ)<cnLfMovo0ddSfa&imZhavP6uF(hf7KDQTZv-~=b
zYYE4kR&(GoLM^K=kq8-y7_F{W*AuPZl3tvxtcn`6byKD3S`}<NY#d=gM_Y}bmH9#u
z(yrU8F_C&vsp?y>B;4QI9$J;jcr%?aR2${{fbiz1M{l6M($0Ju4}_te?)cMY<3<c3
zJ3Sqjh{w+N_DS)U=*zHCx^_5F(>(*ph)bKKOHrf32p=nFj?2#+%%Ba*srqm1Z6erb
zIC`6!`HvbK^1m=@AJ^VUQf3CBr3*A~XWDco-VCL;_I{mcKOUa#L`ag{8r!-OqFjf%
zD%!A?u^@hn9}Oh-BMuyLx94Rf&c6tiv~j5_#;_KcFvWYhPp_Tia0*WX3ZlPO&`BdV
zUG@FA;*%9LCuxjWCQp*p_;5KtX1G{tyXNklii0jXg}rD>j26{(fdOST<O`vamoC<?
zQM3euSbDxSHQsgU8{2DGcI|b1hh;~gr}<OW6D~xKO5;et2?;@h-D|+`=d+Kw7UB24
zcyY)YgFyIqd{)8ej8r2+*k7k8{r90kGHETg7mOo8IGhHrjmN7Pn0bY{ACGafNP5ea
z*On48_|O4&c1XmiLqc{B434Cbj^);SwhT_ZGY+=|jmU}Yw-xwgs5ApHX&Z}ifjj5i
z-;#E|o7lRRFXEdV(WP!@ufGkd>72D;K<%9}T|3*6z1D|MtHTO}Z^uzT9h+Xf!hghz
z_cw;qLsyev`b6EQ!t$4Lyh%M_3kbOV4id_t#E-oEy2_sTNDh-f_}D`ymq>sM0nm7L
za4<1te79CzTWjr0mj2Izpc>bW=Z|!U=O{`XDr|z{IIAxL@f9k{WmECt6%HfT%bL|k
zb-DnE+`Re%o2#%$Plbc30|5eG>&4WC1U0>1ro}kK+JA^K_PH<~DhoYYUfukpfIZ5y
z_&Z48Q)Z}8GB)fC-!#c_R~fA<PNpFC(l2HFONrg5q2B%vuSqe?XrmaZ*|95|&8t0f
zA_1yg*-tvPWWrLLa^kH}xb0dvjvn!t>HSaN5T#myXvf98oDNP0nu<IUsc_DC#m5Kf
z4;(3*YB@7EW>>c-%GO4FlI{}tg1yW|L{nV9yPV7oqmuUxbx5LGN{B~nb?T2WE@}qu
z9vnA$&^T4eNHi`ZQb2c^r&C0ul7NcRPYY9YP}jtYoxu@}3k-pnR--Imh%Y&9@~dv3
z9B!O|w{raAlqzeFv60EW1l6#;>fasx7z&&cTHLfEQ9<u3FEyA(-bW?*mmx^rKD!Zk
z-2M>;ol!p?*7r?aSuo`q6$z*F$fLP9{M_;(Hmh_QDyd;e-q-K}@|%Iqp#Ck<rbvw2
z%TP#!`WEYbzEu7n&E%tU6ljU;7$iBTWY%xt$2hrfWX|v$DwC4i2X>-kooi=aJn9{a
z$$J_eXFZQTF$A3J3nTA8dvi^!0~(dKnqNZt6|d@wo^s!I&dSm$uo7dQZ*Tsws#=v6
z>)gkaF)SIWz%q(kI>8WWI#z)=6kjtVi!*9`<*ofMRptj`7Vq4PDLqFRwvM>zfj=Y~
zoPW6US$XC;Yl2YQQtP~=^>a>aV}q?QAOaJ7w8e(U{r0k)af0F;KY^-9R~_M+T4D@R
zn<{`>AgV-45(c8xZX5O{qkVyjB%rnxmIIuJziYqoSf$2xTvI#Uu<aL~QI)QnN9-mC
z3SePeP`K;uLv4;$Cfr8Js%7GvSH^cDn!$qOP$*c)+hLbMzl5qFgKN!Stpt@?&hb9g
zMoADc={y^S6EUPEs3CSw3aRr>us9nuWw$IeW;x>3<80#^!IySwabW|Z$YSK2v4>N=
zu!u&V#$)*K40$4qBdUy&8sDe6sx`ar0-qE&;rwpF$Ba(_S=H5$)-1#3Z+?he`$G%K
zJjvJ$JGPu5hwNLU@{mo-HNs!bL~q8lv`+0`n>OtZHMvJEgE$_47%7cEbbjs=gw$ts
zzc4)qh2&;Z{?0o-)ojjkbRu;LK?MaNWofM(V~}8Onl-zPvAN&(BJV=h_w$z5r`!{&
z5WJDvo^pbG>sm{%7m<&)mB2u4;o2ZUmQ00Dhn+#pq^*yw6YIZP*${{F(&f&h*W5@R
z+NZ)Z7nlqWh15OIj#+-;X${KRR^JKS71;<LW3RYzL$xe>WE&Ua*+^?t4q7`jN9w77
zKu4aQYGfx1bubbI82h8f*e~R_muh+bNUPHl4UCj**<m{ss)%ujAYVeni=WYfgTQL*
z1Y1d?_BlE!>xg?Mt4&W-O{~YDO6#*%1Xd<i)3V36K=RYO&}+Lw^9RDaW3b{Y#hi5R
zg&sw^g!?-7x(!aa59+lPLJx1D*e0n9uW{-Eo4Nuf`E+xC)`V(<XHf_k96<Ez(eXzn
zR%$}l<~Gnht4{XH=`M-oIU@swFDtWA-KblsSQ8yvPw)`Mu_52oL7*c1Y>;n5nIn}4
z=^8v3=!*igdn1t`y{yS8H1USP;VFMEV`u@k1F(aBg+*V-!X-O&?;J;Y`!g3ypHZC#
zdbO`yY!t(KC*C0G)ndjx;g^c&CY*-U=9h&%b{gC^LYhRiI2h2D8fGjmkEIa#E^RVW
zd}6}}=kRzZEGRatg%(!~<*O`Y3rXB|U+ACUT*fA-q;EP_3*A?)Qd(^ZI%3nTaG4}U
zUVFb@`GWO-?6Zr`O%r?G&pExTmY8y6NM8IbS7`0hJ`1Elgb!n5bWDsd=f)SCvhcZ;
zsytmpU!+z=dsm`F3N)_`TIEQKNDwEabjG$2+wCWC<bA~PzhqhMXeDP5h7atQgi43R
zNemo}<JAj{&Q&k{%i5#065hJvYi%C&YD2j(gEfEG<;0uyCLT;6%P=P3guK97+`=mk
z0%^!yM1My;gbFxsHI$=Ifx~Nuop<UKh@rtpj9vGamZt(Zj4~^LRM4x0YbHB8*(u{L
z7o00@8nQ|R@rK9rapDayIVK$UuDl)y@1IZz!u<>d0zOgPikzm$w>T(cQ%^X4y3~4U
zy|2Ad$ci>PiM@w~oDXU%@vo>6+`sOcwfw2OO+mm6w8nvs)~%yT5YPT8!sY@>fd8G4
zQ5VRXrHmh*D)M3=N|2lu`of}L%|$EB)^<pR76#;ATwK^S7btYCx2{~l{RDESe<MG9
z?~2>A=S&(E(Shx^4Ty@03JQWU$kws2C{?Esjlk@;g$U^d0~OtUYlp#k^yE7cp$M*}
zAD%-cJ~<P|!^1QCne%1h=;Q?3%*@n(R|M#eGh#}XEyTH_ZM0r5DJ%Qy;WPG!(UR?D
zD2`<04k1Vg4i**`0YOQ^bGs+#`q~~BNi16)C?lSmo+gSP1_uZKJqsks8aKUPIzQen
zK$BR9wjBH)ueCs5T-6VVvX{g}EM(-UL5Oq?yPd1uL7>0Q+nYYE|87yw$Jo@AoQzCV
zRP^LUu~_-}Xgas6OMo`U%)OHGJ1N?yIuWMsEFc}^l<9S!OqanB+X%GReMh<Nh9T(d
z@Bc<lJ_*$BP*I({zr7^U>p_FYQdmnH8t&(cm4N;|x0`YH!6+P;yOa5+n?vdW!<Cg4
zELzRmzst2igCS7MZr|+>_4Vslpcl`=((*60si^1xC?5)>uB8=&&3dv_t$W>^>HYlk
zYNuawMlP9YALux|yDJ%liNxc+pUxHA-`|IBV<9IeXJo7{E*|!MNMV|D_w$Z6^1l%Q
zs=z;va$}yY>glwbfZJAAS9f)H2M4LsU;&k5b#=Im27N%s;dby)PfySEbevFjtT-ty
z(&5QT3-GwW1_6WA6;r8EO9o^(ffof7mVrP#_#X>eHpNy1_V)G#a+wuXA^~86C^QU_
znAECiJZ_3o?$}W#riBN>d`!t>2GxijKaGs`+C3iro}GPitz7Q@tH!3Tp+T8Lm@GZ8
zd$l#tSXd}FjdHQs*?AGLzP>)0!K)Ol`Q+JA5Ud3SbXv+~^3_*WUEbZz{|*+)?h4=m
zdNzS3$dMZ3QDwBTO#V)rODkhzR-27BgwJ$EpZXGV5whirh^;(4xRXjVeng9t{u(yg
z2h}t*U`+^G4VxifoSvS}J2$;NV+M4~JHkAAc4*TjjqZc{$sXF>JiP`7LxX&U5{gK^
zBJ}t7Gn<ULe!M;WVAgD~T#o$8ESt{d^#1w;RDXHB-7SKA$HvA+Mn-aSa{hT!aeuVw
zHaomt{2*Yiudh`rv^Dhf^t7~6Q&Rr9;qM_xw7PANfN`yCdYrFust?W<$S9=}a&x!l
z=Nli-<b!<6%4o16j16<$9YqZc3>+QnuG1qUBY(~oMB#BSjEqq6^6Jd40#C~Mb0!aX
zqwg0!AE^vKgn-Ti5096V`BG+PW_s<WDwA>Sgr(flQj4eSeRMKuU!V7@K?1;bHH?k-
zSL!WPQaWx9CkgnyUnluLv|F96BqSs@)iQx%$X;ZA&)cKv=etvzwWjR$-#}OB;n|sv
zmKFj?TuSQZ>S}mshyzG%hoO_}dYsjFR6IT%W{Qc5dSCT_2KiQYzCQz2eRp^FkG*!D
zFhKoz==kmH>kG^<yakGM+07QG(<ZxJ(P-S`{e4VqY%Z&ny7^MI&gYX-pl|ilr%%+X
zx4$Mno=+=*bJ(s0^=>;~Y_!{Kb~v3bjA!!uELP|!C@731G{Sp7Zii%KWB@M*7>VN7
z;_B)vz)RRSr}b`c*){>qxxEv!YincrMVyEz5&eo3^izM>*x7si<n8URs|^Pgm6Ya3
zP(FW7`t|EqM#lTYrq9dM6DK!!=fj4_^z<|W77YmWJ0T&R&&z$I&240RR*b-l#R2G<
zOgle6Hyn)qgoudCYOX9P8E$|nEiXT{*6aWVs;;gEUL-K1vlWZOJ@I+n>ns*4xLwYH
z8v@EEzmbxzwYtbfI4_iG<X-v#m;Bfpjt43fu&z^WIw)yrRjen0wg^`qU3YgbTnT(`
zXBrZc-hqLE)m0#ozY3Tm-iTCol{!tBFkoB$<zh;(cJHrl&;fSOw?9Fkq|u=GAJyv6
z0o!|fc|{G|U5KFAloUzATm0!J%Vj3wxG%W4Wo$!%<$0BjAHbk7B}-R#?e&GBi-?GD
za&qeG>XxdO`$N&u)B9ZS!TsGZsOBIiFKlbWeUD$ML_$OiBs2pI11*e{nR#|_5aylO
z)$%**rM%{94Pe<olK=H~R?dh?gAnEN$x~`{LumygAX8RW_6y<x;9r2++He@aHWN}M
zp#z9eVC$5*s8+i583Y9d;Wt$mWWBxLffdc`y2~czvmsC<i;9Zc-Q68eF6RySP6aXS
z+l%g>FFF4H{##r6G*~zo7%U$5mXBn(zTJu0YjW~tEf*-GfL%Fe!ih^uOG`;n)6uE^
znFb0ufclGN%X{vNwp`l;V5S071}#iD$O@0&+jDmyDy2q696=BTE8?Gs0GJOLDEwk3
zP>$~j=+UuUsS6V(F4ybSm69ky>;}51u_E$^IyXPxLE%?s=nTxv*bheI0q<;NXvkOV
z7;t?wIxV*w?H=T^X<N1&nj>ImBt#HRA0-PrnEaF&@N3m{6coM?g{O;^)W6bsJygFD
zfV%vl5Z>Q#tL%Dxf<DMmV5v@pmGRYrOG{75Gd*jp*SYp(YXt~nCG*4KwJ0#)gM@+r
zZGSM4?s2^*A#+~x%0+|44K%Hns#Q`wAB9(B*$;Ou0&x|vtT&H~b}{0(t!|fQ?f2an
z{+?lsC|ea36*;*{!2Kj-0#3|aP|z>O8<@{W)ESbTMG4s#PU7N2z*|H^NB=RHD4BoJ
z`O#5Z>mVsPLf`qSsH|)R9T%r7h9yOcCi9*5yFU;xE;?Q++S=9tFS)Yf{&2nym=a(h
z3=9mkwB>nufrGpK0|P6os{~w*;-qK_VP3&@0LfDQ=X(JtF!=fTeSAKEkj!D!>G^aG
z+!Yjfx0;@pilQP=6%r5-fa#Y8{+V8v$MsU*%j>PybP|Y0uWxS&AJ3!oooT!t4HXs9
zlZPH29;LUQAP`_K6%`d)4VI{3!ivSW+r1$`lt|&QBaY~|cXDdC+Z_O0&-Kj>1UUHC
zQcX+CGcer1eFL$xp`jt%7kvPz!3Ii>)fE(o;>3l%e+L5@4n#`G$<b3$nVXq4IiIbJ
zB{RG6Yyiegz~>1o7#kBa3G5@l!w(tRJ32bb$jAWoIPw%Ul$5|MRIAhlOe!Znes+H}
zNk>O#bYw(QO3LTutTCP2g^QC@2dFFtn)?AKx3RI|?d{#s(V^%4WbESd0Nh5hbiuVg
z``?i8q-Y@gclkj89%b9U+0M?6|MjxB-eM8xFf%hd0=yWVZrjqv20jMH@8sm<xVY)L
zISdSpz~1eS_vh1{ot>$vDH5@0@%)+l`+LAqF4vm2HZ}eEy*g#aGEXGgUJ#t8SbYD)
zswC>+(RvLIn=HMsv0-gt!TpDcg^f*BRkfw5=}RxumdW_aO3UVNgX*T%R-AKeO7_Uh
zOdw|7OKWOr9Xqxp@<Ks`025|(zr4I0xRR(>99Y-xl{A@yKr|%CH^fUJ>q|qHJOyw<
zh%=JAC@uM{FRJnxXbfv{Wo2d3<;5a?22=XR#`(@S%|zTr#!MPoS~xA#aQzIo#4@PK
z+oo)QNAVNzZ9xy;1_4cIR0$<N@ev8_XQ<qZK7}WJL4iD;NqX|3trUJr_Y3p^zK#MU
NAtEbWA*ApBKLBy8Yf=CJ

diff --git a/doc/figures/fpga.tex b/doc/figures/fpga.tex
index 21901fdd..12f0bcf5 100644
--- a/doc/figures/fpga.tex
+++ b/doc/figures/fpga.tex
@@ -359,8 +359,8 @@
   \draw[arrows=-,color=black] (links.north) to (boardrouter.south);
 
   % Is the board router connected to off-chip RAM?
-  %\draw[arrows=-,color=black] (ram0.east) to (boardrouter.west);
-  %\draw[arrows=-,color=black] (ram1.west) to (boardrouter.east);
+  \draw[arrows=-,color=black] (ram0.east) to (boardrouter.west);
+  \draw[arrows=-,color=black] (ram1.west) to (boardrouter.east);
 
 
 \end{tikzpicture}
diff --git a/rtl/Connections.bsv b/rtl/Connections.bsv
new file mode 100644
index 00000000..4de1c4d8
--- /dev/null
+++ b/rtl/Connections.bsv
@@ -0,0 +1,82 @@
+package Connections;
+
+import Vector      :: *;
+import OffChipRAM  :: *;
+import Interface   :: *;
+import DRAM        :: *;
+import Queue       :: *;
+import DCache      :: *;
+import DCacheTypes :: *;
+
+// ============================================================================
+// DCache <-> Core connections
+// ============================================================================
+
+module connectCoresToDCache#(
+         Vector#(`CoresPerDCache, DCacheClient) clients,
+         DCache dcache) ();
+
+  // Connect requests
+  function getDCacheReqOut(client) = client.dcacheReqOut;
+  let dcacheReqs <- mkMergeTree(Fair,
+                      mkUGShiftQueue1(QueueOptFmax),
+                      map(getDCacheReqOut, clients));
+  connectUsing(mkUGQueue, dcacheReqs, dcache.reqIn);
+
+  // Connect responses
+  function Bit#(`LogCoresPerDCache) getDCacheRespKey(DCacheResp resp) =
+    truncateLSB(resp.id);
+  function getDCacheRespIn(client) = client.dcacheRespIn;
+  let dcacheResps <- mkResponseDistributor(
+                      getDCacheRespKey,
+                      mkUGShiftQueue1(QueueOptFmax),
+                      map(getDCacheRespIn, clients));
+  connectDirect(dcache.respOut, dcacheResps);
+
+  // Connect performance-counter wires
+  rule connectPerfCountWires;
+    clients[0].incMissCount(dcache.incMissCount);
+    clients[0].incHitCount(dcache.incHitCount);
+    clients[0].incWritebackCount(dcache.incWritebackCount);
+    for (Integer i = 1; i < `CoresPerDCache; i=i+1) begin
+      clients[i].incMissCount(False);
+      clients[i].incHitCount(False);
+      clients[i].incWritebackCount(False);
+    end
+  endrule
+
+endmodule
+
+// ============================================================================
+// Off-chip RAM connections
+// ============================================================================
+
+module connectClientsToOffChipRAM#(
+  // Data caches
+  Vector#(`DCachesPerDRAM, DCache) caches,
+  // Programmable per-board router, reqs and resps
+  BOut#(DRAMReq) routerReqs, In#(DRAMResp) routerResps,
+  // Off-chip memory
+  OffChipRAM ram) ();
+
+  // Connect requests
+  function getReqOut(cache) = cache.reqOut;
+  let reqs <- mkMergeTreeB(Fair,
+                mkUGShiftQueue1(QueueOptFmax),
+                append(map(getReqOut, caches),
+                  cons(routerReqs, nil)));
+  connectUsing(mkUGQueue, reqs, ram.reqIn);
+
+  // Connect load responses
+  function DRAMClientId getRespKey(DRAMResp resp) = resp.id;
+  function getRespIn(cache) = cache.respIn;
+  let ramResps <- mkResponseDistributor(
+                    getRespKey,
+                    mkUGShiftQueue2(QueueOptFmax),
+                    append(map(getRespIn, caches), 
+                      cons(routerResps, nil)));
+  connectDirect(ram.respOut, ramResps);
+
+endmodule
+
+endpackage
diff --git a/rtl/DCache.bsv b/rtl/DCache.bsv
index 3162aade..b99d4667 100644
--- a/rtl/DCache.bsv
+++ b/rtl/DCache.bsv
@@ -496,7 +496,7 @@ module mkDCache#(DCacheId myId) (DCache);
     // Create memory request
     DRAMReq memReq;
     memReq.isStore = !isLoad;
-    memReq.id = myId;
+    memReq.id = zeroExtend(myId);
     memReq.addr = {isLoad ? readLineAddr : writeLineAddr, reqBeat};
     memReq.data = isLoad ? {?, pack(info)} : dataMem.dataOutA;
     memReq.burst = isLoad ? `BeatsPerLine : 1;
@@ -589,66 +589,6 @@ interface DCacheClient;
   method Action incWritebackCount(Bool inc);
 endinterface
 
-// ============================================================================
-// Connections
-// ============================================================================
-
-module connectCoresToDCache#(
-         Vector#(`CoresPerDCache, DCacheClient) clients,
-         DCache dcache) ();
-
-  // Connect requests
-  function getDCacheReqOut(client) = client.dcacheReqOut;
-  let dcacheReqs <- mkMergeTree(Fair,
-                      mkUGShiftQueue1(QueueOptFmax),
-                      map(getDCacheReqOut, clients));
-  connectUsing(mkUGQueue, dcacheReqs, dcache.reqIn);
-
-  // Connect responses
-  function Bit#(`LogCoresPerDCache) getDCacheRespKey(DCacheResp resp) =
-    truncateLSB(resp.id);
-  function getDCacheRespIn(client) = client.dcacheRespIn;
-  let dcacheResps <- mkResponseDistributor(
-                      getDCacheRespKey,
-                      mkUGShiftQueue1(QueueOptFmax),
-                      map(getDCacheRespIn, clients));
-  connectDirect(dcache.respOut, dcacheResps);
-
-  // Connect performance-counter wires
-  rule connectPerfCountWires;
-    clients[0].incMissCount(dcache.incMissCount);
-    clients[0].incHitCount(dcache.incHitCount);
-    clients[0].incWritebackCount(dcache.incWritebackCount);
-    for (Integer i = 1; i < `CoresPerDCache; i=i+1) begin
-      clients[i].incMissCount(False);
-      clients[i].incHitCount(False);
-      clients[i].incWritebackCount(False);
-    end
-  endrule
-
-endmodule
-
-module connectDCachesToOffChipRAM#(
-         Vector#(`DCachesPerDRAM, DCache) caches, OffChipRAM ram) ();
-
-  // Connect requests
-  function getReqOut(cache) = cache.reqOut;
-  let reqs <- mkMergeTreeB(Fair,
-                mkUGShiftQueue1(QueueOptFmax),
-                map(getReqOut, caches));
-  connectUsing(mkUGQueue, reqs, ram.reqIn);
-
-  // Connect load responses
-  function DCacheId getRespKey(DRAMResp resp) = resp.id;
-  function getRespIn(cache) = cache.respIn;
-  let ramResps <- mkResponseDistributor(
-                    getRespKey,
-                    mkUGShiftQueue2(QueueOptFmax),
-                    map(getRespIn, caches));
-  connectDirect(ram.respOut, ramResps);
-
-endmodule
-
 // ============================================================================
 // Dummy cache
 // ============================================================================
diff --git a/rtl/DE5Top.bsv b/rtl/DE5Top.bsv
index 2173526d..0e5672fa 100644
--- a/rtl/DE5Top.bsv
+++ b/rtl/DE5Top.bsv
@@ -22,6 +22,7 @@ import InstrMem     :: *;
 import NarrowSRAM   :: *;
 import OffChipRAM   :: *;
 import IdleDetector :: *;
+import Connections  :: *;
 
 // ============================================================================
 // Interface
@@ -114,10 +115,6 @@ module de5Top (DE5Top);
     for (Integer j = 0; j < `DCachesPerDRAM; j=j+1)
       connectCoresToDCache(map(dcacheClient, cores[i][j]), dcaches[i][j]);
 
-  // Connect data caches to DRAM
-  for (Integer i = 0; i < `DRAMsPerBoard; i=i+1)
-    connectDCachesToOffChipRAM(dcaches[i], rams[i]);
-
   // Create FPUs
   Vector#(`FPUsPerBoard, FPU) fpus;
   for (Integer i = 0; i < `FPUsPerBoard; i=i+1)
@@ -167,13 +164,18 @@ module de5Top (DE5Top);
       connectCoresToMailbox(map(mailboxClient, cs), mailboxes[y][x]);
     end
 
-  // Create mesh of mailboxes
+  // Create network-on-chip
   function MailboxNet mailboxNet(Mailbox mbox) = mbox.net;
-  ExtNetwork net <- mkMailboxMesh(
-                      debugLink.getBoardId(),
-                      debugLink.linkEnable,
-                      map(map(mailboxNet), mailboxes),
-                      idle);
+  NoC noc <- mkNoC(
+    debugLink.getBoardId(),
+    debugLink.linkEnable,
+    map(map(mailboxNet), mailboxes),
+    idle);
+
+  // Connections to off-chip RAMs
+  for (Integer i = 0; i < `DRAMsPerBoard; i=i+1)
+    connectClientsToOffChipRAM(dcaches[i],
+      noc.dramReqs[i], noc.dramResps[i], rams[i]);
 
   // Set board ids
   rule setBoardIds;
@@ -199,10 +201,10 @@ module de5Top (DE5Top);
   interface dramIfcs = map(getDRAMExtIfc, rams);
   interface sramIfcs = concat(map(getSRAMExtIfcs, rams));
   interface jtagIfc  = debugLink.jtagAvalon;
-  interface northMac = net.north;
-  interface southMac = net.south;
-  interface eastMac  = net.east;
-  interface westMac  = net.west;
+  interface northMac = noc.north;
+  interface southMac = noc.south;
+  interface eastMac  = noc.east;
+  interface westMac  = noc.west;
   method Action setBoardId(Bit#(4) id);
     localBoardId <= id;
   endmethod
diff --git a/rtl/DRAM.bsv b/rtl/DRAM.bsv
index b9bab54e..e5d4a33e 100644
--- a/rtl/DRAM.bsv
+++ b/rtl/DRAM.bsv
@@ -5,8 +5,11 @@ package DRAM;
 // Types
 // ============================================================================
 
+// DRAM client id
+typedef Bit#(TAdd#(`LogDCachesPerDRAM, 1)) DRAMClientId;
+
 // DRAM request id
-typedef DCacheId DRAMReqId;
+typedef DRAMClientId DRAMReqId;
 
 // DRAM request
 typedef struct {
diff --git a/rtl/Interface.bsv b/rtl/Interface.bsv
index c3d16860..dffd8ac2 100644
--- a/rtl/Interface.bsv
+++ b/rtl/Interface.bsv
@@ -248,6 +248,14 @@ function BOut#(t) enableBOut(Bool en, BOut#(t) out) =
     method t value = out.value;
   endinterface;
 
+// Convert queue to BOut interface
+function BOut#(t) queueToBOut(SizedQueue#(n, t) q) =
+  interface BOut
+    method Action get = q.deq;
+    method Bool valid = q.canDeq && q.canPeek;
+    method t value = q.dataOut;
+  endinterface;
+
 // =============================================================================
 // Merge unit
 // =============================================================================
@@ -578,7 +586,7 @@ module mkDeserialiser (Deserialiser#(typeIn, typeOut))
 endmodule
 
 // =============================================================================
-// Expansion and reduction connectors
+// Reduction connectors
 // =============================================================================
 
 // Reduce a list of interfaces down to a given number of interfaces,
@@ -651,31 +659,4 @@ module reduceConnect#(
 
 endmodule
 
-// Connect 'from' ports to 'to' ports,
-// where 'length(from)' may be less than 'length(to)'.
-// Works by wiring null to any unused 'to' ports.
-module expandConnect#(List#(Out#(t)) from, List#(In#(t)) to) ()
-         provisos (Bits#(t, twidth));
-
-  // Count inputs and outputs
-  Integer numFrom = List::length(from);
-  Integer numTo = List::length(to);
-  Integer q = numTo/numFrom;
-
-  for (Integer i = 0; i < numTo; i=i+1) begin
-    if (q == 0) begin
-      // Connect input
-      connectUsing(mkUGShiftQueue1(QueueOptFmax), from[i], to[i]);
-    end else if ((i%q) == 0) begin
-      // Connect input
-      connectUsing(mkUGShiftQueue1(QueueOptFmax), from[i/q], to[i]);
-    end else begin
-      // Connect terminator
-      BOut#(t) nullOut <- mkNullBOut;
-      connectDirect(nullOut, to[i]);
-    end
-  end
-  
-endmodule
-
 endpackage
diff --git a/rtl/NarrowSRAM.bsv b/rtl/NarrowSRAM.bsv
index d0651392..0fbd34fa 100644
--- a/rtl/NarrowSRAM.bsv
+++ b/rtl/NarrowSRAM.bsv
@@ -9,7 +9,7 @@ import Util        :: *;
 // ============================================================================
 
 // SRAM request id
-typedef Bit#(`LogDCachesPerDRAM) SRAMReqId;
+typedef Bit#(TAdd#(`LogDCachesPerDRAM, 1)) SRAMReqId;
 
 // SRAM load request
 typedef struct {
diff --git a/rtl/Network.bsv b/rtl/Network.bsv
index bd435a11..642acfcb 100644
--- a/rtl/Network.bsv
+++ b/rtl/Network.bsv
@@ -23,6 +23,8 @@ import Socket       :: *;
 import Util         :: *;
 import IdleDetector :: *;
 import FlitMerger   :: *;
+import OffChipRAM   :: *;
+import DRAM         :: *;
 
 // =============================================================================
 // Mesh Router
@@ -366,27 +368,30 @@ module mkBoardLink#(Bool en, SocketId id) (BoardLink);
 endmodule
 
 // =============================================================================
-// Mailbox Mesh
+// Network-on-chip
 // =============================================================================
 
-// Interface to external (off-board) network
-interface ExtNetwork;
-`ifndef SIMULATE
-  // Avalon interfaces to 10G MACs
+// NoC interface
+interface NoC;
+  `ifndef SIMULATE
+  // Avalon interfaces to 10G MACs (inter-FPGA links)
   interface Vector#(`NumNorthSouthLinks, AvalonMac) north;
   interface Vector#(`NumNorthSouthLinks, AvalonMac) south;
   interface Vector#(`NumEastWestLinks, AvalonMac) east;
   interface Vector#(`NumEastWestLinks, AvalonMac) west;
-`endif
+  `endif
+  // Connections to off-chip memory (for the programmable router)
+  interface Vector#(`DRAMsPerBoard, BOut#(DRAMReq)) dramReqs;
+  interface Vector#(`DRAMsPerBoard, In#(DRAMResp)) dramResps;
 endinterface
 
-module mkMailboxMesh#(
+module mkNoC#(
          BoardId boardId,
          Vector#(4, Bool) linkEnable,
          Vector#(`MailboxMeshYLen,
            Vector#(`MailboxMeshXLen, MailboxNet)) mailboxes,
          IdleDetector idle)
-       (ExtNetwork);
+       (NoC);
 
   // Create off-board links
   Vector#(`NumNorthSouthLinks, BoardLink) northLink <-
@@ -398,6 +403,14 @@ module mkMailboxMesh#(
   Vector#(`NumEastWestLinks, BoardLink) westLink <-
     mapM(mkBoardLink(linkEnable[3]), westSocket);
 
+  // Responses from off-chip memory
+  Vector#(`DRAMsPerBoard, InPort#(DRAMResp)) dramRespPort <-
+    replicateM(mkInPort);
+
+  // Requests to off-chip memory
+  Vector#(`DRAMsPerBoard, Queue1#(DRAMReq)) dramReqQueues <-
+    replicateM(mkUGShiftQueue1(QueueOptFmax));
+
   // Create mailbox routers
   Vector#(`MailboxMeshYLen,
     Vector#(`MailboxMeshXLen, MeshRouter)) routers =
@@ -540,13 +553,24 @@ module mkMailboxMesh#(
     idle.idle.interBoardActivity(activityReg);
   endrule
 
-`ifndef SIMULATE
+  // Interfaces
+  // ----------
+
+  function In#(t) getIn(InPort#(t) p) = p.in;
+
+  `ifndef SIMULATE
   function AvalonMac getMac(BoardLink link) = link.avalonMac;
   interface north = Vector::map(getMac, northLink);
   interface south = Vector::map(getMac, southLink);
   interface east = Vector::map(getMac, eastLink);
   interface west = Vector::map(getMac, westLink);
-`endif
+  `endif
+
+  // Requests to off-chip memory
+  interface dramReqs = Vector::map(queueToBOut, dramReqQueues);
+
+  // Responses from off-chip memory
+  interface dramResps = Vector::map(getIn, dramRespPort);
 
 endmodule
 

From 48bd4510cfce23dbf88fa1215f471c2d8a05288c Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Thu, 30 Jan 2020 17:15:15 +0000
Subject: [PATCH 04/78] Destinations can now be routing keys

Support routing-key-based send operations, but programmable routers
don't yet interpret routing keys.
---
 README.md                        |  19 ++++++++++++++++
 doc/custom/ExampleAccelerator.sv |   1 +
 doc/custom/README.md             |   1 +
 doc/figures/fpga.png             | Bin 17166 -> 17154 bytes
 doc/figures/fpga.tex             |   4 ++--
 hostlink/HostLink.cpp            |  37 +++++++++++++++++++++++++++----
 hostlink/HostLink.h              |  10 +++++++++
 include/tinsel-interface.h       |   2 +-
 include/tinsel.h                 |  10 +++++++++
 rtl/DE5BridgeTop.bsv             |  13 ++++++++---
 rtl/Globals.bsv                  |   6 +++++
 rtl/IdleDetector.bsv             |   2 ++
 12 files changed, 95 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 01e744a0..b001abcd 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,22 @@
+TODO, document the following:
+
+```c++
+// Tinsel API
+// ==========
+
+// Send message at addr using given routing key 
+inline void tinselKeySend(int key, volatile void* addr);
+
+// HostLink API
+// ============
+
+// Send a message using routing key (blocking by default)
+bool keySend(uint32_t key, uint32_t numFlits, void* msg, bool block = true);
+
+// Try to send using routing key (non-blocking, returns true on success)
+bool keyTrySend(uint32_t key, uint32_t numFlits, void* msg);
+```
+
 # Tinsel 0.7.1
 
 Tinsel is a [RISC-V](https://riscv.org/)-based manythread
diff --git a/doc/custom/ExampleAccelerator.sv b/doc/custom/ExampleAccelerator.sv
index 34a97fc2..acc73455 100644
--- a/doc/custom/ExampleAccelerator.sv
+++ b/doc/custom/ExampleAccelerator.sv
@@ -5,6 +5,7 @@
 
 typedef struct packed {
   logic acc;
+  logic isKey;
   logic host;
   logic hostDir;
   logic [`TinselMeshYBits-1:0] boardY;
diff --git a/doc/custom/README.md b/doc/custom/README.md
index c380f9c9..fde29010 100644
--- a/doc/custom/README.md
+++ b/doc/custom/README.md
@@ -74,6 +74,7 @@ custom accelerator or a mailbox.
 ```sv
 typedef struct packed {
   logic acc;
+  logic isKey;
   logic host;
   logic hostDir;
   logic [`TinselMeshYBits-1:0] boardY;
diff --git a/doc/figures/fpga.png b/doc/figures/fpga.png
index 2ea48fdcc2aa8e3ffc60dd4168ddba0d49ff1cb7..71a4c97f0c8b413775eb60c379461ee91daa082e 100644
GIT binary patch
literal 17154
zcmbW91yo$&lIJgOA-Dv$;E>=hArRc%9fCt}_u%gCPH@-Y?hXNhyIb(RdGF2anK?VV
zGusDgINg13f49D>fBmYeL*!&ckrD9_0RTW27ZX+h00><0{vkXh`17`F%oF^DU@N9>
z4*(ckf4?E3x>X<mfCLa1{-orxe4_2FrKtKSz>)Umi;z!MPb`Bbgk5&9BRMB+$K0|;
zeP#Xpg~gG1S*3<WS%b}y#h&JROT7k-dG%)IU~IvgrOzgi6r=&+>%^*tqhtsi(Sy%k
zPk)-5)7?+)C$4z+cul=XQ>0+!vIUtw>P?af`a(KVU{cG83i)m-L&zL(Kz~{$#-;Qu
z^&08I4DF^5V2#QB=){%HfR2dy#h!vWrOn|$;eKMSOciAXGL^@<!rvdt<K*ltD>_=w
zPpCqx89m4yTZ@8<iid{>Mb;QOXSLLs!dFmERkhLM{u}Uf{)eW`uO=o-LK+4JtDf)3
zZ{NQ4^72B$VQllbuR%a`eR;Yk<Z=3K+qAuHgdU=#t1G~pz!vXB=l(9#TS7`oN>UOD
z8F?=DMLBSj3L~T!3hwRkQmrTrDhCbe>aSlinY_(LL+B?Dgl`}4&|CY=E~<-*i|JG=
zgui^*-PswkxxDwFP2es+<YG@7+prxps>#UEwzsd<@qGUD*~P_0MJ2JbQ{Zh7Mp$1V
zP5uR*zt@S=pho^!8b?q-000z9rPNq1)#z*X+&?^6uXhmok)VgvIiKl(Zx+KcCN?%!
zK~WI`I6XaWbw2AisP0kG{cPNLNEjkT2?q^57>LVe9^YnwZykz&f|BIT6C&)FU}PSN
z`R-i_?pC#0`No0R=dX>?3bn!G<SA2_$lOsz+)iQR<T--A+~D$4PE(lMT#|@!b#(_K
zMG^7QVv3()I{u>D;4R-j-p77-Zh(SqoS$Ue3!iq5Hm?@`E}WhQOT@y$LS0?m($dmS
zSyji?R9{)u`rSfRdXGoB|6*eGr#I|j;pNeDb8`m=2W2WhAOR!3O{1i5L@Ex->=(!*
z9zxVCKQq{uv#e#QQj=-`Ks1UdA|%Al*_pWMVA?DoMn@&@*{VNqaadZ_M%0gF-%ETz
zm5TNahR_dDE6qbwr<Rh6iVnvkCx+m*)P>Mly4X#%5z`ttg&))iG}vLf?kDyB0xxGl
z;cxw)wXn)!c?$<PlfT^*nKuRo28LlXJY4)?BTuPVi*$J3=(rHW+}rtdUI>qI_r4%E
zTFm(D)bp$DNkxI<ayu{Z2?{-wBFuT0%$Pfr|Mk=oh2<NCVOoWkXTFS17LzapVEMcg
zBWy0SC3)bSNp>m%`21o%P57k(&GO6r5@AyZ8sK!VEomwp3BNYBoSB=5iPWlKnRZrT
zzp$5RHpx%0nNY!_xzZLLjxJS9<#9s`=S`?Okt=z}6)P`&=jN5>73sj&*}l$dL^eN^
z2n}Sb|9)XwGPcVti%c<1`Zg*;ck_d<g~E_0+D;7~_<bOz&fLdF>Q`d;Vx?uY9KCtS
z*uEW(kl)ES#_%(va72-Cl`KchfRUD^^k9u{V}C<TC`Q;S`uCp1`%T?#7M{qP%El)M
zV3lvPd$^*PnuqYpTTP!mM@Yo~A)B#xM8YegPw-U-g8*j~0F?N4!rs?>B(^Xj%+}?t
zIGL9m|HWJpMBdjoE;R6zw(LG8q}$K2D#J%m5=W)t?yXS=_32Zp;sV}0A$36Ik%pVY
z#5%6_Tk@PHjR@c6B~(Ly+|WEUN#{N75gRB#tRTBuoAb}NZ*wJ5Vd+9KbqQrbHcBZW
z!{JsfgsZu}GJ`dQgHyf^eLS89DRLBZ@tI%x<r%7eCqTVL8q4~f?+`b`;P;tQS&u}|
zR2ugwVd-I5{HH!{BJ?>er1}hDG`YIs1^9GCG!DAN1m^J=q3+m_MFyMjKL@Wu${3mo
zwy$?B&Xy#6?7`LgzK!Zq-G`CMf-QLrLn9%v%;QYM3$Oqx&gdU2#opeJY|puW<W}Z}
zGA*%<Pk1ZkZ0O<=?3WzwnlT7GwM){l?-$iO6;1=14mM%2xvmM@wX(Q}c7B}{T6fjX
zUp>~9KA*vqQF_@#ns&vFch+bCfc5Qm$i}E7ibnkO=qeH0IrDPIwy17W%;2u8xn!b&
zb<Xi;2?T3iOVcUwhWXx^1^f*j4y5J0C(But0Lh}3_G&s&TA08DZMcQNwrUpB$v0xo
zT&BI8i%Y1^Pb$a&s-$sPQPT;RPDOm7ESYe=3yQZxMzHrhle^ZP?MUSas?xmuF=!#1
zsHzFV+AG0bKFNY^V~U@iu2LsL{T39UVe=AJ9Ig)`L_~!sGz-d|h~~|{2$fo6@^D9w
zV@UJ|@zMp!coB#AOrguaft(%Yq70rII`h8~A>R^%X6r&ClEiaa^73?&#K{_|^V$`A
zw^S4}oy4298@VRJQxjp{63m4T3wio>dzjMqL14s3m~6zYm!@nF0*KDAXJvEYQ5-(H
z*C=m@SfTsiOG=M;@=YSY=F%-wtIfVKRY%#A)Sv~v`~~}3BG@GyIpSv>B_u!sKlnI#
zX;Jb<Ys4n(+>`x9l1%S6@T?4}^%ekZOb@Rng0>%xNPz6UocKl-Wu2}87#7&(#>J@g
ziAolG4e~E-lT%HoE2<NLsI1!11T-g3Gi?YmwF6a`blhgxO(JpHgTlwhO!R+Jg}NDn
zogo2&KY7z@-<znrj16$@$JGY>hGnjleac!=65ZbuGD?LVOue;D$Ho`Cb1sTNsdlVy
zJa@>I3e}6W&50l!`uY{)luM_>y5lb2)<l7zOqGIgyFgxJJ6`v(Nsh%{vsRMptjMK5
zXqhqCS%GI-y8dTMV8?8K!C-2ISC|UM{OUYKu`t<D+_epw3xRO2p$nh6NiS6ZGY8}4
zr~IgEuW6^gE?8j$>%QDUx89{|#gJtqR%V$s*AwU8#c}IJ4S((`5fSw?-^z(`63x`b
zi;HrR!mG)jYw2V?e1fD%#Pjxa8tFzjWB#FjnH747-&6xT;BaFR!#XyGB%+`VI>qY{
zV{;J*V84mE5A(N4rQKF^qz*I+fmvxUZw7Vn;6VWwG2u@9kPqbtNFe!T7e_>sqx9f#
zPb?(HssC|y>czo+DwHo{zbhA)<_h#M=nyGf7P=gG87AfTS^1oa??krZx>LTeX+IOt
z@Q0TS)xrfwDSD0;yFmdS)nq?$m%tb@v34y&%RvwO;L%*Yt;HW__2nV-vX&|KphDtu
z(p>-@&^zIkUB%Ho)M@MsV)9k*3JqQFq=jF4$dn<s)aHdO*@cHq6VXZ7_HDxPtmR%j
zo-f25awr<$1fag0h(RBp?m!S1J&+W{63+*vUg}6P!0TzgRPjTXxLFHPUrqLZ?CHm?
zGn~hr-;hQV>&p$p(2lZ>_*i+~Ra}UqN)7u~w?Bw!t0y_5t>a-n(<GAA;$Vp$G3=9Y
z!!2Q~li3zoRz#Qw7!WK{VcLkQJGkT+Y^`sN3|9*HGX6Y2poS>T_hY?l5bas}zCT)5
zmAmPi(j!(FhM)aTUH$^X(x*F9hV@wjFU<G&K-irdV-9DtFf7v|2omN{j9M-`GT>i#
z1PKvf5QTGi&5I<Wa)$3X35G=&q=QSa0@dNUlth>YcYGSa$K#I0e2+F+o&D&1ss#rJ
z_h)&1U4WFd3p!A%`P=)qcl1qSGBQq1PDGi*ctWkpXJVhlYpo5$Bqdj>^!>qGR@OQw
zEk5XWSo#TN3V3@BL=timFmiFF2$Rh%Ec}&o1*@reV=V~NWB*%wu^qD${vWj0T!XL}
z!x(qgM3F*CPidxM?GnM;ASud~r6o2F4vVuR6s+!^p4yt4r{`xo&U7_;c4^Xx1>9?k
z#?SL)$XH`7C3ChN9UUu;HbEr8!el3_tqbGh<ItM<;B9tR3GiuceLP)ltyVj(np==t
zV))3GdIOOgIMehxYSPC;vqA=!n|yw5d*2u*MxiENxGNSD6BEir6xPQ~F1qY(VvkfL
ze^JyW!W7^XHZyBR$H<uY+UEWGQblMvIOJ;grc{o4bYg<z6DcvT3rS~1Ny)p0ze^^v
z7Xu3Czn5ahXAS?cw5)7(brk}DS~C1&$vQg9F$C&M#6ad~tD=wLE#eJTRUfIT;Q_tP
zZ+Q5io&kmKd0jn|6i$&Rh5Su6AH1KNTc5iu|EJLOS=Q!V<s=hOuQ@v2G~4gxQjNas
zKN<)xMs8q;Tr<yj1zK8JWw4q}XYe?OT8QXo6blNEe<VJNDB&+`*lW{meZF0>aXwkT
z8L8}iH{@{h_N3;L=)+n{7)jq*qbc3`>f>Y^^%w*It|6rPA0FLZ(!O_VQw$n)<oHW0
zHz`uP8$CgH&pNdSMd<n4qYPVx9<jzc@w<SJa+yl|)hYK@N`+S>Ma-~Q%P)V<RS#o~
z*T*{AD+MX-vi;s0_TJi6$NchUC~<@NdH$dylo-?IHK*b_#VdqzO85{Zr(>ddm%_3v
z@`80&bBe-+T8^|*)(i9Gd=>tOjzzH)o97)g1$6|GSWsMrlNyOO?zakzmYjld4O`Xm
zU{4I^l<h99UoY}zCGcc7H-WuEEU6{pMDTmFIv*j8WMLDf@V<kGtT6_0$#BQ-5E+PM
zt?^{Rhn1OM4!i-+<~n8XV78q@dBROwTz4L-QYG`5>-i%v2i$V*16EV}JA~SFP`eG7
zCOa3P!B7GK%Ch$cAyRjS{70JZs4=zWkwr=A&Bj_NqM>2j(|FQYq0Pngjo}tbGjlyL
ztO<Au1V`oqbF3Fl4VpKF8OTiR!8vMY^s5=p44udTNrGR$Q_kr<ijk2DTK(a$?mq?6
z3eP!W-)wftN=Wv>TL&R*^+-p1h@>KqT-yJ;B&x*BYL1a|7K{ww9B8>Gql`ZNDtY-W
zH$U7ypagBp?_H-=XmW8R0SW)ClVc54nut2j1l{b)u+fed`>N+Dw5YD?VW9}W(|lz-
z52<wv)HjM`Lv6BzjMi0$2RCwEYP4$!R%D}2j)5ys(dw!*yi;e$c<Kn*sGRL;<{)a=
zOLA6WJ(tzB8Fd?l0DjSb^*&F|Yar9&o;yinkv$dIeA<gDMcH*rDNJWEAUL)R7JaGc
z)Sk8DYIMB7Y*ZE34NhacB*=!^P|IrN8}{LdM39<|p}$a(?_V%&#63ska;2o#r+$oP
zlr1q1ig=rU6%cztIx(CRw=PPiS0{|=57d8)L@;o;eAM%t8sw~JE%!vv(Wm7(U*qCh
zC~(4!?u@O;G|ZBh#$gf_3NTy+YE+Y1Rqn3OsH})FlE0ptcB7y2mRo)6UrABWnOHEW
zxB8{yia^pD|LMcmBt^1>goUf^&5YK${8ZSe_wTZqJ&&0EX?%82*qyH#mt(AyW<>OE
zIQ$5XOpu!h0AOu=b}tKMh$Hb6@KY7kbnO`>mx%m~$xMUoeO}uhUKuGfcubptkzj7r
z;kJbgpUMas89*}7qQ0dg9)vVw;8q7kP}B!I{k&?n_1RMUgaM4Vab(HlT3-!Cnemnb
z+#{1J)6V#4(o--84za}HTvZgq(l0HDam-DuEu3ifo-rptDGPHUquj5K{jpAP(%}aF
z#F8z1<;kf;X1scBg=*FNaOg!*_O}3y4wA^UXHn+Y4K>m9#7P(26Pvu2+v)_mhu^J2
zc<<mazOOSLHfdiQus?4ROKdMJ75q9@s9?idA5xXIl&Ke_+-@k`oQ3JZ+bOp?m5;xl
zt^WcKse;)?X>bKNOV&;%RZB1Ui%jK-bQ_RJUxx(SmcvyujP=DVrkYXcAI;kR7Mo#e
z0b`=@_O)XHvYbrH8(X$Y)f%=qCOgBlQ?U<{&qA#+I$!D>iE(PVcEb8TZtk*kNBGR{
zHrpDqUlP%WzLSRSw#W_-2u5Rn;F`Wnu{3gR+et<p4!>kc+kiudXx*~%CO3Y$zXbAY
zYH~Nxidd5jR}=(T4&S!(4=srm;_n&~?i3Y~>ocMHnX8ohFN|NYR(hGoXCW(i?1rkP
z?+=^AppDwPh5IvcXMsuRsKrF{g_N^lDauE^*~l?6@<BF~Mv)-)?<reD$oM$-Bz<fs
zWkv%lFJY9eB-VmFN%!Q7j8kR-puiZ%>tW{ctXAKi$r2$|`|-07>8ypr$YC&3hPTSe
z$jV+;8RJza#JBasj}lH@RSlokf)t`c2y*`LcNk(HT&9&(2Fk3MY|lRX@QjUwY#s<R
zDOmaS3D)*k^#?5`&R@VX7FFVpc`S%)$X|em#qV*jgFX5_Mpmo7{ikYUoK2QY4@@qk
zI{i&W`qBc^{ZY5x^*Bt~pZqIo&Hh{X!Yuwv6{zDyPrj#{tO{o*?vKy@r9A|txwzZk
z1U3JKgu3JZvWpMeElMx;H-~dQ4L-Z$8Arc=1G<lok5m*C$=T5$TnRvs1c0HTAuzb~
zD5`@9<B(9C1a)?9jzPDB0H|_2n#bp`exoTW@jnt8=9_mg00kvwVL?IQ_wVN7ufw%Q
zP(ZNorY{uS?eS9HvNzlRS$E7c($XFdC(wZr-__OiGR=cOOQPUynOo$Ti9II)bBd*@
z>B6sH<(`w4#i&F?nF$FyXJ=788@&eAW{75O{n@&$gf-Y^SG!}Rq@;ikjbb6aW`pJ8
z{QpFb{BRQ`iyMM~_~7&@0*_5Xr#eLgp}`jKE^Dd$ph${RatWKJvqSyQ!dQ3bwStt3
zOViVeI0bq+g`$1_;GkF^Zd9K^N)7pmKy7p2Oz}<;<;{&ICb+Jg(ish#n>fN6cW3Lw
z4gSbnTwIqHIdaKaSvGz`{4VE$UfPO^ES3pupsM)?HUePXyi&UYiYOraXOpYpAVZW)
zJ{~3hVAG$xO-B@omC0PJG97dKP+KhxY7#&=!Gy3wagKsBS%uBkyegkl<&Xjg<C=P|
zKwUyaUR&GqdVlI3Nk_n``NibZBXBRyrxst5ce0~(C5E7`rj{HXZF{jPQjPhrBQ$pN
zbPG{~7Id#h^<_Ebcf&pi(pDO*(9qBni)2KG9l#W4J@t^XQtR3}hX47x7Rp;GvwbOG
z@Gh%;$>G*yW}t-Mq|rEI$Y_)17OBygTqs_9srU8PawpJzOt&0fuu1~~5iym`Ld8pc
zB{R-3crZn?e~e+vg!k)G@}Z#)L?3wl8?DxL7%stsRqod7j9Db4RjheZ><)^Eg^&$Y
zkQ=K<aqjMjsqr2-WWKL)xJr<Ia_$%vP26xf?KUQUeS%i!vL9*UiHc7_!(h?6xbLfE
z%KV%7C`&{3iuhj4g&=}>=1b_3JG%Z+z|<7Ow6L?W+Jx27hp6z2`)1=`*F3UgV!Hh-
ztI`4q4wnzt_W6b>{TjV<Qw%MNv-*r0W~ckge8R*u>EPQ|%1~JOrD(>A9Rfn8)EM%2
z6Alo-!sW`fJ%x68kW?@CQY-+lBK;DeY^9fRm}9c{pLmDwG0TeNAl~mp(nB$j85#dx
zQ4532;RKGop1j9g=R9BW1D7G>fsL#^l5NpJvUkz$)r~ar7q->SLu)<)amAARW`z($
zf}`3kwO6+hVqkKWoU!~^BD`jX=`{$;q$gQPWkAtgp!W!-mKhuB*<Jdhhm-r9Vuu>p
z%yYPG{$OxuS?f(KOJn%ahL~M-?IhFd!#vSHLctxVxQHV2L-rj06Gyt{nw|NJkv^U1
z7dWx^E|PpVcr`jWZL_$y0)P@s)&?YLVHbvPh$7P}6+{vGEJX|Se@Pq+l<v|`ClXP(
zX{<9aTh%BIG&3TY>wb#7KgHpsXZ7}ro0gVognyed8_xrMT#n%1$*if%g~0iyCl=5V
zA+h+Yb1Z22xua!BH%<C#6#BUo#MGG4T6ZGQ1fF$^%?%_GPhWNJB9Ze4V42wI&)L_n
z^5*`7w&MEoZ`#V*i~hf<tu)AbcW(Y2AW?Eb{kk(#eq2XVk$zeunDC>@@HVaofM3T$
z7(FRTM0L~!Q!z*{KWL7Mgvaq+%=jzZn|2nATk@3*(ou|w6(nr8^m)Q$YU(L7JJ#UW
zo7E>PZnLc#pj;1>8abSHKm=D+bM(6<n!{=LO)A6z<6bx}91s3x<a#LtU{GQS$}hfw
zk%Z}M@c{19<2}cBiAK<ogbO>462oqho?v!Puvr>gY=O^&i;5vu=}H@2saKWzyB^cm
z$-~nLi7aw<{{H$o|HKdy!_gZ$(ROcxSsT5qKt1yRsnNKQTglGVVi{>4AC41sVo*th
zAM^8Ks0%Gm_qFTDSBeY6lF9W+exC|fSWbUrq=!Nep+7<^D$j%inqpTFb+rFm0Ld~B
z^?wSG@JIfe014bGR$FfYvDQ^q8^4)jrt+synN%3-DI*+aG#>XiNF9aIwn&>9008$=
zT-Qj-C}Ep|+n>%dU!ASsIzK3enbq&JOf&NT36j_>{wt7FZ){^ogilvq%xRd$C_`*E
zTY39@qCgnp6C-BiiLP#w<?vcU^(f2d`<m~<fGK{L;pR8+NtpKw!`Oq^2ReH@iVsw5
zqgg++rr_4GrUyBjuH0ISw;H;0i=w=#V{}1WT`So<ujE11DSIN`0&zN6H4t^%D4#L4
zKxS%XAC)^=RKX^M3YthW9wlo=3bskk2XqL@A~Yo`S~OPeXCWTx>mCpU%jPP!FH<F!
zV8P6{G;_vcL&c&%+#oSl%FejXHo!-))v_r*A5s@d!8WBXK%`8`AWF}%D*Vk8+u7#E
z#MZ@&&_<~d8*?|~HpBbAw9yVhm58&mhgrUnxH7e(|0X=^59Fm<OOa>`h}INgpluJR
z^zMy7jaq~{-AgqvaXS*x4`1WLfq4Hbfj!DqHG5@yGU$#SuiC|+%h&RpW(*0Uv4UmD
zk6l)>#>k**Awf3>QlX^>{?UPlFf4dgY89m?5D5)FJO#+@jvK%H{G~@R6IfJh2a7vd
z&Qw12a=mWD`sVzN515COG#C%Nh@yAVtFD7sr!@&qYsjiywfo)g+kAwa(Ww{qFQr>2
z2fcZk<dNDHAIcB$2GwC4@bvSO=E~KAfQACDL&+%zVigBuqoPE(Fy`rzZwE>o%?F!r
z=EquV`PMQ{^v_FJ{aqa*<<W@y!%+;awYF!5j#%`3yMLz_2J{JnLiFdt_l!(Kk-ii&
z&!TyUF5s<~0_gj+mjuc5#IIFt*&6kc;}cTU&`ar?zJ02qo`fQk|0l}$-=dJ42R&-w
z^T=R$C?p6~68tZVwL770K*hxQRR|}ymGlE*E$XyzX!qid&v&Q)M4N(4!`4BTgMp4N
zB{!Hrr@Co7;THxL@k~wkHcHA#>ojj7S>sucD8JX$)gj}vb8>S2xxZ)7YC6B%8KI=!
zzQ5XYK3??g*@%sc6Q@w{^n5;7nwMMtQ8HDYMzYEqBgSj{`)4Mfdpx9F9QyRw*va-#
zoMa;XRK@y6b|q=Grj=yC>~#6n|5tf0Cs&=L9&dbM!TIs_1Qj|)EWh3TZh2`bMK3HK
zyrrfh0=CZ1>+S9w*kObX4|ue#Foj=gE66+Bo?8B)&(J%cKqY65{ZepHq|p};6g1MA
zAxzdK!OSKa8Xm3`exXL6D9$zg!jh)R8PW1Oo6kR&RcQDRIrsP2Y}?yyoUqE&n7ey>
zY1z>zM7+_u3$wFtH2;=QbFcoz0RzndP@@G*>?u-zAUtpYwOAk$4v5p!6|B_>Cw{Mv
zU<jo^oU(qP3N<^`RMD2tRl@Y(?`&yVUR`Z(&TDLHInqJc(-$d%z(3n~UID9i-Jaz)
ztF$YX(La9tsOkWKZ(PVLSSU6Xw~hsU(9_bkW#x=cY0=R?jhG{hTAq)i3lvHu$O;P!
zzsM#(Pu=xFZNvC4%|QVA{PQMVY?Vrw;L9{xuX?G;5|AXAi#CesV1lWXNrXNrb$&#|
z7zpdOsARb9lQ;xvH-{;`KwRTP$sJK;-I1gLi4>=d%?n#=yU|Vj;yf-^SmylIxifz&
z<HHrI(Za|-anu*f{Exp@SCQ^X2hq{J-5V4{Gm*5nkO@wAf2^Fn7QucJI*YWPD$J62
zt6R$Q!LzTw-+Jv?_n+VjQ@n#)UO)WomuvoL8>T&2OC+JX{91BSLqsYr;mKLshnt=o
z#B5%c3T=}AAxG>R$ryD2-m7z!8d(C!pE|XQnb2n}tl#0NLoqj~IK3OYLMt!jcJ+jB
z5Rv!;Vmh$WxRqjUj3H1I4|*@{37a#Hh%SPor2>;JG3gWTjIVv5O8Zs*?<$SEp(0Py
zXfZ^+ba3j&So{c>`+2*krfF0l*V{4dmHt~V04Rmf?d6wxLK*W?D|E_t(f`4%S{Sp;
z$%?0K8}x#^#ZT1UT%WOZ*>$}E1plSZ)aFx7jB*PH>Q~){KvjHWo8PpkPNJFfhKoWW
zWqFGP^w3~boB0$iW>=1~y*=`&za5<|qUfLKs<^G7^{Qg05dz3pm|SF><@45w!bRh1
zK2$0ck;xHT9zG#-2{bUDbU>x>E999(P&eY5AN!9oOv%^RCxcD@T-D@=jtU}3Uk&|d
z8N8A7|AShca-RM7Dsyl52Mv0;=GkI8iCxYJW>SrhyH8Gz)UWS3>%A5JOSR-QAA2IG
z1Cpu;Z7u~G>hfdmCdTcE3@(2Y!Ilv&^8T*z?@e2c=75hHttM}u)x^cO0#=YNQ6><+
z>~o{pIPeS9+o&r1Te0o6!PmCj!oUmUSQ5?(F8&wU*Zq&;D+AD!=HgIcIa_rl8cvJ*
zg50vy?}#tU3Apb)!34}_<_;}DgKGeq3H-aN<Ev4+2CP1BV!q0U#!zOW__ax$*kC;=
zy-o0i>gkYx$zM~gUqu`Kjb=U1`82W=v}IQInXfdpU!vo7=T0&f3DeS5?hW<vIF}tL
z&*6|2hW$LWfd;P#o-No}(w0#&AwV2!L+=mxWCkg$jVrmr5aW#EZm$^#eB<vNA2o}>
zy$}*L5tibnC=Bx<C+(}YR?${wW4LdReT!4rcl{NPQzi^K|FLxjgbyvtnl9Y(lp-HR
zn<o1sjacgt?g&2Ua;&>6aI(-u&e190tGz?}uk^}43Saf)q+eUyFas!=-{o`8v&k2x
zk$7mE0n|a<|4E2A$p<}E62Hokb&fOcm?VAsQSwKo!_C0n9pA{sU29JIhj~nGL$j#-
zE?A=eN4YCCcx?4}%)e@O@h|H6p&OQOW=(_^=}4$rm<$Q|i|sV$-<1bb%ea(Lf2)U9
zF;u)FC6v|}($!dS?Pzf!es9C<L<~ClN%&n!-t)n3E|#W+A6%FLZvuHe58dL-H16z?
z3C<_n6$h~V{lKN*fs3I|WW%NM-ZfeN=6j#uO5(#ink$gUiy4cM=`D=iF@KK+BRp8T
z`lokLAP5^HGkhb*vYBuJ>IqPpTpboI&x8<@n_hf7*2M}qHrb#Sz+Y8mIyx26#EJUI
zV{KxXY@Hl+1M)7tpT!wiSl)UkgZZ4j^Y1*;4{PmWqPQw>Ls1Ng)-ZkJt-C$SWxcVD
z!!hWwrJe5eJ*gF#*1*H$MCcc~PJd@CYz5`hr<ql-Pl8?h^Y;(cxcjS1L|`YzN>lEp
zIn*8hKdV;uri%?l7@fMr(?dsPHYaAgxJ^YqW4Tqh!g(s^me}{bMKOgR_DFiSK_CTo
zYLHmBH2Dr1$x<!Pr06a7goc7*4QjDC-DFD!WW8<`goyv-On<vq{sEl+(QAYX^OgZ}
zm8tI7In?yrJXV-8F2`Jma44Utj=l7dt9T+i2Yg<oR$Xqpfj<fSk9#SKQYFL_x%TAv
z)m(<y5TTe9f~8*S?}bsU1b=qM+Eu>~Y)x${Kt~O*{NRrbiSDGcQ^yaP(?S}$M`cbt
zI(00BDu`tzDr;6S;ELaWn4EkI6RP^Y?z1oFL(6i2Y}R#y8F{;;EqUE4m(`-IbhfJ~
z+I}P|wlepjm`UW?{IaOKB24|WqJt-%B+cU~VUL$N9GzXW`=Qfp#^mI<)|A7+Ksp;~
zi%X1M-{2p9cSk*4Rs2%4*_crjzn0k2&%~(`mP1xFYY7x_+5Wr5<SAH2=yn%-_EIC&
z7GYT?FF4A`hhn<{nf*2V70hVC1vYg@Z4B3g2mS6J0tq*{ZECHgLYF(|vmO!uKF|bv
zp=9m2Vz5iT{Hsn(zok<s0I0kevckB;>Ohnd3hNEV<_|ccgq0HUmX3WW9>&}4<DmR#
z7ho^f<ectAR9ADBSigF&ebX@$erKoTHdZYgPL;FzM})O9m`dP>{Nuv;^Sy9kgW{RF
zCar||H_0O;8!Y4O7NQM;?KA9M^P=5R>F0hy_eZz4g=fMeN|Bl(??CV6^>Q@QSXeF#
zFWYbMhPM>)b5(Sx(xNw$njf;{tj?>#SP(gC`H9;KzE2x$z(-uaz`7(9{R?JsT>;m_
zHz7)wot<~OgWSI6>hcSKPl^21x|mvCzD2r@PTk=OmzEBXMq%ln`^pdC+0<7Jm>u^w
z!SP6G(C*rqbxeV^+WK_?xOM0B2U!5<Y#7wvME_(ZzS0g>a_(lm5e_AK`BsT+ywu&z
zsXAMHz8-Df-+GlaMa8+tpmF7$oTen=i<S`S-(G~Z60LWk4$b>+^?5`>=v;5ez80HR
z2(p!hBB@h_T6}Zrd``>#aIhf;H{+bEli%JSij<1Ji&d62gvgzx^8y1*I=vnbYIC@2
zTDT^-HdNaWF}`4}zhVHB>JEJSGjH@Iu_~ESY^(-4J!5sXfltv#ZDUa#$qOqgs&|jp
zuTc|9FV3E-2YhB9h;2?j8jR%x;SYqm{w*|2Cl1<9-)X*#-xJ=DF@PuahiC7OCg$pz
z$g<d~M3B72tNLgW?t%Z(z(9#|OL>>+3l#*AY`A+-dbwlvhb^#SY6$STgrW<6W2&;r
zqZB3Qm+)M>mmk|v;Uw?wDf`*XMw5^4seY}2doc_E(0$kTLYxpd1zvC7myUTEMl(W)
zo?(|X?1Y@2kzmlob%Q91Ii<WLZ+?KQxUur?ES)9vg9^-we(%@LKi?Y;@dI%ZUW_bu
zCg~0jPNx#KqXHVm;Juk_$P=&G?D0Xv4DL0TISK^oOvXEJGnVpQ;zXPy&O@L8Ol4D5
zJ_gT{JxKi`;X*y8rpVL8l|FB}o%vxa=5!xzvhVKvsIYJ}0m!JZNcwiT=yT2|8;M7v
zsmxwg*AdlkL%lD2SFI33n3xZIiZk$6EN*arKu#|HUO2W&Z-vRocyglBbgs^?Eaon1
z%NhKrt%(XIcn>p?+=lyjz<HE-8B2|9qb>f9Gd!n(fiNPFU6RMxdi-7C;p?k-L*gyq
zBXCA2&p<6%6XC_6Aal>KD(Pf@*yxJoJbg-Zx^JzW)^_ytyrqOB=?s_Uh0hnFguYLd
z^HAh$YU#!IqP<#AcYHK7%7cT?nWAsz{qk~AhB6aK)>ULfKyZRnnaUYW$h7Qjko_px
zGbv424{oY{*!J7`cw~u#&uzI>f$!1I^B<!sOLkQjR7#TOHos1&=a3B2>RGO|*8)Bb
zt(`M%I`L3*JT)h4#3~SW&~zp%CyD?7xr!!;y41Hm?L8_vTLjo%GSjPN;%?S@Lc8F*
z0|y*K^3UJ;atLi?)Ps69Wp!RRUbf20;u^pUVX?su;~vhewxF1|>)obT8RKB}DVri-
zA_bNC^F^W9ue6lApjQ8hR+h%-kPj!FRfWujQEq&}g!j&;>L)OPPrR{V8B{<`%e(|2
z==xb6sj(z(-EM!n!*BTU*~vDI3)%d&?satcReQNI3D-|Y2PbQWd{fdGJx5GVA<ms<
z@9r+{MTC^*3i5#yZk=u@9W|GiX?UZ4)!QRM#*h}-SwWmtM^;sHPqA7UIUz%^cs-GM
zElw*apwY5h_sYXUckE)~kMcd>lV}liUoTRD)Ue#2@6vW}{CWB1w8nkaSl!@L($=i^
z@)xQR3Qq<3D9{Odw!Iyop@VmzO4^S4NI?r^H~#2&`rQh85Amq<(~o8}OLhT{lP>+m
zOD$na8oI^qn@vAyI1m{j0qzz}@QV&F)CNcQ;dyrg_K5rBbVK%~v0JQ#=!DpN<K^O_
zYlakPPlxKv`N3WJG~V<(X_O?GP+ciwY^NSc1QJ}bpXhqu3$&>!KhtB4;7iEso?S^`
z$BNZ6>~{;|az`w!&5=QUkTylTppe(Ce4?x@!jI}1(c~rT5~4;&we97v7CZQ4rJN+@
z)1_yt)~0~<9@R<LGr7a-^fU`z_c~Z7mmqOC-+M1JgMfdpC`^|ZU&^qFtzm&g`GEE5
z{^rUlq-)5PX3$J*1%(7&wl?7`3e~w0$MY8NyKq8?2y?X<e~0Sr$%@316SmF?eOqW_
z-N});MY|_r_hz35BN-&!0UjXu6XTI0cW*hMS7T1PXvq<&>GIm#ag_?~CN!Xbk`3^Y
zF)m!?<7IJ^N99p)#AYSmyB<DB`@qjh${~{?MVWAbNrf^PBgC5r^{X4Pv?lk9RJMj%
zX2%6a*akWdYoG5aub~4i;Pa)SCWqN1T*h-+O+g{8==#sKJW|U1TNJw5*sZnFuk_=i
zJa7QXZ}Ccz@vY0{0a)4gWXIfW+L5s}_r7cC2rIT%{yT)3(0a?mAo`Z0^=4ByH<*fB
z9lrLqEmf4;$NIMpNg+A49XxmHRGptoLR%iGYr`fskXj8P(V4Ga<lN3Byp}t#22f1B
zkQT((WPBHp#>v;h!&>K%gNBVfwSP$5<#RW>oPYeDUxbf_pzIhifQC-4P=#x$aE#t$
zM~Ox~^M&E7Wo5JapQP!nuH#|^QkK(rEhnR<>U~Bqx)jgV!}!*G>o+0Z`-!DHA*i)L
z-(vCEy*@iBN!iWwY4D&lQb|RZIwtg4`?J0#@-42ioa_3~ymrzh6K|b;HdrE1v)>+e
z4pc^A<mdcErY1#LpIzO4YqFuMrSq6|7y0CRCT6n6m09&T;V!DwPHo=%=j&_8>)2wN
zrdL@pH-3=o?(~`rg!f*0^CLA&*)`s;ka9D|*Q2f>Q|<fl!Wj;){w;W%Q{q>iyU_`p
zPFrI)8T^|TO48L6&wdzL$x!V!cGtbtSgE;+AINiwBNj=+`QRog{^AT?e=pHbbN=A2
znZVcF_(+hWP-YR+bO)XWg~Cu^`Bqg+Alb`0m#I>Fu)f*d+VR9+VF6dLU_W9r^w~gJ
z`!4v;>!W*Tj052YMv}G@24H)l?-qaNQgAw1`-SH5ZU}86ky1l`UmH?e^IMN*{#3^W
zven#W7FNW2ai*||b%8<}Ps^W1rF|@^jibK=`k`eQONIiy*BCqUKMLu-4T6ItjS8mX
zEBlW1Jtv>JM2;dl2vxmc0fSsYNmMD*5iV`yH|)01e8|}v&t1IY2X7kSXUxW7_q7kc
zECKe}7G^&Cx7K)1w}bc_aGxPuMUIs<<+ZEjD~QuIir*KO5M|7*P#r78s5s6o0*-_m
zgbT|{y+;Soz-+zB@Rrd{3)n8Vfa>$530lB{@1t6Y6dwLZbw~IOla0Ows`9GbNpWbT
zRnfOyKZE<LaNq3iWG!P4qFzv-`_dEV_;rd(N9XNPzJvEO;ufxM_v|*2@oc)o?dy&p
zV0vk-v${l_o|*D~8Z;BFB#?7!$Y)3K?*d(G47`!Pu59P({`3Q6LE3!bp_q1e#bG2Z
zR$~%LrG+DTkblWhOlIC|lV@@6#nPcZdY8dJERf>(s9EtF7rzl@(1O#6{(~Op^fTD`
z9WeS+5Vr~O!ttGJD<{;8#{E}%Xp;N$!GtH)z5yF7U5OBPBuLa>{^7T2E_d>j(4Q`6
zIx#CDpfN?&sVP#Qy5O^_ORIas2r!AZkXrfFy=>pet!jNAK!@kd-5gxG@w>X}ZzqrH
zCI!@aZJuApxvsznxnAH|_@yDqcL1#)nvxc7eL*&@ketcPX`Be{S;q0}bbV~-j+?@?
zqs;w}=B>4ynPkG%AAk?_`o&TK2kYU3u`zkB1t93-{wHSCMtA>V<37HrG2L=?hvA1u
z@R`E+88O46GsV^V2Y$IAjDxByuZNOI>pWJfmG{5-cUVVp+oPp&BX7f!#QHkdpZey9
z^~aM7i0IdKVzTS64xTPX_|&|Fd8-==gv|WPbAF`ykW&Uskp6}z=li0{n)s3aq%cZj
zf7Hksp$QOV$-`-0cD;0W1$<KS6f>-r7=HB^MsjnHF|vPzfYSeJbQDmP-!NWedPblS
zn<z{|)yIf%L16XS5ZzAL<y$n>aVjQzYcV&^w_eUSu(jX%v2BWc4tHxBDog;MjQXwK
z$)(e`{E-dT+8%EqWHp%Y{o%7X$$X!R*(<Bt-2y=Fdpx5QyK}tQ*F_}HuB6GyDIe+g
zw5hBL*C_BuQSNRKnsCT&7?GfF_@nlS4?WvUmW~XBQjM~>^y%pSYz$G}k3sem?J)_-
zA?Ws};lk<rt@SPgTXL;0h;FXJet{a=jdod#$yLANcBn8B=<veOrW{!s_4}?Pro1p(
zq0<kGjBT$tS$2s}Z~2MnbwiY%>hi4hp9BV?(yc9wU-K_O5izwwbN=+4=Na2!CXe*x
zv2R74N1mRZxa9gu`=o8VR=LGG@6W?AwzOnGIrX*mry_ISY(+z461*ji$56QUE?ltX
zsW(HZucNhvA6uXa2t}#%)nb2WCuhp05|w0N3qpgvLaH-lW{Y2#1l8^b920*Lh`lv$
zl*bic007Av=|-QjN9u-<BR46%T#poHNQ|0{r?>u+DzHXwu4sUd5Bbk4C@rs6o%cu!
z<$Y{i{d7y8V2|pZ;qD`#rr3aN$)8iJVWW}@CJmnx&H6&G33b=x@{Mt4wE+DK)J%9?
zYHDanJx;oTb!^taBhx(n1|}Ne!Z3+KT*w}EyFSz>`hdADDMPC7_@x$}ND>QO-_gcD
z6;?g8*4G|Pf}7x&h&<ijF?1NRzb46J>f(1>@Jw%N&E|zFk~`jr(jCc0u|uWvi}Y29
zTU9r{#V1fD5%YN*wsR1I2~c9klnY*u-J!sHA+M_hUHyfc>#&XOr=8d*{gsTF43P!C
zwdIj;ahlf&3A$I#av0>bq^S?Q1kX}A6w<m2p)dty<jp+c0JQ_KIxSO#Tk_}2(HKU<
zls%Cb0aY>Gsa4rb9bC!oA5&k=2`GSqZ&msAF_?%S3;0<!3WF+q^={5(gNO5TO;{(G
z{We?l_$W;)yE$)<<2MrrsJyf#zT-jZ-aHyVrd+Cae1gW&L{W-*z>crT$!Ju}PHV@}
za9X~YkH2v{W=|5opMwE>Dh8Yja*3qfqw^8vJETghy@gP`rOEc|2s4cE;|wrT7Z+B)
zlg2^-r~w;%Lugl=ZNpmHhO`!mhb{<+b1|@e&&(~{=-B?NE&Dumf9igYt<jP4s#l6)
zM)4@O!ig8a<@Za5IGw5?Nu~%-uS_jE?Xf;p+FJNs-y?kTTy;CM&|&h-V@vQoRKh4P
z3CEh@z~fxh6K}XTzn8D`m?KLjtaqGpb|dn-oaQI1caH{u*fpXzk-&PL?JdlNDy~=n
zFMSszAb0~01Js!o$S;v^sK5h1gH!J<@zJj@jT)>;HW{2c?ve2Pzm_sNgLlwuE)*p5
zwgQf88JPq5t!9tZ-e8AyvD@QMG6$yp$zOc`np~$*VuY2$4Yn`i_oO~02KwU!^j*?k
z0l?ECWUPOX^uAHs`x*V|=?0#n=KCZ705x&o7Cc!`h@dkCb2={Cj4dl{BZ=a#Rck_m
zsK&ZJ%_T={ZHy}}zZOx(Az}k~Y!xp<;fx>MMY38rmIZF#w@R+p8Z_#WM(1P(yR)Oe
zVth8`S!&s`gKobS?YwO9VzF!Yc)t-QVMn5rR$9gqgcS@O>rqtg#H#%uzu@0V3W`wW
zRX$%*1rmA{vVF3}s$Sm6>w7W>f<6tiO0>1<hRRmL{lfXd*Z2(vD|u}ctp%v%D5F0c
z&^>RI?)0?dN>}`$f#_TQ6+7z9HlxW>p4;o(l*y?uIqf^7>%q+~V2NQ6y1?K)qo}e!
zcPtl#B*o*?6<i!Tf{oQ4w>MHL9*A(>fICpdcS}l?X#EaFKMUm|2bU9;b<syi7o<f$
zcti`TH>DPU&U(?htc)lSZYc)ktCAGcY%Ak9B%#nI!DX6AULlI!C(DZpLHw*enJ*rm
z<o^#WEF1=!J42xaGh!JI13d(qA>b1fcG--OQIC5CSxoT6-re<ewsFMf94t{36)HR{
zp7e#S05pL(Wt?5J)@-2M8;-@rMKv`w671#r`udxjo1DH-gC?+w1^}}Uv1|Uw9{j{N
zWSBimN%@EHXg5yz<Kr@QtnzP69h{v}SnF82E^;3p+?Z3Oie@4n5;YsGm6erC)`(e5
z-lUE3I~^w*JPYN>K|(@ycX!95>$bU6=jK8N^~h7li^owqIyp%cO#Ao%eXI&4*Q?I!
z&lBD+c_Xef%XH(qUiTuj3D%2M5c8fE7G*&}{ad~=8QeF2F1F(1F$Fr4#y201Dm!ax
zYcVh|$jQlXJWGVhT(9@#_4Kk6MOoVy;z`j%V3)~K^yI*jQp<$Es}_zd?w}^vGENHr
zu%8#Txw#1z3QfmSv9PclUtgX_N2P$D;3VMD;i1>-^WA&N#5S-adaFO8)$Qic4+d#_
zi|OR#1RN-Hd$>LTdxKOfwC=$(C@L!YpxIDfQX;NAetmrnA%O%`H8c>hna_q`ObqUM
zz1;8hh2dC;3m6Z_gFS4ltklHBADxaD!6`UpB_-VR>GbsU-rnBbT@$D^i($DVm<O&{
zgmD}~PTSvcvit{)b1g0xn}TZw`uZCO2MB;(wVDz0()9GV-e5HM+hbpRG9)A<QBhIE
zKq0aiTh8>dvZEHqqr;;kumlJ%&|xx``uq29j&%==pdJX|+~tYqtSzs>b8u)Vh2OK~
z7dA31^rxzyA63fL>>ldazr4YVFflFK73OD788xVW<8EeTv<((Ak55jJT`HHlj#b&!
z)YXUbNJ^B3w=RD7H#Rn^uAvfh+6vxuWoBj;$!5`h`QgE@tJw1+prxgSN;VTTtUnJA
z<`SQC<bHq397+11+x*RVsm53W0h?Ye8_z>iU41M|;B~Vv%pA(DZ(oQSlU96XYN~6e
z)HInKJ;aDvoyn)XtjynDPLp}L=kfb@*m^>1<xYc-xR<9u)<n{f5omcNlLx%k{8{UZ
zi|{~n8M-5dVNO$%%kF3jvDY6`%P%8|3~5~UyJwxRUK`zkRx1sjS7RK2Pk4BEX=!P0
zZmvHZN_R{$I9Jf&`Pk(TcXfHGQlYgpISCG&CZ(pfeA^lB2}1dx)Ak26z2$ZHmnT=X
zftf;S#WWHgp4R$$$Ajqtz^A;N9y`p~Fweu`i-CcGgG1eB251dtGlda^JoCfDv<wW=
zGb=W0ZCqy4`JRur&v%`#86Wk9!0tj((SgIcG8SfLMxC}MlQF!4#k|&5_xsBo3`!{<
z@0W`K;w<kMS7qh7<$8;R)DCdYHJ#V>@~=I&I$4Q}ha8qoWb!?N4-;^^{`&lI!(shv
zQb*$P?(FXNwxX;I3OHVCUl|yn;^TYn4Z$dq$#C0>;84=O2>Z|x93K93Ir0GlSogRZ
z14jn%@bF%4D!p&Y-YWXIo=GCW!@Gb>r5`bUaQ#EGF%mT2?ZFrw9i5z<9B=_N+ud%$
z-?5lmSm5L1tCqWb;ePFWeXh*OfrxCi<@^O!+HcR+JHfpNgM@v&Tu(<!D^Y!^(j6u6
zvbeI6mYLaMwZdK?z{bV~PU}j6?ue6s;B8V$3IdNTIHBvGv-?qy9$PY}P1taGdKwZ=
z2<#4}l1UenmR6?wtfi$TAt9lmvApj6da}2-cYOTR1xG9_D%$z-a0HUd%S#s1@zE>+
zT_%r*?KoL*7ralJSzYx!Uaa=@_09VdAt5dvQ|b4&0g%In2q0l+XQ!e0yQ!6El`|*u
zM5CNemL2!T2?^eEn9mjm1O&L>otjOJgWZDL{O21%#F_k_sRLVGLSBz2jmE~tp(m~u
z7IaYql9H1C<U<%#GWkVCD~pRR?(XXyp7@Rwd7tfQFtNkl$B2Q8S9S9hk(5!Z>8IYu
z@pLXI;P<y?TV{If5xtL~C)l22CrE|>z!_kK08>#`M$LwBF$!2XxH4pBeLDG@TY=Xn
zMcRa6oaA(F$I{|rYAULMkr9izQst_jpc^YFD5!G3ySdrf-d0jl`suCkI0wF6ROkpI
zJ`H+oQzN54fByW`g^seKh9g3>&;-r={&$x@XkH*WL4t<l&$2PQKaOX=|6LRphXL!~
z?0#o0B;@zh6(0VkuQ0Yt4;~tlAt5r>=_9U`oLuy;My&d{tgP3AqO89T^YTRvw0vP<
zVO!gCu(M*_>(Od92KL$AcDbWCJDY@)6F00676xW4g$1qy0Zy;VnEDoDJ5`0y;2no`
z`Y8)UNJt1c^2`X1(WFiNYPOGyi(9h1bw6vTa!vrZD!6CRLWG;GGI(9Tfj&94CP5PE
z6DoFC2B+QTU<^3~aCtTzFv0&QMUQQ>To)ot_CvQ*M^c;?xogXo6I}X<&UKs;$SBnb
zMnf~RX2XF<a4MyssHo@1bto2HBBQq3*;+f;R|*;-!;a5wOf<wVZ)J;%%PdSpm{T<z
zOw6CYGdD*IJ453cJVo&+!0(#{-Pb+)pZ&<MST9+EB-B4J<lVH;Dk>_nJa3mk4~)q(
z2I(tE7Bfg^nIlJyiMRs}2PbpbtR2i0u{D=%9#yFwm#f!S=(N+G<&CuZg}#T<b#`)k
zIqyLs_Be+C$GYLxjhUBZ1)ia);yAj&;p+DGcF-<WJe&?^OJHDNE+5|W@o6jb7JQSV
z^FMSH6B}%@+d{{{&>u(?qX7M0PHnBdgv2nR8SXWRaMS;OE<k1Hi-wYt5o}D1jwp6m
zpMmsezR!MOSOy(qM{aI+UtizyvK#12L_|aksyRP?w7olBegFPFFVP2f_W8B7m-CGt
zP*4cvfK4~|kB^NF4c(;fCMLW4Q+Y6uT?@M2+6oHYdspBL*p6>HK@y`jx5K{N=g&Pe
zMY64}t<TTTSg(Hrb|yIR-_emdvFGmYPIu!009{>O;Ie`J(ctw(57`E%h`=b3$!SLx
z*7wcPvE6Q~A6(AMt1Bo-$lr@Kg@wC|)rR1?g67`P&~WL4(ZAa2j6%q*q^=H5gMa?~
z83Hiu4;Pn|1pSS<rDYqqlfi{{;avm$5fQ%!{HN%sDDX|XBO)RW8hvwcaImtn0%!K+
zsOf2F^sCiWD|JAV%8iRtG%+bGE3*I{w7U9|P3Loi{hkaJRZ$quhfH4Anw*@jm6er=
ziHU`UyRyoT!-E5GvoOXDDG?lhf0lchme2b_$m?<rdK4{24o1e)0b+0POz?tw5Rj4i
z>^6G~B$K$gx&8e7n3<WUXJ*pU()|7WK&J^tYDFzA0z5po<3*+r;pwTVn9NLm({TbE
z9Gtjs_;?Y8yuu|5v(dt2Q~aGK!~6UDa#{dsf&>pYH#pq|r}!Ogt|=|01s`pw2K%dA
zT~qTet){XPR4H($cr@(c=UIO|I&P(wmzVb)S`ztSpn|}@q9G%24Q_95zj($#{M@>b
zqO=!`ga&+qJms@-8?xo7Ra8`<PRL988K~vn&{4}E;GT+s|0<lH%@t(&T2)h{(0czX
zny^Npq^`Z4a5W*o0M{+1FnIKw1s^<^Ac^2ALZ7WqU?5C_#2hOMEGm~77T+vvAKe>x
fKZk>nCxMnEcjLSZX(sSPEdX&58Q}^cJ-`12z7Lzp

literal 17166
zcmb`v1ymhfx2C&scXtgg!JXg|+}$m>yAy&-fZ!H%<L(+DxCM8I;O>6w`_F&6@9EJ!
zy8Dh>BY`4e@2Xn0YR+f9@0>eQSy37Vkq{9C0-?ytNT`B95ct6U5j-UD^{!{!ANYpg
zB%|XB0%7s}^A{q%R}%sRA_vJzh^l)npBi}Tt7$!d;>kpo5(}#7OJvrAaQ+$LPQ}a6
zHMgwWSlu{(X?tu_Uaf0W-sE^}yRWy=)~HKwQ@forlvo5=7d{3pBR2H%>dZCpJ(PM?
zg!2CRa%caq%USEB(ByU2b=KpzT^Bx)V95SoL88$h7{u8G<)0!X2ryAvFuchd=+M@I
z#%VXDPt0&?bcprn(-Kj=i+e#$GIYxL;|W9NSJOpuIG?nt4TbEGzMQSLqN1W+G<J6f
ziiw3FAhx>feI`EBaZ1R_IypHJ6ins{^(bFYfQA$`Fj%|V8-)PrH9K?$ygt>q7v9|5
zpb&BM^71}BJ~HdKTmbJ&L$}-Ny6<ta7~Hp&n3N<-t?KXpdZ9kAyj=NWLHAD2#>OTj
zB*e+7F88E`Gm<M$jd_2*(d~QhkSwxV80zNgD&V~RJMn{D!?7ptfvwC5F1UX%KR=(v
zsEe3@AU!v?x@CC(x>WtBrfh%Fjyt@MB5Bye!()7W+~@Wd4rOa=YiVie?d^@1SdlUz
zt@!(ZE%H<23WaR~x8pi7CMF~(joo6T&SEso9%cXdxZVHR13pBCdOAnQ7r0pY8p+SV
zrl0@=YH4o=7ZsV*E}E2A=hf-m#!1j5K|w-di^ULfeO9SO;pq<z4`=MkmZ0qU%+4hp
z6%(T#cd5-Zf8kb0(V!<&q$9DPF!m)1Hd~sVq+N3V-?s-5%Nnzh#~Zyc!+5Lrn{*s8
zMpR*Qw7^-43#q!Bhg&qVS1nvL`}8QgX{f>jye-oJvQ&zCRDNS)V?_lMH+NHE8Dn=x
zeMuQ(b&Rd{go*IHsNLuyIc%n+l;+UU(NRoHj7B*y59`QWTu5Dkx76UiL>cuHqvQCM
z&ApuKphTOIS_c9F$Bm4PboTTlYdM^@4oxuBEO@ma__oL(ujMEmLVjRqJ*YLvh>Rsx
zDQ&NJWaZITT2<BMeXYqH(UG_jHOrW|tvza02d7$D0#BbHW9WO@2=(dhJbVoG>uU$A
z<_B;2c;}CMN7dD9xHve>x(x?|W?Hf4Z9W3idmZxz;iKs9SK^tJ@~f%pQDO{9-R=9q
zJ&o-t^Yr%Kpb%JUY+1bXG4T<%AO3G^OQe>&G6UQOuTL`Bg0*ZC5FoqPy*zbG#T_|<
z=3L4%Nl;J+``KQHCN#&Y>o&r+AvB2BwV<@HY&7P^!fs}6G6B`7jAPnUll#(@%ynFb
zaU-XcQ}u5<YAS^WnA7uy65qSbc%^3$IEob4nk|g|6W?@W$X&q9_egD8JxoxLW$^Q$
zd1`-T6~AT9b($HAZmHwx)CYo<;Fvx!5J*XSB$2)f0YjeT+Hr?+ChUj0=ivt{dGgm&
zYX^ukPJ^7I16%}2RacdC6@_~TdTBIi%&0<}uh5^g7%<)`bC;Ye0z3E6KnG_D*cu)m
zq?QiT-oGN&Bj7L)&ceA<7-~k@GQmFGXKc0@!-9Su(Hl`Fhd*jl9fMxCRqUG$UH&FJ
z3lY<^c;NE?-I!gs#eHJ$I`lZ&0){Gb`FSRv1pf1MR>yLULm-Bq!NPQb@7utE3qHb!
zIUD5Ji7<Sk-^!%1+8HT<jXzO9<VnE;9$TBE!x~_AW*0=sI?XE5E)@$J29BmfU|PcR
zQ;d3$OT$ekpGSOf$&)Fj7mAH3&^7L1gd@hy_|zl2M+4S_4wv^24i3uZGP%&QJZ7YY
z2shEj{8bKbuHY#WXRgo>yqsQQwL)8SgjX67L5{|>xr1#R{El>*xtJL8+`Z6g26b2~
z8iRt=F>Z>f>G>Oan#5k#Q1QnP=Qx!vZctE*@ql9{B~4iAAzpv@a+Ttpc|D!&zV|mD
zz4VH#(c<>W?9GGY&ua}sgA-55F0{WD@OWy5ME!PE{g0?q>zgy$U?z1Z&u7Z-I`LL{
zE`41Ek#gPMlGh+XPW7}($X13O8~KTrXLlvRsZ)0!K0UR?NN`Uk>D48>B3Y2B%R0S^
z3PdZ`HPYm?&BJfBsw|T3Srs%<>hsT?#u~J;6VR2N@Yk^H#%8yEwG?R^<)a$tnFwpy
zX|@3cNzfPx_S@grx+HU&mZB$Y3*Ag{QNHO6o~W#SW$tDXSJs)VT_`Dz7T4EMb+@O2
zg))Ua<WhpUoU|2YL^PDcz>m>%m`PxKP9{M}u+ykp_Lcz?Af&gcoE(VoE76gVT~!eC
zKbm4f6nx0Xhb~EWl}9f~Vhy{Mq}q{z=ITKrl_T_83h=j-!^<6Q2>2_L&IkTJol2Se
zH*Q^0z$M0}ErJgd7V->rcZ4SR>C<SiI>ThZ5PkVR1n7IVD<_wifZE9Mqi#i0>`FOZ
zFa;Chsmo`Jt1F)zv5tD4OhXNS3ez_DiZ|Sw&yij+D6zj9Xdpr4@N`dqqHNQN^_jm!
zU-)ys$x#|_f?hQs^-)0}N2{aj$#Cpvb8^tn{k$K|92zP;gD@Pht<8(^S(DZEE_zf_
z9e<`;&{wo3zoBy)KoiryYR+^ZDAo_wATsb<<F-g984O9BoUk!HWQg@LM|eVlPTmWq
z*TY)qdMr%wT_>~$dq)(nHG<09(o@=tNLb{f52sL_vT%uH?mfY=XtnN*%@=O@@=?Zd
zPI<8;!(YB&oe67pIdt6@I$5d`mupcI?G~x%?yBoQw<y<N>AI4-t}9)}pfwtxpBH)*
zX4(XEAo-WZ$|Z0ayy8~URJ0e#DwnFWq}(}Tco9qdHuDm)vHVRN%Fe@bC0ZDN6EN+u
z*8?kI>adqT6fm@OqgJ|X&dIJg<$db8S(3!6srT}%2?JBZ3`a*z9B-{VQdp3M4qZX#
z#iCRZ@EsIcG+>mwROi&&E^nK%4X%5Ft*Yo@GkDddR&c!*OOVG?0f9^-^(*N>AQG+O
zG|1HxE(hZ2sp!pMDuv`UFcu63Unl2&K|z)a-=AGc5NKxOCExB?3OIFQdba5auUvj0
zFR<v=g0Y0Ov8gOb|C%XKl5<#{9!nL>#BjYQVGptweDPD|ohd0U&!y@QXXEm|16?|o
z#S>^af`V8zq4BBo_+u}YGP`8<!umyWS$R^jfaEYPW26$mEycd3Axr9WE)5B)$lfn~
z-j-WCZhH7g^N~?o<r27R3cZaP>(I(*Y4SPZ9^?9<@<c)O`+P@DxXiU>Y^wCL8+<t9
zUnL}wK?-`L7tUp9stk`N{w4J1jk6)3raiP{@$Y5(G@<Nh4#*sKF`@#Nu_h$T*um;d
zZM;*rL-X+0h-AR@cKT+zd12v)&mkPMG>o;-e8=gEtRpvRsiKWcS=V^}y+Mu$@6k?w
zcV=};Z-!~}R_10r*e2Muo|b5Z<@E6P`9ipi#T9qeP>m4b9k+y4@i!bCv@v2Sc7s*W
z_)t_7nlTjiom>8)*0eUVxf1R2OnONdI*-APz5q6zvb0z*q&xM$cVLbH{vt=5Q)%zI
z!S{b84Hs`MDMDat%>Rf)5ne_I+e-Wg7Xb#bfNvUox+0VC??m?T<|fGK`T3canmX<0
zcSfzM-&;8_c*DcP6B84CYC7UynvY1V>8$4!kPi<JiMZ{YoSZJMuk~7-wu<CZ<&}qA
z_a{ylYoWsXKG4!8$rL_4J#Cd%<TdE273lDYws}(~v;@Att+jjJqqHkX%kc0tW6>%y
z>C{QI2ZVp^3j6KQ$c&*XS3mJT=dI=EC-H9=7Z)cdc7cI`#Hh3hGV*FNJ6y1@%ctKP
zWyB@x9c;L=s;kfMPFE=4DHCLT?@ns#>p6OG>4A-l3kS4$c9xTp64q}TUfC5k;t*Uq
zRe^@ioHgJN=3(5BUBcx<|DQuv3IX4A*G#4!Z|uk?6qYD}x3Y>A^<-ruxA!}J@54@w
z#U0L~&6K~1nTU!8BOrRezxnsOD=3WDXw0K8GxNuXLjLPkq`}^-FKQVWoSvPbg7Ak(
z`>YB~EW2b=Cim6oegxYId^4F<sFReG6crT(5I=Qhk&c1k>iYUCxH8tHjrWu)(jRU4
zD|rc6t8V`{R^gDWLXvIQ`7~2<eo0l8u<xDSP3RJtzflPM3Y%YkgrVhf_4mrk$|M?v
zHUK(hD-X(da3Y2X{+e~N%?-}pW!>90Zv;G_RvEMhJl^t9rB|)z8F;)E^b1_07e2h+
z$?vuMlBVGm@qvCVE$N7$1n!Ue74r58v`I#L?2`Ky3nvW=x*hyk->~I=Rge0S<!!P0
zL#b{&dYTwiV>4Igxo%)3_zw^jvOg-N`k;d|tL?S}a}6yFqe^XYNV@04yqW+k;_@Vy
z9b0DbXt@J<rySeEk$Kt(2hTxMBoD{0_Y+RK-MSXWtSH6)zVZ1hrIt)%h>|orBE6A{
zPk&w6Ehxdjm?zVA?3B(HdUAm4e;>{_=MP|2Vj0UT@!`wb-+&qXvD$$0k~RH?=>eQW
z)#h4~p^rS}U@D7Y?1fPH?6~t%yKO3sB|PCNdXU->0hUalZ$@TZc#YApBUeGti-&CZ
zr94l!KAEO#seIKuIAL3xl|8pcZ)Q;|VgX4^`+tO`<A5<Me_J$4mz257n?#Q4nBy%O
zXBBDPf>JRyyyU%P=UAIIAKt`bRq-e@gHvz@4e6Z6eW$gRcnpdu-GYGJ%vCjR0|dyN
z{m&xHtk8RK%osy~rop3af*M_7ql+(`VO|;`r@H_K=UTAh5anpC1Iy->x{`XoN!}w?
zmb!yWo21jwEFM1mq7D{pphZ)c3#Jo(*dAlQl>_w*$#*J>iCTxQ2!uj`<;0FGI{F=A
z6Pw@vW}GrAxCkKKWS<OMs?1MZBi=T#>+q#2c;*W9Of4k0StrD2-!kqgMr57^5hMK7
zP952`a^%Wxn;*ja4AZHq0JU^Je(9$@^Qz?`XXznFYb40_lFZM0|Mc}P<Z2=c_N~-5
zb}6106h!px^q$OlRx?x%+~)84G~5}U_5PxA+oR7q^TSmt=4&>}<;BKrbCdwfUL#A+
z_8r3^lJksY4wM@?@v&>OG{MCHTW}<Q;)ivniI=A_Tqwgl^2yK-$QER?d-RPbHB-TE
zIm-OPyE=SsSN+tGQgRY(@XCz)Y2;G0M##s}QUj$$COC56!b<JleXVfxQ<qId-WW!F
z;yQu!^qUv~BhDRa$k1e3du^sWx4~cDIKL+sL9XTcd($w1qt$0`A*1NICTCxq4?!oY
zu$?~*6iej|4JXwiJnt;kHaNYQhC#`PdZ`5R5EWiEtp%P_al{1~ERf~h7`xg$w!ij3
zpbL*;;deio+FE2tP*<DE;@PRZhWdqbPmJgp^BvXx^>M}Od;|uo8z;-}yE!69kw0!2
zPSntR^W%h`asv8kxHAPLo4m~2-_On!iO5WkZ5+1_CDO5bh_}Ru1QMtI(5(7b_^tV!
z%WjPo@@iJga1YgI{LiUE-6G2ZLL3QNK`Z{xgl1pMxFg;%Ht}(%=~7pc(p|P6<h$&n
zE?wfO(4z#>Apdql=s@3>z|1aNGQHM(v@XW{1m}+?50%?-di|`HGoRk_)Sa`h(=xBq
zY!&)Fuja5a)tJ8{v{sh#y7$yb@!4kj{;&}CZo~))!hxxdTonAZHqG?Ua_QDs<>pI5
z4ZaMPKP4scv8bJ#t21?zC86vhzdJGSv|CJ7rVs%JpTB`M)DJZlIQ{YCWV>;5`PIc@
zf$34z?mrHNsTlTagZ*}jkEU!fEv>PNROF4`EURg;9dOa6Qc1xVgmhFCrlGL`aSeVn
zL^`C7FhKGycp0;N2$uMD=C!gRlOEV52r8|pDDnDDKaR85G&>p7CSI-N!ku^+_NXoW
zcz7tIjU4(jBl&o<@mg@aJeP3K>MM3`D?2W2Tb+(MG)ETcj&aPyei=2|^|Gsj;LKXu
z5fn0hqgnkByvul>{Zvn^$zLy*z5z*L8RzjnhNa7<_xxd&2%<j|7H{xPa&ClM;kSA9
z12tocMs~Q5vsNnbnv6Avz3pcx1SEA}tov#bot0I!x!>J-Hk)U1whR*s$+(*i*2Ur0
zzar4g2B4>bMMd)5rHvl)>NEPL;<Zb@7r#o3dZsbkp9;2$ONyrWV=t)GnT1%(u4C%T
zl85Zkmw%GUSNA^4ge4{yzeT703|fpDfSrmrdg}AH$HgW@BQ9f1+74Ph=#+oP{X2F2
zYv(^iAMCOM4{^nM2TEl*H!B-S6I?M=4HaY;zMssPh}b1mc_Jnw|7aJfECyu{tv<I$
zCVC<O7Pvb(1y#SkzG7iw0-TV{sMQ<#0Ti^mYi?z=4=@6dQg7p^(cuA2A%)cl6|~sq
zetdhp5MD`D1O_`U*7N||<fJ2@VF2&TE=n<(EzoQ>h)Rr_nw2F_nQ(M`91gAvn|9#3
ze1-&JVqwk9%!rGNHx>r&>C{1hBq*KV-(K52&(Br@xkxD1R#z3Xcx%mvF;AaJP@f1f
z+vl3*Sr{0O0I?|{A>r}yF)GtfIQMfAYPnC`xTPyEF?+h5mDR%PYK8wFyApI#(wyw<
z)ARHAzAb9ZNNYsvj)9*>?Id-$*4KZ>DJUpFLG)_HOnOari}OxhU0o~9j^X4H5|pP)
z^@9LMLF*L)+w815D5$;NS67!1H=3mBiIAaP<6Op^osWF8pQS{lK#HD$TaB>?S8)db
z53biAqkHr#lsF4KQEzQ3H>rkE2xw?pegcsaA<5=8ao7X|rNle6+DvL5g#gPmI}tX%
z%ocV9M4Q5&;(|+OG*z!cWRBN6efvCWD0q0ZoSCr^kq{Aq8}WR9&RAt@V-r?c7xu?t
z${RpO^j{*AN*rkyKO}yo5pvv#kS7nLSgrvJQRL+uwLF>S;UxUK3Y~Kc?xw@-2DG4?
zBAxJabuI+dLM@amLMHmSWd7lPXrZ$I1|}}MLoYDZWS#1P1vUObFNLH`BePR?h*jQ&
zp-c=6pJikKdJK_(eRb*jt1J2pT8ilNtH7c<m3Pu}>-K23UulYnh$vM9Q(ClPiKRO#
zxT~qN_%zGp20SgjePWng{UWye41DExEjZj!C#zqnn=og;#D4y^;(|^wQUx%FZ}Ant
z0>Hw7T)Pq)24*~itNdq${e2|A#O@g54xnjpoHyIa-?CK)neL4wuQWMU<30ih)4I~D
z-fadHs&ipX*Imj&2^Hgot-6zN;%gUbrW`kh4NRT3#}S%00V7=UQ71vqt-AQYfRKuR
zI4C=ObSxs0pz7gedjbDn3{yq}v7P_x;mGToGBp=f{|R4kjeGCG0j_m^JA1``(`{<_
zz{5)iyTiaK*s_pyC;hBA_p^YvDS$CwQfhIX-QA)c@I2F^H8TW!o=NlLu_u7bE>kSE
zKP%-VjQ-th#V|gU5M$K_3M&6cQc7f*7pF1Jwvz>cGGNZgk_>=gb(GfHvmb*ZY?A@g
zMr|hW-6YZMKh>nRJE+X%1QRY+zv<G{EVkgAW^gM^%Z%Gtx7zjNQ!#8ohJ&noGih95
z>FkSssl<o4+y8}-WFpT{zEj#swtX9-Ll7PO)=7ge+}`;w9qBk=fL7I9;=|mU&v3fd
zz^zkZ%c9OfTA;seyw1VsVzKXsZVfc)>EEvW%*(+)smF)QQI7BMdKTQDh>xseSi%#2
z-qolI5SiMci?;^WaWcQ#h!TNBXrN+G#jr38qqD>kDw4~k%@)TFxsdz`R@{1iX>r$(
z3;{r}CjhKgks3e#qaUShux-3ThJX93<fPg<e2Cjak(A*Q{{M@DbpP_D@xQQD+|+-v
zRVG>V{~=qYz&m?yQ>7lM(Y9uF&J`L5Jqvf*Z20#0V|DWk)?di(8!a(|Ks>%zzn%W}
zKQl8}yX}Sq+KxQYCtSlCX2c(n+C93bv(Rhc?~5<NX<+IH{+Y;6va5^)S8?@vI$$@I
z3w_Bv6*2t3m5`LBlV3YnW)fq5)$X))C`$g%<)bJZWKHaj6v{tw)suFjzY~`EbG!b7
zqR!%fVV()97t5J8a*qCQ8IAtSVsh|cqMORnf>qN^R49mInU*A0g3rQL`aGY6?>|rt
z3c?f4#Ud%p*@W-==e;E~K8ssH&@Vuky*TP!Nmn(TTrh34Up16OKsHhNlB&j#l)7+@
zy`9~;bW?VOgfSy&O!V!i*5}`5c3+^2xiCN=QIV-|P5sf`O&d|Y(DV9BBTNV=xPxlU
zO&1Ud5TtSgg$oz8VIe?5Xhj{_rBR5&ENN^*o&5p3egWCZD}-#@-{Js0yUT|q^yeX<
zh}sZ&f<&Gwi@mY3EaOdZ)ZM0;oT6#kzeg@EnQ32eeGAn?KZgb_zDpN%qByIBjQ;-k
zH>)=8TXI>wlvMr$C+w`E&N}m{lZmc;yhqSZ!=gxAw}8E0L}hvBH!zlkDzs}TlKqK{
zlz67oeeIR|btmmNj-+kvPRb?nYMmKM@b>cHw~XY4jO(=PfG8`!lG6|`+D0|KJMO==
zwY^bMfX2FglOXkKv$4T4p8${lkCrsz+h6BTn|mVg2>q6H)u>Fp;(LWa3Wsrt-^h7P
z)mbstpU{yt`!TS|MUu=<OM+@=bK>|P0Q?-CBm6(`U%!pgG>VgSm{34Nx55U2KeNj`
z_dVUH-<1s&Gz24JPPvKLOuDrR?h_?OjT>R!xx|N%;&P)e;K%@Ny2QvL{<-ph=}R^9
z<NXN@_}@+8ON^YnO%?W4Rwy~=$L8g4%l{FVHf;>y%%ueAIW=Zv(yIdEZsyf_sQhWO
zr9KwWtwu#DMRDK~T>@BENVBxaRL4d)p4(VEgrR~42`Dsd!EUd_!#Ro)-^ug{a#_(n
zR?{!<aV!lS%zD?*&6p9Rk2w`Mj|q(pE_d9S0;|x4@q6H0ed3oD;+;h_$@ybzcI*SW
zqmjoDwUKZ7p3u#YpFffvvucv-=K(rXIJYGE@imaF1X3TNT)zU6S<b`&fw2fcc~Z|a
zf!*sLe{q0%{VKjLrqJ(@$U@ia`&&u&gPGA<vKD-re{lRdl?7f5I3zLZgDW$U(oIst
zo$5yFb6nr+5f3<fWmwB~YEdBoSe{L-I}|QCARJO;M#HrzOx>F<bwHfyC#cBuwDRo`
zSn1nTvxdDsATFR3jv*i$SZx^sC(&{n2lsAf6^HhV02A`pLRfZ=nPh*uwLf^?tp|h}
zplaI&Ww0bdX)JrA<;2x&g7PCdT^qBEsl`RKhVBm(rOLltzyD_cA!73jV8#!Mj}FB|
zLC!5GCx1OEG3bVNbZPbD{UU~!-%i2$-|fIKfG7bwFzmbK+U!TqGkv&ltLTlF{}&T(
zJ57c}|5qmD{K0ul6ZU^+`vI~}Ci$9Rg3M}n;QQHHy9T6l66W;y`04LREV&e>sj7{w
zpVbt#diHXmv(pvVkA4iv{1r!h+?nHBPDAE(+1UoJuJwlgucG2!US1v^P2Js}P{WgC
zsEg?fFA2W}oO%rD7Ut#a@H7%mEG&3F-<_gEC&&~E`Q9xqEu|Yr{{Xg(3`CHVr>Byf
zT%T!epQe$xMf0!OFZ1rZsapE~!ZY=))NYlVh1mRV&R6yFw2KR;{$}71+OoxeWdH^p
zVIz(1XQ!85_VjT@0;}BTX3Q?!gAW&(71hA@C12(hP+D7A<3w=~zrn!301oKu^F7W2
z7Z+Eb=076cf#83lIN>jAGSJrbBld`#G(QQSi#b5hs_BCFY|RyMRu^Y`2zg^5&x};8
ze$IR5djpkxRct@u?zXn&wYARHg66`uV+Z<uQAsca(fQWv3NR&m{VQ(Q7*_tJEY);@
zK!|)OD>!J5Rd?=1{m|12PURIW9+}_2|1xI>47k^m??tMm43x#i#ZpRXuT%FQpmt%t
zF3mxJ03*?|hpSo@8xW{?9M^r6m59lcZKRuJRImX_7LZjb=n7+F6?AoVUw+Gi&o5wQ
zB=whm0}2w<9|<tV{P?}9oUuvWNXVcuE{py{-}?mNu-h%_=~=Qz$F+rU$cfLt)u@PA
z>@WbbF+M@-h;2Jgsvixx=Gk@x_pP;fAHTlOlFJ;dO))Rus2%1-yZ<e!UR+!pw|?i!
zIXv2z5lgrM1havMiGOHLjxF_C7tVz;K4qEZnIf@pHoQ(Zrib3=V|`PZtg+M+V6#2X
z#EO+%2Eu(VWN!tc(dZ)~6<a4jmkC_tX0{foQ-WkYWo1dUQZ>w~J%EChmOw=?>BgA(
z&>G+RpC%xnR^?nT+7XwQ8i27X7g5dAGUsLz>dd2T#X=Pa=K%^t7GNu9b#b@lz;OYV
zej7P|i&JdHE3>{-Qn&tREr3|UtiJl)&-_t9KgTO?Rj*W{>aVRD1!usQab_{&=ewy2
z;6lCnDmdwXgaRqw{$NcQasQOkSWJ*!a`<0ez9CIIaa2I~y5p4rgaUxh+|a32iI1W9
z9$mgWH>)e`{9CzVbhOren+pLPGkI9Tm2YkPlgA0<l(bseBMn3J*u%Q5olw=v@+cXo
zArgPw;P@$`nFiY){|kkw85xrsTPfhbBriRU$o?|HPl3K*fYkZliB<v=0>ZN%?sc@8
zk~#vDjI%2v=6m|wYyM}@(neZG#gf7v^VL5DC>+|L`0MbU<J=W?&!0C0MjI>%+MRs+
zg`$PYz;{XvG1qw~Ul|apK7Fy8l5Lv*t+qh8CBTEYGB=@HM=}HmRh_krS~O6S$p&!q
z-#ThJJpUlec=B2O=3QJu_k7Vr0ev6ZjRv=z3Tsv+N-^Qss4nHjS7BUA?O&@sd*E0I
z4H^O}5bC8P5&U2q&ENahl>#@~Z#?+-WJZCXte?|<NibU*v$<t)YmA~2k;v>L6zz!T
zB#YA;$x|95^ZJvC{uhI7mK}blmb%fWRVT~){nGvPLnQnFG!4L62JL{)K2FjJh_m=}
zZWylA61?+9DF26En$&l*wF-z-DA2E!^rx3=iv9t1c+i12-QNE~pE8&FT__H?aJ-nn
ze6XC;(3nZw`2T_{jlsznd6J6#jqylajMZHGZ2Pa+lMl)(Fqeup(ru-<y!9T`|7cVp
ze<WzWVgtiB;)WGk&4>b;QwAq9K5e#B5)#098am>5t}>h{zufMSea2bfi*E(~%&Ys+
z8VS|5cA~I%2vr>wYc>u6LP`PC^>BIYzuYc`-+vwM{)%eb*v+6^*Unc<PIL0a1GgxS
zQB58K;*9WKnLaG&2_}_x9?+pUu+lGC$ncIpdp|}Yw61&oIJTKM$_`;`nE`?Nw~WA!
z|IwGgw$?BG^BJ2x`S?nEGN%)WcMY7dw{1(3=mXqqi{xPeaoA)75-Edv!GJ|&sorVA
z_NS>RKx)L#4DgwB65tmBj*Fq7&X&>^n4F0r9`-8E#2z%QAHs*4jq$P&r1-%nG|<!W
zRC?#OJvgM(0r`)e)vm)kGAVVhG1lfgHQ!G)7MHn%UrYo@{D9nr|Ir4(igP0Yc6!uW
zcUUs;6W}?S#W99$_oe-VC`Fk7v#8}t66`=n0|sY+Hd&N<3=hQ*Zzr=f>zszX=)<bd
zSAYaLlLGgv*mmgR11?mjz>3+Uya$n%x~l!=I$Rmd#CVR~8S)BnO83X&{NCG7^EsAr
z!M=!#Oo-8<oHj6%mI?8Jgw{_ksABTMw#c?VF;LsZ(L@pQfPp2HjSwaT2ycuG!a=bk
z*4$^IEr<F0%XuQf{Qu5g|JwNvPfKO<59J>c@zDUVA8a*^EKrM7Ml+irPbh9%;4cJs
zOTgJ69=x3#t=XVuhYp>`l^ZFM%#cS?hL=>Q*C5HRLdTIrm5JZ~=O!Txs=K50e&oVS
zP*;Cs^U|w$PMRI#$`gmhmZ~<xmV%E=Y`|r$bHeSXYT|$ymzHcr5TXGYA#hfgWuYE)
z=;1Bp3J*83j+xxk?a(}8E2nB<ud~n@p4h=>@Fprn1##wo59}-bTWB?uS3XHDfNOu$
zov*W<fIVt3Q`^A~(rXAPr(-w@c!6UESozU-1X?j;=dPM?QSHmhu|MO?1JGxaHA!Cn
zsGG_;$PHIfw_=lUN|Ov%kf(nBESzhm8)u+*#d+LaVxuh0PJ6?d64_cNL7vhzRdwC;
zhYCz|-ihrIaojQ9#71xPt{ZsAsLoYFm9C?|aBt};$r5KOc^$=RS+9l+s<|R7p#PP0
zhp2o?4k?~OEmY>KtndDAVlQf<arcq63sBjqvcC3q?lfHgO!jZCIK4afKNlGHB9ks!
zAb$Prg}EGSByk<(bu~lS7>Kf=C1P$Pa9h<>l;1#e?(9D`zAdl50KCD=TCe`;Y*uUO
z2L6}omVJ={#Rhqt79e!h^Iq?FcliNJ*U$@drh#7tWgnrD(o4{uSz+aBOd^+$jmjRK
zphFrB9DSjbXM@mT#cRIbUQsX^vf$h&fhWF?6uv+EZakjsl%yoPqkz%~x!z+#xft38
z4)-L>yR!TC8Vf9z@jkuZ?(IBLGM_jTwzHk}peCh!_R`OAbX(}lJiVXjy(a;`VS#>C
zGkc`9NGR(qd)#w^ehJ+gg|w;4Dm4#g8qMdd4p*~KFSmJhbt=(dxL&b*q_k>S<6Coi
zg`cAu6Ao}*C|JR#T;g}?2%Tj66)pU7NX9jeQ)2nN?RR&Xn6cW>9H@5pL5TPexboTP
zh6QbBQMKL<(_0Q5(ft{@ZV+hsESsS9Sjy3sxte{0+k<7PzkB;R5E8^6T@B-(L2p;O
zt?_rK^7Nru@uD1w0^TLN`Ed*H+GpkHC_w4@{lRT7aKTfki}84Gr;<dfg73k2M;jp?
z2;+Lom;{PX0izqO-35k)Fu+L#?!O6taLHN!iqgo8^F?BT)q~_!WJB6eK4xUEW5jRZ
zfEomH9TPOv2&P2oE*|3MCjcpT1;Qk3jeYhDKf?H2Qx|xbCC_kXgV(rl&nb9jUk(oM
zo?p>~^Thy(F#LHIoMM642kxmv<B}ut^oPxZHOjdr!Kqggb4m+UZA_2|ODd21>=Q5C
zZIti(giz(nc9*e04E`zJ#ve`p&V!JVT)RqQA4#D#p|-@AgbQ4RE!MyG)T+)L&9yB@
zT{8K98w}w^Z2!L4o*ycV@aVWLz~}OUM4a6f=Vk_3Z}rJfehgtV!7fa#xs>lZCa>zY
zDuiLx$bE2{gzMcTFRFp)HKt%f70-XF5}I{6byE9_meR#GFM7!;T+^;V&nCq}gx=_C
z=37ZxpaCKzddcXFb#4AxpR{pie$0{(6<zK+N)&}M(+Cr6Dp~aDw1gw93Ig<=*1*rP
zua4BL;v61CMI%)R$Zh;bE+bXth4FW}=C;;opoi<Ttu7b~*`}##$r0x6Qr}?*#aHg`
zZI7pHagw!4+a%BIcF!0alf$#k8OH+JWLc;+s;9TZHG)i|_&_bsQ^bBDokuG(C|VAt
z)5$dfk3G>aW$K5USg3;6?PnTPbLp77T>OK#>5pz|(x>CwW3{5yom>`QKR=;YAIO@>
zLJJ<Q7nJ+EmfM!kj@mdk>{lqGo8x?JIz1x*iL7LASR1mA1Rs$sdEBFw2l0mB)Su1+
znjsL%$Rkd+9S=YK{cxlP;(biVWl}=EToQ)th!qOD$=odoXx;u3#{;hDGr6R#dUZOV
zN}iKlDbGo1bjl17k|L-$E>*eyWLPsO)7C_;jM=*{51*drr$&7^KJwnT*0aBst~c~M
z_0C9JjbKK)lk|7oi@<rzU5Ho!>2Uu97RcMc_5<i=M_$XfmD%BQUN2`ylfBLWftK+s
z*v3F$L3I7jayBD}EXrCPJ$Z<6Y2i|;Oifog+@1y$F37&;e^>X0@Xz{so?@v0uab?w
zcd>jX)FuXI(+CqNr6#NfmsehwN|c3-D3Z@Q$`PE)=7{&|ac@<7cQrQuU9@^Bg9M3;
z&WXP)n=ZiU*_){fuD{e#_#1gHt)11`MgAOhKKf)|khG^-D?e+-6OfR$u*GWu6MVLt
z_<|880Ry^enoA|~TG$z9JxrJ(p<(_Y5LbgrIEl}t=b1pdfOnX~%i=V0+lvGWiaSe*
zJDZOROVs;-7ubDS-_xep?Rt>(UimSxO8R*)x7*Xs@4^jUgmwRy<CpRY)@~t|hYuX^
zrqZ}t`%d1cVhJL?q7*gIX(rYS!^hu;sH4JEsn7fkhs9LWn_vv^;l;shEA6_wn?;9A
zlsQ1rD}%$8PunGLM}9vQCQ~RS1cWrUoCE`!O?jIHBr@#RzHN;%y5O4Uk*_hgt}_9Y
zep6IFkuKUasju0dmM#xjxh#H{b{}K6<`)gWsu#^%>@Q5Ne1N6BFCqt1_>%>Z3yy2n
zeiEBrGkjUH#mCfO!5IEP+5cd;)a+PpzM|t+RDXcDKCQVrH}7*y9ojGVpf3TfImHDE
z3ZjiF0@8jcweFk(c*z~9CF;h;ojE1SM^en}fDj}CW}i7H&LHTdiy&F@#O36;pB*{O
zWL^DK^--}^pC#!Sn-*;-K}@g!YPA=!yskh>{->^XPTLz+^cE%_XMgaSpqU#3C`hWQ
zE|1+ZM$vy-TUR%;`s(3E1u1<V6^)@jX=lCc3)Aq(CnOO0rfjw3#Lm_7Agog7cmIwW
zhL!Pk-~JngSbMH_;b(-Is7AY^aHh874R5fo4@}jap-^XsnHK8Jb0ex-YGhu0m%y_Q
zZMS$)RNM8(^61Geq;@k%%(|<0W$z0Y(WiBsK~xKWqy^b^#oz_RNs9Ftxwbi!@R7ak
zD$QhH_G~>tFMRRknb?R>fA`oy3`{E38hks|6U-K88VtG_DdsPB)vbW{GqTfjQi4Fi
z;gD+RV%}1FzyinaQ8EXFraG4aOS1i6I7X8q`U^~Lw&K2xKT}g*AC?~yW}f*~b^H_5
zX)ZlczS(mvnyH)Y3DN3#8jK8H{Fw+otK&5p-x}S((OBztdgqNaW^<nn8Ms>}k_>eA
zJ`G$Kdf8v=0|JqE@7bqw(z9bp6{-R0XQda%zd0ROnf-Hoyki`xcOHlj0{1uB;W|t4
z)xFt68Go<3&vL!ylh5t=-CwN)<DA~sYrC%%-~G0bw2Jo|RH|(;r`Oi4^e-9;7yE%w
zxK2NQw9?p^x9sA5%HDe7v-Q|DMFRSnw4tKy{t0+$t_L)Ek)=y!k94UIU$4(hO67fo
zpYbak@*>_n&knl`f7dklI^J3Rll5N*7p_Je8N{}`+RiX8-|z2*sSf#jX<5K0u%OMO
zuQ7BRmkeupR<WuR=SfsK%nA+pf7+qjns&pQWeWUvh?dK6J~0x?C7I&pwtK3pJuL<e
zt4CXLI_h`_#vo+r>n6dycWAh%Sci5IqoP5Jkzy$MXreHph!5|w=gItpr>G+{l8cc!
z6v)4-5LncQ<ssj%j}N?GUjk?O^vgkj;*hoQis4qhwrjk|z~qeyEB#q&a^Q-0fpfPs
zPrHEr(|~2%$ilWZ-9|a`<n-e33z!fZuIbMSbMB#@`nFbB)nF`;@`7$@MAF=`t^>A#
zIEZiy^QGeUm$8M5j@_@_E^&CCD3%UklW#cR6e+b2gk9iar8vX-jX$!!&o+sKMbIX|
zoA<-xya?GKt4u1eqeBn7N0|Ed_gX6Yx7_3J^~Msjy|v%iM<GtnOa;CSSxd8=Yk4;{
zaFc!Q0Wvu7;=GnzNgmVMhdX->E2Qr-3Ns11D*`TLrTmG|iImm0?o{DlOOF$B3f^1X
zYw|Cbj!ZCnOkai${Yhl$&RPw8=!MN`!WTSlMF)&|r#q#!XhO$+iQu;&Ub+)_ms4ZC
z>GpqNf+p|37}^VQ+Z%LD(U*_(MS?_M{Tg$Z>9ucy3oRNo)BU3w0{Vxv4jpyIb`N}R
zt-o0$9v_|10G@|$-ODEa&$d?N_z&n#5_<{Fw%_*u#`;QQd=m%Nd96RaXZt=?(i3&b
zz01eiF+f$|;v(Udp(Z5zdwz3FjN3ii7Nlg7+WWg;MS4_Xi-W^>Q<#6edX?!q*a?JG
zfi^t|kV@B%E2+^!$#H6OYy?085NBKA^<DmX$8h_q+v_fM?ph5EQ50R#t=)kQ#^+lV
zf!z-xB#MQ70Z)fu{<heWEW6a8=N~g4&2GflBl(wE4H=SN0(S897{mF@QhaLsd?iEX
zC&1T(toO!`7?;17QxKC8>cX0AAmh@|g#p%xxxqJ>(jKX4$P~88<49Wjun7qok1^b}
z*&jaNg8*5^DY`dVG&>Px@*f>=wlDsIMh?wOsKn|`1U#9XR)1wlUCiJt>~`wkt_>)$
zvcEC5hDCE~Y*iOewvV!gFm^uTkBt!Mb!0=9WsWYlWk7&D=Hsx$t=a`1c$eGUFZrMa
zt%rBfDE+o0ps)ItUk2`uFj%NKToqMhQ|is|;Cv;Pl<l`RCAbi;qR-p;TVCICL)yY=
z4rJogG&kuq!wz*(cS|qdZ(I<6CRTp7y|R=7$KBxDOEy`b+i2>1ra9Qt;ugyJ^hUx2
z%hPL7_%l_STDxS<o#_5B1YuN2CqOKrCchRM!+^(?NK<|T|CFaz?pJAmE-X(xhuPbf
z!}(6nRq_b6lE4w}z`H9j>zXw0lnBY1oQOKJApwY@Oo;+ZG6Ur4Xmu)4TjRF~&ONu;
zFG{PNb|>lwET=EJl;)Coo=E%_F5>xk@FJXbjdwU`3>PlQ+FZv;T<_^t3MOs{y!>&B
zy*1Q&?H`45Ifc{(n1kTgm6nBRKGrHl7X}dZgG~0w7Ib{f$oWdvq)ey{pg>H++-s0k
zX9bHiRmZ^kD#~I6UVg~qN&uNfhwuCG9#AR^83&p~b9y}WNoLZoPnq2%tTu-us90Dh
z+2ui%SO5jFKuCMCva=dwE}`la<1kKp`gx1jrkE(ZLWK5<dr|u36{0f*Z&eTm>izjq
zl^YwjG*F11&f?GBkB&bv!Wgz`MNvU&MJT|ws%&tqr`(3h9pi+`{%vlfGFt(T^^D8n
z9)F9tLQNg_V@l_4blmg4L>A5b51jZe-NhO0c~3OVxX<@K`x~6OCC`i<wYZ#<&3Ds%
z;C26*gd+^gL!wwP8!`AzjbWF^=ncn%S*{yj@yv*2n5=0{cuvP}e5X=qEbYhDdqu-l
ze4p;0O@D&d-oB|xYAI|5!-Wx;qH?NMffx{-8)$7>5sY8D>vJ5t(=_Eu>PJjl!gy$3
zKGOhK3h-I5-XsL5P-so#{W*eE7zgyLd<+U*;+kO2>rH@Hye_)?xP#f-wl7G2`SxLA
zD0|^r7{2nwo0-FdG_-#qTajw1*8K$<PY+E!{t5R-RbF<p+Rw~RJYA3F%lRL-J}2C%
zqYrc6KtWZ~p62<a3claXKdW@<7uN=gp#>^195j$*n-e9OVr9^sIrO0qgFx`T7dgA1
zy@(Aha;n&pxysMI5Td%MP)43D+}x>|VB6iNJilC82WNI_iFwr-#?z5`mU*D5=8}FV
zWc{$*YR1g8r)upeZ3gHF7v1@l2zC3k(2?!8>kA#GPg1rFP;;d;03<UOJ90YJ^d#C{
z&s|Y{c`TFsC}4A(e|{tQvRmXYVRKIoj7X;tJ~2d>_0|bePW8v{h~C;EFrc6#3`EeE
zNkwp-bZ;d(C<ue=dWoC*U~kxNP2{)UiQhFIC+w8E*%`1yxxQVLA~lGxWMyEE=)YXD
zP>aum`_=w0_fHE%o=e$Uxi>cJ#?QkH1?Qm^MUKd}S9-XphUmDX<zsLlU}-9gO3Iy=
zbeSR_E-Sa<kh_hm4-FcU5W<b4ia@4?TuT^oGL?|;o5El1Ws>!5cXtw8pT>2xSkUp)
z=-r1uB5k7%jG)lgevHL%Z$iu!b~dO<J|c(%Py4Re%&~mb5}-i(DweN+oPBD395;w@
zvrP6na{M&@`!*^$l$D`BSlPbGB{R&m788NGU(xLJOWQ+16<nY=dXD9a=RI_oOmK~M
z>Cb56zQhaOy8Y-s&e*oeol;(Mdd{8#e-tVc-s$`Fmh;<4TdLq1$c8Wa5W)yb&U)Ht
zG@9WMK;*~%P1;Joy@!83pDoTrTZ&1uin*_`{-KJ7gEDU>$NCYK+(N8@awPK?nknJr
zdXh~3Gj&H}_VEkzzeF02#eqU?m^cm+Z@-OC5&puz!%a8w{}@9d`cHX0|2QK~9*E*F
zfO?rHoe%I323$3Hl4-->pT!Yc28#y5CHkaZ8O-C1#Cv8^K&u^SvPrLiLHrpg>i+NL
zSRZlG58Ot~**R^<<ZpyLQNLXV^&d5$FJ0u6%TRA$KLV|9Uvvt*8iNOS5TGGrrPPF1
z(IBJ~NB8tu@Z2e|`%t*Kxp8rEBffdq+uNJg;xlUk|3-`o2l_dmdP8crdHq;|mxn*m
zMXe?sQPNE1?%|PN6luy^0JthY^Rf}*@a7g4l7`LsvmIueS3A7x%gV~mU)jtCYL-q4
zq~jSDSg{d5kdu=$G0A_p{Dy?bz`)?dOZ-hNozq&?$_gw`g9Hz+tVoqKyzYO$_WrW}
zsV$4AvUB@oFV)DLeJKou$at-}smaIdiy-O2?r>5(snBOx*~@F+L#C?kw<ktgT3VpJ
zkDXn0BlUC9bT}FrEG(>zx5SLE0s&>hm!Hgu2(a$Q^Oesp*OTs-p}`#=1O<she#Q$w
z(Xp_w@bP)iSLmpysNA2ejgE{!g2r-$+N-M0?l%IB>dc2%9J_@aR$G7~cqZ){5jN79
z_4TeVQ~6+}+$w#bWpO0Ay8F#XN@@V;?aRo>cp6DhNMIAOh5*ga&lgC=@&~*SJg!cD
zdhhVMG-W;n%cr}%KHY}F&h0=A#F0E-Z1qJ&Awn5uWM*cjr7f<m+Wi8~*``zElQetJ
zV?JQ#1zru)zBcLNvFPIu=`k@e2?_~8fS7Qj<!SVEb>r~aOo6j@5;r$7**Z9o$y1i0
z#zYNwb$2&tb}#_?AA_ObHU}3fbg+ntU;bt~ZaeXUK&0`3``MneO=zvHtutUHt;PJF
zAaZzI{DCMOM#XIYq7Oo`QuXy}u9Rfy@|LyQVV#`p>{oz&dUtmR+p%zBe8)&sQBi@d
z@?D;0=hDk*XJuuj{7VGTOu2F2larGJ3=D&GrJt~oT3=;ITU#5gQjWH^_QTVY&1dmE
z<xSBXIiPf}%g=kM&Vm5}mr45%&qGsPUB;)tH(*}O!fuaDeN(1WPiQJA_&TaF`B9o0
zbJ)Dj1*D^;1-JjR$-E)_9t8!(Mqt(dHLAy$t_=z}5=-jno=dO@_`2oMFTl)<3M7*E
zc{U;DTVi5jI)_!j!&Z0=0W(mC_jfG4`~B_t@p`}6e&ueQX9EOc=in$WD~pPbMkD40
zNt*(GHQ?jIp%M>GP8zkkSOCR?1_lN;Hb*1LjA$hM4PKYqW`l9of#*Q0sWFLCCbxop
zNLE%BaIb;J2S`v|eZ3A-N^I=W6amoaI5jmj;>pg=UaXMG?Qx>b$XI@J<EG%)?YuKk
zD4TS5w)VX7{^r>mim<o0M?ykUu2nrTHrC;^)tkX(t8N;9c6PQmnyQrc5#sG}kSx&e
z?Qvg2L&J8VO2hhTBKK3^_V36~?=P;W%Z)`vMgB(qz}3ZndVARF3qM|{?hbfuyHj0m
zcFb*V_B1dc0g*E^j}H#gQc>+ZT<!q(#BV2-=c#QQ&&UshfZ!2$&VW+n$G@5NR+HSE
zoG-^l??+lB#h~l;R7`aA3t;N>5*E!I0u8t%JPxZv2~?dO9b)3*V1>*kptO^K!*X_h
z9*7`S+LtkIZ-MX6M&ja7#GYGDEkKjUIZ*hx*5;0c%e2^NtEH~~!)32Bge>r-wz-*y
zipp%I$(|w5#>R%<=cYuTW#5UHmzaf-5y=DS(7=TiA2$ioW=awBjvQzxDuO``1v-1<
z$v)*|WRz)!wY0PV7CTTA0Nm7-$;rv3rN^&m!Vz(C0k5|U5FlVwfw`FTDX`1s;bxdN
zcPxWTjso1?{=8Ug1_cEr9ZQ&;n3%5(gN2RF>9{Tk60x(hQ&pV?#=KO$j4~I9B4^8u
z$3PPaFR{aHi3$P&g6H}ATz)o}?L3{|W*^!oLH8r<_`$E=UhY;Lfxz)&<$8WzlQcdx
zCkItJaWI~2a%iZgw)XPwuFL%x_ee%6{7RLM30ICXp`)Y2_;?CiEbCKXPLFYxus<PC
zv*fa64qR!EF)tq<eVIKk;K~*j7h}Rt#(mLmvLi^4K}JQ@Br@;SoL*RYf4NplmoL^f
zcDdRe-rn9`UDao0tp+-9dmw@F6%2U6_i(J$&dSWxQ&E9<mp#>jgM}4C=k&Sx5-E|G
zo_@Hq0~LrN*px@;MOSIB0|c>u;>_El6?-}pI{brQSJ7U*+A{XnTt!7id}N><NC3F@
zC^!xdj^xbGuwN<A;TzQX8J2sDKLQ;ph#Q*oilU-DV4xE7^?rl#$qbnJ7#J8pp>7hb
z63|%EtRhW#bKYb9oZs8qySZu1gi8brxYY#l0}XLdPfFgZlJb|fbM(;`=N-3P1Q~gG
zc{w>odU}<#snu53q@<)JyF1_WPOww**w`4*WrG<h@y$M)*V%MuAU2~;Lk39{9XA^I
zc^jCN5TK~@>Cj2xXP^|=Zn+^+g0j-6+fYun47q2=i5EAzXsCMw?-qu%I>%sOdfH_$
zj-*_(a%5;INbeAbQIpA_?FI;Qz_YRI#H%|3aY{u2)$(38zm4;W9vgABmW7Ea6bcMH
zfAopjLcW?G$Uwi2f!IH}9~4XU#`y9nSVUQeHvg)HUEjdq)ALC^ka6feGhiTr#zqbG
zMX@|(I&6jxc+f}3^$s<#qP2_G-U86(ty6Dl(CQ*NC9UNKlPm|@b#`*{vK3Ay{ICgy
z83~QnHM(~Nj4Xm;@t4r@@^W8aU+H+#%9oSbQZ*zbB%E70DJdzVOZ5@aN@V|PaWS!h
zSfbRlG<bm~cGxm<U@1}Z&*uWDGrV5(p=D=htEsB~^K5Kyn{q_N-0;_F1KE-AhyW8C
z2gm>Y^&STY2grv&FasW1U?)X}IygAk+S(e3FM_13+}!i)>u-PZ#3>UZ#PWc~`^V>J
zASw4!_-bm-A50a%K=v#c1sdq;j_hB5{rVLrI13Vw)lksu!dP6q4@fla?d?F}ck=ra
zFls>CdsA0e&g4FDLFG4oAP_KJs;a7h7>yArp;~&=^A!%r5*fVCl+pbz?(Y9|{R5TY
zh=_=gkdT{;b;ZShfdhW{@BujErlzK-AgqCG0Z&v?L2Vr!pfenZOb{Tmff!jOB~}&|
z2RpkCAV^i{)O!i812>pV&<iB`JwARC*ayaN*xbd<%?+R)pu$I)j)|GsqE=h0N<W>~
zIX@{$O;a<ayxi8?+gnFx$+7#j$#q|mmKKbJ$C3s7e_mb>ko<tiv#@YqUfp$ccnG8g
z)TCi`(vz@P<;!WHe4B*VX%i?X*Jt5jVL1b$IB+Hm;e7}wC_(`*5B<@20{r|TAt68(
z;$UY7AmD3Ah!gKB&xRjxr!WZ#eQpj^O4XOArhrl)HsF!L!^2B*AtH<|=9eg4nEkHK
zFelt?IdX7tplk>NK97K(pWn{j9!@O`sI)07V*q~JOdEE!LPt;UK2wzm_x|SQmjTsY
z7{(h`?&Ly_tezfGw&Fq^*m(2147E#ZM`h*s`uchaD?7+bFlX2tbtnO7RjFY`#O}kz
zM(iIMvH9s=*yW4_{ERrF1{o%hn4Fwk%GF4xj7$gs!K{NYk9l~7JdLfRqqOITJp3@j
zLV8M$moZr3Tvl{=Wlu3SVq+5%(IQo@@vjgPin@|$jiQ%`P$A?7Pwek##3ercXMy{v
Rzz1A_WF-|Ps>F;#{vV?@oK*k-

diff --git a/doc/figures/fpga.tex b/doc/figures/fpga.tex
index 12f0bcf5..9eafda95 100644
--- a/doc/figures/fpga.tex
+++ b/doc/figures/fpga.tex
@@ -359,8 +359,8 @@
   \draw[arrows=-,color=black] (links.north) to (boardrouter.south);
 
   % Is the board router connected to off-chip RAM?
-  \draw[arrows=-,color=black] (ram0.east) to (boardrouter.west);
-  \draw[arrows=-,color=black] (ram1.west) to (boardrouter.east);
+  \draw[arrows=-,color=mygreen] (ram0.east) to (boardrouter.west);
+  \draw[arrows=-,color=mygreen] (ram1.west) to (boardrouter.east);
 
 
 \end{tikzpicture}
diff --git a/hostlink/HostLink.cpp b/hostlink/HostLink.cpp
index aa4d3af6..4708457e 100644
--- a/hostlink/HostLink.cpp
+++ b/hostlink/HostLink.cpp
@@ -218,8 +218,9 @@ void HostLink::fromAddr(uint32_t addr, uint32_t* meshX, uint32_t* meshY,
   *meshY = addr;
 }
 
-// Inject a message via PCIe (blocking by default)
-bool HostLink::send(uint32_t dest, uint32_t numFlits, void* payload, bool block)
+// Internal helper for sending messages
+bool HostLink::sendHelper(uint32_t dest, uint32_t numFlits, void* payload,
+       bool block, uint32_t key)
 {
   assert(useSendBuffer ? block : true);
 
@@ -242,7 +243,7 @@ bool HostLink::send(uint32_t dest, uint32_t numFlits, void* payload, bool block)
     buffer[0] = dest;
     buffer[1] = 0;
     buffer[2] = (numFlits-1) << 24;
-    buffer[3] = 0;
+    buffer[3] = key;
 
     // Fill in message payload
     memcpy(&buffer[4], payload, numFlits*16);
@@ -285,6 +286,13 @@ bool HostLink::send(uint32_t dest, uint32_t numFlits, void* payload, bool block)
   }
 }
 
+
+// Inject a message via PCIe (blocking by default)
+bool HostLink::send(uint32_t dest, uint32_t numFlits, void* msg, bool block)
+{
+  return sendHelper(dest, numFlits, msg, block, 0);
+}
+
 // Flush the send buffer
 void HostLink::flush()
 {
@@ -298,7 +306,28 @@ void HostLink::flush()
 // Try to send a message (non-blocking, returns true on success)
 bool HostLink::trySend(uint32_t dest, uint32_t numFlits, void* msg)
 {
-  return send(dest, numFlits, msg, false);
+  return sendHelper(dest, numFlits, msg, false, 0);
+}
+
+// Send a message using routing key (blocking by default)
+bool HostLink::keySend(uint32_t key, uint32_t numFlits,
+       void* msg, bool block)
+{
+  uint32_t useRoutingKey = 1 << (
+    TinselLogThreadsPerCore + TinselLogCoresPerMailbox +
+    TinselMailboxMeshXBits + TinselMailboxMeshYBits +
+    TinselMeshXBits + TinselMeshYBits + 2);
+  return sendHelper(useRoutingKey, numFlits, msg, block, key);
+}
+
+// Try to send using routing key (non-blocking, returns true on success)
+bool HostLink::keyTrySend(uint32_t key, uint32_t numFlits, void* msg)
+{
+  uint32_t useRoutingKey = 1 << (
+    TinselLogThreadsPerCore + TinselLogCoresPerMailbox +
+    TinselMailboxMeshXBits + TinselMailboxMeshYBits +
+    TinselMeshXBits + TinselMeshYBits + 2);
+  return sendHelper(useRoutingKey, numFlits, msg, false, key);
 }
 
 // Receive a message via PCIe (blocking)
diff --git a/hostlink/HostLink.h b/hostlink/HostLink.h
index 81c9b32f..f6a7a71c 100644
--- a/hostlink/HostLink.h
+++ b/hostlink/HostLink.h
@@ -35,6 +35,10 @@ class HostLink {
 
   // Internal constructor
   void constructor(uint32_t numBoxesX, uint32_t numBoxesY);
+
+  // Internal helper for sending messages
+  bool sendHelper(uint32_t dest, uint32_t numFlits, void* payload,
+         bool block, uint32_t key);
  public:
   // Dimensions of board mesh
   int meshXLen;
@@ -65,6 +69,12 @@ class HostLink {
   // Try to send a message (non-blocking, returns true on success)
   bool trySend(uint32_t dest, uint32_t numFlits, void* msg);
 
+  // Send a message using routing key (blocking by default)
+  bool keySend(uint32_t key, uint32_t numFlits, void* msg, bool block = true);
+
+  // Try to send using routing key (non-blocking, returns true on success)
+  bool keyTrySend(uint32_t key, uint32_t numFlits, void* msg);
+
   // Receive a max-sized message (blocking)
   void recv(void* msg);
 
diff --git a/include/tinsel-interface.h b/include/tinsel-interface.h
index 93b5ec96..352b4461 100644
--- a/include/tinsel-interface.h
+++ b/include/tinsel-interface.h
@@ -166,7 +166,7 @@ INLINE uint32_t tinselAccId(
            uint32_t tileX, uint32_t tileY)
 {
   uint32_t addr;
-  addr = 0x4;
+  addr = 0x8;
   addr = (addr << TinselMeshYBits) | boardY;
   addr = (addr << TinselMeshXBits) | boardX;
   addr = (addr << TinselMailboxMeshYBits) | tileY;
diff --git a/include/tinsel.h b/include/tinsel.h
index 9ebd8451..ec26b849 100644
--- a/include/tinsel.h
+++ b/include/tinsel.h
@@ -176,6 +176,16 @@ INLINE void tinselSend(int dest, volatile void* addr)
   tinselMulticast(dest >> 6, high, low, addr);
 }
 
+// Send message at addr using given routing key
+INLINE void tinselKeySend(int key, volatile void* addr)
+{
+  // Special address to signify use of routing key
+  uint32_t useRoutingKey = 1 <<
+    (TinselMailboxMeshYBits + TinselMailboxMeshXBits +
+     TinselMeshXBits + TinselMeshYBits + 2);
+  tinselMulticast(useRoutingKey, 0, key, addr);
+}
+
 // Receive message
 INLINE volatile void* tinselRecv()
 {
diff --git a/rtl/DE5BridgeTop.bsv b/rtl/DE5BridgeTop.bsv
index 5dce9e25..15e2ba8f 100644
--- a/rtl/DE5BridgeTop.bsv
+++ b/rtl/DE5BridgeTop.bsv
@@ -12,9 +12,10 @@
 //   1. DA: Destination address (4 bytes)
 //   2. NM: Number of messages that follow minus one (4 bytes)
 //   3. FM: Number of flit payloads per message minus one (1 byte)
-//   4. Padding (7 bytes)
-//   5. (NM+1)*(FM+1) flit payloads ((NM+1)*(FM+1)*BytesPerFlit bytes)
-//   6. Goto step 1
+//   4. Padding (3 bytes)
+//   5. Routing key (optional, 4 bytes)
+//   6. (NM+1)*(FM+1) flit payloads ((NM+1)*(FM+1)*BytesPerFlit bytes)
+//   7. Goto step 1
 //
 // The format of the data stream in the FPGA->PC direction is simply
 // raw flit payloads.
@@ -161,6 +162,7 @@ module de5BridgeTop (DE5BridgeTop);
   Reg#(Bit#(32)) fromPCIeDA    <- mkConfigRegU;
   Reg#(Bit#(32)) fromPCIeNM    <- mkConfigRegU;
   Reg#(Bit#(8))  fromPCIeFM    <- mkConfigRegU;
+  Reg#(Bit#(32))  fromPCIeKey   <- mkConfigRegU;
   Reg#(Bit#(1))  toLinkState   <- mkConfigReg(0);
 
   Reg#(Bit#(32)) messageCount  <- mkConfigReg(0);
@@ -182,6 +184,7 @@ module de5BridgeTop (DE5BridgeTop);
         fromPCIeDA <= data[31:0];
         fromPCIeNM <= data[63:32];
         fromPCIeFM <= data[95:88];
+        fromPCIeKey <= data[127:96];
         toLinkState <= 1;
         fromPCIe.get;
       end
@@ -203,6 +206,10 @@ module de5BridgeTop (DE5BridgeTop);
         Flit flit;
         flit.dest.addr = unpack(truncate(fromPCIeDA[31:`LogThreadsPerMailbox]));
         flit.dest.threads = pack(destThreads);
+        // If address says to use routing key, then use it
+        if (flit.dest.addr.isKey) begin
+          flit.dest.threads = zeroExtend(fromPCIeKey);
+        end
         flit.payload = fromPCIe.value;
         flit.notFinalFlit = True;
         flit.isIdleToken = False;
diff --git a/rtl/Globals.bsv b/rtl/Globals.bsv
index a2648a23..42ea3c70 100644
--- a/rtl/Globals.bsv
+++ b/rtl/Globals.bsv
@@ -20,10 +20,13 @@ typedef struct {
 // destination board, it is routed either left or right depending
 // the contents of the host bit.  This is to support bridge boards
 // connected at the east/west rims of the FPGA mesh.
+// The 'isKey' bit means that the destination is a routing key, held
+// in the bottom 32 bits of the 'NetAddr'.
 // The 'acc' bit means message is routed to a custom accelerator rather
 // than a mailbox.
 typedef struct {
   Bool acc;
+  Bool isKey;
   Option#(Bit#(1)) host;
   BoardId board;
   MailboxId mbox;
@@ -42,6 +45,9 @@ typedef struct {
 
 function MailboxId getMailboxId(NetAddr addr) = addr.addr.mbox;
 
+// Extract routing key from network address
+function Bit#(32) getRoutingKey(NetAddr addr) = truncate(pack(addr));
+
 // ============================================================================
 // Messages
 // ============================================================================
diff --git a/rtl/IdleDetector.bsv b/rtl/IdleDetector.bsv
index 0307f198..4cb3ccc5 100644
--- a/rtl/IdleDetector.bsv
+++ b/rtl/IdleDetector.bsv
@@ -221,6 +221,7 @@ module mkIdleDetector (IdleDetector);
       NetAddr {
         addr: MailboxNetAddr {
           acc: False,
+          isKey: False,
           host: option(True, 0),
           board: BoardId { y: 0, x: 0 },
           mbox: MailboxId { y: 0, x: 0 }
@@ -538,6 +539,7 @@ module mkIdleDetectMaster (IdleDetectMaster);
       NetAddr {
         addr: MailboxNetAddr {
           acc: False,
+          isKey: False,
           host: option(False, 0),
           board: BoardId { y: truncate(boardY), x: truncate(boardX) },
           mbox: MailboxId { y: 0, x: 0 }

From ec7dd777c7cac19f993a10d981976a72d111e0b4 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Tue, 4 Feb 2020 16:07:23 +0000
Subject: [PATCH 05/78] New types for routing keys, records, etc.

---
 rtl/Globals.bsv     |   2 +-
 rtl/ProgRouting.bsv | 111 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 112 insertions(+), 1 deletion(-)
 create mode 100644 rtl/ProgRouting.bsv

diff --git a/rtl/Globals.bsv b/rtl/Globals.bsv
index 42ea3c70..914852e9 100644
--- a/rtl/Globals.bsv
+++ b/rtl/Globals.bsv
@@ -46,7 +46,7 @@ typedef struct {
 function MailboxId getMailboxId(NetAddr addr) = addr.addr.mbox;
 
 // Extract routing key from network address
-function Bit#(32) getRoutingKey(NetAddr addr) = truncate(pack(addr));
+function Bit#(32) getRoutingKeyRaw(NetAddr addr) = truncate(pack(addr));
 
 // ============================================================================
 // Messages
diff --git a/rtl/ProgRouting.bsv b/rtl/ProgRouting.bsv
new file mode 100644
index 00000000..b3819180
--- /dev/null
+++ b/rtl/ProgRouting.bsv
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: BSD-2-Clause
+package ProgRouting;
+
+// Functions and data types for programmable routers
+
+// =============================================================================
+// Routing keys and beats
+// =============================================================================
+
+// A routing record is either 40 bits or 80 bits in size (aligned on a
+// 40-bit or 80-bit boundary respectively). Multiple records are
+// packed into a 256-bit DRAM beat (aligned on a 256-bit boundary).
+// The most significant 16 bits of the beat contain a count of the
+// number of records in the beat (in the range 1 to 6 inclusive). The
+// remaining 240 bits contain records. The first record lies in the
+// least-significant bits of the beat. The size portion of the routing
+// key contains the number of contiguous DRAM beats holding all
+// records for the key.
+
+// 256-bit routing beat
+typedef struct {
+  // Number of 40-bit record chunks present
+  Bit#(16) size;
+  // The 40-bit record chunks
+  Vector#(6, Bit#(40)) chunks;
+} RoutingBeat deriving (Bits);
+
+// 32-bit routing key
+typedef struct {
+  // Pointer to array of routing beats containing routing records
+  Bit#(26) ptr;
+  // Number of beats in the array
+  Bit#(6) numBeats;
+} RoutingKey deriving (Bits);
+
+// =============================================================================
+// Types of routing record
+// =============================================================================
+
+typedef enum {
+  URM1 = 3'd0, // 40-bit Unicast Router-to-Mailbox
+  URM2 = 3'd1, // 80-bit Unicast Router-to-Mailbox
+  RR   = 3'd2, // 40-bit Router-to-Router
+  MRM  = 3'd3, // 80-bit Multicast Router-to-Mailbox
+  IND  = 3'd4  // 40-bit Indirection
+} RoutingRecordTag deriving (Bits, Eq);
+
+// 40-bit Unicast Router-to-Mailbox (URM1) record
+typedef struct {
+  // Record type
+  RoutingRecordTag tag;
+  // Mailbox destination
+  Bit#(4) mbox;
+  // Mailbox-local thread identifier
+  Bit#(6) thread;
+  // Local key. The first word of the message
+  // payload is overwritten with this.
+  Bit#(27) localKey;
+} URM1Record deriving (Bits);
+
+// 80-bit Unicast Router-to-Mailbox (URM2) record
+typedef struct {
+  // Record type
+  RoutingRecordTag tag;
+  // Mailbox destination
+  Bit#(4) mbox;
+  // Mailbox-local thread identifier
+  Bit#(6) thread;
+  // Currently unused
+  Bit#(3) unused;
+  // Local key. The first two words of the message
+  // payload are overwritten with this.
+  Bit#(64) localKey;
+} URM2Record deriving (Bits);
+
+// 40-bit Router-to-Router (RR) record
+typedef struct {
+  // Record type
+  RoutingRecordTag tag;
+  // Direction (N, S, E, or W)
+  Bit#(2) dir;
+  // Currently unused
+  Bit#(3) unused;
+  // New 32-bit routing key that will replace the one in the
+  // current message for the next hop of the message's journey
+  Bit#(32) newKey;
+} RRRecord deriving (Bits);
+
+// 80-bit Multicast Router-to-Mailbox (MRM) record
+typedef struct {
+  // Record type
+  RoutingRecordTag tag;
+  // Mailbox destination
+  Bit#(4) mbox;
+  // Currently unused
+  Bit#(9) unused;
+  // Mailbox-local destination mask
+  Bit#(64) destMask;
+} MRMRecord deriving (Bits);
+
+// 40-bit Indirection (IND) record
+typedef struct {
+  // Record type
+  RoutingRecordTag tag;
+  // Currently unused
+  Bit#(5) unused;
+  // New 32-bit routing key for new set of records on current router
+  Bit#(32) newKey;
+} INDRecord deriving (Bits);
+
+endpackage

From 57346c9f760da882943419ea1fda3d27451d38f1 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Thu, 19 Mar 2020 11:29:22 +0000
Subject: [PATCH 06/78] Give prog-router multiple ports to each DRAM

---
 config.py           |  4 ++++
 rtl/Connections.bsv |  9 ++++-----
 rtl/Network.bsv     | 25 +++++++++++++++++--------
 3 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/config.py b/config.py
index 74c7f63e..9021804c 100755
--- a/config.py
+++ b/config.py
@@ -360,6 +360,10 @@ def quoted(s): return "'\"" + s + "\"'"
 # Number of FPGA boards per box (including bridge board)
 p["BoardsPerBox"] = p["MeshXLenWithinBox"] * p["MeshYLenWithinBox"] + 1
 
+# Number of fetchers in the per-board programmable router
+# (Currently assumed to be 4)
+p["FetchersPerProgRouter"] = 4
+
 #==============================================================================
 # Main 
 #==============================================================================
diff --git a/rtl/Connections.bsv b/rtl/Connections.bsv
index 4de1c4d8..013224c9 100644
--- a/rtl/Connections.bsv
+++ b/rtl/Connections.bsv
@@ -55,7 +55,8 @@ module connectClientsToOffChipRAM#(
   // Data caches
   Vector#(`DCachesPerDRAM, DCache) caches,
   // Programmable per-board router, reqs and resps
-  BOut#(DRAMReq) routerReqs, In#(DRAMResp) routerResps,
+  Vector#(`FetchersPerProgRouter, BOut#(DRAMReq)) routerReqs,
+  Vector#(`FetchersPerProgRouter, In#(DRAMResp)) routerResps,
   // Off-chip memory
   OffChipRAM ram) ();
 
@@ -63,8 +64,7 @@ module connectClientsToOffChipRAM#(
   function getReqOut(cache) = cache.reqOut;
   let reqs <- mkMergeTreeB(Fair,
                 mkUGShiftQueue1(QueueOptFmax),
-                append(map(getReqOut, caches),
-                  cons(routerReqs, nil)));
+                append(map(getReqOut, caches), routerReqs));
   connectUsing(mkUGQueue, reqs, ram.reqIn);
 
   // Connect load responses
@@ -73,8 +73,7 @@ module connectClientsToOffChipRAM#(
   let ramResps <- mkResponseDistributor(
                     getRespKey,
                     mkUGShiftQueue2(QueueOptFmax),
-                    append(map(getRespIn, caches), 
-                      cons(routerResps, nil)));
+                    append(map(getRespIn, caches), routerResps));
   connectDirect(ram.respOut, ramResps);
 
 endmodule
diff --git a/rtl/Network.bsv b/rtl/Network.bsv
index 642acfcb..99229c96 100644
--- a/rtl/Network.bsv
+++ b/rtl/Network.bsv
@@ -381,8 +381,10 @@ interface NoC;
   interface Vector#(`NumEastWestLinks, AvalonMac) west;
   `endif
   // Connections to off-chip memory (for the programmable router)
-  interface Vector#(`DRAMsPerBoard, BOut#(DRAMReq)) dramReqs;
-  interface Vector#(`DRAMsPerBoard, In#(DRAMResp)) dramResps;
+  interface Vector#(`DRAMsPerBoard,
+    Vector#(`FetchersPerProgRouter, BOut#(DRAMReq))) dramReqs;
+  interface Vector#(`DRAMsPerBoard,
+    Vector#(`FetchersPerProgRouter, In#(DRAMResp))) dramResps;
 endinterface
 
 module mkNoC#(
@@ -404,12 +406,17 @@ module mkNoC#(
     mapM(mkBoardLink(linkEnable[3]), westSocket);
 
   // Responses from off-chip memory
-  Vector#(`DRAMsPerBoard, InPort#(DRAMResp)) dramRespPort <-
-    replicateM(mkInPort);
+  Vector#(`DRAMsPerBoard,
+    Vector#(`FetchersPerProgRouter, InPort#(DRAMResp))) dramRespPort <-
+      replicateM(replicateM(mkInPort));
 
   // Requests to off-chip memory
-  Vector#(`DRAMsPerBoard, Queue1#(DRAMReq)) dramReqQueues <-
-    replicateM(mkUGShiftQueue1(QueueOptFmax));
+  Vector#(`DRAMsPerBoard,
+    Vector#(`FetchersPerProgRouter, Queue1#(DRAMReq))) dramReqQueues <-
+      replicateM(replicateM(mkUGShiftQueue1(QueueOptFmax)));
+
+  // Dimension-ordered routers
+  // -------------------------
 
   // Create mailbox routers
   Vector#(`MailboxMeshYLen,
@@ -567,10 +574,12 @@ module mkNoC#(
   `endif
 
   // Requests to off-chip memory
-  interface dramReqs = Vector::map(queueToBOut, dramReqQueues);
+  interface dramReqs =
+    Vector::map(Vector::map(queueToBOut), dramReqQueues);
 
   // Responses from off-chip memory
-  interface dramResps = Vector::map(getIn, dramRespPort);
+  interface dramResps =
+    Vector::map(Vector::map(getIn), dramRespPort);
 
 endmodule
 

From 77c62f77a4d7353b9865aac7deb8c334d893f840 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Tue, 24 Mar 2020 17:33:19 +0000
Subject: [PATCH 07/78] Generalise DRAMResp

Need to support inflight request info for programmable routers now, in
addition to caches.  Therefore, generalise the info field in a
DRAMResp.  Need to check that the optimiser does what I expect in this
case, i.e. throws away unused bits in the info field.
---
 config.py                               |  6 ++-
 rtl/DCache.bsv                          |  9 ++--
 rtl/DRAM.bsv                            | 19 ++++---
 rtl/NarrowSRAM.bsv                      |  9 ++--
 rtl/{ProgRouting.bsv => ProgRouter.bsv} | 68 ++++++++++++++++++++++---
 rtl/WideSRAM.bsv                        |  1 +
 6 files changed, 87 insertions(+), 25 deletions(-)
 rename rtl/{ProgRouting.bsv => ProgRouter.bsv} (58%)

diff --git a/config.py b/config.py
index 9021804c..7f931637 100755
--- a/config.py
+++ b/config.py
@@ -361,8 +361,10 @@ def quoted(s): return "'\"" + s + "\"'"
 p["BoardsPerBox"] = p["MeshXLenWithinBox"] * p["MeshYLenWithinBox"] + 1
 
 # Number of fetchers in the per-board programmable router
-# (Currently assumed to be 4)
-p["FetchersPerProgRouter"] = 4
+# Parameters for programmable routers
+# (and the routing-record fetchers they contain)
+p["FetchersPerProgRouter"] = 5
+p["LogFetcherFlitBufferSize"] = 5
 
 #==============================================================================
 # Main 
diff --git a/rtl/DCache.bsv b/rtl/DCache.bsv
index b99d4667..bbed742d 100644
--- a/rtl/DCache.bsv
+++ b/rtl/DCache.bsv
@@ -437,9 +437,10 @@ module mkDCache#(DCacheId myId) (DCache);
     // This rule either consumes a flush request or a memory response
     let flush = flushQueue.dataOut;
     let resp = respPort.value;
+    InflightDCacheReqInfo info = unpack(truncate(resp.info));
     lineWriteDataWire <= resp.data;
-    lineWriteIndexWire <= beatIndex(resp.info.beat, resp.info.req.id,
-                            resp.info.req.addr, resp.info.way);
+    lineWriteIndexWire <= beatIndex(truncate(resp.beat), info.req.id,
+                            info.req.addr, info.way);
     // Ready to consume flush queue?
     if (flushQueue.canDeq && flushQueue.canPeek) begin
       flush.req.cmd.isFlush = False;
@@ -453,14 +454,14 @@ module mkDCache#(DCacheId myId) (DCache);
       // Remove item from fill queue and feed associated request (which
       // will definitely hit if it starts again from the beginning of
       // the pipeline) back to beginning of the pipeline
-      if (allHigh(resp.info.beat))
+      if (allHigh(resp.beat))
         feedbackTrigger <= True;
       // Write new line data to dataMem
       // (The write parameters are set outside condition for better timing)
       lineWriteReqWire <= True;
       respPort.get;
       // Set feedback request
-      feedbackReq <= resp.info.req;
+      feedbackReq <= info.req;
     end
   endrule
 
diff --git a/rtl/DRAM.bsv b/rtl/DRAM.bsv
index e5d4a33e..d188afbd 100644
--- a/rtl/DRAM.bsv
+++ b/rtl/DRAM.bsv
@@ -25,8 +25,13 @@ typedef struct {
 typedef struct {
   DRAMReqId id;
   Bit#(`BeatWidth) data;
-  InflightDCacheReqInfo info;
+  // Which beat is it?
   Bool finalBeat;
+  Bit#(`BeatBurstWidth) beat;
+  // Data from original load request
+  // (Can be largely ignored and optimised away, but
+  // can also hold useful info about the original request)
+  Bit#(`BeatWidth) info;
 } DRAMResp deriving (Bits);
 
 // DRAM identifier
@@ -83,7 +88,6 @@ import Util        :: *;
 import Interface   :: *;
 import Queue       :: *;
 import Assert      :: *;
-import DCacheTypes :: *;
 
 // Types
 // -----
@@ -154,8 +158,8 @@ module mkDRAM#(RAMId id) (DRAM);
           DRAMResp resp;
           resp.id = req.id;
           resp.data = pack(elems);
-          resp.info = unpack(truncate(req.data));
-          resp.info.beat = truncate(burstCount);
+          resp.info = req.data;
+          resp.beat = burstCount;
           resp.finalBeat = finalBeat;
           resps.enq(resp);
           decOutstanding.send;
@@ -222,7 +226,6 @@ import Interface   :: *;
 import Assert      :: *;
 import Util        :: *;
 import Assert      :: *;
-import DCacheTypes :: *;
 
 // Types
 // -----
@@ -247,7 +250,7 @@ endinterface
 typedef struct {
   DRAMReqId id;
   Bit#(`BeatBurstWidth) burst;
-  InflightDCacheReqInfo info;
+  Bit#(`BeatWidth) info;
 } DRAMInFlightReq deriving (Bits);
 
 // Implementation
@@ -312,7 +315,7 @@ module mkDRAM#(t id) (DRAM);
           DRAMInFlightReq inflightReq;
           inflightReq.id = req.id;
           inflightReq.burst = req.burst;
-          inflightReq.info = unpack(truncate(req.data));
+          inflightReq.info = req.data;
           inFlight.enq(inflightReq);
           inFlightCount.incBy(zeroExtend(req.burst));
         end
@@ -339,7 +342,7 @@ module mkDRAM#(t id) (DRAM);
       DRAMResp resp;
       resp.id = inFlight.dataOut.id;
       resp.info = inFlight.dataOut.info;
-      resp.info.beat = truncate(burstCount-1);
+      resp.beat = truncate(burstCount-1);
       resp.data = respBuffer.dataOut;
       resp.finalBeat = burstCount == inFlight.dataOut.burst;
       return resp;
diff --git a/rtl/NarrowSRAM.bsv b/rtl/NarrowSRAM.bsv
index 0fbd34fa..dde0e08a 100644
--- a/rtl/NarrowSRAM.bsv
+++ b/rtl/NarrowSRAM.bsv
@@ -1,8 +1,7 @@
 // SPDX-License-Identifier: BSD-2-Clause
 package NarrowSRAM;
 
-import DCacheTypes :: *;
-import Util        :: *;
+import Util :: *;
 
 // ============================================================================
 // Types
@@ -16,7 +15,7 @@ typedef struct {
   SRAMReqId id;
   Bit#(`SRAMAddrWidth) addr;
   Bit#(`SRAMBurstWidth) burst;
-  InflightDCacheReqInfo info;
+  Bit#(`BeatWidth) info;
 } SRAMLoadReq deriving (Bits);
 
 // SRAM store request
@@ -31,7 +30,7 @@ typedef struct {
 typedef struct {
   SRAMReqId id;
   Bit#(`SRAMDataWidth) data;
-  InflightDCacheReqInfo info;
+  Bit#(`BeatWidth) info;
 } SRAMResp deriving (Bits);
 
 // ============================================================================
@@ -243,7 +242,7 @@ endinterface
 typedef struct {
   SRAMReqId id;
   Bit#(`SRAMBurstWidth) burst;
-  InflightDCacheReqInfo info;
+  Bit#(`BeatWidth) info;
 } SRAMInFlightReq deriving (Bits);
 
 // SRAM Implementation
diff --git a/rtl/ProgRouting.bsv b/rtl/ProgRouter.bsv
similarity index 58%
rename from rtl/ProgRouting.bsv
rename to rtl/ProgRouter.bsv
index b3819180..8796182a 100644
--- a/rtl/ProgRouting.bsv
+++ b/rtl/ProgRouter.bsv
@@ -1,7 +1,6 @@
 // SPDX-License-Identifier: BSD-2-Clause
-package ProgRouting;
-
-// Functions and data types for programmable routers
+// Functions, data types, and modules for programmable routers
+package ProgRouter;
 
 // =============================================================================
 // Routing keys and beats
@@ -27,12 +26,18 @@ typedef struct {
 
 // 32-bit routing key
 typedef struct {
+  // Which off-chip RAM?
+  Bit#(`LogDRAMsPerBoard) ram
   // Pointer to array of routing beats containing routing records
-  Bit#(26) ptr;
+  Bit#(`LogBeatsPerDRAM) ptr;
   // Number of beats in the array
-  Bit#(6) numBeats;
+  Bit#(`LogRoutingEntryLen) numBeats;
 } RoutingKey deriving (Bits);
 
+// Extract routing key from an address
+function RoutingKey getRoutingKey(NetAddr addr) =
+  unpack(getRoutingKeyRaw(addr));
+
 // =============================================================================
 // Types of routing record
 // =============================================================================
@@ -106,6 +111,57 @@ typedef struct {
   Bit#(5) unused;
   // New 32-bit routing key for new set of records on current router
   Bit#(32) newKey;
-} MRMRecord deriving (Bits);
+} INDRecord deriving (Bits);
+
+// =============================================================================
+// Design
+// =============================================================================
+
+// =============================================================================
+// Fetcher
+// =============================================================================
+
+// Address in a fetcher's flit buffer
+typedef Bit#(TSub#(`LogFetcherFlitBufferSize, `LogMaxFlitsPerMsg))
+  FetcherFlitBufferMsgAddr;
+
+// This structure contains information about an in-flight memory
+// request from a fetcher.  When a fetcher issues a memory load
+// request, this info is packed into the unused data field of the
+// request.  When the memory subsystem responds, it passes back the
+// same info in an extra field inside the memory response structure.
+// Maintaining info about an inflight request inside the request
+// itself provides an easy way to handle out-of-order responses from
+// memory.
+typedef struct {
+  // Message address in the fetcher's flit buffer
+  FetcherFlitBufferMsgAddr msgAddr;
+  // Is this the final routing beat for the key being fetched?
+  Bool finalBeat;
+} InflightFetcherReqInfo deriving (Bits);
+
+// =============================================================================
+// Programmable router
+// =============================================================================
+
+interface ProgRouter;
+  // Incoming and outgoing flits
+  interface Vector#(`FetchersPerProgRouter, In#(Flit) flitIn);
+  interface Vector#(`FetchersPerProgRouter, Out#(Flit) flitOut);
+
+  // Interface to off-chip memory
+  interface Vector#(`DRAMsPerBoard,
+    Vector#(`FetchersPerProgRouter, BOut#(DRAMReq))) ramReqs;
+  interface Vector#(`DRAMsPerBoard,
+    Vector#(`FetchersPerProgRouter, In#(DRAMResp))) ramResps;
+endinterface
+
+module mkProgRouter (ProgRouter);
+
+  // Flit input ports
+  Vector#(`FetchersPerProgRouter, InPort#(Flit)) flitInPort <-
+    replicateM(mkInPort);
+
+endmodule
 
 endpackage
diff --git a/rtl/WideSRAM.bsv b/rtl/WideSRAM.bsv
index a3816a38..04af1dc7 100644
--- a/rtl/WideSRAM.bsv
+++ b/rtl/WideSRAM.bsv
@@ -108,6 +108,7 @@ module mkWideSRAM#(RAMId id) (WideSRAM);
         respOut.data = pack(data);
         respOut.info = respIn.info;
         respOut.finalBeat = True;
+        respOut.beat = 0;
         respQueue.enq(respOut);
         respCount <= 0;
       end

From 52c4317e1df6efdca37a7f2e6a2bcc86512db57d Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Tue, 24 Mar 2020 21:31:19 +0000
Subject: [PATCH 08/78] Fixes to previous commit

---
 rtl/DCache.bsv      | 6 +++---
 rtl/DCacheTypes.bsv | 1 -
 rtl/DRAM.bsv        | 2 +-
 rtl/NarrowSRAM.bsv  | 3 +--
 4 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/rtl/DCache.bsv b/rtl/DCache.bsv
index bbed742d..e972a858 100644
--- a/rtl/DCache.bsv
+++ b/rtl/DCache.bsv
@@ -438,8 +438,9 @@ module mkDCache#(DCacheId myId) (DCache);
     let flush = flushQueue.dataOut;
     let resp = respPort.value;
     InflightDCacheReqInfo info = unpack(truncate(resp.info));
+    Bit#(`LogBeatsPerLine) beat = truncate(resp.beat);
     lineWriteDataWire <= resp.data;
-    lineWriteIndexWire <= beatIndex(truncate(resp.beat), info.req.id,
+    lineWriteIndexWire <= beatIndex(beat, info.req.id,
                             info.req.addr, info.way);
     // Ready to consume flush queue?
     if (flushQueue.canDeq && flushQueue.canPeek) begin
@@ -454,7 +455,7 @@ module mkDCache#(DCacheId myId) (DCache);
       // Remove item from fill queue and feed associated request (which
       // will definitely hit if it starts again from the beginning of
       // the pipeline) back to beginning of the pipeline
-      if (allHigh(resp.beat))
+      if (allHigh(beat))
         feedbackTrigger <= True;
       // Write new line data to dataMem
       // (The write parameters are set outside condition for better timing)
@@ -493,7 +494,6 @@ module mkDCache#(DCacheId myId) (DCache);
     InflightDCacheReqInfo info;
     info.req = miss.req;
     info.way = miss.evictWay;
-    info.beat = ?;
     // Create memory request
     DRAMReq memReq;
     memReq.isStore = !isLoad;
diff --git a/rtl/DCacheTypes.bsv b/rtl/DCacheTypes.bsv
index fa6ba407..4ddd809f 100644
--- a/rtl/DCacheTypes.bsv
+++ b/rtl/DCacheTypes.bsv
@@ -43,7 +43,6 @@ typedef struct {
 typedef struct {
   DCacheReq req;
   Way way;
-  Bit#(`LogBeatsPerLine) beat;
 } InflightDCacheReqInfo deriving (Bits);
 
 endpackage
diff --git a/rtl/DRAM.bsv b/rtl/DRAM.bsv
index d188afbd..406cfe89 100644
--- a/rtl/DRAM.bsv
+++ b/rtl/DRAM.bsv
@@ -6,7 +6,7 @@ package DRAM;
 // ============================================================================
 
 // DRAM client id
-typedef Bit#(TAdd#(`LogDCachesPerDRAM, 1)) DRAMClientId;
+typedef Bit#(TLog#(TAdd#(`DCachesPerDRAM,`FetchersPerProgRouter))) DRAMClientId;
 
 // DRAM request id
 typedef DRAMClientId DRAMReqId;
diff --git a/rtl/NarrowSRAM.bsv b/rtl/NarrowSRAM.bsv
index dde0e08a..4e51be85 100644
--- a/rtl/NarrowSRAM.bsv
+++ b/rtl/NarrowSRAM.bsv
@@ -8,7 +8,7 @@ import Util :: *;
 // ============================================================================
 
 // SRAM request id
-typedef Bit#(TAdd#(`LogDCachesPerDRAM, 1)) SRAMReqId;
+typedef Bit#(TLog#(TAdd#(`DCachesPerDRAM,`FetchersPerProgRouter))) SRAMReqId;
 
 // SRAM load request
 typedef struct {
@@ -139,7 +139,6 @@ module mkSRAM#(RAMId id) (SRAM);
         resp.id = req.id;
         resp.data = pack(elems);
         resp.info = req.info;
-        resp.info.beat = truncate(loadBurstCount);
         resps.enq(resp);
         inFlightCount.dec;
       end

From b512bf969313032ecb5d0c6b18596bbcaa279c1c Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Wed, 25 Mar 2020 13:49:51 +0000
Subject: [PATCH 09/78] Add some design notes

---
 rtl/ProgRouter.bsv | 50 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index 8796182a..4cbc75f1 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -117,6 +117,56 @@ typedef struct {
 // Design
 // =============================================================================
 
+// In the following diagram N/S/E/W are the inter-FPGA links and
+// L0..L3 are links at one edge of the NoC.  Depending on the NoC
+// dimensions, there may be more or less than four links on a single
+// NoC edge, but the diagram assumes four.
+
+//
+//               N     S     E     W     L0..L3/Loop   Input flits
+//               |     |     |     |     |       |
+//             +---+ +---+ +---+ +---+ +---+     |
+//             | F | | F | | F | | F | | F |     |     Fetchers
+//             +---+ +---+ +---+ +---+ +---+     |
+//               |     |     |     |     |       |
+//             +---------------------------+     |
+//             |          Crossbar         |     |     Preliminary routing
+//             +---------------------------+     |
+//               |     |     |     |     |       |
+//              N/L0  S/L1  E/L2  W/L3   Ind-----+     Output queues
+//               |     |     |     |
+//             +---------------------------+
+//             |          Expander         |           Final expansion
+//             +---------------------------+
+//               |  |  |  |  |  |  |  |
+//               N  S  E  W  L0 L1 L2 L3               Output flits
+//
+
+// The core functionality is implemented in the fetchers, which:
+//   (1) extract routing keys from incoming flits;
+//   (2) lookup the keys in RAM;
+//   (3) interpret the resulting routing records; and
+//   (4) emit the interpreted flits.
+
+// The key property of these fetchers is that they act entirely
+// independently of each other: each one can make progress even if
+// another is blocked.  Unfortunately, this leads to duplicated
+// logic resources, but is necessary to avoid deadlock.
+
+// Note that, as the routers are fully programmable, it is possible
+// for the programmer to introduce deadlock using an ill-defined
+// routing scheme, e.g. where a flit arrives in on (say) link N and
+// requires a flit to be sent back along the same direction N.
+// However, the hardware does guarantee deadlock-freedom if the
+// routing scheme is based on dimension-ordered routing.
+
+// After the fetchers have interpreted the flits, they are fed to a
+// fair crossbar which organises them by destination into output
+// queues.  To reduce logic, we allow each inter-board link to share
+// an output queue with a local link, as this does not compromise
+// forward progress.  Finally the queues are expanded to provide an
+// output stream for each possible destination.
+
 // =============================================================================
 // Fetcher
 // =============================================================================

From 1c7ed6f1254acd576492625b60025a3e8eb3331b Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Wed, 25 Mar 2020 18:11:06 +0000
Subject: [PATCH 10/78] First stage of fetcher

---
 rtl/ProgRouter.bsv | 159 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 154 insertions(+), 5 deletions(-)

diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index 4cbc75f1..8f203277 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -171,9 +171,11 @@ typedef struct {
 // Fetcher
 // =============================================================================
 
-// Address in a fetcher's flit buffer
-typedef Bit#(TSub#(`LogFetcherFlitBufferSize, `LogMaxFlitsPerMsg))
-  FetcherFlitBufferMsgAddr;
+// Flit address in a fetcher's flit buffer
+typedef Bit#(`FetcherLogFlitBufferSize) FetcherFlitBufferAddr;
+
+// Message address in a fetcher's flit buffer
+typedef Bit#(`FetcherLogMsgsPerFlitBuffer) FetcherFlitBufferMsgAddr;
 
 // This structure contains information about an in-flight memory
 // request from a fetcher.  When a fetcher issues a memory load
@@ -186,10 +188,157 @@ typedef Bit#(TSub#(`LogFetcherFlitBufferSize, `LogMaxFlitsPerMsg))
 typedef struct {
   // Message address in the fetcher's flit buffer
   FetcherFlitBufferMsgAddr msgAddr;
-  // Is this the final routing beat for the key being fetched?
-  Bool finalBeat;
+  // How many beats in the burst?
+  Bit#(`BeatBurstWidth) burst;
+  // Is this the final burst of routing records for the current key?
+  Bool finalBurst;
 } InflightFetcherReqInfo deriving (Bits);
 
+// Fetcher interface
+interface Fetcher;
+  // Incoming and outgoing flits
+  interface In#(Flit) flitIn;
+  interface Out#(Flit) flitOut;
+  // Off-chip RAM connections
+  Vector#(`DRAMsPerBoard, BOut#(DRAMReq)) ramReqs;
+  Vector#(`DRAMsPerBoard, In#(DRAMResp)) ramResps;
+endinterface
+
+// Fetcher module
+module mkFetcher;
+
+  // Flit input port
+  InPort#(Flit)) flitInPort <- mkInPort;
+
+  // RAM response ports
+  Vector#(`DRAMsPerBoard, InPort#(DRAMResp)) ramRespPort <-
+    replicateM(mkInPort);
+
+  // RAM request queues
+  Vector#(`DRAMsPerBoard, Queue1#(DRAMReq)) ramReqQueue <-
+    replicateM(mkUGShiftQueue(QueueOptFmax));
+
+  // Flit buffer
+  BlockRamOpts flitBufferOpts =
+    BlockRamOpts {
+      readDuringWrite: DontCare,
+      style: "AUTO",
+      registerDataOut: False,
+      initFile: Invalid
+    };
+  BlockRam#(FetcherFlitBufferAddr, Flit) flitBuffer <- mkBlockRam;
+
+  // Beat buffer
+  SizedQueue#(`LogProgRouterBeatBufferSize, RoutingBeat)) beatBuffer <-
+    replicateM(mkUGSizedQueue);
+
+  // Stage 1: consume input message
+  // ------------------------------
+
+  // Consumer state
+  // State 0: pass through flits that don't contain routing keys
+  // State 1: buffer flits that do contain routing keys
+  // State 2: fetch routing beats
+  Reg#(Bit#(2)) consumeState <- mkReg(0);
+
+  // Count number of flits of message consumed so far
+  Reg#(Bit#(`LogFlitsPerMsg)) consumeFlitCount <- mkReg(0);
+
+  // Flit slot allocator
+  Vector#(`FetcherMsgsPerFlitBuffer, SetReset) flitBufferUsedSlots <-
+    mkSetReset(False);
+
+  // Chosen message slot
+  Reg#(FetcherFlitBufferMsgAddr) chosenReg <- mkRegU;
+
+  // Routing key of message consumed
+  Reg#(RoutingKey) consumeKey <- mkRegU;
+
+  // Maintain count of routing beats fetched so far
+  Reg#(Bit#(`LogRoutingEntryLen)) fetchBeatCount <- mkReg(0);
+
+  // State 0: pass through flits that don't contain routing keys
+  rule consumeMessage0 (consumeState == 0);
+    Flit flit = flitInPort.value;
+    // Find unused message slot
+    Bool found = False;
+    FetcherFlitBufferMsgAddr chosen = ?;
+    for (Integer i = 0; i < `FetcherMsgsPerFlitBuffer; i=i+1) 
+      if (flitBufferUsedSlots[i].value == 0) begin
+        found = True;
+        chosen = fromInteger(i);
+      end
+    chosenReg <= chosen;
+    // Initialise counters for subsequent states
+    flitCount <= 0;
+    fetchBeatCount<= 0;
+    // Consume flit
+    if (flitInPort.canGet) begin
+      if (flit.dest.addr.isKey) begin
+        if (found) begin
+          consumeState <= 1;
+        end
+      end else if (flitQueue.notFull) begin
+        // TODO: avoid conflict with interpreter stage
+        flitOutQueue.enq(flit);
+        flitInPort.get;
+      end
+    end
+  endrule
+
+  // State 1: buffer flits that do contain routing keys
+  rule consumeMessage1 (consumeState == 1);
+    Flit flit = flitInPort.value;
+    if (flitInPort.canGet) begin
+      flitInPort.get;
+      consumeKey <= getRoutingKey(flit.dest.addr);
+      // Write to flit buffer
+      flitBuffer.write({chosenReg, consumeFlitCount}, flit);
+      consumeFlitCount <= consumeFlitCount + 1;
+      // On final flit, move to fetch state
+      if (! flit.notFinalFlit) begin
+        consumeState <= 2;
+        // Claim chosen slot
+        flitBufferUsedSlots[chosenReg].set;
+      end
+    end
+  endrule
+
+  // State 2: fetch routing beats
+  rule consumeMessage2 (consumeState == 2);
+    // Have we finished fetching beats?
+    Bool finished = fetchBeatCount + `ProgRouterMaxBurst >= consumeKey.len;
+    // Prepare inflight RAM request info
+    // (to handle out of order resps from the RAMs)
+    InflightFetcherReqInfo info;
+    info.msgAddr = chosenReg;
+    info.burst = min(consumeKey.len - fetchBeatCount, `ProgRouterMaxBurst);
+    info.finalBurst = finished;
+    // Prepare RAM request
+    DRAMReq req;
+    req.isStore = False;
+    req.id = fromInteger(`DCachesPerDRAM + myId);
+    req.addr = {1'b0, consumeKey.ptr + fetchBeatCount};
+    req.data = zeroExtend(pack(info));
+    req.burst = info.burst;
+    // Don't overfetch (beat buffer has finite size)
+    if (ramReqQueue[consumeKey.ram].notFull &&
+          beatBufferLen.available >= zeroExtend(req.burst)) begin
+        ramReqQueue[consumeKey.ram].enq(req);
+        fetchBeatCount <= fetchBeatCount + req.burst;
+        beatBufferLen.incBy(zeroExtend(req.burst));
+        if (finished) consumeState <= 0;
+      end
+    end
+  endrule
+
+  // Stage 2: consume RAM responses
+  // ------------------------------
+
+
+
+endmodule
+
 // =============================================================================
 // Programmable router
 // =============================================================================

From 210449a886b3d7cebb227553753732fb769a9b91 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Fri, 27 Mar 2020 11:43:32 +0000
Subject: [PATCH 11/78] Fetcher complete

(But doesn't compile yet)
---
 rtl/ProgRouter.bsv | 290 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 274 insertions(+), 16 deletions(-)

diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index 8f203277..c5245254 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -24,6 +24,14 @@ typedef struct {
   Vector#(6, Bit#(40)) chunks;
 } RoutingBeat deriving (Bits);
 
+// Routing beat, tagged with the beat number in the DRAM burst
+typedef struct {
+  // Beat
+  RoutingBeat beat;
+  // Beat number
+  Bit#(`BeatBurstWidth) beatNum;
+} NumberedRoutingBeat deriving (Bits);
+
 // 32-bit routing key
 typedef struct {
   // Which off-chip RAM?
@@ -50,6 +58,13 @@ typedef enum {
   IND  = 3'd4  // 40-bit Indirection
 } RoutingRecordTag;
 
+typedef enum {
+  NORTH = 2'd0,
+  SOUTH = 2'd1,
+  EAST  = 2'd2,
+  WEST  = 2'd3,
+} RoutingDir;
+
 // 40-bit Unicast Router-to-Mailbox (URM1) record
 typedef struct {
   // Record type
@@ -83,7 +98,7 @@ typedef struct {
   // Record type
   RoutingRecordTag tag;
   // Direction (N, S, E, or W)
-  Bit#(2) dir;
+  RoutingDir dir;
   // Currently unused
   Bit#(3) unused;
   // New 32-bit routing key that will replace the one in the
@@ -113,6 +128,25 @@ typedef struct {
   Bit#(32) newKey;
 } INDRecord deriving (Bits);
 
+// It is sometimes convenient (though redundant) to record a routing
+// decision for a flit internally within the programmable router
+typedef struct {
+  // Normal flit
+  Flit flit;
+  // Routing decision for flit
+  RoutingDecision decision;
+} RoutedFlit deriving (Bits);
+
+// Routing decision
+typedef enum {
+  RouteNorth,
+  RouteSouth,
+  RouteEast,
+  RouteWest,
+  RouteNoC,
+  RouteLoop
+} RoutingDecision deriving (Bits, Eq);
+
 // =============================================================================
 // Design
 // =============================================================================
@@ -130,7 +164,7 @@ typedef struct {
 //             +---+ +---+ +---+ +---+ +---+     |
 //               |     |     |     |     |       |
 //             +---------------------------+     |
-//             |          Crossbar         |     |     Preliminary routing
+//             |          Crossbar         |     |     Routing
 //             +---------------------------+     |
 //               |     |     |     |     |       |
 //              N/L0  S/L1  E/L2  W/L3   Ind-----+     Output queues
@@ -198,22 +232,18 @@ typedef struct {
 interface Fetcher;
   // Incoming and outgoing flits
   interface In#(Flit) flitIn;
-  interface Out#(Flit) flitOut;
+  interface BOut#(RoutedFlit) flitOut;
   // Off-chip RAM connections
   Vector#(`DRAMsPerBoard, BOut#(DRAMReq)) ramReqs;
   Vector#(`DRAMsPerBoard, In#(DRAMResp)) ramResps;
 endinterface
 
 // Fetcher module
-module mkFetcher;
+module mkFetcher#(BoardId boardId) (Fetcher);
 
   // Flit input port
   InPort#(Flit)) flitInPort <- mkInPort;
 
-  // RAM response ports
-  Vector#(`DRAMsPerBoard, InPort#(DRAMResp)) ramRespPort <-
-    replicateM(mkInPort);
-
   // RAM request queues
   Vector#(`DRAMsPerBoard, Queue1#(DRAMReq)) ramReqQueue <-
     replicateM(mkUGShiftQueue(QueueOptFmax));
@@ -229,8 +259,21 @@ module mkFetcher;
   BlockRam#(FetcherFlitBufferAddr, Flit) flitBuffer <- mkBlockRam;
 
   // Beat buffer
-  SizedQueue#(`LogProgRouterBeatBufferSize, RoutingBeat)) beatBuffer <-
-    replicateM(mkUGSizedQueue);
+  SizedQueue#(`LogProgRouterBeatBufferSize, NumberedRoutingBeat))
+    beatBuffer <- replicateM(mkUGSizedQueue);
+
+  // Track length of beat buffer, so that we don't overfetch
+  Count#(TAdd#(`LogProgRouterBeatBufferSize, 1)) beatBufferLen <-
+      mkCount(2 ** `LogProgRouterBeatBufferSize);
+
+  // For flits whose destinations are *not* routing keys
+  Queue1#(RoutedFlit) flitBypassQueue <- mkUGShiftQueue(QueueOptFmax);
+
+  // For flits whose destinations are routing keys
+  Queue1#(RoutedFlit) flitProcessedQueue <- mkUGShiftQueue(QueueOptFmax);
+
+  // Final output queue for flits
+  Queue1#(RoutedFlit) flitOutQueue <- mkUGShiftQueue(QueueOptFmax);
 
   // Stage 1: consume input message
   // ------------------------------
@@ -278,10 +321,19 @@ module mkFetcher;
         if (found) begin
           consumeState <= 1;
         end
-      end else if (flitQueue.notFull) begin
-        // TODO: avoid conflict with interpreter stage
-        flitOutQueue.enq(flit);
+      end else if (flitBypassQueue.notFull) begin
         flitInPort.get;
+        // Make routing decision
+        RoutingDecision decision = RouteLocal;
+        MailboxNetAddr = flit.dest.addr;
+        if (a.addr.host.valid)
+          decision = addr.host.value == 0 ? RouteWest : RouteEast;
+        else if (addr.board.x < boardId.x) decision = RouteWest;
+        else if (addr.board.x > boardId.x) decision = RouteEast;
+        else if (addr.board.y < boardId.y) decision = RouteSouth;
+        else if (addr.board.y > boardId.y) decision = RouteNorth;
+        // Insert into bypass queue
+        flitBypassQueue.enq(RoutedFlit { decision: decision, flit: flit});
       end
     end
   endrule
@@ -319,7 +371,7 @@ module mkFetcher;
     req.isStore = False;
     req.id = fromInteger(`DCachesPerDRAM + myId);
     req.addr = {1'b0, consumeKey.ptr + fetchBeatCount};
-    req.data = zeroExtend(pack(info));
+    req.data = {?, pack(info)};
     req.burst = info.burst;
     // Don't overfetch (beat buffer has finite size)
     if (ramReqQueue[consumeKey.ram].notFull &&
@@ -332,10 +384,214 @@ module mkFetcher;
     end
   endrule
 
-  // Stage 2: consume RAM responses
-  // ------------------------------
+  // Stage 2: interpret routing beats
+  // --------------------------------
+
+  // Merge responses from each RAM
+  staticAssert(`DRAMsPerBoard == 2,
+    "Fetcher: need to generalise number of RAMs used")
+  MergeUnit#(NumberedRoutingBeat) ramRespMerger <- mkMergeUnitFair;
+
+  // Convert a RAM response to a numbered routing beat
+  function NumberedRoutingBeat fromDRAMResp(DRAMResp resp) =
+    NumberedRoutingBeat {
+      beat: unpack(resp.data)
+    , beatNum: resp.beat
+    };
+
+  // Create RAM response input interfaces for this module
+  In#(DRAMResp) respA <- onIn(fromDRAMResp, ramRespMerger.inA);
+  In#(DRAMResp) respB <- onIn(fromDRAMResp, ramRespMerger.inB);
+  Vector#(`DRAMsPerBoard, In#(DRAMResp)) ramResps = vector(respA, respB);
+
+  // Connect the merger to the beat buffer
+  connectToQueue(ramRespMerger.out, beatBuffer);
+
+  // Count number of flits of message emitted so far
+  Reg#(Bit#(`LogFlitsPerMsg)) emitFlitCount <- mkReg(0);
+
+  // Count number of records processed so far in current beat
+  Reg#(Bit#(3)) recordCount <- mkReg(0);
+
+  // (Shift) register holding current routing beat
+  Reg#(NumberedRoutingBeat) beatReg <- mkRegU;
+
+  // Interpreter state
+  // 0: register the routing beat and fetch first flit
+  // 1: interpret flits
+  Reg#(Bit#(1)) interpreterState <- mkReg(0);
+
+  // State 0: register the routing beat and fetch first flit
+  rule interpreter0 (interpreterState == 0);
+    let beat = beatBuffer.dataOut.beat;
+    InflightFetcherReqInfo info = unpack(truncate(beat.info));
+    // Consume beat
+    if (beatBuffer.canDeq && beatBuffer.canPeek) begin
+      beatReg <= beatBuffer.dataOut;
+      beatBuffer.deq;
+      interpreterState <= 1;
+    end
+    // Load first flit
+    flitBuffer.load({info.msgAddr, 0});
+    emitFlitCount <= 0;
+    recordCount <= 0;
+  endrule
+
+  // State 1: interpret flits
+  rule interpreter1 (interpreterState == 1);
+    // Extract details of registered routing beat
+    let beat = beatReg.beat;
+    let beatNum = beatReg.beatNum;
+    InflightFetcherReqInfo info = unpack(truncate(beat.info));
+    // Extract tag from next record
+    RoutingRecordTag tag = beat.chunks[5].tag;
+    // Is this the first flit of a message?
+    Bool firstFlit = emitFlitCount == 0;
+    // Modify flit by interpreting routing key
+    RoutingDecision decision = ?;
+    Flit flit = flitBuffer.dataOut;
+    case (tag)
+      // 40-bit Unicast Router-to-Mailbox
+      URM1: begin
+        URM1Record rec = unpack(beat.chunks[5]);
+        flit.dest.addr.isKey = False;
+        flit.dest.addr.mbox = rec.mbox;
+        Vector#(`ThreadsPerMailbox, Bool) threadMask = newVector;
+        for (Integer j = 0; j < `ThreadsPerMailbox; j=j+1)
+          threadMask = rec.thread == fromInteger(j);
+        flit.dest.threads = pack(threadMask);
+        // Replace first word of message with local key
+        if (firstFlit)
+          flit.payload = {truncateLSB(flit.payload), 5'b0, rec.localKey};
+        decision = RouteLocal;
+      end
+      // 80-bit Unicast Router-to-Mailbox
+      URM2: begin
+        URM2Record rec = unpack({beat.chunks[5], beat.chunks[4]});
+        flit.dest.addr.isKey = False;
+        flit.dest.addr.mbox = rec.mbox;
+        Vector#(`ThreadsPerMailbox, Bool) threadMask = newVector;
+        for (Integer j = 0; j < `ThreadsPerMailbox; j=j+1)
+          threadMask = rec.thread == fromInteger(j);
+        flit.dest.threads = pack(threadMask);
+        // Replace first two words of message with local key
+        if (firstFlit)
+          flit.payload = {truncateLSB(flit.payload), rec.localKey};
+        decision = RouteLocal;
+      end
+      // 40-bit Router-to-Router
+      RR: begin
+        RRRecord rec = unpack(beat.chunks[5]);
+        case (rec.dir)
+          NORTH: begin
+            decision = RouteNorth;
+            flit.dest.board = BoardId {x: boardId.x, y: boardId.y+1};
+          end
+          SOUTH: begin
+            decision = RouteSouth;
+            flit.dest.board = BoardId {x: boardId.x, y: boardId.y-1};
+          end
+          EAST: begin
+            decision = RouteEast;
+            flit.dest.board = BoardId {x: boardId.x+1, y: boardId.y};
+          end
+          WEST: begin
+            decision = RouteWest;
+            flit.dest.board = BoardId {x: boardId.x-1, y: boardId.y};
+          end
+        endcase
+        flit.dest.threads = {?, rec.newKey};
+      end
+      // 80-bit Multicast Router-to-Mailbox
+      MRM: begin
+        MRMRecord rec = unpack({beat.chunks[5], beat.chunks[4]});
+        flit.dest.addr.isKey = False;
+        flit.dest.addr.mbox = rec.mbox;
+        flit.dest.threads = rec.destMask;
+        decision = RouteLocal;
+      end
+      // 40-bit Indirection
+      IND: begin
+        INDRecord rec = unpack(beat.chunks[5]);
+        flit.dest.threads = {?, rec.newKey};
+        decision = RouteLoop;
+      end
+    end
+    // Is output queue ready for new flit?
+    Bool emit = flitProcessedQueue.notFull;
+    Bool newFlitCount = emitFlitCount;
+    // Consume routing record
+    if (emit) begin
+      flitProcessedQueue.enq(RoutedFlit { decision: decision, flit: flit });
+      // Move to next record
+      recordCount <= recordCount + 1;
+      // Shift beat to point to next record
+      RoutingBeat newBeat = beat;
+      Bool doubleChunk = unpack(pack(tag)[0]);
+      if (doubleChunk) begin
+        for (Integer i = 5; i > 2; i=i-2) begin
+          newBeat.chunks[i] = beat.chunks[i-2];
+          newBeat.chunks[i-1] = beat.chunks[i-3];
+        end
+      end else begin
+        for (Integer i = 5; i > 0; i=i-1)
+          newBeat.chunks[i] = beat.chunks[i-1];
+      end
+      beatReg <= NumberedRoutingBeat { beatNum: beatNum, beat: newBeat };
+      // Is this the final record in the beat?
+      if ((recordCount+1) == beat.size) begin
+        interpreterState <= 0;
+        // Have we finished with this message yet?
+        if (info.finalBurst && info.burst == (beatNum+1)) begin
+          // Reclaim message slot in flit buffer
+          flitBufferUsedSlots[info.msgAddr].clear;
+        end
+      end
+      // Is this the final flit in the message?
+      if (flit.notFinalFlit)
+        newFlitCount = emitFlitCount + 1;
+      else
+        newFlitCount = 0;
+    end
+    // Issue flit load request
+    flitBuffer.load({info.msgAddr, newFlitCount});
+    emitFlitCount <= newFlitCount;
+  endrule
+
+  // Stage 3: merge output queues
+  // ----------------------------
+
+  // We want to merge messages, not flits
+  // Are we in the middle of consuming a message?
+  Reg#(Bool) mergeInProgress <- mkReg(False);
+  Reg#(Bool) prevFromBypass <- mkReg(False);
+
+  rule merge (flitOutQueue.notFull);
+    // Favour the bypass queue
+    Bool chooseBypass = mergeInProgress ? prevFromBypass :
+      flitBypassQueue.canDeq;
+    if (chooseBypass) begin
+      if (flitBypassQueue.canDeq) begin
+        flitBypassQueue.deq;
+        flitOutQueue.enq(flitBypassQueue.dataOut);
+        mergeInProgress <= flitBypassQueue.dataOut.flit.notFinalFlit;
+        prevFromBypass = True;
+      end
+    end else if (flitProcessedQueue.canDeq) begin
+      flitProcessedQueue.deq;
+      flitOutQueue.enq(flitProcessedQueue.dataOut);
+      mergeInProgress <= flitProcessedQueue.dataOut.flit.notFinalFlit;
+      prevFromBypass = False;
+    end
+  endrule;
 
+  // Interfaces
+  // -----------
 
+  interface flitIn = flitInPort.in;
+  interface flitOut = queueToBOut(flitOutQueue);
+  interface ramReqs = map(queueToBOut, ramReqQueue);
+  interface ramResps = ramResps;
 
 endmodule
 
@@ -355,6 +611,7 @@ interface ProgRouter;
     Vector#(`FetchersPerProgRouter, In#(DRAMResp))) ramResps;
 endinterface
 
+/*
 module mkProgRouter (ProgRouter);
 
   // Flit input ports
@@ -362,5 +619,6 @@ module mkProgRouter (ProgRouter);
     replicateM(mkInPort);
 
 endmodule
+*/
 
 endpackage

From c294bebfa647e6e7a8858e00badb9998cbcda7b3 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Fri, 27 Mar 2020 14:43:42 +0000
Subject: [PATCH 12/78] Fetcher compiles

---
 config.py          |   9 +++
 rtl/Network.bsv    |   1 +
 rtl/ProgRouter.bsv | 153 ++++++++++++++++++++++++---------------------
 3 files changed, 93 insertions(+), 70 deletions(-)

diff --git a/config.py b/config.py
index 7f931637..919ab4a8 100755
--- a/config.py
+++ b/config.py
@@ -161,6 +161,15 @@ def quoted(s): return "'\"" + s + "\"'"
 p["SRAMLogMaxInFlight"] = 5
 p["SRAMStoreLatency"] = 2
 
+# Programmable router parameters:
+p["LogRoutingEntryLen"] = 5 # Number of beats in a routing table entry
+p["ProgRouterMaxBurst"] = 4
+p["FetcherLogBeatBufferSize"] = 5
+p["FetcherLogFlitBufferSize"] = 5
+p["FetcherLogMsgsPerFlitBuffer"] = (
+  p["FetcherLogFlitBufferSize"] - p["LogMaxFlitsPerMsg"])
+p["FetcherMsgsPerFlitBuffer"] = 2 ** p["FetcherLogMsgsPerFlitBuffer"]
+
 # Enable performance counters
 p["EnablePerfCount"] = True
 
diff --git a/rtl/Network.bsv b/rtl/Network.bsv
index 99229c96..6706d00f 100644
--- a/rtl/Network.bsv
+++ b/rtl/Network.bsv
@@ -25,6 +25,7 @@ import IdleDetector :: *;
 import FlitMerger   :: *;
 import OffChipRAM   :: *;
 import DRAM         :: *;
+import ProgRouter   :: *;
 
 // =============================================================================
 // Mesh Router
diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index c5245254..b5de42ef 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -2,6 +2,15 @@
 // Functions, data types, and modules for programmable routers
 package ProgRouter;
 
+import Globals   :: *;
+import Util      :: *;
+import DRAM      :: *;
+import Vector    :: *;
+import Queue     :: *;
+import Interface :: *;
+import BlockRam  :: *;
+import Assert    :: *;
+
 // =============================================================================
 // Routing keys and beats
 // =============================================================================
@@ -24,18 +33,10 @@ typedef struct {
   Vector#(6, Bit#(40)) chunks;
 } RoutingBeat deriving (Bits);
 
-// Routing beat, tagged with the beat number in the DRAM burst
-typedef struct {
-  // Beat
-  RoutingBeat beat;
-  // Beat number
-  Bit#(`BeatBurstWidth) beatNum;
-} NumberedRoutingBeat deriving (Bits);
-
 // 32-bit routing key
 typedef struct {
   // Which off-chip RAM?
-  Bit#(`LogDRAMsPerBoard) ram
+  Bit#(`LogDRAMsPerBoard) ram;
   // Pointer to array of routing beats containing routing records
   Bit#(`LogBeatsPerDRAM) ptr;
   // Number of beats in the array
@@ -56,14 +57,14 @@ typedef enum {
   RR   = 3'd2, // 40-bit Router-to-Router
   MRM  = 3'd3, // 80-bit Multicast Router-to-Mailbox
   IND  = 3'd4  // 40-bit Indirection
-} RoutingRecordTag;
+} RoutingRecordTag deriving (Bits, Eq);
 
 typedef enum {
   NORTH = 2'd0,
   SOUTH = 2'd1,
   EAST  = 2'd2,
-  WEST  = 2'd3,
-} RoutingDir;
+  WEST  = 2'd3
+} RoutingDir deriving (Bits, Eq);
 
 // 40-bit Unicast Router-to-Mailbox (URM1) record
 typedef struct {
@@ -228,21 +229,31 @@ typedef struct {
   Bool finalBurst;
 } InflightFetcherReqInfo deriving (Bits);
 
+// Routing beat, tagged with the beat number in the DRAM burst
+typedef struct {
+  // Beat
+  RoutingBeat beat;
+  // Beat number
+  Bit#(`BeatBurstWidth) beatNum;
+  // Inflight request info
+  InflightFetcherReqInfo info;
+} NumberedRoutingBeat deriving (Bits);
+
 // Fetcher interface
 interface Fetcher;
   // Incoming and outgoing flits
   interface In#(Flit) flitIn;
   interface BOut#(RoutedFlit) flitOut;
   // Off-chip RAM connections
-  Vector#(`DRAMsPerBoard, BOut#(DRAMReq)) ramReqs;
-  Vector#(`DRAMsPerBoard, In#(DRAMResp)) ramResps;
+  interface Vector#(`DRAMsPerBoard, BOut#(DRAMReq)) ramReqs;
+  interface Vector#(`DRAMsPerBoard, In#(DRAMResp)) ramResps;
 endinterface
 
 // Fetcher module
-module mkFetcher#(BoardId boardId) (Fetcher);
+module mkFetcher#(Integer fetcherId, BoardId boardId) (Fetcher);
 
   // Flit input port
-  InPort#(Flit)) flitInPort <- mkInPort;
+  InPort#(Flit) flitInPort <- mkInPort;
 
   // RAM request queues
   Vector#(`DRAMsPerBoard, Queue1#(DRAMReq)) ramReqQueue <-
@@ -259,12 +270,12 @@ module mkFetcher#(BoardId boardId) (Fetcher);
   BlockRam#(FetcherFlitBufferAddr, Flit) flitBuffer <- mkBlockRam;
 
   // Beat buffer
-  SizedQueue#(`LogProgRouterBeatBufferSize, NumberedRoutingBeat))
-    beatBuffer <- replicateM(mkUGSizedQueue);
+  SizedQueue#(`FetcherLogBeatBufferSize, NumberedRoutingBeat)
+    beatBuffer <- mkUGSizedQueue;
 
   // Track length of beat buffer, so that we don't overfetch
-  Count#(TAdd#(`LogProgRouterBeatBufferSize, 1)) beatBufferLen <-
-      mkCount(2 ** `LogProgRouterBeatBufferSize);
+  Count#(TAdd#(`FetcherLogBeatBufferSize, 1)) beatBufferLen <-
+      mkCount(2 ** `FetcherLogBeatBufferSize);
 
   // For flits whose destinations are *not* routing keys
   Queue1#(RoutedFlit) flitBypassQueue <- mkUGShiftQueue(QueueOptFmax);
@@ -285,11 +296,11 @@ module mkFetcher#(BoardId boardId) (Fetcher);
   Reg#(Bit#(2)) consumeState <- mkReg(0);
 
   // Count number of flits of message consumed so far
-  Reg#(Bit#(`LogFlitsPerMsg)) consumeFlitCount <- mkReg(0);
+  Reg#(Bit#(`LogMaxFlitsPerMsg)) consumeFlitCount <- mkReg(0);
 
   // Flit slot allocator
   Vector#(`FetcherMsgsPerFlitBuffer, SetReset) flitBufferUsedSlots <-
-    mkSetReset(False);
+    replicateM(mkSetReset(False));
 
   // Chosen message slot
   Reg#(FetcherFlitBufferMsgAddr) chosenReg <- mkRegU;
@@ -307,14 +318,14 @@ module mkFetcher#(BoardId boardId) (Fetcher);
     Bool found = False;
     FetcherFlitBufferMsgAddr chosen = ?;
     for (Integer i = 0; i < `FetcherMsgsPerFlitBuffer; i=i+1) 
-      if (flitBufferUsedSlots[i].value == 0) begin
+      if (! flitBufferUsedSlots[i].value) begin
         found = True;
         chosen = fromInteger(i);
       end
     chosenReg <= chosen;
     // Initialise counters for subsequent states
-    flitCount <= 0;
-    fetchBeatCount<= 0;
+    consumeFlitCount <= 0;
+    fetchBeatCount <= 0;
     // Consume flit
     if (flitInPort.canGet) begin
       if (flit.dest.addr.isKey) begin
@@ -324,9 +335,9 @@ module mkFetcher#(BoardId boardId) (Fetcher);
       end else if (flitBypassQueue.notFull) begin
         flitInPort.get;
         // Make routing decision
-        RoutingDecision decision = RouteLocal;
-        MailboxNetAddr = flit.dest.addr;
-        if (a.addr.host.valid)
+        RoutingDecision decision = RouteNoC;
+        MailboxNetAddr addr = flit.dest.addr;
+        if (addr.host.valid)
           decision = addr.host.value == 0 ? RouteWest : RouteEast;
         else if (addr.board.x < boardId.x) decision = RouteWest;
         else if (addr.board.x > boardId.x) decision = RouteEast;
@@ -343,7 +354,7 @@ module mkFetcher#(BoardId boardId) (Fetcher);
     Flit flit = flitInPort.value;
     if (flitInPort.canGet) begin
       flitInPort.get;
-      consumeKey <= getRoutingKey(flit.dest.addr);
+      consumeKey <= getRoutingKey(flit.dest);
       // Write to flit buffer
       flitBuffer.write({chosenReg, consumeFlitCount}, flit);
       consumeFlitCount <= consumeFlitCount + 1;
@@ -359,28 +370,28 @@ module mkFetcher#(BoardId boardId) (Fetcher);
   // State 2: fetch routing beats
   rule consumeMessage2 (consumeState == 2);
     // Have we finished fetching beats?
-    Bool finished = fetchBeatCount + `ProgRouterMaxBurst >= consumeKey.len;
+    Bool finished = fetchBeatCount+`ProgRouterMaxBurst >= consumeKey.numBeats;
     // Prepare inflight RAM request info
     // (to handle out of order resps from the RAMs)
     InflightFetcherReqInfo info;
     info.msgAddr = chosenReg;
-    info.burst = min(consumeKey.len - fetchBeatCount, `ProgRouterMaxBurst);
+    info.burst = truncate(
+      min(consumeKey.numBeats - fetchBeatCount, `ProgRouterMaxBurst));
     info.finalBurst = finished;
     // Prepare RAM request
     DRAMReq req;
     req.isStore = False;
-    req.id = fromInteger(`DCachesPerDRAM + myId);
-    req.addr = {1'b0, consumeKey.ptr + fetchBeatCount};
+    req.id = fromInteger(`DCachesPerDRAM + fetcherId);
+    req.addr = {1'b0, consumeKey.ptr + zeroExtend(fetchBeatCount)};
     req.data = {?, pack(info)};
     req.burst = info.burst;
     // Don't overfetch (beat buffer has finite size)
     if (ramReqQueue[consumeKey.ram].notFull &&
           beatBufferLen.available >= zeroExtend(req.burst)) begin
-        ramReqQueue[consumeKey.ram].enq(req);
-        fetchBeatCount <= fetchBeatCount + req.burst;
-        beatBufferLen.incBy(zeroExtend(req.burst));
-        if (finished) consumeState <= 0;
-      end
+      ramReqQueue[consumeKey.ram].enq(req);
+      fetchBeatCount <= fetchBeatCount + zeroExtend(req.burst);
+      beatBufferLen.incBy(zeroExtend(req.burst));
+      if (finished) consumeState <= 0;
     end
   endrule
 
@@ -389,7 +400,7 @@ module mkFetcher#(BoardId boardId) (Fetcher);
 
   // Merge responses from each RAM
   staticAssert(`DRAMsPerBoard == 2,
-    "Fetcher: need to generalise number of RAMs used")
+    "Fetcher: need to generalise number of RAMs used");
   MergeUnit#(NumberedRoutingBeat) ramRespMerger <- mkMergeUnitFair;
 
   // Convert a RAM response to a numbered routing beat
@@ -397,18 +408,19 @@ module mkFetcher#(BoardId boardId) (Fetcher);
     NumberedRoutingBeat {
       beat: unpack(resp.data)
     , beatNum: resp.beat
+    , info: unpack(truncate(resp.info))
     };
 
   // Create RAM response input interfaces for this module
   In#(DRAMResp) respA <- onIn(fromDRAMResp, ramRespMerger.inA);
   In#(DRAMResp) respB <- onIn(fromDRAMResp, ramRespMerger.inB);
-  Vector#(`DRAMsPerBoard, In#(DRAMResp)) ramResps = vector(respA, respB);
+  Vector#(`DRAMsPerBoard, In#(DRAMResp)) ramRespsOut = vector(respA, respB);
 
   // Connect the merger to the beat buffer
   connectToQueue(ramRespMerger.out, beatBuffer);
 
   // Count number of flits of message emitted so far
-  Reg#(Bit#(`LogFlitsPerMsg)) emitFlitCount <- mkReg(0);
+  Reg#(Bit#(`LogMaxFlitsPerMsg)) emitFlitCount <- mkReg(0);
 
   // Count number of records processed so far in current beat
   Reg#(Bit#(3)) recordCount <- mkReg(0);
@@ -423,16 +435,16 @@ module mkFetcher#(BoardId boardId) (Fetcher);
 
   // State 0: register the routing beat and fetch first flit
   rule interpreter0 (interpreterState == 0);
-    let beat = beatBuffer.dataOut.beat;
-    InflightFetcherReqInfo info = unpack(truncate(beat.info));
+    let beat = beatBuffer.dataOut;
+    InflightFetcherReqInfo info = beat.info;
     // Consume beat
     if (beatBuffer.canDeq && beatBuffer.canPeek) begin
-      beatReg <= beatBuffer.dataOut;
+      beatReg <= beat;
       beatBuffer.deq;
       interpreterState <= 1;
     end
     // Load first flit
-    flitBuffer.load({info.msgAddr, 0});
+    flitBuffer.read({info.msgAddr, 0});
     emitFlitCount <= 0;
     recordCount <= 0;
   endrule
@@ -442,9 +454,9 @@ module mkFetcher#(BoardId boardId) (Fetcher);
     // Extract details of registered routing beat
     let beat = beatReg.beat;
     let beatNum = beatReg.beatNum;
-    InflightFetcherReqInfo info = unpack(truncate(beat.info));
+    let info = beatReg.info;
     // Extract tag from next record
-    RoutingRecordTag tag = beat.chunks[5].tag;
+    RoutingRecordTag tag = unpack(truncateLSB(beat.chunks[5]));
     // Is this the first flit of a message?
     Bool firstFlit = emitFlitCount == 0;
     // Modify flit by interpreting routing key
@@ -455,29 +467,29 @@ module mkFetcher#(BoardId boardId) (Fetcher);
       URM1: begin
         URM1Record rec = unpack(beat.chunks[5]);
         flit.dest.addr.isKey = False;
-        flit.dest.addr.mbox = rec.mbox;
+        flit.dest.addr.mbox = unpack(rec.mbox);
         Vector#(`ThreadsPerMailbox, Bool) threadMask = newVector;
         for (Integer j = 0; j < `ThreadsPerMailbox; j=j+1)
-          threadMask = rec.thread == fromInteger(j);
+          threadMask[j] = rec.thread == fromInteger(j);
         flit.dest.threads = pack(threadMask);
         // Replace first word of message with local key
         if (firstFlit)
           flit.payload = {truncateLSB(flit.payload), 5'b0, rec.localKey};
-        decision = RouteLocal;
+        decision = RouteNoC;
       end
       // 80-bit Unicast Router-to-Mailbox
       URM2: begin
         URM2Record rec = unpack({beat.chunks[5], beat.chunks[4]});
         flit.dest.addr.isKey = False;
-        flit.dest.addr.mbox = rec.mbox;
+        flit.dest.addr.mbox = unpack(rec.mbox);
         Vector#(`ThreadsPerMailbox, Bool) threadMask = newVector;
         for (Integer j = 0; j < `ThreadsPerMailbox; j=j+1)
-          threadMask = rec.thread == fromInteger(j);
+          threadMask[j] = rec.thread == fromInteger(j);
         flit.dest.threads = pack(threadMask);
         // Replace first two words of message with local key
         if (firstFlit)
           flit.payload = {truncateLSB(flit.payload), rec.localKey};
-        decision = RouteLocal;
+        decision = RouteNoC;
       end
       // 40-bit Router-to-Router
       RR: begin
@@ -485,19 +497,19 @@ module mkFetcher#(BoardId boardId) (Fetcher);
         case (rec.dir)
           NORTH: begin
             decision = RouteNorth;
-            flit.dest.board = BoardId {x: boardId.x, y: boardId.y+1};
+            flit.dest.addr.board = BoardId {x: boardId.x, y: boardId.y+1};
           end
           SOUTH: begin
             decision = RouteSouth;
-            flit.dest.board = BoardId {x: boardId.x, y: boardId.y-1};
+            flit.dest.addr.board = BoardId {x: boardId.x, y: boardId.y-1};
           end
           EAST: begin
             decision = RouteEast;
-            flit.dest.board = BoardId {x: boardId.x+1, y: boardId.y};
+            flit.dest.addr.board = BoardId {x: boardId.x+1, y: boardId.y};
           end
           WEST: begin
             decision = RouteWest;
-            flit.dest.board = BoardId {x: boardId.x-1, y: boardId.y};
+            flit.dest.addr.board = BoardId {x: boardId.x-1, y: boardId.y};
           end
         endcase
         flit.dest.threads = {?, rec.newKey};
@@ -506,9 +518,9 @@ module mkFetcher#(BoardId boardId) (Fetcher);
       MRM: begin
         MRMRecord rec = unpack({beat.chunks[5], beat.chunks[4]});
         flit.dest.addr.isKey = False;
-        flit.dest.addr.mbox = rec.mbox;
+        flit.dest.addr.mbox = unpack(rec.mbox);
         flit.dest.threads = rec.destMask;
-        decision = RouteLocal;
+        decision = RouteNoC;
       end
       // 40-bit Indirection
       IND: begin
@@ -516,10 +528,10 @@ module mkFetcher#(BoardId boardId) (Fetcher);
         flit.dest.threads = {?, rec.newKey};
         decision = RouteLoop;
       end
-    end
+    endcase
     // Is output queue ready for new flit?
     Bool emit = flitProcessedQueue.notFull;
-    Bool newFlitCount = emitFlitCount;
+    let newFlitCount = emitFlitCount;
     // Consume routing record
     if (emit) begin
       flitProcessedQueue.enq(RoutedFlit { decision: decision, flit: flit });
@@ -537,9 +549,10 @@ module mkFetcher#(BoardId boardId) (Fetcher);
         for (Integer i = 5; i > 0; i=i-1)
           newBeat.chunks[i] = beat.chunks[i-1];
       end
-      beatReg <= NumberedRoutingBeat { beatNum: beatNum, beat: newBeat };
+      beatReg <= NumberedRoutingBeat {
+        beat: newBeat, beatNum: beatNum, info: info };
       // Is this the final record in the beat?
-      if ((recordCount+1) == beat.size) begin
+      if ((recordCount+1) == truncate(beat.size)) begin
         interpreterState <= 0;
         // Have we finished with this message yet?
         if (info.finalBurst && info.burst == (beatNum+1)) begin
@@ -554,7 +567,7 @@ module mkFetcher#(BoardId boardId) (Fetcher);
         newFlitCount = 0;
     end
     // Issue flit load request
-    flitBuffer.load({info.msgAddr, newFlitCount});
+    flitBuffer.read({info.msgAddr, newFlitCount});
     emitFlitCount <= newFlitCount;
   endrule
 
@@ -575,15 +588,15 @@ module mkFetcher#(BoardId boardId) (Fetcher);
         flitBypassQueue.deq;
         flitOutQueue.enq(flitBypassQueue.dataOut);
         mergeInProgress <= flitBypassQueue.dataOut.flit.notFinalFlit;
-        prevFromBypass = True;
+        prevFromBypass <= True;
       end
     end else if (flitProcessedQueue.canDeq) begin
       flitProcessedQueue.deq;
       flitOutQueue.enq(flitProcessedQueue.dataOut);
       mergeInProgress <= flitProcessedQueue.dataOut.flit.notFinalFlit;
-      prevFromBypass = False;
+      prevFromBypass <= False;
     end
-  endrule;
+  endrule
 
   // Interfaces
   // -----------
@@ -591,7 +604,7 @@ module mkFetcher#(BoardId boardId) (Fetcher);
   interface flitIn = flitInPort.in;
   interface flitOut = queueToBOut(flitOutQueue);
   interface ramReqs = map(queueToBOut, ramReqQueue);
-  interface ramResps = ramResps;
+  interface ramResps = ramRespsOut;
 
 endmodule
 
@@ -601,8 +614,8 @@ endmodule
 
 interface ProgRouter;
   // Incoming and outgoing flits
-  interface Vector#(`FetchersPerProgRouter, In#(Flit) flitIn);
-  interface Vector#(`FetchersPerProgRouter, Out#(Flit) flitOut);
+  interface Vector#(`FetchersPerProgRouter, In#(Flit)) flitIn;
+  interface Vector#(`FetchersPerProgRouter, Out#(Flit)) flitOut;
 
   // Interface to off-chip memory
   interface Vector#(`DRAMsPerBoard,

From 129c22cedab4693ee2f4e9053142af1a11d59f80 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Sat, 28 Mar 2020 09:49:32 +0000
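Subject: [PATCH 13/78] Fix to previous commit

Decrement beatBufferLen when a beat is dequeued from the beat buffer in
interpreter0, so that the space reserved when the fetch was issued is
released again.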

---
 rtl/ProgRouter.bsv | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index b5de42ef..3bbfa613 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -441,6 +441,7 @@ module mkFetcher#(Integer fetcherId, BoardId boardId) (Fetcher);
     if (beatBuffer.canDeq && beatBuffer.canPeek) begin
       beatReg <= beat;
       beatBuffer.deq;
+      beatBufferLen.dec;
       interpreterState <= 1;
     end
     // Load first flit

From 6551259d9bd161eb2c0f10c2206edd7810fdab7c Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Tue, 31 Mar 2020 11:29:31 +0100
Subject: [PATCH 14/78] ProgRouter module complete

(Compiles but untested)
---
 config.py          |   1 -
 rtl/ProgRouter.bsv | 198 ++++++++++++++++++++++++++++++++++++++++++---
 rtl/Util.bsv       |  20 +++++
 3 files changed, 208 insertions(+), 11 deletions(-)

diff --git a/config.py b/config.py
index 919ab4a8..50bc3480 100755
--- a/config.py
+++ b/config.py
@@ -369,7 +369,6 @@ def quoted(s): return "'\"" + s + "\"'"
 # Number of FPGA boards per box (including bridge board)
 p["BoardsPerBox"] = p["MeshXLenWithinBox"] * p["MeshYLenWithinBox"] + 1
 
-# Number of fetchers in the per-board programmable router
 # Parameters for programmable routers
 # (and the routing-record fetchers they contain)
 p["FetchersPerProgRouter"] = 5
diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index 3bbfa613..6a038e9d 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -10,6 +10,7 @@ import Queue     :: *;
 import Interface :: *;
 import BlockRam  :: *;
 import Assert    :: *;
+import Util      :: *;
 
 // =============================================================================
 // Routing keys and beats
@@ -171,7 +172,7 @@ typedef enum {
 //              N/L0  S/L1  E/L2  W/L3   Ind-----+     Output queues
 //               |     |     |     |
 //             +---------------------------+
-//             |          Expander         |           Final expansion
+//             |          Splitter         |           Final splitting
 //             +---------------------------+
 //               |  |  |  |  |  |  |  |
 //               N  S  E  W  L0 L1 L2 L3               Output flits
@@ -199,7 +200,7 @@ typedef enum {
 // fair crossbar which organises them by destination into output
 // queues.  To reduce logic, we allow each inter-board link to share
 // an output queue with a local link, as this does not compromise
-// forward progress.  Finally the queues are expanded to provide an
+// forward progress.  Finally the queues are split to provide an
 // output stream for each possible destination.
 
 // =============================================================================
@@ -250,7 +251,7 @@ interface Fetcher;
 endinterface
 
 // Fetcher module
-module mkFetcher#(Integer fetcherId, BoardId boardId) (Fetcher);
+module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
 
   // Flit input port
   InPort#(Flit) flitInPort <- mkInPort;
@@ -609,6 +610,125 @@ module mkFetcher#(Integer fetcherId, BoardId boardId) (Fetcher);
 
 endmodule
 
+// =============================================================================
+// Crossbar
+// =============================================================================
+
+// Selector function for a mux in the programmable router crossbar
+typedef function Bool selector(RoutedFlit flit) SelectorFunc;
+
+module mkProgRouterCrossbar#(
+         Vector#(n, SelectorFunc) f,
+         Vector#(n, BOut#(RoutedFlit)) out)
+           (Vector#(n, BOut#(RoutedFlit)))
+  provisos (Add#(a_, 1, n));
+
+  // Input ports
+  Vector#(n, InPort#(RoutedFlit)) inPort <- replicateM(mkInPort);
+
+  // Connect up input ports
+  for (Integer i = 0; i < valueOf(n); i=i+1)
+    connectDirect(out[i], inPort[i].in);
+
+  // Cosume wires, for each input port
+  Vector#(n, PulseWire) consumeWire<- replicateM(mkPulseWireOR);
+
+  // Keep track of service history for flit sources (for fair selection)
+  Vector#(n, Reg#(Bit#(n))) hist <- replicateM(mkReg(0));
+
+  // Current choice of flit source
+  Vector#(n, Reg#(Bit#(n))) choiceReg <- replicateM(mkReg(0));
+
+  // Output queue
+  Vector#(n, Queue1#(RoutedFlit)) outQueue <-
+    replicateM(mkUGShiftQueue(QueueOptFmax));
+
+  // Selector mux for each out queue
+  for (Integer i = 0; i < valueOf(n); i=i+1) begin
+
+    rule select;
+      // Vector of input flits and available flits
+      Vector#(n, RoutedFlit) flits = newVector;
+      Vector#(n, Bool) avails = newVector;
+      for (Integer i = 0; i < valueOf(n); i=i+1) begin
+        flits[i] = inPort[i].value;
+        avails[i] = f[i](inPort[i].value) && inPort[i].canGet;
+      end
+      Bit#(n) avail = pack(avails);
+      // Choose a new source using fair scheduler
+      match {.newHist, .choice} = sched(hist[i], avail);
+      // Select a flit
+      RoutedFlit flit =
+        oneHotSelect(unpack(choiceReg[i]), flits);
+      // Consume a flit
+      if (choiceReg[i] != 0) begin
+        if (outQueue[i].notFull) begin
+          // Pass chosen flit to out queue
+          outQueue[i].enq(flit);
+          // On final flit of message
+          if (!flit.flit.notFinalFlit) begin
+            if (choice != choiceReg[i]) begin
+              choiceReg[i] <= choice;
+              hist[i] <= newHist;
+            end else
+              choiceReg[i] <= 0;
+          end
+        end
+      end else begin
+        choiceReg[i] <= choice;
+        hist[i] <= newHist;
+      end
+      // Consume from chosen source
+      for (Integer j = 0; j < valueOf(n); j=j+1)
+        if (outQueue[i].notFull && choiceReg[i][j] == 1)
+          consumeWire[j].send;
+    endrule
+
+  end
+
+  // Consume from flit sources
+  rule consumeFlitSources;
+    for (Integer j = 0; j < valueOf(n); j=j+1)
+      if (consumeWire[j]) inPort[j].get;
+  endrule
+
+  return map(queueToBOut, outQueue);
+endmodule
+
+
+// =============================================================================
+// Splitter
+// =============================================================================
+
+// Split a single stream in two based on a predicate
+module splitFlits#(SelectorFunc f, BOut#(RoutedFlit) out)
+                  (Tuple2#(BOut#(Flit), BOut#(Flit)));
+
+  // Consume wire
+  PulseWire consumeWire <- mkPulseWireOR;
+
+  // Output streams
+  BOut#(Flit) outYes =
+    interface BOut
+      method Action get = consumeWire.send;
+      method Bool valid = out.valid && f(out.value);
+      method Flit value = out.value.flit;
+    endinterface;
+  BOut#(Flit) outNo =
+    interface BOut
+      method Action get = consumeWire.send;
+      method Bool valid = out.valid && !f(out.value);
+      method Flit value = out.value.flit;
+    endinterface;
+
+  // Consume
+  rule consume;
+    if (consumeWire) out.get;
+  endrule
+
+  return tuple2(outYes, outNo);
+endmodule
+
 // =============================================================================
 // Programmable router
 // =============================================================================
@@ -616,7 +736,8 @@ endmodule
 interface ProgRouter;
   // Incoming and outgoing flits
   interface Vector#(`FetchersPerProgRouter, In#(Flit)) flitIn;
-  interface Vector#(`FetchersPerProgRouter, Out#(Flit)) flitOut;
+  interface Vector#(`FetchersPerProgRouter, BOut#(Flit)) flitOut;
+  interface Vector#(`MailboxMeshXLen, BOut#(Flit)) nocFlitOut;
 
   // Interface to off-chip memory
   interface Vector#(`DRAMsPerBoard,
@@ -625,14 +746,71 @@ interface ProgRouter;
     Vector#(`FetchersPerProgRouter, In#(DRAMResp))) ramResps;
 endinterface
 
-/*
-module mkProgRouter (ProgRouter);
+module mkProgRouter#(BoardId boardId) (ProgRouter);
+
+  // Fetchers
+  Vector#(`FetchersPerProgRouter, Fetcher) fetchers = newVector;
+  for (Integer i = 0; i < `FetchersPerProgRouter; i=i+1)
+    fetchers[i] <- mkFetcher(boardId, i);
+
+  // Crossbar routing functions
+  function Bit#(2) xcoord(RoutedFlit rf) =
+    zeroExtend(rf.flit.dest.addr.mbox.x);
+  function Bool routeN(RoutedFlit rf) =
+    rf.decision == RouteNorth || (rf.decision == RouteNoC && xcoord(rf) == 0);
+  function Bool routeS(RoutedFlit rf) =
+    rf.decision == RouteSouth || (rf.decision == RouteNoC && xcoord(rf) == 1);
+  function Bool routeE(RoutedFlit rf) =
+    rf.decision == RouteEast || (rf.decision == RouteNoC && xcoord(rf) == 2);
+  function Bool routeW(RoutedFlit rf) =
+    rf.decision == RouteWest || (rf.decision == RouteNoC && xcoord(rf) == 3);
+  function Bool routeLoop(RoutedFlit rf) = rf.decision == RouteLoop;
+  Vector#(`FetchersPerProgRouter, SelectorFunc) funcs =
+    vector(routeN, routeS, routeE, routeW, routeLoop);
+
+  // Crossbar
+  function BOut#(RoutedFlit) getFetcherFlitOut(Fetcher f) = f.flitOut;
+  Vector#(`FetchersPerProgRouter, BOut#(RoutedFlit)) fetcherOuts =
+    map(getFetcherFlitOut, fetchers);
+  Vector#(`FetchersPerProgRouter, BOut#(RoutedFlit))
+    crossbarOuts <- mkProgRouterCrossbar(funcs, fetcherOuts);
+
+  // Flit input interfaces
+  Vector#(`FetchersPerProgRouter, In#(Flit)) flitInIfc = newVector;
+  for (Integer i = 0; i < `FetchersPerProgRouter; i=i+1)
+    flitInIfc[i] = fetchers[i].flitIn;
+
+  // Flit output interfaces
+  Vector#(`FetchersPerProgRouter, BOut#(Flit)) flitOutIfc = newVector;
+  Vector#(`MailboxMeshXLen, BOut#(Flit)) nocFlitOutIfc = newVector;
+
+  // Strands
+  function Bool forNoC(RoutedFlit rf) = rf.decision == RouteNoC;
+  for (Integer i = 0; i < 4; i=i+1) begin
+    match {.noc, .other} <- splitFlits(forNoC, crossbarOuts[i]);
+    flitOutIfc[i] = other;
+    if (i < `MailboxMeshXLen) nocFlitOutIfc[i] = noc;
+  end
+  function Flit toFlit (RoutedFlit rf) = rf.flit;
+  flitOutIfc[4] <- onBOut(toFlit, crossbarOuts[4]);
+
+  // RAM interfaces
+  Vector#(`DRAMsPerBoard, Vector#(`FetchersPerProgRouter, In#(DRAMResp)))
+    ramRespIfc = replicate(newVector);
+  Vector#(`DRAMsPerBoard, Vector#(`FetchersPerProgRouter, BOut#(DRAMReq)))
+    ramReqIfc = replicate(newVector);
+  for (Integer i = 0; i < `DRAMsPerBoard; i=i+1)
+    for (Integer j = 0; j < `FetchersPerProgRouter; j=j+1) begin
+      ramReqIfc[i][j] = fetchers[j].ramReqs[i];
+      ramRespIfc[i][j] = fetchers[j].ramResps[i];
+    end
 
-  // Flit input ports
-  Vector#(`FetchersPerProgRouter, InPort#(Flit)) flitInPort <-
-    replicateM(mkInPort);
+  interface flitIn = flitInIfc;
+  interface flitOut = flitOutIfc;
+  interface nocFlitOut = nocFlitOutIfc;
+  interface ramReqs = ramReqIfc;
+  interface ramResps = ramRespIfc;
 
 endmodule
-*/
 
 endpackage
diff --git a/rtl/Util.bsv b/rtl/Util.bsv
index 7ac885c3..507d1ef2 100644
--- a/rtl/Util.bsv
+++ b/rtl/Util.bsv
@@ -254,4 +254,24 @@ module mkBuffer#(Integer n, dataT init, dataT inp) (dataT)
   return regs[n-1];
 endmodule
 
+// Isolate first hot bit
+function Bit#(n) firstHot(Bit#(n) x) = x & (~x + 1);
+
+// Function for fair scheduling of n tasks
+function Tuple2#(Bit#(n), Bit#(n)) sched(Bit#(n) hist, Bit#(n) avail);
+  // First choice: an available bit that's not in the history
+  Bit#(n) first = firstHot(avail & ~hist);
+  // Second choice: any available bit
+  Bit#(n) second = firstHot(avail);
+
+  // Return new history, and chosen bit
+  if (first != 0) begin
+    // Return first choice, and update history
+    return tuple2(hist | first, first);
+  end else begin
+    // Return second choice, and reset history
+    return tuple2(second, second);
+  end
+endfunction
+
 endpackage

From 2ed587effd9dfe8d7d0af7923e432c055cf7670c Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Tue, 31 Mar 2020 15:34:06 +0100
Subject: [PATCH 15/78] Use ProgRouter instead of BoardRouter

---
 rtl/Interface.bsv |   8 +++
 rtl/Network.bsv   | 143 ++++++----------------------------------------
 2 files changed, 25 insertions(+), 126 deletions(-)

diff --git a/rtl/Interface.bsv b/rtl/Interface.bsv
index dffd8ac2..0484cb41 100644
--- a/rtl/Interface.bsv
+++ b/rtl/Interface.bsv
@@ -212,6 +212,14 @@ module onBOut#(function u f(t x), BOut#(t) out) (BOut#(u));
   method u value = f(out.value);
 endmodule
 
+// Convert BOut to Out
+function Out#(t) fromBOut(BOut#(t) out) =
+  interface Out
+    method Action tryGet = out.get;
+    method Bool valid = out.valid;
+    method t value = out.value;
+  endinterface;
+
 // A null In port accepts and discards all inputs
 module mkNullIn (In#(t));
   method Action tryPut(u val); endmethod
diff --git a/rtl/Network.bsv b/rtl/Network.bsv
index 6706d00f..00066cce 100644
--- a/rtl/Network.bsv
+++ b/rtl/Network.bsv
@@ -221,104 +221,6 @@ module mkMeshRouter#(MailboxId m) (MeshRouter);
 
 endmodule
 
-// =============================================================================
-// Board router
-// =============================================================================
-
-// Similar to a mesh router, but: (1) different routing function,
-// which routes between boards rather than mailboxes; (2) no loopback
-// in the sense that packets coming from mailbox mesh never get routed back
-// onto mailbox mesh.  This is a first step towards supporting
-// programmable board routers.
-module mkBoardRouter(MeshRouter);
-
-  // Board id
-  Wire#(BoardId) b <- mkDWire(?);
-
-  // Ports
-  InPort#(Flit)  leftInPort      <- mkInPort;
-  OutPort#(Flit) leftOutPort     <- mkOutPort;
-  InPort#(Flit)  rightInPort     <- mkInPort;
-  OutPort#(Flit) rightOutPort    <- mkOutPort;
-  InPort#(Flit)  topInPort       <- mkInPort;
-  OutPort#(Flit) topOutPort      <- mkOutPort;
-  InPort#(Flit)  bottomInPort    <- mkInPort;
-  OutPort#(Flit) bottomOutPort   <- mkOutPort;
-  InPort#(Flit)  fromMailboxPort <- mkInPort;
-  OutPort#(Flit) toMailboxPort   <- mkOutPort;
-
-  // Routing function
-  function Route route(NetAddr a);
-         if (a.addr.host.valid)    return a.addr.host.value == 0 ? Left : Right;
-    else if (a.addr.board.x < b.x) return Left;
-    else if (a.addr.board.x > b.x) return Right;
-    else if (a.addr.board.y < b.y) return Down;
-    else if (a.addr.board.y > b.y) return Up;
-    else return Mailbox;
-  endfunction
-
-  // Route to the mailbox
-  mkRouterMux(
-    route,
-    Mailbox,
-    toMailboxPort,
-    vector(FromLeft, FromRight, FromTop, FromBottom),
-    vector(leftInPort, rightInPort, topInPort, bottomInPort)
-  );
-
-  // Route left
-  mkRouterMux(
-    route,
-    Left,
-    leftOutPort,
-    vector(FromRight,   FromTop,   FromBottom,   FromMailbox),
-    vector(rightInPort, topInPort, bottomInPort, fromMailboxPort)
-  );
-
-  // Route right
-  mkRouterMux(
-    route,
-    Right,
-    rightOutPort,
-    vector(FromLeft,   FromTop,   FromBottom,   FromMailbox),
-    vector(leftInPort, topInPort, bottomInPort, fromMailboxPort)
-  );
-
-  // Route up
-  mkRouterMux(
-    route,
-    Up,
-    topOutPort,
-    vector(FromLeft,   FromRight,   FromBottom,   FromMailbox),
-    vector(leftInPort, rightInPort, bottomInPort, fromMailboxPort)
-  );
-
-  // Route down
-  mkRouterMux(
-    route,
-    Down,
-    bottomOutPort,
-    vector(FromLeft,   FromRight,   FromTop,   FromMailbox),
-    vector(leftInPort, rightInPort, topInPort, fromMailboxPort)
-  );
-
-  method Action setBoardId(BoardId id);
-    b <= id;
-  endmethod
-
-  // Interface
-  interface In  leftIn      = leftInPort.in;
-  interface Out leftOut     = leftOutPort.out;
-  interface In  rightIn     = rightInPort.in;
-  interface Out rightOut    = rightOutPort.out;
-  interface In  topIn       = topInPort.in;
-  interface Out topOut      = topOutPort.out;
-  interface In  bottomIn    = bottomInPort.in;
-  interface Out bottomOut   = bottomOutPort.out;
-  interface In  fromMailbox = fromMailboxPort.in;
-  interface Out toMailbox   = toMailboxPort.out;
-endmodule
-
 // =============================================================================
 // Flit-sized reliable links
 // =============================================================================
@@ -478,59 +380,48 @@ module mkNoC#(
                      routers[y+1][x].bottomOut, routers[y][x].topIn);
   end
 
-  // Board router
-  // ------------
+  // Programmable board router
+  // -------------------------
 
-  // For routing messages between boards
-  MeshRouter boardRouter <- mkBoardRouter;
-
-  // Set board id for board router
-  rule setBoardRouterId;
-    boardRouter.setBoardId(boardId);
-  endrule
+  // Programmable router
+  ProgRouter boardRouter <- mkProgRouter(boardId);
 
   // Connect board router to north link
+  connectDirect(boardRouter.flitOut[0], northLink[0].flitIn);
   connectUsing(mkUGShiftQueue1(QueueOptFmax),
-    boardRouter.topOut, northLink[0].flitIn);
-  connectUsing(mkUGShiftQueue1(QueueOptFmax),
-    northLink[0].flitOut, boardRouter.topIn);
+    northLink[0].flitOut, boardRouter.flitIn[0]);
 
   // Connect board router to south link
+  connectDirect(boardRouter.flitOut[1], southLink[0].flitIn);
   connectUsing(mkUGShiftQueue1(QueueOptFmax),
-    boardRouter.bottomOut, southLink[0].flitIn);
-  connectUsing(mkUGShiftQueue1(QueueOptFmax),
-    southLink[0].flitOut, boardRouter.bottomIn);
+    southLink[0].flitOut, boardRouter.flitIn[1]);
 
   // Connect board router to east link
+  connectDirect(boardRouter.flitOut[2], eastLink[0].flitIn);
   connectUsing(mkUGShiftQueue1(QueueOptFmax),
-    boardRouter.rightOut, eastLink[0].flitIn);
-  connectUsing(mkUGShiftQueue1(QueueOptFmax),
-    eastLink[0].flitOut, boardRouter.rightIn);
+    eastLink[0].flitOut, boardRouter.flitIn[2]);
 
   // Connect board router to west link
+  connectDirect(boardRouter.flitOut[3], westLink[0].flitIn);
   connectUsing(mkUGShiftQueue1(QueueOptFmax),
-    boardRouter.leftOut, westLink[0].flitIn);
-  connectUsing(mkUGShiftQueue1(QueueOptFmax),
-    westLink[0].flitOut, boardRouter.leftIn);
+    westLink[0].flitOut, boardRouter.flitIn[3]);
 
   // Connect mailbox mesh south rim to board router
   function List#(t) single(t elem) = List::cons(elem, Nil);
   List#(Out#(Flit)) botOutList = Nil;
   for (Integer x = `MailboxMeshXLen-1; x >= 0; x=x-1)
     botOutList = Cons(routers[0][x].bottomOut, botOutList);
+  // Also include loopback connection to board router to implement IND records
+  botOutList = Cons(fromBOut(boardRouter.flitOut[4]), botOutList);
   function In#(Flit) getFlitIn(BoardLink link) = link.flitIn;
-  reduceConnect(mkFlitMerger, botOutList, single(boardRouter.fromMailbox));
+  reduceConnect(mkFlitMerger, botOutList, single(boardRouter.flitIn[4]));
 
   // Connect board router to mailbox mesh south rim
   function In#(Flit) getBottomIn(MeshRouter r) = r.bottomIn;
   Vector#(`MailboxMeshXLen, In#(Flit)) southRimInPorts =
     map(getBottomIn, routers[0]);
-  function Bit#(`MailboxMeshXBits) flitGetX(Flit flit) =
-    flit.dest.addr.mbox.x;
-  let southRimDistributor <- mkResponseDistributor(flitGetX,
-    mkUGShiftQueue1(QueueOptFmax), southRimInPorts);
-  connectUsing(mkUGShiftQueue1(QueueOptFmax), boardRouter.toMailbox,
-    southRimDistributor);
+  for (Integer i = 0; i < `MailboxMeshXLen; i=i+1)
+    connectDirect(boardRouter.nocFlitOut[i], southRimInPorts[i]);
 
   // Detect inter-board activity
   // ---------------------------

From 04846495dfae6a76aa7887ed619a5574693318d6 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Tue, 31 Mar 2020 15:38:08 +0100
Subject: [PATCH 16/78] Wire up ProgRouter's off-chip RAM interfaces

---
 rtl/Network.bsv | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/rtl/Network.bsv b/rtl/Network.bsv
index 00066cce..5820209a 100644
--- a/rtl/Network.bsv
+++ b/rtl/Network.bsv
@@ -308,16 +308,6 @@ module mkNoC#(
   Vector#(`NumEastWestLinks, BoardLink) westLink <-
     mapM(mkBoardLink(linkEnable[3]), westSocket);
 
-  // Responses from off-chip memory
-  Vector#(`DRAMsPerBoard,
-    Vector#(`FetchersPerProgRouter, InPort#(DRAMResp))) dramRespPort <-
-      replicateM(replicateM(mkInPort));
-
-  // Requests to off-chip memory
-  Vector#(`DRAMsPerBoard,
-    Vector#(`FetchersPerProgRouter, Queue1#(DRAMReq))) dramReqQueues <-
-      replicateM(replicateM(mkUGShiftQueue1(QueueOptFmax)));
-
   // Dimension-ordered routers
   // -------------------------
 
@@ -466,12 +456,10 @@ module mkNoC#(
   `endif
 
   // Requests to off-chip memory
-  interface dramReqs =
-    Vector::map(Vector::map(queueToBOut), dramReqQueues);
+  interface dramReqs = boardRouter.ramReqs;
 
   // Responses from off-chip memory
-  interface dramResps =
-    Vector::map(Vector::map(getIn), dramRespPort);
+  interface dramResps = boardRouter.ramResps;
 
 endmodule
 

From a8e3c8f3226d96a064aefaedbffe8492f284bfc1 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Wed, 1 Apr 2020 11:39:49 +0100
Subject: [PATCH 17/78] Fix ProgRouter's crossbar

---
 rtl/ProgRouter.bsv | 44 ++++++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index 6a038e9d..17fd574f 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -334,6 +334,7 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
           consumeState <= 1;
         end
       end else if (flitBypassQueue.notFull) begin
+$display("ProgRouter: bypass");
         flitInPort.get;
         // Make routing decision
         RoutingDecision decision = RouteNoC;
@@ -469,7 +470,8 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
       URM1: begin
         URM1Record rec = unpack(beat.chunks[5]);
         flit.dest.addr.isKey = False;
-        flit.dest.addr.mbox = unpack(rec.mbox);
+        flit.dest.addr.mbox.x = unpack(truncate(rec.mbox[1:0]));
+        flit.dest.addr.mbox.y = unpack(truncate(rec.mbox[3:2]));
         Vector#(`ThreadsPerMailbox, Bool) threadMask = newVector;
         for (Integer j = 0; j < `ThreadsPerMailbox; j=j+1)
           threadMask[j] = rec.thread == fromInteger(j);
@@ -483,7 +485,8 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
       URM2: begin
         URM2Record rec = unpack({beat.chunks[5], beat.chunks[4]});
         flit.dest.addr.isKey = False;
-        flit.dest.addr.mbox = unpack(rec.mbox);
+        flit.dest.addr.mbox.x = unpack(truncate(rec.mbox[1:0]));
+        flit.dest.addr.mbox.y = unpack(truncate(rec.mbox[3:2]));
         Vector#(`ThreadsPerMailbox, Bool) threadMask = newVector;
         for (Integer j = 0; j < `ThreadsPerMailbox; j=j+1)
           threadMask[j] = rec.thread == fromInteger(j);
@@ -520,7 +523,8 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
       MRM: begin
         MRMRecord rec = unpack({beat.chunks[5], beat.chunks[4]});
         flit.dest.addr.isKey = False;
-        flit.dest.addr.mbox = unpack(rec.mbox);
+        flit.dest.addr.mbox.x = unpack(truncate(rec.mbox[1:0]));
+        flit.dest.addr.mbox.y = unpack(truncate(rec.mbox[3:2]));
         flit.dest.threads = rec.destMask;
         decision = RouteNoC;
       end
@@ -587,6 +591,7 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
       flitBypassQueue.canDeq;
     if (chooseBypass) begin
       if (flitBypassQueue.canDeq) begin
+$display("ProgRouter: consuming from bypass queue");
         flitBypassQueue.deq;
         flitOutQueue.enq(flitBypassQueue.dataOut);
         mergeInProgress <= flitBypassQueue.dataOut.flit.notFinalFlit;
@@ -649,38 +654,37 @@ module mkProgRouterCrossbar#(
     rule select;
       // Vector of input flits and available flits
       Vector#(n, RoutedFlit) flits = newVector;
-      Vector#(n, Bool) avails = newVector;
-      for (Integer i = 0; i < valueOf(n); i=i+1) begin
-        flits[i] = inPort[i].value;
-        avails[i] = f[i](inPort[i].value) && inPort[i].canGet;
+      Vector#(n, Bool) nextAvails = newVector;
+      Bool avail = False;
+      for (Integer j = 0; j < valueOf(n); j=j+1) begin
+        flits[j] = inPort[j].value;
+        nextAvails[j] = inPort[j].canGet && f[i](inPort[j].value)
+                          && choiceReg[i][j] == 0;
+        avail = avail || (choiceReg[i][j] == 1 && inPort[j].canGet);
       end
-      Bit#(n) avail = pack(avails);
+      Bit#(n) nextAvail = pack(nextAvails);
       // Choose a new source using fair scheduler
-      match {.newHist, .choice} = sched(hist[i], avail);
+      match {.newHist, .nextChoice} = sched(hist[i], nextAvail);
       // Select a flit
-      RoutedFlit flit =
-        oneHotSelect(unpack(choiceReg[i]), flits);
+      RoutedFlit flit = oneHotSelect(unpack(choiceReg[i]), flits);
       // Consume a flit
-      if (choiceReg[i] != 0) begin
+      if (avail) begin
         if (outQueue[i].notFull) begin
           // Pass chosen flit to out queue
           outQueue[i].enq(flit);
           // On final flit of message
           if (!flit.flit.notFinalFlit) begin
-            if (choice != choiceReg[i]) begin
-              choiceReg[i] <= choice;
-              hist[i] <= newHist;
-            end else
-              choiceReg[i] <= 0;
+            choiceReg[i] <= nextChoice;
+            hist[i] <= newHist;
           end
         end
-      end else begin
-        choiceReg[i] <= choice;
+      end else if (choiceReg[i] == 0) begin
+        choiceReg[i] <= nextChoice;
         hist[i] <= newHist;
       end
       // Consume from chosen source
       for (Integer j = 0; j < valueOf(n); j=j+1)
-        if (outQueue[i].notFull && choiceReg[i][j] == 1)
+        if (inPort[j].canGet && choiceReg[i][j] == 1 && outQueue[i].notFull)
           consumeWire[j].send;
     endrule
 

From 53f1acf60c5a8ac640ca01f7e3b700c1f07908f9 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Wed, 1 Apr 2020 11:42:29 +0100
Subject: [PATCH 18/78] Drop debug statements

---
 rtl/ProgRouter.bsv | 2 --
 1 file changed, 2 deletions(-)

diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index 17fd574f..230c1170 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -334,7 +334,6 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
           consumeState <= 1;
         end
       end else if (flitBypassQueue.notFull) begin
-$display("ProgRouter: bypass");
         flitInPort.get;
         // Make routing decision
         RoutingDecision decision = RouteNoC;
@@ -591,7 +590,6 @@ $display("ProgRouter: bypass");
       flitBypassQueue.canDeq;
     if (chooseBypass) begin
       if (flitBypassQueue.canDeq) begin
-$display("ProgRouter: consuming from bypass queue");
         flitBypassQueue.deq;
         flitOutQueue.enq(flitBypassQueue.dataOut);
         mergeInProgress <= flitBypassQueue.dataOut.flit.notFinalFlit;

From 7cecf664286d4985fa6fbb59530bc5890cca5316 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Thu, 2 Apr 2020 14:22:38 +0100
Subject: [PATCH 19/78] Simple bare-metal test for the ProgRouter

(Works, but only tests one type of routing record so far)
---
 Makefile                       |   1 +
 apps/progrouter/Makefile       |  53 ++++++++++++
 apps/progrouter/entry.S        |   3 +
 apps/progrouter/genld.sh       |  32 ++++++++
 apps/progrouter/progrouter.cpp | 142 +++++++++++++++++++++++++++++++++
 apps/progrouter/run.cpp        |  12 +++
 rtl/Globals.bsv                |   2 +-
 rtl/Network.bsv                |   1 +
 rtl/ProgRouter.bsv             |  19 ++---
 9 files changed, 255 insertions(+), 10 deletions(-)
 create mode 100644 apps/progrouter/Makefile
 create mode 100644 apps/progrouter/entry.S
 create mode 100755 apps/progrouter/genld.sh
 create mode 100644 apps/progrouter/progrouter.cpp
 create mode 100644 apps/progrouter/run.cpp

diff --git a/Makefile b/Makefile
index 5b2608a3..d52882f7 100644
--- a/Makefile
+++ b/Makefile
@@ -24,6 +24,7 @@ clean:
 	make -C apps/multiprog clean
 	make -C apps/sync clean
 	make -C apps/temps clean
+	make -C apps/progrouter clean
 	make -C apps/POLite/heat-gals clean
 	make -C apps/POLite/heat-sync clean
 	make -C apps/POLite/asp-gals clean
diff --git a/apps/progrouter/Makefile b/apps/progrouter/Makefile
new file mode 100644
index 00000000..b58478f2
--- /dev/null
+++ b/apps/progrouter/Makefile
@@ -0,0 +1,53 @@
+# Tinsel root
+TINSEL_ROOT=../..
+
+ifndef QUARTUS_ROOTDIR
+  $(error Please set QUARTUS_ROOTDIR)
+endif
+
+include $(TINSEL_ROOT)/globals.mk
+
+# RISC-V compiler flags
+CFLAGS = $(RV_CFLAGS) -O2 -I $(INC)
+LDFLAGS = -melf32lriscv -G 0 
+
+.PHONY: all
+all: code.v data.v run
+
+code.v: progrouter.elf
+	checkelf.sh progrouter.elf
+	$(RV_OBJCOPY) -O verilog --only-section=.text progrouter.elf code.v
+
+data.v: progrouter.elf
+	$(RV_OBJCOPY) -O verilog --remove-section=.text \
+                --set-section-flags .bss=alloc,load,contents \
+                progrouter.elf data.v
+
+progrouter.elf: progrouter.cpp link.ld $(INC)/config.h $(INC)/tinsel.h entry.o
+	$(RV_CPPC) $(CFLAGS) -Wall -c -o progrouter.o progrouter.cpp
+	$(RV_LD) $(LDFLAGS) -T link.ld -o progrouter.elf entry.o progrouter.o $(LIB)/lib.o
+
+entry.o:
+	$(RV_CPPC) $(CFLAGS) -Wall -c -o entry.o entry.S
+
+link.ld: genld.sh
+	./genld.sh > link.ld
+
+$(LIB)/lib.o:
+	make -C $(LIB)
+
+$(INC)/config.h: $(TINSEL_ROOT)/config.py
+	make -C $(INC)
+
+$(HL)/%.o:
+	make -C $(HL)
+
+run: run.cpp $(HL)/*.o
+	g++ -O2 -I $(INC) -I $(HL) -o run run.cpp $(HL)/*.o
+
+sim: run.cpp $(HL)/sim/*.o
+	g++ -O2 -I $(INC) -I $(HL) -o sim run.cpp $(HL)/sim/*.o
+
+.PHONY: clean
+clean:
+	rm -f *.o *.elf link.ld *.v run sim
diff --git a/apps/progrouter/entry.S b/apps/progrouter/entry.S
new file mode 100644
index 00000000..18cd8d27
--- /dev/null
+++ b/apps/progrouter/entry.S
@@ -0,0 +1,3 @@
+# We assume the boot loader has already setup the stack.
+# All we need to do is jump to main.
+j main
diff --git a/apps/progrouter/genld.sh b/apps/progrouter/genld.sh
new file mode 100755
index 00000000..cfe144c4
--- /dev/null
+++ b/apps/progrouter/genld.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# Load config parameters
+while read -r EXPORT; do
+  eval $EXPORT
+done <<< `python ../../config.py envs`
+
+# Compute space available for instructions
+MaxInstrBytes=$((4 * 2**$LogInstrsPerCore - $MaxBootImageBytes))
+
+cat - << EOF
+/* THIS FILE HAS BEEN GENERATED AUTOMATICALLY. */
+/* DO NOT MODIFY. INSTEAD, MODIFY THE genld.sh SCRIPT. */
+
+OUTPUT_ARCH( "riscv" )
+
+MEMORY
+{
+  instrs  : ORIGIN = $MaxBootImageBytes, LENGTH = $MaxInstrBytes
+  globals : ORIGIN = $DRAMBase, LENGTH = $DRAMGlobalsLength
+}
+
+SECTIONS
+{
+  .text   : { *.o(.text*) }             > instrs
+  .bss    : { *.o(.bss*) }              > globals = 0
+  .rodata : { *.o(.rodata*) }           > globals
+  .sdata  : { *.o(.sdata*) }            > globals
+  .data   : { *.o(.data*) }             > globals
+  __heapBase = ALIGN(.);
+}
+EOF
diff --git a/apps/progrouter/progrouter.cpp b/apps/progrouter/progrouter.cpp
new file mode 100644
index 00000000..b1740082
--- /dev/null
+++ b/apps/progrouter/progrouter.cpp
@@ -0,0 +1,142 @@
+#include <tinsel.h>
+
+// Simplest possible example involving programmable routers
+
+/*
+Byte ordering in a routing beat:
+
+  31: Upper byte of length (i.e. number of records in beat)
+  30: Lower byte of length
+  29: Upper byte of first chunk 
+  28:
+  27:
+  26:
+  25: Lower byte of first chunk 
+  24: Upper byte of second chunk 
+  23:
+  22:
+  21:
+  20: Lower byte of second chunk
+  19: Upper byte of third chunk
+  18:
+  17:
+  16:
+  15: Lower byte of third chunk
+  14: Upper byte of fourth chunk
+  13:
+  12:
+  11:
+  10: Lower byte of fourth chunk
+   9: Upper byte of fifth chunk
+   8:
+   7:
+   6:
+   5: Lower byte of fifth chunk
+   4: Upper byte of sixth chunk
+   3:
+   2:
+   1:
+   0: Lower byte of sixth chunk
+
+Need to fold this into the docs eventually.
+*/
+
+// Use this to align on beat boundary
+#define ALIGNED __attribute__((aligned(32)))
+
+// A single RAM beat
+struct ALIGNED Beat {
+  uint8_t bytes[32];
+};
+
+// Routing table, with methods to aid construction
+template <int NumBeats> struct RoutingTable {
+  // Raw beats comprising the table
+  Beat beats[NumBeats];
+
+  // Number of chunks used so far in current beat
+  uint32_t numChunks;
+
+  // Index of beat currently being filled
+  uint32_t currentBeat;
+
+  // Constructor
+  RoutingTable() {
+    currentBeat = 0;
+  }
+
+  // Pointer to current beat being filled
+  uint8_t* currentPointer() {
+    return beats[currentBeat].bytes;
+  }
+
+  // Move on to next the beat
+  void next() {
+    beats[currentBeat].bytes[31] = 0;
+    beats[currentBeat].bytes[30] = numChunks;
+    numChunks = 0;
+    currentBeat++;
+  }
+
+  // Add a URM1 record to the table
+  void addURM1(uint32_t mboxX, uint32_t mboxY,
+                 uint32_t mboxThread, uint32_t localKey) {
+    uint8_t* ptr = beats[currentBeat].bytes + 5*(5-numChunks);
+    ptr[0] = localKey;
+    ptr[1] = localKey >> 8;
+    ptr[2] = localKey >> 16;
+    ptr[3] = ((mboxThread&0x1f) << 3) | ((localKey >> 24) & 0x7);
+    ptr[4] = (mboxY << 3) | (mboxX << 1) | (mboxThread >> 5);
+    numChunks++;
+    if (numChunks == 6) next();
+  }
+};
+
+// Create global routing table of 16 beats
+RoutingTable<16> table;
+
+int main()
+{
+  // Get thread id
+  int me = tinselId();
+
+  // Sample outgoing message
+  volatile uint32_t* msgOut = (uint32_t*) tinselSendSlot();
+  msgOut[0] = 0x10;
+  msgOut[1] = 0x20;
+  msgOut[2] = 0x30;
+  msgOut[3] = 0x40;
+
+  // On thread 0
+  if (me == 0) {
+    // Add an URM1 record
+    uint8_t* entry = table.currentPointer();
+    table.addURM1(0, 0, 10, 0xff);
+    table.next();
+
+    // Cache flush, to write table into RAM
+    tinselCacheFlush();
+    // Wait until flush done, by issuing a load
+    volatile uint32_t* dummyPtr = (uint32_t*) entry; dummyPtr[0];
+
+    // Construct key
+    uint32_t key = (uint32_t) entry;
+    key = key | 1; // Entry is 1 beat long
+
+    // Send message to key
+    tinselWaitUntil(TINSEL_CAN_SEND);
+    tinselKeySend(key, msgOut);
+
+    while (1);
+  }
+
+  // On other threads, print anything received
+  while (me != 0) {
+    tinselWaitUntil(TINSEL_CAN_RECV);
+    volatile uint32_t* msgIn = (uint32_t*) tinselRecv();
+    printf("%x %x %x %x\n", msgIn[0], msgIn[1], msgIn[2], msgIn[3]);
+    tinselFree(msgIn);
+  }
+
+  return 0;
+}
diff --git a/apps/progrouter/run.cpp b/apps/progrouter/run.cpp
new file mode 100644
index 00000000..a198a064
--- /dev/null
+++ b/apps/progrouter/run.cpp
@@ -0,0 +1,12 @@
+#include <HostLink.h>
+
+int main()
+{
+  HostLink hostLink;
+
+  hostLink.boot("code.v", "data.v");
+  hostLink.go();
+  hostLink.dumpStdOut();
+
+  return 0;
+}
diff --git a/rtl/Globals.bsv b/rtl/Globals.bsv
index 914852e9..d240aa2c 100644
--- a/rtl/Globals.bsv
+++ b/rtl/Globals.bsv
@@ -69,7 +69,7 @@ typedef struct {
   Bool notFinalFlit;
   // Is this a special packet for idle-detection?
   Bool isIdleToken;
-} Flit deriving (Bits);
+} Flit deriving (Bits, FShow);
 
 // A padded flit is a multiple of 64 bits
 // (i.e. the data width of the 10G MAC interface)
diff --git a/rtl/Network.bsv b/rtl/Network.bsv
index 5820209a..4ee2e69b 100644
--- a/rtl/Network.bsv
+++ b/rtl/Network.bsv
@@ -150,6 +150,7 @@ module mkMeshRouter#(MailboxId m) (MeshRouter);
   // Routing function
   function Route route(NetAddr a);
          if (a.addr.board != b)   return Down;
+    else if (a.addr.isKey)        return Down;
     else if (a.addr.host.valid)   return Down;
     else if (a.addr.mbox.y < m.y) return Down;
     else if (a.addr.mbox.y > m.y) return Up;
diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index 230c1170..48c29f15 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -32,7 +32,7 @@ typedef struct {
   Bit#(16) size;
   // The 40-bit record chunks
   Vector#(6, Bit#(40)) chunks;
-} RoutingBeat deriving (Bits);
+} RoutingBeat deriving (Bits, FShow);
 
 // 32-bit routing key
 typedef struct {
@@ -42,7 +42,7 @@ typedef struct {
   Bit#(`LogBeatsPerDRAM) ptr;
   // Number of beats in the array
   Bit#(`LogRoutingEntryLen) numBeats;
-} RoutingKey deriving (Bits);
+} RoutingKey deriving (Bits, FShow);
 
 // Extract routing key from an address
 function RoutingKey getRoutingKey(NetAddr addr) =
@@ -58,7 +58,7 @@ typedef enum {
   RR   = 3'd2, // 40-bit Router-to-Router
   MRM  = 3'd3, // 80-bit Multicast Router-to-Mailbox
   IND  = 3'd4  // 40-bit Indirection
-} RoutingRecordTag deriving (Bits, Eq);
+} RoutingRecordTag deriving (Bits, Eq, FShow);
 
 typedef enum {
   NORTH = 2'd0,
@@ -78,7 +78,7 @@ typedef struct {
   // Local key. The first word of the message
   // payload is overwritten with this.
   Bit#(27) localKey;
-} URM1Record deriving (Bits);
+} URM1Record deriving (Bits, FShow);
 
 // 80-bit Unicast Router-to-Mailbox (URM2) record
 typedef struct {
@@ -137,7 +137,7 @@ typedef struct {
   Flit flit;
   // Routing decision for flit
   RoutingDecision decision;
-} RoutedFlit deriving (Bits);
+} RoutedFlit deriving (Bits, FShow);
 
 // Routing decision
 typedef enum {
@@ -147,7 +147,7 @@ typedef enum {
   RouteWest,
   RouteNoC,
   RouteLoop
-} RoutingDecision deriving (Bits, Eq);
+} RoutingDecision deriving (Bits, Eq, FShow);
 
 // =============================================================================
 // Design
@@ -228,7 +228,7 @@ typedef struct {
   Bit#(`BeatBurstWidth) burst;
   // Is this the final burst of routing records for the current key?
   Bool finalBurst;
-} InflightFetcherReqInfo deriving (Bits);
+} InflightFetcherReqInfo deriving (Bits, FShow);
 
 // Routing beat, tagged with the beat number in the DRAM burst
 typedef struct {
@@ -238,7 +238,7 @@ typedef struct {
   Bit#(`BeatBurstWidth) beatNum;
   // Inflight request info
   InflightFetcherReqInfo info;
-} NumberedRoutingBeat deriving (Bits);
+} NumberedRoutingBeat deriving (Bits, FShow);
 
 // Fetcher interface
 interface Fetcher;
@@ -268,7 +268,8 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
       registerDataOut: False,
       initFile: Invalid
     };
-  BlockRam#(FetcherFlitBufferAddr, Flit) flitBuffer <- mkBlockRam;
+  BlockRam#(FetcherFlitBufferAddr, Flit) flitBuffer <-
+    mkBlockRamOpts(flitBufferOpts);
 
   // Beat buffer
   SizedQueue#(`FetcherLogBeatBufferSize, NumberedRoutingBeat)

From e13246bf76f1e3bf13c81b5d58b6a949738a9312 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Fri, 3 Apr 2020 16:45:54 +0100
Subject: [PATCH 20/78] Testing URM2, MRM, and IND records

These work.  Still need to test RR records, and multi-beat records.
---
 apps/progrouter/Makefile       |  2 +-
 apps/progrouter/progrouter.cpp | 72 ++++++++++++++++++++++++++++++++--
 2 files changed, 69 insertions(+), 5 deletions(-)

diff --git a/apps/progrouter/Makefile b/apps/progrouter/Makefile
index b58478f2..76c728f5 100644
--- a/apps/progrouter/Makefile
+++ b/apps/progrouter/Makefile
@@ -23,7 +23,7 @@ data.v: progrouter.elf
                 --set-section-flags .bss=alloc,load,contents \
                 progrouter.elf data.v
 
-progrouter.elf: progrouter.cpp link.ld $(INC)/config.h $(INC)/tinsel.h entry.o
+progrouter.elf: progrouter.cpp link.ld $(INC)/config.h $(INC)/tinsel.h entry.o $(LIB)/lib.o
 	$(RV_CPPC) $(CFLAGS) -Wall -c -o progrouter.o progrouter.cpp
 	$(RV_LD) $(LDFLAGS) -T link.ld -o progrouter.elf entry.o progrouter.o $(LIB)/lib.o
 
diff --git a/apps/progrouter/progrouter.cpp b/apps/progrouter/progrouter.cpp
index b1740082..9c78f2f8 100644
--- a/apps/progrouter/progrouter.cpp
+++ b/apps/progrouter/progrouter.cpp
@@ -81,6 +81,7 @@ template <int NumBeats> struct RoutingTable {
   // Add a URM1 record to the table
   void addURM1(uint32_t mboxX, uint32_t mboxY,
                  uint32_t mboxThread, uint32_t localKey) {
+    if (numChunks == 6) next();
     uint8_t* ptr = beats[currentBeat].bytes + 5*(5-numChunks);
     ptr[0] = localKey;
     ptr[1] = localKey >> 8;
@@ -88,7 +89,63 @@ template <int NumBeats> struct RoutingTable {
     ptr[3] = ((mboxThread&0x1f) << 3) | ((localKey >> 24) & 0x7);
     ptr[4] = (mboxY << 3) | (mboxX << 1) | (mboxThread >> 5);
     numChunks++;
+  }
+
+  // Add a URM2 record to the table
+  void addURM2(uint32_t mboxX, uint32_t mboxY, uint32_t mboxThread,
+                 uint32_t localKeyHigh, uint32_t localKeyLow) {
+    if (numChunks >= 5) next();
+    uint8_t* ptr = beats[currentBeat].bytes + 5*(4-numChunks);
+    ptr[0] = localKeyLow;
+    ptr[1] = localKeyLow >> 8;
+    ptr[2] = localKeyLow >> 16;
+    ptr[3] = localKeyLow >> 24;
+    ptr[4] = localKeyHigh;
+    ptr[5] = localKeyHigh >> 8;
+    ptr[6] = localKeyHigh >> 16;
+    ptr[7] = localKeyHigh >> 24;
+    ptr[8] = (mboxThread&0x1f) << 3;
+    ptr[9] = (1 << 5) | (mboxY << 3) | (mboxX << 1) | (mboxThread >> 5);
+    numChunks += 2;
+  }
+
+  // Add an MRM record to the table
+  void addMRM(uint32_t mboxX, uint32_t mboxY,
+                uint32_t threadsHigh, uint32_t threadsLow) {
+    if (numChunks >= 5) next();
+    uint8_t* ptr = beats[currentBeat].bytes + 5*(4-numChunks);
+    ptr[0] = threadsLow;
+    ptr[1] = threadsLow >> 8;
+    ptr[2] = threadsLow >> 16;
+    ptr[3] = threadsLow >> 24;
+    ptr[4] = threadsHigh;
+    ptr[5] = threadsHigh >> 8;
+    ptr[6] = threadsHigh >> 16;
+    ptr[7] = threadsHigh >> 24;
+    ptr[9] = (3 << 5) | (mboxY << 3) | (mboxX << 1);
+    numChunks += 2;
+  }
+
+  // Add an IND record to the table
+  // Return a pointer to the indirection key,
+  // so it can be set later by the caller
+  uint8_t* addIND() {
     if (numChunks == 6) next();
+    uint8_t* ptr = beats[currentBeat].bytes + 5*(5-numChunks);
+    ptr[4] = 4 << 5;
+    numChunks++;
+    return ptr;
+  }
+
+  // Set indirection key
+  void setIND(uint8_t* ind, bool upperRam,
+                uint8_t* beatPtr, uint32_t numBeats) {
+    uint32_t key = (uint32_t) beatPtr | numBeats;
+    if (upperRam) key |= 0x80000000;
+    ind[0] = key;
+    ind[1] = key >> 8;
+    ind[2] = key >> 16;
+    ind[3] = key >> 24;
   }
 };
 
@@ -110,17 +167,24 @@ int main()
   // On thread 0
   if (me == 0) {
     // Add an URM1 record
-    uint8_t* entry = table.currentPointer();
-    table.addURM1(0, 0, 10, 0xff);
+    uint8_t* entry1 = table.currentPointer();
+    table.addURM1(0, 0, 10, 0xfff);
+    table.addURM2(0, 0, 60, 0xff1, 0xff0);
+    //table.addMRM(1, 0, 0x22222222, 0x11111111);
+    uint8_t* ind = table.addIND();
+    table.next();
+    uint8_t* entry2 = table.currentPointer();
+    table.addURM1(0, 0, 20, 0x111);
     table.next();
+    table.setIND(ind, 0, entry2, 1);
 
     // Cache flush, to write table into RAM
     tinselCacheFlush();
     // Wait until flush done, by issuing a load
-    volatile uint32_t* dummyPtr = (uint32_t*) entry; dummyPtr[0];
+    volatile uint32_t* dummyPtr = (uint32_t*) entry1; dummyPtr[0];
 
     // Construct key
-    uint32_t key = (uint32_t) entry;
+    uint32_t key = (uint32_t) entry1;
     key = key | 1; // Entry is 1 beat long
 
     // Send message to key

From 82a2e7f02d4de4214c73788325927b54ef193701 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Mon, 6 Apr 2020 09:56:05 +0100
Subject: [PATCH 21/78] Testing multi-beat records

Multi-beat records work, but the docs needed clarifying: the size field
in a beat is the number of records, not the number of chunks.
---
 apps/progrouter/progrouter.cpp | 28 ++++++++++++++++++++++++++--
 rtl/ProgRouter.bsv             |  2 +-
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/apps/progrouter/progrouter.cpp b/apps/progrouter/progrouter.cpp
index 9c78f2f8..2cf55711 100644
--- a/apps/progrouter/progrouter.cpp
+++ b/apps/progrouter/progrouter.cpp
@@ -57,12 +57,16 @@ template <int NumBeats> struct RoutingTable {
   // Number of chunks used so far in current beat
   uint32_t numChunks;
 
+  // Number of records used so far in current beat
+  uint32_t numRecords;
+
   // Index of beat currently being filled
   uint32_t currentBeat;
 
   // Constructor
   RoutingTable() {
     currentBeat = 0;
+    numChunks = numRecords = 0;
   }
 
   // Pointer to current beat being filled
@@ -73,8 +77,9 @@ template <int NumBeats> struct RoutingTable {
   // Move on to next the beat
   void next() {
     beats[currentBeat].bytes[31] = 0;
-    beats[currentBeat].bytes[30] = numChunks;
+    beats[currentBeat].bytes[30] = numRecords;
     numChunks = 0;
+    numRecords = 0;
     currentBeat++;
   }
 
@@ -89,6 +94,7 @@ template <int NumBeats> struct RoutingTable {
     ptr[3] = ((mboxThread&0x1f) << 3) | ((localKey >> 24) & 0x7);
     ptr[4] = (mboxY << 3) | (mboxX << 1) | (mboxThread >> 5);
     numChunks++;
+    numRecords++;
   }
 
   // Add a URM2 record to the table
@@ -107,6 +113,7 @@ template <int NumBeats> struct RoutingTable {
     ptr[8] = (mboxThread&0x1f) << 3;
     ptr[9] = (1 << 5) | (mboxY << 3) | (mboxX << 1) | (mboxThread >> 5);
     numChunks += 2;
+    numRecords++;
   }
 
   // Add an MRM record to the table
@@ -124,6 +131,7 @@ template <int NumBeats> struct RoutingTable {
     ptr[7] = threadsHigh >> 24;
     ptr[9] = (3 << 5) | (mboxY << 3) | (mboxX << 1);
     numChunks += 2;
+    numRecords++;
   }
 
   // Add an IND record to the table
@@ -134,6 +142,7 @@ template <int NumBeats> struct RoutingTable {
     uint8_t* ptr = beats[currentBeat].bytes + 5*(5-numChunks);
     ptr[4] = 4 << 5;
     numChunks++;
+    numRecords++;
     return ptr;
   }
 
@@ -147,6 +156,19 @@ template <int NumBeats> struct RoutingTable {
     ind[2] = key >> 16;
     ind[3] = key >> 24;
   }
+
+  // Add an RR record to the table
+  void addRR(uint32_t dir, uint32_t key) {
+    if (numChunks == 6) next();
+    uint8_t* ptr = beats[currentBeat].bytes + 5*(5-numChunks);
+    ptr[0] = key;
+    ptr[1] = key >> 8;
+    ptr[2] = key >> 16;
+    ptr[3] = key >> 24;
+    ptr[4] = (2 << 5) | (dir << 3);
+    numChunks++;
+    numRecords++;
+  }
 };
 
 // Create global routing table of 16 beats
@@ -170,6 +192,8 @@ int main()
     uint8_t* entry1 = table.currentPointer();
     table.addURM1(0, 0, 10, 0xfff);
     table.addURM2(0, 0, 60, 0xff1, 0xff0);
+    table.addURM2(0, 0, 60, 0xff3, 0xff2);
+    table.addURM2(0, 0, 60, 0xff5, 0xff4);
     //table.addMRM(1, 0, 0x22222222, 0x11111111);
     uint8_t* ind = table.addIND();
     table.next();
@@ -185,7 +209,7 @@ int main()
 
     // Construct key
     uint32_t key = (uint32_t) entry1;
-    key = key | 1; // Entry is 1 beat long
+    key = key | 2; // Entry is 2 beats long
 
     // Send message to key
     tinselWaitUntil(TINSEL_CAN_SEND);
diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index 48c29f15..f63b997c 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -28,7 +28,7 @@ import Util      :: *;
 
 // 256-bit routing beat
 typedef struct {
-  // Number of 40-bit record chunks present
+  // Number of records present
   Bit#(16) size;
   // The 40-bit record chunks
   Vector#(6, Bit#(40)) chunks;

From 3177e016dcb2a28259f8171dcbf77387e5cff6d1 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Tue, 7 Apr 2020 08:06:39 +0100
Subject: [PATCH 22/78] Move to 48-bit record chunks

---
 apps/progrouter/progrouter.cpp | 78 ++++++++++++++++++----------------
 rtl/ProgRouter.bsv             | 74 ++++++++++++++++++--------------
 2 files changed, 82 insertions(+), 70 deletions(-)

diff --git a/apps/progrouter/progrouter.cpp b/apps/progrouter/progrouter.cpp
index 2cf55711..6b764ac5 100644
--- a/apps/progrouter/progrouter.cpp
+++ b/apps/progrouter/progrouter.cpp
@@ -7,36 +7,36 @@ Byte ordering in a routing beat:
 
   31: Upper byte of length (i.e. number of records in beat)
   30: Lower byte of length
-  29: Upper byte of first chunk 
+  29: Upper byte of first chunk
   28:
   27:
   26:
-  25: Lower byte of first chunk 
-  24: Upper byte of second chunk 
-  23:
+  25: 
+  24: Lower byte of first chunk
+  23: Upper byte of second chunk
   22:
   21:
-  20: Lower byte of second chunk
-  19: Upper byte of third chunk
-  18:
-  17:
+  20: 
+  19: 
+  18: Lower byte of second chunk
+  17: Upper byte of third chunk
   16:
-  15: Lower byte of third chunk
-  14: Upper byte of fourth chunk
+  15: 
+  14: 
   13:
-  12:
-  11:
-  10: Lower byte of fourth chunk
-   9: Upper byte of fifth chunk
+  12: Lower byte of third chunk
+  11: Upper byte of fourth chunk
+  10: 
+   9: 
    8:
    7:
-   6:
-   5: Lower byte of fifth chunk
-   4: Upper byte of sixth chunk
+   6: Lower byte of fourth chunk
+   5: Upper byte of fifth chunk
+   4: 
    3:
    2:
    1:
-   0: Lower byte of sixth chunk
+   0: Lower byte of fifth chunk
 
 Need to fold this into the docs eventually.
 */
@@ -86,13 +86,14 @@ template <int NumBeats> struct RoutingTable {
   // Add a URM1 record to the table
   void addURM1(uint32_t mboxX, uint32_t mboxY,
                  uint32_t mboxThread, uint32_t localKey) {
-    if (numChunks == 6) next();
-    uint8_t* ptr = beats[currentBeat].bytes + 5*(5-numChunks);
+    if (numChunks == 5) next();
+    uint8_t* ptr = beats[currentBeat].bytes + 6*(4-numChunks);
     ptr[0] = localKey;
     ptr[1] = localKey >> 8;
     ptr[2] = localKey >> 16;
-    ptr[3] = ((mboxThread&0x1f) << 3) | ((localKey >> 24) & 0x7);
-    ptr[4] = (mboxY << 3) | (mboxX << 1) | (mboxThread >> 5);
+    ptr[3] = localKey >> 24;
+    ptr[4] = (mboxThread&0x1f) << 3;
+    ptr[5] = (mboxY << 3) | (mboxX << 1) | (mboxThread >> 5);
     numChunks++;
     numRecords++;
   }
@@ -100,8 +101,8 @@ template <int NumBeats> struct RoutingTable {
   // Add a URM2 record to the table
   void addURM2(uint32_t mboxX, uint32_t mboxY, uint32_t mboxThread,
                  uint32_t localKeyHigh, uint32_t localKeyLow) {
-    if (numChunks >= 5) next();
-    uint8_t* ptr = beats[currentBeat].bytes + 5*(4-numChunks);
+    if (numChunks >= 4) next();
+    uint8_t* ptr = beats[currentBeat].bytes + 6*(3-numChunks);
     ptr[0] = localKeyLow;
     ptr[1] = localKeyLow >> 8;
     ptr[2] = localKeyLow >> 16;
@@ -110,17 +111,18 @@ template <int NumBeats> struct RoutingTable {
     ptr[5] = localKeyHigh >> 8;
     ptr[6] = localKeyHigh >> 16;
     ptr[7] = localKeyHigh >> 24;
-    ptr[8] = (mboxThread&0x1f) << 3;
-    ptr[9] = (1 << 5) | (mboxY << 3) | (mboxX << 1) | (mboxThread >> 5);
+    ptr[10] = (mboxThread&0x1f) << 3;
+    ptr[11] = (1 << 5) | (mboxY << 3) | (mboxX << 1) | (mboxThread >> 5);
     numChunks += 2;
     numRecords++;
   }
 
   // Add an MRM record to the table
   void addMRM(uint32_t mboxX, uint32_t mboxY,
-                uint32_t threadsHigh, uint32_t threadsLow) {
-    if (numChunks >= 5) next();
-    uint8_t* ptr = beats[currentBeat].bytes + 5*(4-numChunks);
+                uint32_t threadsHigh, uint32_t threadsLow,
+                  uint16_t localKey) {
+    if (numChunks >= 4) next();
+    uint8_t* ptr = beats[currentBeat].bytes + 6*(3-numChunks);
     ptr[0] = threadsLow;
     ptr[1] = threadsLow >> 8;
     ptr[2] = threadsLow >> 16;
@@ -129,7 +131,9 @@ template <int NumBeats> struct RoutingTable {
     ptr[5] = threadsHigh >> 8;
     ptr[6] = threadsHigh >> 16;
     ptr[7] = threadsHigh >> 24;
-    ptr[9] = (3 << 5) | (mboxY << 3) | (mboxX << 1);
+    ptr[8] = localKey;
+    ptr[9] = localKey >> 8;
+    ptr[11] = (3 << 5) | (mboxY << 3) | (mboxX << 1);
     numChunks += 2;
     numRecords++;
   }
@@ -138,9 +142,9 @@ template <int NumBeats> struct RoutingTable {
   // Return a pointer to the indirection key,
   // so it can be set later by the caller
   uint8_t* addIND() {
-    if (numChunks == 6) next();
-    uint8_t* ptr = beats[currentBeat].bytes + 5*(5-numChunks);
-    ptr[4] = 4 << 5;
+    if (numChunks == 5) next();
+    uint8_t* ptr = beats[currentBeat].bytes + 6*(4-numChunks);
+    ptr[5] = 4 << 5;
     numChunks++;
     numRecords++;
     return ptr;
@@ -159,13 +163,13 @@ template <int NumBeats> struct RoutingTable {
 
   // Add an RR record to the table
   void addRR(uint32_t dir, uint32_t key) {
-    if (numChunks == 6) next();
-    uint8_t* ptr = beats[currentBeat].bytes + 5*(5-numChunks);
+    if (numChunks == 5) next();
+    uint8_t* ptr = beats[currentBeat].bytes + 6*(4-numChunks);
     ptr[0] = key;
     ptr[1] = key >> 8;
     ptr[2] = key >> 16;
     ptr[3] = key >> 24;
-    ptr[4] = (2 << 5) | (dir << 3);
+    ptr[5] = (2 << 5) | (dir << 3);
     numChunks++;
     numRecords++;
   }
@@ -194,7 +198,7 @@ int main()
     table.addURM2(0, 0, 60, 0xff1, 0xff0);
     table.addURM2(0, 0, 60, 0xff3, 0xff2);
     table.addURM2(0, 0, 60, 0xff5, 0xff4);
-    //table.addMRM(1, 0, 0x22222222, 0x11111111);
+    //table.addMRM(1, 0, 0x22222222, 0x11111111, 0x2222);
     uint8_t* ind = table.addIND();
     table.next();
     uint8_t* entry2 = table.currentPointer();
diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index f63b997c..f6712ba1 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -16,11 +16,11 @@ import Util      :: *;
 // Routing keys and beats
 // =============================================================================
 
-// A routing record is either 40 bits or 80 bits in size (aligned on a
-// 40-bit or 80-bit boundary respectively). Multiple records are
+// A routing record is either 48 bits or 96 bits in size (aligned on a
+// 48-bit or 96-bit boundary respectively). Multiple records are
 // packed into a 256-bit DRAM beat (aligned on a 256-bit boundary).
 // The most significant 16 bits of the beat contain a count of the
-// number of records in the beat (in the range 1 to 6 inclusive). The
+// number of records in the beat (in the range 1 to 5 inclusive). The
 // remaining 240 bits contain records. The first record lies in the
 // least-significant bits of the beat. The size portion of the routing
 // key contains the number of contiguous DRAM beats holding all
@@ -30,8 +30,8 @@ import Util      :: *;
 typedef struct {
   // Number of records present
   Bit#(16) size;
-  // The 40-bit record chunks
-  Vector#(6, Bit#(40)) chunks;
+  // The 48-bit record chunks
+  Vector#(5, Bit#(48)) chunks;
 } RoutingBeat deriving (Bits, FShow);
 
 // 32-bit routing key
@@ -53,11 +53,11 @@ function RoutingKey getRoutingKey(NetAddr addr) =
 // =============================================================================
 
 typedef enum {
-  URM1 = 3'd0, // 40-bit Unicast Router-to-Mailbox
-  URM2 = 3'd1, // 80-bit Unicast Router-to-Mailbox
-  RR   = 3'd2, // 40-bit Router-to-Router
-  MRM  = 3'd3, // 80-bit Multicast Router-to-Mailbox
-  IND  = 3'd4  // 40-bit Indirection
+  URM1 = 3'd0, // 48-bit Unicast Router-to-Mailbox
+  URM2 = 3'd1, // 96-bit Unicast Router-to-Mailbox
+  RR   = 3'd2, // 48-bit Router-to-Router
+  MRM  = 3'd3, // 96-bit Multicast Router-to-Mailbox
+  IND  = 3'd4  // 48-bit Indirection
 } RoutingRecordTag deriving (Bits, Eq, FShow);
 
 typedef enum {
@@ -67,7 +67,7 @@ typedef enum {
   WEST  = 2'd3
 } RoutingDir deriving (Bits, Eq);
 
-// 40-bit Unicast Router-to-Mailbox (URM1) record
+// 48-bit Unicast Router-to-Mailbox (URM1) record
 typedef struct {
   // Record type
   RoutingRecordTag tag;
@@ -75,12 +75,14 @@ typedef struct {
   Bit#(4) mbox;
   // Mailbox-local thread identifier
   Bit#(6) thread;
+  // Unused
+  Bit#(3) unused;
   // Local key. The first word of the message
   // payload is overwritten with this.
-  Bit#(27) localKey;
+  Bit#(32) localKey;
 } URM1Record deriving (Bits, FShow);
 
-// 80-bit Unicast Router-to-Mailbox (URM2) record
+// 96-bit Unicast Router-to-Mailbox (URM2) record
 typedef struct {
   // Record type
   RoutingRecordTag tag;
@@ -89,26 +91,26 @@ typedef struct {
   // Mailbox-local thread identifier
   Bit#(6) thread;
   // Currently unused
-  Bit#(3) unused;
+  Bit#(19) unused;
   // Local key. The first two words of the message
   // payload is overwritten with this.
   Bit#(64) localKey;
 } URM2Record deriving (Bits);
 
-// 40-bit Router-to-Router (RR) record
+// 48-bit Router-to-Router (RR) record
 typedef struct {
   // Record type
   RoutingRecordTag tag;
   // Direction (N, S, E, or W)
   RoutingDir dir;
   // Currently unused
-  Bit#(3) unused;
+  Bit#(11) unused;
   // New 32-bit routing key that will replace the one in the
   // current message for the next hop of the message's journey
   Bit#(32) newKey;
 } RRRecord deriving (Bits);
 
-// 80-bit Multicast Router-to-Mailbox (MRM) record
+// 96-bit Multicast Router-to-Mailbox (MRM) record
 typedef struct {
   // Record type
   RoutingRecordTag tag;
@@ -116,16 +118,19 @@ typedef struct {
   Bit#(4) mbox;
   // Currently unused
   Bit#(9) unused;
+  // Local key. The least-significant half-word
+  // of the message is replaced with this
+  Bit#(16) localKey;
   // Mailbox-local destination mask
   Bit#(64) destMask;
 } MRMRecord deriving (Bits);
 
-// 40-bit Indirection (IND) record:
+// 48-bit Indirection (IND) record:
 typedef struct {
   // Record type
   RoutingRecordTag tag;
   // Currently unused
-  Bit#(5) unused;
+  Bit#(13) unused;
   // New 32-bit routing key for new set of records on current router
   Bit#(32) newKey;
 } INDRecord deriving (Bits);
@@ -459,16 +464,16 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
     let beatNum = beatReg.beatNum;
     let info = beatReg.info;
     // Extract tag from next record
-    RoutingRecordTag tag = unpack(truncateLSB(beat.chunks[5]));
+    RoutingRecordTag tag = unpack(truncateLSB(beat.chunks[4]));
     // Is this the first flit of a message?
     Bool firstFlit = emitFlitCount == 0;
     // Modify flit by interpreting routing key
     RoutingDecision decision = ?;
     Flit flit = flitBuffer.dataOut;
     case (tag)
-      // 40-bit Unicast Router-to-Mailbox
+      // 48-bit Unicast Router-to-Mailbox
       URM1: begin
-        URM1Record rec = unpack(beat.chunks[5]);
+        URM1Record rec = unpack(beat.chunks[4]);
         flit.dest.addr.isKey = False;
         flit.dest.addr.mbox.x = unpack(truncate(rec.mbox[1:0]));
         flit.dest.addr.mbox.y = unpack(truncate(rec.mbox[3:2]));
@@ -478,12 +483,12 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
         flit.dest.threads = pack(threadMask);
         // Replace first word of message with local key
         if (firstFlit)
-          flit.payload = {truncateLSB(flit.payload), 5'b0, rec.localKey};
+          flit.payload = {truncateLSB(flit.payload), rec.localKey};
         decision = RouteNoC;
       end
-      // 80-bit Unicast Router-to-Mailbox
+      // 96-bit Unicast Router-to-Mailbox
       URM2: begin
-        URM2Record rec = unpack({beat.chunks[5], beat.chunks[4]});
+        URM2Record rec = unpack({beat.chunks[4], beat.chunks[3]});
         flit.dest.addr.isKey = False;
         flit.dest.addr.mbox.x = unpack(truncate(rec.mbox[1:0]));
         flit.dest.addr.mbox.y = unpack(truncate(rec.mbox[3:2]));
@@ -496,9 +501,9 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
           flit.payload = {truncateLSB(flit.payload), rec.localKey};
         decision = RouteNoC;
       end
-      // 40-bit Router-to-Router
+      // 48-bit Router-to-Router
       RR: begin
-        RRRecord rec = unpack(beat.chunks[5]);
+        RRRecord rec = unpack(beat.chunks[4]);
         case (rec.dir)
           NORTH: begin
             decision = RouteNorth;
@@ -519,18 +524,21 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
         endcase
         flit.dest.threads = {?, rec.newKey};
       end
-      // 80-bit Multicast Router-to-Mailbox
+      // 96-bit Multicast Router-to-Mailbox
       MRM: begin
-        MRMRecord rec = unpack({beat.chunks[5], beat.chunks[4]});
+        MRMRecord rec = unpack({beat.chunks[4], beat.chunks[3]});
         flit.dest.addr.isKey = False;
         flit.dest.addr.mbox.x = unpack(truncate(rec.mbox[1:0]));
         flit.dest.addr.mbox.y = unpack(truncate(rec.mbox[3:2]));
         flit.dest.threads = rec.destMask;
+        // Replace first half-word of message with local key
+        if (firstFlit)
+          flit.payload = {truncateLSB(flit.payload), rec.localKey};
         decision = RouteNoC;
       end
-      // 40-bit Indirection
+      // 48-bit Indirection
       IND: begin
-        INDRecord rec = unpack(beat.chunks[5]);
+        INDRecord rec = unpack(beat.chunks[4]);
         flit.dest.threads = {?, rec.newKey};
         decision = RouteLoop;
       end
@@ -547,12 +555,12 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
       RoutingBeat newBeat = beat;
       Bool doubleChunk = unpack(pack(tag)[0]);
       if (doubleChunk) begin
-        for (Integer i = 5; i > 2; i=i-2) begin
+        for (Integer i = 4; i > 2; i=i-2) begin
           newBeat.chunks[i] = beat.chunks[i-2];
           newBeat.chunks[i-1] = beat.chunks[i-3];
         end
       end else begin
-        for (Integer i = 5; i > 0; i=i-1)
+        for (Integer i = 4; i > 0; i=i-1)
           newBeat.chunks[i] = beat.chunks[i-1];
       end
       beatReg <= NumberedRoutingBeat {

From 4928d5ab05865b9f89a8cd9b1cff51235c612f0c Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Tue, 7 Apr 2020 08:37:40 +0100
Subject: [PATCH 23/78] Forward port recent changes to POLite app suite

(From 0.6.3)
---
 apps/POLite/asp-pc/Makefile                   |   9 +-
 apps/POLite/asp-pc/asp-push.cpp               | 180 ++++++++++++++++++
 apps/POLite/asp-sync/Run.cpp                  |   3 +-
 apps/POLite/heat-gals/Heat.h                  |  10 +-
 apps/POLite/heat-gals/Makefile                |   4 +-
 apps/POLite/heat-gals/Run.cpp                 | 109 +++++------
 .../{heat-gals => heat-grid-sync}/Colours.cpp |   0
 .../{heat-gals => heat-grid-sync}/Colours.h   |   0
 apps/POLite/heat-grid-sync/Heat.cpp           |  23 +++
 apps/POLite/heat-grid-sync/Heat.h             |  71 +++++++
 apps/POLite/heat-grid-sync/Makefile           |   7 +
 apps/POLite/heat-grid-sync/Run.cpp            | 119 ++++++++++++
 apps/POLite/heat-pc/.asp.cpp.swp              | Bin 0 -> 16384 bytes
 apps/POLite/heat-pc/Makefile                  |  11 ++
 apps/POLite/heat-pc/heat.cpp                  |  63 ++++++
 apps/POLite/heat-sync/Colours.cpp             |  71 -------
 apps/POLite/heat-sync/Colours.h               |  10 -
 apps/POLite/heat-sync/Heat.h                  |  21 +-
 apps/POLite/heat-sync/Makefile                |   4 +-
 apps/POLite/heat-sync/Run.cpp                 | 111 +++++------
 apps/POLite/izhikevich-gals/Izhikevich.cpp    |  23 +++
 apps/POLite/izhikevich-gals/Izhikevich.h      | 115 +++++++++++
 apps/POLite/izhikevich-gals/Makefile          |   6 +
 apps/POLite/izhikevich-gals/RNG.h             |  23 +++
 apps/POLite/izhikevich-gals/Run.cpp           | 130 +++++++++++++
 apps/POLite/izhikevich-pc/Izhikevich.cpp      | 139 ++++++++++++++
 apps/POLite/izhikevich-pc/Makefile            |   6 +
 apps/POLite/izhikevich-pc/RNG.h               |  27 +++
 apps/POLite/izhikevich-sync/Izhikevich.cpp    |  23 +++
 apps/POLite/izhikevich-sync/Izhikevich.h      |  72 +++++++
 apps/POLite/izhikevich-sync/Makefile          |   6 +
 apps/POLite/izhikevich-sync/RNG.h             |  23 +++
 apps/POLite/izhikevich-sync/Run.cpp           | 117 ++++++++++++
 apps/POLite/pagerank-sync/Run.cpp             |   1 +
 apps/POLite/sssp-async/Run.cpp                |   1 +
 apps/POLite/sssp-pc/.asp.cpp.swp              | Bin 0 -> 16384 bytes
 apps/POLite/sssp-pc/Makefile                  |  11 ++
 apps/POLite/sssp-pc/sssp.cpp                  |  92 +++++++++
 apps/POLite/sssp-sync/Run.cpp                 |   1 +
 apps/POLite/util/sumstats.awk                 |   2 +-
 40 files changed, 1412 insertions(+), 232 deletions(-)
 create mode 100644 apps/POLite/asp-pc/asp-push.cpp
 rename apps/POLite/{heat-gals => heat-grid-sync}/Colours.cpp (100%)
 rename apps/POLite/{heat-gals => heat-grid-sync}/Colours.h (100%)
 create mode 100644 apps/POLite/heat-grid-sync/Heat.cpp
 create mode 100644 apps/POLite/heat-grid-sync/Heat.h
 create mode 100644 apps/POLite/heat-grid-sync/Makefile
 create mode 100644 apps/POLite/heat-grid-sync/Run.cpp
 create mode 100644 apps/POLite/heat-pc/.asp.cpp.swp
 create mode 100644 apps/POLite/heat-pc/Makefile
 create mode 100644 apps/POLite/heat-pc/heat.cpp
 delete mode 100644 apps/POLite/heat-sync/Colours.cpp
 delete mode 100644 apps/POLite/heat-sync/Colours.h
 create mode 100644 apps/POLite/izhikevich-gals/Izhikevich.cpp
 create mode 100644 apps/POLite/izhikevich-gals/Izhikevich.h
 create mode 100644 apps/POLite/izhikevich-gals/Makefile
 create mode 100644 apps/POLite/izhikevich-gals/RNG.h
 create mode 100644 apps/POLite/izhikevich-gals/Run.cpp
 create mode 100644 apps/POLite/izhikevich-pc/Izhikevich.cpp
 create mode 100644 apps/POLite/izhikevich-pc/Makefile
 create mode 100644 apps/POLite/izhikevich-pc/RNG.h
 create mode 100644 apps/POLite/izhikevich-sync/Izhikevich.cpp
 create mode 100644 apps/POLite/izhikevich-sync/Izhikevich.h
 create mode 100644 apps/POLite/izhikevich-sync/Makefile
 create mode 100644 apps/POLite/izhikevich-sync/RNG.h
 create mode 100644 apps/POLite/izhikevich-sync/Run.cpp
 create mode 100644 apps/POLite/sssp-pc/.asp.cpp.swp
 create mode 100644 apps/POLite/sssp-pc/Makefile
 create mode 100644 apps/POLite/sssp-pc/sssp.cpp

diff --git a/apps/POLite/asp-pc/Makefile b/apps/POLite/asp-pc/Makefile
index 0cf7448f..bf9439f3 100644
--- a/apps/POLite/asp-pc/Makefile
+++ b/apps/POLite/asp-pc/Makefile
@@ -1,10 +1,10 @@
 # SPDX-License-Identifier: BSD-2-Clause
-all: asp GenHypercube GenTree GenGeoGraph
+all: asp GenHypercube GenTree
 
 INC=../../../../include
 
 asp: asp.cpp
-	g++ -fopenmp -D_DEFAULT_SOURCE -I$(INC) -O3 asp.cpp -o asp
+	g++ -I$(INC) -O3 asp.cpp -o asp
 
 GenHypercube: GenHypercube.hs
 	ghc -O2 --make GenHypercube.hs
@@ -12,8 +12,5 @@ GenHypercube: GenHypercube.hs
 GenTree: GenTree.hs
 	ghc -O2 --make GenTree.hs
 
-GenGeoGraph: GenGeoGraph.cpp
-	g++ -O2 -lstdc++ GenGeoGraph.cpp -o GenGeoGraph
-
 clean:
-	rm -f asp GenHypercube GenTree GenGeoGraph *.hi *.o
+	rm -f asp GenHypercube GenTree *.hi *.o
diff --git a/apps/POLite/asp-pc/asp-push.cpp b/apps/POLite/asp-pc/asp-push.cpp
new file mode 100644
index 00000000..a75f6628
--- /dev/null
+++ b/apps/POLite/asp-pc/asp-push.cpp
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: BSD-2-Clause
+#include "RandomSet.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+#include <sys/time.h>
+
+// Number of nodes and edges
+uint32_t numNodes;
+uint32_t numEdges;
+
+// Mapping from node id to array of neighbouring node ids
+// First element of each array holds the number of neighbours
+uint32_t** neighbours;
+
+// Mapping from node id to bit vector of reaching nodes
+uint64_t** reaching;
+uint64_t** reachingNext;
+
+// Number of 64-bit words in reaching vector
+const uint64_t vectorSize = 1;
+
+void readGraph(const char* filename, bool undirected)
+{
+  // Read edges
+  FILE* fp = fopen(filename, "rt");
+  if (fp == NULL) {
+    fprintf(stderr, "Can't open '%s'\n", filename);
+    exit(EXIT_FAILURE);
+  }
+
+  // Note: the direction of the edges is reversed here (each pair is
+  // read as "dst src").  NOTE(review): ssp() below pushes a node's
+  // reaching set to its neighbours rather than pulling -- confirm.
+
+  // Count number of nodes and edges
+  numEdges = 0;
+  numNodes = 0;
+  int ret;
+  while (1) {
+    uint32_t src, dst;
+    ret = fscanf(fp, "%d %d", &dst, &src);
+    if (ret == EOF) break;
+    numEdges++;
+    numNodes = src >= numNodes ? src+1 : numNodes;
+    numNodes = dst >= numNodes ? dst+1 : numNodes;
+  }
+  rewind(fp);
+
+  // Create mapping from node id to number of neighbours
+  uint32_t* count = (uint32_t*) calloc(numNodes, sizeof(uint32_t));
+  for (int i = 0; i < numEdges; i++) {
+    uint32_t src, dst;
+    ret = fscanf(fp, "%d %d", &dst, &src);
+    count[src]++;
+    if (undirected) count[dst]++;
+  }
+
+  // Create mapping from node id to neighbours
+  neighbours = (uint32_t**) calloc(numNodes, sizeof(uint32_t*));
+  rewind(fp);
+  for (int i = 0; i < numNodes; i++) {
+    neighbours[i] = (uint32_t*) calloc(count[i]+1, sizeof(uint32_t));
+    neighbours[i][0] = count[i];
+  }
+  for (int i = 0; i < numEdges; i++) {
+    uint32_t src, dst;
+    ret = fscanf(fp, "%d %d", &dst, &src);
+    neighbours[src][count[src]--] = dst;
+    if (undirected) neighbours[dst][count[dst]--] = src;
+  }
+
+  // Create mapping from node id to bit vector of reaching nodes
+  reaching = (uint64_t**) calloc(numNodes, sizeof(uint64_t*));
+  reachingNext = (uint64_t**) calloc(numNodes, sizeof(uint64_t*));
+  for (int i = 0; i < numNodes; i++) {
+    reaching[i] = (uint64_t*) calloc(vectorSize, sizeof(uint64_t));
+    reachingNext[i] = (uint64_t*) calloc(vectorSize, sizeof(uint64_t));
+  }
+
+  // Release
+  free(count);
+  fclose(fp);
+}
+
+// Compute sum of all shortest paths from given sources
+uint64_t ssp(uint32_t numSources, uint32_t* sources)
+{
+  // Sum of distances
+  uint64_t sum = 0;
+
+  // Initialise reaching vector for each node
+  for (int i = 0; i < numNodes; i++) {
+    for (int j = 0; j < vectorSize; j++) {
+      reaching[i][j] = 0;
+      reachingNext[i][j] = 0;
+    }
+  }
+  for (int i = 0; i < numSources; i++) {
+    uint32_t src = sources[i];
+    reaching[src][i/64] |= 1ul << (i%64);
+  }
+
+  int* queue = new int [numNodes];
+  int queueSize = 0;
+  for (int i = 0; i < numNodes; i++) queue[queueSize++] = i;
+
+  // Distance increases on each iteration
+  uint32_t dist = 1;
+
+  while (queueSize > 0) {
+    // For each node in the queue
+    for (int i = 0; i < queueSize; i++) {
+      int me = queue[i];
+      // For each neighbour
+      uint32_t numNeighbours = neighbours[me][0];
+      for (int j = 1; j <= numNeighbours; j++) {
+        uint32_t n = neighbours[me][j];
+        // For each chunk
+        for (int k = 0; k < vectorSize; k++) {
+          if (reaching[me][k] & ~reachingNext[n][k])
+            reachingNext[n][k] |= reaching[me][k];
+        }
+      }
+    }
+
+    // For each node, update reaching vector
+    queueSize = 0;
+    for (int i = 0; i < numNodes; i++) {
+      for (int k = 0; k < vectorSize; k++) {
+        uint64_t diff = reachingNext[i][k] & ~reaching[i][k];
+        if (diff) {
+          queue[queueSize++] = i;
+          uint32_t n = __builtin_popcountll(diff);
+          sum += n * dist;
+          reaching[i][k] |= reachingNext[i][k];
+        }
+      }
+    }
+    dist++;
+  }
+
+  return sum;
+}
+
+int main(int argc, char**argv)
+{
+  if (argc != 2) {
+    printf("Specify edges file\n");
+    exit(EXIT_FAILURE);
+  }
+  bool undirected = false;
+  readGraph(argv[1], undirected);
+  printf("Nodes: %u.  Edges: %u\n", numNodes, numEdges);
+
+  uint32_t numSources = 64*vectorSize;
+  assert(numSources < numNodes);
+  uint32_t sources[numSources];
+  for (int i = 0; i < numSources; i++) sources[i] = i;
+  //randomSet(numSources, sources, numNodes);
+
+  struct timeval start, finish, diff;
+
+  uint64_t sum = 0;
+  const int nodesPerVector = 64 * vectorSize;
+  gettimeofday(&start, NULL);
+  sum = ssp(numSources, sources);
+  gettimeofday(&finish, NULL);
+
+  printf("Sum of subset of shortest paths = %lu\n", sum);
+ 
+  timersub(&finish, &start, &diff);
+  double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
+  printf("Time = %lf\n", duration);
+
+  return 0;
+}
diff --git a/apps/POLite/asp-sync/Run.cpp b/apps/POLite/asp-sync/Run.cpp
index 25082646..3264d114 100644
--- a/apps/POLite/asp-sync/Run.cpp
+++ b/apps/POLite/asp-sync/Run.cpp
@@ -19,9 +19,10 @@ int main(int argc, char**argv)
   // Read network
   EdgeList net;
   net.read(argv[1]);
-
+  
   // Print max fan-out
   printf("Max fan-out = %d\n", net.maxFanOut());
+  assert(net.minFanOut() > 0);
 
   // Check that parameters make sense
   assert(32*N <= net.numNodes);
diff --git a/apps/POLite/heat-gals/Heat.h b/apps/POLite/heat-gals/Heat.h
index 12ca9574..600b4d00 100644
--- a/apps/POLite/heat-gals/Heat.h
+++ b/apps/POLite/heat-gals/Heat.h
@@ -2,6 +2,8 @@
 #ifndef _HEAT_H_
 #define _HEAT_H_
 
+#define POLITE_DUMP_STATS
+#define POLITE_COUNT_MSGS
 #include <POLite.h>
 
 struct HeatMessage {
@@ -10,7 +12,7 @@ struct HeatMessage {
   // Time step
   uint32_t time;
   // Temperature at sender
-  uint32_t val;
+  float val;
 };
 
 struct HeatState {
@@ -21,9 +23,9 @@ struct HeatState {
   // Current time step of device
   uint32_t time;
   // Current temperature of device
-  uint32_t val;
+  float val;
   // Accumulator for temperatures received at times t and t+1
-  uint32_t acc, accNext;
+  float acc, accNext;
   // Count messages sent and received
   uint8_t sent, received, receivedNext;
   // Is the temperature of this device constant?
@@ -45,7 +47,7 @@ struct HeatDevice : PDevice<HeatState, None, HeatMessage> {
     // Proceed to next time step?
     if (s->sent && s->received == s->fanIn) {
       s->time--;
-      if (!s->isConstant) s->val = s->acc >> 2;
+      if (!s->isConstant) s->val = s->acc / (float) s->fanIn;
       s->acc = s->accNext;
       s->received = s->receivedNext;
       s->accNext = s->receivedNext = 0;
diff --git a/apps/POLite/heat-gals/Makefile b/apps/POLite/heat-gals/Makefile
index 0c343edd..86430b66 100644
--- a/apps/POLite/heat-gals/Makefile
+++ b/apps/POLite/heat-gals/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: BSD-2-Clause
 APP_CPP = Heat.cpp
 APP_HDR = Heat.h
-RUN_CPP = Run.cpp Colours.cpp
-RUN_H = Colours.h
+RUN_CPP = Run.cpp
+RUN_H =
 
 include ../util/polite.mk
diff --git a/apps/POLite/heat-gals/Run.cpp b/apps/POLite/heat-gals/Run.cpp
index 0a08505b..eacf449f 100644
--- a/apps/POLite/heat-gals/Run.cpp
+++ b/apps/POLite/heat-gals/Run.cpp
@@ -1,17 +1,31 @@
 // SPDX-License-Identifier: BSD-2-Clause
 #include "Heat.h"
-#include "Colours.h"
 
 #include <HostLink.h>
 #include <POLite.h>
+#include <EdgeList.h>
 #include <sys/time.h>
 
-int main()
+int main(int argc, char **argv)
 {
   // Parameters
-  const uint32_t width  = 256;
-  const uint32_t height = 256;
-  const uint32_t time   = 1000;
+  const uint32_t time = 1000;
+
+  // Check that an edge list file was given on the command line
+  if (argc != 2) {
+    printf("Specify edge file\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Load in the edge list file
+  printf("Loading in the graph..."); fflush(stdout);
+  EdgeList net;
+  net.read(argv[1]);
+  printf(" done\n");
+
+  // Print min and max fan-out
+  printf("Min fan-out = %d\n", net.minFanOut());
+  printf("Max fan-out = %d\n", net.maxFanOut());
 
   // Connection to tinsel machine
   HostLink hostLink;
@@ -19,58 +33,32 @@ int main()
   // Create POETS graph
   PGraph<HeatDevice, HeatState, None, HeatMessage> graph;
 
-  // Create 2D mesh of devices
-  PDeviceId **mesh = new PDeviceId* [height];
-  for (uint32_t y = 0; y < height; y++) {
-    mesh[y] = new PDeviceId [width];
-    for (uint32_t x = 0; x < width; x++)
-      mesh[y][x] = graph.newDevice();
+  // Create nodes in POETS graph
+  for (uint32_t i = 0; i < net.numNodes; i++) {
+    PDeviceId id = graph.newDevice();
+    assert(i == id);
   }
 
-  // Add edges
-  for (uint32_t y = 0; y < height; y++)
-    for (uint32_t x = 0; x < width; x++) {
-      if (x < width-1) {
-        graph.addEdge(mesh[y][x],   0, mesh[y][x+1]);
-        graph.addEdge(mesh[y][x+1], 0, mesh[y][x]);
-      }
-      if (y < height-1) {
-        graph.addEdge(mesh[y][x],   0, mesh[y+1][x]);
-        graph.addEdge(mesh[y+1][x], 0, mesh[y][x]);
-      }
-    }
+  // Create connections in POETS graph
+  for (uint32_t i = 0; i < net.numNodes; i++) {
+    uint32_t numNeighbours = net.neighbours[i][0];
+    for (uint32_t j = 0; j < numNeighbours; j++)
+      graph.addEdge(i, 0, net.neighbours[i][j+1]);
+  }
 
   // Prepare mapping from graph to hardware
   graph.map();
 
-  // Set device ids
-  for (uint32_t y = 0; y < height; y++)
-    for (uint32_t x = 0; x < width; x++)
-      graph.devices[mesh[y][x]]->state.id = mesh[y][x];
-
-  // Initialise time and fanIn fields
+  // Initialise each device's state, using a pseudo-random start value
+  srand(1);
   for (PDeviceId i = 0; i < graph.numDevices; i++) {
+    int r = rand() % 255;
+    graph.devices[i]->state.id = i;
     graph.devices[i]->state.time = time;
+    graph.devices[i]->state.val = (float) r;
+    graph.devices[i]->state.isConstant = false;
     graph.devices[i]->state.fanIn = graph.fanIn(i);
   }
- 
-  // Apply constant heat at north edge
-  // Apply constant cool at south edge
-  for (uint32_t x = 0; x < width; x++) {
-    graph.devices[mesh[0][x]]->state.val = 255 << 16;
-    graph.devices[mesh[0][x]]->state.isConstant = true;
-    graph.devices[mesh[height-1][x]]->state.val = 40 << 16;
-    graph.devices[mesh[height-1][x]]->state.isConstant = true;
-  }
-
-  // Apply constant heat at west edge
-  // Apply constant cool at east edge
-  for (uint32_t y = 0; y < height; y++) {
-    graph.devices[mesh[y][0]]->state.val = 255 << 16;
-    graph.devices[mesh[y][0]]->state.isConstant = true;
-    graph.devices[mesh[y][width-1]]->state.val = 40 << 16;
-    graph.devices[mesh[y][width-1]]->state.isConstant = true;
-  }
 
   // Write graph down to tinsel machine via HostLink
   graph.write(&hostLink);
@@ -84,8 +72,11 @@ int main()
   struct timeval start, finish, diff;
   gettimeofday(&start, NULL);
 
+  // Consume performance stats
+  politeSaveStats(&hostLink, "stats.txt");
+
   // Allocate array to contain final value of each device
-  uint32_t* pixels = new uint32_t [graph.numDevices];
+  float* pixels = new float [graph.numDevices];
 
   // Receive final value of each device
   for (uint32_t i = 0; i < graph.numDevices; i++) {
@@ -97,25 +88,17 @@ int main()
     pixels[msg.payload.from] = msg.payload.val;
   }
 
+  // Display final values of first ten devices
+  for (uint32_t i = 0; i < 10; i++) {
+    if (i < graph.numDevices) {
+      printf("%d: %f\n", i, pixels[i]);
+    }
+  }
+
   // Display time
   timersub(&finish, &start, &diff);
   double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
   printf("Time = %lf\n", duration);
 
-  // Emit image
-  FILE* fp = fopen("out.ppm", "wt");
-  if (fp == NULL) {
-    printf("Can't open output file for writing\n");
-    return -1;
-  }
-  fprintf(fp, "P3\n%d %d\n255\n", width, height);
-  for (uint32_t y = 0; y < height; y++)
-    for (uint32_t x = 0; x < width; x++) {
-      uint32_t val = (pixels[mesh[y][x]] >> 16) & 0xff;
-      fprintf(fp, "%d %d %d\n",
-        colours[val*3], colours[val*3+1], colours[val*3+2]);
-    }
-  fclose(fp);
-
   return 0;
 }
diff --git a/apps/POLite/heat-gals/Colours.cpp b/apps/POLite/heat-grid-sync/Colours.cpp
similarity index 100%
rename from apps/POLite/heat-gals/Colours.cpp
rename to apps/POLite/heat-grid-sync/Colours.cpp
diff --git a/apps/POLite/heat-gals/Colours.h b/apps/POLite/heat-grid-sync/Colours.h
similarity index 100%
rename from apps/POLite/heat-gals/Colours.h
rename to apps/POLite/heat-grid-sync/Colours.h
diff --git a/apps/POLite/heat-grid-sync/Heat.cpp b/apps/POLite/heat-grid-sync/Heat.cpp
new file mode 100644
index 00000000..b2b4fc3e
--- /dev/null
+++ b/apps/POLite/heat-grid-sync/Heat.cpp
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: BSD-2-Clause
+#include "Heat.h"
+
+#include <tinsel.h>
+#include <POLite.h>
+
+typedef PThread<
+          HeatDevice,
+          HeatState,    // State
+          None,         // Edge label
+          HeatMessage   // Message
+        > HeatThread;
+
+int main()
+{
+  // Point thread structure at base of thread's heap
+  HeatThread* thread = (HeatThread*) tinselHeapBaseSRAM();
+  
+  // Invoke interpreter
+  thread->run();
+
+  return 0;
+}
diff --git a/apps/POLite/heat-grid-sync/Heat.h b/apps/POLite/heat-grid-sync/Heat.h
new file mode 100644
index 00000000..b3a63a93
--- /dev/null
+++ b/apps/POLite/heat-grid-sync/Heat.h
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: BSD-2-Clause
+#ifndef _HEAT_H_
+#define _HEAT_H_
+
+#include <POLite.h>
+
+struct HeatMessage {
+  // Sender id
+  uint32_t from;
+  // Time step
+  uint32_t time;
+  // Temperature at sender
+  uint32_t val;
+};
+
+struct HeatState {
+  // Device id
+  uint32_t id;
+  // Current time step of device
+  uint32_t time;
+  // Current temperature of device
+  uint32_t val, acc;
+  // Is the temperature of this device constant?
+  bool isConstant;
+};
+
+struct HeatDevice : PDevice<HeatState, None, HeatMessage> {
+
+  // Called once by POLite at start of execution
+  inline void init() {
+    *readyToSend = Pin(0);
+  }
+
+  // Send handler
+  inline void send(volatile HeatMessage* msg) {
+    msg->from = s->id;
+    msg->time = s->time;
+    msg->val = s->val;
+    *readyToSend = No;
+  }
+
+  // Receive handler
+  inline void recv(HeatMessage* msg, None* edge) {
+    s->acc += msg->val;
+  }
+
+  // Called by POLite when system becomes idle
+  inline bool step() {
+    // Execution complete?
+    if (s->time == 0) {
+      *readyToSend = No;
+      return false;
+    }
+    else {
+      s->time--;
+      if (!s->isConstant) s->val = s->acc >> 2;
+      s->acc = 0;
+      *readyToSend = Pin(0);
+      return true;
+    }
+  }
+
+  // Optionally send message to host on termination
+  inline bool finish(volatile HeatMessage* msg) {
+    msg->from = s->id;
+    msg->val = s->val;
+    return true;
+  }
+};
+
+#endif
diff --git a/apps/POLite/heat-grid-sync/Makefile b/apps/POLite/heat-grid-sync/Makefile
new file mode 100644
index 00000000..0c343edd
--- /dev/null
+++ b/apps/POLite/heat-grid-sync/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: BSD-2-Clause
+APP_CPP = Heat.cpp
+APP_HDR = Heat.h
+RUN_CPP = Run.cpp Colours.cpp
+RUN_H = Colours.h
+
+include ../util/polite.mk
diff --git a/apps/POLite/heat-grid-sync/Run.cpp b/apps/POLite/heat-grid-sync/Run.cpp
new file mode 100644
index 00000000..a938a446
--- /dev/null
+++ b/apps/POLite/heat-grid-sync/Run.cpp
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: BSD-2-Clause
+#include "Heat.h"
+#include "Colours.h"
+
+#include <HostLink.h>
+#include <POLite.h>
+#include <sys/time.h>
+
+int main()
+{
+  // Parameters
+  const uint32_t width  = 256;
+  const uint32_t height = 256;
+  const uint32_t time   = 1000;
+
+  // Connection to tinsel machine
+  HostLink hostLink;
+
+  // Create POETS graph
+  PGraph<HeatDevice, HeatState, None, HeatMessage> graph;
+
+  // Create 2D mesh of devices
+  PDeviceId **mesh = new PDeviceId* [height];
+  for (uint32_t y = 0; y < height; y++) {
+    mesh[y] = new PDeviceId [width];
+    for (uint32_t x = 0; x < width; x++)
+      mesh[y][x] = graph.newDevice();
+  }
+
+  // Add edges
+  for (uint32_t y = 0; y < height; y++)
+    for (uint32_t x = 0; x < width; x++) {
+      if (x < width-1) {
+        graph.addEdge(mesh[y][x],   0, mesh[y][x+1]);
+        graph.addEdge(mesh[y][x+1], 0, mesh[y][x]);
+      }
+      if (y < height-1) {
+        graph.addEdge(mesh[y][x],   0, mesh[y+1][x]);
+        graph.addEdge(mesh[y+1][x], 0, mesh[y][x]);
+      }
+    }
+
+  // Prepare mapping from graph to hardware
+  graph.map();
+
+  // Set device ids
+  for (uint32_t y = 0; y < height; y++)
+    for (uint32_t x = 0; x < width; x++)
+      graph.devices[mesh[y][x]]->state.id = mesh[y][x];
+
+  // Specify number of time steps to run on each device
+  for (PDeviceId i = 0; i < graph.numDevices; i++)
+    graph.devices[i]->state.time = time;
+ 
+  // Apply constant heat at north edge
+  // Apply constant cool at south edge
+  for (uint32_t x = 0; x < width; x++) {
+    graph.devices[mesh[0][x]]->state.val = 255 << 16;
+    graph.devices[mesh[0][x]]->state.isConstant = true;
+    graph.devices[mesh[height-1][x]]->state.val = 40 << 16;
+    graph.devices[mesh[height-1][x]]->state.isConstant = true;
+  }
+
+  // Apply constant heat at west edge
+  // Apply constant cool at east edge
+  for (uint32_t y = 0; y < height; y++) {
+    graph.devices[mesh[y][0]]->state.val = 255 << 16;
+    graph.devices[mesh[y][0]]->state.isConstant = true;
+    graph.devices[mesh[y][width-1]]->state.val = 40 << 16;
+    graph.devices[mesh[y][width-1]]->state.isConstant = true;
+  }
+
+  // Write graph down to tinsel machine via HostLink
+  graph.write(&hostLink);
+
+  // Load code and trigger execution
+  hostLink.boot("code.v", "data.v");
+  hostLink.go();
+  printf("Starting\n");
+
+  // Start timer
+  struct timeval start, finish, diff;
+  gettimeofday(&start, NULL);
+
+  // Allocate array to contain final value of each device
+  uint32_t* pixels = new uint32_t [graph.numDevices];
+
+  // Receive final value of each device
+  for (uint32_t i = 0; i < graph.numDevices; i++) {
+    // Receive message
+    PMessage<HeatMessage> msg;
+    hostLink.recvMsg(&msg, sizeof(msg));
+    if (i == 0) gettimeofday(&finish, NULL);
+    // Save final value
+    pixels[msg.payload.from] = msg.payload.val;
+  }
+
+  // Display time
+  timersub(&finish, &start, &diff);
+  double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
+  printf("Time = %lf\n", duration);
+
+  // Emit image
+  FILE* fp = fopen("out.ppm", "wt");
+  if (fp == NULL) {
+    printf("Can't open output file for writing\n");
+    return -1;
+  }
+  fprintf(fp, "P3\n%d %d\n255\n", width, height);
+  for (uint32_t y = 0; y < height; y++)
+    for (uint32_t x = 0; x < width; x++) {
+      uint32_t val = (pixels[mesh[y][x]] >> 16) & 0xff;
+      fprintf(fp, "%d %d %d\n",
+        colours[val*3], colours[val*3+1], colours[val*3+2]);
+    }
+  fclose(fp);
+
+  return 0;
+}
diff --git a/apps/POLite/heat-pc/.asp.cpp.swp b/apps/POLite/heat-pc/.asp.cpp.swp
new file mode 100644
index 0000000000000000000000000000000000000000..2d2936e2dd2badda5887be7d43054d6c78666e79
GIT binary patch
literal 16384
zcmeI2U2Ggz700KE3#1_}N`Z$;NH?3>*<E{WuVbv%9}Q_@w??iT?4&Ak7Hf9q&hEss
zGt+x#?3$$E0VotIo+=a`5ap>4E#jdnPkgmO3PM6c2oWmLH!4vHsCYoY8~=0feC*oZ
zxTz2VF)RJNo;&yB+;h(TIQMRI{E5ZK*aM^E20kA#jL%+K{#@stsex;zVRZb-`V^(_
z^J#92z~3hiy}qg{9E)6Ea&OEGLpgTt@g+CnW2Ovip+#S#Rv5mAs){Uy1_}+_jt1h$
zsXcJSI5|;QL;ZN^7`ylAh1;p2$XjTj&_JPqLIZ^c3Jnw*C^S%LpwPhoxCWx|fbj+N
z;Xu-ltI78rJH8Jl&ojw4m7!0XPd^oZ3k?(+C^S%LpwK{}fkFd?1_})n8YnbSXrRzQ
zp@IKK4VboJlpxp?9RT3|KaKxy-eVX)1wR3=fNS7OAOsh|0+<J7@G)=z+yQRfZ5Yo1
z0iFU6g8RW?a2I&%h++H~{0Mvnd>O>xGhhxJ0)PFmVf-2V349;;;8E}hD1oEk0Qmc1
z!*~b$8oUIa2Uox%_&7KU2Eak^i@OZtS+EWs1)l(i!8?Zx<8AN<@EVAK15SZ|e#kI>
z2Yv&-2fhh5z{B7WxDyP3>mP(}z-e$F`0WP_;~U^>;5l#wbig`jfQ#UM@a~=HC%6f&
zgEznnU>mG}6W|DVcK~vNx4{p=w}Au!@WFZTAQ%7#!J9a&`5CwlUIj0LuLB><g9`Wu
z?0o~g2%Z5JARqW7*#E<lj}r)1cKwKLusJq9gRfb($*m|5tM2DHo`;7kMw)qSj4dEP
zH?20a+OfZpy>T5@K4!H|zs2n}zfoaU;Nw>!F`vn@#AxN@#CpVhwB&QQ)ouo{kZW#Z
z?NTGnmWgu-nd&p_5}QTlxU-z)M;b(8my>$)TDvptYGFKUp}P;66soeUZC9(pg(|4&
zz~*v>xp}L{Lt(Z$CJQ<t3r%5q9`_hZt6F;MM^PkhwOx-xKg<q%u0<z})01k+K%T50
zwkspmcO;^&vv@Rpi!;}^1UDs@Ebz4j@Ut6XMlStuhf(v|j{SvC&Eq!G`ft$@WEQY-
zV$ehjP$HUo1~3L;thVd7)}*i+Ywp<8WP@FS&0>$u&O(&IsmY4AYSuO6M9UIZw>a6I
zUiXoERcGo_yKYtg&0AmQPe<B=v?fWi{O!&+S!3i~aCUfCkF#f<m1XR&O>)3n=ntk_
z^xeocJy-HfSxIZE*J>2`<RPeubTl=I!6D;L+7nc<il@L~<P+os(ztBoIb~SRR0}^%
zR5p^iQZ-d|#XwbO;H`1QRRgFJPRnFF5D|yFg=W;2%n?C{wcJharv)*wKsLwn0?Erx
zsO;$o&dXL1`;mG?%`4m^$K352iE+>$B`Ec;ikSgT2Ua<sWJZ|8R1-K^rb^;CnTsip
z&0gO|9q-YOX+tD%kF<GH=|r$g*s7K`t>S$aCj+4h*r$VE)nQm^Ymj>|*Je?`nr@VO
z56qV}M>%A!dYXr0-*yG2aBf$+=UH1ujWuPsHTt(!qiFy`QV-H>DwKLa>x?wEbw(7?
z83iX5InpwH2R<>vN`p2Vv`Zsw2zl@qW%d~OBP^+*k4fh3R?jS}2YQ!OSSH3vVl9}S
zJXm-CawnlxCbwK#V7T66JIbU@*GHnA^S<C)u5YKqAeSt)xN0?Tc29@VlBEM@AB*Qr
z^tW$r&*5QhpWJY1%74gW>0DIRChAd3HtPFoMVn&(VSoynT+pQ0tDau#B;J!OTMz}#
z#~#u<2^ui$6r~K;*>t|bW2S>Y<wa<<>Meqly^AR4P1Q|9P^o;5%|E_SVNHzHjepPB
zNj4Jod=G(!sOm1U$CGCz6MC+m)%=esn=+>AQ+hCcn{gU?<YLGbwK-NBnmw?uAT3jZ
z%D~!O@KK%1+zc=&Wm}&ufvt$7Ns4kyv%{Hb+k#6uf>N}3*`g(IlMAd8s+r{=;?tBG
zV{tTDDU3a@gf*!ZpsIEU3dD*8Ge_8f&|BFqRZxFE4*1h<RGxow@%;M2>BXfBEAz>e
z<Aee$sZ*Ab&4s|^bjI{gL@WrgsGS&;CocM!s2ps^d`PLvkhz?Y&CRjp3rkB{vkQw$
z^HoNvSgBF8oR&a^A}XchcZHj&_AVPrB`~%9h%m!8R!AQ%N4qFgl(aX(nnB>PuK5|e
zc9kz%)bmvlZP#aL;a<*6HZ@tJRbVR+w#0rgUj%e1gki}pqFT+~(ULZ^*9%qmWAAIR
z$wqp$YY?jX7F>Z|V?WU0a5lJ)+T@r*nA@x!co3UBq{|7jZ9Vn0pESCB_mHOrGFcnp
zxv4x<v2uIeVVZ2Sk+DOSYT0C6-hUkXYcIxXJ1e&(e&F8WqwR-`Jk^wvi>NnEMsT*)
z&NLfF*p=qF&0T43KuLKUr4`5%bXGZvmq;J0=gvM^TXHRqU)$FfZK%|BTrQ^B!>ea&
z6SXs*iQ!{V{9i?^JAqi3;{Wu1|5u3Re-7RNuY(uBv)~N44-A06AfA66d=Z%76u2Aw
z4Ke%;@DlhA_!hVZo(EqAKInjT&;S<!#r`v(3<dzj{(l6&1TO;to&ui+6>tc=g}DCr
z;0NF}@LeFm1#l946x<8mMBi?LR{;m7!Kc6^Q2pLN;4N|#8YnbSXrRzQp@BjJg$DkQ
zH9*lZ0>@bR6dmFr37SghvF%0NG3YrePO@WtzO3RMJBXV&#ZJ?Zt*nzO2pAn_G}>I3
z++xGKld(y)*chvitIwzkNNIyDWZWzdIj--@Hk}(qrl9j9s*PIgk}Ej|MSh9XCj<gJ
zPA*C0Sc#0|Q<O!~7LTZnz_HEkyb%@RQGl$EStX885i?{br^(T@>ThoaiEvqY9kyYo
zufh;{j*BPMr3utBHOZ>or^;w8jmz@WT@oP@u@y0ra?`|NtnPlQ4$^|^h)9Eiy(AsU
z;u|G%76>IBS<5@84%>AQ(^XN|4M%SUc|p~qY95#NIE*?Yn`VRYC}S$}qW_g4P=R`>
zBvh|d19^aKiq_^@y@BKE{J4@Vvm-=C^N#6Bb&sS^b^3+=sS~e;+;W}m<kW&<ZwRCs
zr%)fwTI|>yo1jSlvOyN?n69s`!06qGj@hduUPYgOCqo4;)6@4sTMrVP&DZLxek~sz
z*3(%K`jXz2<lp(-M0#_vL5G@ZOi=#(J>6a49j;WMGj-FFmmodq5Bc#h&b8oj)NcMF
za|OC>TVHR+u7~@$^)Lw40ioxW({Yy6M9NV~*{x50Po%RX_f_g^Xh@BEIyUK&J@a#_
z2<`5coDH$7X{zpFDspxTdy9+6?A9;6m{KBU0`&AbNzJOg^-WIyyBu`F#yG`sSb1n@
zI=RBj(}t&MQ0ptwL_s*VA%!7;Uq~-Y>9_%&9@=!mpY}hQB>Fi@?h3i#yWIiTtZ<MS
F<6oGQi6sC4

literal 0
HcmV?d00001

diff --git a/apps/POLite/heat-pc/Makefile b/apps/POLite/heat-pc/Makefile
new file mode 100644
index 00000000..235863ef
--- /dev/null
+++ b/apps/POLite/heat-pc/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: BSD-2-Clause
+all: heat
+
+INC=../../../include
+
+heat: heat.cpp
+	g++ -I$(INC) -O3 heat.cpp -o heat
+
+.PHONY: all clean
+clean:  # -f: do not fail when the binary was never built
+	rm -f heat
diff --git a/apps/POLite/heat-pc/heat.cpp b/apps/POLite/heat-pc/heat.cpp
new file mode 100644
index 00000000..194766ac
--- /dev/null
+++ b/apps/POLite/heat-pc/heat.cpp
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: BSD-2-Clause
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+#include <sys/time.h>
+#include <EdgeList.h>
+
+// Single-threaded PC version of the heat benchmark: every node starts at a
+// pseudo-random temperature and, for 100 steps, takes the mean of the
+// temperatures of the nodes in its neighbour list.  Usage: heat <edges file>
+int main(int argc, char**argv)
+{
+  if (argc != 2) {
+    printf("Specify edges file\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Read network
+  EdgeList net;
+  net.read(argv[1]);
+
+  // Create states (fixed seed, so every run starts from the same values)
+  float* heat = new float [net.numNodes];
+  float* heatNext = new float [net.numNodes];
+  srand(1);
+  for (uint32_t i = 0; i < net.numNodes; i++)
+    heat[i] = (float) (rand() % 255);
+
+  // Start timer
+  printf("Started\n");
+  struct timeval start, finish, diff;
+  gettimeofday(&start, NULL);
+
+  for (int t = 0; t < 100; t++) {
+    for (uint32_t i = 0; i < net.numNodes; i++) {
+      // neighbours[i][0] is the count; the neighbour ids follow it
+      uint32_t numNeighbours = net.neighbours[i][0];
+      float acc = 0.0;
+      for (uint32_t j = 0; j < numNeighbours; j++)
+        acc += heat[net.neighbours[i][j+1]];
+      // A node without neighbours keeps its value (0/0 would give NaN)
+      heatNext[i] = numNeighbours ? acc / (float) numNeighbours : heat[i];
+    }
+    float* tmp = heat; heat = heatNext; heatNext = tmp;
+  }
+
+  // Stop timer
+  gettimeofday(&finish, NULL);
+
+  // Display final values of first ten devices
+  for (uint32_t i = 0; i < 10 && i < net.numNodes; i++)
+    printf("%u: %f\n", i, heat[i]);
+
+  // Display time
+  timersub(&finish, &start, &diff);
+  double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
+  printf("Time = %lf\n", duration);
+
+  delete [] heat; delete [] heatNext;
+  return 0;
+}
diff --git a/apps/POLite/heat-sync/Colours.cpp b/apps/POLite/heat-sync/Colours.cpp
deleted file mode 100644
index 93b49740..00000000
--- a/apps/POLite/heat-sync/Colours.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-// SPDX-License-Identifier: BSD-2-Clause
-#include <stdint.h>
-
-// 256 x RGB colours representing heat intensities
-uint8_t colours[] = {
-  0x00, 0x00, 0x76, 0x00, 0x00, 0x7a, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x83,
-  0x00, 0x00, 0x88, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x91, 0x00, 0x00, 0x95,
-  0x00, 0x00, 0x9a, 0x00, 0x00, 0x9e, 0x00, 0x00, 0xa3, 0x00, 0x00, 0xa3,
-  0x00, 0x00, 0xa7, 0x00, 0x00, 0xac, 0x00, 0x00, 0xb0, 0x00, 0x00, 0xb5,
-  0x00, 0x00, 0xb9, 0x00, 0x00, 0xbe, 0x00, 0x00, 0xc2, 0x00, 0x00, 0xc7,
-  0x00, 0x00, 0xcb, 0x00, 0x00, 0xd0, 0x00, 0x00, 0xd4, 0x00, 0x00, 0xd9,
-  0x00, 0x00, 0xde, 0x00, 0x00, 0xe2, 0x00, 0x00, 0xe7, 0x00, 0x00, 0xeb,
-  0x00, 0x00, 0xf0, 0x00, 0x00, 0xf4, 0x00, 0x00, 0xf9, 0x00, 0x00, 0xfd,
-  0x00, 0x03, 0xff, 0x00, 0x07, 0xff, 0x00, 0x0c, 0xff, 0x00, 0x10, 0xff,
-  0x00, 0x15, 0xff, 0x00, 0x19, 0xff, 0x00, 0x1e, 0xff, 0x00, 0x22, 0xff,
-  0x00, 0x27, 0xff, 0x00, 0x2b, 0xff, 0x00, 0x30, 0xff, 0x00, 0x34, 0xff,
-  0x00, 0x39, 0xff, 0x00, 0x3d, 0xff, 0x00, 0x42, 0xff, 0x00, 0x47, 0xff,
-  0x00, 0x4b, 0xff, 0x00, 0x50, 0xff, 0x00, 0x54, 0xff, 0x00, 0x59, 0xff,
-  0x00, 0x5d, 0xff, 0x00, 0x62, 0xff, 0x00, 0x66, 0xff, 0x00, 0x6b, 0xff,
-  0x00, 0x6f, 0xff, 0x00, 0x74, 0xff, 0x00, 0x78, 0xff, 0x00, 0x7d, 0xff,
-  0x00, 0x81, 0xff, 0x00, 0x86, 0xff, 0x00, 0x8a, 0xff, 0x00, 0x8f, 0xff,
-  0x00, 0x93, 0xff, 0x00, 0x98, 0xff, 0x00, 0x9c, 0xff, 0x00, 0xa1, 0xff,
-  0x00, 0xa5, 0xff, 0x00, 0xaa, 0xff, 0x00, 0xaf, 0xff, 0x00, 0xb3, 0xff,
-  0x00, 0xb8, 0xff, 0x00, 0xbc, 0xff, 0x00, 0xc1, 0xff, 0x00, 0xc5, 0xff,
-  0x00, 0xca, 0xff, 0x00, 0xce, 0xff, 0x00, 0xd3, 0xff, 0x00, 0xd7, 0xff,
-  0x00, 0xdc, 0xff, 0x00, 0xe0, 0xff, 0x00, 0xe5, 0xff, 0x00, 0xe9, 0xff,
-  0x00, 0xee, 0xff, 0x00, 0xf2, 0xff, 0x00, 0xf7, 0xff, 0x00, 0xfb, 0xff,
-  0x00, 0xff, 0xff, 0x00, 0xff, 0xfa, 0x00, 0xff, 0xf5, 0x00, 0xff, 0xf1,
-  0x00, 0xff, 0xec, 0x00, 0xff, 0xe7, 0x00, 0xff, 0xe3, 0x00, 0xff, 0xde,
-  0x00, 0xff, 0xda, 0x00, 0xff, 0xd5, 0x00, 0xff, 0xd1, 0x00, 0xff, 0xcc,
-  0x00, 0xff, 0xc8, 0x00, 0xff, 0xc3, 0x00, 0xff, 0xbf, 0x00, 0xff, 0xba,
-  0x00, 0xff, 0xb6, 0x00, 0xff, 0xb1, 0x00, 0xff, 0xad, 0x00, 0xff, 0xa8,
-  0x00, 0xff, 0xa4, 0x00, 0xff, 0x9f, 0x00, 0xff, 0x9b, 0x00, 0xff, 0x96,
-  0x00, 0xff, 0x92, 0x00, 0xff, 0x8d, 0x00, 0xff, 0x89, 0x00, 0xff, 0x84,
-  0x00, 0xff, 0x80, 0x00, 0xff, 0x7b, 0x00, 0xff, 0x76, 0x00, 0xff, 0x72,
-  0x00, 0xff, 0x6d, 0x00, 0xff, 0x69, 0x00, 0xff, 0x64, 0x00, 0xff, 0x60,
-  0x00, 0xff, 0x5b, 0x00, 0xff, 0x57, 0x00, 0xff, 0x52, 0x00, 0xff, 0x4e,
-  0x00, 0xff, 0x49, 0x00, 0xff, 0x45, 0x00, 0xff, 0x40, 0x00, 0xff, 0x3c,
-  0x00, 0xff, 0x37, 0x00, 0xff, 0x33, 0x00, 0xff, 0x2e, 0x00, 0xff, 0x2a,
-  0x00, 0xff, 0x25, 0x00, 0xff, 0x21, 0x00, 0xff, 0x1c, 0x00, 0xff, 0x18,
-  0x00, 0xff, 0x13, 0x00, 0xff, 0x0e, 0x00, 0xff, 0x0a, 0x00, 0xff, 0x05,
-  0x00, 0xff, 0x01, 0x04, 0xff, 0x00, 0x08, 0xff, 0x00, 0x0d, 0xff, 0x00,
-  0x11, 0xff, 0x00, 0x16, 0xff, 0x00, 0x1a, 0xff, 0x00, 0x1f, 0xff, 0x00,
-  0x23, 0xff, 0x00, 0x28, 0xff, 0x00, 0x2c, 0xff, 0x00, 0x31, 0xff, 0x00,
-  0x35, 0xff, 0x00, 0x3a, 0xff, 0x00, 0x3e, 0xff, 0x00, 0x43, 0xff, 0x00,
-  0x47, 0xff, 0x00, 0x4c, 0xff, 0x00, 0x50, 0xff, 0x00, 0x55, 0xff, 0x00,
-  0x5a, 0xff, 0x00, 0x5e, 0xff, 0x00, 0x63, 0xff, 0x00, 0x67, 0xff, 0x00,
-  0x6c, 0xff, 0x00, 0x70, 0xff, 0x00, 0x75, 0xff, 0x00, 0x79, 0xff, 0x00,
-  0x7e, 0xff, 0x00, 0x82, 0xff, 0x00, 0x87, 0xff, 0x00, 0x8b, 0xff, 0x00,
-  0x90, 0xff, 0x00, 0x94, 0xff, 0x00, 0x99, 0xff, 0x00, 0x9d, 0xff, 0x00,
-  0xa2, 0xff, 0x00, 0xa6, 0xff, 0x00, 0xab, 0xff, 0x00, 0xaf, 0xff, 0x00,
-  0xb4, 0xff, 0x00, 0xb8, 0xff, 0x00, 0xbd, 0xff, 0x00, 0xc2, 0xff, 0x00,
-  0xc6, 0xff, 0x00, 0xcb, 0xff, 0x00, 0xcf, 0xff, 0x00, 0xd4, 0xff, 0x00,
-  0xd8, 0xff, 0x00, 0xdd, 0xff, 0x00, 0xe1, 0xff, 0x00, 0xe6, 0xff, 0x00,
-  0xea, 0xff, 0x00, 0xef, 0xff, 0x00, 0xf3, 0xff, 0x00, 0xf8, 0xff, 0x00,
-  0xfc, 0xff, 0x00, 0xff, 0xfd, 0x00, 0xff, 0xf9, 0x00, 0xff, 0xf4, 0x00,
-  0xff, 0xf0, 0x00, 0xff, 0xeb, 0x00, 0xff, 0xe7, 0x00, 0xff, 0xe2, 0x00,
-  0xff, 0xde, 0x00, 0xff, 0xd9, 0x00, 0xff, 0xd5, 0x00, 0xff, 0xd0, 0x00,
-  0xff, 0xcb, 0x00, 0xff, 0xc7, 0x00, 0xff, 0xc2, 0x00, 0xff, 0xbe, 0x00,
-  0xff, 0xb9, 0x00, 0xff, 0xb5, 0x00, 0xff, 0xb0, 0x00, 0xff, 0xac, 0x00,
-  0xff, 0xa7, 0x00, 0xff, 0xa3, 0x00, 0xff, 0x9e, 0x00, 0xff, 0x9a, 0x00,
-  0xff, 0x95, 0x00, 0xff, 0x91, 0x00, 0xff, 0x8c, 0x00, 0xff, 0x88, 0x00,
-  0xff, 0x83, 0x00, 0xff, 0x7f, 0x00, 0xff, 0x7a, 0x00, 0xff, 0x76, 0x00,
-  0xff, 0x71, 0x00, 0xff, 0x6d, 0x00, 0xff, 0x68, 0x00, 0xff, 0x63, 0x00,
-  0xff, 0x5f, 0x00, 0xff, 0x5a, 0x00, 0xff, 0x56, 0x00, 0xff, 0x51, 0x00,
-  0xff, 0x4d, 0x00, 0xff, 0x48, 0x00, 0xff, 0x44, 0x00, 0xff, 0x3f, 0x00,
-  0xff, 0x3b, 0x00, 0xff, 0x36, 0x00, 0xff, 0x32, 0x00, 0xff, 0x2d, 0x00,
-  0xff, 0x29, 0x00, 0xff, 0x24, 0x00, 0xff, 0x20, 0x00, 0xff, 0x1b, 0x00,
-  0xff, 0x17, 0x00, 0xff, 0x12, 0x00, 0xff, 0x0e, 0x00, 0xff, 0x09, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
-};
diff --git a/apps/POLite/heat-sync/Colours.h b/apps/POLite/heat-sync/Colours.h
deleted file mode 100644
index fc34e04c..00000000
--- a/apps/POLite/heat-sync/Colours.h
+++ /dev/null
@@ -1,10 +0,0 @@
-// SPDX-License-Identifier: BSD-2-Clause
-#ifndef _COLOURS_H_
-#define _COLOURS_H_
-
-#include <stdint.h>
-
-// 256 x RGB colours representing heat intensities
-extern uint8_t colours[];
-
-#endif
diff --git a/apps/POLite/heat-sync/Heat.h b/apps/POLite/heat-sync/Heat.h
index b3a63a93..8dc926b3 100644
--- a/apps/POLite/heat-sync/Heat.h
+++ b/apps/POLite/heat-sync/Heat.h
@@ -2,24 +2,26 @@
 #ifndef _HEAT_H_
 #define _HEAT_H_
 
+#define POLITE_DUMP_STATS
+#define POLITE_COUNT_MSGS
 #include <POLite.h>
 
 struct HeatMessage {
   // Sender id
   uint32_t from;
-  // Time step
-  uint32_t time;
   // Temperature at sender
-  uint32_t val;
+  float val;
 };
 
 struct HeatState {
   // Device id
   uint32_t id;
-  // Current time step of device
-  uint32_t time;
   // Current temperature of device
-  uint32_t val, acc;
+  float val, acc;
+  // Time step
+  uint16_t time;
+  // Number of neighbours
+  uint16_t numNeighbours;
   // Is the temperature of this device constant?
   bool isConstant;
 };
@@ -34,7 +36,6 @@ struct HeatDevice : PDevice<HeatState, None, HeatMessage> {
   // Send handler
   inline void send(volatile HeatMessage* msg) {
     msg->from = s->id;
-    msg->time = s->time;
     msg->val = s->val;
     *readyToSend = No;
   }
@@ -42,6 +43,7 @@ struct HeatDevice : PDevice<HeatState, None, HeatMessage> {
   // Receive handler
   inline void recv(HeatMessage* msg, None* edge) {
     s->acc += msg->val;
+    s->numNeighbours++;
   }
 
   // Called by POLite when system becomes idle
@@ -53,8 +55,9 @@ struct HeatDevice : PDevice<HeatState, None, HeatMessage> {
     }
     else {
       s->time--;
-      if (!s->isConstant) s->val = s->acc >> 2;
-      s->acc = 0;
+      if (!s->isConstant) s->val = s->acc / (float) s->numNeighbours;
+      s->acc = 0.0;
+      s->numNeighbours = 0;
       *readyToSend = Pin(0);
       return true;
     }
diff --git a/apps/POLite/heat-sync/Makefile b/apps/POLite/heat-sync/Makefile
index 0c343edd..f44d5b09 100644
--- a/apps/POLite/heat-sync/Makefile
+++ b/apps/POLite/heat-sync/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: BSD-2-Clause
 APP_CPP = Heat.cpp
 APP_HDR = Heat.h
-RUN_CPP = Run.cpp Colours.cpp
-RUN_H = Colours.h
+RUN_CPP = Run.cpp
+RUN_H = 
 
 include ../util/polite.mk
diff --git a/apps/POLite/heat-sync/Run.cpp b/apps/POLite/heat-sync/Run.cpp
index a938a446..c3db2fbf 100644
--- a/apps/POLite/heat-sync/Run.cpp
+++ b/apps/POLite/heat-sync/Run.cpp
@@ -1,17 +1,31 @@
 // SPDX-License-Identifier: BSD-2-Clause
 #include "Heat.h"
-#include "Colours.h"
 
 #include <HostLink.h>
 #include <POLite.h>
+#include <EdgeList.h>
 #include <sys/time.h>
 
-int main()
+int main(int argc, char **argv)
 {
-  // Parameters
-  const uint32_t width  = 256;
-  const uint32_t height = 256;
-  const uint32_t time   = 1000;
+  const uint32_t time = 1000;
+
+  // Read in the example edge list and create data structure
+  if (argc != 2) {
+    printf("Specify edge file\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Load in the edge list file
+  printf("Loading in the graph..."); fflush(stdout);
+  EdgeList net;
+  net.read(argv[1]);
+  printf(" done\n");
+
+  // Print min and max fan-out
+  printf("Min fan-out = %d\n", net.minFanOut());
+  printf("Max fan-out = %d\n", net.maxFanOut());
+  assert(net.minFanOut() > 0);
 
   // Connection to tinsel machine
   HostLink hostLink;
@@ -19,55 +33,31 @@ int main()
   // Create POETS graph
   PGraph<HeatDevice, HeatState, None, HeatMessage> graph;
 
-  // Create 2D mesh of devices
-  PDeviceId **mesh = new PDeviceId* [height];
-  for (uint32_t y = 0; y < height; y++) {
-    mesh[y] = new PDeviceId [width];
-    for (uint32_t x = 0; x < width; x++)
-      mesh[y][x] = graph.newDevice();
+  // Create nodes in POETS graph
+  for (uint32_t i = 0; i < net.numNodes; i++) {
+    PDeviceId id = graph.newDevice();
+    assert(i == id);
   }
 
-  // Add edges
-  for (uint32_t y = 0; y < height; y++)
-    for (uint32_t x = 0; x < width; x++) {
-      if (x < width-1) {
-        graph.addEdge(mesh[y][x],   0, mesh[y][x+1]);
-        graph.addEdge(mesh[y][x+1], 0, mesh[y][x]);
-      }
-      if (y < height-1) {
-        graph.addEdge(mesh[y][x],   0, mesh[y+1][x]);
-        graph.addEdge(mesh[y+1][x], 0, mesh[y][x]);
-      }
-    }
+  // Create connections in POETS graph
+  for (uint32_t i = 0; i < net.numNodes; i++) {
+    uint32_t numNeighbours = net.neighbours[i][0];
+    for (uint32_t j = 0; j < numNeighbours; j++)
+      graph.addEdge(i, 0, net.neighbours[i][j+1]);
+  }
 
   // Prepare mapping from graph to hardware
   graph.map();
 
-  // Set device ids
-  for (uint32_t y = 0; y < height; y++)
-    for (uint32_t x = 0; x < width; x++)
-      graph.devices[mesh[y][x]]->state.id = mesh[y][x];
-
   // Specify number of time steps to run on each device
-  for (PDeviceId i = 0; i < graph.numDevices; i++)
+  srand(1);
+  for (PDeviceId i = 0; i < graph.numDevices; i++) {
+    int r = rand() % 255;
+    graph.devices[i]->state.id = i;
     graph.devices[i]->state.time = time;
- 
-  // Apply constant heat at north edge
-  // Apply constant cool at south edge
-  for (uint32_t x = 0; x < width; x++) {
-    graph.devices[mesh[0][x]]->state.val = 255 << 16;
-    graph.devices[mesh[0][x]]->state.isConstant = true;
-    graph.devices[mesh[height-1][x]]->state.val = 40 << 16;
-    graph.devices[mesh[height-1][x]]->state.isConstant = true;
-  }
-
-  // Apply constant heat at west edge
-  // Apply constant cool at east edge
-  for (uint32_t y = 0; y < height; y++) {
-    graph.devices[mesh[y][0]]->state.val = 255 << 16;
-    graph.devices[mesh[y][0]]->state.isConstant = true;
-    graph.devices[mesh[y][width-1]]->state.val = 40 << 16;
-    graph.devices[mesh[y][width-1]]->state.isConstant = true;
+    graph.devices[i]->state.val = (float) r;
+    graph.devices[i]->state.isConstant = false;
+    //graph.devices[i]->state.fanOut = graph.fanOut(i);
   }
 
   // Write graph down to tinsel machine via HostLink
@@ -82,8 +72,11 @@ int main()
   struct timeval start, finish, diff;
   gettimeofday(&start, NULL);
 
+  // Consume performance stats
+  politeSaveStats(&hostLink, "stats.txt");
+
   // Allocate array to contain final value of each device
-  uint32_t* pixels = new uint32_t [graph.numDevices];
+  float* pixels = new float [graph.numDevices];
 
   // Receive final value of each device
   for (uint32_t i = 0; i < graph.numDevices; i++) {
@@ -95,25 +88,17 @@ int main()
     pixels[msg.payload.from] = msg.payload.val;
   }
 
+  // Display final values of first ten devices
+  for (uint32_t i = 0; i < 10; i++) {
+    if (i < graph.numDevices) {
+      printf("%d: %f\n", i, pixels[i]);
+    }
+  }
+
   // Display time
   timersub(&finish, &start, &diff);
   double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
   printf("Time = %lf\n", duration);
 
-  // Emit image
-  FILE* fp = fopen("out.ppm", "wt");
-  if (fp == NULL) {
-    printf("Can't open output file for writing\n");
-    return -1;
-  }
-  fprintf(fp, "P3\n%d %d\n255\n", width, height);
-  for (uint32_t y = 0; y < height; y++)
-    for (uint32_t x = 0; x < width; x++) {
-      uint32_t val = (pixels[mesh[y][x]] >> 16) & 0xff;
-      fprintf(fp, "%d %d %d\n",
-        colours[val*3], colours[val*3+1], colours[val*3+2]);
-    }
-  fclose(fp);
-
   return 0;
 }
diff --git a/apps/POLite/izhikevich-gals/Izhikevich.cpp b/apps/POLite/izhikevich-gals/Izhikevich.cpp
new file mode 100644
index 00000000..8533062a
--- /dev/null
+++ b/apps/POLite/izhikevich-gals/Izhikevich.cpp
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: BSD-2-Clause
+#include "Izhikevich.h"
+
+#include <tinsel.h>
+#include <POLite.h>
+
+typedef PThread<
+          IzhikevichDevice,
+          IzhikevichState,    // State
+          Weight,             // Edge label
+          IzhikevichMsg       // Message
+        > IzhikevichThread;
+
+int main()
+{
+  // Point thread structure at base of thread's heap
+  IzhikevichThread* thread = (IzhikevichThread*) tinselHeapBaseSRAM();
+  
+  // Invoke interpreter
+  thread->run();
+
+  return 0;
+}
diff --git a/apps/POLite/izhikevich-gals/Izhikevich.h b/apps/POLite/izhikevich-gals/Izhikevich.h
new file mode 100644
index 00000000..701af341
--- /dev/null
+++ b/apps/POLite/izhikevich-gals/Izhikevich.h
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: BSD-2-Clause
+// (Based on code by David Thomas)
+#ifndef _Izhikevich_H_
+#define _Izhikevich_H_
+
+#define POLITE_DUMP_STATS
+#define POLITE_COUNT_MSGS
+#include <POLite.h>
+#include "RNG.h"
+
+// Number of time steps to run for
+#define NUM_STEPS 100
+
+// Vertex state
+struct IzhikevichState {
+  // Random-number-generator state
+  uint32_t rng;
+  // Neuron state
+  float u, v, I, acc, accNext;
+  uint32_t spikeCount;
+  // Protocol
+  bool sent;
+  uint16_t received, receivedNext, fanIn, time;
+  // Neuron properties
+  float a, b, c, d, Ir;
+};
+
+// Edge weight type
+typedef float Weight;
+
+// Message type
+struct IzhikevichMsg {
+  // Did the sender spike or not?
+  bool spike;
+  // Time step of sender
+  uint16_t time;
+  // Number of times sender has spiked
+  uint32_t spikeCount;
+};
+
+// Vertex behaviour
+struct IzhikevichDevice : PDevice<IzhikevichState,Weight,IzhikevichMsg> {
+  // Set initial v, u and input current I, then request a send on pin 0
+  inline void init() {
+    s->v = -65.0f;
+    s->u = s->b * s->v;
+    s->I = s->Ir * grng(s->rng);
+    *readyToSend = Pin(0);
+  }
+
+  // We call this on every state change
+  inline void change() {
+    // Execution complete?
+    if (s->time == NUM_STEPS) return;
+    // Proceed to next time step?
+    if (s->sent && s->received == s->fanIn) {
+      s->time++;
+      s->I += s->acc;
+      s->acc = s->accNext;
+      s->accNext = 0;
+      s->received = s->receivedNext;
+      s->receivedNext = 0;
+      s->sent = false;
+      // NOTE(review): s->time can't be NUM_STEPS+1 here, so always Pin(0)
+      *readyToSend = s->time == (NUM_STEPS+1) ? No : Pin(0);
+    }
+  }
+
+  // Send handler
+  inline void send(volatile IzhikevichMsg* msg) {
+    bool spike = false;
+    float &v = s->v;
+    float &u = s->u;
+    float &I = s->I;
+    v = v+0.5*(0.04*v*v+5*v+140-u+I); // Step 0.5 ms
+    v = v+0.5*(0.04*v*v+5*v+140-u+I); // for numerical
+    u = u + s->a*(s->b*v-u);          // stability
+    if (v >= 30.0) {
+      v = s->c;
+      u += s->d;
+      s->spikeCount++;
+      spike = true;
+    }
+    s->I = s->Ir * grng(s->rng);
+    msg->time = s->time;
+    msg->spike = spike;
+    msg->spikeCount = s->spikeCount;
+    s->sent = true;
+    *readyToSend = No;
+    change();
+  }
+
+  // Receive handler (messages for another time step go to the *Next fields)
+  inline void recv(IzhikevichMsg* msg, Weight* weight) {
+    if (msg->time == s->time) {
+      if (msg->spike) s->acc += *weight;
+      s->received++;
+      change();
+    } else {
+      if (msg->spike) s->accNext += *weight;
+      s->receivedNext++;
+    }
+  }
+
+  // Step handler: unconditionally returns false
+  inline bool step() { return false; }
+
+  // Finish handler: copy final spike count into msg (volatile, as in send)
+  inline bool finish(volatile IzhikevichMsg* msg) {
+    msg->spikeCount = s->spikeCount;
+    return true;
+  }
+};
+
+#endif
diff --git a/apps/POLite/izhikevich-gals/Makefile b/apps/POLite/izhikevich-gals/Makefile
new file mode 100644
index 00000000..5ba3d9e3
--- /dev/null
+++ b/apps/POLite/izhikevich-gals/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: BSD-2-Clause
+APP_CPP = Izhikevich.cpp 
+APP_HDR = Izhikevich.h
+RUN_CPP = Run.cpp
+
+include ../util/polite.mk
diff --git a/apps/POLite/izhikevich-gals/RNG.h b/apps/POLite/izhikevich-gals/RNG.h
new file mode 100644
index 00000000..61b719b3
--- /dev/null
+++ b/apps/POLite/izhikevich-gals/RNG.h
@@ -0,0 +1,23 @@
+#ifndef _RNG_H_
+#define _RNG_H_
+
+inline uint32_t urng(uint32_t &state) {
+  state = state*1664525+1013904223;  // 32-bit LCG step; wraps modulo 2^32
+  return state;
+}
+
+// World's crappiest gaussian (courtesy of dt10!)
+inline float grng(uint32_t &state) {
+  uint32_t u=urng(state);
+  int32_t acc=0;
+  for(unsigned i=0;i<8;i++){  // sum the eight 4-bit nibbles of u
+    acc += u&0xf;
+    u=u>>4;
+  }
+  // a four-bit uniform has mean 7.5 and variance ((15-0+1)^2-1)/12 = 85/4
+  // sum of eight uniforms has mean 8*7.5=60 and variance of 8*85/4=170
+  const float scale=0.07669649888473704; // == 1/sqrt(170)
+  return (acc-60.0f) * scale;
+}
+
+#endif
diff --git a/apps/POLite/izhikevich-gals/Run.cpp b/apps/POLite/izhikevich-gals/Run.cpp
new file mode 100644
index 00000000..43fb3d4d
--- /dev/null
+++ b/apps/POLite/izhikevich-gals/Run.cpp
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: BSD-2-Clause
+#include "Izhikevich.h"
+
+#include <HostLink.h>
+#include <POLite.h>
+
+#include <EdgeList.h>
+#include <assert.h>
+#include <sys/time.h>
+#include <config.h>
+
+inline double urand() { return (double) rand() / RAND_MAX; }
+
+int main(int argc, char**argv)
+{
+  if (argc != 2) {
+    printf("Specify edges file\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Read network
+  EdgeList net;
+  net.read(argv[1]);
+
+  // Connection to tinsel machine
+  HostLink hostLink;
+
+  // Create POETS graph
+  PGraph<IzhikevichDevice, IzhikevichState, Weight, IzhikevichMsg> graph;
+
+  // Create nodes in POETS graph
+  for (uint32_t i = 0; i < net.numNodes; i++) {
+    PDeviceId id = graph.newDevice();
+    assert(i == id);
+  }
+
+  // Fraction of neurons that are excitatory (the rest are inhibitory)
+  double excitatory = 0.8;
+
+  // Mark each neuron as excitatory (or inhibitory)
+  srand(1);
+  bool* excite = new bool [net.numNodes];
+  for (int i = 0; i < net.numNodes; i++)
+    excite[i] = urand() < excitatory;
+
+  // Create connections in POETS graph
+  for (uint32_t i = 0; i < net.numNodes; i++) {
+    uint32_t numNeighbours = net.neighbours[i][0];
+    for (uint32_t j = 0; j < numNeighbours; j++) {
+      float weight = excite[i] ? 0.5 * urand() : -urand();
+      graph.addLabelledEdge(weight, i, 0, net.neighbours[i][j+1]);
+    }
+  }
+
+  // Add zero-weight back-edges for any directed edges
+  // (For GALS synchronisation)
+  for (uint32_t i = 0; i < net.numNodes; i++) {
+    for (uint32_t j = 0; j < net.neighbours[i][0]; j++) {
+      uint32_t n = net.neighbours[i][j+1];
+      // TODO: can be more efficient here
+      bool needBackEdge = true;
+      for (uint32_t k = 0; k < net.neighbours[n][0]; k++)
+        if (net.neighbours[n][k+1] == i) needBackEdge = false;
+      if (needBackEdge) graph.addLabelledEdge(0.0, n, 0, i);
+    }
+  }
+
+  // Prepare mapping from graph to hardware
+  graph.map();
+
+  srand(2);
+  // Initialise devices
+  for (PDeviceId i = 0; i < graph.numDevices; i++) {
+    IzhikevichState* n = &graph.devices[i]->state;
+    n->rng = (int32_t) (urand()*((double) (1<<31)));
+    n->fanIn = graph.fanIn(i);
+    if (excite[i]) {
+      float re = (float) urand();
+      n->a = 0.02;
+      n->b = 0.2;
+      n->c = -65+15*re*re;
+      n->d = 8-6*re*re;
+      n->Ir = 5;
+    }
+    else {
+      float ri = (float) urand();
+      n->a = 0.02+0.08*ri;
+      n->b = 0.25-0.05*ri;
+      n->c = -65;
+      n->d = 2;
+      n->Ir = 2;
+    }
+  }
+
+  // Write graph down to tinsel machine via HostLink
+  graph.write(&hostLink);
+
+  // Load code and trigger execution
+  hostLink.boot("code.v", "data.v");
+  hostLink.go();
+
+  // Timer
+  printf("Started\n");
+  struct timeval start, finish, diff;
+  gettimeofday(&start, NULL);
+
+  // Consume performance stats
+  politeSaveStats(&hostLink, "stats.txt");
+
+  int64_t sum = 0;
+  // Receive final spike count from each vertex
+  for (uint32_t i = 0; i < graph.numDevices; i++) {
+    // Receive message
+    PMessage<IzhikevichMsg> msg;
+    hostLink.recvMsg(&msg, sizeof(msg));
+    if (i == 0) gettimeofday(&finish, NULL);
+    // Accumulate
+    sum += msg.payload.spikeCount;
+  }
+
+  // Emit result
+  printf("Total spikes = %ld\n", sum);
+
+  // Display time
+  timersub(&finish, &start, &diff);
+  double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
+  printf("Time = %lf\n", duration);
+
+  return 0;
+}
diff --git a/apps/POLite/izhikevich-pc/Izhikevich.cpp b/apps/POLite/izhikevich-pc/Izhikevich.cpp
new file mode 100644
index 00000000..b4f03ed5
--- /dev/null
+++ b/apps/POLite/izhikevich-pc/Izhikevich.cpp
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: BSD-2-Clause
+// (Based on code by David Thomas)
+
+#include <EdgeList.h>
+#include <assert.h>
+#include <sys/time.h>
+#include "RNG.h"
+
+#define NUM_STEPS 100
+
+// Neuron 
+struct Neuron {
+  // Random-number-generator state
+  uint32_t rng;
+  // Neuron state
+  float u, v, I, spikeCount;
+  // Neuron properties
+  float a, b, c, d, Ir;
+};
+
+int main(int argc, char**argv)
+{
+  if (argc != 2) {
+    printf("Specify edges file\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Read network
+  EdgeList net;
+  net.read(argv[1]);
+
+  // Fraction of neurons that are excitatory (the rest are inhibitory)
+  double excitatory = 0.8;
+
+  // Mark each neuron as excitatory (or inhibitory)
+  srand(1);
+  bool* excite = new bool [net.numNodes];
+  for (int i = 0; i < net.numNodes; i++) {
+    excite[i] = urand() < excitatory;
+  }
+
+  // Edge weights
+  float** weight = new float* [net.numNodes];
+  for (int i = 0; i < net.numNodes; i++) {
+    uint32_t numEdges = net.neighbours[i][0];
+    weight[i] = new float [numEdges];
+    for (int j = 0; j < numEdges; j++) {
+      weight[i][j] = excite[i] ? 0.5 * urand() : -urand();
+    }
+  }
+
+  // State for each neuron
+  srand(2);
+  Neuron* neuron = new Neuron [net.numNodes];
+  for (int i = 0; i < net.numNodes; i++) {
+    Neuron* n = &neuron[i];
+    n->rng = (int32_t) (urand()*((double) (1<<31)));
+    if (excite[i]) {
+      float re = (float) urand();
+      n->a = 0.02;
+      n->b = 0.2;
+      n->c = -65+15*re*re;
+      n->d = 8-6*re*re;
+      n->Ir = 5;
+    }
+    else {
+      float ri = (float) urand();
+      n->a = 0.02+0.08*ri;
+      n->b = 0.25-0.05*ri;
+      n->c = -65;
+      n->d = 2;
+      n->Ir = 2;
+    }
+  }
+
+  // Spike array
+  bool* spike = new bool [net.numNodes];
+
+  // Initialisation
+  for (int i = 0; i < net.numNodes; i++) {
+    Neuron* n = &neuron[i];
+    n->v = -65.0;
+    n->u = n->b * n->v;
+    n->I = n->Ir * grng(n->rng);
+  }
+
+  // Timer
+  printf("Started\n");
+  struct timeval start, finish, diff;
+  gettimeofday(&start, NULL);
+
+  // Simulation
+  int64_t totalSpikes = 0;
+  for (int t = 0; t <= NUM_STEPS; t++) {
+    // Update state
+    for (int i = 0; i < net.numNodes; i++) {
+      spike[i] = false;
+      Neuron* n = &neuron[i];
+      float &v = n->v;
+      float &u = n->u;
+      float &I = n->I;
+      v = v+0.5*(0.04*v*v+5*v+140-u+I); // Step 0.5 ms
+      v = v+0.5*(0.04*v*v+5*v+140-u+I); // for numerical
+      u = u + n->a*(n->b*v-u);          // stability
+      if (v >= 30.0) {
+        n->v = n->c;
+        n->u += n->d;
+        spike[i] = true;
+      }
+      n->I = n->Ir * grng(n->rng);
+    }
+    // Update I-values
+    uint32_t spikes = 0;
+    for (int i = 0; i < net.numNodes; i++) {
+      Neuron* n = &neuron[i];
+      if (spike[i]) {
+        spikes++;
+        n->spikeCount++;
+        uint32_t numEdges = net.neighbours[i][0];
+        uint32_t* dst = &net.neighbours[i][1];
+        for (int j = 0; j < numEdges; j++) {
+          neuron[dst[j]].I += weight[i][j];
+        }
+      }
+    }
+    //printf("%d: %d\n", t, spikes);
+    totalSpikes += spikes;
+  }
+  gettimeofday(&finish, NULL);
+
+  printf("Total spikes: %ld\n", totalSpikes);
+
+  // Display time
+  timersub(&finish, &start, &diff);
+  double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
+  printf("Time = %lf\n", duration);
+
+  return 0;
+}
diff --git a/apps/POLite/izhikevich-pc/Makefile b/apps/POLite/izhikevich-pc/Makefile
new file mode 100644
index 00000000..52c92c74
--- /dev/null
+++ b/apps/POLite/izhikevich-pc/Makefile
@@ -0,0 +1,6 @@
+Izhikevich: Izhikevich.cpp RNG.h
+	g++ -I../../../include -O2 Izhikevich.cpp -o Izhikevich
+
+.PHONY: clean
+clean:
+	rm Izhikevich
diff --git a/apps/POLite/izhikevich-pc/RNG.h b/apps/POLite/izhikevich-pc/RNG.h
new file mode 100644
index 00000000..decc32f1
--- /dev/null
+++ b/apps/POLite/izhikevich-pc/RNG.h
@@ -0,0 +1,27 @@
+#ifndef _RNG_H_
+#define _RNG_H_
+
+inline uint32_t urng(uint32_t &state) {
+  state = state*1664525+1013904223;
+  return state;
+}
+
+// World's crappiest gaussian (courtesy of dt10!)
+inline float grng(uint32_t &state) {
+  uint32_t u=urng(state);
+  int32_t acc=0;
+  for(unsigned i=0;i<8;i++){
+    acc += u&0xf;
+    u=u>>4;
+  }
+  // a four-bit uniform has mean 7.5 and variance ((15-0+1)^2-1)/12 = 85/4
+  // sum of eight such uniforms has mean 8*7.5=60 and variance of 8*85/4=170
+  const float scale=0.07669649888473704; // == 1/sqrt(170)
+  return (acc-60.0f) * scale;
+}
+
+inline double urand() {
+  return (double) rand() / RAND_MAX;
+}
+
+#endif
diff --git a/apps/POLite/izhikevich-sync/Izhikevich.cpp b/apps/POLite/izhikevich-sync/Izhikevich.cpp
new file mode 100644
index 00000000..8533062a
--- /dev/null
+++ b/apps/POLite/izhikevich-sync/Izhikevich.cpp
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: BSD-2-Clause
+#include "Izhikevich.h"
+
+#include <tinsel.h>
+#include <POLite.h>
+
+typedef PThread<
+          IzhikevichDevice,
+          IzhikevichState,    // State
+          Weight,             // Edge label
+          IzhikevichMsg       // Message
+        > IzhikevichThread;
+
+int main()
+{
+  // Point thread structure at base of thread's heap
+  IzhikevichThread* thread = (IzhikevichThread*) tinselHeapBaseSRAM();
+  
+  // Invoke interpreter
+  thread->run();
+
+  return 0;
+}
diff --git a/apps/POLite/izhikevich-sync/Izhikevich.h b/apps/POLite/izhikevich-sync/Izhikevich.h
new file mode 100644
index 00000000..150a4afa
--- /dev/null
+++ b/apps/POLite/izhikevich-sync/Izhikevich.h
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: BSD-2-Clause
+// (Based on code by David Thomas)
+#ifndef _Izhikevich_H_
+#define _Izhikevich_H_
+
+#define POLITE_DUMP_STATS
+#define POLITE_COUNT_MSGS
+
+#include <POLite.h>
+#include "RNG.h"
+
+// Number of time steps to run for
+#define NUM_STEPS 100
+
+// Vertex state
+struct IzhikevichState {
+  // Random-number-generator state
+  uint32_t rng;
+  // Neuron state
+  float u, v, I;
+  uint32_t spikeCount;
+  // Neuron properties
+  float a, b, c, d, Ir;
+};
+
+// Edge weight type
+typedef float Weight;
+
+// Message type
+struct IzhikevichMsg {
+  // Number of times sender has spiked
+  uint32_t spikeCount;
+};
+
+// Vertex behaviour
+struct IzhikevichDevice : PDevice<IzhikevichState,Weight,IzhikevichMsg> {
+  inline void init() {
+    s->v = -65.0f;
+    s->u = s->b * s->v;
+    s->I = s->Ir * grng(s->rng);
+    *readyToSend = No;
+  }
+  inline void send(IzhikevichMsg* msg) {
+    s->spikeCount++;
+    msg->spikeCount = s->spikeCount;
+    *readyToSend = No;
+  }
+  inline void recv(IzhikevichMsg* msg, Weight* weight) {
+    s->I += *weight;
+  }
+  inline bool step() {
+    float &v = s->v;
+    float &u = s->u;
+    float &I = s->I;
+    v = v+0.5*(0.04*v*v+5*v+140-u+I); // Step 0.5 ms
+    v = v+0.5*(0.04*v*v+5*v+140-u+I); // for numerical
+    u = u + s->a*(s->b*v-u);          // stability
+    if (v >= 30.0) {
+      v = s->c;
+      u += s->d;
+      *readyToSend = Pin(0);
+    }
+    s->I = s->Ir * grng(s->rng);
+    return (time < NUM_STEPS);
+  }
+  inline bool finish(IzhikevichMsg* msg) {
+    msg->spikeCount = s->spikeCount;
+    return true;
+  }
+};
+
+#endif
diff --git a/apps/POLite/izhikevich-sync/Makefile b/apps/POLite/izhikevich-sync/Makefile
new file mode 100644
index 00000000..5ba3d9e3
--- /dev/null
+++ b/apps/POLite/izhikevich-sync/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: BSD-2-Clause
+APP_CPP = Izhikevich.cpp 
+APP_HDR = Izhikevich.h
+RUN_CPP = Run.cpp
+
+include ../util/polite.mk
diff --git a/apps/POLite/izhikevich-sync/RNG.h b/apps/POLite/izhikevich-sync/RNG.h
new file mode 100644
index 00000000..61b719b3
--- /dev/null
+++ b/apps/POLite/izhikevich-sync/RNG.h
@@ -0,0 +1,23 @@
+#ifndef _RNG_H_
+#define _RNG_H_
+
+inline uint32_t urng(uint32_t &state) {
+  state = state*1664525+1013904223;
+  return state;
+}
+
+// World's crappiest gaussian (courtesy of dt10!)
+inline float grng(uint32_t &state) {
+  uint32_t u=urng(state);
+  int32_t acc=0;
+  for(unsigned i=0;i<8;i++){
+    acc += u&0xf;
+    u=u>>4;
+  }
+  // a four-bit uniform has mean 7.5 and variance ((15-0+1)^2-1)/12 = 85/4
+  // sum of eight such uniforms has mean 8*7.5=60 and variance of 8*85/4=170
+  const float scale=0.07669649888473704; // == 1/sqrt(170)
+  return (acc-60.0f) * scale;
+}
+
+#endif
diff --git a/apps/POLite/izhikevich-sync/Run.cpp b/apps/POLite/izhikevich-sync/Run.cpp
new file mode 100644
index 00000000..dd1ac79e
--- /dev/null
+++ b/apps/POLite/izhikevich-sync/Run.cpp
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: BSD-2-Clause
+#include "Izhikevich.h"
+
+#include <HostLink.h>
+#include <POLite.h>
+
+#include <EdgeList.h>
+#include <assert.h>
+#include <sys/time.h>
+#include <config.h>
+
+inline double urand() { return (double) rand() / RAND_MAX; }
+
+int main(int argc, char**argv)
+{
+  if (argc != 2) {
+    printf("Specify edges file\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Read network
+  EdgeList net;
+  net.read(argv[1]);
+  assert(net.minFanOut() > 0);
+
+  // Connection to tinsel machine
+  HostLink hostLink;
+
+  // Create POETS graph
+  PGraph<IzhikevichDevice, IzhikevichState, Weight, IzhikevichMsg> graph;
+
+  // Create nodes in POETS graph
+  for (uint32_t i = 0; i < net.numNodes; i++) {
+    PDeviceId id = graph.newDevice();
+    assert(i == id);
+  }
+
+  // Fraction of neurons that are excitatory (the rest are inhibitory)
+  double excitatory = 0.8;
+
+  // Mark each neuron as excitatory (or inhibitory)
+  srand(1);
+  bool* excite = new bool [net.numNodes];
+  for (int i = 0; i < net.numNodes; i++)
+    excite[i] = urand() < excitatory;
+
+  // Create connections in POETS graph
+  for (uint32_t i = 0; i < net.numNodes; i++) {
+    uint32_t numNeighbours = net.neighbours[i][0];
+    for (uint32_t j = 0; j < numNeighbours; j++) {
+      float weight = excite[i] ? 0.5 * urand() : -urand();
+      graph.addLabelledEdge(weight, i, 0, net.neighbours[i][j+1]);
+    }
+  }
+
+  // Prepare mapping from graph to hardware
+  graph.map();
+
+  srand(2);
+  // Initialise devices
+  for (PDeviceId i = 0; i < graph.numDevices; i++) {
+    IzhikevichState* n = &graph.devices[i]->state;
+    n->rng = (int32_t) (urand()*((double) (1<<31)));
+    if (excite[i]) {
+      float re = (float) urand();
+      n->a = 0.02;
+      n->b = 0.2;
+      n->c = -65+15*re*re;
+      n->d = 8-6*re*re;
+      n->Ir = 5;
+    }
+    else {
+      float ri = (float) urand();
+      n->a = 0.02+0.08*ri;
+      n->b = 0.25-0.05*ri;
+      n->c = -65;
+      n->d = 2;
+      n->Ir = 2;
+    }
+  }
+
+  // Write graph down to tinsel machine via HostLink
+  graph.write(&hostLink);
+
+  // Load code and trigger execution
+  hostLink.boot("code.v", "data.v");
+  hostLink.go();
+
+  // Timer
+  printf("Started\n");
+  struct timeval start, finish, diff;
+  gettimeofday(&start, NULL);
+
+  // Consume performance stats
+  politeSaveStats(&hostLink, "stats.txt");
+
+  int64_t sum = 0;
+  // Receive final spike count from each vertex
+  for (uint32_t i = 0; i < graph.numDevices; i++) {
+    // Receive message
+    PMessage<IzhikevichMsg> msg;
+    hostLink.recvMsg(&msg, sizeof(msg));
+    if (i == 0) gettimeofday(&finish, NULL);
+    // Accumulate
+    sum += msg.payload.spikeCount;
+  }
+
+  // Emit result
+  printf("Total spikes = %ld\n", sum);
+
+  // Display time
+  timersub(&finish, &start, &diff);
+  double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
+  printf("Time = %lf\n", duration);
+
+  return 0;
+}
diff --git a/apps/POLite/pagerank-sync/Run.cpp b/apps/POLite/pagerank-sync/Run.cpp
index 435a0750..3ce786b5 100644
--- a/apps/POLite/pagerank-sync/Run.cpp
+++ b/apps/POLite/pagerank-sync/Run.cpp
@@ -27,6 +27,7 @@ int main(int argc, char **argv)
   EdgeList net;
   net.read(argv[1]);
   printf(" done\n");
+  assert(net.minFanOut() > 0);
 
   // Print max fan-out
   printf("Max fan-out = %d\n", net.maxFanOut());
diff --git a/apps/POLite/sssp-async/Run.cpp b/apps/POLite/sssp-async/Run.cpp
index c7953795..b9c174a3 100644
--- a/apps/POLite/sssp-async/Run.cpp
+++ b/apps/POLite/sssp-async/Run.cpp
@@ -22,6 +22,7 @@ int main(int argc, char**argv)
 
   // Print max fan-out
   printf("Max fan-out = %d\n", net.maxFanOut());
+  assert(net.minFanOut() > 0);
 
   // Connection to tinsel machine
   HostLink hostLink;
diff --git a/apps/POLite/sssp-pc/.asp.cpp.swp b/apps/POLite/sssp-pc/.asp.cpp.swp
new file mode 100644
index 0000000000000000000000000000000000000000..2d2936e2dd2badda5887be7d43054d6c78666e79
GIT binary patch
literal 16384
zcmeI2U2Ggz700KE3#1_}N`Z$;NH?3>*<E{WuVbv%9}Q_@w??iT?4&Ak7Hf9q&hEss
zGt+x#?3$$E0VotIo+=a`5ap>4E#jdnPkgmO3PM6c2oWmLH!4vHsCYoY8~=0feC*oZ
zxTz2VF)RJNo;&yB+;h(TIQMRI{E5ZK*aM^E20kA#jL%+K{#@stsex;zVRZb-`V^(_
z^J#92z~3hiy}qg{9E)6Ea&OEGLpgTt@g+CnW2Ovip+#S#Rv5mAs){Uy1_}+_jt1h$
zsXcJSI5|;QL;ZN^7`ylAh1;p2$XjTj&_JPqLIZ^c3Jnw*C^S%LpwPhoxCWx|fbj+N
z;Xu-ltI78rJH8Jl&ojw4m7!0XPd^oZ3k?(+C^S%LpwK{}fkFd?1_})n8YnbSXrRzQ
zp@IKK4VboJlpxp?9RT3|KaKxy-eVX)1wR3=fNS7OAOsh|0+<J7@G)=z+yQRfZ5Yo1
z0iFU6g8RW?a2I&%h++H~{0Mvnd>O>xGhhxJ0)PFmVf-2V349;;;8E}hD1oEk0Qmc1
z!*~b$8oUIa2Uox%_&7KU2Eak^i@OZtS+EWs1)l(i!8?Zx<8AN<@EVAK15SZ|e#kI>
z2Yv&-2fhh5z{B7WxDyP3>mP(}z-e$F`0WP_;~U^>;5l#wbig`jfQ#UM@a~=HC%6f&
zgEznnU>mG}6W|DVcK~vNx4{p=w}Au!@WFZTAQ%7#!J9a&`5CwlUIj0LuLB><g9`Wu
z?0o~g2%Z5JARqW7*#E<lj}r)1cKwKLusJq9gRfb($*m|5tM2DHo`;7kMw)qSj4dEP
zH?20a+OfZpy>T5@K4!H|zs2n}zfoaU;Nw>!F`vn@#AxN@#CpVhwB&QQ)ouo{kZW#Z
z?NTGnmWgu-nd&p_5}QTlxU-z)M;b(8my>$)TDvptYGFKUp}P;66soeUZC9(pg(|4&
zz~*v>xp}L{Lt(Z$CJQ<t3r%5q9`_hZt6F;MM^PkhwOx-xKg<q%u0<z})01k+K%T50
zwkspmcO;^&vv@Rpi!;}^1UDs@Ebz4j@Ut6XMlStuhf(v|j{SvC&Eq!G`ft$@WEQY-
zV$ehjP$HUo1~3L;thVd7)}*i+Ywp<8WP@FS&0>$u&O(&IsmY4AYSuO6M9UIZw>a6I
zUiXoERcGo_yKYtg&0AmQPe<B=v?fWi{O!&+S!3i~aCUfCkF#f<m1XR&O>)3n=ntk_
z^xeocJy-HfSxIZE*J>2`<RPeubTl=I!6D;L+7nc<il@L~<P+os(ztBoIb~SRR0}^%
zR5p^iQZ-d|#XwbO;H`1QRRgFJPRnFF5D|yFg=W;2%n?C{wcJharv)*wKsLwn0?Erx
zsO;$o&dXL1`;mG?%`4m^$K352iE+>$B`Ec;ikSgT2Ua<sWJZ|8R1-K^rb^;CnTsip
z&0gO|9q-YOX+tD%kF<GH=|r$g*s7K`t>S$aCj+4h*r$VE)nQm^Ymj>|*Je?`nr@VO
z56qV}M>%A!dYXr0-*yG2aBf$+=UH1ujWuPsHTt(!qiFy`QV-H>DwKLa>x?wEbw(7?
z83iX5InpwH2R<>vN`p2Vv`Zsw2zl@qW%d~OBP^+*k4fh3R?jS}2YQ!OSSH3vVl9}S
zJXm-CawnlxCbwK#V7T66JIbU@*GHnA^S<C)u5YKqAeSt)xN0?Tc29@VlBEM@AB*Qr
z^tW$r&*5QhpWJY1%74gW>0DIRChAd3HtPFoMVn&(VSoynT+pQ0tDau#B;J!OTMz}#
z#~#u<2^ui$6r~K;*>t|bW2S>Y<wa<<>Meqly^AR4P1Q|9P^o;5%|E_SVNHzHjepPB
zNj4Jod=G(!sOm1U$CGCz6MC+m)%=esn=+>AQ+hCcn{gU?<YLGbwK-NBnmw?uAT3jZ
z%D~!O@KK%1+zc=&Wm}&ufvt$7Ns4kyv%{Hb+k#6uf>N}3*`g(IlMAd8s+r{=;?tBG
zV{tTDDU3a@gf*!ZpsIEU3dD*8Ge_8f&|BFqRZxFE4*1h<RGxow@%;M2>BXfBEAz>e
z<Aee$sZ*Ab&4s|^bjI{gL@WrgsGS&;CocM!s2ps^d`PLvkhz?Y&CRjp3rkB{vkQw$
z^HoNvSgBF8oR&a^A}XchcZHj&_AVPrB`~%9h%m!8R!AQ%N4qFgl(aX(nnB>PuK5|e
zc9kz%)bmvlZP#aL;a<*6HZ@tJRbVR+w#0rgUj%e1gki}pqFT+~(ULZ^*9%qmWAAIR
z$wqp$YY?jX7F>Z|V?WU0a5lJ)+T@r*nA@x!co3UBq{|7jZ9Vn0pESCB_mHOrGFcnp
zxv4x<v2uIeVVZ2Sk+DOSYT0C6-hUkXYcIxXJ1e&(e&F8WqwR-`Jk^wvi>NnEMsT*)
z&NLfF*p=qF&0T43KuLKUr4`5%bXGZvmq;J0=gvM^TXHRqU)$FfZK%|BTrQ^B!>ea&
z6SXs*iQ!{V{9i?^JAqi3;{Wu1|5u3Re-7RNuY(uBv)~N44-A06AfA66d=Z%76u2Aw
z4Ke%;@DlhA_!hVZo(EqAKInjT&;S<!#r`v(3<dzj{(l6&1TO;to&ui+6>tc=g}DCr
z;0NF}@LeFm1#l946x<8mMBi?LR{;m7!Kc6^Q2pLN;4N|#8YnbSXrRzQp@BjJg$DkQ
zH9*lZ0>@bR6dmFr37SghvF%0NG3YrePO@WtzO3RMJBXV&#ZJ?Zt*nzO2pAn_G}>I3
z++xGKld(y)*chvitIwzkNNIyDWZWzdIj--@Hk}(qrl9j9s*PIgk}Ej|MSh9XCj<gJ
zPA*C0Sc#0|Q<O!~7LTZnz_HEkyb%@RQGl$EStX885i?{br^(T@>ThoaiEvqY9kyYo
zufh;{j*BPMr3utBHOZ>or^;w8jmz@WT@oP@u@y0ra?`|NtnPlQ4$^|^h)9Eiy(AsU
z;u|G%76>IBS<5@84%>AQ(^XN|4M%SUc|p~qY95#NIE*?Yn`VRYC}S$}qW_g4P=R`>
zBvh|d19^aKiq_^@y@BKE{J4@Vvm-=C^N#6Bb&sS^b^3+=sS~e;+;W}m<kW&<ZwRCs
zr%)fwTI|>yo1jSlvOyN?n69s`!06qGj@hduUPYgOCqo4;)6@4sTMrVP&DZLxek~sz
z*3(%K`jXz2<lp(-M0#_vL5G@ZOi=#(J>6a49j;WMGj-FFmmodq5Bc#h&b8oj)NcMF
za|OC>TVHR+u7~@$^)Lw40ioxW({Yy6M9NV~*{x50Po%RX_f_g^Xh@BEIyUK&J@a#_
z2<`5coDH$7X{zpFDspxTdy9+6?A9;6m{KBU0`&AbNzJOg^-WIyyBu`F#yG`sSb1n@
zI=RBj(}t&MQ0ptwL_s*VA%!7;Uq~-Y>9_%&9@=!mpY}hQB>Fi@?h3i#yWIiTtZ<MS
F<6oGQi6sC4

literal 0
HcmV?d00001

diff --git a/apps/POLite/sssp-pc/Makefile b/apps/POLite/sssp-pc/Makefile
new file mode 100644
index 00000000..2ddbeca3
--- /dev/null
+++ b/apps/POLite/sssp-pc/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: BSD-2-Clause
+all: sssp
+
+INC=../../../include
+
+sssp: sssp.cpp
+	g++ -I$(INC) -O3 sssp.cpp -o sssp
+
+.PHONY: clean
+clean:
+	rm sssp
diff --git a/apps/POLite/sssp-pc/sssp.cpp b/apps/POLite/sssp-pc/sssp.cpp
new file mode 100644
index 00000000..9012f49e
--- /dev/null
+++ b/apps/POLite/sssp-pc/sssp.cpp
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: BSD-2-Clause
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+#include <sys/time.h>
+#include <EdgeList.h>
+
+int main(int argc, char**argv)
+{
+  if (argc != 2) {
+    printf("Specify edges file\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Read network
+  EdgeList net;
+  net.read(argv[1]);
+
+  // Create weights
+  srand(1);
+  uint32_t** weights = new uint32_t* [net.numNodes];
+  for (uint32_t i = 0; i < net.numNodes; i++) {
+    uint32_t numNeighbours = net.neighbours[i][0];
+    weights[i] = new uint32_t [numNeighbours];
+    for (uint32_t j = 0; j < numNeighbours; j++) {
+      weights[i][j] = rand() % 100;
+    }
+  }
+
+  // Create states
+  uint32_t* dist = new uint32_t [net.numNodes];
+  int* queue = new int [net.numNodes];
+  int queueSize = 0;
+  int* queueNext = new int [net.numNodes];
+  int queueSizeNext = 0;
+  bool* inQueue = new bool [net.numNodes];
+  for (int i = 0; i < net.numNodes; i++) {
+    inQueue[i] = false;
+    dist[i] = 0x7fffffff;
+  }
+
+  // Set source vertex
+  dist[2] = 0;
+  queue[queueSize++] = 2;
+ 
+  // Start timer
+  printf("Started\n");
+  struct timeval start, finish, diff;
+  gettimeofday(&start, NULL);
+
+  int iters = 0;
+  while (queueSize > 0) {
+    for (int i = 0; i < queueSize; i++) {
+      uint32_t me = queue[i];
+      uint32_t numNeighbours = net.neighbours[me][0];
+      for (uint32_t j = 0; j < numNeighbours; j++) {
+        uint32_t neighbour = net.neighbours[me][j+1];
+        uint32_t newDist = dist[me] + weights[me][j];
+        if (newDist < dist[neighbour]) {
+          dist[neighbour] = newDist;
+          if (!inQueue[neighbour]) {
+            queueNext[queueSizeNext++] = neighbour;
+            inQueue[neighbour] = true;
+          }
+        }
+      }
+    }
+    queueSize = queueSizeNext;
+    queueSizeNext = 0;
+    int32_t* tmp = queue; queue = queueNext; queueNext = tmp;
+    for (int i = 0; i < queueSize; i++) inQueue[queue[i]] = false;
+    iters++;
+  }
+
+  // Stop timer
+  gettimeofday(&finish, NULL);
+
+  uint64_t sum = 0;
+  for (int i = 0; i < net.numNodes; i++)
+    sum += dist[i];
+  printf("Sum of distances = %ld\n", sum);
+  printf("Iterations = %d\n", iters);
+
+  // Display time
+  timersub(&finish, &start, &diff);
+  double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
+  printf("Time = %lf\n", duration);
+
+  return 0;
+}
diff --git a/apps/POLite/sssp-sync/Run.cpp b/apps/POLite/sssp-sync/Run.cpp
index c7953795..b9c174a3 100644
--- a/apps/POLite/sssp-sync/Run.cpp
+++ b/apps/POLite/sssp-sync/Run.cpp
@@ -22,6 +22,7 @@ int main(int argc, char**argv)
 
   // Print max fan-out
   printf("Max fan-out = %d\n", net.maxFanOut());
+  assert(net.minFanOut() > 0);
 
   // Connection to tinsel machine
   HostLink hostLink;
diff --git a/apps/POLite/util/sumstats.awk b/apps/POLite/util/sumstats.awk
index 4d037cca..f1f70329 100755
--- a/apps/POLite/util/sumstats.awk
+++ b/apps/POLite/util/sumstats.awk
@@ -13,7 +13,7 @@ BEGIN {
   intraThreadSendCount = 0;
   interThreadSendCount = 0;
   interBoardSendCount = 0;
-  fmax = 225000000;
+  fmax = 250000000;
   if (boardsX == "" || boardsY == "") {
     boardsX = 3;
     boardsY = 2;

From 8d76d9bbd26e7e7df78b8c51ba37381bee2119cc Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Tue, 7 Apr 2020 08:40:18 +0100
Subject: [PATCH 24/78] Drop some accidentally committed files

---
 apps/POLite/heat-pc/.asp.cpp.swp | Bin 16384 -> 0 bytes
 apps/POLite/sssp-pc/.asp.cpp.swp | Bin 16384 -> 0 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 apps/POLite/heat-pc/.asp.cpp.swp
 delete mode 100644 apps/POLite/sssp-pc/.asp.cpp.swp

diff --git a/apps/POLite/heat-pc/.asp.cpp.swp b/apps/POLite/heat-pc/.asp.cpp.swp
deleted file mode 100644
index 2d2936e2dd2badda5887be7d43054d6c78666e79..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 16384
zcmeI2U2Ggz700KE3#1_}N`Z$;NH?3>*<E{WuVbv%9}Q_@w??iT?4&Ak7Hf9q&hEss
zGt+x#?3$$E0VotIo+=a`5ap>4E#jdnPkgmO3PM6c2oWmLH!4vHsCYoY8~=0feC*oZ
zxTz2VF)RJNo;&yB+;h(TIQMRI{E5ZK*aM^E20kA#jL%+K{#@stsex;zVRZb-`V^(_
z^J#92z~3hiy}qg{9E)6Ea&OEGLpgTt@g+CnW2Ovip+#S#Rv5mAs){Uy1_}+_jt1h$
zsXcJSI5|;QL;ZN^7`ylAh1;p2$XjTj&_JPqLIZ^c3Jnw*C^S%LpwPhoxCWx|fbj+N
z;Xu-ltI78rJH8Jl&ojw4m7!0XPd^oZ3k?(+C^S%LpwK{}fkFd?1_})n8YnbSXrRzQ
zp@IKK4VboJlpxp?9RT3|KaKxy-eVX)1wR3=fNS7OAOsh|0+<J7@G)=z+yQRfZ5Yo1
z0iFU6g8RW?a2I&%h++H~{0Mvnd>O>xGhhxJ0)PFmVf-2V349;;;8E}hD1oEk0Qmc1
z!*~b$8oUIa2Uox%_&7KU2Eak^i@OZtS+EWs1)l(i!8?Zx<8AN<@EVAK15SZ|e#kI>
z2Yv&-2fhh5z{B7WxDyP3>mP(}z-e$F`0WP_;~U^>;5l#wbig`jfQ#UM@a~=HC%6f&
zgEznnU>mG}6W|DVcK~vNx4{p=w}Au!@WFZTAQ%7#!J9a&`5CwlUIj0LuLB><g9`Wu
z?0o~g2%Z5JARqW7*#E<lj}r)1cKwKLusJq9gRfb($*m|5tM2DHo`;7kMw)qSj4dEP
zH?20a+OfZpy>T5@K4!H|zs2n}zfoaU;Nw>!F`vn@#AxN@#CpVhwB&QQ)ouo{kZW#Z
z?NTGnmWgu-nd&p_5}QTlxU-z)M;b(8my>$)TDvptYGFKUp}P;66soeUZC9(pg(|4&
zz~*v>xp}L{Lt(Z$CJQ<t3r%5q9`_hZt6F;MM^PkhwOx-xKg<q%u0<z})01k+K%T50
zwkspmcO;^&vv@Rpi!;}^1UDs@Ebz4j@Ut6XMlStuhf(v|j{SvC&Eq!G`ft$@WEQY-
zV$ehjP$HUo1~3L;thVd7)}*i+Ywp<8WP@FS&0>$u&O(&IsmY4AYSuO6M9UIZw>a6I
zUiXoERcGo_yKYtg&0AmQPe<B=v?fWi{O!&+S!3i~aCUfCkF#f<m1XR&O>)3n=ntk_
z^xeocJy-HfSxIZE*J>2`<RPeubTl=I!6D;L+7nc<il@L~<P+os(ztBoIb~SRR0}^%
zR5p^iQZ-d|#XwbO;H`1QRRgFJPRnFF5D|yFg=W;2%n?C{wcJharv)*wKsLwn0?Erx
zsO;$o&dXL1`;mG?%`4m^$K352iE+>$B`Ec;ikSgT2Ua<sWJZ|8R1-K^rb^;CnTsip
z&0gO|9q-YOX+tD%kF<GH=|r$g*s7K`t>S$aCj+4h*r$VE)nQm^Ymj>|*Je?`nr@VO
z56qV}M>%A!dYXr0-*yG2aBf$+=UH1ujWuPsHTt(!qiFy`QV-H>DwKLa>x?wEbw(7?
z83iX5InpwH2R<>vN`p2Vv`Zsw2zl@qW%d~OBP^+*k4fh3R?jS}2YQ!OSSH3vVl9}S
zJXm-CawnlxCbwK#V7T66JIbU@*GHnA^S<C)u5YKqAeSt)xN0?Tc29@VlBEM@AB*Qr
z^tW$r&*5QhpWJY1%74gW>0DIRChAd3HtPFoMVn&(VSoynT+pQ0tDau#B;J!OTMz}#
z#~#u<2^ui$6r~K;*>t|bW2S>Y<wa<<>Meqly^AR4P1Q|9P^o;5%|E_SVNHzHjepPB
zNj4Jod=G(!sOm1U$CGCz6MC+m)%=esn=+>AQ+hCcn{gU?<YLGbwK-NBnmw?uAT3jZ
z%D~!O@KK%1+zc=&Wm}&ufvt$7Ns4kyv%{Hb+k#6uf>N}3*`g(IlMAd8s+r{=;?tBG
zV{tTDDU3a@gf*!ZpsIEU3dD*8Ge_8f&|BFqRZxFE4*1h<RGxow@%;M2>BXfBEAz>e
z<Aee$sZ*Ab&4s|^bjI{gL@WrgsGS&;CocM!s2ps^d`PLvkhz?Y&CRjp3rkB{vkQw$
z^HoNvSgBF8oR&a^A}XchcZHj&_AVPrB`~%9h%m!8R!AQ%N4qFgl(aX(nnB>PuK5|e
zc9kz%)bmvlZP#aL;a<*6HZ@tJRbVR+w#0rgUj%e1gki}pqFT+~(ULZ^*9%qmWAAIR
z$wqp$YY?jX7F>Z|V?WU0a5lJ)+T@r*nA@x!co3UBq{|7jZ9Vn0pESCB_mHOrGFcnp
zxv4x<v2uIeVVZ2Sk+DOSYT0C6-hUkXYcIxXJ1e&(e&F8WqwR-`Jk^wvi>NnEMsT*)
z&NLfF*p=qF&0T43KuLKUr4`5%bXGZvmq;J0=gvM^TXHRqU)$FfZK%|BTrQ^B!>ea&
z6SXs*iQ!{V{9i?^JAqi3;{Wu1|5u3Re-7RNuY(uBv)~N44-A06AfA66d=Z%76u2Aw
z4Ke%;@DlhA_!hVZo(EqAKInjT&;S<!#r`v(3<dzj{(l6&1TO;to&ui+6>tc=g}DCr
z;0NF}@LeFm1#l946x<8mMBi?LR{;m7!Kc6^Q2pLN;4N|#8YnbSXrRzQp@BjJg$DkQ
zH9*lZ0>@bR6dmFr37SghvF%0NG3YrePO@WtzO3RMJBXV&#ZJ?Zt*nzO2pAn_G}>I3
z++xGKld(y)*chvitIwzkNNIyDWZWzdIj--@Hk}(qrl9j9s*PIgk}Ej|MSh9XCj<gJ
zPA*C0Sc#0|Q<O!~7LTZnz_HEkyb%@RQGl$EStX885i?{br^(T@>ThoaiEvqY9kyYo
zufh;{j*BPMr3utBHOZ>or^;w8jmz@WT@oP@u@y0ra?`|NtnPlQ4$^|^h)9Eiy(AsU
z;u|G%76>IBS<5@84%>AQ(^XN|4M%SUc|p~qY95#NIE*?Yn`VRYC}S$}qW_g4P=R`>
zBvh|d19^aKiq_^@y@BKE{J4@Vvm-=C^N#6Bb&sS^b^3+=sS~e;+;W}m<kW&<ZwRCs
zr%)fwTI|>yo1jSlvOyN?n69s`!06qGj@hduUPYgOCqo4;)6@4sTMrVP&DZLxek~sz
z*3(%K`jXz2<lp(-M0#_vL5G@ZOi=#(J>6a49j;WMGj-FFmmodq5Bc#h&b8oj)NcMF
za|OC>TVHR+u7~@$^)Lw40ioxW({Yy6M9NV~*{x50Po%RX_f_g^Xh@BEIyUK&J@a#_
z2<`5coDH$7X{zpFDspxTdy9+6?A9;6m{KBU0`&AbNzJOg^-WIyyBu`F#yG`sSb1n@
zI=RBj(}t&MQ0ptwL_s*VA%!7;Uq~-Y>9_%&9@=!mpY}hQB>Fi@?h3i#yWIiTtZ<MS
F<6oGQi6sC4

diff --git a/apps/POLite/sssp-pc/.asp.cpp.swp b/apps/POLite/sssp-pc/.asp.cpp.swp
deleted file mode 100644
index 2d2936e2dd2badda5887be7d43054d6c78666e79..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 16384
zcmeI2U2Ggz700KE3#1_}N`Z$;NH?3>*<E{WuVbv%9}Q_@w??iT?4&Ak7Hf9q&hEss
zGt+x#?3$$E0VotIo+=a`5ap>4E#jdnPkgmO3PM6c2oWmLH!4vHsCYoY8~=0feC*oZ
zxTz2VF)RJNo;&yB+;h(TIQMRI{E5ZK*aM^E20kA#jL%+K{#@stsex;zVRZb-`V^(_
z^J#92z~3hiy}qg{9E)6Ea&OEGLpgTt@g+CnW2Ovip+#S#Rv5mAs){Uy1_}+_jt1h$
zsXcJSI5|;QL;ZN^7`ylAh1;p2$XjTj&_JPqLIZ^c3Jnw*C^S%LpwPhoxCWx|fbj+N
z;Xu-ltI78rJH8Jl&ojw4m7!0XPd^oZ3k?(+C^S%LpwK{}fkFd?1_})n8YnbSXrRzQ
zp@IKK4VboJlpxp?9RT3|KaKxy-eVX)1wR3=fNS7OAOsh|0+<J7@G)=z+yQRfZ5Yo1
z0iFU6g8RW?a2I&%h++H~{0Mvnd>O>xGhhxJ0)PFmVf-2V349;;;8E}hD1oEk0Qmc1
z!*~b$8oUIa2Uox%_&7KU2Eak^i@OZtS+EWs1)l(i!8?Zx<8AN<@EVAK15SZ|e#kI>
z2Yv&-2fhh5z{B7WxDyP3>mP(}z-e$F`0WP_;~U^>;5l#wbig`jfQ#UM@a~=HC%6f&
zgEznnU>mG}6W|DVcK~vNx4{p=w}Au!@WFZTAQ%7#!J9a&`5CwlUIj0LuLB><g9`Wu
z?0o~g2%Z5JARqW7*#E<lj}r)1cKwKLusJq9gRfb($*m|5tM2DHo`;7kMw)qSj4dEP
zH?20a+OfZpy>T5@K4!H|zs2n}zfoaU;Nw>!F`vn@#AxN@#CpVhwB&QQ)ouo{kZW#Z
z?NTGnmWgu-nd&p_5}QTlxU-z)M;b(8my>$)TDvptYGFKUp}P;66soeUZC9(pg(|4&
zz~*v>xp}L{Lt(Z$CJQ<t3r%5q9`_hZt6F;MM^PkhwOx-xKg<q%u0<z})01k+K%T50
zwkspmcO;^&vv@Rpi!;}^1UDs@Ebz4j@Ut6XMlStuhf(v|j{SvC&Eq!G`ft$@WEQY-
zV$ehjP$HUo1~3L;thVd7)}*i+Ywp<8WP@FS&0>$u&O(&IsmY4AYSuO6M9UIZw>a6I
zUiXoERcGo_yKYtg&0AmQPe<B=v?fWi{O!&+S!3i~aCUfCkF#f<m1XR&O>)3n=ntk_
z^xeocJy-HfSxIZE*J>2`<RPeubTl=I!6D;L+7nc<il@L~<P+os(ztBoIb~SRR0}^%
zR5p^iQZ-d|#XwbO;H`1QRRgFJPRnFF5D|yFg=W;2%n?C{wcJharv)*wKsLwn0?Erx
zsO;$o&dXL1`;mG?%`4m^$K352iE+>$B`Ec;ikSgT2Ua<sWJZ|8R1-K^rb^;CnTsip
z&0gO|9q-YOX+tD%kF<GH=|r$g*s7K`t>S$aCj+4h*r$VE)nQm^Ymj>|*Je?`nr@VO
z56qV}M>%A!dYXr0-*yG2aBf$+=UH1ujWuPsHTt(!qiFy`QV-H>DwKLa>x?wEbw(7?
z83iX5InpwH2R<>vN`p2Vv`Zsw2zl@qW%d~OBP^+*k4fh3R?jS}2YQ!OSSH3vVl9}S
zJXm-CawnlxCbwK#V7T66JIbU@*GHnA^S<C)u5YKqAeSt)xN0?Tc29@VlBEM@AB*Qr
z^tW$r&*5QhpWJY1%74gW>0DIRChAd3HtPFoMVn&(VSoynT+pQ0tDau#B;J!OTMz}#
z#~#u<2^ui$6r~K;*>t|bW2S>Y<wa<<>Meqly^AR4P1Q|9P^o;5%|E_SVNHzHjepPB
zNj4Jod=G(!sOm1U$CGCz6MC+m)%=esn=+>AQ+hCcn{gU?<YLGbwK-NBnmw?uAT3jZ
z%D~!O@KK%1+zc=&Wm}&ufvt$7Ns4kyv%{Hb+k#6uf>N}3*`g(IlMAd8s+r{=;?tBG
zV{tTDDU3a@gf*!ZpsIEU3dD*8Ge_8f&|BFqRZxFE4*1h<RGxow@%;M2>BXfBEAz>e
z<Aee$sZ*Ab&4s|^bjI{gL@WrgsGS&;CocM!s2ps^d`PLvkhz?Y&CRjp3rkB{vkQw$
z^HoNvSgBF8oR&a^A}XchcZHj&_AVPrB`~%9h%m!8R!AQ%N4qFgl(aX(nnB>PuK5|e
zc9kz%)bmvlZP#aL;a<*6HZ@tJRbVR+w#0rgUj%e1gki}pqFT+~(ULZ^*9%qmWAAIR
z$wqp$YY?jX7F>Z|V?WU0a5lJ)+T@r*nA@x!co3UBq{|7jZ9Vn0pESCB_mHOrGFcnp
zxv4x<v2uIeVVZ2Sk+DOSYT0C6-hUkXYcIxXJ1e&(e&F8WqwR-`Jk^wvi>NnEMsT*)
z&NLfF*p=qF&0T43KuLKUr4`5%bXGZvmq;J0=gvM^TXHRqU)$FfZK%|BTrQ^B!>ea&
z6SXs*iQ!{V{9i?^JAqi3;{Wu1|5u3Re-7RNuY(uBv)~N44-A06AfA66d=Z%76u2Aw
z4Ke%;@DlhA_!hVZo(EqAKInjT&;S<!#r`v(3<dzj{(l6&1TO;to&ui+6>tc=g}DCr
z;0NF}@LeFm1#l946x<8mMBi?LR{;m7!Kc6^Q2pLN;4N|#8YnbSXrRzQp@BjJg$DkQ
zH9*lZ0>@bR6dmFr37SghvF%0NG3YrePO@WtzO3RMJBXV&#ZJ?Zt*nzO2pAn_G}>I3
z++xGKld(y)*chvitIwzkNNIyDWZWzdIj--@Hk}(qrl9j9s*PIgk}Ej|MSh9XCj<gJ
zPA*C0Sc#0|Q<O!~7LTZnz_HEkyb%@RQGl$EStX885i?{br^(T@>ThoaiEvqY9kyYo
zufh;{j*BPMr3utBHOZ>or^;w8jmz@WT@oP@u@y0ra?`|NtnPlQ4$^|^h)9Eiy(AsU
z;u|G%76>IBS<5@84%>AQ(^XN|4M%SUc|p~qY95#NIE*?Yn`VRYC}S$}qW_g4P=R`>
zBvh|d19^aKiq_^@y@BKE{J4@Vvm-=C^N#6Bb&sS^b^3+=sS~e;+;W}m<kW&<ZwRCs
zr%)fwTI|>yo1jSlvOyN?n69s`!06qGj@hduUPYgOCqo4;)6@4sTMrVP&DZLxek~sz
z*3(%K`jXz2<lp(-M0#_vL5G@ZOi=#(J>6a49j;WMGj-FFmmodq5Bc#h&b8oj)NcMF
za|OC>TVHR+u7~@$^)Lw40ioxW({Yy6M9NV~*{x50Po%RX_f_g^Xh@BEIyUK&J@a#_
z2<`5coDH$7X{zpFDspxTdy9+6?A9;6m{KBU0`&AbNzJOg^-WIyyBu`F#yG`sSb1n@
zI=RBj(}t&MQ0ptwL_s*VA%!7;Uq~-Y>9_%&9@=!mpY}hQB>Fi@?h3i#yWIiTtZ<MS
F<6oGQi6sC4


From b4a7078d2bd787e1da3ef6f51913956d8a05d1d6 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Tue, 7 Apr 2020 09:38:02 +0100
Subject: [PATCH 25/78] Forward port EdgeList.h

(From 0.6.3)
---
 include/EdgeList.h | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/include/EdgeList.h b/include/EdgeList.h
index 7d03bb8f..efb65c73 100644
--- a/include/EdgeList.h
+++ b/include/EdgeList.h
@@ -16,7 +16,7 @@ struct EdgeList {
   uint32_t** neighbours;
 
   // Read network from file
-  void read(const char* filename)
+  void read(const char* filename, bool warn = true)
   {
     // Read edges
     FILE* fp = fopen(filename, "rt");
@@ -62,6 +62,11 @@ struct EdgeList {
     // Release
     free(count);
     fclose(fp);
+
+    if (warn && minFanOut() == 0) {
+      printf("Warning: some vertices have no outgoing edges and\n");
+      printf("         some POLite apps do not handle this case.\n");
+    }
   }
 
   // Determine max fan-out
@@ -73,6 +78,17 @@ struct EdgeList {
     }
     return max;
   }
+
+  // Determine min fan-out
+  uint32_t minFanOut() {
+    uint32_t min = ~0;
+    for (uint32_t i = 0; i < numNodes; i++) {
+      uint32_t numNeighbours = neighbours[i][0];
+      if (numNeighbours < min) min = numNeighbours;
+    }
+    return min;
+  }
+
 };
 
 #endif

From 8f3a15a46a0afad0c85659ee391c5b65b7046d00 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Tue, 7 Apr 2020 09:38:38 +0100
Subject: [PATCH 26/78] Renaming routing tables to thread routing tables

This is to distinguish (existing) thread-level routing tables from
board-level routing tables (to come).
---
 include/POLite/PGraph.h | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/include/POLite/PGraph.h b/include/POLite/PGraph.h
index 4181c3da..3ac93eae 100644
--- a/include/POLite/PGraph.h
+++ b/include/POLite/PGraph.h
@@ -56,7 +56,7 @@ template <typename DeviceType,
   uint32_t numBoardsX;
   uint32_t numBoardsY;
 
-  // Multicast routing tables:
+  // Thread routing tables:
   // Sequence of outgoing edges for every (device, pin) pair
   Seq<POutEdge>*** outTable;
   // Sequence of incoming edges for every thread
@@ -365,9 +365,9 @@ template <typename DeviceType,
     numDevicesOnThread = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
   }
 
-  // Allocate routing tables
+  // Allocate thread routing tables
   // (Only valid after mapper is called)
-  void allocateRoutingTables() {
+  void allocateThreadRoutingTables() {
     // Receiver-side tables
     inTable = (Seq<PInEdge<E>>**)
       calloc(TinselMaxThreads,sizeof(Seq<PInEdge<E>>*));
@@ -407,9 +407,9 @@ template <typename DeviceType,
     }
   }
 
-  // Determine routing key for given set of receivers
+  // Determine thread routing key for given set of receivers
   // (The key must be the same for all receivers)
-  uint32_t findKey(Seq<PReceiverGroup<E>>* receivers) { 
+  uint32_t findThreadKey(Seq<PReceiverGroup<E>>* receivers) { 
     uint32_t key = 0;
 
     bool found = false;
@@ -445,9 +445,9 @@ template <typename DeviceType,
   // Add entries to the input tables for the given receivers
   // (Only valid after mapper is called)
   uint32_t addInTableEntries(Seq<PReceiverGroup<E>>* receivers) {
-    uint32_t key = findKey(receivers);
+    uint32_t key = findThreadKey(receivers);
     if (key >= 0xfffe) {
-      printf("Routing key exceeds 16 bits\n");
+      printf("Thread routing key exceeds 16 bits\n");
       exit(EXIT_FAILURE);
     }
     PInEdge<E> null, unused;
@@ -475,9 +475,9 @@ template <typename DeviceType,
     return key;
   }
 
-  // Compute routing tables
+  // Compute thread routing tables
   // (Only valid after mapper is called)
-  void computeRoutingTables() {
+  void computeThreadRoutingTables() {
     // Routing table stats
     uint64_t totalOutEdges = 0;
 
@@ -607,7 +607,7 @@ template <typename DeviceType,
   void map() {
     // Let's measure some times
     struct timeval placementStart, placementFinish;
-    struct timeval routingStart, routingFinish;
+    struct timeval threadRoutingStart, threadRoutingFinish;
     struct timeval initStart, initFinish;
 
     // Release all mapping and heap structures
@@ -679,14 +679,14 @@ template <typename DeviceType,
 
     // Stop placement timer and start routing timer
     gettimeofday(&placementFinish, NULL);
-    gettimeofday(&routingStart, NULL);
+    gettimeofday(&threadRoutingStart, NULL);
 
-    // Compute send and receive side routing tables
-    allocateRoutingTables();
-    computeRoutingTables();
+    // Compute send and receive side thread routing tables
+    allocateThreadRoutingTables();
+    computeThreadRoutingTables();
 
     // Stop routing timer and start init timer
-    gettimeofday(&routingFinish, NULL);
+    gettimeofday(&threadRoutingFinish, NULL);
     gettimeofday(&initStart, NULL);
 
     // Reallocate and initialise heap structures
@@ -704,9 +704,9 @@ template <typename DeviceType,
       printf("POLite mapper profile:\n");
       printf("  Partitioning and placement: %lfs\n", duration);
 
-      timersub(&routingFinish, &routingStart, &diff);
+      timersub(&threadRoutingFinish, &threadRoutingStart, &diff);
       duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
-      printf("  Routing table construction: %lfs\n", duration);
+      printf("  Thread routing table construction: %lfs\n", duration);
 
       timersub(&initFinish, &initStart, &diff);
       duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;

From 6cdcbe0c3ec74d6865cc149e5903582e9886a6b6 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Fri, 10 Apr 2020 19:04:00 +0100
Subject: [PATCH 27/78] Update POLite to use ProgRouters

(Builds but untested)
---
 Makefile                       |   9 +-
 apps/POLite/ping-test/Makefile |   6 -
 apps/POLite/ping-test/Run.cpp  |  57 -----
 apps/POLite/ping-test/ping.cpp |  23 ---
 apps/POLite/ping-test/ping.h   |  54 -----
 apps/POLite/util/genld.sh      |   2 +-
 config.py                      |   6 +
 include/POLite/PDevice.h       |  82 ++------
 include/POLite/PGraph.h        | 149 ++++++-------
 include/POLite/ProgRouters.h   | 367 +++++++++++++++++++++++++++++++++
 include/POLite/Seq.h           |  20 +-
 rtl/ProgRouter.bsv             |  14 +-
 12 files changed, 487 insertions(+), 302 deletions(-)
 delete mode 100644 apps/POLite/ping-test/Makefile
 delete mode 100644 apps/POLite/ping-test/Run.cpp
 delete mode 100644 apps/POLite/ping-test/ping.cpp
 delete mode 100644 apps/POLite/ping-test/ping.h
 create mode 100644 include/POLite/ProgRouters.h

diff --git a/Makefile b/Makefile
index d52882f7..d95602f9 100644
--- a/Makefile
+++ b/Makefile
@@ -27,13 +27,18 @@ clean:
 	make -C apps/progrouter clean
 	make -C apps/POLite/heat-gals clean
 	make -C apps/POLite/heat-sync clean
+	make -C apps/POLite/heat-cube-sync clean
+	make -C apps/POLite/heat-grid-sync clean
 	make -C apps/POLite/asp-gals clean
 	make -C apps/POLite/asp-sync clean
-	make -C apps/POLite/asp-pc clean
 	make -C apps/POLite/pagerank-sync clean
 	make -C apps/POLite/pagerank-gals clean
+	make -C apps/POLite/sssp-sync clean
 	make -C apps/POLite/sssp-async clean
-	make -C apps/POLite/ping-test clean
 	make -C apps/POLite/clocktree-async clean
+	make -C apps/POLite/izhikevich-gals clean
+	make -C apps/POLite/izhikevich-sync clean
+	make -C apps/POLite/pressure-sync clean
+	make -C apps/POLite/hashmin-sync clean
 	make -C bin clean
 	make -C tests clean
diff --git a/apps/POLite/ping-test/Makefile b/apps/POLite/ping-test/Makefile
deleted file mode 100644
index 7e85d2c6..00000000
--- a/apps/POLite/ping-test/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-# SPDX-License-Identifier: BSD-2-Clause
-APP_CPP = ping.cpp
-APP_HDR = ping.h
-RUN_CPP = Run.cpp
-
-include ../util/polite.mk
diff --git a/apps/POLite/ping-test/Run.cpp b/apps/POLite/ping-test/Run.cpp
deleted file mode 100644
index 57ac5441..00000000
--- a/apps/POLite/ping-test/Run.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-// SPDX-License-Identifier: BSD-2-Clause
-#include "ping.h"
-
-#include <HostLink.h>
-#include <POLite.h>
-#include <EdgeList.h>
-#include <assert.h>
-#include <sys/time.h>
-#include <config.h>
-
-int main(int argc, char**argv)
-{
-  // Connection to tinsel machine
-  HostLink hostLink;
-
-  // Create POETS graph
-  PGraph<PingDevice, PingState, None, PingMessage> graph;
-
-  // Create single ping device
-  PDeviceId id = graph.newDevice();
-
-  // Prepare mapping from graph to hardware
-  graph.map();
-
-  // Write graph down to tinsel machine via HostLink
-  graph.write(&hostLink);
-
-  // Load code and trigger execution
-  hostLink.boot("code.v", "data.v");
-  hostLink.go();
-
-  printf("Ping started\n");
-
-  // Consume performance stats
-  //politeSaveStats(&hostLink, "stats.txt");
-
-  int test = 0;
-  int deviceAddr = graph.toDeviceAddr[id];
-  printf("deviceAddr = %d\n", deviceAddr);
-  while (test < 100) {
-    // Send ping
-    PMessage<PingMessage> sendMsg;
-    sendMsg.devId = getLocalDeviceId(deviceAddr);
-    sendMsg.payload.test = test;
-    hostLink.send(getThreadId(deviceAddr), 1, &sendMsg);
-    printf("Sent %d to device\n", sendMsg.payload.test);
-
-    // Receive pong
-    PMessage<PingMessage> recvMsg;
-    hostLink.recvMsg(&recvMsg, sizeof(recvMsg));
-    printf("Received %d from device\n", recvMsg.payload.test);
-
-    test++;
-  }
-
-  return 0;
-}
diff --git a/apps/POLite/ping-test/ping.cpp b/apps/POLite/ping-test/ping.cpp
deleted file mode 100644
index 74960d36..00000000
--- a/apps/POLite/ping-test/ping.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-// SPDX-License-Identifier: BSD-2-Clause
-#include "ping.h"
-
-#include <tinsel.h>
-#include <POLite.h>
-
-typedef PThread<
-          PingDevice,
-          PingState,     // State
-          None,         // Edge label
-          PingMessage    // Message
-        > PingThread;
-
-int main()
-{
-  // Point thread structure at base of thread's heap
-  PingThread* thread = (PingThread*) tinselHeapBaseSRAM();
-
-  // Invoke interpreter
-  thread->run();
-
-  return 0;
-}
diff --git a/apps/POLite/ping-test/ping.h b/apps/POLite/ping-test/ping.h
deleted file mode 100644
index 3d4c17de..00000000
--- a/apps/POLite/ping-test/ping.h
+++ /dev/null
@@ -1,54 +0,0 @@
-// SPDX-License-Identifier: BSD-2-Clause
-// Test messaging between host and threads.
-
-#ifndef _ping_H_
-#define _ping_H_
-
-//#define POLITE_DUMP_STATS
-//#define POLITE_COUNT_MSGS
-
-// Lightweight POETS frontend
-#include <POLite.h>
-
-struct PingMessage {
-  uint32_t test;
-};
-
-struct PingState {
-  // Number received to be sent back to host
-  uint32_t test;
-};
-
-struct PingDevice : PDevice<PingState, None, PingMessage> {
-  // Called once by POLite at start of execution
-  void init() {
-    // Do nothing until a message is received from the host
-    *readyToSend = No;
-  }
-
-  // Receive handler
-  inline void recv(PingMessage* msg, None* edge) {
-    // Store number from host to send back to host
-    s->test = msg->test;
-    *readyToSend = HostPin;
-  }
-
-  // Send handler
-  inline void send(volatile PingMessage* msg) {
-    // Put received value back in message for host to check
-    msg->test = s->test;
-    *readyToSend = No;
-  }
-
-  // Called by POLite when system becomes idle
-  inline bool step() {
-    return true; // Never terminate
-  }
-
-  // Optionally send message to host on termination
-  inline bool finish(volatile PingMessage* msg) {
-    return false;
-  }
-};
-
-#endif
diff --git a/apps/POLite/util/genld.sh b/apps/POLite/util/genld.sh
index 0350108e..474e5694 100755
--- a/apps/POLite/util/genld.sh
+++ b/apps/POLite/util/genld.sh
@@ -18,7 +18,7 @@ OUTPUT_ARCH( "riscv" )
 MEMORY
 {
   instrs  : ORIGIN = $MaxBootImageBytes, LENGTH = $MaxInstrBytes
-  globals : ORIGIN = $DRAMBase, LENGTH = $DRAMGlobalsLength
+  globals : ORIGIN = $DRAMBase, LENGTH = $POLiteDRAMGlobalsLength
 }
 
 SECTIONS
diff --git a/config.py b/config.py
index 50bc3480..a4099e7e 100755
--- a/config.py
+++ b/config.py
@@ -365,6 +365,12 @@ def quoted(s): return "'\"" + s + "\"'"
 # DRAM base and length
 p["DRAMBase"] = 3 * (2 ** p["LogBytesPerSRAM"])
 p["DRAMGlobalsLength"] = 2 ** (p["LogBytesPerDRAM"] - 1) - p["DRAMBase"]
+p["POLiteDRAMGlobalsLength"] = 2 ** 14
+p["POLiteProgRouterBase"] = p["DRAMBase"] + p["POLiteDRAMGlobalsLength"]
+p["POLiteProgRouterLength"] = (p["DRAMGlobalsLength"] -
+                                 p["POLiteDRAMGlobalsLength"])
+
+# POLite globals
 
 # Number of FPGA boards per box (including bridge board)
 p["BoardsPerBox"] = p["MeshXLenWithinBox"] * p["MeshYLenWithinBox"] + 1
diff --git a/include/POLite/PDevice.h b/include/POLite/PDevice.h
index 9eefda3a..b5f99340 100644
--- a/include/POLite/PDevice.h
+++ b/include/POLite/PDevice.h
@@ -54,9 +54,8 @@ inline PLocalDeviceId getLocalDeviceId(PDeviceAddr addr) { return addr >> 19; }
 // What's the max allowed local device address?
 inline uint32_t maxLocalDeviceId() { return 8192; }
 
-// Routing key
-typedef uint16_t Key;
-#define InvalidKey 0xffff
+// Index into the per-thread in-edge table
+typedef uint16_t InTableKey;
 
 // Pins
 //   No      - means 'not ready to send'
@@ -92,8 +91,8 @@ template <typename S, typename E, typename M> struct PDevice {
 
 // Generic device state structure
 template <typename S> struct ALIGNED PState {
-  // Pointer to base of neighbours arrays
-  uint16_t pinBase[POLITE_NUM_PINS];
+  // Board-level routing key for each outgoing pin
+  uint32_t pin[POLITE_NUM_PINS];
   // Ready-to-send status
   PPin readyToSend;
   // Custom state
@@ -103,22 +102,11 @@ template <typename S> struct ALIGNED PState {
 // Message structure
 template <typename M> struct PMessage {
   // Source-based routing key
-  Key key;
+  InTableKey key;
   // Application message
   M payload;
 };
 
-// An outgoing edge from a device
-struct POutEdge {
-  // Destination mailbox
-  uint16_t mbox;
-  // Routing key
-  uint16_t key;
-  // Destination threads
-  uint32_t threadMaskLow;
-  uint32_t threadMaskHigh;
-};
-
 // An incoming edge to a device (labelleled)
 template <typename E> struct PInEdge {
   // Destination device
@@ -137,16 +125,6 @@ template <> struct PInEdge<None> {
   };
 };
 
-// Helper function: Count board hops between two threads
-inline uint32_t hopsBetween(uint32_t t0, uint32_t t1) {
-  uint32_t xmask = ((1<<TinselMeshXBits)-1);
-  int32_t y0 = t0 >> (TinselLogThreadsPerBoard + TinselMeshXBits);
-  int32_t x0 = (t0 >> TinselLogThreadsPerBoard) & xmask;
-  int32_t y1 = t1 >> (TinselLogThreadsPerBoard + TinselMeshXBits);
-  int32_t x1 = (t1 >> TinselLogThreadsPerBoard) & xmask;
-  return (abs(x0-x1) + abs(y0-y1));
-}
-
 // Generic thread structure
 template <typename DeviceType,
           typename S, typename E, typename M> struct PThread {
@@ -159,8 +137,7 @@ template <typename DeviceType,
   uint32_t numVertices;
   // Pointer to array of device states
   PTR(PState<S>) devices;
-  // Pointer to base of routing tables
-  PTR(POutEdge) outTableBase;
+  // Pointer to base of in-edge table
   PTR(PInEdge<E>) inTableBase;
   // Array of local device ids are ready to send
   PTR(PLocalDeviceId) senders;
@@ -218,17 +195,6 @@ template <typename DeviceType,
 
   // Invoke device handlers
   void run() {
-    // Current out-going edge in multicast
-    POutEdge* outEdge;
-
-    // Outgoing edge to host
-    POutEdge outHost[2];
-    outHost[0].mbox = tinselHostId() >> TinselLogThreadsPerMailbox;
-    outHost[0].key = 0;
-    outHost[1].key = InvalidKey;
-    // Initialise outEdge to null terminator
-    outEdge = &outHost[1];
-
     // Did last call to step handler request a new time step?
     bool active = true;
 
@@ -252,29 +218,10 @@ template <typename DeviceType,
 
     // Event loop
     while (1) {
-      // Step 1: try to send
-      if (outEdge->key != InvalidKey) {
-        if (tinselCanSend()) {
-          PMessage<M>* m = (PMessage<M>*) tinselSendSlot();
-          // Send message
-          m->key = outEdge->key;
-          tinselMulticast(outEdge->mbox, outEdge->threadMaskHigh,
-            outEdge->threadMaskLow, m);
-          #ifdef POLITE_COUNT_MSGS
-          interThreadSendCount++;
-          interBoardSendCount +=
-            hopsBetween(outEdge->mbox << TinselLogThreadsPerMailbox,
-              tinselId());
-          #endif
-          // Move to next neighbour
-          outEdge++;
-        }
-        else
-          tinselWaitUntil(TINSEL_CAN_SEND|TINSEL_CAN_RECV);
-      }
-      else if (sendersTop != senders) {
+      // Try to send
+      if (sendersTop != senders) {
         if (tinselCanSend()) {
-          // Start new multicast
+          // Get next sender
           PLocalDeviceId src = *(--sendersTop);
           // Lookup device
           DeviceType dev = getDevice(src);
@@ -284,13 +231,14 @@ template <typename DeviceType,
           dev.send(&m->payload);
           // Reinsert sender, if it still wants to send
           if (*dev.readyToSend != No) sendersTop++;
-          // Determine out-edge array for sender
+          // Is it a send to the host pin or a user pin?
           if (pin == HostPin)
-            outEdge = outHost;
+            tinselSend(tinselHostId(), m);
           else
-            outEdge = (POutEdge*) &outTableBase[
-              devices[src].pinBase[pin-2]
-            ];
+            tinselKeySend(devices[src].pin[pin-2], m);
+          #ifdef POLITE_COUNT_MSGS
+          interThreadSendCount++;
+          #endif
         }
         else
           tinselWaitUntil(TINSEL_CAN_SEND|TINSEL_CAN_RECV);
diff --git a/include/POLite/PGraph.h b/include/POLite/PGraph.h
index 3ac93eae..b6a1245a 100644
--- a/include/POLite/PGraph.h
+++ b/include/POLite/PGraph.h
@@ -12,8 +12,8 @@
 #include <POLite/Seq.h>
 #include <POLite/Graph.h>
 #include <POLite/Placer.h>
+#include <POLite/ProgRouters.h>
 #include <type_traits>
-#include "Seq.h"
 
 // Nodes of a POETS graph are devices
 typedef NodeId PDeviceId;
@@ -56,12 +56,21 @@ template <typename DeviceType,
   uint32_t numBoardsX;
   uint32_t numBoardsY;
 
-  // Thread routing tables:
-  // Sequence of outgoing edges for every (device, pin) pair
-  Seq<POutEdge>*** outTable;
+  // Out table (sender-side edge tables)
+  // Sequence of destinations for every (device, pin) pair
+  Seq<PRoutingDest>*** outTable;
+
+  // Key table (sender-side key tables)
+  // Global routing key for every (device, pin) pair
+  uint32_t** keyTable;
+
+  // In table (receiver-side edge tables)
   // Sequence of incoming edges for every thread
   Seq<PInEdge<E>>** inTable;
 
+  // Mesh of per-board programmable routers
+  ProgRouterMesh* routingTables;
+
   // Generic constructor
   void constructor(uint32_t lenX, uint32_t lenY) {
     meshLenX = lenX;
@@ -82,14 +91,12 @@ template <typename DeviceType,
     inEdgeMem = NULL;
     inEdgeMemSize = NULL;
     inEdgeMemBase = NULL;
-    outEdgeMem = NULL;
-    outEdgeMemSize = NULL;
-    outEdgeMemBase = NULL;
     mapVerticesToDRAM = false;
     mapInEdgesToDRAM = true;
-    mapOutEdgesToDRAM = true;
     outTable = NULL;
+    keyTable = NULL;
     inTable = NULL;
+    routingTables = NULL;
     chatty = 0;
   }
 
@@ -122,17 +129,16 @@ template <typename DeviceType,
   uint32_t* vertexMemSize;  uint32_t* threadMemSize;
   uint32_t* vertexMemBase;  uint32_t* threadMemBase;
 
-  // Each thread's in-edge and out-edge regions
+  // Each thread's in-edge tables
   // (Not valid until the mapper is called)
-  uint8_t** inEdgeMem;      uint8_t** outEdgeMem;
-  uint32_t* inEdgeMemSize;  uint32_t* outEdgeMemSize;
-  uint32_t* inEdgeMemBase;  uint32_t* outEdgeMemBase;
+  uint8_t** inEdgeMem;
+  uint32_t* inEdgeMemSize;
+  uint32_t* inEdgeMemBase;
 
   // Where to map the various regions
   // (If false, map to SRAM instead)
   bool mapVerticesToDRAM;
   bool mapInEdgesToDRAM;
-  bool mapOutEdgesToDRAM;
 
   // Allow mapper to print useful information to stdout
   uint32_t chatty;
@@ -189,9 +195,6 @@ template <typename DeviceType,
     inEdgeMem = (uint8_t**) calloc(TinselMaxThreads, sizeof(uint8_t*));
     inEdgeMemSize = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
     inEdgeMemBase = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
-    outEdgeMem = (uint8_t**) calloc(TinselMaxThreads, sizeof(uint8_t*));
-    outEdgeMemSize = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
-    outEdgeMemBase = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
     // Compute partition sizes for each thread
     for (uint32_t threadId = 0; threadId < TinselMaxThreads; threadId++) {
       // This variable is used to count the size of the *initialised*
@@ -199,7 +202,6 @@ template <typename DeviceType,
       // uninitialised portions.
       uint32_t sizeVMem = 0;
       uint32_t sizeEIMem = 0;
-      uint32_t sizeEOMem = 0;
       uint32_t sizeTMem = 0;
       // Add space for thread structure (always stored in SRAM)
       sizeTMem = cacheAlign(sizeof(PThread<DeviceType, S, E, M>));
@@ -214,15 +216,6 @@ template <typename DeviceType,
         sizeEIMem = inTable[threadId]->numElems * sizeof(PInEdge<E>);
         sizeEIMem = wordAlign(sizeEIMem);
       }
-      // Add space for outgoing edge table
-      for (uint32_t devNum = 0; devNum < numDevs; devNum++) {
-        PDeviceId id = fromDeviceAddr[threadId][devNum];
-        for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) {
-          Seq<POutEdge>* edges = outTable[id][p];
-          sizeEOMem += sizeof(POutEdge) * edges->numElems;
-        }
-      }
-      sizeEOMem = wordAlign(sizeEOMem);
       // The total partition size including uninitialised portions
       uint32_t totalSizeVMem =
         sizeVMem + wordAlign(sizeof(PLocalDeviceId) * numDevs);
@@ -233,8 +226,6 @@ template <typename DeviceType,
                         else totalSizeSRAM += totalSizeVMem;
       if (mapInEdgesToDRAM)  totalSizeDRAM += sizeEIMem;
                         else totalSizeSRAM += sizeEIMem;
-      if (mapOutEdgesToDRAM) totalSizeDRAM += sizeEOMem;
-                        else totalSizeSRAM += sizeEOMem;
       if (totalSizeDRAM > maxDRAMSize) {
         printf("Error: max DRAM partition size exceeded\n");
         exit(EXIT_FAILURE);
@@ -247,15 +238,12 @@ template <typename DeviceType,
       assert((sizeVMem%4) == 0);
       assert((sizeTMem%4) == 0);
       assert((sizeEIMem%4) == 0);
-      assert((sizeEOMem%4) == 0);
       vertexMem[threadId] = (uint8_t*) calloc(sizeVMem, 1);
       vertexMemSize[threadId] = sizeVMem;
       threadMem[threadId] = (uint8_t*) calloc(sizeTMem, 1);
       threadMemSize[threadId] = sizeTMem;
       inEdgeMem[threadId] = (uint8_t*) calloc(sizeEIMem, 1);
       inEdgeMemSize[threadId] = sizeEIMem;
-      outEdgeMem[threadId] = (uint8_t*) calloc(sizeEOMem, 1);
-      outEdgeMemSize[threadId] = sizeEOMem;
       // Tinsel address of base of partition
       uint32_t partId = threadId & (TinselThreadsPerDRAM-1);
       uint32_t sramBase = (1 << TinselLogBytesPerSRAM) +
@@ -283,14 +271,6 @@ template <typename DeviceType,
         inEdgeMemBase[threadId] = sramBase;
         sramBase += sizeEIMem;
       }
-      if (mapOutEdgesToDRAM) {
-        outEdgeMemBase[threadId] = dramBase;
-        dramBase += sizeEOMem;
-      }
-      else {
-        outEdgeMemBase[threadId] = sramBase;
-        sramBase += sizeEOMem;
-      }
     }
   }
 
@@ -299,7 +279,6 @@ template <typename DeviceType,
     for (uint32_t threadId = 0; threadId < TinselMaxThreads; threadId++) {
       // Next pointers for each partition
       uint32_t nextVMem = 0;
-      uint32_t nextOutIndex = 0;
       // Pointer to thread structure
       PThread<DeviceType, S, E, M>* thread =
         (PThread<DeviceType, S, E, M>*) &threadMem[threadId][0];
@@ -309,8 +288,7 @@ template <typename DeviceType,
       thread->numVertices = numDevices;
       // Set tinsel address of array of device states
       thread->devices = vertexMemBase[threadId];
-      // Set tinsel address of base of edge tables
-      thread->outTableBase = outEdgeMemBase[threadId];
+      // Set tinsel address of base of in-edge table
       thread->inTableBase = inEdgeMemBase[threadId];
       // Add space for each device on thread
       uint32_t numDevs = numDevicesOnThread[threadId];
@@ -326,14 +304,8 @@ template <typename DeviceType,
         PDeviceId id = fromDeviceAddr[threadId][devNum];
         PState<S>* dev = devices[id];
         // Initialise
-        POutEdge* outEdgeArray = (POutEdge*) outEdgeMem[threadId];
         for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) {
-          dev->pinBase[p] = nextOutIndex;
-          Seq<POutEdge>* edges = outTable[id][p];
-          for (uint32_t i = 0; i < edges->numElems; i++) {
-            outEdgeArray[nextOutIndex] = edges->elems[i];
-            nextOutIndex++;
-          }
+          dev->pin[p] = keyTable[id][p];
         }
       }
       // Intialise thread's in edges
@@ -348,10 +320,6 @@ template <typename DeviceType,
         printf("Error: vertex mem size does not match pre-computed size\n");
         exit(EXIT_FAILURE);
       }
-      if ((nextOutIndex * sizeof(POutEdge)) != outEdgeMemSize[threadId]) {
-        printf("Error: out edge mem size does not match pre-computed size\n");
-        exit(EXIT_FAILURE);
-      }
       // Set tinsel address of senders array
       thread->senders = vertexMemBase[threadId] + nextVMem;
     }
@@ -365,9 +333,9 @@ template <typename DeviceType,
     numDevicesOnThread = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
   }
 
-  // Allocate thread routing tables
+  // Allocate thread edge input and output tables
   // (Only valid after mapper is called)
-  void allocateThreadRoutingTables() {
+  void allocateInOutTables() {
     // Receiver-side tables
     inTable = (Seq<PInEdge<E>>**)
       calloc(TinselMaxThreads,sizeof(Seq<PInEdge<E>>*));
@@ -377,13 +345,18 @@ template <typename DeviceType,
     }
 
     // Sender-side tables
-    outTable = (Seq<POutEdge>***) calloc(numDevices, sizeof(Seq<POutEdge>**));
+    outTable = (Seq<PRoutingDest>***)
+      calloc(numDevices, sizeof(Seq<PRoutingDest>**));
     for (uint32_t d = 0; d < numDevices; d++) {
-      outTable[d] = (Seq<POutEdge>**)
-        calloc(POLITE_NUM_PINS, sizeof(Seq<POutEdge>*));
+      outTable[d] = (Seq<PRoutingDest>**)
+        calloc(POLITE_NUM_PINS, sizeof(Seq<PRoutingDest>*));
       for (uint32_t p = 0; p < POLITE_NUM_PINS; p++)
-        outTable[d][p] = new SmallSeq<POutEdge>;
+        outTable[d][p] = new SmallSeq<PRoutingDest>;
     }
+
+    keyTable = new uint32_t* [numDevices];
+    for (uint32_t d = 0; d < numDevices; d++)
+      keyTable[d] = new uint32_t [POLITE_NUM_PINS];
   }
 
   // Pack a receivers array
@@ -407,9 +380,9 @@ template <typename DeviceType,
     }
   }
 
-  // Determine thread routing key for given set of receivers
+  // Determine in-table key for given set of receivers
   // (The key must be the same for all receivers)
-  uint32_t findThreadKey(Seq<PReceiverGroup<E>>* receivers) { 
+  uint32_t findInTableKey(Seq<PReceiverGroup<E>>* receivers) { 
     uint32_t key = 0;
 
     bool found = false;
@@ -445,9 +418,9 @@ template <typename DeviceType,
   // Add entries to the input tables for the given receivers
   // (Only valid after mapper is called)
   uint32_t addInTableEntries(Seq<PReceiverGroup<E>>* receivers) {
-    uint32_t key = findThreadKey(receivers);
+    uint32_t key = findInTableKey(receivers);
     if (key >= 0xfffe) {
-      printf("Thread routing key exceeds 16 bits\n");
+      printf("In-table routing key exceeds 16 bits\n");
       exit(EXIT_FAILURE);
     }
     PInEdge<E> null, unused;
@@ -475,9 +448,9 @@ template <typename DeviceType,
     return key;
   }
 
-  // Compute thread routing tables
+  // Compute thread edge input and output tables
   // (Only valid after mapper is called)
-  void computeThreadRoutingTables() {
+  void computeInOutTables() {
     // Routing table stats
     uint64_t totalOutEdges = 0;
 
@@ -534,7 +507,7 @@ template <typename DeviceType,
           // Add input table entries
           uint32_t key = addInTableEntries(&groups);
           // Add output table entry
-          POutEdge edge;
+          PRoutingDest edge;
           edge.mbox = mbox;
           edge.key = key;
           edge.threadMaskLow = threadMaskLow;
@@ -545,10 +518,6 @@ template <typename DeviceType,
           edges.numElems = destsRemaining;
           totalOutEdges++;
         }
-        // Add output edge terminator
-        POutEdge term;
-        term.key = InvalidKey;
-        outTable[d][p]->append(term);
       }
     }
     //printf("Average edges per pin: %lu\n",
@@ -579,11 +548,6 @@ template <typename DeviceType,
       free(inEdgeMem);
       free(inEdgeMemSize);
       free(inEdgeMemBase);
-      for (uint32_t t = 0; t < TinselMaxThreads; t++)
-        if (outEdgeMem[t] != NULL) free(outEdgeMem[t]);
-      free(outEdgeMem);
-      free(outEdgeMemSize);
-      free(outEdgeMemBase);
     }
     if (inTable != NULL) {
       for (uint32_t t = 0; t < TinselMaxThreads; t++)
@@ -601,13 +565,19 @@ template <typename DeviceType,
       free(outTable);
       outTable = NULL;
     }
+    if (keyTable != NULL) {
+      for (uint32_t d = 0; d < numDevices; d++) delete [] keyTable[d];
+      delete [] keyTable;
+      keyTable = NULL;
+    }
+    if (routingTables != NULL) delete routingTables;
   }
 
   // Implement mapping to tinsel threads
   void map() {
     // Let's measure some times
     struct timeval placementStart, placementFinish;
-    struct timeval threadRoutingStart, threadRoutingFinish;
+    struct timeval routingStart, routingFinish;
     struct timeval initStart, initFinish;
 
     // Release all mapping and heap structures
@@ -677,16 +647,25 @@ template <typename DeviceType,
       }
     }
 
-    // Stop placement timer and start routing timer
+    // Stop placement timer and start In/Out table timer
     gettimeofday(&placementFinish, NULL);
-    gettimeofday(&threadRoutingStart, NULL);
+    gettimeofday(&routingStart, NULL);
+
+    // Compute send and receive side routing tables
+    allocateInOutTables();
+    computeInOutTables();
 
-    // Compute send and receive side thread routing tables
-    allocateThreadRoutingTables();
-    computeThreadRoutingTables();
+    // Compute per-board programmable routing tables
+    routingTables = new ProgRouterMesh(numBoardsX, numBoardsY);
+    for (uint32_t d = 0; d < numDevices; d++) {
+      uint32_t src = getThreadId(toDeviceAddr[d]) >>
+        TinselLogThreadsPerMailbox;
+      for (uint32_t p = 0; p < POLITE_NUM_PINS; p++)
+        keyTable[d][p] = routingTables->addDestsFromBoard(src, outTable[d][p]);
+   }
 
     // Stop routing timer and start init timer
-    gettimeofday(&threadRoutingFinish, NULL);
+    gettimeofday(&routingFinish, NULL);
     gettimeofday(&initStart, NULL);
 
     // Reallocate and initialise heap structures
@@ -704,9 +683,9 @@ template <typename DeviceType,
       printf("POLite mapper profile:\n");
       printf("  Partitioning and placement: %lfs\n", duration);
 
-      timersub(&threadRoutingFinish, &threadRoutingStart, &diff);
+      timersub(&routingFinish, &routingStart, &diff);
       duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
-      printf("  Thread routing table construction: %lfs\n", duration);
+      printf("  In/Out table construction: %lfs\n", duration);
 
       timersub(&initFinish, &initStart, &diff);
       duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
@@ -811,7 +790,7 @@ template <typename DeviceType,
     writeRAM(hostLink, vertexMem, vertexMemSize, vertexMemBase);
     writeRAM(hostLink, threadMem, threadMemSize, threadMemBase);
     writeRAM(hostLink, inEdgeMem, inEdgeMemSize, inEdgeMemBase);
-    writeRAM(hostLink, outEdgeMem, outEdgeMemSize, outEdgeMemBase);
+    routingTables->write(hostLink);
     hostLink->flush();
     hostLink->useSendBuffer = useSendBufferOld;
 
diff --git a/include/POLite/ProgRouters.h b/include/POLite/ProgRouters.h
new file mode 100644
index 00000000..34f62694
--- /dev/null
+++ b/include/POLite/ProgRouters.h
@@ -0,0 +1,367 @@
+// SPDX-License-Identifier: BSD-2-Clause
+#ifndef _PROGROUTERS_H_
+#define _PROGROUTERS_H_
+
+#include <assert.h>
+#include <config.h>
+#include <HostLink.h>
+#include <POLite.h>
+#include <POLite/Seq.h>
+#include <boot.h>
+
+// =============================
+// Per-board programmable router
+// =============================
+
+class ProgRouter {
+
+  // Number of chunks used so far in current beat
+  uint32_t numChunks;
+
+  // Number of records used so far in current beat
+  uint32_t numRecords;
+
+  // Number of beats associated with current key
+  uint32_t numBeats;
+
+  // Index of RAM currently being used
+  uint32_t currentRAM;
+
+  // Pointer to previously created indirection
+  // (We need indirections to handle record sequences of 31 beats or more)
+  uint8_t* prevInd;
+
+  // Move on to next the beat
+  void nextBeat() {
+    // Set number of records in current beat
+    uint32_t beatBase = table[currentRAM]->numElems - 32;
+    uint8_t* beat = &table[currentRAM]->elems[beatBase];
+    beat[31] = 0;
+    beat[30] = numRecords;
+    numChunks = numRecords = 0;
+    // Allocate new beat, and check for overflow
+    numBeats++;
+    table[currentRAM]->extendBy(32);
+    if (table[currentRAM]->numElems >= (TinselPOLiteProgRouterLength-1024)) {
+      printf("ProgRouter out of memory\n");
+      exit(EXIT_FAILURE);
+    }
+    // We need indirections to handle sequences of 31 beats or more
+    if ((numBeats % 31) == 0) {
+      // Set previous indirection, if there is one
+      if (prevInd) {
+        uint32_t key = TinselPOLiteProgRouterBase +
+                         table[currentRAM]->numElems - 31*32;
+        if (currentRAM) key |= 0x80000000;
+        key |= 31;
+        setIND(prevInd, key);
+      }
+      prevInd = addIND();
+    }
+  }
+
+  // Get current record pointer for 48-bit entry
+  inline uint8_t* currentRecord48() {
+    uint32_t beatBase = (table[currentRAM]->numElems-32) + 6*(4-numChunks);
+    return &table[currentRAM]->elems[beatBase];
+  }
+
+  // Get current record pointer for 96-bit entry
+  inline uint8_t* currentRecord96() {
+    uint32_t beatBase = (table[currentRAM]->numElems-32) + 6*(3-numChunks);
+    return &table[currentRAM]->elems[beatBase];
+  }
+
+ public:
+
+  // A table holding encoded routing beats for each RAM
+  Seq<uint8_t>** table;
+
+  // Constructor
+  ProgRouter() {
+    // Currently we assume two RAMs per board
+    assert(TinselDRAMsPerBoard == 2);
+    // Initialise member variables
+    prevInd = NULL;
+    numBeats = 1;
+    numChunks = numRecords = currentRAM = 0;
+    // Allocate one sequence per RAM
+    table = new Seq<uint8_t>* [TinselDRAMsPerBoard];
+    // Initially each sequence is 32MB
+    for (int i = 0; i < TinselDRAMsPerBoard; i++) {
+      table[i] = new Seq<uint8_t> (1 << 15);
+      // Allocate first beat
+      table[i]->extendBy(32);
+    }
+  }
+
+  // Destructor
+  ~ProgRouter() {
+    for (int i = 0; i < TinselDRAMsPerBoard; i++) delete table[i];
+    delete [] table;
+  }
+
+  // Generate a new key for the records added
+  uint32_t genKey() {
+    // Determine index of first beat in record sequence
+    uint32_t index = table[currentRAM]->numElems - numBeats*32;
+    // Determine final key length
+    uint32_t finalKeyLen = prevInd ? 31 : numBeats;
+    // Insert outstanding indirection, if there is one
+    if (prevInd) {
+      // Set previous indirection to latest block of beats
+      uint32_t indKey = TinselPOLiteProgRouterBase +
+        table[currentRAM]->numElems - (numBeats%31)*32;
+      if (currentRAM) indKey |= 0x80000000;
+      indKey |= (numBeats%31);
+      setIND(prevInd, indKey); 
+    }
+    // Determine final key
+    uint32_t key = TinselPOLiteProgRouterBase + index;
+    if (currentRAM) key |= 0x80000000;
+    key |= finalKeyLen;
+    // Move to next beat
+    nextBeat();
+    numBeats = 1;
+    prevInd = NULL;
+    // Pick smaller RAM for next key
+    currentRAM = table[0]->numElems < table[1]->numElems ? 0 : 1;
+    return key;
+  }
+
+  // Add an IND record to the table
+  // Return a pointer to the indirection key,
+  // so it can be set later by the caller
+  uint8_t* addIND() {
+    if (numChunks == 5) nextBeat();
+    uint8_t* ptr = currentRecord48();
+    ptr[5] = 4 << 5;
+    numChunks++;
+    numRecords++;
+    return ptr;
+  }
+
+  // Set indirection key
+  void setIND(uint8_t* ind, uint32_t key) {
+    ind[0] = key;
+    ind[1] = key >> 8;
+    ind[2] = key >> 16;
+    ind[3] = key >> 24;
+  }
+
+  // Add an MRM record to the table
+  void addMRM(uint32_t mboxX, uint32_t mboxY,
+                uint32_t threadsHigh, uint32_t threadsLow,
+                  uint16_t localKey) {
+    if (numChunks >= 4) nextBeat();
+    uint8_t* ptr = currentRecord96();
+    ptr[0] = threadsLow;
+    ptr[1] = threadsLow >> 8;
+    ptr[2] = threadsLow >> 16;
+    ptr[3] = threadsLow >> 24;
+    ptr[4] = threadsHigh;
+    ptr[5] = threadsHigh >> 8;
+    ptr[6] = threadsHigh >> 16;
+    ptr[7] = threadsHigh >> 24;
+    ptr[8] = localKey;
+    ptr[9] = localKey >> 8;
+    ptr[11] = (3 << 5) | (mboxY << 3) | (mboxX << 1);
+    numChunks += 2;
+    numRecords++;
+  }
+
+  // Add an RR record to the table
+  void addRR(uint32_t dir, uint32_t key) {
+    if (numChunks == 5) nextBeat();
+    uint8_t* ptr = currentRecord48();
+    ptr[0] = key;
+    ptr[1] = key >> 8;
+    ptr[2] = key >> 16;
+    ptr[3] = key >> 24;
+    ptr[5] = (2 << 5) | (dir << 3);
+    numChunks++;
+    numRecords++;
+  }
+};
+
+// ==================================
+// Data type for routing destinations
+// ==================================
+
+struct PRoutingDest {
+  // Destination mailbox
+  uint32_t mbox;
+  // Thread-level routing key
+  uint16_t key;
+  // Destination threads
+  uint32_t threadMaskLow;
+  uint32_t threadMaskHigh;
+};
+
+// Extract board X coord from routing dest
+inline uint32_t destX(uint32_t mbox) {
+  uint32_t x = mbox >> (TinselMailboxMeshXBits + TinselMailboxMeshYBits);
+  return x & ((1<<TinselMeshXBits) - 1);
+}
+
+// Extract board Y coord from routing dest
+inline uint32_t destY(uint32_t mbox) {
+  uint32_t y = mbox >> (TinselMailboxMeshXBits +
+                 TinselMailboxMeshYBits + TinselMeshXBits);
+  return y & ((1<<TinselMeshYBits) - 1);
+}
+
+// Extract board-local mailbox X coord from routing dest
+inline uint32_t destMboxX(uint32_t mbox) {
+  return mbox & ((1<<TinselMailboxMeshXBits) - 1);
+}
+
+// Extract board-local mailbox Y coord from routing dest
+inline uint32_t destMboxY(uint32_t mbox) {
+  return (mbox >> TinselMailboxMeshXBits) &
+           ((1<<TinselMailboxMeshYBits) - 1);
+}
+
+// ============================
+// Mesh of programmable routers
+// ============================
+
+class ProgRouterMesh {
+  // 2D array of tables;
+  ProgRouter** table;
+
+  // Board mesh dimensions
+  uint32_t boardsX;
+  uint32_t boardsY;
+
+ public:
+
+  // Constructor
+  ProgRouterMesh(uint32_t numBoardsX, uint32_t numBoardsY) {
+    boardsX = numBoardsX;
+    boardsY = numBoardsY;
+    table = new ProgRouter* [numBoardsY];
+    for (int y = 0; y < numBoardsY; y++)
+      table[y] = new ProgRouter [numBoardsX];
+  }
+
+  // Add routing destinations from given sender board
+  // Returns routing key
+  uint32_t addDestsFromBoardXY(uint32_t senderX, uint32_t senderY,
+                                 Seq<PRoutingDest>* dests) {
+    assert(dests->numElems > 0);
+
+    // Categorise non-local dests into local, N, S, E, and W groups
+    Seq<PRoutingDest> local(dests->numElems);
+    Seq<PRoutingDest> north(dests->numElems);
+    Seq<PRoutingDest> south(dests->numElems);
+    Seq<PRoutingDest> east(dests->numElems);
+    Seq<PRoutingDest> west(dests->numElems);
+    for (int i = 0; i < dests->numElems; i++) {
+      PRoutingDest dest = dests->elems[i];
+      uint32_t receiverX = destX(dest.mbox);
+      uint32_t receiverY = destY(dest.mbox);
+      if (receiverX < senderX) east.append(dest);
+      else if (receiverX > senderX) west.append(dest);
+      else if (receiverY < senderY) south.append(dest);
+      else if (receiverY > senderY) north.append(dest);
+      else local.append(dest);
+    }
+
+    // Recurse on non-local groups and add RR records on return
+    if (north.numElems > 0) {
+      uint32_t key = addDestsFromBoardXY(senderX, senderY+1, &north);
+      table[senderY][senderX].addRR(0, key);
+    }
+    if (south.numElems > 0) {
+      uint32_t key = addDestsFromBoardXY(senderX, senderY-1, &south);
+      table[senderY][senderX].addRR(1, key);
+    }
+    if (east.numElems > 0) {
+      uint32_t key = addDestsFromBoardXY(senderX+1, senderY, &east);
+      table[senderY][senderX].addRR(2, key);
+    }
+    if (west.numElems > 0) {
+      uint32_t key = addDestsFromBoardXY(senderX-1, senderY, &west);
+      table[senderY][senderX].addRR(3, key);
+    }
+
+    // Add local records
+    for (int i = 0; i < local.numElems; i++) {
+      PRoutingDest dest = local.elems[i];
+      table[senderY][senderX].addMRM(destMboxX(dest.mbox),
+        destMboxY(dest.mbox), dest.threadMaskHigh,
+        dest.threadMaskLow, dest.key);
+    }
+
+    return table[senderY][senderX].genKey();
+  }
+
+  // Add routing destinations from given global mailbox id
+  uint32_t addDestsFromBoard(uint32_t mbox, Seq<PRoutingDest>* dests) {
+    addDestsFromBoardXY(destX(mbox), destY(mbox), dests);
+  }
+
+  // Write routing tables to memory via HostLink
+  void write(HostLink* hostLink) {
+    // Request to boot loader
+    BootReq req;
+
+    // Compute number of cores per DRAM
+    const uint32_t coresPerDRAM = 1 <<
+      (TinselLogCoresPerDCache + TinselLogDCachesPerDRAM);
+
+    // Initialise write address for each routing table
+    for (int y = 0; y < boardsY; y++) {
+      for (int x = 0; x < boardsX; x++) {
+        for (int i = 0; i < TinselDRAMsPerBoard; i++) {
+          // Use one core to initialise each DRAM
+          uint32_t dest = hostLink->toAddr(x, y, coresPerDRAM * i, 0);
+          req.cmd = SetAddrCmd;
+          req.numArgs = 1;
+          req.args[0] = TinselPOLiteProgRouterBase;
+          hostLink->send(dest, 1, &req);
+          // Ensure space for an extra 32 bytes in each 
+          // table so we don't have to check for overflow below
+          // when consuming the tables in chunks of 12 bytes
+          table[y][x].table[i]->ensureSpaceFor(32);
+        }
+      }
+    }
+
+    // Write each routing table
+    bool allDone = false;
+    uint32_t offset = 0;
+    while (! allDone) {
+      allDone = true;
+      for (int y = 0; y < boardsY; y++) {
+        for (int x = 0; x < boardsX; x++) {
+          for (int i = 0; i < TinselDRAMsPerBoard; i++) {
+            Seq<uint8_t>* seq = table[y][x].table[i];
+            if (offset < seq->numElems) {
+              uint32_t dest = hostLink->toAddr(x, y, coresPerDRAM * i, 0);
+              allDone = false;
+              req.cmd = StoreCmd;
+              req.numArgs = 3;
+              req.args[0] = ((uint32_t*) seq->elems)[0];
+              req.args[1] = ((uint32_t*) seq->elems)[1];
+              req.args[2] = ((uint32_t*) seq->elems)[2];
+              hostLink->send(dest, 1, &req);
+            }
+          }
+        }
+      }
+      offset += 12;
+    }
+  }
+
+  // Destructor
+  ~ProgRouterMesh() {
+     for (int y = 0; y < boardsY; y++)
+       delete [] table[y];
+     delete [] table;
+  }
+};
+
+
+#endif
diff --git a/include/POLite/Seq.h b/include/POLite/Seq.h
index b6cb61f1..23a7616c 100644
--- a/include/POLite/Seq.h
+++ b/include/POLite/Seq.h
@@ -45,12 +45,26 @@ template <class T> class Seq
       elems = newElems;
     }
 
+    // Extend size of sequence by N
+    void extendBy(int n)
+    {
+      numElems += n;
+      if (numElems > maxElems)
+        setCapacity(numElems*2);
+    }
+
     // Extend size of sequence by one
     void extend()
     {
-      numElems++;
-      if (numElems > maxElems)
-        setCapacity(maxElems*2);
+      extendBy(1);
+    }
+
+    // Ensure space for a further N elements
+    void ensureSpaceFor(int n)
+    {
+      int newNumElems = numElems + n;
+      if (newNumElems > maxElems)
+        setCapacity(newNumElems*2);
     }
 
     // Append
diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index f6712ba1..1cbdb53e 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -361,15 +361,21 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
     Flit flit = flitInPort.value;
     if (flitInPort.canGet) begin
       flitInPort.get;
-      consumeKey <= getRoutingKey(flit.dest);
+      RoutingKey key = getRoutingKey(flit.dest);
+      consumeKey <= key
       // Write to flit buffer
       flitBuffer.write({chosenReg, consumeFlitCount}, flit);
       consumeFlitCount <= consumeFlitCount + 1;
       // On final flit, move to fetch state
       if (! flit.notFinalFlit) begin
-        consumeState <= 2;
-        // Claim chosen slot
-        flitBufferUsedSlots[chosenReg].set;
+        // Ignore keys with zero beats
+        if (key.numBeats == 0) begin
+          consumeState <= 0;
+        end else begin
+          consumeState <= 2;
+          // Claim chosen slot
+          flitBufferUsedSlots[chosenReg].set;
+        end
       end
     end
   endrule

From cb708b8d47220b775a3fd7134ae1fbc455e93d3f Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Tue, 14 Apr 2020 20:50:39 +0100
Subject: [PATCH 28/78] Account for ProgRouters in termination detector

(Compiles, but untested)
---
 config.py            |  1 +
 rtl/DE5Top.bsv       |  9 ++---
 rtl/IdleDetector.bsv | 80 ++++++++++++++++++++++++++------------------
 rtl/Makefile         |  2 +-
 rtl/Network.bsv      |  5 +++
 rtl/ProgRouter.bsv   | 34 +++++++++++++++++--
 6 files changed, 92 insertions(+), 39 deletions(-)

diff --git a/config.py b/config.py
index a4099e7e..d0a2491a 100755
--- a/config.py
+++ b/config.py
@@ -309,6 +309,7 @@ def quoted(s): return "'\"" + s + "\"'"
 
 # Cores per board
 p["LogCoresPerBoard"] = p["LogCoresPerMailbox"] + p["LogMailboxesPerBoard"]
+p["LogCoresPerBoard1"] = p["LogCoresPerBoard"] + 1
 p["CoresPerBoard"] = 2**p["LogCoresPerBoard"]
 
 # Threads per core
diff --git a/rtl/DE5Top.bsv b/rtl/DE5Top.bsv
index 0e5672fa..5c353542 100644
--- a/rtl/DE5Top.bsv
+++ b/rtl/DE5Top.bsv
@@ -140,10 +140,6 @@ module de5Top (DE5Top);
   // Create idle-detector
   IdleDetector idle <- mkIdleDetector;
 
-  // Connect cores to idle-detector
-  function idleClient(core) = core.idleClient;
-  connectCoresToIdleDetector(map(idleClient, vecOfCores), idle);
-
   // Create mailboxes
   Vector#(`MailboxMeshYLen,
     Vector#(`MailboxMeshXLen, Mailbox)) mailboxes =
@@ -172,6 +168,11 @@ module de5Top (DE5Top);
     map(map(mailboxNet), mailboxes),
     idle);
 
+  // Connect cores and ProgRouter fetchers to idle-detector
+  function idleClient(core) = core.idleClient;
+  connectClientsToIdleDetector(
+    map(idleClient, vecOfCores), noc.activities, idle);
+
   // Connections to off-chip RAMs
   for (Integer i = 0; i < `DRAMsPerBoard; i=i+1)
     connectClientsToOffChipRAM(dcaches[i],
diff --git a/rtl/IdleDetector.bsv b/rtl/IdleDetector.bsv
index 4cb3ccc5..179a9f41 100644
--- a/rtl/IdleDetector.bsv
+++ b/rtl/IdleDetector.bsv
@@ -18,14 +18,16 @@
 // The implementation below is based on Safra's termination detection
 // algorithm (EWD998).
 
-import Mailbox   :: *;
-import Globals   :: *;
-import Interface :: *;
-import Queue     :: *;
-import Vector    :: *;
-import ConfigReg :: *;
-import Util      :: *;
-import DReg      :: *;
+import Mailbox    :: *;
+import Globals    :: *;
+import Interface  :: *;
+import Queue      :: *;
+import Vector     :: *;
+import ConfigReg  :: *;
+import Util       :: *;
+import DReg       :: *;
+import ProgRouter :: *;
+import Assert     :: *;
 
 // The total number of messages sent by all threads on an FPGA minus
 // the total number of messages received by all threads on an FPGA.
@@ -343,22 +345,33 @@ interface IdleDetectorClient;
   method Bool idleStage1Ack;
 endinterface
 
-// Connect cores to idle detector
-module connectCoresToIdleDetector#(
-         Vector#(n, IdleDetectorClient) core, IdleDetector detector) ()
-           provisos (Log#(n, log_n), Add#(log_n, 1, m), Add#(_a, m, 62));
+// Connect cores and fetchers to idle detector
+module connectClientsToIdleDetector#(
+         Vector#(`CoresPerBoard, IdleDetectorClient) core,
+         Vector#(`FetchersPerProgRouter, FetcherActivity) fetcher,
+         IdleDetector detector) ()
+           provisos (Mul#(2, `CoresPerBoard, n));
+
+  staticAssert(2**`LogCoresPerBoard1 > `CoresPerBoard+`FetchersPerProgRouter,
+    "connectCoresToIdleDetector: insufficient width");
 
   // Sum "incSent" wires from each core
-  Vector#(n, Bit#(m)) incSents = newVector;
-  for (Integer i = 0; i < valueOf(n); i=i+1)
+  Vector#(n, Bit#(`LogCoresPerBoard1)) incSents = replicate(0);
+  for (Integer i = 0; i < `CoresPerBoard; i=i+1)
     incSents[i] = zeroExtend(core[i].incSent);
-  Bit#(m) incSent <- mkPipelinedReductionTree( \+ , 0, toList(incSents));
+  for (Integer i = 0; i < `FetchersPerProgRouter; i=i+1)
+    incSents[`CoresPerBoard+i] = zeroExtend(fetcher[i].incSent);
+  Bit#(`LogCoresPerBoard1) incSent <-
+    mkPipelinedReductionTree( \+ , 0, toList(incSents));
 
   // Sum "incRecv" wires from each core
-  Vector#(n, Bit#(m)) incRecvs = newVector;
-  for (Integer i = 0; i < valueOf(n); i=i+1)
+  Vector#(n, Bit#(`LogCoresPerBoard1)) incRecvs = replicate(0);
+  for (Integer i = 0; i < `CoresPerBoard; i=i+1)
     incRecvs[i] = zeroExtend(core[i].incReceived);
-  Bit#(m) incRecv <- mkPipelinedReductionTree( \+ , 0, toList(incRecvs));
+  for (Integer i = 0; i < `FetchersPerProgRouter; i=i+1)
+    incRecvs[`CoresPerBoard+i] = zeroExtend(fetcher[i].incReceived);
+  Bit#(`LogCoresPerBoard1) incRecv <-
+    mkPipelinedReductionTree( \+ , 0, toList(incRecvs));
 
   // Maintain the total count
   Reg#(MsgCount) count <- mkConfigReg(0);
@@ -369,16 +382,18 @@ module connectCoresToIdleDetector#(
   endrule
 
   // OR the "active" wires from each core
-  Vector#(n, Bool) actives = newVector;
-  for (Integer i = 0; i < valueOf(n); i=i+1)
+  Vector#(n, Bool) actives = replicate(False);
+  for (Integer i = 0; i < `CoresPerBoard; i=i+1)
     actives[i] = core[i].active;
+  for (Integer i = 0; i < `FetchersPerProgRouter; i=i+1)
+    actives[`CoresPerBoard+i] = fetcher[i].active;
   Bool anyActive <- mkPipelinedReductionTree( \|| , True, toList(actives));
 
-  // OR the "vote" wires from each core
-  Vector#(n, Bool) votes = newVector;
-  for (Integer i = 0; i < valueOf(n); i=i+1)
+  // AND the "vote" wires from each core
+  Vector#(n, Bool) votes = replicate(True);
+  for (Integer i = 0; i < `CoresPerBoard; i=i+1)
     votes[i] = core[i].vote;
-  Bool unanamous <- mkPipelinedReductionTree( \&& , False, toList(votes));
+  Bool voteDecision <- mkPipelinedReductionTree( \&& , False, toList(votes));
 
   // Register the result
   Reg#(Bool) active <- mkConfigReg(True);
@@ -386,24 +401,25 @@ module connectCoresToIdleDetector#(
   
   rule updateActive;
     active <= anyActive;
-    vote <= unanamous;
+    vote <= voteDecision;
   endrule
 
   // Counter number of stage 1 acks
-  Reg#(Bit#(m)) numAcks <- mkConfigReg(0);
+  Reg#(Bit#(`LogCoresPerBoard1)) numAcks <- mkConfigReg(0);
 
   // Sum stage 1 ack wires from each core
-  Vector#(n, Bit#(m)) incAcks = newVector;
-  for (Integer i = 0; i < valueOf(n); i=i+1)
+  Vector#(`CoresPerBoard, Bit#(`LogCoresPerBoard1)) incAcks = newVector;
+  for (Integer i = 0; i < `CoresPerBoard; i=i+1)
     incAcks[i] = zeroExtend(pack(core[i].idleStage1Ack));
-  Bit#(m) incAck <- mkPipelinedReductionTree( \+ , 0, toList(incAcks));
+  Bit#(`LogCoresPerBoard1) incAck <-
+    mkPipelinedReductionTree( \+ , 0, toList(incAcks));
 
   // Stage 1 output ack
   Wire#(Bool) stage1AckWire <- mkDWire(False);
 
   rule updateAcks;
-    Bit#(m) total = numAcks + incAck;
-    if (total == fromInteger(valueOf(n))) begin
+    Bit#(`LogCoresPerBoard1) total = numAcks + incAck;
+    if (total == `CoresPerBoard) begin
       numAcks <= 0;
       stage1AckWire <= True;
     end else begin
@@ -419,7 +435,7 @@ module connectCoresToIdleDetector#(
     detector.idle.voteIn(vote);
     detector.idle.ackStage1(stage1AckWire);
 
-    for (Integer i = 0; i < valueOf(n); i=i+1) begin
+    for (Integer i = 0; i < `CoresPerBoard; i=i+1) begin
       core[i].idleDetectedStage1(detector.idle.detectedStage1);
       core[i].idleVoteStage1(detector.idle.voteStage1);
       core[i].idleDetectedStage2(detector.idle.detectedStage2);
diff --git a/rtl/Makefile b/rtl/Makefile
index cc521bae..57a2acf8 100644
--- a/rtl/Makefile
+++ b/rtl/Makefile
@@ -11,7 +11,7 @@ DEFS = $(shell python ../config.py defs)
 BSC = bsc
 BSCFLAGS = -wait-for-license -suppress-warnings S0015 \
            -suppress-warnings G0023 \
-           -steps-warn-interval 500000 -check-assert \
+           -steps-warn-interval 750000 -check-assert \
            +RTS -K32M -RTS
 
 # Top level module
diff --git a/rtl/Network.bsv b/rtl/Network.bsv
index 4ee2e69b..82fbbd6c 100644
--- a/rtl/Network.bsv
+++ b/rtl/Network.bsv
@@ -289,6 +289,8 @@ interface NoC;
     Vector#(`FetchersPerProgRouter, BOut#(DRAMReq))) dramReqs;
   interface Vector#(`DRAMsPerBoard,
     Vector#(`FetchersPerProgRouter, In#(DRAMResp))) dramResps;
+  // ProgRouter fetcher activities
+  interface Vector#(`FetchersPerProgRouter, FetcherActivity) activities;
 endinterface
 
 module mkNoC#(
@@ -462,6 +464,9 @@ module mkNoC#(
   // Responses from off-chip memory
   interface dramResps = boardRouter.ramResps;
 
+  // Fetcher activities
+  interface activities = boardRouter.activities;
+
 endmodule
 
 endpackage
diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index 1cbdb53e..f100fd10 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -11,6 +11,7 @@ import Interface :: *;
 import BlockRam  :: *;
 import Assert    :: *;
 import Util      :: *;
+import DReg      :: *;
 
 // =============================================================================
 // Routing keys and beats
@@ -253,6 +254,16 @@ interface Fetcher;
   // Off-chip RAM connections
   interface Vector#(`DRAMsPerBoard, BOut#(DRAMReq)) ramReqs;
   interface Vector#(`DRAMsPerBoard, In#(DRAMResp)) ramResps;
+  // Activity
+  interface FetcherActivity activity;
+endinterface
+
+// Fetcher activity for performance counters and termination detection
+(* always_ready *)
+interface FetcherActivity;
+  method Bit#(1) incSent;
+  method Bit#(1) incReceived;
+  method Bool active;
 endinterface
 
 // Fetcher module
@@ -293,6 +304,10 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
   // Final output queue for flits
   Queue1#(RoutedFlit) flitOutQueue <- mkUGShiftQueue(QueueOptFmax);
 
+  // Activity
+  Reg#(Bit#(1)) incSentReg <- mkDReg(0);
+  Reg#(Bit#(1)) incReceivedReg <- mkDReg(0);
+
   // Stage 1: consume input message
   // ------------------------------
 
@@ -362,7 +377,7 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
     if (flitInPort.canGet) begin
       flitInPort.get;
       RoutingKey key = getRoutingKey(flit.dest);
-      consumeKey <= key
+      consumeKey <= key;
       // Write to flit buffer
       flitBuffer.write({chosenReg, consumeFlitCount}, flit);
       consumeFlitCount <= consumeFlitCount + 1;
@@ -404,6 +419,7 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
       ramReqQueue[consumeKey.ram].enq(req);
       fetchBeatCount <= fetchBeatCount + zeroExtend(req.burst);
       beatBufferLen.incBy(zeroExtend(req.burst));
+      incReceivedReg <= 1;
       if (finished) consumeState <= 0;
     end
   endrule
@@ -583,8 +599,10 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
       // Is this the final flit in the message?
       if (flit.notFinalFlit)
         newFlitCount = emitFlitCount + 1;
-      else
+      else begin
+        incSentReg <= 1;
         newFlitCount = 0;
+      end
     end
     // Issue flit load request
     flitBuffer.read({info.msgAddr, newFlitCount});
@@ -626,6 +644,13 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
   interface ramReqs = map(queueToBOut, ramReqQueue);
   interface ramResps = ramRespsOut;
 
+  interface FetcherActivity activity;
+    method Bit#(1) incSent = incSentReg;
+    method Bit#(1) incReceived = incReceivedReg;
+    method Bool active =
+      beatBufferLen.value == 0 && interpreterState == 0;
+  endinterface
+
 endmodule
 
 // =============================================================================
@@ -761,6 +786,9 @@ interface ProgRouter;
     Vector#(`FetchersPerProgRouter, BOut#(DRAMReq))) ramReqs;
   interface Vector#(`DRAMsPerBoard,
     Vector#(`FetchersPerProgRouter, In#(DRAMResp))) ramResps;
+
+  // Activities
+  interface Vector#(`FetchersPerProgRouter, FetcherActivity) activities;
 endinterface
 
 module mkProgRouter#(BoardId boardId) (ProgRouter);
@@ -822,11 +850,13 @@ module mkProgRouter#(BoardId boardId) (ProgRouter);
       ramRespIfc[i][j] = fetchers[j].ramResps[i];
     end
 
+  function FetcherActivity getActivity(Fetcher f) = f.activity;
   interface flitIn = flitInIfc;
   interface flitOut = flitOutIfc;
   interface nocFlitOut = nocFlitOutIfc;
   interface ramReqs = ramReqIfc;
   interface ramResps = ramRespIfc;
+  interface activities = map(getActivity, fetchers);
 
 endmodule
 

From aec8e7b295a1e9cec38062151ca4db117cc5e1dc Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Wed, 15 Apr 2020 09:29:15 +0100
Subject: [PATCH 29/78] Fix to previous commit

The 'active' condition in the fetcher was inverted, and also referred
to the wrong state variable.
---
 rtl/ProgRouter.bsv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index f100fd10..63f9e1e8 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -648,7 +648,7 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
     method Bit#(1) incSent = incSentReg;
     method Bit#(1) incReceived = incReceivedReg;
     method Bool active =
-      beatBufferLen.value == 0 && interpreterState == 0;
+      beatBufferLen.value != 0 || consumeState != 0;
   endinterface
 
 endmodule

From 3276d487b8b93a33b6a5d0e4923f23722f51c412 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Wed, 15 Apr 2020 15:41:55 +0100
Subject: [PATCH 30/78] Some fixes to ProgRouters.h

Missing 'return' statement, and missing use of 'offset' when uploading
routing tables.  The POLite heat grid now appears to work in
simulation.  Ready to try things on FPGA.
---
 include/POLite/ProgRouters.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/include/POLite/ProgRouters.h b/include/POLite/ProgRouters.h
index 34f62694..8fe3c143 100644
--- a/include/POLite/ProgRouters.h
+++ b/include/POLite/ProgRouters.h
@@ -299,7 +299,7 @@ class ProgRouterMesh {
 
   // Add routing destinations from given global mailbox id
   uint32_t addDestsFromBoard(uint32_t mbox, Seq<PRoutingDest>* dests) {
-    addDestsFromBoardXY(destX(mbox), destY(mbox), dests);
+    return addDestsFromBoardXY(destX(mbox), destY(mbox), dests);
   }
 
   // Write routing tables to memory via HostLink
@@ -340,12 +340,13 @@ class ProgRouterMesh {
             Seq<uint8_t>* seq = table[y][x].table[i];
             if (offset < seq->numElems) {
               uint32_t dest = hostLink->toAddr(x, y, coresPerDRAM * i, 0);
+              uint8_t* base = &seq->elems[offset];
               allDone = false;
               req.cmd = StoreCmd;
               req.numArgs = 3;
-              req.args[0] = ((uint32_t*) seq->elems)[0];
-              req.args[1] = ((uint32_t*) seq->elems)[1];
-              req.args[2] = ((uint32_t*) seq->elems)[2];
+              req.args[0] = ((uint32_t*) base)[0];
+              req.args[1] = ((uint32_t*) base)[1];
+              req.args[2] = ((uint32_t*) base)[2];
               hostLink->send(dest, 1, &req);
             }
           }

From fcefa919a6abe5696e92ecf40a2fc176a838d512 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Thu, 16 Apr 2020 17:25:15 +0100
Subject: [PATCH 31/78] Better handling of IND records

There was a cyclic dependency in the ProgRouter between trying to send
an IND-routed flit and trying to consume one, which would surely lead
to deadlock in some cases.

This is solved by (1) removing the loopback route around the
ProgRouter, and handling IND records within each fetcher, entirely
independently of the other fetchers; and (2) restricting the number of
IND records allowed per key lookup to one.  For efficiency, we
introduce a second restriction: IND records are only allowed in
max-sized key lookups.  This means that fetchers only block in the
worst case (and even then we allow two outstanding indirections at a
time, per fetcher, to keep utilisation high).  These restrictions
still permit the use case for which IND records are intended:
overcoming the max beat limit of a key lookup.

While doing this I also spotted and fixed a few bugs: an overflow bug
in one of the fetcher's comparators, and incorrect handling of
multi-flit messages inside the fetchers.
---
 README.md                      |  19 ++-
 apps/progrouter/progrouter.cpp |  14 +-
 config.py                      |   4 +-
 de5/S5_DDR3_QSYS.qsys          |   4 +-
 doc/figures/logo.png           | Bin 0 -> 7183 bytes
 rtl/Network.bsv                |   3 -
 rtl/ProgRouter.bsv             | 228 ++++++++++++++++++++-------------
 7 files changed, 167 insertions(+), 105 deletions(-)
 create mode 100644 doc/figures/logo.png

diff --git a/README.md b/README.md
index b001abcd..a3340c3f 100644
--- a/README.md
+++ b/README.md
@@ -17,16 +17,21 @@ bool keySend(uint32_t key, uint32_t numFlits, void* msg, bool block = true);
 bool keyTrySend(uint32_t key, uint32_t numFlits, void* msg);
 ```
 
+New section on programmable routers:
+  * Routing record format, byte ordering etc.
+  * Semantics of records
+  * Restrictions on IND records
+  * Avoiding deadlock: programmer has some added resposibility here
+
 # Tinsel 0.7.1
 
 Tinsel is a [RISC-V](https://riscv.org/)-based manythread
 message-passing architecture designed for FPGA clusters.  It is being
 developed as part of the [POETS
 Project](https://poets-project.org/about) (Partial Ordered Event
-Triggered Systems).  This manual describes the architecture and
-associated APIs.  Further background can be found in our [FPL 2019
-paper](doc/fpl-2019-paper.pdf), which presents Tinsel 0.6.  If you're
-a POETS Partner, you can access a machine running Tinsel in the [POETS
+Triggered Systems).  Further background can be found in our [FPL 2019
+paper](doc/fpl-2019-paper.pdf).  If you're a POETS Partner, you can
+access a machine running Tinsel in the [POETS
 Cloud](https://github.com/POETSII/poets-cloud).  
 
 ## Release Log
@@ -46,7 +51,7 @@ Released on 10 Sep 2018 and maintained in the
 * [v0.5](https://github.com/POETSII/tinsel/releases/tag/v0.5):
 Released on 8 Jan 2019 and maintained in the
 [tinsel-0.5.1 branch](https://github.com/POETSII/tinsel/tree/tinsel-0.5.1).
-(Hardware idle-detection.)
+(Hardware termination-detection.)
 * [v0.6](https://github.com/POETSII/tinsel/releases/tag/v0.6):
 Released on 11 Apr 2019 and maintained in the
 [tinsel-0.6.3 branch](https://github.com/POETSII/tinsel/tree/tinsel-0.6.3).
@@ -106,7 +111,7 @@ demands, but fairly modest compute requrements.  The main features are:
     instructions for sending and receiving messages 
     between any two threads in the cluster.
 
-  * **Hardware termination detection**.  A global termination event is
+  * **Hardware termination-detection**.  A global termination event is
     triggered when every thread indicates termination and no messages
     are in-flight.  Termination can be interpreted as termination of a
     time step, or termination of the application, supporting
@@ -563,7 +568,7 @@ Tinsel also provides a function
   int tinselIdle(bool vote);
 ```
 
-which blocks until either
+for global termination detection, which blocks until either
 
   1. a message is available to receive, or
 
diff --git a/apps/progrouter/progrouter.cpp b/apps/progrouter/progrouter.cpp
index 6b764ac5..28fb494c 100644
--- a/apps/progrouter/progrouter.cpp
+++ b/apps/progrouter/progrouter.cpp
@@ -189,9 +189,14 @@ int main()
   msgOut[1] = 0x20;
   msgOut[2] = 0x30;
   msgOut[3] = 0x40;
+  msgOut[4] = 0x50;
+  msgOut[5] = 0x60;
+  msgOut[6] = 0x70;
+  msgOut[7] = 0x80;
 
   // On thread 0
   if (me == 0) {
+tinselSetLen(1);
     // Add an URM1 record
     uint8_t* entry1 = table.currentPointer();
     table.addURM1(0, 0, 10, 0xfff);
@@ -199,12 +204,7 @@ int main()
     table.addURM2(0, 0, 60, 0xff3, 0xff2);
     table.addURM2(0, 0, 60, 0xff5, 0xff4);
     //table.addMRM(1, 0, 0x22222222, 0x11111111, 0x2222);
-    uint8_t* ind = table.addIND();
     table.next();
-    uint8_t* entry2 = table.currentPointer();
-    table.addURM1(0, 0, 20, 0x111);
-    table.next();
-    table.setIND(ind, 0, entry2, 1);
 
     // Cache flush, to write table into RAM
     tinselCacheFlush();
@@ -226,7 +226,9 @@ int main()
   while (me != 0) {
     tinselWaitUntil(TINSEL_CAN_RECV);
     volatile uint32_t* msgIn = (uint32_t*) tinselRecv();
-    printf("%x %x %x %x\n", msgIn[0], msgIn[1], msgIn[2], msgIn[3]);
+    printf("%x %x %x %x %x %x %x %x\n",
+        msgIn[0], msgIn[1], msgIn[2], msgIn[3]
+      , msgIn[4], msgIn[5], msgIn[6], msgIn[7]);
     tinselFree(msgIn);
   }
 
diff --git a/config.py b/config.py
index d0a2491a..1d2e5b07 100755
--- a/config.py
+++ b/config.py
@@ -164,6 +164,8 @@ def quoted(s): return "'\"" + s + "\"'"
 # Programmable router parameters:
 p["LogRoutingEntryLen"] = 5 # Number of beats in a routing table entry
 p["ProgRouterMaxBurst"] = 4
+p["ProgRouterCrossbarOutputs"] = 4
+p["FetcherLogIndQueueSize"] = 1
 p["FetcherLogBeatBufferSize"] = 5
 p["FetcherLogFlitBufferSize"] = 5
 p["FetcherLogMsgsPerFlitBuffer"] = (
@@ -187,7 +189,7 @@ def quoted(s): return "'\"" + s + "\"'"
 p["UseCustomAccelerator"] = False
 
 # Clock frequency (in MHz)
-p["ClockFreq"] = 225
+p["ClockFreq"] = 220
 
 #==============================================================================
 # Derived Parameters
diff --git a/de5/S5_DDR3_QSYS.qsys b/de5/S5_DDR3_QSYS.qsys
index 0695a737..dc87cb4b 100644
--- a/de5/S5_DDR3_QSYS.qsys
+++ b/de5/S5_DDR3_QSYS.qsys
@@ -891,7 +891,7 @@
   <parameter name="MEM_CK_PHASE" value="0.0" />
   <parameter name="MEM_CK_WIDTH" value="1" />
   <parameter name="MEM_CLK_EN_WIDTH" value="1" />
-  <parameter name="MEM_CLK_FREQ" value="450.0" />
+  <parameter name="MEM_CLK_FREQ" value="440.0" />
   <parameter name="MEM_CLK_FREQ_MAX" value="800.0" />
   <parameter name="MEM_COL_ADDR_WIDTH" value="10" />
   <parameter name="MEM_CS_WIDTH" value="1" />
@@ -1214,7 +1214,7 @@
   <parameter name="MEM_CK_PHASE" value="0.0" />
   <parameter name="MEM_CK_WIDTH" value="1" />
   <parameter name="MEM_CLK_EN_WIDTH" value="1" />
-  <parameter name="MEM_CLK_FREQ" value="450.0" />
+  <parameter name="MEM_CLK_FREQ" value="440.0" />
   <parameter name="MEM_CLK_FREQ_MAX" value="800.0" />
   <parameter name="MEM_COL_ADDR_WIDTH" value="10" />
   <parameter name="MEM_CS_WIDTH" value="1" />
diff --git a/doc/figures/logo.png b/doc/figures/logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..8271002b57533674602c876f50bfdb9e2f8a4dcd
GIT binary patch
literal 7183
zcmWkz1z1yU6dv6%N?KC7Q@T@9T0*27WOVoF2I(&8=100iLP8qph7r>BU$=YrJ^P;B
zdr!XSJ!g^Xs`6Oqr05_J2uo2xMiW?bfKvw*8Tgz>ODO;r<S)wdGN9Lg@BGfPWMBu)
zSwY_w1i~KpcfzHzV3PrxC=f*zS(F_Fay&LBQ>{5;V2i>=OAjLL<ltc82m!u=K+<0=
zOd%HLR30`EYbtp~6?JoV3ltED%vDiFQp<Ds)W}AQz^dK<dwZtijY<d{0iymYpGHw{
z%+Ol0VBwoV`phihBlDluh59kB6d^+iGHOsox#0l~o&}zyZySP+F0tM*8WIHRh+#yy
zOJt~awc4=fU%dP(PE^$8*V!K#>c4kCv>*E%t^L6XhoY8pz9gG2zrNDQ2!bY`LYYM=
zni<sk82E{nXEe)aW;VDi(3@`0BvTXNV2gUz1n9h_#T$_eCi16=v>TO`n<Nxilf&}9
zg)}F@eBv5DTZm^JkL+3twcF_kDC%`zkTPqGW#`vBM@%|NGgl3@U@H+%t?TfIka7hD
ziC>?bBy|m0#Z<r)v=p?dq^e4EpNlmWGG8&$__D3^`|mAN$3}KP=j8O(Ro(H*LSi7G
z9?)mHq~8okz2|@SuH-P{cu6mqqCA~vnk8zTw(06LnMM>OKyu=t16E9h{lz>Na*DMA
zxRY8(;5CLK4ekb3;$J?5zcgmh3>-eWjW<x`)L?Oaf(|u=ONT<^l!S&9=#=@5czzLI
zU9Ay{;AmD1r1}%v3x%6Xi2NBUHOtQaOse({BdLBx0ylqjN`5!p1b*lbO`(r4UhynR
zAB|<MJ&X2=ktE{Gp}NlwYRV4}X`kB~g^q|INa-Y8MumnP&Ea|lha||Rx%}?$$MVL;
zg3&1fK{=Xsu0!HIf8x-CF5MOGq)xS=GX*i4xm-cG`L!P#*WSE_9gCeRLBlx^OppQ{
z5k128&fJ*SK^lf4A|7Ij-(T?J-dla26Jo^F?~lS_@o<7!x@?1oF&Dzg0Uq*mdBP}l
ztme-Ty%lP25703RSU4gEhUGsjxmjh^5Y~U7N7S&3Z}~BJhzRFvL;=rl53=EP_S+-5
z8y+rV;+wxzra^U>Kmi-cw-dS9v<nj13+XTe5;~?OQsqWGx=u-MonjeT(C?U<I(t3X
zcur=c&Dcr`QG`Bt&(Xc3ae9C-dYlm)!v%c2US|D8oYt9#%tM{+m92UZ&QH<b@Of?Z
ze7rz9;pPNwfy=B%fo^(dS28&(dbvnmBRK`JQxzp1OuIpdk8l79)MDn|5@`Nsskr%{
zrN=iH?BPMvbBMb&M(0LNsD|;B(@d$>ZJyv}@u$Ft*)mCxV4QSqvz;svge+^4u6vC0
z=d&P%MzqptOM-%WvyXs6Nib}j10fisF98yHr-p2z0jKj00rXd|0r3(Jp~u<q_>J;F
zEz2QvzhG=6(6}P>5$5mlr={L(w%2+#gSAPB0i}ILbELmt(twTk{yqVR9CgMze{wSX
z@oGx?lRC&qpTtluF7)WZ4K+44AuP0qiLrb(8`a2$iMjg$U9HLxZE_aB$}q09Qa4e+
zhrih0|6IR=IJTnVJTf+HV2z^6ke!GXQc{ecnW=t0#?r>+#yq#QgglBYJru<1390&E
z$#px{-j)yPM>bdc7I^ghs8R1g*1(`DvlfY8{_tRDn8%OxaAcd}xET0pdq`4OGx%vy
z5BMnvFj>yN%)xc}duPzV)p=Yb92)%hn=@Hs2=6Mx2Z{4s(S(QlUZ-6p1q67ii@&@B
zl(LYVVl!gos>;fOr&W<*TL|8Vl0d+VxS+|;qzzx+(^<27rH^6;>04QG;wdV737c~(
ztgafw_$fBJ&M<9w9k4K~Utcer8+8y5Trn=Sf!zQnF}yrk2_hi{xw(hmX21Wu`_)<0
zC)mZENOa*(U#VHl4cTyW4Tyhen=a|`sClW>?B{%K1%;EUagn?|Ic*{_bTxyCmnOkD
z9taldaxG_cbgAA#*>Z6oqM$rqc-S{$&dJG<Jn^^J`;$S(^W#*zGR@1^#k4f26WX+x
z!}@1tyZ0?<0m<i5Eb#ogB?(~={8{^+R?lxa#E#T7fXf)4HxLw_pPXT*t<)FSvI<{l
zvLi6zj?b{Y6i{~%kT=ebYiU`y@d#I1NQ(a>CVp{{eiy7`S9hrQyHLf7QViT=P{+>h
zKo?&B6v>sE#A&m0_%rMsymV;8L{<Sib8t%&!P%5JPTqg68qFRYfM?bJbf9&0Q5Mez
z!R52D4XzQgsNXMHp(-c@?l;*~8`FWi`43}`)M#w9sXrK0OKxT0Mx`o!x>n%48R>Vq
zjh2ixH(Q!)E`m-wAL5PS<6&}O_Df`CMK|DxhqBTkUcNkU^0~g__42yg-H^luoA>rQ
zQwribYG?q9y15iXVSPP#aBxtaIo0!K$%-wxW{`_BU^ccTH$z4KhMDi@si^Dx&j#et
zMQoe0h2wg&VeIVpuE<zeSWqamr@>;J(|iOMRfqzzyPJTF+{eU-Iw>74K}m_N4_ilG
zndnEz42D0jvz@-WUg-7mkv`f-snO+d?&{j;=#;f840#u;Ss|mNqciSu{F_ZnSGTC9
z2Ah0%^LO<pB8B~)zGSMVQ7JubSxmDw%gwIpAx=z06dz02>VF~njh353v1GWJXuf+t
zNuHiz0mI=~WOa^fLV~xj_?}S21UWM*&3MXi;_MH|&g6#3Siv1mCLzmK`apOjw6y~P
zhs=`Pe0oK=nIJ>PXG?&Nfq{jFSg7+ZBR<tZu1)7A^`bYcVve7mLPL3_pxN?Rhd*aj
zuOMe)3DDhhJ60eeBV%cgjh4oKwGI3n9Kr<~e1qj>Dj7I1Pl%_h+cS!WIo$l$Cm{l*
z{9c$#v%>UE$fVA7d`9u?@wLOqUV@61R&7-V9Zl>I)qfw}3?{`|D$i!}TaodQo$BrM
z^+}P7`7R#0<)}u$UZ0(0rZSQA^TgP8_!;o~UBW%{OJbs-HSx2}QN8*u$Ob|C4#^X%
zrH#|sSVbyURu@ixx*TrhS6SHQRE#Ky85sIaD+2l%J4xi|3|nb(=u?*swWan_>09iD
z;_S#4cw+bYIDD;6L;Aicn}OHzAeh5JNIfIF$UZ7GHmN@Ju1pqE>OuQ3Wt=#ka_gcs
ztCXZ6ioBRyJJ;ptM5SI*`eZ^(V$Pnk_=DLpoQ{r;fq7rOEl5=(54_j|DgZ<PBAngI
z5^qB|5BCIGe-R^3A7UcTA8t^PVWbyGZ(h!Q1Yk!J2pKPPIz20^{_se0l)S;{O)ftn
zI!em(KW*S*o&Mbnt1AEAz(CN?Tv0L<Z1OtKwb4<utNp16L-seiSEkeeljw6Diut_>
z#`lS_N&8%VB32Y@`6W3~2P0lFIZzRxZkOlIZQ`#TLBv5p&e2iuAkXKQTeenMy?Nvs
z$HCrScxWghFCX9L-Nklw&8RuI@BMB%G%k)v_tKOmC#1{e6ZcocbdnuONl8K9JI<u!
zWS6^h)4BP1)4TIcXk;XK{)kpu=<1C9quP8MQ(^s5d$iKR`#5PWErP;K`ci1bnosi<
zR#KP>ctUhG;)@AB7$?!&!JV17Q3ZTR{RdiFQGw>=<;BRzDB<Px{`>du>Lw;Qv2qBx
z`_##!8@sz<fby7>BE<dSSjThsSC^O0yJLim1n64Y+LF~eAr|^=KYkR+$5WzWW6Qn1
zJX2h*Qb?+*Vo&CY$7g4gr=+J#*DmxA4kE$9!Tn2JK+9jl#CkA({v3U>FUlnfZzL+`
zznm^N-(8M#Y?>GJEv~Im2z#6jq%eJ~S}!UtMg;CX_@C>(^Zlg-W2zErFsRcQM-rC`
zRVqshd%Ph*fo?+(+t}DxufvmTYio<uA4R_217_^|bTK?KHH9*Dwc*ue;kcb$dNpRw
z9YZNf8coC|1%dFImtI4$H>L}uq0!OOfWV%)PWPR+Z1nvic#(+x5jex8Dp{QVPj@B(
z8l#vVx;4d>U*A_at{0z_<(1k|QqG-UO+oBd1(o2Sh(V5G*8~>g*92g&5geo4G}X+`
z{%kE=vw=!&*8_7eF1x*}r>Ez1BCUK?oVfeRxB2>0(d_JOMlP=UweEkQ!|Qu@jy-j}
zyDJ+O7KS~Qo|UDnp`k&MX}8|trKO><IW6Mm=0-qFY<+DkCMHHStWTsnvAeU=W65J~
zc_lhFZQJ}!JV1=!VHt5<5}%fjk1+Y2N?)_xf{+MoB$0L#c-2A3gxy)$jXM?#{_DEm
zBkMTt!x$MIJv5N-E7zP5kTsv!hQu?i`@lrWMLeg^_UCN*h}aFGg@uJEnNH^$z0MvU
zt&jhOpx}Jj><eS$;0VaS*ZlaA%kDQTfSw*$hX&unRHW$5b=n=f7$|5HMRX>{f~h_)
zHCVWptVO)_hpw)!qU3ULbK~I&PYEzEpw!pbTU%Qn@a~)auF^M>5mIJO-CsF7zJdP5
z=fp&)&_d7nfc&2!HgLo>!c4Svb+@mk#9Qw(sz0?Zcd(}_0V=`0ef#$A?yhHLg)BEW
z*V)Z2tm*##;o*UXmG$S@ZHBN1dy_k9j-jF9#^E6v2L}h>f<?C|cBA&tfq{XV3*%U{
z0G{~%%TKQfE07|&*uFy9Xdv8?bai#lMR3BTf@aIqw&yB7@(=sJ-0$l(+olfM-QVAr
zS5`I({hpo0zqq)7#>R%#);f*fXJ(S_?CekhhJV9h9I8_(+wK2s?;j6%f9#9C=Qxck
zxv}RaQk7mkJdi2jm6a??%E~SW)718B?epKCOJ@)3>=xL6X7CIbys7VBx9e@NfK1FL
z&!Hn3;oy|n85_?fWGcPa&`23_p*y&S6a@r~QHo|lp-FJ<?|lVr&er`HI~P3Gy@t00
zW96jnepl5WpAr%h>XQe&Jb7hp>(m;bPQ!GWQ;QdltBq?bDhfuE89wDim=b&M=XgN(
zy`r~Wzkam_ysR@Ue!r8Pl9V)qv|d?ZVDYU*g?H#{sXR!Cg@llSfuZB|<<WC1lEFSs
zO;xq!d&?<OIYm%yZLNxrYJvJjS)PA`x8K2Zf!+^yHa!-Xc_<XM%D9tsa&nTWHa#uP
zy`AA$FH{$IdC~jCY?_uiRSArbZ+6A=doHuPyL-Y#*!`sEjfpQ@?*7WW$>-9Ngrp?+
z)Ku}wz6MsypWjSd8Nb||Eb)scr=}wLG03s%Rr2SQgm&vQyl&UYzCgjW_Qo76ERCLG
zyu1W8H8uRhHHQD;Nk?E0=8F0xBqz^0UZMpD<^{Ygm9hyqt_lK9ZZ>eZK0iN~n93Ec
zUT5Ion5s#>?1_`96r^zO##<b%aLbrAtD}&Ht#>*Y16sCvI~>lGxQi}URk1#8;si)2
zD4@#4Q^scSSpAa>o8&B*ii(A0{K&1-IDR5az-*<(fgg=6RjIm5Wav#L4?TsIS~d63
zMqPdW6&D8f3dY!xz1Gse;GiiWKzVg_v+z`ou!r;hBn6OzxGpX(cjG26!Rf}{qcz^0
z9<W>$&0G<$%b+|-Kflg?l5cSpx-2g3I=de@?Xp`4zIe%;Qx(9gsj2ziu6dj?<5}C<
zO1Zo96R{bDf@$S8&eneLD_SYfxD)Y`szjWs4hQ!eoIca2UPXRz5K416+{1f)N_6{>
zD?$<(83~PvvA<nxYU0Ht<Hxj`%sxygKIKh<e3b>;SrBou+AT``T;#BAI?Qx<AHLnw
z-X^hkjG--u%-nS+W&RBdc7DFu{|{Ax4F56odv5OY#n#}+_;{n|nn8I{W9Qu@U9M49
z&0r#DtSB1%(Nv!Na~Q@!z$xja<xh!2`L_j3e1y+&6!@(ln*;ryA55PguJzg>Op`gn
z5%J<LZ4){qztuH0gYWJ<Pu4qEK3s5~UtKk3I;ASmeG_n^1QK4Tiqb20xF-lfxOrug
zIwUO9wKmuQYvvf-W`Tj8&MrH7cY(UzMUg1}9NKYUJ(u%7gv`_wRoIX8E6Qb9JQ;C#
zP<OXD5vyJx5(W_!EiJ<FLhbOcBKcF&BcGcS8)>053ot1udYF`HDr7vJqqWTl;9aD@
zs&5pUv@itTYKn^=kdZpmvSI>#9d1$zV*0r=0jo*eR`&~G!Rw2FH>*Cpg9g|!mL$Z0
z>@gn7`58!=yQzSekiUN+9d}#N?v2Fh943y^LZ4h@03sIfd~DLa1j9${{o<7}PopYF
z182j#VP~h|bAapZw1z%qGn>!MK2&|NU7*Vhc0VHMuma5ZyLu!~00%CMpW1VZGC?OA
zP&B2=goAq=4wIA@H3H{t%O@ps_FsBwgNyZM4oaC$-Oog2YI!g7bggr8mP(cLH1za?
za4_w%km+fVk=J{vq$Chciv23;ZmuY*ijvgEa`Uu~ZqdeWx<-oyh7=BY;A@ss;oc&b
zc|19qez!krw=>)fL!QjxBaGJLNDJWNl*-&j?GQBE=H<;PwH%@OJvZgsC@54yMkaKg
zOIus}UkQ7;h!1Ew{$XWlxm=B8PCR7!@S#N0bUcmC+TOlNMQDnIC%>{%`lejtvjsP%
z7URa;4*?~y7viYb2aY?ZRp^GLTJxn-K>ukZJ{b3H{Q6rzM4&DH%AgX=%!7|mV-&(F
zLJrKEwr+qyg4y7iX9}0ox(HAgKA)}rr=X-HWoBlkjF50>%DdQIhYxoE_&m#%nwt8X
zpiAL|i&6=mG7vlfw4X0o<!#=au2g3x0d=yWp&{YhHz*;C4waxFF#tHSnwm)yk4kiS
zKa9-9U+!ohfX)Emc*SL9WHUAnN2-WHQLncFOcSG#NpQ@6x^e)VU&|Fr)daPJNfa~t
zq`oh=#&aVeFd^k<sfnv({_v;}Vv3x}^9<I6V_xgXxEqqw{FxzL0W!L}C}`)gYB#c(
z{jObUKN$cJv#jkm?1t7ZhGu3(gHyMMvk~u9vlTFiSo7(X)26ouqix1gn1E7c-Wv10
zt!;73kc5O}u2Q#F)lPz&l8cLrKQGp}w(NG@r`G89gR(NM-Vf9HF29uZB9s!4*4<HM
zF%x5cp}?PFf<W%4=Zc#-Cfl)*{F8FVc1(3kXU^xZJE#?e=wtpj`bx+Llfg*EZ0x0%
z)t{JDXbM;`rRibaYQZ{>Bgtn2k@z`Eft&piD^IxqNFGCGu1{AM&K|NwypoDFE2M9)
zZf@S?s9JUAPFlVA${4nR7J5T8TFg)gT=)B&=9xn&1pm!DE{#3my2-_2&OerI++L2W
zzaiLAwPyteVYDZAE`PuApp5xFCJ6fp84kxtE*{PT)4r6a>)N))&v19_3V)u`>a?GK
zf$i5jmiDf9$J1%W#JWE2S5;Lt*(HlTo-`5*IA#Nq+}@%rbg<=UkI>SBz3#bvt)}G6
zl+Ybz<fKRmLPw(C^xog)d#(xXa2Utr8~r%nA#;*YuTUp?8(&Dsf_ph3aL{Z^n90qP
zDBvd~5i05Bm4J+xzxN1B<gvv3o0>{<NZzyJ)O|YLSEJvEoacL~ST-?!)v*Sz3h)s7
zm6i!hn-uWSdEDA6^6^%10<0_K&v9)CL}*9knYT$+78Mvj)bxiJ{JWvB8?$${N&I@A
zA97>_3iHdo+Rj8a+N;|!ZiCZX#8~9kcON94<Hf&xwtOd2rWV1>g0UbiHSPS;u}g!{
z=&2@OvxKhh{@YFY>PjRYXai7;A>!iVRe)4k0!}R?nVeV!|B878Hbvh?Z^&C~0r;Y#
z4TVQ@Pfx*_nHf&^6TP<o{#&BYktRV4xA^6AtRsj+61?T{C+y3a;a%;g2!96Ee8`$m
z%IRw4rzz2hjkO<O_Xl2d6E7rf`8bKkRo8<~KVh259O;)rS@oyefT9JvT59V@cdfJ4
z$eBCO)m%ZapZiH&yDr6e11cI?FHmFH#5@5jWjX#xPk)1qN&NZ36c-$P@-(U@ARzD;
z_UDgnWO7L<2~8i$Ti?J&tEsJ{`N}r`=l1^2a)9FhV+L(5+qh(8(N^~M#5PzwmOp!d
zjq}siKU)B0mH7IVTS-ZYob>qgG`6me^B-&iq@0+?GDdpa`}Qmr;|==;U<pCDBZ8UP
zSq(isXjD{GT45+a^GWYyPCtzCm6R2o2fUzQE!6a0-S2}N0>o`^V8d$*#vud4C@-*1
zl)Nwu7pqC~xm-~U1{0Gzr)9)i{l)|U2mRms5ao71QUrUz!hspN7Z=yL_l0ceJ9_<}
zOL*X;qod^k?+=QHySo;A0_DG~^R<|(<sR2oSMA-Wjv+!?T3UJ(u#*!^Cnu*Y7w!_x
z$fzjvoxQ!nx;k6{#bRS)rw<_sN+kfPj{quNUw=PP&!PaDL_<ZD|52;>qIMxAEp2OW
zA`9pWt(Tf2wgvw+dAha6sKC_A+L}>BRCE)t3UGycTa_{{qLz+MVM_~<ii!%*iU0>O
zpKXfdgL|>GvaYslYne(B3&;W)F@c#O!Zx0wh?jvuX<Lxlpg9V<ugK-WUIGt6%HL`e
zq>;?;+Z|^j$%mMj{KJ*f7))lvuI#(gciwZ<sPJgP5+YNr?-3>MCiVn!Vaywn$ZuFb
z$=PT!8D0G?Zg<@;KYVa&rzBNk0_}D`aicL4raCN1*>IB>Av3;~q$`+wQntwRmD1h)
zs)SVT{pyd`WPk?|!n^PU;dqCP8U0}Fv{Gw_SND>Fw;3N-<g&sBu47=1BCRzlTHYRJ
zqUB6Tf*YfJp9Cax#a<8G*!gW4LExD9vz8%RBIHzBS_yp(8dIzlNZ`7c&{7qn#BZN(
z9thxg1@J7*{auaU`I5IG<HPaE1_k+9v~ttcX!r`#)&ujniVI=~$Hz><%u$+IB^fnU
zG2d@tWm<tVba>wvyK)e#(=H2(fMTEpJnS{wGd1cv9)4tP?HgTbI-_CryE1VN+eY4x
zYaNQQHa00sWjIF2<p<TRP$&ChAG_b`nOaJtAVStpziXPlp(aAoPCIgh55NBqE>SEg
zetNLeT<cP4E&EU~4%$`Ck~``U`M%m5UE}3N^7Th;=ewFvg?81ttE6Nm9!4loeGED?
zAr=-|P2PM)$Oqn@f_#SUGO@a~GuK^yhc`Q;bmj9~w$Nf9=W{G9h|iH0Py}w~D(K3!
zil4e)c?Qn((!M7rDVTheCJcDYvYw4BJ~kw%Hjo7>8Ehq;H!SS$cp`1e1<-)lW0FRD
zR!!4CUZT9CqXIzSAVq1FTo4y?nELOAq-P9etdENeqyMc@y~Y+7{E9aI_0wz~uKy)n
zaZu1m4nNlQ{uJ%WBG(<8QPdENE>%-ey2L<#tUJTgTY`r%C4!DiblW(}uv=I_{B$v0
zx#Mb}hzNOAbaWdH&7nU<f@(yc_zPIuBmgU&S{m1V3sF#9+=Wl(`hkVph*|dhYz=j_
z0m(KDK5RT~iEW4A;vn#mjp5ekN~H~xEc_wQKR91AsJYR#1XZKcJ-2<{8uM|jzN(ET
c9`5BWl~f{G9+4W_zyIYFWmRQrq&^4#4<Vk@r2qf`

literal 0
HcmV?d00001

diff --git a/rtl/Network.bsv b/rtl/Network.bsv
index 82fbbd6c..f8396509 100644
--- a/rtl/Network.bsv
+++ b/rtl/Network.bsv
@@ -404,9 +404,6 @@ module mkNoC#(
   List#(Out#(Flit)) botOutList = Nil;
   for (Integer x = `MailboxMeshXLen-1; x >= 0; x=x-1)
     botOutList = Cons(routers[0][x].bottomOut, botOutList);
-  // Also include loopback connection to board router to implement IND records
-  botOutList = Cons(fromBOut(boardRouter.flitOut[4]), botOutList);
-  function In#(Flit) getFlitIn(BoardLink link) = link.flitIn;
   reduceConnect(mkFlitMerger, botOutList, single(boardRouter.flitIn[4]));
 
   // Connect board router to mailbox mesh south rim
diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index 63f9e1e8..ecf6e927 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -126,7 +126,10 @@ typedef struct {
   Bit#(64) destMask;
 } MRMRecord deriving (Bits);
 
-// 48-bit Indirection (IND) record:
+// 48-bit Indirection (IND) record
+// Note the restrictions on IND records:
+// 1. At most one IND record per key lookup
+// 2. A max-sized key lookup must contain an IND record
 typedef struct {
   // Record type
   RoutingRecordTag tag;
@@ -136,6 +139,10 @@ typedef struct {
   Bit#(32) newKey;
 } INDRecord deriving (Bits);
 
+// =============================================================================
+// Internal types
+// =============================================================================
+
 // It is sometimes convenient (though redundant) to record a routing
 // decision for a flit internally within the programmable router
 typedef struct {
@@ -151,10 +158,17 @@ typedef enum {
   RouteSouth,
   RouteEast,
   RouteWest,
-  RouteNoC,
-  RouteLoop
+  RouteNoC
 } RoutingDecision deriving (Bits, Eq, FShow);
 
+// Elements of the indirection queue inside each fetcher
+typedef struct {
+  // The indirection
+  RoutingKey key;
+  // The location of the message in the flit buffer
+  FetcherFlitBufferMsgAddr addr;
+} IndQueueEntry deriving (Bits, FShow);
+
 // =============================================================================
 // Design
 // =============================================================================
@@ -165,17 +179,17 @@ typedef enum {
 // NoC edge, but the diagram assumes four.
 
 //
-//               N     S     E     W     L0..L3/Loop   Input flits
-//               |     |     |     |     |       |
-//             +---+ +---+ +---+ +---+ +---+     |
-//             | F | | F | | F | | F | | F |     |     Fetchers
-//             +---+ +---+ +---+ +---+ +---+     |
-//               |     |     |     |     |       |
-//             +---------------------------+     |
-//             |          Crossbar         |     |     Routing
-//             +---------------------------+     |
-//               |     |     |     |     |       |
-//              N/L0  S/L1  E/L2  W/L3   Ind-----+     Output queues
+//               N     S     E     W     L0..L3        Input flits
+//               |     |     |     |     |        
+//             +---+ +---+ +---+ +---+ +---+      
+//             | F | | F | | F | | F | | F |           Fetchers
+//             +---+ +---+ +---+ +---+ +---+      
+//               |     |     |     |     |        
+//             +---------------------------+      
+//             |          Crossbar         |           Routing
+//             +---------------------------+      
+//               |     |     |     |              
+//              N/L0  S/L1  E/L2  W/L3                 Output queues
 //               |     |     |     |
 //             +---------------------------+
 //             |          Splitter         |           Final splitting
@@ -192,15 +206,15 @@ typedef enum {
 
 // The key property of these fetchers is that they act entirely
 // indepdedently of each other: each one can make progress even if
-// another is blocked.  Unfortunately, this leads to a duplicated
-// logic resources, but is necessary to avoid deadlock.
+// another is blocked.  This leads to duplicated logic resources, but
+// is necessary to avoid deadlock.
 
-// Note that, as the routers are fully programmable, it is possible
-// for the programmer to introduce deadlock using an ill-defined
-// routing scheme, e.g. where a flit arrives in on (say) link N and
-// requires a flit to be sent back along the same direction N.
-// However, the hardware does guarantee deadlock-freedom if the
-// routing scheme is based on dimension-ordered routing.
+// As the routers are fully programmable, it is possible for the
+// programmer to introduce deadlock using an ill-defined routing
+// scheme, e.g. where a flit arrives in on (say) link N and requires a
+// flit to be sent back along the same direction N.  However, the
+// hardware does guarantee deadlock-freedom if the routing scheme is
+// based on dimension-ordered routing.
 
 // After the fetchers have interpreted the flits, they are fed to a
 // fair crossbar which organises them by destination into output
@@ -234,6 +248,8 @@ typedef struct {
   Bit#(`BeatBurstWidth) burst;
   // Is this the final burst of routing records for the current key?
   Bool finalBurst;
+  // Are we processing a max-sized key (which must contain an IND record)?
+  Bool isMaxSizedKey;
 } InflightFetcherReqInfo deriving (Bits, FShow);
 
 // Routing beat, tagged with the beat number in the DRAM burst
@@ -304,6 +320,12 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
   // Final output queue for flits
   Queue1#(RoutedFlit) flitOutQueue <- mkUGShiftQueue(QueueOptFmax);
 
+  // Indirection queue and size
+  SizedQueue#(`FetcherLogIndQueueSize, IndQueueEntry) indQueue <-
+    mkUGShiftQueue(QueueOptFmax);
+  Count#(TAdd#(`FetcherLogIndQueueSize, 1)) indQueueLen <-
+      mkCount(2 ** `FetcherLogIndQueueSize);
+
   // Activity
   Reg#(Bit#(1)) incSentReg <- mkDReg(0);
   Reg#(Bit#(1)) incReceivedReg <- mkDReg(0);
@@ -333,6 +355,9 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
   // Maintain count of routing beats fetched so far
   Reg#(Bit#(`LogRoutingEntryLen)) fetchBeatCount <- mkReg(0);
 
+  // Track when messages are bypassing fetcher, to keep the bypass atomic
+  Reg#(Bool) bypassInProgress <- mkReg(False);
+
   // State 0: pass through flits that don't contain routing keys
   rule consumeMessage0 (consumeState == 0);
     Flit flit = flitInPort.value;
@@ -344,29 +369,54 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
         found = True;
         chosen = fromInteger(i);
       end
-    chosenReg <= chosen;
     // Initialise counters for subsequent states
     consumeFlitCount <= 0;
     fetchBeatCount <= 0;
-    // Consume flit
-    if (flitInPort.canGet) begin
-      if (flit.dest.addr.isKey) begin
-        if (found) begin
-          consumeState <= 1;
+    // First, try to consume indirection
+    if (indQueue.canDeq && indQueue.canPeek && !bypassInProgress) begin
+      IndQueueEntry ind = indQueue.dataOut;
+      // Consume
+      indQueue.deq;
+      // Release space in indQueue, unless we have another max-sized key
+      if (!allHigh(ind.key.numBeats))
+        indQueueLen.dec;
+      // Jump straight to fetch state, as message already in flit buffer
+      chosenReg <= ind.addr;
+      consumeKey <= ind.key;
+      // Proceed only if key size is non-zero
+      if (ind.key.numBeats != 0)
+        consumeState <= 2;
+    end else begin
+      chosenReg <= chosen;
+      // Otherwise, try to consume flit
+      if (flitInPort.canGet) begin
+        if (flit.dest.addr.isKey) begin
+          if (found) begin
+            RoutingKey key = getRoutingKey(flit.dest);
+            // For a full-size key, we must reserve space in the indQueue
+            if (allHigh(key.numBeats)) begin
+              if (indQueueLen.notFull) begin
+                indQueueLen.inc;
+                consumeState <= 1;
+              end
+            end else
+              consumeState <= 1;
+          end
+        end else if (flitBypassQueue.notFull) begin
+          flitInPort.get;
+          bypassInProgress <= flit.notFinalFlit;
+          // Make routing decision
+          RoutingDecision decision = RouteNoC;
+          MailboxNetAddr addr = flit.dest.addr;
+          if (addr.host.valid)
+            decision = addr.host.value == 0 ? RouteWest : RouteEast;
+          else if (addr.board.x < boardId.x) decision = RouteWest;
+          else if (addr.board.x > boardId.x) decision = RouteEast;
+          else if (addr.board.y < boardId.y) decision = RouteSouth;
+          else if (addr.board.y > boardId.y) decision = RouteNorth;
+          // Insert into bypass queue
+          flitBypassQueue.enq(RoutedFlit { decision: decision, flit: flit});
         end
-      end else if (flitBypassQueue.notFull) begin
-        flitInPort.get;
-        // Make routing decision
-        RoutingDecision decision = RouteNoC;
-        MailboxNetAddr addr = flit.dest.addr;
-        if (addr.host.valid)
-          decision = addr.host.value == 0 ? RouteWest : RouteEast;
-        else if (addr.board.x < boardId.x) decision = RouteWest;
-        else if (addr.board.x > boardId.x) decision = RouteEast;
-        else if (addr.board.y < boardId.y) decision = RouteSouth;
-        else if (addr.board.y > boardId.y) decision = RouteNorth;
-        // Insert into bypass queue
-        flitBypassQueue.enq(RoutedFlit { decision: decision, flit: flit});
       end
     end
   endrule
@@ -398,7 +448,7 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
   // State 2: fetch routing beats
   rule consumeMessage2 (consumeState == 2);
     // Have we finished fetching beats?
-    Bool finished = fetchBeatCount+`ProgRouterMaxBurst >= consumeKey.numBeats;
+    Bool finished = (consumeKey.numBeats-fetchBeatCount) <= `ProgRouterMaxBurst;
     // Prepare inflight RAM request info
     // (to handle out of order resps from the RAMs)
     InflightFetcherReqInfo info;
@@ -406,6 +456,7 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
     info.burst = truncate(
       min(consumeKey.numBeats - fetchBeatCount, `ProgRouterMaxBurst));
     info.finalBurst = finished;
+    info.isMaxSizedKey = allHigh(consumeKey.numBeats);
     // Prepare RAM request
     DRAMReq req;
     req.isStore = False;
@@ -559,20 +610,16 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
         decision = RouteNoC;
       end
       // 48-bit Indirection
-      IND: begin
-        INDRecord rec = unpack(beat.chunks[4]);
-        flit.dest.threads = {?, rec.newKey};
-        decision = RouteLoop;
-      end
+      IND: begin end
     endcase
     // Is output queue ready for new flit?
     Bool emit = flitProcessedQueue.notFull;
     let newFlitCount = emitFlitCount;
     // Consume routing record
     if (emit) begin
-      flitProcessedQueue.enq(RoutedFlit { decision: decision, flit: flit });
-      // Move to next record
-      recordCount <= recordCount + 1;
+      // Only enqueue if not an IND record
+      if (tag != IND)
+        flitProcessedQueue.enq(RoutedFlit { decision: decision, flit: flit });
       // Shift beat to point to next record
       RoutingBeat newBeat = beat;
       Bool doubleChunk = unpack(pack(tag)[0]);
@@ -585,21 +632,32 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
         for (Integer i = 4; i > 0; i=i-1)
           newBeat.chunks[i] = beat.chunks[i-1];
       end
-      beatReg <= NumberedRoutingBeat {
-        beat: newBeat, beatNum: beatNum, info: info };
-      // Is this the final record in the beat?
-      if ((recordCount+1) == truncate(beat.size)) begin
-        interpreterState <= 0;
-        // Have we finished with this message yet?
-        if (info.finalBurst && info.burst == (beatNum+1)) begin
-          // Reclaim message slot in flit buffer
-          flitBufferUsedSlots[info.msgAddr].clear;
-        end
-      end
       // Is this the final flit in the message?
       if (flit.notFinalFlit)
         newFlitCount = emitFlitCount + 1;
       else begin
+        // Move to next record
+        recordCount <= recordCount + 1;
+        beatReg <= NumberedRoutingBeat {
+          beat: newBeat, beatNum: beatNum, info: info };
+        // Handle IND record: insert into indirection queue
+        if (tag == IND) begin
+          myAssert(indQueue.notFull, "Restrictions on IND records violated");
+          INDRecord ind = unpack(beat.chunks[4]);
+          indQueue.enq(IndQueueEntry
+            { key: unpack(ind.newKey), addr: info.msgAddr });
+        end
+        // Is this the final record in the beat?
+        if ((recordCount+1) == truncate(beat.size)) begin
+          interpreterState <= 0;
+          // Have we finished with this message yet?
+          if (info.finalBurst && info.burst == (beatNum+1)) begin
+            // Reclaim message slot in flit buffer
+            // (Don't do this when we have an indirection to process)
+            if (! info.isMaxSizedKey)
+              flitBufferUsedSlots[info.msgAddr].clear;
+          end
+        end
         incSentReg <= 1;
         newFlitCount = 0;
       end
@@ -661,46 +719,46 @@ endmodule
 typedef function Bool selector(RoutedFlit flit) SelectorFunc;
 
 module mkProgRouterCrossbar#(
-         Vector#(n, SelectorFunc) f,
-         Vector#(n, BOut#(RoutedFlit)) out)
-           (Vector#(n, BOut#(RoutedFlit)))
-  provisos (Add#(a_, 1, n));
+         Vector#(numOut, SelectorFunc) f,
+         Vector#(numIn, BOut#(RoutedFlit)) out)
+           (Vector#(numOut, BOut#(RoutedFlit)))
+             provisos(Add#(a__, 1, numIn));
 
   // Input ports
-  Vector#(n, InPort#(RoutedFlit)) inPort <- replicateM(mkInPort);
+  Vector#(numIn, InPort#(RoutedFlit)) inPort <- replicateM(mkInPort);
 
   // Connect up input ports
-  for (Integer i = 0; i < valueOf(n); i=i+1)
+  for (Integer i = 0; i < valueOf(numIn); i=i+1)
     connectDirect(out[i], inPort[i].in);
 
   // Cosume wires, for each input port
-  Vector#(n, PulseWire) consumeWire<- replicateM(mkPulseWireOR);
+  Vector#(numIn, PulseWire) consumeWire <- replicateM(mkPulseWireOR);
 
   // Keep track of service history for flit sources (for fair selection)
-  Vector#(n, Reg#(Bit#(n))) hist <- replicateM(mkReg(0));
+  Vector#(numOut, Reg#(Bit#(numIn))) hist <- replicateM(mkReg(0));
 
   // Current choice of flit source
-  Vector#(n, Reg#(Bit#(n))) choiceReg <- replicateM(mkReg(0));
+  Vector#(numOut, Reg#(Bit#(numIn))) choiceReg <- replicateM(mkReg(0));
 
   // Output queue
-  Vector#(n, Queue1#(RoutedFlit)) outQueue <-
+  Vector#(numOut, Queue1#(RoutedFlit)) outQueue <-
     replicateM(mkUGShiftQueue(QueueOptFmax));
 
   // Selector mux for each out queue
-  for (Integer i = 0; i < valueOf(n); i=i+1) begin
+  for (Integer i = 0; i < valueOf(numOut); i=i+1) begin
 
     rule select;
       // Vector of input flits and available flits
-      Vector#(n, RoutedFlit) flits = newVector;
-      Vector#(n, Bool) nextAvails = newVector;
+      Vector#(numIn, RoutedFlit) flits = newVector;
+      Vector#(numIn, Bool) nextAvails = newVector;
       Bool avail = False;
-      for (Integer j = 0; j < valueOf(n); j=j+1) begin
+      for (Integer j = 0; j < valueOf(numIn); j=j+1) begin
         flits[j] = inPort[j].value;
         nextAvails[j] = inPort[j].canGet && f[i](inPort[j].value)
                           && choiceReg[i][j] == 0;
         avail = avail || (choiceReg[i][j] == 1 && inPort[j].canGet);
       end
-      Bit#(n) nextAvail = pack(nextAvails);
+      Bit#(numIn) nextAvail = pack(nextAvails);
       // Choose a new source using fair scheduler
       match {.newHist, .nextChoice} = sched(hist[i], nextAvail);
       // Select a flit
@@ -721,7 +779,7 @@ module mkProgRouterCrossbar#(
         hist[i] <= newHist;
       end
       // Consume from chosen source
-      for (Integer j = 0; j < valueOf(n); j=j+1)
+      for (Integer j = 0; j < valueOf(numIn); j=j+1)
         if (inPort[j].canGet && choiceReg[i][j] == 1 && outQueue[i].notFull)
           consumeWire[j].send;
     endrule
@@ -730,7 +788,7 @@ module mkProgRouterCrossbar#(
 
   // Consume from flit sources
   rule consumeFlitSources;
-    for (Integer j = 0; j < valueOf(n); j=j+1)
+    for (Integer j = 0; j < valueOf(numIn); j=j+1)
       if (consumeWire[j]) inPort[j].get;
   endrule
 
@@ -778,7 +836,7 @@ endmodule
 interface ProgRouter;
   // Incoming and outgoing flits
   interface Vector#(`FetchersPerProgRouter, In#(Flit)) flitIn;
-  interface Vector#(`FetchersPerProgRouter, BOut#(Flit)) flitOut;
+  interface Vector#(`ProgRouterCrossbarOutputs, BOut#(Flit)) flitOut;
   interface Vector#(`MailboxMeshXLen, BOut#(Flit)) nocFlitOut;
 
   // Interface to off-chip memory
@@ -809,15 +867,14 @@ module mkProgRouter#(BoardId boardId) (ProgRouter);
     rf.decision == RouteEast || (rf.decision == RouteNoC && xcoord(rf) == 2);
   function Bool routeW(RoutedFlit rf) =
     rf.decision == RouteWest || (rf.decision == RouteNoC && xcoord(rf) == 3);
-  function Bool routeLoop(RoutedFlit rf) = rf.decision == RouteLoop;
-  Vector#(`FetchersPerProgRouter, SelectorFunc) funcs =
-    vector(routeN, routeS, routeE, routeW, routeLoop);
+  Vector#(`ProgRouterCrossbarOutputs, SelectorFunc) funcs =
+    vector(routeN, routeS, routeE, routeW);
 
   // Crossbar
   function BOut#(RoutedFlit) getFetcherFlitOut(Fetcher f) = f.flitOut;
   Vector#(`FetchersPerProgRouter, BOut#(RoutedFlit)) fetcherOuts =
     map(getFetcherFlitOut, fetchers);
-  Vector#(`FetchersPerProgRouter, BOut#(RoutedFlit))
+  Vector#(`ProgRouterCrossbarOutputs, BOut#(RoutedFlit))
     crossbarOuts <- mkProgRouterCrossbar(funcs, fetcherOuts);
 
   // Flit input interfaces
@@ -826,18 +883,17 @@ module mkProgRouter#(BoardId boardId) (ProgRouter);
     flitInIfc[i] = fetchers[i].flitIn;
 
   // Flit output interfaces
-  Vector#(`FetchersPerProgRouter, BOut#(Flit)) flitOutIfc = newVector;
+  Vector#(`ProgRouterCrossbarOutputs, BOut#(Flit)) flitOutIfc = newVector;
   Vector#(`MailboxMeshXLen, BOut#(Flit)) nocFlitOutIfc = newVector;
 
   // Strands
   function Bool forNoC(RoutedFlit rf) = rf.decision == RouteNoC;
-  for (Integer i = 0; i < 4; i=i+1) begin
+  for (Integer i = 0; i < `ProgRouterCrossbarOutputs; i=i+1) begin
     match {.noc, .other} <- splitFlits(forNoC, crossbarOuts[i]);
     flitOutIfc[i] = other;
     if (i < `MailboxMeshXLen) nocFlitOutIfc[i] = noc;
   end
   function Flit toFlit (RoutedFlit rf) = rf.flit;
-  flitOutIfc[4] <- onBOut(toFlit, crossbarOuts[4]);
 
   // RAM interfaces
   Vector#(`DRAMsPerBoard, Vector#(`FetchersPerProgRouter, In#(DRAMResp)))

From db331bdd7597487344525460079acfc6f6d2f665 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mn416@jennings.cl.cam.ac.uk>
Date: Mon, 20 Apr 2020 08:49:35 +0000
Subject: [PATCH 32/78] Better ProgRouter test

---
 Makefile                                    |   2 +-
 apps/POLite/progrouters/Makefile            |   7 +
 apps/POLite/progrouters/ProgRoutersTest.cpp |  43 ++++
 apps/POLite/progrouters/Run.cpp             |  47 ++++
 apps/progrouter/Makefile                    |  53 -----
 apps/progrouter/entry.S                     |   3 -
 apps/progrouter/genld.sh                    |  32 ---
 apps/progrouter/progrouter.cpp              | 236 --------------------
 apps/progrouter/run.cpp                     |  12 -
 include/POLite/ProgRouters.h                |   5 +-
 10 files changed, 100 insertions(+), 340 deletions(-)
 create mode 100644 apps/POLite/progrouters/Makefile
 create mode 100644 apps/POLite/progrouters/ProgRoutersTest.cpp
 create mode 100644 apps/POLite/progrouters/Run.cpp
 delete mode 100644 apps/progrouter/Makefile
 delete mode 100644 apps/progrouter/entry.S
 delete mode 100755 apps/progrouter/genld.sh
 delete mode 100644 apps/progrouter/progrouter.cpp
 delete mode 100644 apps/progrouter/run.cpp

diff --git a/Makefile b/Makefile
index d95602f9..133b1533 100644
--- a/Makefile
+++ b/Makefile
@@ -24,7 +24,6 @@ clean:
 	make -C apps/multiprog clean
 	make -C apps/sync clean
 	make -C apps/temps clean
-	make -C apps/progrouter clean
 	make -C apps/POLite/heat-gals clean
 	make -C apps/POLite/heat-sync clean
 	make -C apps/POLite/heat-cube-sync clean
@@ -40,5 +39,6 @@ clean:
 	make -C apps/POLite/izhikevich-sync clean
 	make -C apps/POLite/pressure-sync clean
 	make -C apps/POLite/hashmin-sync clean
+	make -C apps/POLite/progrouters clean
 	make -C bin clean
 	make -C tests clean
diff --git a/apps/POLite/progrouters/Makefile b/apps/POLite/progrouters/Makefile
new file mode 100644
index 00000000..9c0837be
--- /dev/null
+++ b/apps/POLite/progrouters/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: BSD-2-Clause
+APP_CPP = ProgRoutersTest.cpp
+APP_HDR = 
+RUN_CPP = Run.cpp
+RUN_H = 
+
+include ../util/polite.mk
diff --git a/apps/POLite/progrouters/ProgRoutersTest.cpp b/apps/POLite/progrouters/ProgRoutersTest.cpp
new file mode 100644
index 00000000..109565df
--- /dev/null
+++ b/apps/POLite/progrouters/ProgRoutersTest.cpp
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: BSD-2-Clause
+#include <tinsel.h>
+
+int main()
+{
+  // Get thread id
+  int me = tinselId();
+
+  // Sample outgoing message
+  volatile uint32_t* msgOut = (uint32_t*) tinselSendSlot();
+  msgOut[0] = 0x10;
+  msgOut[1] = 0x20;
+  msgOut[2] = 0x30;
+  msgOut[3] = 0x40;
+  msgOut[4] = 0x50;
+  msgOut[5] = 0x60;
+  msgOut[6] = 0x70;
+  msgOut[7] = 0x80;
+
+  // On thread 0, send to key supplied by host
+  if (me == 0) {
+    tinselSetLen(1);
+    tinselWaitUntil(TINSEL_CAN_RECV);
+    volatile uint32_t* msgIn = (uint32_t*) tinselRecv();
+    uint32_t key = msgIn[0];
+    tinselFree(msgIn);
+    
+    tinselWaitUntil(TINSEL_CAN_SEND);
+    tinselKeySend(key, msgOut);
+  }
+
+  // Print anything received
+  while (1) {
+    tinselWaitUntil(TINSEL_CAN_RECV);
+    volatile uint32_t* msgIn = (uint32_t*) tinselRecv();
+    printf("%x %x %x %x %x %x %x %x\n",
+        msgIn[0], msgIn[1], msgIn[2], msgIn[3]
+      , msgIn[4], msgIn[5], msgIn[6], msgIn[7]);
+    tinselFree(msgIn);
+  }
+
+  return 0;
+}
diff --git a/apps/POLite/progrouters/Run.cpp b/apps/POLite/progrouters/Run.cpp
new file mode 100644
index 00000000..4a7ad6ed
--- /dev/null
+++ b/apps/POLite/progrouters/Run.cpp
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: BSD-2-Clause
+#include <HostLink.h>
+#include <POLite.h>
+
+int main(int argc, char **argv)
+{
+  // Connection to tinsel machine
+  HostLink hostLink;
+
+  // Create routing tables
+  ProgRouterMesh mesh(TinselMeshXLenWithinBox, TinselMeshYLenWithinBox);
+
+  // Board (1, 0)
+  for (int i = 0; i < 60; i++) {
+    uint64_t mask = 1ul << i;
+    mesh.table[0][1].addMRM(1, 0, mask >> 32, mask, 0xf0f0);
+  }
+  uint32_t key01 = mesh.table[0][0].genKey();
+
+  // Board (0, 0)
+  for (int i = 0; i < 40; i++) {
+    uint64_t mask = 1ul << i;
+    mesh.table[0][0].addMRM(1, 0, mask >> 32, mask, 0xf0f0);
+  }
+  for (int i = 0; i < 30; i++) {
+    uint64_t mask = 1ul << i;
+    mesh.table[0][0].addMRM(1, 1, mask >> 32, mask, 0xf0f0);
+  }
+  mesh.table[0][0].addRR(2, key01); // East
+  uint32_t key00 = mesh.table[0][0].genKey();
+
+  // Transfer routing tables to FPGAs
+  mesh.write(&hostLink);
+
+  // Load code and trigger execution
+  hostLink.boot("code.v", "data.v");
+  hostLink.go();
+
+  // Send key
+  printf("Sending key %x\n", key00);
+  uint32_t msg[1 << TinselLogWordsPerMsg];
+  msg[0] = key00;
+  hostLink.send(0, 1, msg);
+
+  hostLink.dumpStdOut();
+  return 0;
+}
diff --git a/apps/progrouter/Makefile b/apps/progrouter/Makefile
deleted file mode 100644
index 76c728f5..00000000
--- a/apps/progrouter/Makefile
+++ /dev/null
@@ -1,53 +0,0 @@
-# Tinsel root
-TINSEL_ROOT=../..
-
-ifndef QUARTUS_ROOTDIR
-  $(error Please set QUARTUS_ROOTDIR)
-endif
-
-include $(TINSEL_ROOT)/globals.mk
-
-# RISC-V compiler flags
-CFLAGS = $(RV_CFLAGS) -O2 -I $(INC)
-LDFLAGS = -melf32lriscv -G 0 
-
-.PHONY: all
-all: code.v data.v run
-
-code.v: progrouter.elf
-	checkelf.sh progrouter.elf
-	$(RV_OBJCOPY) -O verilog --only-section=.text progrouter.elf code.v
-
-data.v: progrouter.elf
-	$(RV_OBJCOPY) -O verilog --remove-section=.text \
-                --set-section-flags .bss=alloc,load,contents \
-                progrouter.elf data.v
-
-progrouter.elf: progrouter.cpp link.ld $(INC)/config.h $(INC)/tinsel.h entry.o $(LIB)/lib.o
-	$(RV_CPPC) $(CFLAGS) -Wall -c -o progrouter.o progrouter.cpp
-	$(RV_LD) $(LDFLAGS) -T link.ld -o progrouter.elf entry.o progrouter.o $(LIB)/lib.o
-
-entry.o:
-	$(RV_CPPC) $(CFLAGS) -Wall -c -o entry.o entry.S
-
-link.ld: genld.sh
-	./genld.sh > link.ld
-
-$(LIB)/lib.o:
-	make -C $(LIB)
-
-$(INC)/config.h: $(TINSEL_ROOT)/config.py
-	make -C $(INC)
-
-$(HL)/%.o:
-	make -C $(HL)
-
-run: run.cpp $(HL)/*.o
-	g++ -O2 -I $(INC) -I $(HL) -o run run.cpp $(HL)/*.o
-
-sim: run.cpp $(HL)/sim/*.o
-	g++ -O2 -I $(INC) -I $(HL) -o sim run.cpp $(HL)/sim/*.o
-
-.PHONY: clean
-clean:
-	rm -f *.o *.elf link.ld *.v run sim
diff --git a/apps/progrouter/entry.S b/apps/progrouter/entry.S
deleted file mode 100644
index 18cd8d27..00000000
--- a/apps/progrouter/entry.S
+++ /dev/null
@@ -1,3 +0,0 @@
-# We assume the boot loader has already setup the stack.
-# All we need to do is jump to main.
-j main
diff --git a/apps/progrouter/genld.sh b/apps/progrouter/genld.sh
deleted file mode 100755
index cfe144c4..00000000
--- a/apps/progrouter/genld.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-# Load config parameters
-while read -r EXPORT; do
-  eval $EXPORT
-done <<< `python ../../config.py envs`
-
-# Compute space available for instructions
-MaxInstrBytes=$((4 * 2**$LogInstrsPerCore - $MaxBootImageBytes))
-
-cat - << EOF
-/* THIS FILE HAS BEEN GENERATED AUTOMATICALLY. */
-/* DO NOT MODIFY. INSTEAD, MODIFY THE genld.sh SCRIPT. */
-
-OUTPUT_ARCH( "riscv" )
-
-MEMORY
-{
-  instrs  : ORIGIN = $MaxBootImageBytes, LENGTH = $MaxInstrBytes
-  globals : ORIGIN = $DRAMBase, LENGTH = $DRAMGlobalsLength
-}
-
-SECTIONS
-{
-  .text   : { *.o(.text*) }             > instrs
-  .bss    : { *.o(.bss*) }              > globals = 0
-  .rodata : { *.o(.rodata*) }           > globals
-  .sdata  : { *.o(.sdata*) }            > globals
-  .data   : { *.o(.data*) }             > globals
-  __heapBase = ALIGN(.);
-}
-EOF
diff --git a/apps/progrouter/progrouter.cpp b/apps/progrouter/progrouter.cpp
deleted file mode 100644
index 28fb494c..00000000
--- a/apps/progrouter/progrouter.cpp
+++ /dev/null
@@ -1,236 +0,0 @@
-#include <tinsel.h>
-
-// Simplest possible example involving programmable routers
-
-/*
-Byte ordering in a routing beat:
-
-  31: Upper byte of length (i.e. number of records in beat)
-  30: Lower byte of length
-  29: Upper byte of first chunk
-  28:
-  27:
-  26:
-  25: 
-  24: Lower byte of first chunk
-  23: Upper byte of second chunk
-  22:
-  21:
-  20: 
-  19: 
-  18: Lower byte of second chunk
-  17: Upper byte of third chunk
-  16:
-  15: 
-  14: 
-  13:
-  12: Lower byte of third chunk
-  11: Upper byte of fourth chunk
-  10: 
-   9: 
-   8:
-   7:
-   6: Lower byte of fourth chunk
-   5: Upper byte of fifth chunk
-   4: 
-   3:
-   2:
-   1:
-   0: Lower byte of fifth chunk
-
-Need to fold this into the docs eventually.
-*/
-
-// Use this to align on beat boundary
-#define ALIGNED __attribute__((aligned(32)))
-
-// A single RAM beat
-struct ALIGNED Beat {
-  uint8_t bytes[32];
-};
-
-// Routing table, with methods to aid construction
-template <int NumBeats> struct RoutingTable {
-  // Raw beats comprising the table
-  Beat beats[NumBeats];
-
-  // Number of chunks used so far in current beat
-  uint32_t numChunks;
-
-  // Number of records used so far in current beat
-  uint32_t numRecords;
-
-  // Index of beat currently being filled
-  uint32_t currentBeat;
-
-  // Constructor
-  RoutingTable() {
-    currentBeat = 0;
-    numChunks = numRecords = 0;
-  }
-
-  // Pointer to current beat being filled
-  uint8_t* currentPointer() {
-    return beats[currentBeat].bytes;
-  }
-
-  // Move on to next the beat
-  void next() {
-    beats[currentBeat].bytes[31] = 0;
-    beats[currentBeat].bytes[30] = numRecords;
-    numChunks = 0;
-    numRecords = 0;
-    currentBeat++;
-  }
-
-  // Add a URM1 record to the table
-  void addURM1(uint32_t mboxX, uint32_t mboxY,
-                 uint32_t mboxThread, uint32_t localKey) {
-    if (numChunks == 5) next();
-    uint8_t* ptr = beats[currentBeat].bytes + 6*(4-numChunks);
-    ptr[0] = localKey;
-    ptr[1] = localKey >> 8;
-    ptr[2] = localKey >> 16;
-    ptr[3] = localKey >> 24;
-    ptr[4] = (mboxThread&0x1f) << 3;
-    ptr[5] = (mboxY << 3) | (mboxX << 1) | (mboxThread >> 5);
-    numChunks++;
-    numRecords++;
-  }
-
-  // Add a URM2 record to the table
-  void addURM2(uint32_t mboxX, uint32_t mboxY, uint32_t mboxThread,
-                 uint32_t localKeyHigh, uint32_t localKeyLow) {
-    if (numChunks >= 4) next();
-    uint8_t* ptr = beats[currentBeat].bytes + 6*(3-numChunks);
-    ptr[0] = localKeyLow;
-    ptr[1] = localKeyLow >> 8;
-    ptr[2] = localKeyLow >> 16;
-    ptr[3] = localKeyLow >> 24;
-    ptr[4] = localKeyHigh;
-    ptr[5] = localKeyHigh >> 8;
-    ptr[6] = localKeyHigh >> 16;
-    ptr[7] = localKeyHigh >> 24;
-    ptr[10] = (mboxThread&0x1f) << 3;
-    ptr[11] = (1 << 5) | (mboxY << 3) | (mboxX << 1) | (mboxThread >> 5);
-    numChunks += 2;
-    numRecords++;
-  }
-
-  // Add an MRM record to the table
-  void addMRM(uint32_t mboxX, uint32_t mboxY,
-                uint32_t threadsHigh, uint32_t threadsLow,
-                  uint16_t localKey) {
-    if (numChunks >= 4) next();
-    uint8_t* ptr = beats[currentBeat].bytes + 6*(3-numChunks);
-    ptr[0] = threadsLow;
-    ptr[1] = threadsLow >> 8;
-    ptr[2] = threadsLow >> 16;
-    ptr[3] = threadsLow >> 24;
-    ptr[4] = threadsHigh;
-    ptr[5] = threadsHigh >> 8;
-    ptr[6] = threadsHigh >> 16;
-    ptr[7] = threadsHigh >> 24;
-    ptr[8] = localKey;
-    ptr[9] = localKey >> 8;
-    ptr[11] = (3 << 5) | (mboxY << 3) | (mboxX << 1);
-    numChunks += 2;
-    numRecords++;
-  }
-
-  // Add an IND record to the table
-  // Return a pointer to the indirection key,
-  // so it can be set later by the caller
-  uint8_t* addIND() {
-    if (numChunks == 5) next();
-    uint8_t* ptr = beats[currentBeat].bytes + 6*(4-numChunks);
-    ptr[5] = 4 << 5;
-    numChunks++;
-    numRecords++;
-    return ptr;
-  }
-
-  // Set indirection key
-  void setIND(uint8_t* ind, bool upperRam,
-                uint8_t* beatPtr, uint32_t numBeats) {
-    uint32_t key = (uint32_t) beatPtr | numBeats;
-    if (upperRam) key |= 0x80000000;
-    ind[0] = key;
-    ind[1] = key >> 8;
-    ind[2] = key >> 16;
-    ind[3] = key >> 24;
-  }
-
-  // Add an RR record to the table
-  void addRR(uint32_t dir, uint32_t key) {
-    if (numChunks == 5) next();
-    uint8_t* ptr = beats[currentBeat].bytes + 6*(4-numChunks);
-    ptr[0] = key;
-    ptr[1] = key >> 8;
-    ptr[2] = key >> 16;
-    ptr[3] = key >> 24;
-    ptr[5] = (2 << 5) | (dir << 3);
-    numChunks++;
-    numRecords++;
-  }
-};
-
-// Create global routing table of 16 beats
-RoutingTable<16> table;
-
-int main()
-{
-  // Get thread id
-  int me = tinselId();
-
-  // Sample outgoing message
-  volatile uint32_t* msgOut = (uint32_t*) tinselSendSlot();
-  msgOut[0] = 0x10;
-  msgOut[1] = 0x20;
-  msgOut[2] = 0x30;
-  msgOut[3] = 0x40;
-  msgOut[4] = 0x50;
-  msgOut[5] = 0x60;
-  msgOut[6] = 0x70;
-  msgOut[7] = 0x80;
-
-  // On thread 0
-  if (me == 0) {
-tinselSetLen(1);
-    // Add an URM1 record
-    uint8_t* entry1 = table.currentPointer();
-    table.addURM1(0, 0, 10, 0xfff);
-    table.addURM2(0, 0, 60, 0xff1, 0xff0);
-    table.addURM2(0, 0, 60, 0xff3, 0xff2);
-    table.addURM2(0, 0, 60, 0xff5, 0xff4);
-    //table.addMRM(1, 0, 0x22222222, 0x11111111, 0x2222);
-    table.next();
-
-    // Cache flush, to write table into RAM
-    tinselCacheFlush();
-    // Wait until flush done, by issuing a load
-    volatile uint32_t* dummyPtr = (uint32_t*) entry1; dummyPtr[0];
-
-    // Construct key
-    uint32_t key = (uint32_t) entry1;
-    key = key | 2; // Entry is 2 beats long
-
-    // Send message to key
-    tinselWaitUntil(TINSEL_CAN_SEND);
-    tinselKeySend(key, msgOut);
-
-    while (1);
-  }
-
-  // On other threads, print anything received
-  while (me != 0) {
-    tinselWaitUntil(TINSEL_CAN_RECV);
-    volatile uint32_t* msgIn = (uint32_t*) tinselRecv();
-    printf("%x %x %x %x %x %x %x %x\n",
-        msgIn[0], msgIn[1], msgIn[2], msgIn[3]
-      , msgIn[4], msgIn[5], msgIn[6], msgIn[7]);
-    tinselFree(msgIn);
-  }
-
-  return 0;
-}
diff --git a/apps/progrouter/run.cpp b/apps/progrouter/run.cpp
deleted file mode 100644
index a198a064..00000000
--- a/apps/progrouter/run.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-#include <HostLink.h>
-
-int main()
-{
-  HostLink hostLink;
-
-  hostLink.boot("code.v", "data.v");
-  hostLink.go();
-  hostLink.dumpStdOut();
-
-  return 0;
-}
diff --git a/include/POLite/ProgRouters.h b/include/POLite/ProgRouters.h
index 8fe3c143..45d12cbd 100644
--- a/include/POLite/ProgRouters.h
+++ b/include/POLite/ProgRouters.h
@@ -227,14 +227,13 @@ inline uint32_t destMboxY(uint32_t mbox) {
 // ============================
 
 class ProgRouterMesh {
-  // 2D array of tables;
-  ProgRouter** table;
-
   // Board mesh dimensions
   uint32_t boardsX;
   uint32_t boardsY;
 
  public:
+  // 2D array of tables;
+  ProgRouter** table;
 
   // Constructor
   ProgRouterMesh(uint32_t numBoardsX, uint32_t numBoardsY) {

From 468f65f1d59bf11989bf343d154086b7e6ff2c21 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Mon, 20 Apr 2020 14:08:18 +0100
Subject: [PATCH 33/78] Fix mistake in beat shifter

---
 apps/POLite/progrouters/Run.cpp | 10 +++++-----
 rtl/ProgRouter.bsv              |  4 +---
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/apps/POLite/progrouters/Run.cpp b/apps/POLite/progrouters/Run.cpp
index 4a7ad6ed..c2b27bd2 100644
--- a/apps/POLite/progrouters/Run.cpp
+++ b/apps/POLite/progrouters/Run.cpp
@@ -8,21 +8,21 @@ int main(int argc, char **argv)
   HostLink hostLink;
 
   // Create routing tables
-  ProgRouterMesh mesh(TinselMeshXLenWithinBox, TinselMeshYLenWithinBox);
+  ProgRouterMesh mesh(2, 1);
 
   // Board (1, 0)
-  for (int i = 0; i < 60; i++) {
+  for (int i = 0; i < 2; i++) {
     uint64_t mask = 1ul << i;
     mesh.table[0][1].addMRM(1, 0, mask >> 32, mask, 0xf0f0);
   }
-  uint32_t key01 = mesh.table[0][0].genKey();
+  uint32_t key01 = mesh.table[0][1].genKey();
 
   // Board (0, 0)
-  for (int i = 0; i < 40; i++) {
+  for (int i = 0; i < 2; i++) {
     uint64_t mask = 1ul << i;
     mesh.table[0][0].addMRM(1, 0, mask >> 32, mask, 0xf0f0);
   }
-  for (int i = 0; i < 30; i++) {
+  for (int i = 0; i < 2; i++) {
     uint64_t mask = 1ul << i;
     mesh.table[0][0].addMRM(1, 1, mask >> 32, mask, 0xf0f0);
   }
diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index ecf6e927..9c0b5f95 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -624,10 +624,8 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
       RoutingBeat newBeat = beat;
       Bool doubleChunk = unpack(pack(tag)[0]);
       if (doubleChunk) begin
-        for (Integer i = 4; i > 2; i=i-2) begin
+        for (Integer i = 4; i > 1; i=i-1)
           newBeat.chunks[i] = beat.chunks[i-2];
-          newBeat.chunks[i-1] = beat.chunks[i-3];
-        end
       end else begin
         for (Integer i = 4; i > 0; i=i-1)
           newBeat.chunks[i] = beat.chunks[i-1];

From 249e0915da65f978e13fd2604a105e6be4d7f056 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mn416@jennings.cl.cam.ac.uk>
Date: Tue, 21 Apr 2020 13:16:40 +0000
Subject: [PATCH 34/78] Fix E/W mixup in POLite

---
 include/POLite/ProgRouters.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/POLite/ProgRouters.h b/include/POLite/ProgRouters.h
index 45d12cbd..90083802 100644
--- a/include/POLite/ProgRouters.h
+++ b/include/POLite/ProgRouters.h
@@ -250,7 +250,7 @@ class ProgRouterMesh {
                                  Seq<PRoutingDest>* dests) {
     assert(dests->numElems > 0);
 
-    // Categorise non-local dests into local, N, S, E, and W groups
+    // Categorise dests into local, N, S, E, and W groups
     Seq<PRoutingDest> local(dests->numElems);
     Seq<PRoutingDest> north(dests->numElems);
     Seq<PRoutingDest> south(dests->numElems);
@@ -260,8 +260,8 @@ class ProgRouterMesh {
       PRoutingDest dest = dests->elems[i];
       uint32_t receiverX = destX(dest.mbox);
       uint32_t receiverY = destY(dest.mbox);
-      if (receiverX < senderX) east.append(dest);
-      else if (receiverX > senderX) west.append(dest);
+      if (receiverX < senderX) west.append(dest);
+      else if (receiverX > senderX) east.append(dest);
       else if (receiverY < senderY) south.append(dest);
       else if (receiverY > senderY) north.append(dest);
       else local.append(dest);

From d4d4da8854a480c6f6d22b41f948587ed0624ea8 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mn416@cl.cam.ac.uk>
Date: Tue, 21 Apr 2020 22:38:01 +0100
Subject: [PATCH 35/78] Fix another bug

---
 rtl/ProgRouter.bsv | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index 9c0b5f95..b9cdc469 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -470,8 +470,10 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
       ramReqQueue[consumeKey.ram].enq(req);
       fetchBeatCount <= fetchBeatCount + zeroExtend(req.burst);
       beatBufferLen.incBy(zeroExtend(req.burst));
-      incReceivedReg <= 1;
-      if (finished) consumeState <= 0;
+      if (finished) begin
+        consumeState <= 0;
+        incReceivedReg <= 1;
+      end
     end
   endrule
 

From 5f6eafa9ff4d07a5b2b2479f2cf959cb53bddb24 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mn416@cl.cam.ac.uk>
Date: Wed, 22 Apr 2020 09:13:00 +0100
Subject: [PATCH 36/78] Set board id in flits emitted by ProgRouter

This was a subtle problem due to the way tinselKeySend works: it uses
a zero address with the isKey bit set as the message destination.
This means that when a message reaches the ProgRouter on a board other
than (0, 0), the board-id of any generated local messages is
incorrect.  This could be easily solved in software by tweaking
tinselKeySend, but it's simpler (from a docs/semantics perspective)
and safer to solve it in hardware.
---
 rtl/ProgRouter.bsv | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index b9cdc469..1ea7a8c6 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -545,6 +545,9 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
     // Modify flit by interpreting routing key
     RoutingDecision decision = ?;
     Flit flit = flitBuffer.dataOut;
+    // Unless otherwise stated (e.g. RR records),
+    // flits emitted will be destined for this board
+    flit.dest.addr.board = boardId;
     case (tag)
       // 48-bit Unicast Router-to-Mailbox
       URM1: begin

From ce65fc5d1abc2c4f11c117e713d842e210eeafd2 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Wed, 22 Apr 2020 20:37:45 +0100
Subject: [PATCH 37/78] Exploring impact of an extra fetcher

Introduce an additional fetcher for messages arriving locally.  How
does this impact performance (area, throughput)?
---
 config.py          |  2 +-
 rtl/Network.bsv    | 12 ++++++++----
 rtl/ProgRouter.bsv | 18 +++++++++---------
 3 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/config.py b/config.py
index 1d2e5b07..e9917ae5 100755
--- a/config.py
+++ b/config.py
@@ -380,7 +380,7 @@ def quoted(s): return "'\"" + s + "\"'"
 
 # Parameters for programmable routers
 # (and the routing-record fetchers they contain)
-p["FetchersPerProgRouter"] = 5
+p["FetchersPerProgRouter"] = 6
 p["LogFetcherFlitBufferSize"] = 5
 
 #==============================================================================
diff --git a/rtl/Network.bsv b/rtl/Network.bsv
index f8396509..baf36d63 100644
--- a/rtl/Network.bsv
+++ b/rtl/Network.bsv
@@ -401,10 +401,14 @@ module mkNoC#(
 
   // Connect mailbox mesh south rim to board router
   function List#(t) single(t elem) = List::cons(elem, Nil);
-  List#(Out#(Flit)) botOutList = Nil;
-  for (Integer x = `MailboxMeshXLen-1; x >= 0; x=x-1)
-    botOutList = Cons(routers[0][x].bottomOut, botOutList);
-  reduceConnect(mkFlitMerger, botOutList, single(boardRouter.flitIn[4]));
+  List#(Out#(Flit)) botOutList0 = Nil;
+  List#(Out#(Flit)) botOutList1 = Nil;
+  for (Integer x = `MailboxMeshXLen-1; x >= 0; x=x-2) begin
+    botOutList0 = Cons(routers[0][x].bottomOut, botOutList0);
+    botOutList1 = Cons(routers[0][x-1].bottomOut, botOutList1);
+  end
+  reduceConnect(mkFlitMerger, botOutList0, single(boardRouter.flitIn[4]));
+  reduceConnect(mkFlitMerger, botOutList1, single(boardRouter.flitIn[5]));
 
   // Connect board router to mailbox mesh south rim
   function In#(Flit) getBottomIn(MeshRouter r) = r.bottomIn;
diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index 1ea7a8c6..0cfd6e16 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -179,15 +179,15 @@ typedef struct {
 // NoC edge, but the diagram assumes four.
 
 //
-//               N     S     E     W     L0..L3        Input flits
-//               |     |     |     |     |        
-//             +---+ +---+ +---+ +---+ +---+      
-//             | F | | F | | F | | F | | F |           Fetchers
-//             +---+ +---+ +---+ +---+ +---+      
-//               |     |     |     |     |        
-//             +---------------------------+      
-//             |          Crossbar         |           Routing
-//             +---------------------------+      
+//               N     S     E     W   L0/L1 L2/L3     Input flits
+//               |     |     |     |     |     |  
+//             +---+ +---+ +---+ +---+ +---+ +---+
+//             | F | | F | | F | | F | | F | | F |     Fetchers
+//             +---+ +---+ +---+ +---+ +---+ +---+
+//               |     |     |     |     |     |  
+//             +---------------------------------+      
+//             |          Crossbar               |     Routing
+//             +---------------------------------+      
 //               |     |     |     |              
 //              N/L0  S/L1  E/L2  W/L3                 Output queues
 //               |     |     |     |

From 100b18147044d137454d61f09759f4905cb16bd8 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Sun, 26 Apr 2020 23:02:09 +0100
Subject: [PATCH 38/78] Throttle ProgRouter's RAM access

---
 rtl/Connections.bsv | 65 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 59 insertions(+), 6 deletions(-)

diff --git a/rtl/Connections.bsv b/rtl/Connections.bsv
index 013224c9..2e472c37 100644
--- a/rtl/Connections.bsv
+++ b/rtl/Connections.bsv
@@ -7,6 +7,7 @@ import DRAM        :: *;
 import Queue       :: *;
 import DCache      :: *;
 import DCacheTypes :: *;
+import Util        :: *;
 
 // ============================================================================
 // DCache <-> Core connections
@@ -54,17 +55,54 @@ endmodule
 module connectClientsToOffChipRAM#(
   // Data caches
   Vector#(`DCachesPerDRAM, DCache) caches,
-  // Programmable per-board router, reqs and resps
+  // Reqs and resps from ProgRouter's fetchers
   Vector#(`FetchersPerProgRouter, BOut#(DRAMReq)) routerReqs,
   Vector#(`FetchersPerProgRouter, In#(DRAMResp)) routerResps,
   // Off-chip memory
   OffChipRAM ram) ();
 
-  // Connect requests
+  // Count the number of outstanding fetcher requests
+  // Used to throttle the fetcher requests to avoid starving/blocking
+  // the cache requests
+  Integer throttleCount = 2 ** (`DRAMLogMaxInFlight - 1);
+  Count#(`DRAMLogMaxInFlight) fetcherCount <- mkCount(throttleCount);
+
+  // Merge cache requests
   function getReqOut(cache) = cache.reqOut;
-  let reqs <- mkMergeTreeB(Fair,
-                mkUGShiftQueue1(QueueOptFmax),
-                append(map(getReqOut, caches), routerReqs));
+  Out#(DRAMReq) cacheReqs <-
+    mkMergeTreeB(Fair,
+      mkUGShiftQueue1(QueueOptFmax),
+      map(getReqOut, caches));
+  Queue1#(DRAMReq) cacheReqsQueue <- mkUGShiftQueue1(QueueOptFmax);
+  connectToQueue(cacheReqs, cacheReqsQueue);
+  BOut#(DRAMReq) cacheReqsB = queueToBOut(cacheReqsQueue);
+
+  // Merge router requests
+  Out#(DRAMReq) fetcherReqs <-
+    mkMergeTreeB(Fair,
+      mkUGShiftQueue1(QueueOptFmax),
+      routerReqs);
+  Queue1#(DRAMReq) fetcherReqsQueue <- mkUGShiftQueue1(QueueOptFmax);
+  connectToQueue(fetcherReqs, fetcherReqsQueue);
+  BOut#(DRAMReq) fetcherReqsB = queueToBOut(fetcherReqsQueue);
+
+  // Update count on router request
+  BOut#(DRAMReq) fetcherReqsIncCountB =
+    interface BOut
+      method Action get =
+        action
+          fetcherReqsB.get;
+          fetcherCount.incBy(zeroExtend(fetcherReqsB.value.burst));
+        endaction;
+      method Bool valid = fetcherReqsB.valid && 
+        (fetcherCount.available +
+           zeroExtend(fetcherReqsB.value.burst) <=
+             fromInteger(throttleCount));
+      method DRAMReq value = fetcherReqsB.value;
+    endinterface;
+
+  // Merge cache and router requests, and connect to off-chip RAM
+  let reqs <- mkMergeTwoB(Fair, cacheReqsB, fetcherReqsIncCountB);
   connectUsing(mkUGQueue, reqs, ram.reqIn);
 
   // Connect load responses
@@ -74,7 +112,22 @@ module connectClientsToOffChipRAM#(
                     getRespKey,
                     mkUGShiftQueue2(QueueOptFmax),
                     append(map(getRespIn, caches), routerResps));
-  connectDirect(ram.respOut, ramResps);
+
+  // Update count on respose
+  BOut#(DRAMResp) ramRespOutDecCount =
+    interface BOut
+      method Action get =
+        action
+          ram.respOut.get;
+          if (ram.respOut.value.id >= fromInteger(`DCachesPerDRAM))
+            fetcherCount.dec;
+        endaction;
+      method Bool valid = ram.respOut.valid;
+      method DRAMResp value = ram.respOut.value;
+    endinterface;
+
+  // Connect responses from off-chip RAM
+  connectDirect(ramRespOutDecCount, ramResps);
 
 endmodule
 

From cc785bdcc01bcf6253752467777c94715cfac087 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Sun, 26 Apr 2020 23:07:08 +0100
Subject: [PATCH 39/78] Fix throttle condition

---
 rtl/Connections.bsv | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/rtl/Connections.bsv b/rtl/Connections.bsv
index 2e472c37..cd762f66 100644
--- a/rtl/Connections.bsv
+++ b/rtl/Connections.bsv
@@ -95,9 +95,7 @@ module connectClientsToOffChipRAM#(
           fetcherCount.incBy(zeroExtend(fetcherReqsB.value.burst));
         endaction;
       method Bool valid = fetcherReqsB.valid && 
-        (fetcherCount.available +
-           zeroExtend(fetcherReqsB.value.burst) <=
-             fromInteger(throttleCount));
+        zeroExtend(fetcherReqsB.value.burst) <= fetcherCount.available;
       method DRAMReq value = fetcherReqsB.value;
     endinterface;
 

From e58f70d7aa7cade6ef1fe2b03da3aed1b38948f9 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Sun, 26 Apr 2020 23:16:34 +0100
Subject: [PATCH 40/78] Preserve order in mkMergeTreeB

---
 rtl/Interface.bsv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rtl/Interface.bsv b/rtl/Interface.bsv
index 0484cb41..a7cd0e91 100644
--- a/rtl/Interface.bsv
+++ b/rtl/Interface.bsv
@@ -412,7 +412,7 @@ module mkMergeTreeB#(MergeMethod m, module#(SizedQueue#(d, t)) mkQ,
     xs = List::cons(x, xs);
   end
 
-  let out <- mkMergeTreeList(m, mkQ, xs);
+  let out <- mkMergeTreeList(m, mkQ, List::reverse(xs));
   return out;
 endmodule
 

From 2c76edc82887a2e16a185842789e1de27c699834 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Wed, 29 Apr 2020 15:14:48 +0100
Subject: [PATCH 41/78] Use full-rate queue at root of cache request tree

When adding the ProgRouter to the RAM request tree, the max rate of
requests from caches was halved even though the ProgRouter may not be
busy.  Not desirable, but easy to fix: keep a full-rate queue at the
root of the cache request tree.
---
 rtl/Connections.bsv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rtl/Connections.bsv b/rtl/Connections.bsv
index cd762f66..214c30f7 100644
--- a/rtl/Connections.bsv
+++ b/rtl/Connections.bsv
@@ -73,7 +73,7 @@ module connectClientsToOffChipRAM#(
     mkMergeTreeB(Fair,
       mkUGShiftQueue1(QueueOptFmax),
       map(getReqOut, caches));
-  Queue1#(DRAMReq) cacheReqsQueue <- mkUGShiftQueue1(QueueOptFmax);
+  Queue#(DRAMReq) cacheReqsQueue <- mkUGQueue;
   connectToQueue(cacheReqs, cacheReqsQueue);
   BOut#(DRAMReq) cacheReqsB = queueToBOut(cacheReqsQueue);
 

From 5a4beabfb681913286c3473312823e35134ba2bb Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Thu, 30 Apr 2020 15:43:52 +0100
Subject: [PATCH 42/78] ProgRouter performance counters

Accessible from core zero on each board only.
---
 README.md            |  3 ++
 rtl/Connections.bsv  | 19 ++++++++++++
 rtl/Core.bsv         | 69 +++++++++++++++++++++++++++++++++-----------
 rtl/DE5Top.bsv       |  4 +++
 rtl/IdleDetector.bsv | 27 -----------------
 rtl/Network.bsv      |  7 ++++-
 rtl/ProgRouter.bsv   | 47 +++++++++++++++++++++++++++++-
 rtl/Util.bsv         | 27 +++++++++++++++++
 8 files changed, 157 insertions(+), 46 deletions(-)

diff --git a/README.md b/README.md
index a3340c3f..871c9e5e 100644
--- a/README.md
+++ b/README.md
@@ -23,6 +23,9 @@ New section on programmable routers:
   * Restrictions on IND records
   * Avoiding deadlock: programmer has some added resposibility here
 
+New performance counters accessible from core zero on each board only:
+  * `ProgRouterSent` and `ProgRouterSentInterBoard`
+
 # Tinsel 0.7.1
 
 Tinsel is a [RISC-V](https://riscv.org/)-based manythread
diff --git a/rtl/Connections.bsv b/rtl/Connections.bsv
index 214c30f7..d093001b 100644
--- a/rtl/Connections.bsv
+++ b/rtl/Connections.bsv
@@ -8,6 +8,8 @@ import Queue       :: *;
 import DCache      :: *;
 import DCacheTypes :: *;
 import Util        :: *;
+import ProgRouter  :: *;
+import Core        :: *;
 
 // ============================================================================
 // DCache <-> Core connections
@@ -129,4 +131,21 @@ module connectClientsToOffChipRAM#(
 
 endmodule
 
+// ============================================================================
+// ProgRouter performance counter connections
+// ============================================================================
+
+module connectProgRouterPerfCountersToCores#(
+         ProgRouterPerfCounters counters, Vector#(n, Core) cores) (Empty);
+  rule connect;
+    // Only core zero can access the ProgRouter perf counters
+    cores[0].progRouterPerfClient.incSent(counters.incSent);
+    cores[0].progRouterPerfClient.incSentInterBoard(counters.incSentInterBoard);
+    for (Integer i = 1; i < valueOf(n); i=i+1) begin
+      cores[i].progRouterPerfClient.incSent(?);
+      cores[i].progRouterPerfClient.incSentInterBoard(?);
+    end
+  endrule
+endmodule
+
 endpackage
diff --git a/rtl/Core.bsv b/rtl/Core.bsv
index 1d35d278..4c454c98 100644
--- a/rtl/Core.bsv
+++ b/rtl/Core.bsv
@@ -25,6 +25,7 @@ import FPUOps       :: *;
 import InstrMem     :: *;
 import DCacheTypes  :: *;
 import IdleDetector :: *;
+import ProgRouter   :: *;
 
 // ============================================================================
 // Control/status registers (CSRs) supported
@@ -60,15 +61,17 @@ import IdleDetector :: *;
 // Performance Counter CSRs (Optional)
 // ============================================================================
 
-// Name            | CSR    | R/W | Function
-// --------------- | ------ | --- | --------
-// PerfCount       | 0xc07  | W   | Reset(0)/Start(1)/Stop(2) all counters
-// MissCount       | 0xc08  | R   | Cache miss count
-// HitCount        | 0xc09  | R   | Cache hit count
-// WritebackCount  | 0xc0a  | R   | Cache writeback count
-// CPUIdleCount    | 0xc0b  | R   | CPU idle-cycle count (lower 32 bits)
-// CPUIdleCountU   | 0xc0c  | R   | CPU idle-cycle count (upper 8 bits)
-// CycleU          | 0xc0d  | R   | Cycle counter (upper 8 bits)
+// Name                | CSR    | R/W | Function
+// ------------------- | ------ | --- | --------
+// PerfCount           | 0xc07  | W   | Reset(0)/Start(1)/Stop(2) all counters
+// MissCount           | 0xc08  | R   | Cache miss count
+// HitCount            | 0xc09  | R   | Cache hit count
+// WritebackCount      | 0xc0a  | R   | Cache writeback count
+// CPUIdleCount        | 0xc0b  | R   | CPU idle-cycle count (lower 32 bits)
+// CPUIdleCountU       | 0xc0c  | R   | CPU idle-cycle count (upper 8 bits)
+// CycleU              | 0xc0d  | R   | Cycle counter (upper 8 bits)
+// ProgRouterSent      | 0xc0e  | R   | Msgs sent by ProgRouter
+// ProgRouterSentInter | 0xc0f  | R   | Inter-board msgs sent by ProgRouter
 
 // ============================================================================
 // Types
@@ -505,12 +508,13 @@ endfunction
 // ============================================================================
 
 interface Core;
-  interface DCacheClient       dcacheClient;
-  interface MailboxClient      mailboxClient;
-  interface DebugLinkClient    debugLinkClient;
-  interface FPUClient          fpuClient;
-  interface InstrMemClient     instrMemClient;
-  interface IdleDetectorClient idleClient;
+  interface DCacheClient         dcacheClient;
+  interface MailboxClient        mailboxClient;
+  interface DebugLinkClient      debugLinkClient;
+  interface FPUClient            fpuClient;
+  interface InstrMemClient       instrMemClient;
+  interface IdleDetectorClient   idleClient;
+  interface ProgRouterPerfClient progRouterPerfClient;
 
   // Each core can see its board id
   (* always_ready, always_enabled *)
@@ -676,18 +680,27 @@ module mkCore#(CoreId myId) (Core);
   Reg#(Bit#(32)) hitCount       <- mkConfigReg(0);
   Reg#(Bit#(32)) writebackCount <- mkConfigReg(0);
   Reg#(Bit#(40)) cpuIdleCount   <- mkConfigReg(0);
+  // Only core zero maintains the following two counters
+  Reg#(Bit#(32)) progRouterSent <- mkConfigReg(0);
+  Reg#(Bit#(32)) progRouterSentInterBoard <- mkConfigReg(0);
 
   // Indexable vector of performance counters
-  Vector#(6, Bit#(32)) perfCounters =
+  Vector#(8, Bit#(32)) perfCounters =
     vector(missCount, hitCount, writebackCount, cpuIdleCount[31:0],
              zeroExtend(cpuIdleCount[39:32]),
-             zeroExtend(cycleCount[39:32]));
+             zeroExtend(cycleCount[39:32]),
+             myId == 0 ? progRouterSent : ?,
+             myId == 0 ? progRouterSentInterBoard : ?);
 
   // Increment wires
   Wire#(Bool) incMissCountWire      <- mkDWire(False);
   Wire#(Bool) incHitCountWire       <- mkDWire(False);
   Wire#(Bool) incWritebackCountWire <- mkDWire(False);
   Wire#(Bool) incCPUIdleCountWire   <- mkDWire(False);
+  Wire#(Bit#(LogFetchersPerProgRouter))
+    incProgRouterSent <- mkBypassWire;
+  Wire#(Bit#(LogFetchersPerProgRouter))
+    incProgRouterSentInterBoard <- mkBypassWire;
 
   // Update performance counters
   rule updatePerfCounters;
@@ -696,11 +709,20 @@ module mkCore#(CoreId myId) (Core);
       hitCount       <= 0;
       writebackCount <= 0;
       cpuIdleCount   <= 0;
+      if (myId == 0) begin
+        progRouterSent <= 0;
+        progRouterSentInterBoard <= 0;
+      end
     end else if (perfCountEnabled) begin
       if (incMissCountWire) missCount <= missCount+1;
       if (incHitCountWire) hitCount <= hitCount+1;
       if (incWritebackCountWire) writebackCount <= writebackCount+1;
       if (incCPUIdleCountWire) cpuIdleCount <= cpuIdleCount+1;
+      if (myId == 0) begin
+        progRouterSent <= progRouterSent + zeroExtend(incProgRouterSent);
+        progRouterSentInterBoard <= progRouterSentInterBoard +
+          zeroExtend(incProgRouterSentInterBoard);
+      end
     end
   endrule
   `endif
@@ -1321,6 +1343,19 @@ module mkCore#(CoreId myId) (Core);
     method Bool idleStage1Ack = mailbox.idleStage1Ack;
   endinterface
 
+  interface ProgRouterPerfClient progRouterPerfClient;
+    method Action incSent(Bit#(LogFetchersPerProgRouter) amount);
+      `ifdef EnablePerfCount
+        incProgRouterSent <= amount;
+      `endif
+    endmethod
+    method Action incSentInterBoard(Bit#(LogFetchersPerProgRouter) amount);
+      `ifdef EnablePerfCount
+        incProgRouterSentInterBoard <= amount;
+      `endif
+    endmethod
+  endinterface
+
 endmodule
 
 endpackage
diff --git a/rtl/DE5Top.bsv b/rtl/DE5Top.bsv
index 5c353542..b522284d 100644
--- a/rtl/DE5Top.bsv
+++ b/rtl/DE5Top.bsv
@@ -178,6 +178,10 @@ module de5Top (DE5Top);
     connectClientsToOffChipRAM(dcaches[i],
       noc.dramReqs[i], noc.dramResps[i], rams[i]);
 
+  // Connects ProgRouter performance counters to cores
+  connectProgRouterPerfCountersToCores(noc.progRouterPerfCounters,
+    concat(concat(cores)));
+
   // Set board ids
   rule setBoardIds;
     for (Integer i = 0; i < `DRAMsPerBoard; i=i+1)
diff --git a/rtl/IdleDetector.bsv b/rtl/IdleDetector.bsv
index 179a9f41..59e4b530 100644
--- a/rtl/IdleDetector.bsv
+++ b/rtl/IdleDetector.bsv
@@ -304,33 +304,6 @@ module mkIdleDetector (IdleDetector);
 
 endmodule
 
-// Pipelined reduction tree
-module mkPipelinedReductionTree#(
-         function a reduce(a x, a y),
-         a init,
-         List#(a) xs)
-       (a) provisos(Bits#(a, _));
-  Integer len = List::length(xs);
-  if (len == 0)
-    return error("mkSumList applied to empty list");
-  else if (len == 1)
-    return xs[0];
-  else begin
-    List#(a) ys = xs;
-    List#(a) reduced = Nil;
-    for (Integer i = 0; i < len; i=i+2) begin
-      Reg#(a) r <- mkConfigReg(init);
-      rule assignOut;
-        r <= reduce(ys[0], ys[1]);
-      endrule
-      ys = List::drop(2, ys);
-      reduced = Cons(readReg(r), reduced);
-    end
-    a res <- mkPipelinedReductionTree(reduce, init, reduced);
-    return res;
-  end
-endmodule
-
 interface IdleDetectorClient;
   method Bit#(1) incSent;
   method Bit#(1) incReceived;
diff --git a/rtl/Network.bsv b/rtl/Network.bsv
index baf36d63..136d327c 100644
--- a/rtl/Network.bsv
+++ b/rtl/Network.bsv
@@ -289,8 +289,9 @@ interface NoC;
     Vector#(`FetchersPerProgRouter, BOut#(DRAMReq))) dramReqs;
   interface Vector#(`DRAMsPerBoard,
     Vector#(`FetchersPerProgRouter, In#(DRAMResp))) dramResps;
-  // ProgRouter fetcher activities
+  // ProgRouter fetcher activities & performance counters
   interface Vector#(`FetchersPerProgRouter, FetcherActivity) activities;
+  interface ProgRouterPerfCounters progRouterPerfCounters;
 endinterface
 
 module mkNoC#(
@@ -468,6 +469,10 @@ module mkNoC#(
   // Fetcher activities
   interface activities = boardRouter.activities;
 
+  // Performance counters
+  interface ProgRouterPerfCounters progRouterPerfCounters =
+    boardRouter.perfCounters;
+
 endmodule
 
 endpackage
diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index 0cfd6e16..2ab0a0d6 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -277,8 +277,13 @@ endinterface
 // Fetcher activity for performance counters and termination detection
 (* always_ready *)
 interface FetcherActivity;
+  // Increment number of sent messages
   method Bit#(1) incSent;
+  // Increment number of messages sent to another board
+  method Bit#(1) incSentInterBoard;
+  // Increment number of received messages
   method Bit#(1) incReceived;
+  // Active (in the termination-detection sense)?
   method Bool active;
 endinterface
 
@@ -328,6 +333,7 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
 
   // Activity
   Reg#(Bit#(1)) incSentReg <- mkDReg(0);
+  Reg#(Bit#(1)) incSentInterBoardReg <- mkDReg(0);
   Reg#(Bit#(1)) incReceivedReg <- mkDReg(0);
 
   // Stage 1: consume input message
@@ -662,6 +668,7 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
           end
         end
         incSentReg <= 1;
+        if (tag == RR) incSentInterBoardReg <= 1;
         newFlitCount = 0;
       end
     end
@@ -707,6 +714,7 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
 
   interface FetcherActivity activity;
     method Bit#(1) incSent = incSentReg;
+    method Bit#(1) incSentInterBoard = incSentInterBoardReg;
     method Bit#(1) incReceived = incReceivedReg;
     method Bool active =
       beatBufferLen.value != 0 || consumeState != 0;
@@ -836,6 +844,16 @@ endmodule
 // Programmable router
 // =============================================================================
 
+// Enough bits to store a count of the number of fetchers
+typedef TLog#(TAdd#(`FetchersPerProgRouter, 1)) LogFetchersPerProgRouter;
+
+// ProgRouter's performance counters
+(* always_ready, always_enabled *)
+interface ProgRouterPerfCounters;
+  method Bit#(LogFetchersPerProgRouter) incSent;
+  method Bit#(LogFetchersPerProgRouter) incSentInterBoard;
+endinterface
+
 interface ProgRouter;
   // Incoming and outgoing flits
   interface Vector#(`FetchersPerProgRouter, In#(Flit)) flitIn;
@@ -848,8 +866,9 @@ interface ProgRouter;
   interface Vector#(`DRAMsPerBoard,
     Vector#(`FetchersPerProgRouter, In#(DRAMResp))) ramResps;
 
-  // Activities
+  // Activities & performance counters
   interface Vector#(`FetchersPerProgRouter, FetcherActivity) activities;
+  interface ProgRouterPerfCounters perfCounters;
 endinterface
 
 module mkProgRouter#(BoardId boardId) (ProgRouter);
@@ -909,6 +928,21 @@ module mkProgRouter#(BoardId boardId) (ProgRouter);
       ramRespIfc[i][j] = fetchers[j].ramResps[i];
     end
 
+  // Performance counters
+  Vector#(`FetchersPerProgRouter,
+    Bit#(LogFetchersPerProgRouter)) incSents;
+  Vector#(`FetchersPerProgRouter,
+    Bit#(LogFetchersPerProgRouter)) incSentsInterBoard;
+  for (Integer i = 0; i < `FetchersPerProgRouter; i=i+1) begin
+    incSents[i] = zeroExtend(fetchers[i].activity.incSent);
+    incSentsInterBoard[i] =
+      zeroExtend(fetchers[i].activity.incSentInterBoard);
+  end
+  Bit#(LogFetchersPerProgRouter) numSent <-
+    mkPipelinedReductionTree( \+ , 0, toList(incSents));
+  Bit#(LogFetchersPerProgRouter) numSentInterBoard <-
+    mkPipelinedReductionTree( \+ , 0, toList(incSentsInterBoard));
+
   function FetcherActivity getActivity(Fetcher f) = f.activity;
   interface flitIn = flitInIfc;
   interface flitOut = flitOutIfc;
@@ -916,7 +950,18 @@ module mkProgRouter#(BoardId boardId) (ProgRouter);
   interface ramReqs = ramReqIfc;
   interface ramResps = ramRespIfc;
   interface activities = map(getActivity, fetchers);
+  interface ProgRouterPerfCounters perfCounters;
+    method incSent = numSent;
+    method incSentInterBoard = numSentInterBoard;
+  endinterface
 
 endmodule
 
+// For core(s) to access ProgRouter's performance counters
+(* always_ready, always_enabled *)
+interface ProgRouterPerfClient;
+  method Action incSent(Bit#(LogFetchersPerProgRouter) amount);
+  method Action incSentInterBoard(Bit#(LogFetchersPerProgRouter) amount);
+endinterface
+
 endpackage
diff --git a/rtl/Util.bsv b/rtl/Util.bsv
index 507d1ef2..f45ece48 100644
--- a/rtl/Util.bsv
+++ b/rtl/Util.bsv
@@ -274,4 +274,31 @@ function Tuple2#(Bit#(n), Bit#(n)) sched(Bit#(n) hist, Bit#(n) avail);
   end
 endfunction
 
+// Pipelined reduction tree: one register stage per level; expects a power-of-two number of inputs
+module mkPipelinedReductionTree#(
+         function a reduce(a x, a y),  // binary operator applied to each pair
+         a init,                       // reset value of every pipeline register
+         List#(a) xs)                  // values to be combined
+       (a) provisos(Bits#(a, _));
+  Integer len = List::length(xs);
+  if (len == 0)
+    return error("mkPipelinedReductionTree applied to empty list");
+  else if (len == 1)
+    return xs[0];  // a single value needs no further reduction
+  else begin
+    List#(a) ys = xs;
+    List#(a) reduced = Nil;
+    for (Integer i = 0; i < len; i=i+2) begin  // consume the inputs in pairs
+      Reg#(a) r <- mkConfigReg(init);
+      rule assignOut;
+        r <= reduce(ys[0], ys[1]);  // needs two elements left, so len must be even at every level
+      endrule
+      ys = List::drop(2, ys);
+      reduced = Cons(readReg(r), reduced);  // next level is built in reverse order
+    end
+    a res <- mkPipelinedReductionTree(reduce, init, reduced);  // recurse on the halved list
+    return res;
+  end
+endmodule
+
 endpackage

From 1c8e863fe4f7159974eee8377988d2185c6c786b Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Thu, 30 Apr 2020 18:12:16 +0100
Subject: [PATCH 43/78] Satisfy limitations of mkPipelinedReductionTree

---
 rtl/ProgRouter.bsv | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index 2ab0a0d6..ae361ed1 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -929,10 +929,10 @@ module mkProgRouter#(BoardId boardId) (ProgRouter);
     end
 
   // Performance counters
-  Vector#(`FetchersPerProgRouter,
-    Bit#(LogFetchersPerProgRouter)) incSents;
-  Vector#(`FetchersPerProgRouter,
-    Bit#(LogFetchersPerProgRouter)) incSentsInterBoard;
+  Vector#(TExp#(TLog#(`FetchersPerProgRouter)),
+    Bit#(LogFetchersPerProgRouter)) incSents = replicate(0);
+  Vector#(TExp#(TLog#(`FetchersPerProgRouter)),
+    Bit#(LogFetchersPerProgRouter)) incSentsInterBoard = replicate(0);
   for (Integer i = 0; i < `FetchersPerProgRouter; i=i+1) begin
     incSents[i] = zeroExtend(fetchers[i].activity.incSent);
     incSentsInterBoard[i] =

From 324e03a14973fb69837ed7b7d71e2bc6f256ba1b Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Fri, 1 May 2020 09:40:52 +0100
Subject: [PATCH 44/78] Tinsel API for ProgRouter perf counters

---
 README.md        | 10 ++++++++++
 include/tinsel.h | 34 ++++++++++++++++++++++++++--------
 2 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 871c9e5e..c6101804 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,16 @@ New section on programmable routers:
 New performance counters accessible from core zero on each board only:
   * `ProgRouterSent` and `ProgRouterSentInterBoard`
 
+Document the following:
+
+```c++
+// Performance counter: number of messages emitted by ProgRouter
+INLINE uint32_t tinselProgRouterSent();
+
+// Performance counter: number of inter-board messages emitted by ProgRouter
+INLINE uint32_t tinselProgRouterSentInterBoard();
+```
+
 # Tinsel 0.7.1
 
 Tinsel is a [RISC-V](https://riscv.org/)-based manythread
diff --git a/include/tinsel.h b/include/tinsel.h
index ec26b849..d06e2bfd 100644
--- a/include/tinsel.h
+++ b/include/tinsel.h
@@ -28,13 +28,15 @@
 #define CSR_FLUSH       "0xc01"
 
 // Performance counter CSRs
-#define CSR_PERFCOUNT     "0xc07"
-#define CSR_MISSCOUNT     "0xc08"
-#define CSR_HITCOUNT      "0xc09"
-#define CSR_WBCOUNT       "0xc0a"
-#define CSR_CPUIDLECOUNT  "0xc0b"
-#define CSR_CPUIDLECOUNTU "0xc0c"
-#define CSR_CYCLEU        "0xc0d"
+#define CSR_PERFCOUNT           "0xc07"
+#define CSR_MISSCOUNT           "0xc08"
+#define CSR_HITCOUNT            "0xc09"
+#define CSR_WBCOUNT             "0xc0a"
+#define CSR_CPUIDLECOUNT        "0xc0b"
+#define CSR_CPUIDLECOUNTU       "0xc0c"
+#define CSR_CYCLEU              "0xc0d"
+#define CSR_PROGROUTERSENT      "0xc0e"
+#define CSR_PROGROUTERSENTINTER "0xc0f"
 
 // Get globally unique thread id of caller
 INLINE uint32_t tinselId()
@@ -280,7 +282,7 @@ INLINE uint32_t tinselWritebackCount()
   return n;
 }
 
-// Performance counter:: get the CPU-idle count
+// Performance counter: get the CPU-idle count
 INLINE uint32_t tinselCPUIdleCount()
 {
   uint32_t n;
@@ -304,6 +306,22 @@ INLINE uint32_t tinselCycleCountU()
   return n;
 }
 
+// Performance counter: number of messages emitted by ProgRouter (CSR 0xc0e; only core zero of each board maintains it)
+INLINE uint32_t tinselProgRouterSent()
+{
+  uint32_t n;
+  asm volatile ("csrrw %0, " CSR_PROGROUTERSENT ", zero" : "=r"(n));  // value read from the CSR lands in n
+  return n;
+}
+
+// Performance counter: number of inter-board messages emitted by ProgRouter (CSR 0xc0f; only core zero of each board maintains it)
+INLINE uint32_t tinselProgRouterSentInterBoard()
+{
+  uint32_t n;
+  asm volatile ("csrrw %0, " CSR_PROGROUTERSENTINTER ", zero" : "=r"(n));  // value read from the CSR lands in n
+  return n;
+}
+
 // Get address of any specified host
 // (This Y coordinate specifies the row of the FPGA mesh that the
 // host is connected to, and the X coordinate specifies whether it is

From 65dc133cd7aa7db373f93d884b772bb0f20eb6d4 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Fri, 1 May 2020 10:14:07 +0100
Subject: [PATCH 45/78] Update POLite perf counter script

(Untested)
---
 apps/POLite/util/sumstats.awk | 37 ++++++++++++++++++++++-------------
 include/POLite/PDevice.h      | 21 +++++++++++---------
 2 files changed, 35 insertions(+), 23 deletions(-)

diff --git a/apps/POLite/util/sumstats.awk b/apps/POLite/util/sumstats.awk
index f1f70329..f29e32ea 100755
--- a/apps/POLite/util/sumstats.awk
+++ b/apps/POLite/util/sumstats.awk
@@ -10,10 +10,11 @@ BEGIN {
   cacheCount = 0;
   coreCount = 0;
   cacheLineSize = 32;
-  intraThreadSendCount = 0;
-  interThreadSendCount = 0;
-  interBoardSendCount = 0;
-  fmax = 250000000;
+  msgsReceived = 0;
+  msgsSent = 0;
+  progRouterSent = 0;
+  progRouterSentInter = 0;
+  fmax = 220000000;
   if (boardsX == "" || boardsY == "") {
     boardsX = 3;
     boardsY = 2;
@@ -48,13 +49,15 @@ BEGIN {
         coreCount = coreCount+1;
       }
       # Per-thread message counts
-      else if (match($0, /(.*) LS:(.*),TS:(.*),BS:(.*)/, fields)) {
-        ls=strtonum("0x"fields[2]);
-        ts=strtonum("0x"fields[3]);
-        bs=strtonum("0x"fields[4]);
-        intraThreadSendCount = intraThreadSendCount+ls;
-        interThreadSendCount = interThreadSendCount+ts;
-        interBoardSendCount = interBoardSendCount+bs;
+      else if (match($0, /(.*) MS:(.*),MR:(.*),PR:(.*),PRI:(.*)/, fields)) {
+        ms=strtonum("0x"fields[2]);
+        mr=strtonum("0x"fields[3]);
+        pr=strtonum("0x"fields[4]);
+        pri=strtonum("0x"fields[5]);
+        msgsSent = msgsSent + mr;
+        msgsReceived = msgsReceived + mr;
+        progRouterSent = progRouterSent + pr;
+        progRouterSentInter = progRouterSentInter + pri;
       }
     }
   }
@@ -70,7 +73,13 @@ END {
   bytes = cacheLineSize * (missCount + writebackCount)
   print "Off-chip memory (GBytes/s): ", ((1/time) * bytes)/1000000000
   print "CPU util (%): ", (1-(cpuIdleCount/cycleCount))*100
-  print "Intra-thread messages: ", intraThreadSendCount
-  print "Inter-thread messages: ", interThreadSendCount
-  print "Inter-board messages: ", interBoardSendCount
+  print "Msgs received: ", msgsReceived
+  print "Msgs sent by threads: ", msgsSent
+  print "Msgs injected by ProgRouter:", progRouterSent
+  print "Inter-board msgs:", progRouterSentInter
+  print ""
+  print "Notes:"
+  print "  * ProgRouter injections includes inter-board msgs"
+  print "  * Memory bandwidth does not include lookups by ProgRouter"
+  print "  * If runtime > 40s approx, hit/miss counts may overflow"
 }
diff --git a/include/POLite/PDevice.h b/include/POLite/PDevice.h
index b5f99340..41de7ac9 100644
--- a/include/POLite/PDevice.h
+++ b/include/POLite/PDevice.h
@@ -147,11 +147,9 @@ template <typename DeviceType,
   // Count number of messages sent
   #ifdef POLITE_COUNT_MSGS
   // Total messages sent
-  uint32_t intraThreadSendCount;
-  // Total messages sent between threads
-  uint32_t interThreadSendCount;
-  // Messages sent between threads on different boards
-  uint32_t interBoardSendCount;
+  uint32_t msgsSent;
+  // Total messages received
+  uint32_t msgsReceived;
   #endif
 
   #ifdef TINSEL
@@ -188,8 +186,13 @@ template <typename DeviceType,
     }
     // Per-thread performance counters
     #ifdef POLITE_COUNT_MSGS
-    printf("LS:%x,TS:%x,BS:%x\n", intraThreadSendCount,
-             interThreadSendCount, interBoardSendCount);
+    uint32_t intraBoardId = me & ((1<<TinselLogThreadsPerBoard) - 1);
+    uint32_t progRouterSent =
+      intraBoardId == 0 ? tinselProgRouterSent() : 0;
+    uint32_t progRouterSentInter =
+      intraBoardId == 0 ? tinselProgRouterSentInterBoard() : 0;
+    printf("MS:%x,MR:%x,PR:%x,PRI:%x\n",
+      msgsSent, msgsReceived, progRouterSent, progRouterSentInter);
     #endif
   }
 
@@ -237,7 +240,7 @@ template <typename DeviceType,
           else
             tinselKeySend(devices[src].pin[pin-2], m);
           #ifdef POLITE_COUNT_MSGS
-          interThreadSendCount++;
+          msgsSent++;
           #endif
         }
         else
@@ -280,7 +283,7 @@ template <typename DeviceType,
             *(sendersTop++) = id;
           inEdge++;
           #ifdef POLITE_COUNT_MSGS
-          intraThreadSendCount++;
+          msgsReceived++;
           #endif
         }
         tinselFree(inMsg);

From b36a564bf3d23415cef8dcceeea44cac19cf0189 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Fri, 1 May 2020 11:02:19 +0100
Subject: [PATCH 46/78] Typo

---
 apps/POLite/util/sumstats.awk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/POLite/util/sumstats.awk b/apps/POLite/util/sumstats.awk
index f29e32ea..4a79a2f9 100755
--- a/apps/POLite/util/sumstats.awk
+++ b/apps/POLite/util/sumstats.awk
@@ -54,7 +54,7 @@ BEGIN {
         mr=strtonum("0x"fields[3]);
         pr=strtonum("0x"fields[4]);
         pri=strtonum("0x"fields[5]);
-        msgsSent = msgsSent + mr;
+        msgsSent = msgsSent + ms;
         msgsReceived = msgsReceived + mr;
         progRouterSent = progRouterSent + pr;
         progRouterSentInter = progRouterSentInter + pri;

From d6f29800e942796da90401ebb5abcd258881175f Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mn416@jennings.cl.cam.ac.uk>
Date: Mon, 4 May 2020 08:53:26 +0000
Subject: [PATCH 47/78] Count blocked sends in POLite softswitch

---
 apps/POLite/util/sumstats.awk |  7 ++++++-
 include/POLite/PDevice.h      | 17 ++++++++++++-----
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/apps/POLite/util/sumstats.awk b/apps/POLite/util/sumstats.awk
index 4a79a2f9..9ccfac8c 100755
--- a/apps/POLite/util/sumstats.awk
+++ b/apps/POLite/util/sumstats.awk
@@ -14,6 +14,7 @@ BEGIN {
   msgsSent = 0;
   progRouterSent = 0;
   progRouterSentInter = 0;
+  blockedSends = 0;
   fmax = 220000000;
   if (boardsX == "" || boardsY == "") {
     boardsX = 3;
@@ -49,15 +50,18 @@ BEGIN {
         coreCount = coreCount+1;
       }
       # Per-thread message counts
-      else if (match($0, /(.*) MS:(.*),MR:(.*),PR:(.*),PRI:(.*)/, fields)) {
+      else if (match($0, /(.*) MS:(.*),MR:(.*),PR:(.*),PRI:(.*),BL:(.*)/,
+                 fields)) {
         ms=strtonum("0x"fields[2]);
         mr=strtonum("0x"fields[3]);
         pr=strtonum("0x"fields[4]);
         pri=strtonum("0x"fields[5]);
+        bl=strtonum("0x"fields[6]);
         msgsSent = msgsSent + ms;
         msgsReceived = msgsReceived + mr;
         progRouterSent = progRouterSent + pr;
         progRouterSentInter = progRouterSentInter + pri;
+        blockedSends = blockedSends + bl;
       }
     }
   }
@@ -77,6 +81,7 @@ END {
   print "Msgs sent by threads: ", msgsSent
   print "Msgs injected by ProgRouter:", progRouterSent
   print "Inter-board msgs:", progRouterSentInter
+  print "Blocked sends:", blockedSends
   print ""
   print "Notes:"
   print "  * ProgRouter injections includes inter-board msgs"
diff --git a/include/POLite/PDevice.h b/include/POLite/PDevice.h
index 41de7ac9..d46bd7b4 100644
--- a/include/POLite/PDevice.h
+++ b/include/POLite/PDevice.h
@@ -150,6 +150,8 @@ template <typename DeviceType,
   uint32_t msgsSent;
   // Total messages received
   uint32_t msgsReceived;
+  // Number of times we wanted to send but couldn't
+  uint32_t blockedSends;
   #endif
 
   #ifdef TINSEL
@@ -191,8 +193,9 @@ template <typename DeviceType,
       intraBoardId == 0 ? tinselProgRouterSent() : 0;
     uint32_t progRouterSentInter =
       intraBoardId == 0 ? tinselProgRouterSentInterBoard() : 0;
-    printf("MS:%x,MR:%x,PR:%x,PRI:%x\n",
-      msgsSent, msgsReceived, progRouterSent, progRouterSentInter);
+    printf("MS:%x,MR:%x,PR:%x,PRI:%x,BL:%x\n",
+      msgsSent, msgsReceived, progRouterSent,
+        progRouterSentInter, blockedSends);
     #endif
   }
 
@@ -240,11 +243,15 @@ template <typename DeviceType,
           else
             tinselKeySend(devices[src].pin[pin-2], m);
           #ifdef POLITE_COUNT_MSGS
-          msgsSent++;
+            msgsSent++;
           #endif
         }
-        else
+        else {
+          #ifdef POLITE_COUNT_MSGS
+            blockedSends++;
+          #endif
           tinselWaitUntil(TINSEL_CAN_SEND|TINSEL_CAN_RECV);
+        }
       }
       else {
         // Idle detection
@@ -283,7 +290,7 @@ template <typename DeviceType,
             *(sendersTop++) = id;
           inEdge++;
           #ifdef POLITE_COUNT_MSGS
-          msgsReceived++;
+            msgsReceived++;
           #endif
         }
         tinselFree(inMsg);

From 20b0a74ae08bdba1264cde7d16e50d44a439d3c3 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mn416@jennings.cl.cam.ac.uk>
Date: Mon, 4 May 2020 14:47:28 +0000
Subject: [PATCH 48/78] Control chattiness from environment var

---
 include/POLite/PGraph.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/POLite/PGraph.h b/include/POLite/PGraph.h
index b6a1245a..20cc4b5b 100644
--- a/include/POLite/PGraph.h
+++ b/include/POLite/PGraph.h
@@ -98,6 +98,10 @@ template <typename DeviceType,
     inTable = NULL;
     routingTables = NULL;
     chatty = 0;
+    str = getenv("POLITE_CHATTY");
+    if (str != NULL) {
+      chatty = !strcmp(str, "0") ? 0 : 1;
+    }
   }
 
  public:

From 88b0034fe03b3c02843325c4730f19f34e4e9475 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Tue, 5 May 2020 16:14:34 +0100
Subject: [PATCH 49/78] URM1 record support in POLite

---
 include/POLite/ProgRouters.h | 60 +++++++++++++++++++++++++++++++-----
 1 file changed, 53 insertions(+), 7 deletions(-)

diff --git a/include/POLite/ProgRouters.h b/include/POLite/ProgRouters.h
index 90083802..a1c5942d 100644
--- a/include/POLite/ProgRouters.h
+++ b/include/POLite/ProgRouters.h
@@ -182,22 +182,58 @@ class ProgRouter {
     numChunks++;
     numRecords++;
   }
+
+  // Add a URM1 record to the table: a 32-bit key, a thread id and mailbox coords packed into 6 bytes
+  void addURM1(uint32_t mboxX, uint32_t mboxY,
+                 uint32_t threadId, uint32_t key) {
+    if (numChunks == 5) nextBeat();  // presumably the current beat is full after 5 chunks -- TODO confirm
+    uint8_t* ptr = currentRecord48();
+    ptr[0] = key;        // bytes 0-3: key, least-significant byte first
+    ptr[1] = key >> 8;
+    ptr[2] = key >> 16;
+    ptr[3] = key >> 24;
+    ptr[4] = (threadId << 3);  // low 5 bits of threadId in bits 7:3; bits 2:0 are left zero
+    ptr[5] = (mboxY << 3) | (mboxX << 1) | (threadId >> 5);  // assumes threadId < 64 and mboxX < 4 so fields don't overlap -- TODO confirm
+    numChunks++;
+    numRecords++;
+  }
 };
 
 // ==================================
 // Data type for routing destinations
 // ==================================
 
-struct PRoutingDest {
-  // Destination mailbox
-  uint32_t mbox;
-  // Thread-level routing key
+enum PRoutingDestKind { PRDestKindURM1, PRDestKindMRM };
+
+// URM1 routing destination
+struct PRoutingDestURM1 {
+  // Mailbox-local thread
+  uint16_t threadId;
+  // Thread-local routing key
+  uint32_t key;
+};
+
+// MRM routing destination
+struct PRoutingDestMRM {
+  // Thread-local routing key
   uint16_t key;
   // Destination threads
   uint32_t threadMaskLow;
   uint32_t threadMaskHigh;
 };
 
+// Routing destination
+struct PRoutingDest {
+  PRoutingDestKind kind;
+  // Destination mailbox
+  uint32_t mbox;
+  // URM1 or MRM destination
+  union {
+    PRoutingDestURM1 urm1;
+    PRoutingDestMRM mrm;
+  };
+};
+
 // Extract board X coord from routing dest
 inline uint32_t destX(uint32_t mbox) {
   uint32_t x = mbox >> (TinselMailboxMeshXBits + TinselMailboxMeshYBits);
@@ -288,9 +324,19 @@ class ProgRouterMesh {
     // Add local records
     for (int i = 0; i < local.numElems; i++) {
       PRoutingDest dest = local.elems[i];
-      table[senderY][senderX].addMRM(destMboxX(dest.mbox),
-        destMboxY(dest.mbox), dest.threadMaskHigh,
-        dest.threadMaskLow, dest.key);
+      if (dest.kind == PRDestKindMRM) {
+        table[senderY][senderX].addMRM(destMboxX(dest.mbox),
+          destMboxY(dest.mbox), dest.mrm.threadMaskHigh,
+          dest.mrm.threadMaskLow, dest.mrm.key);
+      }
+      else if (dest.kind == PRDestKindURM1) {
+        table[senderY][senderX].addURM1(destMboxX(dest.mbox),
+          destMboxY(dest.mbox), dest.urm1.threadId, dest.urm1.key);
+      }
+      else {
+        fprintf(stderr, "ProgRouters.h: unknown routing record kind\n");
+        exit(EXIT_FAILURE);
+      }
     }
 
     return table[senderY][senderX].genKey();

From 2ca458933a9c2f0e0dbc7d484d0427a5b1c7b8b6 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Tue, 5 May 2020 17:05:49 +0100
Subject: [PATCH 50/78] This should have been in previous commit

---
 include/POLite/PGraph.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/POLite/PGraph.h b/include/POLite/PGraph.h
index 20cc4b5b..57b3172e 100644
--- a/include/POLite/PGraph.h
+++ b/include/POLite/PGraph.h
@@ -512,10 +512,11 @@ template <typename DeviceType,
           uint32_t key = addInTableEntries(&groups);
           // Add output table entry
           PRoutingDest edge;
+          edge.kind = PRDestKindMRM;
           edge.mbox = mbox;
-          edge.key = key;
-          edge.threadMaskLow = threadMaskLow;
-          edge.threadMaskHigh = threadMaskHigh;
+          edge.mrm.key = key;
+          edge.mrm.threadMaskLow = threadMaskLow;
+          edge.mrm.threadMaskHigh = threadMaskHigh;
           outTable[d][p]->append(edge);
           // Prepare for new output table entry
           dests.numElems = destsRemaining;

From ba1e8b49978b63794286e85902708a0f5092902c Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Tue, 5 May 2020 17:06:01 +0100
Subject: [PATCH 51/78] First attempt at fast mapper

Just exploring... there's probably a better way to do this with less
duplicated code.
---
 include/POLite.h                 |  15 +-
 include/POLite/FastMap/PDevice.h | 302 +++++++++++++
 include/POLite/FastMap/PGraph.h  | 710 +++++++++++++++++++++++++++++++
 3 files changed, 1024 insertions(+), 3 deletions(-)
 create mode 100644 include/POLite/FastMap/PDevice.h
 create mode 100644 include/POLite/FastMap/PGraph.h

diff --git a/include/POLite.h b/include/POLite.h
index d12a0e73..858b865e 100644
--- a/include/POLite.h
+++ b/include/POLite.h
@@ -6,13 +6,22 @@
 
 #ifdef TINSEL
   #include <tinsel.h>
-  #include <POLite/PDevice.h>
+  #ifdef POLITE_FAST_MAP
+    #include <POLite/FastMap/PDevice.h>
+  #else
+    #include <POLite/PDevice.h>
+  #endif
 #else
-  #include <POLite/PDevice.h>
+  #ifdef POLITE_FAST_MAP
+    #include <POLite/FastMap/PDevice.h>
+    #include <POLite/FastMap/PGraph.h>
+  #else
+    #include <POLite/PDevice.h>
+    #include <POLite/PGraph.h>
+  #endif
   #include <POLite/Seq.h>
   #include <POLite/Graph.h>
   #include <POLite/Placer.h>
-  #include <POLite/PGraph.h>
 #endif
 
 #endif
diff --git a/include/POLite/FastMap/PDevice.h b/include/POLite/FastMap/PDevice.h
new file mode 100644
index 00000000..f095eba6
--- /dev/null
+++ b/include/POLite/FastMap/PDevice.h
@@ -0,0 +1,302 @@
+// SPDX-License-Identifier: BSD-2-Clause
+#ifndef _PDEVICE_H_
+#define _PDEVICE_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <type_traits>
+
+#ifdef TINSEL
+  #include <tinsel.h>
+  #define PTR(t) t*
+#else
+  #include <tinsel-interface.h>
+  #define PTR(t) uint32_t
+#endif
+
+// Use this to align on half-cache-line boundary
+#define ALIGNED __attribute__((aligned(1<<(TinselLogBytesPerLine-1))))
+
+// This is a static limit on the number of pins per device
+#ifndef POLITE_NUM_PINS
+#define POLITE_NUM_PINS 1
+#endif
+
+// Macros for performance stats
+//   POLITE_DUMP_STATS - dump performance stats on termination
+//   POLITE_COUNT_MSGS - include message counts of performance stats
+
+// Thread-local device id
+typedef uint16_t PLocalDeviceId;
+
+// Thread id
+typedef uint32_t PThreadId;
+
+// Device address
+// Bits 17->0: thread id
+// Bit 18: invalid address
+// Bits 31->19: thread-local device id
+typedef uint32_t PDeviceAddr;
+
+// Device address constructors
+inline PDeviceAddr invalidDeviceAddr() { return 0x40000; }
+inline PDeviceAddr makeDeviceAddr(PThreadId t, PLocalDeviceId d) {
+  return ((PDeviceAddr) d << 19) | t; // widen first: int shift can overflow
+}
+
+// Device address deconstructors
+inline bool isValidDeviceAddr(PDeviceAddr addr) { return !(addr & 0x40000); }
+inline PThreadId getThreadId(PDeviceAddr addr) { return addr & 0x3ffff; }
+inline PLocalDeviceId getLocalDeviceId(PDeviceAddr addr) { return addr >> 19; }
+
+// Exclusive upper bound on local device ids (13-bit field, bits 31->19)
+inline uint32_t maxLocalDeviceId() { return 8192; }
+
+// Pins
+//   No      - means 'not ready to send'
+//   HostPin - means 'send to host'
+//   Pin(n)  - means 'send to application pin number n'
+typedef uint8_t PPin;
+#define No 0
+#define HostPin 1
+#define Pin(n) ((n)+2)
+
+// For template arguments that are not used
+struct None {};
+
+// Generic device structure
+// Type parameters:
+//   S - State
+//   E - Edge label
+//   M - Message structure
+template <typename S, typename E, typename M> struct PDevice {
+  // State
+  S* s;
+  PPin* readyToSend;
+  uint32_t numVertices;
+  uint16_t time;
+
+  // Handlers
+  void init();
+  void send(volatile M* msg);
+  void recv(M* msg, E* edge);
+  bool step();
+  bool finish(volatile M* msg);
+};
+
+// Generic device state structure
+template <typename S> struct ALIGNED PState {
+  // Board-level routing key for each outgoing pin
+  uint32_t pin[POLITE_NUM_PINS];
+  // Ready-to-send status
+  PPin readyToSend;
+  // Custom state
+  S state;
+};
+
+// Message structure
+template <typename M> struct PMessage {
+  // Destination thread-local device id
+  uint16_t devId;
+  // Id of incoming edge
+  uint16_t edgeId;
+  // Application message
+  M payload;
+};
+
+// An incoming edge to a device
+template <typename E> struct PInEdge {
+  E edge;
+};
+
+// Generic thread structure
+template <typename DeviceType,
+          typename S, typename E, typename M> struct PThread {
+
+  // Number of devices handled by thread
+  PLocalDeviceId numDevices;
+  // Number of times step handler has been called
+  uint16_t time;
+  // Number of devices in graph
+  uint32_t numVertices;
+  // Pointer to array of device states
+  PTR(PState<S>) devices;
+  // Pointer to base of edge table
+  PTR(PInEdge<E>) inTableBase;
+  // Array of local device ids that are ready to send
+  PTR(PLocalDeviceId) senders;
+  // This array is accessed in a LIFO manner
+  PTR(PLocalDeviceId) sendersTop;
+
+  // Count number of messages sent
+  #ifdef POLITE_COUNT_MSGS
+  // Total messages sent
+  uint32_t msgsSent;
+  // Total messages received
+  uint32_t msgsReceived;
+  // Number of times we wanted to send but couldn't
+  uint32_t blockedSends;
+  #endif
+
+  #ifdef TINSEL
+
+  // Helper function to construct a device
+  INLINE DeviceType getDevice(uint32_t id) {
+    DeviceType dev;
+    dev.s           = &devices[id].state;
+    dev.readyToSend = &devices[id].readyToSend;
+    dev.numVertices = numVertices;
+    dev.time        = time;
+    return dev;
+  }
+
+  // Dump performance counter stats over UART
+  void dumpStats() {
+    tinselPerfCountStop();
+    uint32_t me = tinselId();
+    // Per-cache performance counters
+    uint32_t cacheMask = (1 <<
+      (TinselLogThreadsPerCore + TinselLogCoresPerDCache)) - 1;
+    if ((me & cacheMask) == 0) {
+      printf("H:%x,M:%x,W:%x\n",
+        tinselHitCount(),
+        tinselMissCount(),
+        tinselWritebackCount());
+    }
+    // Per-core performance counters
+    uint32_t coreMask = (1 << (TinselLogThreadsPerCore)) - 1;
+    if ((me & coreMask) == 0) {
+      printf("C:%x %x,I:%x %x\n",
+        tinselCycleCountU(), tinselCycleCount(),
+        tinselCPUIdleCountU(), tinselCPUIdleCount());
+    }
+    // Per-thread performance counters
+    #ifdef POLITE_COUNT_MSGS
+    uint32_t intraBoardId = me & ((1<<TinselLogThreadsPerBoard) - 1);
+    uint32_t progRouterSent =
+      intraBoardId == 0 ? tinselProgRouterSent() : 0;
+    uint32_t progRouterSentInter =
+      intraBoardId == 0 ? tinselProgRouterSentInterBoard() : 0;
+    printf("MS:%x,MR:%x,PR:%x,PRI:%x,BL:%x\n",
+      msgsSent, msgsReceived, progRouterSent,
+        progRouterSentInter, blockedSends);
+    #endif
+  }
+
+  // Invoke device handlers
+  void run() {
+    // Did last call to step handler request a new time step?
+    bool active = true;
+
+    // Reset performance counters
+    tinselPerfCountReset();
+
+    // Initialisation
+    sendersTop = senders;
+    for (uint32_t i = 0; i < numDevices; i++) {
+      DeviceType dev = getDevice(i);
+      // Invoke the initialiser for each device
+      dev.init();
+      // Device ready to send?
+      if (*dev.readyToSend != No) {
+        *(sendersTop++) = i;
+      }
+    }
+
+    // Set number of flits per message
+    tinselSetLen((sizeof(PMessage<M>)-1) >> TinselLogBytesPerFlit);
+
+    // Event loop
+    while (1) {
+      // Try to send
+      if (sendersTop != senders) {
+        if (tinselCanSend()) {
+          // Get next sender
+          PLocalDeviceId src = *(--sendersTop);
+          // Lookup device
+          DeviceType dev = getDevice(src);
+          PPin pin = *dev.readyToSend;
+          // Invoke send handler
+          PMessage<M>* m = (PMessage<M>*) tinselSendSlot();
+          dev.send(&m->payload);
+          // Reinsert sender, if it still wants to send
+          if (*dev.readyToSend != No) sendersTop++;
+          // Is it a send to the host pin or a user pin?
+          if (pin == HostPin)
+            tinselSend(tinselHostId(), m);
+          else
+            tinselKeySend(devices[src].pin[pin-2], m);
+          #ifdef POLITE_COUNT_MSGS
+            msgsSent++;
+          #endif
+        }
+        else {
+          #ifdef POLITE_COUNT_MSGS
+            blockedSends++;
+          #endif
+          tinselWaitUntil(TINSEL_CAN_SEND|TINSEL_CAN_RECV);
+        }
+      }
+      else {
+        // Idle detection
+        int idle = tinselIdle(!active);
+        if (idle > 1)
+          break;
+        else if (idle) {
+          active = false;
+          for (uint32_t i = 0; i < numDevices; i++) {
+            DeviceType dev = getDevice(i);
+            // Invoke the step handler for each device
+            active = dev.step() || active;
+            // Device ready to send?
+            if (*dev.readyToSend != No) {
+              *(sendersTop++) = i;
+            }
+          }
+          time++;
+        }
+      }
+
+      // Try to receive
+      while (tinselCanRecv()) {
+        PMessage<M>* inMsg = (PMessage<M>*) tinselRecv();
+        PInEdge<E>* inEdge = &inTableBase[inMsg->edgeId];
+        // Lookup destination device
+        PLocalDeviceId id = inMsg->devId;
+        DeviceType dev = getDevice(id);
+        // Was it ready to send?
+        PPin oldReadyToSend = *dev.readyToSend;
+        // Invoke receive handler
+        dev.recv(&inMsg->payload, &inEdge->edge);
+        // Insert device into a senders array, if not already there
+        if (*dev.readyToSend != No && oldReadyToSend == No)
+          *(sendersTop++) = id;
+        #ifdef POLITE_COUNT_MSGS
+          msgsReceived++;
+        #endif
+        tinselFree(inMsg);
+      }
+    }
+
+    // Termination
+    #ifdef POLITE_DUMP_STATS
+      dumpStats();
+    #endif
+
+    // Invoke finish handler for each device
+    for (uint32_t i = 0; i < numDevices; i++) {
+      DeviceType dev = getDevice(i);
+      tinselWaitUntil(TINSEL_CAN_SEND);
+      PMessage<M>* m = (PMessage<M>*) tinselSendSlot();
+      if (dev.finish(&m->payload)) tinselSend(tinselHostId(), m);
+    }
+
+    // Sleep
+    tinselWaitUntil(TINSEL_CAN_RECV); while (1);
+  }
+
+  #endif
+
+};
+
+#endif
diff --git a/include/POLite/FastMap/PGraph.h b/include/POLite/FastMap/PGraph.h
new file mode 100644
index 00000000..8ac0c84d
--- /dev/null
+++ b/include/POLite/FastMap/PGraph.h
@@ -0,0 +1,710 @@
+// SPDX-License-Identifier: BSD-2-Clause
+#ifndef _PGRAPH_H_
+#define _PGRAPH_H_
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+#include <HostLink.h>
+#include <config.h>
+#include <POLite.h>
+#include <POLite/Seq.h>
+#include <POLite/Graph.h>
+#include <POLite/Placer.h>
+#include <POLite/ProgRouters.h>
+#include <type_traits>
+
+// Nodes of a POETS graph are devices
+typedef NodeId PDeviceId;
+
+// POETS graph
+template <typename DeviceType,
+          typename S, typename E, typename M> class PGraph {
+ private:
+  // Round addr up to the next multiple of 2^n (unchanged if already aligned)
+  inline uint32_t align(uint32_t n, uint32_t addr) {
+    const uint32_t lowBits = addr & ((1<<n)-1);
+    return lowBits ? ((addr >> n) + 1) << n : addr;
+  }
+
+  // Align address to 32-bit word boundary
+  uint32_t wordAlign(uint32_t addr) { return align(2, addr); }
+
+  // Align address to cache-line boundary
+  uint32_t cacheAlign(uint32_t addr) {
+    return align(TinselLogBytesPerLine, addr);
+  }
+
+  // Helper function
+  inline uint32_t min(uint32_t x, uint32_t y) { return x < y ? x : y; }
+
+  // Number of FPGA boards available
+  uint32_t meshLenX;
+  uint32_t meshLenY;
+
+  // Number of FPGA boards to use
+  uint32_t numBoardsX;
+  uint32_t numBoardsY;
+
+  // Out table (sender-side edge tables)
+  // Sequence of destinations for every (device, pin) pair
+  Seq<PRoutingDest>*** outTable;
+
+  // Key table (sender-side key tables)
+  // Global routing key for every (device, pin) pair
+  uint32_t** keyTable;
+
+  // In table (receiver-side edge tables)
+  // Sequence of incoming edges for every thread
+  Seq<PInEdge<E>>** inTable;
+
+  // Mesh of per-board programmable routers
+  ProgRouterMesh* routingTables;
+
+  // Generic constructor: record mesh size, apply POLITE_* env overrides
+  void constructor(uint32_t lenX, uint32_t lenY) {
+    meshLenX = lenX;
+    meshLenY = lenY;
+    char* str = getenv("POLITE_BOARDS_X");
+    int nx = str ? atoi(str) : meshLenX;
+    str = getenv("POLITE_BOARDS_Y");
+    int ny = str ? atoi(str) : meshLenY;
+    setNumBoards(nx, ny);
+    numDevices = 0;
+    devices = NULL;
+    toDeviceAddr = NULL;
+    numDevicesOnThread = NULL;
+    fromDeviceAddr = NULL;
+    vertexMem = NULL;      threadMem = NULL;
+    vertexMemSize = NULL;  threadMemSize = NULL;
+    vertexMemBase = NULL;  threadMemBase = NULL;
+    inEdgeMem = NULL;
+    inEdgeMemSize = NULL;
+    inEdgeMemBase = NULL;
+    mapVerticesToDRAM = false;
+    mapInEdgesToDRAM = true;
+    outTable = NULL;
+    keyTable = NULL;
+    inTable = NULL;
+    routingTables = NULL;
+    chatty = 0;
+    str = getenv("POLITE_CHATTY");
+    if (str != NULL) {
+      chatty = !strcmp(str, "0") ? 0 : 1;
+    }
+  }
+
+ public:
+  // Number of devices
+  uint32_t numDevices;
+
+  // Graph containing device ids and connections
+  Graph graph;
+
+  // Edge labels: has same structure as graph.outgoing
+  Seq<Seq<E>*> edgeLabels;
+
+  // Mapping from device id to device state
+  // (Not valid until the mapper is called)
+  PState<S>** devices;
+
+  // Mapping from thread id to number of devices on that thread
+  // (Not valid until the mapper is called)
+  uint32_t* numDevicesOnThread;
+
+  // Mapping from device id to device address and back
+  // (Not valid until the mapper is called)
+  PDeviceAddr* toDeviceAddr;  // Device id -> device address
+  PDeviceId** fromDeviceAddr; // Device address -> device id
+
+  // Each thread's vertex mem and thread mem regions
+  // (Not valid until the mapper is called)
+  uint8_t** vertexMem;      uint8_t** threadMem;
+  uint32_t* vertexMemSize;  uint32_t* threadMemSize;
+  uint32_t* vertexMemBase;  uint32_t* threadMemBase;
+
+  // Each thread's in-edge tables
+  // (Not valid until the mapper is called)
+  uint8_t** inEdgeMem;
+  uint32_t* inEdgeMemSize;
+  uint32_t* inEdgeMemBase;
+
+  // Where to map the various regions
+  // (If false, map to SRAM instead)
+  bool mapVerticesToDRAM;
+  bool mapInEdgesToDRAM;
+
+  // Allow mapper to print useful information to stdout
+  uint32_t chatty;
+
+  // Setter for number of boards to use (exits if request exceeds the mesh)
+  void setNumBoards(uint32_t x, uint32_t y) {
+    if (x > meshLenX || y > meshLenY) {
+      printf("Mapper: %u x %u boards requested, %u x %u available\n",
+        x, y, meshLenX, meshLenY); // report the request, not the old setting
+      exit(EXIT_FAILURE);
+    }
+    numBoardsX = x;
+    numBoardsY = y;
+  }
+
+  // Create new device
+  inline PDeviceId newDevice() {
+    edgeLabels.append(new SmallSeq<E>);
+    numDevices++;
+    return graph.newNode();
+  }
+
+  // Add a connection between devices
+  inline void addEdge(PDeviceId from, PinId pin, PDeviceId to) {
+    if (pin >= POLITE_NUM_PINS) {
+      printf("addEdge: pin exceeds POLITE_NUM_PINS\n");
+      exit(EXIT_FAILURE);
+    }
+    graph.addEdge(from, pin, to);
+    E edge;
+    edgeLabels.elems[from]->append(edge);
+  }
+
+  // Add labelled edge using given output pin
+  void addLabelledEdge(E edge, PDeviceId x, PinId pin, PDeviceId y) {
+    graph.addEdge(x, pin, y);
+    edgeLabels.elems[x]->append(edge);
+  }
+
+  // Allocate SRAM and DRAM partitions
+  void allocatePartitions() {
+    // Decide a maximum partition size that is reasonable
+    // SRAM: Partition size minus 2048 bytes for the stack
+    uint32_t maxSRAMSize = (1<<TinselLogBytesPerSRAMPartition) - 2048;
+    // DRAM: Partition size minus 65536 bytes for the stack
+    uint32_t maxDRAMSize = (1<<TinselLogBytesPerDRAMPartition) - 65536;
+    // Allocate partition sizes and bases
+    vertexMem = (uint8_t**) calloc(TinselMaxThreads, sizeof(uint8_t*));
+    vertexMemSize = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
+    vertexMemBase = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
+    threadMem = (uint8_t**) calloc(TinselMaxThreads, sizeof(uint8_t*));
+    threadMemSize = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
+    threadMemBase = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
+    inEdgeMem = (uint8_t**) calloc(TinselMaxThreads, sizeof(uint8_t*));
+    inEdgeMemSize = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
+    inEdgeMemBase = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
+    // Compute partition sizes for each thread
+    for (uint32_t threadId = 0; threadId < TinselMaxThreads; threadId++) {
+      // This variable is used to count the size of the *initialised*
+      // partition.  The total partition size is larger as it includes
+      // uninitialised portions.
+      uint32_t sizeVMem = 0;
+      uint32_t sizeEIMem = 0;
+      uint32_t sizeTMem = 0;
+      // Add space for thread structure (always stored in SRAM)
+      sizeTMem = cacheAlign(sizeof(PThread<DeviceType, S, E, M>));
+      // Add space for devices
+      uint32_t numDevs = numDevicesOnThread[threadId];
+      for (uint32_t devNum = 0; devNum < numDevs; devNum++) {
+        // Add space for device
+        sizeVMem = sizeVMem + sizeof(PState<S>);
+      }
+      // Add space for incoming edge table
+      if (inTable[threadId]) {
+        sizeEIMem = inTable[threadId]->numElems * sizeof(PInEdge<E>);
+        sizeEIMem = wordAlign(sizeEIMem);
+      }
+      // The total partition size including uninitialised portions
+      uint32_t totalSizeVMem =
+        sizeVMem + wordAlign(sizeof(PLocalDeviceId) * numDevs);
+      // Check that total size is reasonable
+      uint32_t totalSizeSRAM = sizeTMem;
+      uint32_t totalSizeDRAM = 0;
+      if (mapVerticesToDRAM) totalSizeDRAM += totalSizeVMem;
+                        else totalSizeSRAM += totalSizeVMem;
+      if (mapInEdgesToDRAM)  totalSizeDRAM += sizeEIMem;
+                        else totalSizeSRAM += sizeEIMem;
+      if (totalSizeDRAM > maxDRAMSize) {
+        printf("Error: max DRAM partition size exceeded\n");
+        exit(EXIT_FAILURE);
+      }
+      if (totalSizeSRAM > maxSRAMSize) {
+        printf("Error: max SRAM partition size exceeded\n");
+        exit(EXIT_FAILURE);
+      }
+      // Allocate space for the initialised portion of the partition
+      assert((sizeVMem%4) == 0);
+      assert((sizeTMem%4) == 0);
+      assert((sizeEIMem%4) == 0);
+      vertexMem[threadId] = (uint8_t*) calloc(sizeVMem, 1);
+      vertexMemSize[threadId] = sizeVMem;
+      threadMem[threadId] = (uint8_t*) calloc(sizeTMem, 1);
+      threadMemSize[threadId] = sizeTMem;
+      inEdgeMem[threadId] = (uint8_t*) calloc(sizeEIMem, 1);
+      inEdgeMemSize[threadId] = sizeEIMem;
+      // Tinsel address of base of partition
+      uint32_t partId = threadId & (TinselThreadsPerDRAM-1);
+      uint32_t sramBase = (1 << TinselLogBytesPerSRAM) +
+          (partId << TinselLogBytesPerSRAMPartition);
+      uint32_t dramBase = TinselBytesPerDRAM -
+          ((partId+1) << TinselLogBytesPerDRAMPartition);
+      // Use partition-interleaved region for DRAM
+      dramBase |= 0x80000000;
+      threadMemBase[threadId] = sramBase;
+      sramBase += threadMemSize[threadId];
+      // Determine base addresses of each region
+      if (mapVerticesToDRAM) {
+        vertexMemBase[threadId] = dramBase;
+        dramBase += totalSizeVMem;
+      }
+      else {
+        vertexMemBase[threadId] = sramBase;
+        sramBase += totalSizeVMem;
+      }
+      if (mapInEdgesToDRAM) {
+        inEdgeMemBase[threadId] = dramBase;
+        dramBase += sizeEIMem;
+      }
+      else {
+        inEdgeMemBase[threadId] = sramBase;
+        sramBase += sizeEIMem;
+      }
+    }
+  }
+
+  // Initialise partitions
+  void initialisePartitions() {
+    for (uint32_t threadId = 0; threadId < TinselMaxThreads; threadId++) {
+      // Next pointers for each partition
+      uint32_t nextVMem = 0;
+      // Pointer to thread structure
+      PThread<DeviceType, S, E, M>* thread =
+        (PThread<DeviceType, S, E, M>*) &threadMem[threadId][0];
+      // Set number of devices on thread
+      thread->numDevices = numDevicesOnThread[threadId];
+      // Set number of devices in graph
+      thread->numVertices = numDevices;
+      // Set tinsel address of array of device states
+      thread->devices = vertexMemBase[threadId];
+      // Set tinsel address of base of in-edge table
+      thread->inTableBase = inEdgeMemBase[threadId];
+      // Add space for each device on thread
+      uint32_t numDevs = numDevicesOnThread[threadId];
+      for (uint32_t devNum = 0; devNum < numDevs; devNum++) {
+        PState<S>* dev = (PState<S>*) &vertexMem[threadId][nextVMem];
+        PDeviceId id = fromDeviceAddr[threadId][devNum];
+        devices[id] = dev;
+        // Add space for device
+        nextVMem = nextVMem + sizeof(PState<S>);
+      }
+      // Initialise each device and the thread's out edges
+      for (uint32_t devNum = 0; devNum < numDevs; devNum++) {
+        PDeviceId id = fromDeviceAddr[threadId][devNum];
+        PState<S>* dev = devices[id];
+        // Initialise
+        for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) {
+          dev->pin[p] = keyTable[id][p];
+        }
+      }
+      // Initialise thread's in edges
+      PInEdge<E>* inEdgeArray = (PInEdge<E>*) inEdgeMem[threadId];
+      Seq<PInEdge<E>>* edges = inTable[threadId];
+      if (edges)
+        for (uint32_t i = 0; i < edges->numElems; i++) {
+          inEdgeArray[i] = edges->elems[i];
+        }
+      // At this point, check that next pointers line up with heap sizes
+      if (nextVMem != vertexMemSize[threadId]) {
+        printf("Error: vertex mem size does not match pre-computed size\n");
+        exit(EXIT_FAILURE);
+      }
+      // Set tinsel address of senders array
+      thread->senders = vertexMemBase[threadId] + nextVMem;
+    }
+  }
+
+  // Allocate mapping structures
+  void allocateMapping() {
+    devices = (PState<S>**) calloc(numDevices, sizeof(PState<S>*));
+    toDeviceAddr = (PDeviceAddr*) calloc(numDevices, sizeof(PDeviceAddr));
+    fromDeviceAddr = (PDeviceId**) calloc(TinselMaxThreads, sizeof(PDeviceId*));
+    numDevicesOnThread = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
+  }
+
+  // Allocate thread edge input and output tables
+  // (Only valid after mapper is called)
+  void allocateInOutTables() {
+    // Receiver-side tables
+    inTable = (Seq<PInEdge<E>>**)
+      calloc(TinselMaxThreads,sizeof(Seq<PInEdge<E>>*));
+    for (uint32_t t = 0; t < TinselMaxThreads; t++) {
+      if (numDevicesOnThread[t] != 0)
+        inTable[t] = new SmallSeq<PInEdge<E>>;
+    }
+
+    // Sender-side tables
+    outTable = (Seq<PRoutingDest>***)
+      calloc(numDevices, sizeof(Seq<PRoutingDest>**));
+    for (uint32_t d = 0; d < numDevices; d++) {
+      outTable[d] = (Seq<PRoutingDest>**)
+        calloc(POLITE_NUM_PINS, sizeof(Seq<PRoutingDest>*));
+      for (uint32_t p = 0; p < POLITE_NUM_PINS; p++)
+        outTable[d][p] = new SmallSeq<PRoutingDest>;
+    }
+
+    keyTable = new uint32_t* [numDevices];
+    for (uint32_t d = 0; d < numDevices; d++)
+      keyTable[d] = new uint32_t [POLITE_NUM_PINS];
+  }
+
+  // Compute thread edge input and output tables
+  // (Only valid after mapper is called)
+  void computeInOutTables() {
+    // For each device
+    for (uint32_t d = 0; d < numDevices; d++) {
+      // For each pin (NOTE(review): dests are not filtered by pin - confirm)
+      for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) {
+        Seq<PDeviceId>* dests = graph.outgoing->elems[d];
+        Seq<E>* edges = edgeLabels.elems[d];
+        for (uint32_t i = 0; i < dests->numElems; i++) {
+          PDeviceId destId = dests->elems[i];
+          // Destination thread id
+          uint32_t threadId = getThreadId(toDeviceAddr[destId]);
+          // Thread-local device id
+          uint32_t devId = getLocalDeviceId(toDeviceAddr[destId]);
+          // Add edge to thread's input table; edgeId is its index there
+          uint32_t edgeId = inTable[threadId]->numElems;
+          if (i < edges->numElems) {
+            PInEdge<E> edge;
+            edge.edge = edges->elems[i];
+            inTable[threadId]->append(edge);
+          }
+          // Add output table entry
+          PRoutingDest rdest;
+          rdest.kind = PRDestKindURM1;
+          rdest.mbox = threadId >> TinselLogThreadsPerMailbox;
+          rdest.urm1.key = devId | (edgeId << 16);
+          rdest.urm1.threadId = threadId &
+            ((1<<TinselLogThreadsPerMailbox) - 1);
+          outTable[d][p]->append(rdest);
+        }
+      }
+    }
+  }
+
+  // Release all structures (pointers used as guards are reset afterwards)
+  void releaseAll() {
+    if (devices != NULL) {
+      free(devices); devices = NULL; // guard for this whole group
+      free(toDeviceAddr);
+      free(numDevicesOnThread);
+      for (uint32_t t = 0; t < TinselMaxThreads; t++)
+        if (fromDeviceAddr[t] != NULL) free(fromDeviceAddr[t]);
+      free(fromDeviceAddr);
+      for (uint32_t t = 0; t < TinselMaxThreads; t++)
+        if (vertexMem[t] != NULL) free(vertexMem[t]);
+      free(vertexMem);
+      free(vertexMemSize);
+      free(vertexMemBase);
+      for (uint32_t t = 0; t < TinselMaxThreads; t++)
+        if (threadMem[t] != NULL) free(threadMem[t]);
+      free(threadMem);
+      free(threadMemSize);
+      free(threadMemBase);
+      for (uint32_t t = 0; t < TinselMaxThreads; t++)
+        if (inEdgeMem[t] != NULL) free(inEdgeMem[t]);
+      free(inEdgeMem);
+      free(inEdgeMemSize);
+      free(inEdgeMemBase);
+    }
+    if (inTable != NULL) {
+      for (uint32_t t = 0; t < TinselMaxThreads; t++)
+        if (inTable[t] != NULL) delete inTable[t];
+      free(inTable);
+      inTable = NULL;
+    }
+    if (outTable != NULL) {
+      for (uint32_t d = 0; d < numDevices; d++) {
+        if (outTable[d] == NULL) continue;
+        for (uint32_t p = 0; p < POLITE_NUM_PINS; p++)
+          delete outTable[d][p];
+        free(outTable[d]);
+      }
+      free(outTable);
+      outTable = NULL;
+    }
+    if (keyTable != NULL) {
+      for (uint32_t d = 0; d < numDevices; d++) delete [] keyTable[d];
+      delete [] keyTable;
+      keyTable = NULL;
+    }
+    delete routingTables; routingTables = NULL; // deleting NULL is a no-op
+  }
+
+  // Implement mapping to tinsel threads
+  void map() {
+    // Let's measure some times
+    struct timeval placementStart, placementFinish;
+    struct timeval routingStart, routingFinish;
+    struct timeval initStart, initFinish;
+
+    // Release all mapping and heap structures
+    releaseAll();
+
+    // Reallocate mapping structures
+    allocateMapping();
+
+    // Start placement timer
+    gettimeofday(&placementStart, NULL);
+
+    // Partition into subgraphs, one per board
+    Placer boards(&graph, numBoardsX, numBoardsY);
+
+    // Place subgraphs onto 2D mesh
+    const uint32_t placerEffort = 8;
+    boards.place(placerEffort);
+
+    // For each board
+    for (uint32_t boardY = 0; boardY < numBoardsY; boardY++) {
+      for (uint32_t boardX = 0; boardX < numBoardsX; boardX++) {
+        // Partition into subgraphs, one per mailbox
+        PartitionId b = boards.mapping[boardY][boardX];
+        Placer boxes(&boards.subgraphs[b], 
+                 TinselMailboxMeshXLen, TinselMailboxMeshYLen);
+        boxes.place(placerEffort);
+
+        // For each mailbox
+        for (uint32_t boxX = 0; boxX < TinselMailboxMeshXLen; boxX++) {
+          for (uint32_t boxY = 0; boxY < TinselMailboxMeshYLen; boxY++) {
+            // Partition into subgraphs, one per thread
+            uint32_t numThreads = 1<<TinselLogThreadsPerMailbox;
+            PartitionId t = boxes.mapping[boxY][boxX];
+            Placer threads(&boxes.subgraphs[t], numThreads, 1);
+
+            // For each thread
+            for (uint32_t threadNum = 0; threadNum < numThreads; threadNum++) {
+              // Determine tinsel thread id
+              uint32_t threadId = boardY;
+              threadId = (threadId << TinselMeshXBits) | boardX;
+              threadId = (threadId << TinselMailboxMeshYBits) | boxY;
+              threadId = (threadId << TinselMailboxMeshXBits) | boxX;
+              threadId = (threadId << (TinselLogCoresPerMailbox +
+                            TinselLogThreadsPerCore)) | threadNum;
+
+              // Get subgraph
+              Graph* g = &threads.subgraphs[threadNum];
+
+              // Populate fromDeviceAddr mapping
+              uint32_t numDevs = g->incoming->numElems;
+              numDevicesOnThread[threadId] = numDevs;
+              fromDeviceAddr[threadId] = (PDeviceId*)
+                malloc(sizeof(PDeviceId) * numDevs);
+              for (uint32_t devNum = 0; devNum < numDevs; devNum++)
+                fromDeviceAddr[threadId][devNum] = g->labels->elems[devNum];
+  
+              // Populate toDeviceAddr mapping
+              assert(numDevs < maxLocalDeviceId());
+              for (uint32_t devNum = 0; devNum < numDevs; devNum++) {
+                PDeviceAddr devAddr =
+                  makeDeviceAddr(threadId, devNum);
+                toDeviceAddr[g->labels->elems[devNum]] = devAddr;
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // Stop placement timer and start In/Out table timer
+    gettimeofday(&placementFinish, NULL);
+    gettimeofday(&routingStart, NULL);
+
+    // Compute send and receive side routing tables
+    allocateInOutTables();
+    computeInOutTables();
+
+    // Compute per-board programmable routing tables
+    routingTables = new ProgRouterMesh(numBoardsX, numBoardsY);
+    for (uint32_t d = 0; d < numDevices; d++) {
+      uint32_t src = getThreadId(toDeviceAddr[d]) >>
+        TinselLogThreadsPerMailbox;
+      for (uint32_t p = 0; p < POLITE_NUM_PINS; p++)
+        keyTable[d][p] = routingTables->addDestsFromBoard(src, outTable[d][p]);
+   }
+
+    // Stop routing timer and start init timer
+    gettimeofday(&routingFinish, NULL);
+    gettimeofday(&initStart, NULL);
+
+    // Reallocate and initialise heap structures
+    allocatePartitions();
+    initialisePartitions();
+
+    // Display times, if chatty
+    gettimeofday(&initFinish, NULL);
+    if (chatty > 0) {
+      struct timeval diff;
+
+      timersub(&placementFinish, &placementStart, &diff);
+      double duration = (double) diff.tv_sec +
+        (double) diff.tv_usec / 1000000.0;
+      printf("POLite mapper profile:\n");
+      printf("  Partitioning and placement: %lfs\n", duration);
+
+      timersub(&routingFinish, &routingStart, &diff);
+      duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
+      printf("  In/Out table construction: %lfs\n", duration);
+
+      timersub(&initFinish, &initStart, &diff);
+      duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
+      printf("  Thread state initialisation: %lfs\n", duration);
+    }
+  }
+
+  // Constructor
+  PGraph() {
+    char* str = getenv("HOSTLINK_BOXES_X");
+    int x = str ? atoi(str) : 1;
+    x = x * TinselMeshXLenWithinBox;
+    str = getenv("HOSTLINK_BOXES_Y");
+    int y = str ? atoi(str) : 1;
+    y = y * TinselMeshYLenWithinBox;
+    constructor(x, y);
+  }
+  PGraph(uint32_t numBoxesX, uint32_t numBoxesY) {
+    int x = numBoxesX * TinselMeshXLenWithinBox; 
+    int y = numBoxesY * TinselMeshYLenWithinBox;
+    constructor(x, y);
+  }
+
+  // Destructor
+  ~PGraph() {
+    releaseAll();
+    for (uint32_t i = 0; i < edgeLabels.numElems; i++)
+      delete edgeLabels.elems[i];
+  }
+
+  // Write partition to tinsel machine
+  void writeRAM(HostLink* hostLink,
+         uint8_t** heap, uint32_t* heapSize, uint32_t* heapBase) {
+    // Number of bytes written by each thread
+    uint32_t* writeCount = (uint32_t*)
+      calloc(TinselMaxThreads, sizeof(uint32_t));
+
+    // Number of threads completed by each core
+    uint32_t*** threadCount = (uint32_t***)
+      calloc(meshLenX, sizeof(uint32_t**));
+    for (uint32_t x = 0; x < meshLenX; x++) {
+      threadCount[x] = (uint32_t**)
+        calloc(meshLenY, sizeof(uint32_t*));
+      for (uint32_t y = 0; y < meshLenY; y++)
+        threadCount[x][y] = (uint32_t*)
+          calloc(TinselCoresPerBoard, sizeof(uint32_t));
+    }
+
+    // Initialise write addresses
+    for (int x = 0; x < meshLenX; x++)
+      for (int y = 0; y < meshLenY; y++)
+        for (int c = 0; c < TinselCoresPerBoard; c++)
+          hostLink->setAddr(x, y, c, heapBase[hostLink->toAddr(x, y, c, 0)]);
+
+    // Write heaps
+    uint32_t done = false;
+    while (! done) {
+      done = true;
+      for (int x = 0; x < meshLenX; x++) {
+        for (int y = 0; y < meshLenY; y++) {
+          for (int c = 0; c < TinselCoresPerBoard; c++) {
+            uint32_t t = threadCount[x][y][c];
+            if (t < TinselThreadsPerCore) {
+              done = false;
+              uint32_t threadId = hostLink->toAddr(x, y, c, t);
+              uint32_t written = writeCount[threadId];
+              if (written == heapSize[threadId]) {
+                threadCount[x][y][c] = t+1;
+                if ((t+1) < TinselThreadsPerCore)
+                  hostLink->setAddr(x, y, c,
+                    heapBase[hostLink->toAddr(x, y, c, t+1)]);
+              } else {
+                uint32_t send = min((heapSize[threadId] - written)>>2, 15);
+                hostLink->store(x, y, c, send,
+                  (uint32_t*) &heap[threadId][written]);
+                writeCount[threadId] = written + send * sizeof(uint32_t);
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // Release memory
+    free(writeCount);
+    for (uint32_t x = 0; x < meshLenX; x++) {
+      for (uint32_t y = 0; y < meshLenY; y++)
+        free(threadCount[x][y]);
+      free(threadCount[x]);
+    }
+    free(threadCount);
+  }
+
+  // Write graph to tinsel machine
+  void write(HostLink* hostLink) { 
+    // Start timer
+    struct timeval start, finish;
+    gettimeofday(&start, NULL);
+
+    bool useSendBufferOld = hostLink->useSendBuffer;
+    hostLink->useSendBuffer = true;
+    writeRAM(hostLink, vertexMem, vertexMemSize, vertexMemBase);
+    writeRAM(hostLink, threadMem, threadMemSize, threadMemBase);
+    writeRAM(hostLink, inEdgeMem, inEdgeMemSize, inEdgeMemBase);
+    routingTables->write(hostLink);
+    hostLink->flush();
+    hostLink->useSendBuffer = useSendBufferOld;
+
+    // Display time if chatty
+    gettimeofday(&finish, NULL);
+    if (chatty > 0) {
+      struct timeval diff;
+      timersub(&finish, &start, &diff);
+      double duration = (double) diff.tv_sec +
+        (double) diff.tv_usec / 1000000.0;
+      printf("POLite graph upload time: %lfs\n", duration);
+    }
+  }
+
+  // Determine fan-in of given device
+  uint32_t fanIn(PDeviceId id) {
+    return graph.fanIn(id);
+  }
+
+  // Determine fan-out of given device
+  uint32_t fanOut(PDeviceId id) {
+    return graph.fanOut(id);
+  }
+
+};
+
+// Read performance stats and store in file
+inline void politeSaveStats(HostLink* hostLink, const char* filename) {
+  #ifdef POLITE_DUMP_STATS
+  // Open file for performance counters
+  FILE* statsFile = fopen(filename, "wt");
+  if (statsFile == NULL) {
+    printf("Error creating stats file\n");
+    exit(EXIT_FAILURE);
+  }
+  uint32_t meshLenX = hostLink->meshXLen;
+  uint32_t meshLenY = hostLink->meshYLen;
+  // Number of caches
+  uint32_t numLines = meshLenX * meshLenY *
+                        TinselDCachesPerDRAM * TinselDRAMsPerBoard;
+  // Add on number of cores
+  numLines += meshLenX * meshLenY * TinselCoresPerBoard;
+  // Add on number of threads
+  #ifdef POLITE_COUNT_MSGS
+  numLines += meshLenX * meshLenY * TinselThreadsPerBoard;
+  #endif
+  hostLink->dumpStdOut(statsFile, numLines);
+  fclose(statsFile);
+  #endif
+}
+
+#endif

From ce3894962fbff386925def486ec2c675ae439093 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mn416@jennings.cl.cam.ac.uk>
Date: Tue, 5 May 2020 20:34:20 +0000
Subject: [PATCH 52/78] Allow random placement

---
 include/POLite/Placer.h | 60 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 59 insertions(+), 1 deletion(-)

diff --git a/include/POLite/Placer.h b/include/POLite/Placer.h
index 32aec831..1468f0c7 100644
--- a/include/POLite/Placer.h
+++ b/include/POLite/Placer.h
@@ -10,6 +10,14 @@ typedef uint32_t PartitionId;
 
 // Partition and place a graph on a 2D mesh
 struct Placer {
+  // Select between different methods
+  enum Method {
+    Default,
+    Metis,
+    Random
+  };
+  const Method defaultMethod=Metis;
+
   // The graph being placed
   Graph* graph;
 
@@ -41,8 +49,31 @@ struct Placer {
   uint32_t* yCoordSaved;
   uint64_t savedCost;
 
+  // Controls which strategy is used
+  Method method = Default;
+
+  // Select placer method
+  void chooseMethod()
+  {
+    auto e = getenv("POLITE_PLACER");
+    if (e) {
+      if (!strcmp(e, "metis"))
+        method=Metis;
+      else if (!strcmp(e, "random"))
+        method=Random;
+      else if (!strcmp(e, "default") || *e == '\0')
+        method=Default;
+      else {
+        fprintf(stderr, "Don't understand placer method : %s\n", e);
+        exit(EXIT_FAILURE);
+      }
+    }
+    if (method == Default)
+      method = defaultMethod;
+  }
+
   // Partition the graph using Metis
-  void partition() {
+  void partitionMetis() {
     // Compute total number of edges
     uint32_t numEdges = 0;
     for (uint32_t i = 0; i < graph->incoming->numElems; i++) {
@@ -116,6 +147,31 @@ struct Placer {
     free(parts);
   }
 
+  // Partition the graph randomly
+  void partitionRandom() {
+    uint32_t numVertices = graph->incoming->numElems;
+    uint32_t numParts = width * height;
+
+    // Populate result array
+    srand(0);
+    for (uint32_t i = 0; i < numVertices; i++) {
+      partitions[i] = rand() % numParts;
+    }
+  }
+
+  void partition()
+  {
+    switch(method){
+    case Default:
+    case Metis:
+      partitionMetis();
+      break;
+    case Random:
+      partitionRandom();
+      break;
+    }
+  }
+
   // Create subgraph for each partition
   void computeSubgraphs() {
     uint32_t numPartitions = width*height;
@@ -316,6 +372,8 @@ struct Placer {
     yCoord = new uint32_t [width*height];
     xCoordSaved = new uint32_t [width*height];
     yCoordSaved = new uint32_t [width*height];
+    // Pick a placement method, or select default
+    chooseMethod();
     // Partition the graph using Metis
     partition();
     // Compute subgraphs, one per partition

From a679245e9559c473de2640c77b48ca87ec7cd7b5 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Tue, 5 May 2020 21:58:50 +0100
Subject: [PATCH 53/78] Full rate ProgRouter

---
 config.py          |  3 +-
 rtl/Network.bsv    | 14 +++-----
 rtl/ProgRouter.bsv | 86 ++++++++++++++++++----------------------------
 3 files changed, 38 insertions(+), 65 deletions(-)

diff --git a/config.py b/config.py
index e9917ae5..6faca950 100755
--- a/config.py
+++ b/config.py
@@ -164,7 +164,6 @@ def quoted(s): return "'\"" + s + "\"'"
 # Programmable router parameters:
 p["LogRoutingEntryLen"] = 5 # Number of beats in a routing table entry
 p["ProgRouterMaxBurst"] = 4
-p["ProgRouterCrossbarOutputs"] = 4
 p["FetcherLogIndQueueSize"] = 1
 p["FetcherLogBeatBufferSize"] = 5
 p["FetcherLogFlitBufferSize"] = 5
@@ -380,7 +379,7 @@ def quoted(s): return "'\"" + s + "\"'"
 
 # Parameters for programmable routers
 # (and the routing-record fetchers they contain)
-p["FetchersPerProgRouter"] = 6
+p["FetchersPerProgRouter"] = 4 + p["MailboxMeshXLen"]
 p["LogFetcherFlitBufferSize"] = 5
 
 #==============================================================================
diff --git a/rtl/Network.bsv b/rtl/Network.bsv
index 136d327c..07d9adfd 100644
--- a/rtl/Network.bsv
+++ b/rtl/Network.bsv
@@ -401,22 +401,16 @@ module mkNoC#(
     westLink[0].flitOut, boardRouter.flitIn[3]);
 
   // Connect mailbox mesh south rim to board router
-  function List#(t) single(t elem) = List::cons(elem, Nil);
-  List#(Out#(Flit)) botOutList0 = Nil;
-  List#(Out#(Flit)) botOutList1 = Nil;
-  for (Integer x = `MailboxMeshXLen-1; x >= 0; x=x-2) begin
-    botOutList0 = Cons(routers[0][x].bottomOut, botOutList0);
-    botOutList1 = Cons(routers[0][x-1].bottomOut, botOutList1);
-  end
-  reduceConnect(mkFlitMerger, botOutList0, single(boardRouter.flitIn[4]));
-  reduceConnect(mkFlitMerger, botOutList1, single(boardRouter.flitIn[5]));
+  for (Integer i = 0; i < `MailboxMeshXLen; i=i+1)
+    connectUsing(mkUGShiftQueue1(QueueOptFmax),
+      routers[0][i].bottomOut, boardRouter.flitIn[4+i]);
 
   // Connect board router to mailbox mesh south rim
   function In#(Flit) getBottomIn(MeshRouter r) = r.bottomIn;
   Vector#(`MailboxMeshXLen, In#(Flit)) southRimInPorts =
     map(getBottomIn, routers[0]);
   for (Integer i = 0; i < `MailboxMeshXLen; i=i+1)
-    connectDirect(boardRouter.nocFlitOut[i], southRimInPorts[i]);
+    connectDirect(boardRouter.flitOut[4+i], southRimInPorts[i]);
 
   // Detect inter-board activity
   // ---------------------------
diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index ae361ed1..e8a7d97c 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -179,24 +179,17 @@ typedef struct {
 // NoC edge, but the diagram assumes four.
 
 //
-//               N     S     E     W   L0/L1 L2/L3     Input flits
-//               |     |     |     |     |     |  
-//             +---+ +---+ +---+ +---+ +---+ +---+
-//             | F | | F | | F | | F | | F | | F |     Fetchers
-//             +---+ +---+ +---+ +---+ +---+ +---+
-//               |     |     |     |     |     |  
-//             +---------------------------------+      
-//             |          Crossbar               |     Routing
-//             +---------------------------------+      
-//               |     |     |     |              
-//              N/L0  S/L1  E/L2  W/L3                 Output queues
-//               |     |     |     |
-//             +---------------------------+
-//             |          Splitter         |           Final splitting
-//             +---------------------------+
-//               |  |  |  |  |  |  |  |
-//               N  S  E  W  L0 L1 L2 L3               Output flits
-//
+//   N     S     E     W    L0     L1    L2    L3     Input flits
+//   |     |     |     |     |     |     |     |
+// +---+ +---+ +---+ +---+ +---+ +---+ +---+ +---+
+// | F | | F | | F | | F | | F | | F | | F | | F |    Fetchers
+// +---+ +---+ +---+ +---+ +---+ +---+ +---+ +---+
+//   |     |     |     |     |     |     |     |
+// +-------------------------------------------+      
+// |                 Crossbar                  |      Routing
+// +-------------------------------------------+      
+//   |     |     |     |     |     |     |     |  
+//   N     S     E     W     L0    L1    L2    L3     Output queues
 
 // The core functionality is implemented in the fetchers, which:
 //   (1) extract routing keys from incoming flits;
@@ -218,10 +211,7 @@ typedef struct {
 
 // After the fetchers have interpreted the flits, they are fed to a
 // fair crossbar which organises them by destination into output
-// queues.  To reduce logic, we allow each inter-board link to share
-// an output queue with a local link, as this does not compromise
-// forward progress.  Finally the queues are split to provide an
-// output stream for each possible destination.
+// queues.
 
 // =============================================================================
 // Fetcher
@@ -751,8 +741,8 @@ module mkProgRouterCrossbar#(
   // Current choice of flit source
   Vector#(numOut, Reg#(Bit#(numIn))) choiceReg <- replicateM(mkReg(0));
 
-  // Output queue
-  Vector#(numOut, Queue1#(RoutedFlit)) outQueue <-
+  // Output queues
+  Vector#(numOut, Queue#(RoutedFlit)) outQueue <-
     replicateM(mkUGShiftQueue(QueueOptFmax));
 
   // Selector mux for each out queue
@@ -857,8 +847,7 @@ endinterface
 interface ProgRouter;
   // Incoming and outgoing flits
   interface Vector#(`FetchersPerProgRouter, In#(Flit)) flitIn;
-  interface Vector#(`ProgRouterCrossbarOutputs, BOut#(Flit)) flitOut;
-  interface Vector#(`MailboxMeshXLen, BOut#(Flit)) nocFlitOut;
+  interface Vector#(`FetchersPerProgRouter, BOut#(Flit)) flitOut;
 
   // Interface to off-chip memory
   interface Vector#(`DRAMsPerBoard,
@@ -879,44 +868,36 @@ module mkProgRouter#(BoardId boardId) (ProgRouter);
     fetchers[i] <- mkFetcher(boardId, i);
 
   // Crossbar routing functions
-  function Bit#(2) xcoord(RoutedFlit rf) =
+  function Bit#(`MailboxMeshXBits) xcoord(RoutedFlit rf) =
     zeroExtend(rf.flit.dest.addr.mbox.x);
-  function Bool routeN(RoutedFlit rf) =
-    rf.decision == RouteNorth || (rf.decision == RouteNoC && xcoord(rf) == 0);
-  function Bool routeS(RoutedFlit rf) =
-    rf.decision == RouteSouth || (rf.decision == RouteNoC && xcoord(rf) == 1);
-  function Bool routeE(RoutedFlit rf) =
-    rf.decision == RouteEast || (rf.decision == RouteNoC && xcoord(rf) == 2);
-  function Bool routeW(RoutedFlit rf) =
-    rf.decision == RouteWest || (rf.decision == RouteNoC && xcoord(rf) == 3);
-  Vector#(`ProgRouterCrossbarOutputs, SelectorFunc) funcs =
-    vector(routeN, routeS, routeE, routeW);
+  function Bool routeN(RoutedFlit rf) = rf.decision == RouteNorth;
+  function Bool routeS(RoutedFlit rf) = rf.decision == RouteSouth;
+  function Bool routeE(RoutedFlit rf) = rf.decision == RouteEast;
+  function Bool routeW(RoutedFlit rf) = rf.decision == RouteWest;
+  function Bool routeL(Bit#(`MailboxMeshXBits) x, RoutedFlit rf) =
+    rf.decision == RouteNoC && xcoord(rf) == x;
+  Vector#(`FetchersPerProgRouter, SelectorFunc) funcs;
+  funcs[0] = routeN; funcs[1] = routeS;
+  funcs[2] = routeE; funcs[3] = routeW;
+  for (Integer i = 0; i < `MailboxMeshXLen; i=i+1)
+    funcs[4+i] = routeL(fromInteger(i));
 
   // Crossbar
   function BOut#(RoutedFlit) getFetcherFlitOut(Fetcher f) = f.flitOut;
   Vector#(`FetchersPerProgRouter, BOut#(RoutedFlit)) fetcherOuts =
     map(getFetcherFlitOut, fetchers);
-  Vector#(`ProgRouterCrossbarOutputs, BOut#(RoutedFlit))
+  Vector#(`FetchersPerProgRouter, BOut#(RoutedFlit))
     crossbarOuts <- mkProgRouterCrossbar(funcs, fetcherOuts);
+  Vector#(`FetchersPerProgRouter, BOut#(Flit)) crossbarOutFlits;
+  function Flit toFlit (RoutedFlit rf) = rf.flit;
+  for (Integer i = 0; i < `FetchersPerProgRouter; i=i+1)
+    crossbarOutFlits[i] <- onBOut(toFlit, crossbarOuts[i]);
 
   // Flit input interfaces
   Vector#(`FetchersPerProgRouter, In#(Flit)) flitInIfc = newVector;
   for (Integer i = 0; i < `FetchersPerProgRouter; i=i+1)
     flitInIfc[i] = fetchers[i].flitIn;
 
-  // Flit output interfaces
-  Vector#(`ProgRouterCrossbarOutputs, BOut#(Flit)) flitOutIfc = newVector;
-  Vector#(`MailboxMeshXLen, BOut#(Flit)) nocFlitOutIfc = newVector;
-
-  // Strands
-  function Bool forNoC(RoutedFlit rf) = rf.decision == RouteNoC;
-  for (Integer i = 0; i < `ProgRouterCrossbarOutputs; i=i+1) begin
-    match {.noc, .other} <- splitFlits(forNoC, crossbarOuts[i]);
-    flitOutIfc[i] = other;
-    if (i < `MailboxMeshXLen) nocFlitOutIfc[i] = noc;
-  end
-  function Flit toFlit (RoutedFlit rf) = rf.flit;
-
   // RAM interfaces
   Vector#(`DRAMsPerBoard, Vector#(`FetchersPerProgRouter, In#(DRAMResp)))
     ramRespIfc = replicate(newVector);
@@ -945,8 +926,7 @@ module mkProgRouter#(BoardId boardId) (ProgRouter);
 
   function FetcherActivity getActivity(Fetcher f) = f.activity;
   interface flitIn = flitInIfc;
-  interface flitOut = flitOutIfc;
-  interface nocFlitOut = nocFlitOutIfc;
+  interface flitOut = crossbarOutFlits;
   interface ramReqs = ramReqIfc;
   interface ramResps = ramRespIfc;
   interface activities = map(getActivity, fetchers);

From 5c72ead39232f13b892177410c7394ba1ffd5707 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Wed, 6 May 2020 14:22:53 +0100
Subject: [PATCH 54/78] Set clock to 215MHz

---
 README.md                     | 2 +-
 apps/POLite/util/sumstats.awk | 2 +-
 config.py                     | 2 +-
 de5/S5_DDR3_QSYS.qsys         | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index c6101804..c148323a 100644
--- a/README.md
+++ b/README.md
@@ -1180,7 +1180,7 @@ the DE5-Net.
   `MeshXLenWithinBox`      |       3 | Boards in X dimension within box
   `MeshYLenWithinBox`      |       2 | Boards in Y dimension within box
   `EnablePerfCount`        |    True | Enable performance counters
-  `ClockFreq`              |     225 | Clock frequency in MHz
+  `ClockFreq`              |     215 | Clock frequency in MHz
 
 Further parameters can be found in [config.py](config.py).
 
diff --git a/apps/POLite/util/sumstats.awk b/apps/POLite/util/sumstats.awk
index 9ccfac8c..719699aa 100755
--- a/apps/POLite/util/sumstats.awk
+++ b/apps/POLite/util/sumstats.awk
@@ -15,7 +15,7 @@ BEGIN {
   progRouterSent = 0;
   progRouterSentInter = 0;
   blockedSends = 0;
-  fmax = 220000000;
+  fmax = 215000000;
   if (boardsX == "" || boardsY == "") {
     boardsX = 3;
     boardsY = 2;
diff --git a/config.py b/config.py
index 6faca950..6500be58 100755
--- a/config.py
+++ b/config.py
@@ -188,7 +188,7 @@ def quoted(s): return "'\"" + s + "\"'"
 p["UseCustomAccelerator"] = False
 
 # Clock frequency (in MHz)
-p["ClockFreq"] = 220
+p["ClockFreq"] = 215
 
 #==============================================================================
 # Derived Parameters
diff --git a/de5/S5_DDR3_QSYS.qsys b/de5/S5_DDR3_QSYS.qsys
index dc87cb4b..4d8e3a49 100644
--- a/de5/S5_DDR3_QSYS.qsys
+++ b/de5/S5_DDR3_QSYS.qsys
@@ -891,7 +891,7 @@
   <parameter name="MEM_CK_PHASE" value="0.0" />
   <parameter name="MEM_CK_WIDTH" value="1" />
   <parameter name="MEM_CLK_EN_WIDTH" value="1" />
-  <parameter name="MEM_CLK_FREQ" value="440.0" />
+  <parameter name="MEM_CLK_FREQ" value="430.0" />
   <parameter name="MEM_CLK_FREQ_MAX" value="800.0" />
   <parameter name="MEM_COL_ADDR_WIDTH" value="10" />
   <parameter name="MEM_CS_WIDTH" value="1" />
@@ -1214,7 +1214,7 @@
   <parameter name="MEM_CK_PHASE" value="0.0" />
   <parameter name="MEM_CK_WIDTH" value="1" />
   <parameter name="MEM_CLK_EN_WIDTH" value="1" />
-  <parameter name="MEM_CLK_FREQ" value="440.0" />
+  <parameter name="MEM_CLK_FREQ" value="430.0" />
   <parameter name="MEM_CLK_FREQ_MAX" value="800.0" />
   <parameter name="MEM_COL_ADDR_WIDTH" value="10" />
   <parameter name="MEM_CS_WIDTH" value="1" />

From b8ad3b4f85bec7b32bcf07b931922522fac45b34 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Thu, 7 May 2020 13:25:16 +0100
Subject: [PATCH 55/78] 2nd attempt at optional extra send slot

---
 README.md              |  7 ++++-
 hostlink/DebugLink.cpp | 14 ++++++----
 hostlink/DebugLink.h   |  9 +++++-
 hostlink/HostLink.cpp  | 24 ++++++++++++----
 hostlink/HostLink.h    | 13 ++++++++-
 include/tinsel.h       | 12 ++++++++
 rtl/DE5Top.bsv         |  7 +++++
 rtl/DebugLink.bsv      | 35 +++++++++++++++++------
 rtl/GenInit.sh         | 19 -------------
 rtl/Mailbox.bsv        | 63 ++++++++++++++++++++++++++++++++++--------
 rtl/Makefile           | 11 ++------
 11 files changed, 152 insertions(+), 62 deletions(-)
 delete mode 100755 rtl/GenInit.sh

diff --git a/README.md b/README.md
index c148323a..acc9d10c 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ New section on programmable routers:
 New performance counters accessible from core zero on each board only:
   * `ProgRouterSent` and `ProgRouterSentInterBoard`
 
-Document the following:
+Document the following new perf counters:
 
 ```c++
 // Performance counter: number of messages emitted by ProgRouter
@@ -36,6 +36,11 @@ INLINE uint32_t tinselProgRouterSent();
 INLINE uint32_t tinselProgRouterSentInterBoard();
 ```
 
+Document extra send slot option:
+
+  * `HostLinkParams`, `DebugLinkParams`
+  * `tinselSendSlotExtra()`
+
 # Tinsel 0.7.1
 
 Tinsel is a [RISC-V](https://riscv.org/)-based manythread
diff --git a/hostlink/DebugLink.cpp b/hostlink/DebugLink.cpp
index f838441d..0031969c 100644
--- a/hostlink/DebugLink.cpp
+++ b/hostlink/DebugLink.cpp
@@ -60,10 +60,10 @@ void DebugLink::putPacket(int x, int y, BoardCtrlPkt* pkt)
 }
 
 // Constructor
-DebugLink::DebugLink(uint32_t numBoxesX, uint32_t numBoxesY)
+DebugLink::DebugLink(DebugLinkParams p)
 {
-  boxMeshXLen = numBoxesX;
-  boxMeshYLen = numBoxesY;
+  boxMeshXLen = p.numBoxesX;
+  boxMeshYLen = p.numBoxesY;
   get_tryNextX = 0;
   get_tryNextY = 0;
 
@@ -105,11 +105,11 @@ DebugLink::DebugLink(uint32_t numBoxesX, uint32_t numBoxesY)
                     "But is has a box X coordinate of %i\n", thisBoxX);
     exit(EXIT_FAILURE);
   }
-  if ((thisBoxX+numBoxesX-1) >= TinselBoxMeshXLen ||
-      (thisBoxY+numBoxesY-1) >= TinselBoxMeshYLen) {
+  if ((thisBoxX+p.numBoxesX-1) >= TinselBoxMeshXLen ||
+      (thisBoxY+p.numBoxesY-1) >= TinselBoxMeshYLen) {
     fprintf(stderr, "Requested box sub-mesh of size %ix%i "
                     "is not valid from box %s\n",
-                    numBoxesX, numBoxesY, hostname);
+                    p.numBoxesX, p.numBoxesY, hostname);
     exit(EXIT_FAILURE);
   }
 
@@ -187,6 +187,8 @@ DebugLink::DebugLink(uint32_t numBoxesX, uint32_t numBoxesY)
       if (y == 0) pkt.payload[2] |= 2;
       if (thisBoxX == 0 && boxMeshXLen == 1) pkt.payload[2] |= 4;
       if (thisBoxX == 1 && boxMeshXLen == 1) pkt.payload[2] |= 8;
+      // Reserve extra send slot?
+      pkt.payload[2] |= p.useExtraSendSlot ? 0x10 : 0;
       // Send commands to each board
       for (int b = 0; b < TinselBoardsPerBox; b++) {
         pkt.linkId = b;
diff --git a/hostlink/DebugLink.h b/hostlink/DebugLink.h
index fd3c8291..18d352dc 100644
--- a/hostlink/DebugLink.h
+++ b/hostlink/DebugLink.h
@@ -8,6 +8,13 @@
 #include "BoardCtrl.h"
 #include "DebugLinkFormat.h"
 
+// DebugLink parameters
+struct DebugLinkParams {
+  uint32_t numBoxesX;
+  uint32_t numBoxesY;
+  bool useExtraSendSlot;
+};
+
 class DebugLink {
 
   // Location of this box with full box mesh
@@ -46,7 +53,7 @@ class DebugLink {
   int meshYLen;
 
   // Constructor
-  DebugLink(uint32_t numBoxesX, uint32_t numBoxesY);
+  DebugLink(DebugLinkParams params);
 
   // On given board, set destination core and thread
   void setDest(uint32_t boardX, uint32_t boardY,
diff --git a/hostlink/HostLink.cpp b/hostlink/HostLink.cpp
index 4708457e..725c673b 100644
--- a/hostlink/HostLink.cpp
+++ b/hostlink/HostLink.cpp
@@ -60,9 +60,11 @@ static int connectToPCIeStream(const char* socketPath)
 }
 
 // Internal constructor
-void HostLink::constructor(uint32_t numBoxesX, uint32_t numBoxesY)
+void HostLink::constructor(HostLinkParams p)
 {
-  if (numBoxesX > TinselBoxMeshXLen || numBoxesY > TinselBoxMeshYLen) {
+  useExtraSendSlot = p.useExtraSendSlot;
+
+  if (p.numBoxesX > TinselBoxMeshXLen || p.numBoxesY > TinselBoxMeshYLen) {
     fprintf(stderr, "Number of boxes requested exceeds those available\n");
     exit(EXIT_FAILURE);
   }
@@ -92,7 +94,11 @@ void HostLink::constructor(uint32_t numBoxesX, uint32_t numBoxesY)
   #endif
 
   // Create DebugLink
-  debugLink = new DebugLink(numBoxesX, numBoxesY);
+  DebugLinkParams debugLinkParams;
+  debugLinkParams.numBoxesX = p.numBoxesX;
+  debugLinkParams.numBoxesY = p.numBoxesY;
+  debugLinkParams.useExtraSendSlot = p.useExtraSendSlot;
+  debugLink = new DebugLink(debugLinkParams);
 
   // Set board mesh dimensions
   meshXLen = debugLink->meshXLen;
@@ -145,12 +151,20 @@ HostLink::HostLink()
   int x = str ? atoi(str) : 1;
   str = getenv("HOSTLINK_BOXES_Y");
   int y = str ? atoi(str) : 1;
-  constructor(x, y);
+  HostLinkParams params;
+  params.numBoxesX = x;
+  params.numBoxesY = y;
+  params.useExtraSendSlot = false;
+  constructor(params);
 }
 
 HostLink::HostLink(uint32_t numBoxesX, uint32_t numBoxesY)
 {
-  constructor(numBoxesX, numBoxesY);
+  HostLinkParams params;
+  params.numBoxesX = numBoxesX;
+  params.numBoxesY = numBoxesY;
+  params.useExtraSendSlot = false;
+  constructor(params);
 }
 
 // Destructor
diff --git a/hostlink/HostLink.h b/hostlink/HostLink.h
index f6a7a71c..41d78303 100644
--- a/hostlink/HostLink.h
+++ b/hostlink/HostLink.h
@@ -16,6 +16,13 @@
 #define PCIESTREAM      "pciestream"
 #define PCIESTREAM_SIM  "tinsel.b-1.1"
 
+// HostLink parameters
+struct HostLinkParams {
+  uint32_t numBoxesX;
+  uint32_t numBoxesY;
+  bool useExtraSendSlot;
+};
+
 class HostLink {
   // Lock file for acquring exclusive access to PCIeStream
   int lockFile;
@@ -33,8 +40,11 @@ class HostLink {
   char* sendBuffer;
   int sendBufferLen;
 
+  // Request an extra send slot when bringing up Tinsel FPGAs
+  bool useExtraSendSlot;
+
   // Internal constructor
-  void constructor(uint32_t numBoxesX, uint32_t numBoxesY);
+  void constructor(HostLinkParams params);
 
   // Internal helper for sending messages
   bool sendHelper(uint32_t dest, uint32_t numFlits, void* payload,
@@ -47,6 +57,7 @@ class HostLink {
   // Constructors
   HostLink();
   HostLink(uint32_t numBoxesX, uint32_t numBoxesY);
+  HostLink(HostLinkParams params);
 
   // Destructor
   ~HostLink();
diff --git a/include/tinsel.h b/include/tinsel.h
index d06e2bfd..ab29fbae 100644
--- a/include/tinsel.h
+++ b/include/tinsel.h
@@ -129,6 +129,18 @@ INLINE volatile void* tinselSendSlot()
   return mb_scratchpad_base + (threadId << TinselLogBytesPerMsg);
 }
 
+// Get pointer to thread's extra message slot reserved for sending
+// (Assumes that HostLink has requested the extra slot)
+INLINE volatile void* tinselSendSlotExtra()
+{
+  volatile char* mb_scratchpad_base =
+    (volatile char*) (1 << TinselLogBytesPerMailbox);
+  uint32_t threadId = tinselId() &
+    ((1<<TinselLogThreadsPerMailbox) - 1);
+  return mb_scratchpad_base +
+           ((TinselThreadsPerMailbox+threadId) << TinselLogBytesPerMsg);
+}
+
 // Determine if calling thread can send a message
 INLINE int tinselCanSend()
 {
diff --git a/rtl/DE5Top.bsv b/rtl/DE5Top.bsv
index b522284d..bb35bc19 100644
--- a/rtl/DE5Top.bsv
+++ b/rtl/DE5Top.bsv
@@ -148,6 +148,13 @@ module de5Top (DE5Top);
     for (Integer x = 0; x < `MailboxMeshXLen; x=x+1)
       mailboxes[y][x] <- mkMailboxAcc(debugLink.getBoardId(), x, y);
 
+  // Initialise mailbox send slots
+  rule initSendSlots;
+    for (Integer y = 0; y < `MailboxMeshYLen; y=y+1)
+      for (Integer x = 0; x < `MailboxMeshXLen; x=x+1)
+        mailboxes[y][x].initSendSlots(debugLink.useExtraSendSlot);
+  endrule
+
   // Connect cores to mailboxes
   for (Integer y = 0; y < `MailboxMeshYLen; y=y+1)
     for (Integer x = 0; x < `MailboxMeshXLen; x=x+1) begin
diff --git a/rtl/DebugLink.bsv b/rtl/DebugLink.bsv
index 676696e7..a09236b5 100644
--- a/rtl/DebugLink.bsv
+++ b/rtl/DebugLink.bsv
@@ -13,16 +13,18 @@ package DebugLink;
 // Commands sent from the host PC to DebugLink typically consist of a
 // few bytes over the JTAG UART.
 //
-//   QueryIn: tag (1 byte), board offset (1 byte), edge disable (1 byte)
-//   -------------------------------------------------------------------
+//   QueryIn: tag (1 byte), board offset (1 byte), config (1 byte)
+//   -------------------------------------------------------------
 //
 //   Sets the X offset (offset[3:0]) and the Y offset (offset[7:4])
 //   of the board id (to support multiple boxes).
 //   Disable the specified inter-FPGA links:
-//     * disable[0]: disable links on north side of box
-//     * disable[1]: disable links on south side of box
-//     * disable[2]: disable links on east side of box
-//     * disable[3]: disable links on west side of box
+//     * config[0]: disable links on north side of box
+//     * config[1]: disable links on south side of box
+//     * config[2]: disable links on east side of box
+//     * config[3]: disable links on west side of box
+//   Enable extra send slot:
+//     * config[4]: reserve extra send slot
 //   Responds with a QueryOut (see below).
 //
 //   SetDest: tag (1 byte), thread id (1 byte), core id (1 byte)
@@ -202,9 +204,13 @@ interface DebugLink;
   // Get board id via DebugLink
   (* always_ready, always_enabled *)
   method BoardId getBoardId();
-  // Optionally disable each inter-FPGA link via DebugLink
+  // Config option: disable each inter-FPGA link via DebugLink
+  // (Allows sandboxing of boxes or groups of boxes)
   (* always_ready, always_enabled *)
   method Vector#(4, Bool) linkEnable;
+  // Config option: reserve extra send slot per thread in mailbox
+  (* always_ready, always_enabled *)
+  method Option#(Bool) useExtraSendSlot;
 endinterface
 
 module mkDebugLink#(
@@ -224,6 +230,11 @@ module mkDebugLink#(
   // (Initially, all disabled)
   Reg#(Vector#(4, Bool)) linkEnableReg <- mkConfigReg(replicate(False));
 
+  // Config option: reserve extra send slot in mailbox?
+  // Use a chain of registers to aid propagation on chip
+  Vector#(3, Reg#(Option#(Bool))) useExtraSendSlotReg <-
+     replicateM(mkConfigReg(Option {valid : False, value: False}));
+
   // Ports
   InPort#(Bit#(8)) fromJtag <- mkInPort;
   OutPort#(Bit#(8)) toJtag <- mkOutPort;
@@ -331,6 +342,9 @@ module mkDebugLink#(
         // Disable west link?
         if (x == 0 && edgeEn[3] == 1) linkEn[3] = False;
         linkEnableReg <= linkEn;
+        // Reserve extra send slot?
+        useExtraSendSlotReg[2] <=
+          Option {valid: True, value: fromJtag.value[4] == 1};
         respondFlag <= True;
         respondCmd <= cmdQueryIn;
         recvState <= 0;
@@ -404,6 +418,11 @@ module mkDebugLink#(
     end
   endrule
 
+  // Propagate extra send slot option through chain of registers (for timing)
+  rule chain;
+    for (Integer i = 0; i < 2; i=i+1)
+      useExtraSendSlotReg[i] <= useExtraSendSlotReg[i+1];
+  endrule
 
   `ifndef SIMULATE
   interface jtagAvalon = uart.jtagAvalon;
@@ -411,7 +430,7 @@ module mkDebugLink#(
 
   method BoardId getBoardId() = boardId;
   method Vector#(4, Bool) linkEnable = linkEnableReg;
-
+  method Option#(Bool) useExtraSendSlot = useExtraSendSlotReg[0];
 endmodule
 
 endpackage
diff --git a/rtl/GenInit.sh b/rtl/GenInit.sh
deleted file mode 100755
index ad2a6e0c..00000000
--- a/rtl/GenInit.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-
-# Generate memory initialisation files
-
-# Load config parameters
-while read -r EXPORT; do
-  eval $EXPORT
-done <<< `python ../config.py envs`
-
-MaxSlot=$(((2**LogMsgsPerMailbox) - 1))
-ThreadsPerMailbox=$((2**$LogThreadsPerMailbox))
-
-# Emit hex file
-for I in $(seq $ThreadsPerMailbox $MaxSlot); do
-  printf "%x\n" $I
-done >> FreeSlots.hex
-
-# Emit MIF file
-../bin/hex-to-mif.py FreeSlots.hex $LogMsgsPerMailbox > ../de5/FreeSlots.mif
diff --git a/rtl/Mailbox.bsv b/rtl/Mailbox.bsv
index 0398b0e2..e08b1b9a 100644
--- a/rtl/Mailbox.bsv
+++ b/rtl/Mailbox.bsv
@@ -260,6 +260,9 @@ interface Mailbox;
   (* always_ready *) method Bit#(1) freeDone;
   // Network-side interface
   interface MailboxNet            net;
+  // Initialise send slots (use extra send slot?)
+  (* always_ready, always_enabled *)
+  method Action initSendSlots(Option#(Bool) useExtraSendSlot);
 endinterface
 
 // Combined receive request/response interface
@@ -292,6 +295,45 @@ module mkMailbox (Mailbox);
   Vector#(`CoresPerMailbox, InPort#(ReceiveReq)) rxReqPorts <-
     replicateM(mkInPort);
 
+  // Initialise free slots
+  // =====================
+
+  // Set of currently-unused message slots
+  // By default, the first ThreadsPerMailbox slots are reserved for sending
+  // Optionally, the first 2*ThreadsPerMailbox slots are reserved for sending
+  SizedQueue#(`LogMsgsPerMailbox, Bit#(`LogMsgsPerMailbox))
+    freeSlots <- mkUGSizedQueuePrefetch;
+
+  // Reserve extra send slot?
+  Wire#(Option#(Bool)) useExtraSendSlot <- mkBypassWire;
+
+  // State of free slot initialiser
+  Reg#(Bit#(1)) freeSlotsInitState <- mkConfigReg(0);
+
+  // Have the free slots been initialised yet?
+  Reg#(Bool) freeSlotsInitDone <- mkConfigReg(False);
+
+  // Next slot to insert into free slot queue
+  Reg#(Bit#(`LogMsgsPerMailbox)) freeSlotsInitNext <- mkConfigRegU;
+
+  // Wait until config option available, which tells us how
+  // many slots to reserve for sending
+  rule initFreeSlots0 (freeSlotsInitState == 0);
+    if (useExtraSendSlot.valid) begin
+      freeSlotsInitNext <= useExtraSendSlot.value ?
+        fromInteger(2*`ThreadsPerMailbox) : `ThreadsPerMailbox;
+      freeSlotsInitState <= 1;
+    end
+  endrule
+
+  // Initialise free slots
+  rule initFreeSlots1 (!freeSlotsInitDone && freeSlotsInitState == 1);
+    freeSlots.enq(freeSlotsInitNext);
+    freeSlotsInitNext <= freeSlotsInitNext + 1;
+    if (freeSlotsInitNext == fromInteger(2**`LogMsgsPerMailbox - 1))
+      freeSlotsInitDone <= True;
+  endrule
+
   // Message access unit
   // ===================
 
@@ -336,15 +378,6 @@ module mkMailbox (Mailbox);
   Reg#(RefCount) refCountReg <- mkConfigRegU;
   Reg#(Bit#(`LogMsgsPerMailbox)) refCountSlot <- mkConfigRegU;
 
-  // Set of currently-unused message slots
-  // (The first ThreadsPerMailbox slots are reserved for sending)
-  QueueOpts freeSlotsOpts;
-  freeSlotsOpts.style = "AUTO";
-  freeSlotsOpts.size = 2**`LogMsgsPerMailbox - `ThreadsPerMailbox;
-  freeSlotsOpts.file = Valid("FreeSlots");
-  SizedQueue#(`LogMsgsPerMailbox, Bit#(`LogMsgsPerMailbox))
-    freeSlots <- mkUGSizedQueuePrefetchOpts(freeSlotsOpts);
-
   // Multicast buffer
   Vector#(`CoresPerMailbox,
     SizedQueue#(`LogMulticastBufferSize, MulticastBufferEntry))
@@ -598,7 +631,7 @@ module mkMailbox (Mailbox);
   // to a message slot is freed
   Reg#(Bit#(1)) freeDoneReg <- mkDReg(0);
 
-  rule free (freeReqPort.canGet);
+  rule free (freeReqPort.canGet && freeSlotsInitDone);
     FreeReq req = freeReqPort.value;
     // Process request in two cycles
     let count = refCount.dataOutB;
@@ -667,6 +700,10 @@ module mkMailbox (Mailbox);
     endinterface
   endinterface
 
+  method Action initSendSlots(Option#(Bool) useExtra);
+    useExtraSendSlot <= useExtra;
+  endmethod
+
 endmodule
 
 // =============================================================================
@@ -1138,14 +1175,16 @@ import "BVI" ExternalTinselAccelerator =
 
 `ifndef UseCustomAccelerator
 
-module mkMailboxAcc#(BoardId boardId, Integer tileX, Integer tileY) (Mailbox);
+module mkMailboxAcc#(BoardId boardId,
+         Integer tileX, Integer tileY) (Mailbox);
   Mailbox mbox <- mkMailbox;
   return mbox;
 endmodule
 
 `else
 
-module mkMailboxAcc#(BoardId boardId, Integer tileX, Integer tileY) (Mailbox);
+module mkMailboxAcc#(BoardId boardId,
+         Integer tileX, Integer tileY) (Mailbox);
   // Instantiate standard mailbox
   Mailbox mbox <- mkMailbox;
 
diff --git a/rtl/Makefile b/rtl/Makefile
index 57a2acf8..e938b015 100644
--- a/rtl/Makefile
+++ b/rtl/Makefile
@@ -28,13 +28,13 @@ sim: $(TOPMOD) $(HOSTTOPMOD)
 .PHONY: verilog
 verilog: $(TOPMOD).v $(HOSTTOPMOD).v
 
-$(TOPMOD): *.bsv *.c InstrMem.hex FreeSlots.hex
+$(TOPMOD): *.bsv *.c InstrMem.hex
 	make -C $(TINSEL_ROOT)/apps/boot
 	make -C $(TINSEL_ROOT)/hostlink udsock
 	$(BSC) $(BSCFLAGS) $(DEFS) -D SIMULATE -sim -g $(TOPMOD) -u $(TOPFILE)
 	$(BSC) $(BSCFLAGS) -sim -o $(TOPMOD) -e $(TOPMOD) *.c
 
-$(TOPMOD).v: *.bsv $(QP)/InstrMem.mif $(QP)/FreeSlots.mif
+$(TOPMOD).v: *.bsv $(QP)/InstrMem.mif
 	make -C $(TINSEL_ROOT)/apps/boot
 	$(BSC) $(BSCFLAGS) -opt-undetermined-vals -unspecified-to X \
          $(DEFS) -u -verilog -g $(TOPMOD) $(TOPFILE)
@@ -63,12 +63,6 @@ InstrMem.hex:
 $(QP)/InstrMem.mif:
 	make -C $(TINSEL_ROOT)/apps/boot
 
-FreeSlots.hex: GenInit.sh
-	./GenInit.sh
-
-$(QP)/FreeSlots.mif: GenInit.sh
-	./GenInit.sh
-
 .PHONY: test-mem
 test-mem: testMem
 
@@ -83,7 +77,6 @@ clean:
 	rm -f de5Top.v mkCore.v mkDCache.v mkMailbox.v mkDebugLinkRouter.v
 	rm -f mkFPU.v mkMeshRouter.v
 	rm -f de5BridgeTop.v
-	rm -f FreeSlots.hex ../de5/FreeSlots.mif
 	rm -rf test-mem-log
 	rm -rf test-mailbox-log
 	rm -rf test-array-of-queue-log

From a49ee33b42620e75d6c85a4898886be6674f9a72 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Thu, 7 May 2020 14:08:08 +0100
Subject: [PATCH 56/78] Forgot to add new HostLink constructor

---
 hostlink/HostLink.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/hostlink/HostLink.cpp b/hostlink/HostLink.cpp
index 725c673b..dd896f4d 100644
--- a/hostlink/HostLink.cpp
+++ b/hostlink/HostLink.cpp
@@ -167,6 +167,11 @@ HostLink::HostLink(uint32_t numBoxesX, uint32_t numBoxesY)
   constructor(params);
 }
 
+HostLink::HostLink(HostLinkParams params)
+{
+  constructor(params);
+}
+
 // Destructor
 HostLink::~HostLink()
 {

From 590cb373b32f68b157b60fb885fdcab2b2d1733a Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Sun, 10 May 2020 11:09:53 +0100
Subject: [PATCH 57/78] Full throughput on ProgRouter side of ram req tree

---
 rtl/Connections.bsv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rtl/Connections.bsv b/rtl/Connections.bsv
index d093001b..7f542acc 100644
--- a/rtl/Connections.bsv
+++ b/rtl/Connections.bsv
@@ -84,7 +84,7 @@ module connectClientsToOffChipRAM#(
     mkMergeTreeB(Fair,
       mkUGShiftQueue1(QueueOptFmax),
       routerReqs);
-  Queue1#(DRAMReq) fetcherReqsQueue <- mkUGShiftQueue1(QueueOptFmax);
+  Queue#(DRAMReq) fetcherReqsQueue <- mkUGQueue;
   connectToQueue(fetcherReqs, fetcherReqsQueue);
   BOut#(DRAMReq) fetcherReqsB = queueToBOut(fetcherReqsQueue);
 

From 78842a7e6fde4141874c48f4d916c6947c7c9e25 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Mon, 11 May 2020 14:09:01 +0100
Subject: [PATCH 58/78] Handle 0-size routing keys properly

---
 include/POLite/ProgRouters.h | 2 +-
 rtl/ProgRouter.bsv           | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/POLite/ProgRouters.h b/include/POLite/ProgRouters.h
index a1c5942d..9890c43e 100644
--- a/include/POLite/ProgRouters.h
+++ b/include/POLite/ProgRouters.h
@@ -284,7 +284,7 @@ class ProgRouterMesh {
   // Returns routing key
   uint32_t addDestsFromBoardXY(uint32_t senderX, uint32_t senderY,
                                  Seq<PRoutingDest>* dests) {
-    assert(dests->numElems > 0);
+    if (dests->numElems == 0) return 0;
 
     // Categorise dests into local, N, S, E, and W groups
     Seq<PRoutingDest> local(dests->numElems);
diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index e8a7d97c..9570ff09 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -432,6 +432,7 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
         // Ignore keys with zero beats
         if (key.numBeats == 0) begin
           consumeState <= 0;
+          incReceivedReg <= 1;
         end else begin
           consumeState <= 2;
           // Claim chosen slot

From b6f98ad5a8ea4a7574c8d363869b11aa6d8c9c8d Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mn416@jennings.cl.cam.ac.uk>
Date: Mon, 11 May 2020 14:56:15 +0000
Subject: [PATCH 59/78] Faster edge list reader

---
 include/EdgeList.h | 43 +++++++++++++++++++------------------------
 1 file changed, 19 insertions(+), 24 deletions(-)

diff --git a/include/EdgeList.h b/include/EdgeList.h
index efb65c73..a4bca0fe 100644
--- a/include/EdgeList.h
+++ b/include/EdgeList.h
@@ -3,8 +3,11 @@
 #define _NETWORK_H_
 
 #include <stdio.h>
-#include <stdlib.h>
 #include <stdint.h>
+#include <assert.h>
+#include <iostream>
+#include <fstream>
+#include <vector>
 
 struct EdgeList {
   // Number of nodes and edges
@@ -18,50 +21,42 @@ struct EdgeList {
   // Read network from file
   void read(const char* filename, bool warn = true)
   {
-    // Read edges
-    FILE* fp = fopen(filename, "rt");
-    if (fp == NULL) {
-      fprintf(stderr, "Can't open '%s'\n", filename);
-      exit(EXIT_FAILURE);
-    }
+    std::fstream file(filename, std::ios_base::in);
+    std::vector<uint32_t> vec;
 
     // Count number of nodes and edges
     numEdges = 0;
     numNodes = 0;
-    int ret;
-    while (1) {
-      uint32_t src, dst;
-      ret = fscanf(fp, "%d %d", &src, &dst);
-      if (ret == EOF) break;
+    uint32_t numInts = 0;
+    uint32_t val;
+    while (file >> val) {
+      vec.push_back(val);
+      numNodes = val >= numNodes ? val+1 : numNodes;
       numEdges++;
-      numNodes = src >= numNodes ? src+1 : numNodes;
-      numNodes = dst >= numNodes ? dst+1 : numNodes;
     }
-    rewind(fp);
+    assert((numEdges&1) == 0);
+    numEdges >>= 1;
 
     uint32_t* count = (uint32_t*) calloc(numNodes, sizeof(uint32_t));
-    for (int i = 0; i < numEdges; i++) {
-      uint32_t src, dst;
-      ret = fscanf(fp, "%d %d", &src, &dst);
-      count[src]++;
+    for (int i = 0; i < vec.size(); i+=2) {
+      count[vec[i]]++;
     }
 
     // Create mapping from node id to neighbours
     neighbours = (uint32_t**) calloc(numNodes, sizeof(uint32_t*));
-    rewind(fp);
     for (int i = 0; i < numNodes; i++) {
       neighbours[i] = (uint32_t*) calloc(count[i]+1, sizeof(uint32_t));
       neighbours[i][0] = count[i];
     }
-    for (int i = 0; i < numEdges; i++) {
-      uint32_t src, dst;
-      ret = fscanf(fp, "%d %d", &src, &dst);
+    for (int i = 0; i < vec.size(); i+=2) {
+      uint32_t src = vec[i];
+      uint32_t dst = vec[i+1];
       neighbours[src][count[src]--] = dst;
     }
  
     // Release
     free(count);
-    fclose(fp);
+    file.close();
 
     if (warn && minFanOut() == 0) {
       printf("Warning: some vertices have no outgoing edges and\n");

From 35dd55ad555be2bd544f8d17384d24bb3562b1a8 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mn416@jennings.cl.cam.ac.uk>
Date: Mon, 11 May 2020 15:46:28 +0000
Subject: [PATCH 60/78] Add direct-mapped placer

---
 include/POLite/Placer.h | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/include/POLite/Placer.h b/include/POLite/Placer.h
index 1468f0c7..4178af50 100644
--- a/include/POLite/Placer.h
+++ b/include/POLite/Placer.h
@@ -14,7 +14,8 @@ struct Placer {
   enum Method {
     Default,
     Metis,
-    Random
+    Random,
+    Direct
   };
   const Method defaultMethod=Metis;
 
@@ -61,6 +62,8 @@ struct Placer {
         method=Metis;
       else if (!strcmp(e, "random"))
         method=Random;
+      else if (!strcmp(e, "direct"))
+        method=Direct;
       else if (!strcmp(e, "default") || *e == '\0')
         method=Default;
       else {
@@ -159,6 +162,18 @@ struct Placer {
     }
   }
 
+  // Partition the graph using direct mapping
+  void partitionDirect() {
+    uint32_t numVertices = graph->incoming->numElems;
+    uint32_t numParts = width * height;
+    uint32_t partSize = (numVertices + numParts) / numParts;
+
+    // Populate result array
+    for (uint32_t i = 0; i < numVertices; i++) {
+      partitions[i] = i / partSize;
+    }
+  }
+
   void partition()
   {
     switch(method){
@@ -169,6 +184,9 @@ struct Placer {
     case Random:
       partitionRandom();
       break;
+    case Direct:
+      partitionDirect();
+      break;
     }
   }
 

From 8f4d197e980cbdcf31bcffd5c8f77b4eca7f87d8 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Thu, 14 May 2020 10:41:45 +0100
Subject: [PATCH 61/78] Bit of work on the docs

---
 README.md                        | 127 +++++++++++------
 doc/PIP-0024-global-multicast.md | 226 +++++++++++++++++++++++++++++++
 2 files changed, 312 insertions(+), 41 deletions(-)
 create mode 100644 doc/PIP-0024-global-multicast.md

diff --git a/README.md b/README.md
index acc9d10c..502a2e29 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ TODO, document the following:
 // ==========
 
 // Send message at addr using given routing key 
-inline void tinselKeySend(int key, volatile void* addr);
+inline void tinselKeySend(uint32_t key, volatile void* addr);
 
 // HostLink API
 // ============
@@ -36,12 +36,7 @@ INLINE uint32_t tinselProgRouterSent();
 INLINE uint32_t tinselProgRouterSentInterBoard();
 ```
 
-Document extra send slot option:
-
-  * `HostLinkParams`, `DebugLinkParams`
-  * `tinselSendSlotExtra()`
-
-# Tinsel 0.7.1
+# Tinsel 0.8
 
 Tinsel is a [RISC-V](https://riscv.org/)-based manythread
 message-passing architecture designed for FPGA clusters.  It is being
@@ -76,8 +71,12 @@ Released on 11 Apr 2019 and maintained in the
 (Multi-box cluster.)
 * [v0.7](https://github.com/POETSII/tinsel/releases/tag/v0.7):
 Released on 2 Dec 2019 and maintained in the
+[tinsel-0.7.1 branch](https://github.com/POETSII/tinsel/tree/tinsel-0.7.1).
+(Local hardware multicast.)
+* [v0.8](https://github.com/POETSII/tinsel/releases/tag/v0.8):
+Released on 18 May 2020 and maintained in the
 [master branch](https://github.com/POETSII/tinsel/).
-(Localised hardware multicast.)
+(Global hardware multicast.)
 
 ## Contents
 
@@ -87,8 +86,9 @@ Released on 2 Dec 2019 and maintained in the
 * [4. Tinsel Cache](#4-tinsel-cache)
 * [5. Tinsel Mailbox](#5-tinsel-mailbox)
 * [6. Tinsel Network](#6-tinsel-network)
-* [7. Tinsel HostLink](#7-tinsel-hostlink)
-* [8. POLite API](#8-polite-api)
+* [7. Tinsel Router](#7-tinsel-router)
+* [8. Tinsel HostLink](#8-tinsel-hostlink)
+* [9. POLite API](#9-polite-api)
 
 ## Appendices
 
@@ -135,11 +135,16 @@ demands, but fairly modest compute requrements.  The main features are:
     time step, or termination of the application, supporting
     both synchronous and asynchronous event-driven systems.
 
-  * **Localised hardware multicast**.  Threads can send a message to
-    multiple colocated destination threads simultaneously, greatly reducing
+  * **Local hardware multicast**.  Threads can send a message to
+    multiple collocated destination threads simultaneously, greatly reducing
     the number of inter-thread messages in applications exhibiting good
     locality of communication.
 
+  * **Global hardware multicast**.  Programmable routers
+    automatically propagate messages to any number of destination
+    threads distributed throughout the cluster, minimising inter-FPGA
+    bandwidth usage for distributed fanouts.
+
   * **Host communication**. Tinsel threads communicate with x86
     machines distributed throughout the FPGA cluster, for command and
     control, via PCI Express and USB.
@@ -148,7 +153,7 @@ demands, but fairly modest compute requrements.  The main features are:
     include custom accelerators written in SystemVerilog.
 
 This repository also includes a prototype high-level vertex-centric
-programming API for Tinsel, called [POLite](#8-polite-api).
+programming API for Tinsel, called [POLite](#9-polite-api).
 
 ## 2. High-Level Structure
 
@@ -175,11 +180,13 @@ accelerators](doc/custom) in tiles.
 
 #### Tinsel FPGA
 
-Each FPGA contains two *Tinsel Slices*, with each slice typically
+Each FPGA contains two *Tinsel Slices*, with each slice by default
 comprising eight tiles connected to one 4GB DDR3 DIMM and two 8MB
 QDRII+ SRAMs.  All tiles are connected together via a routers to form
 a 2D NoC.  The NoC is connected to the inter-FPGA links using a
-per-board router.
+*per-board programmable router*.  Note that the per-board router also
+has connections to off-chip memory: this is where the programmable
+routing tables are stored.
 
 <img align="center" src="doc/figures/fpga.png">
 
@@ -460,16 +467,22 @@ has reached the destination or none of it has.  As one would expect,
 shorter messages consume less bandwidth than longer ones.  The size of
 a flit is defined by `LogWordsPerFlit`.
 
-At the heart of a mailbox is a memory-mapped *scratchpad* that
-stores both incoming and outgoing messages.  The capacity of the
-scratchpad is defined by `LogMsgsPerMailbox`.  Each thread connected
-to the mailbox has one message slot reserved for sending messages.
-The address of this slot is obtained using the following Tinsel API
-call.
+At the heart of a mailbox is a memory-mapped *scratchpad* that stores
+both incoming and outgoing messages.  The capacity of the scratchpad
+is defined by `LogMsgsPerMailbox`.  Each thread connected to the
+mailbox has one or two message slots reserved for sending messages.
+(By default, only a single send slot is reserved; the extra send slot
+may be optionally reserved at power-up via a parameter to the
+[HostLink](#8-tinsel-hostlink) constructor.)  The addresses of these
+slots are obtained using the following Tinsel API calls.
 
 ```c
-// Get pointer to thread's message slot reserved for sending.
+// Get pointer to thread's message slot reserved for sending
 volatile void* tinselSendSlot();
+
+// Get pointer to thread's extra message slot reserved for sending
+// (Assumes that HostLink has requested the extra slot)
+volatile void* tinselSendSlotExtra();
 ```
 
 Once a thread has written a message to the scratchpad, it can trigger
@@ -681,7 +694,11 @@ communication.  And since we are using the links point-to-point,
 almost all of the ethernet header fields can be used for our own
 purposes, resulting in very little overhead on the wire.
 
-## 7. Tinsel HostLink
+## 7. Tinsel Router
+
+TODO
+
+## 8. Tinsel HostLink
 
 *HostLink* is the means by which Tinsel cores running on a mesh of
 FPGA boards communicate with a *host PC*.  It comprises three main
@@ -689,7 +706,7 @@ communication channels:
 
 * An FPGA *bridge board* that connects the host PC inside a POETS box
 (PCI Express) to the FPGA mesh (SFP+).  Using this high-bandwidth
-channel (10Gbps), the host PC can efficiently send messages to any
+channel (2 x 10Gbps), the host PC can efficiently send messages to any
 Tinsel thread and vice-versa.
 
 * A set of *debug links* connecting the host PC inside a POETS box to
@@ -704,34 +721,45 @@ each FPGA's *power management module* via separate USB UART cables.
 These connections can be used to power-on/power-off each FPGA and to
 monitor power consumption, temperature, and fan tachometer.
 
-HostLink supports multiple POETS boxes, but requires that one of these
-boxes is designated as the **master box**.  Currently, all messages
-are injected/extracted to/from the FPGA network via the master box's
-bridge board.
-
-A Tinsel application typically consists of two programs: one which
-runs on the RISC-V cores, linked against the [Tinsel
+HostLink allows multiple POETS boxes to be used to run an application,
+but requires that one of these boxes is designated as the **master
+box**.  A Tinsel application typically consists of two programs: one
+which runs on the RISC-V cores, linked against the [Tinsel
 API](#f-tinsel-api), and the other which runs on the host PC of the
 master box, linked against the [HostLink API](#g-hostlink-api).  The
 HostLink API is implemented as a C++ class called `HostLink`.  The
 constructor for this class first powers up all the worker FPGAs (which
-are by default powered down).  On power-up the FPGAs are automatically
-programmed using the Tinsel bit-file residing in flash memory, and are
-ready to be used within a few seconds, as soon as the `HostLink`
-constructor returns.
+are by default powered down).  On power-up, the FPGAs are
+automatically programmed using the Tinsel bit-file residing in flash
+memory, and are ready to be used within a few seconds, as soon as the
+`HostLink` constructor returns.
 
 The `HostLink` constructor is overloaded:
 
 ```cpp
 HostLink::HostLink();
 HostLink::HostLink(uint32_t numBoxesX, uint32_t numBoxesY);
+HostLink::HostLink(HostLinkParams params);
 ```
 
 If it is called without any arguments, then it assumes that a single
-box is to be used.  Alternatively, the user may request multiple
-boxes by specifying the width and height of the box sub-mesh they
-wish to use.  (The box from which the application is started is
-considered as the origin of this sub-mesh.)
+box is to be used.  Alternatively, the user may request multiple boxes
+by specifying the width and height of the box sub-mesh they wish to
+use.  (The box from which the application is started, i.e. the master
+box, is considered as the origin of this sub-mesh.)  The most
+general constructor takes a `HostLinkParams` structure as an argument,
+which allows additional options to be specified.
+
+```cpp
+// HostLink parameters
+struct HostLinkParams {
+  // Number of boxes to use (default is 1x1)
+  uint32_t numBoxesX;
+  uint32_t numBoxesY;
+  // Enable use of tinselSendSlotExtra() on threads (default is false)
+  bool useExtraSendSlot;
+};
+```
 
 HostLink methods for sending and receiving messages on the host PC are
 as follows.
@@ -937,7 +965,7 @@ not be called.  When the application returns from `main()`, all but
 one thread on each core are killed and the remaining threads reenter
 the boot loader.
 
-## 8. POLite API
+## 9. POLite API
 
 POLite is a layer of abstraction that takes care of mapping arbitrary
 task graphs onto the Tinsel overlay, completely hiding architectural
@@ -1300,6 +1328,13 @@ inline void tinselFlushLine(uint32_t lineNum, uint32_t way);
 // (A message of length n is comprised of n+1 flits)
 inline void tinselSetLen(uint32_t n);
 
+// Get pointer to thread's message slot reserved for sending
+volatile void* tinselSendSlot();
+
+// Get pointer to thread's extra message slot reserved for sending
+// (Assumes that HostLink has requested the extra slot)
+volatile void* tinselSendSlotExtra();
+
 // Determine if calling thread can send a message
 inline uint32_t tinselCanSend();
 
@@ -1518,14 +1553,24 @@ class HostLink {
   // Trigger application execution on all started threads on given core
   void goOne(uint32_t meshX, uint32_t meshY, uint32_t coreId);
 };
+
+// HostLink parameters (used by the most general HostLink constructor)
+struct HostLinkParams {
+  // Number of boxes to use (default is 1x1)
+  uint32_t numBoxesX;
+  uint32_t numBoxesY;
+  // Enable use of tinselSendSlotExtra() on threads (default is false)
+  bool useExtraSendSlot;
+};
 ```
 
 ```cpp
 class DebugLink {
  public:
 
-  // Constructor
+  // Constructors
   DebugLink(uint32_t numBoxesX, uint32_t numBoxesY);
+  DebugLink(DebugLinkParams params);
 
   // On given board, set destination core and thread
   void setDest(uint32_t boardX, uint32_t boardY,
diff --git a/doc/PIP-0024-global-multicast.md b/doc/PIP-0024-global-multicast.md
new file mode 100644
index 00000000..65105f71
--- /dev/null
+++ b/doc/PIP-0024-global-multicast.md
@@ -0,0 +1,226 @@
+# PIP-0024: Programmable routers and global multicast
+
+Author: Matthew Naylor
+
+This proposal replaces PIP 21.
+
+## Proposal
+
+We propose to generalise the destination component of a message so
+that it can be (1) a thread id; or (2) a **routing key**.  A message,
+sent by a thread, containing a routing key as a destination will go to
+a **per-board router** on the same FPGA.  The router will use the key
+as an index into a DRAM-based routing table and automatically
+propagate the message towards all the destinations associated with
+that key.
+
+## Motivation/Rationale
+
+PIP 22 resulted in a *mailbox-level* multicast feature, implemented in
+Tinsel 0.7.  It enables each thread to send a message
+simultaneously to any subset of the 64 threads on a destination
+mailbox.  It works well when graphs exhibit good locality, with
+destination vertices often collocated on the same mailbox.
+
+However, it has a few drawbacks:
+
+  1. Costly graph partitioning algorithms are needed to identify
+     locality. This is problematic for graphs with billions of edges
+     and vertices, because mapping time may significantly outweigh
+     execution time.  (Indeed, graph partitioning is itself an
+     interesting application for the hardware.)
+
+  2. In some graphs there are limits to how well destination vertices
+     can be collocated after partitioning.  For example, *small-world
+     graphs* contain some extremely large, highly-distributed fanouts.
+
+A *global multicast* feature should reduce the need to find optimal
+partitions for very large graphs, and support distributed fanouts.  It
+should also move work away from the cores and into the hardware
+routers: the softswitch no longer needs to iterate over the outgoing
+edges of a pin.  While providing these improvements, it is also
+important to maintain the advantages of the existing mailbox-level
+multicast, for applications in which the mapping time is not a
+concern.
+
+## Functional overview
+
+A **routing key** is a 32-bit value consisting of a *ram id*, an
+*address*, and a *size*:
+
+```sv
+// 32-bit routing key (MSB to LSB)
+typedef struct {
+  // Which off-chip RAM on this board?
+  Bit#(`LogDRAMsPerBoard) ram;
+  // Pointer to array of routing beats containing routing records
+  Bit#(`LogBeatsPerDRAM) ptr;
+  // Number of beats in the array
+  Bit#(`LogRoutingEntryLen) numBeats;
+} RoutingKey;
+```
+
+When a message reaches the per-board router, the `ptr` field of the
+routing key is used as an index into DRAM, where a sequence of 256-bit
+**routing beats** is found.  The `numBeats` field of the routing key
+indicates how many contiguous routing beats there are.  Knowing the
+size before the lookup makes the hardware simpler and more efficient,
+e.g. it can avoid blocking on responses and issue a burst of an
+appropriate size.  The value of `numBeats` may be zero.
+
+A routing beat consists of a *size* and a sequence of five 48-bit
+*routing chunks*:
+
+```sv
+// 256-bit routing beat (aligned, MSB to LSB)
+typedef struct {
+  // Number of routing records present in this beat
+  Bit#(16) size;
+  // Five 48-bit record chunks
+  Vector#(5, Bit#(48)) chunks;
+} RoutingBeat;
+```
+
+The *size* must lie in the range 1 to 5 inclusive (0 is disallowed).
+A **routing record** consists of one or two routing chunks, depending
+on the **record type**.
+
+All byte orderings are little endian.  For example, the order of bytes
+in a routing beat is as follows.
+
+```
+Byte  Contents
+----  --------
+31:   Upper byte of length (i.e. number of records in beat)
+30:   Lower byte of length
+29:   Upper byte of first chunk
+      ...
+24:   Lower byte of first chunk
+23:   Upper byte of second chunk
+      ...
+18:   Lower byte of second chunk
+17:   Upper byte of third chunk
+      ...
+12:   Lower byte of third chunk
+11:   Upper byte of fourth chunk
+      ...
+ 6:   Lower byte of fourth chunk
+ 5:   Upper byte of fifth chunk
+      ...
+ 0:   Lower byte of fifth chunk
+```
+
+Clearly, both routing keys and routing beats have a maximum size.
+However, in principle there is no limit to the number of records
+associated with a key, due to the possibility of *indirection records*
+(see below).
+
+There are five types of routing record, defined below.
+
+**48-bit Unicast Router-to-Mailbox (URM1).**
+
+```sv
+typedef struct {
+  // Record type (URM1 == 0)
+  Bit#(3) tag;
+  // Mailbox destination
+  Bit#(4) mbox;
+  // Mailbox-local thread identifier
+  Bit#(6) thread;
+  // Unused
+  Bit#(3) unused;
+  // Local key. The first word of the message
+  // payload is overwritten with this.
+  Bit#(32) localKey;
+} URM1Record;
+```
+
+The `localKey` can be used for anything, but might encode the
+destination thread-local device identifier, or edge identifier, or
+both.  The `mbox` field is currently 4 bits (two Y bits followed by
+two X bits), but there are spare bits available to increase the size
+of this field in future if necessary.
+
+**96-bit Unicast Router-to-Mailbox (URM2).**
+
+```sv
+typedef struct {
+  // Record type (URM2 == 1)
+  Bit#(3) tag;
+  // Mailbox destination
+  Bit#(4) mbox;
+  // Mailbox-local thread identifier
+  Bit#(6) thread;
+  // Currently unused
+  Bit#(19) unused;
+  // Local key. The first two words of the message
+  // payload are overwritten with this.
+  Bit#(64) localKey;
+} URM2Record;
+```
+
+This is the same as a URM1 record except the local key is 64-bits in
+size.
+
+**48-bit Router-to-Router (RR).**
+
+```sv
+typedef struct {
+  // Record type (RR == 2)
+  Bit#(3) tag;
+  // Direction (N,S,E,W == 0,1,2,3)
+  Bit#(2) dir;
+  // Currently unused
+  Bit#(11) unused;
+  // New 32-bit routing key that will replace the one in the
+  // current message for the next hop of the message's journey
+  Bit#(32) newKey;
+} RRRecord;
+```
+
+The `newKey` field will replace the key in the current message for the
+next hop of the message's journey.  Introducing a new key at each hop
+simplifies the mapping process (keeping it quick).
+
+**96-bit Multicast Router-to-Mailbox (MRM).**
+
+```sv
+typedef struct {
+  // Record type (MRM == 3)
+  Bit#(3) tag;
+  // Mailbox destination
+  Bit#(4) mbox;
+  // Currently unused
+  Bit#(9) unused;
+  // Local key. The least-significant half-word
+  // of the message is replaced with this
+  Bit#(16) localKey;
+  // Mailbox-local destination mask
+  Bit#(64) destMask;
+} MRMRecord;
+```
+
+**48-bit Indirection (IND).**
+
+```sv
+// 48-bit Indirection (IND) record
+// Note the restrictions on IND records:
+// 1. At most one IND record per key lookup
+// 2. A max-sized key lookup must contain an IND record
+typedef struct {
+  // Record type (IND == 4)
+  Bit#(3) tag;
+  // Currently unused
+  Bit#(13) unused;
+  // New 32-bit routing key for new set of records on current router
+  Bit#(32) newKey;
+} INDRecord;
+```
+
+Indirection records can be used to handle large fanouts, which need more
+routing beats than the size portion of the routing key can express.
+
+## Impact
+
+Since use of routing keys is optional, existing applications will
+continue to work unmodified.

From bd03ed85f2097f4d15f5adbe04f3571e488fdb01 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mn416@jennings.cl.cam.ac.uk>
Date: Thu, 14 May 2020 10:36:45 +0000
Subject: [PATCH 62/78] Relax POLite constraint on min fan out

POLite now allows empty pins, so fan outs of some vertices may be zero
---
 apps/POLite/asp-sync/Run.cpp        | 1 +
 apps/POLite/heat-sync/Run.cpp       | 1 -
 apps/POLite/izhikevich-sync/Run.cpp | 3 ++-
 apps/POLite/pagerank-sync/Run.cpp   | 4 ++--
 apps/POLite/sssp-async/Run.cpp      | 4 ++--
 apps/POLite/sssp-sync/Run.cpp       | 4 ++--
 include/EdgeList.h                  | 7 +------
 7 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/apps/POLite/asp-sync/Run.cpp b/apps/POLite/asp-sync/Run.cpp
index 3264d114..287f32db 100644
--- a/apps/POLite/asp-sync/Run.cpp
+++ b/apps/POLite/asp-sync/Run.cpp
@@ -22,6 +22,7 @@ int main(int argc, char**argv)
   
   // Print max fan-out
   printf("Max fan-out = %d\n", net.maxFanOut());
+  printf("Min fan-out = %d\n", net.minFanOut());
   assert(net.minFanOut() > 0);
 
   // Check that parameters make sense
diff --git a/apps/POLite/heat-sync/Run.cpp b/apps/POLite/heat-sync/Run.cpp
index c3db2fbf..91652712 100644
--- a/apps/POLite/heat-sync/Run.cpp
+++ b/apps/POLite/heat-sync/Run.cpp
@@ -25,7 +25,6 @@ int main(int argc, char **argv)
   // Print max fan-out
   printf("Min fan-out = %d\n", net.minFanOut());
   printf("Max fan-out = %d\n", net.maxFanOut());
-  assert(net.minFanOut() > 0);
 
   // Connection to tinsel machine
   HostLink hostLink;
diff --git a/apps/POLite/izhikevich-sync/Run.cpp b/apps/POLite/izhikevich-sync/Run.cpp
index dd1ac79e..09efb701 100644
--- a/apps/POLite/izhikevich-sync/Run.cpp
+++ b/apps/POLite/izhikevich-sync/Run.cpp
@@ -21,7 +21,8 @@ int main(int argc, char**argv)
   // Read network
   EdgeList net;
   net.read(argv[1]);
-  assert(net.minFanOut() > 0);
+  printf("Max fan-out = %d\n", net.maxFanOut());
+  printf("Min fan-out = %d\n", net.minFanOut());
 
   // Connection to tinsel machine
   HostLink hostLink;
diff --git a/apps/POLite/pagerank-sync/Run.cpp b/apps/POLite/pagerank-sync/Run.cpp
index 3ce786b5..1b0eb356 100644
--- a/apps/POLite/pagerank-sync/Run.cpp
+++ b/apps/POLite/pagerank-sync/Run.cpp
@@ -27,9 +27,9 @@ int main(int argc, char **argv)
   EdgeList net;
   net.read(argv[1]);
   printf(" done\n");
-  assert(net.minFanOut() > 0);
 
-  // Print max fan-out
+  // Print fan-out
+  printf("Min fan-out = %d\n", net.minFanOut());
   printf("Max fan-out = %d\n", net.maxFanOut());
   
   // Create nodes in POETS graph
diff --git a/apps/POLite/sssp-async/Run.cpp b/apps/POLite/sssp-async/Run.cpp
index b9c174a3..b78ccec4 100644
--- a/apps/POLite/sssp-async/Run.cpp
+++ b/apps/POLite/sssp-async/Run.cpp
@@ -20,9 +20,9 @@ int main(int argc, char**argv)
   EdgeList net;
   net.read(argv[1]);
 
-  // Print max fan-out
+  // Print fan-out
   printf("Max fan-out = %d\n", net.maxFanOut());
-  assert(net.minFanOut() > 0);
+  printf("Min fan-out = %d\n", net.minFanOut());
 
   // Connection to tinsel machine
   HostLink hostLink;
diff --git a/apps/POLite/sssp-sync/Run.cpp b/apps/POLite/sssp-sync/Run.cpp
index b9c174a3..b78ccec4 100644
--- a/apps/POLite/sssp-sync/Run.cpp
+++ b/apps/POLite/sssp-sync/Run.cpp
@@ -20,9 +20,9 @@ int main(int argc, char**argv)
   EdgeList net;
   net.read(argv[1]);
 
-  // Print max fan-out
+  // Print fan-out
   printf("Max fan-out = %d\n", net.maxFanOut());
-  assert(net.minFanOut() > 0);
+  printf("Min fan-out = %d\n", net.minFanOut());
 
   // Connection to tinsel machine
   HostLink hostLink;
diff --git a/include/EdgeList.h b/include/EdgeList.h
index a4bca0fe..ebd5d37f 100644
--- a/include/EdgeList.h
+++ b/include/EdgeList.h
@@ -19,7 +19,7 @@ struct EdgeList {
   uint32_t** neighbours;
 
   // Read network from file
-  void read(const char* filename, bool warn = true)
+  void read(const char* filename)
   {
     std::fstream file(filename, std::ios_base::in);
     std::vector<uint32_t> vec;
@@ -57,11 +57,6 @@ struct EdgeList {
     // Release
     free(count);
     file.close();
-
-    if (warn && minFanOut() == 0) {
-      printf("Warning: some vertices have no outgoing edges and\n");
-      printf("         some POLite apps do not handle this case.\n");
-    }
   }
 
   // Determine max fan-out

From 0e52f73e0cef7948b87b6320661f4ce75e1fd19f Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mn416@jennings.cl.cam.ac.uk>
Date: Thu, 14 May 2020 10:43:42 +0000
Subject: [PATCH 63/78] Tweak

---
 include/POLite/PGraph.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/POLite/PGraph.h b/include/POLite/PGraph.h
index 57b3172e..126eaa97 100644
--- a/include/POLite/PGraph.h
+++ b/include/POLite/PGraph.h
@@ -178,6 +178,10 @@ template <typename DeviceType,
 
   // Add labelled edge using given output pin
   void addLabelledEdge(E edge, PDeviceId x, PinId pin, PDeviceId y) {
+    if (pin >= POLITE_NUM_PINS) {
+      printf("addEdge: pin exceeds POLITE_NUM_PINS\n");
+      exit(EXIT_FAILURE);
+    }
     graph.addEdge(x, pin, y);
     edgeLabels.elems[x]->append(edge);
   }

From 8d8751bfa03db45c6daab889aa9800b958ca35a9 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Thu, 14 May 2020 17:37:43 +0100
Subject: [PATCH 64/78] More updates to the docs

---
 README.md | 317 ++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 262 insertions(+), 55 deletions(-)

diff --git a/README.md b/README.md
index 502a2e29..465243ca 100644
--- a/README.md
+++ b/README.md
@@ -1,41 +1,3 @@
-TODO, document the following:
-
-```c++
-// Tinsel API
-// ==========
-
-// Send message at addr using given routing key 
-inline void tinselKeySend(uint32_t key, volatile void* addr);
-
-// HostLink API
-// ============
-
-// Send a message using routing key (blocking by default)
-bool keySend(uint32_t key, uint32_t numFlits, void* msg, bool block = true);
-
-// Try to send using routing key (non-blocking, returns true on success)
-bool keyTrySend(uint32_t key, uint32_t numFlits, void* msg);
-```
-
-New section on programmable routers:
-  * Routing record format, byte ordering etc.
-  * Semantics of records
-  * Restrictions on IND records
-  * Avoiding deadlock: programmer has some added resposibility here
-
-New performance counters accessible from core zero on each board only:
-  * `ProgRouterSent` and `ProgRouterSentInterBoard`
-
-Document the following new perf counters:
-
-```c++
-// Performance counter: number of messages emitted by ProgRouter
-INLINE uint32_t tinselProgRouterSent();
-
-// Performance counter: number of inter-board messages emitted by ProgRouter
-INLINE uint32_t tinselProgRouterSentInterBoard();
-```
-
 # Tinsel 0.8
 
 Tinsel is a [RISC-V](https://riscv.org/)-based manythread
@@ -696,7 +658,203 @@ purposes, resulting in very little overhead on the wire.
 
 ## 7. Tinsel Router
 
-TODO
+The Tinsel overlay provides a programmable router on each FPGA board
+to support *global* multicasting.  Programmable routers automatically
+propagate messages to any number of destination threads distributed
+throughout the cluster, minimising inter-FPGA bandwidth usage for
+distributed fanouts, and offloading the work from the cores.  Further
+background can be found in [PIP 24](doc/PIP-0024-global-multicast.md).
+
+To support programmable routers, the destination component of a
+message is generalised so that it can be (1) a thread id; or (2) a
+*routing key*.  A message, sent by a thread, containing a routing
+key as a destination will go to a per-board router on the same
+FPGA.  The router will use the key as an index into a DRAM-based
+routing table and automatically propagate the message towards all the
+destinations associated with that key. 
+
+A **routing key** is a 32-bit value consisting of a board-local *ram
+id*, a *pointer*, and a *size*:
+
+```sv
+// 32-bit routing key (MSB to LSB)
+typedef struct {
+  // Which off-chip RAM on this board?
+  Bit#(`LogDRAMsPerBoard) ram;
+  // Pointer to array of routing beats containing routing records
+  Bit#(`LogBeatsPerDRAM) ptr;
+  // Number of beats in the array
+  Bit#(`LogRoutingEntryLen) numBeats;
+} RoutingKey;
+```
+
+To send a message to a routing key, a new Tinsel API call is provided:
+
+```c
+// Send message at addr using given routing key 
+inline void tinselKeySend(uint32_t key, volatile void* addr);
+```
+
+When a message reaches the per-board router, the `ptr` field of the
+routing key is used as an index into DRAM, where a sequence of 256-bit
+**routing beats** are found.  The `numBeats` field of the routing key
+indicates how many contiguous routing beats there are.  The value of
+`numBeats` may be zero, in which case there are no destinations
+associated with the key.
+
+A routing beat consists of a *size* and a sequence of five 48-bit
+*routing chunks*:
+
+```sv
+// 256-bit routing beat (aligned, MSB to LSB)
+typedef struct {
+  // Number of routing records present in this beat
+  Bit#(16) size;
+  // Five 48-bit record chunks
+  Vector#(5, Bit#(48)) chunks;
+} RoutingBeat;
+```
+
+The *size* must lie in the range 1 to 5 inclusive (0 is disallowed).
+A **routing record** consists of one or two routing chunks, depending
+on the **record type**.
+
+All byte orderings are little endian.  For example, the order of bytes
+in a routing beat is as follows.
+
+Byte | Contents
+---- | --------
+31:  | Upper byte of size (i.e. number of records in beat)
+30:  | Lower byte of size
+29:  | Upper byte of first chunk
+...  | ...
+24:  | Lower byte of first chunk
+23:  | Upper byte of second chunk
+...  | ...
+18:  | Lower byte of second chunk
+17:  | Upper byte of third chunk
+...  | ...
+12:  | Lower byte of third chunk
+11:  | Upper byte of fourth chunk
+...  | ...
+ 6:  | Lower byte of fourth chunk
+ 5:  | Upper byte of fifth chunk
+...  | ...
+ 0:  | Lower byte of fifth chunk
+
+Clearly, both routing keys and routing beats have a maximum size.
+However, in principle there is no limit to the number of records
+associated with a key, due to the possibility of *indirection records*
+(see below).
+
+There are five types of routing record, defined below.
+
+**48-bit Unicast Router-to-Mailbox (URM1):**
+
+```sv
+typedef struct {
+  // Record type (URM1 == 0)
+  Bit#(3) tag;
+  // Mailbox destination
+  Bit#(4) mbox;
+  // Mailbox-local thread identifier
+  Bit#(6) thread;
+  // Unused
+  Bit#(3) unused;
+  // Local key. The first word of the message
+  // payload is overwritten with this.
+  Bit#(32) localKey;
+} URM1Record;
+```
+
+The `localKey` can be used for anything, but might encode the
+destination thread-local device identifier, or edge identifier, or
+both.  The `mbox` field is currently 4 bits (two Y bits followed by
+two X bits), but there are spare bits available to increase the size
+of this field in future if necessary.
+
+**96-bit Unicast Router-to-Mailbox (URM2):**
+
+```sv
+typedef struct {
+  // Record type (URM2 == 1)
+  Bit#(3) tag;
+  // Mailbox destination
+  Bit#(4) mbox;
+  // Mailbox-local thread identifier
+  Bit#(6) thread;
+  // Currently unused
+  Bit#(19) unused;
+  // Local key. The first two words of the message
+  // payload are overwritten with this.
+  Bit#(64) localKey;
+} URM2Record;
+```
+
+This is the same as a URM1 record except the local key is 64-bits in
+size.
+
+**48-bit Router-to-Router (RR):**
+
+```sv
+typedef struct {
+  // Record type (RR == 2)
+  Bit#(3) tag;
+  // Direction (N,S,E,W == 0,1,2,3)
+  Bit#(2) dir;
+  // Currently unused
+  Bit#(11) unused;
+  // New 32-bit routing key that will replace the one in the
+  // current message for the next hop of the message's journey
+  Bit#(32) newKey;
+} RRRecord;
+```
+
+The `newKey` field will replace the key in the current message for the
+next hop of the message's journey.  Introducing a new key at each hop
+simplifies the mapping process (keeping it quick).
+
+**96-bit Multicast Router-to-Mailbox (MRM):**
+
+```sv
+typedef struct {
+  // Record type (MRM == 3)
+  Bit#(3) tag;
+  // Mailbox destination
+  Bit#(4) mbox;
+  // Currently unused
+  Bit#(9) unused;
+  // Local key. The least-significant half-word
+  // of the message is replaced with this
+  Bit#(16) localKey;
+  // Mailbox-local destination mask
+  Bit#(64) destMask;
+} MRMRecord;
+```
+
+**48-bit Indirection (IND):**
+
+```sv
+// 48-bit Indirection (IND) record
+// Note the restrictions on IND records:
+// 1. At most one IND record per key lookup
+// 2. A max-sized key lookup must contain an IND record
+typedef struct {
+  // Record type (IND == 4)
+  Bit#(3) tag;
+  // Currently unused
+  Bit#(13) unused;
+  // New 32-bit routing key for new set of records on current router
+  Bit#(32) newKey;
+} INDRecord;
+```
+
+Indirection records can be used to handle large fanouts that need more
+routing beats than the `numBeats` field of the routing key can express.
+
+Finally, it is worth noting that when using programmable routers,
+there is an added responsibility for the programmer to use a
+deadlock-free routing scheme, such as dimension-ordered routing.
 
 ## 8. Tinsel HostLink
 
@@ -781,6 +939,12 @@ bool HostLink::canRecv();
 // Receive a message (blocking), given size of message in bytes
 // Any bytes beyond numBytes up to the next message boundary will be ignored
 void HostLink::recvMsg(void* msg, uint32_t numBytes);
+
+// Send a message using routing key (blocking)
+bool HostLink::keySend(uint32_t key, uint32_t numFlits, void* msg);
+
+// Try to send using routing key (non-blocking, returns true on success)
+bool HostLink::keyTrySend(uint32_t key, uint32_t numFlits, void* msg);
 ```
 
 The `send` method allows a message consisting of multiple flits to be
@@ -1148,7 +1312,7 @@ vertex can send messages to the host via the `HostPin` or the `finish`
 handler, and the host can send messages to any vertex.
 
 **Softswitch**. Central to POLite is an event loop running on each
-Tinsel thread, which we call **the softswitch** as it effectively
+Tinsel thread, which we call the softswitch as it effectively
 context-switches between vertices mapped to the same thread.  The
 softswitch has four main responsibilities: (1) to maintain a queue of
 vertices wanting to send; (2) to implement multicast sends over a pin
@@ -1157,14 +1321,34 @@ messages efficiently between vertices running on the same thread and
 on different threads; and (4) to invoke the vertex handlers when
 required, to meet the semantics of the POLite library.
 
+**POLite static parameters**. The following macros can be defined,
+before the first instance of `#include <POLite.h>`, to control some
+aspects of POLite behaviour.
+
+  Macro               | Meaning
+  ---------           | -------
+  `POLITE_NUM_PINS`   | Max number of pins per vertex (default 1)
+  `POLITE_DUMP_STATS` | Dump stats upon completion
+  `POLITE_COUNT_MSGS` | Include message counts in stats dump
+  `POLITE_FAST_MAP`   | Use fast mapper (at the expense of application performance)
+
+**POLite dynamic parameters**.  The following environment variables can
+be set, to control some aspects of POLite behaviour.
+
+  Environment variable | Meaning
+  -------------------- | -------
+  `HOSTLINK_BOXES_X`   | Size of box mesh to use in X dimension
+  `HOSTLINK_BOXES_Y`   | Size of box mesh to use in Y dimension
+  `POLITE_BOARDS_X`    | Size of board mesh to use in X dimension
+  `POLITE_BOARDS_Y`    | Size of board mesh to use in Y dimension
+  `POLITE_CHATTY`      | Set to `1` to enable emission of mapper stats
+  `POLITE_PLACER`      | Use `metis`, `random`, or `direct` placement
+
 **Limitations**. POLite provides several important features of the
 vertex-centric paradigm, but there are some limitations. One of the
 features of the Pregel framework is the ability for vertices to add
 and remove vertices and edges at runtime -- but currently, POLite only
-supports static graphs.  And for large *non-localised* fan-outs, a
-hierarchical hardware or software multicast feature may be desirable
-(where messages get forked at intermediate stages along the way to the
-destinations).
+supports static graphs. 
 
 ## A. DE5-Net Synthesis Report
 
@@ -1181,9 +1365,10 @@ The default Tinsel configuration on a single DE5-Net board contains:
   * four QDRII+ SRAM controllers
   * four 10Gbps reliable links
   * one termination/idle detector
+  * one 8x8 programmable router
   * a JTAG UART
 
-The clock frequency is 225MHz and the resource utilisation is 74% of
+The clock frequency is 215MHz and the resource utilisation is 84% of
 the DE5-Net.
 
 ## B. Tinsel Parameters
@@ -1215,7 +1400,7 @@ the DE5-Net.
   `EnablePerfCount`        |    True | Enable performance counters
   `ClockFreq`              |     215 | Clock frequency in MHz
 
-Further parameters can be found in [config.py](config.py).
+A full list of parameters can be found in [config.py](config.py).
 
 ## C. Tinsel Memory Map
 
@@ -1274,15 +1459,20 @@ separate memory regions (which they are not).
 
 Optional performance-counter CSRs (when `EnablePerfCount` is `True`):
 
-  Name             | CSR    | R/W | Function
-  ---------------- | ------ | --- | --------
-  `PerfCount`      | 0xc07  | W   | Reset(0)/Start(1)/Stop(2) all counters
-  `MissCount`      | 0xc08  | R   | Cache miss count
-  `HitCount`       | 0xc09  | R   | Cache hit count
-  `WritebackCount` | 0xc0a  | R   | Cache writeback count
-  `CPUIdleCount`   | 0xc0b  | R   | CPU idle-cycle count (lower 32 bits)
-  `CPUIdleCountU`  | 0xc0c  | R   | CPU idle-cycle count (upper 8 bits)
-  `CycleU`         | 0xc0d  | R   | Cycle counter (upper 8 bits)
+ Name                  | CSR    | R/W | Function
+ ----------------      | ------ | --- | --------
+ `PerfCount`           | 0xc07  | W   | Reset(0)/Start(1)/Stop(2) all counters
+ `MissCount`           | 0xc08  | R   | Cache miss count
+ `HitCount`            | 0xc09  | R   | Cache hit count
+ `WritebackCount`      | 0xc0a  | R   | Cache writeback count
+ `CPUIdleCount`        | 0xc0b  | R   | CPU idle-cycle count (lower 32 bits)
+ `CPUIdleCountU`       | 0xc0c  | R   | CPU idle-cycle count (upper 8 bits)
+ `CycleU`              | 0xc0d  | R   | Cycle counter (upper 8 bits)
+ `ProgRouterSent`      | 0xc0e  | R   | Total msgs sent by ProgRouter
+ `ProgRouterSentInter` | 0xc0f  | R   | Inter-board msgs sent by ProgRouter
+
+Note that `ProgRouterSent` and `ProgRouterSentInter` are only valid
+from thread zero on each board.
 
 Tinsel also supports the following custom instructions.
 
@@ -1350,6 +1540,9 @@ inline void tinselMulticast(
 // (Address must be aligned on message boundary)
 inline void tinselSend(uint32_t dest, volatile void* addr);
 
+// Send message at address using given routing key
+inline void tinselKeySend(uint32_t key, volatile void* addr);
+
 // Determine if calling thread can receive a message
 inline uint32_t tinselCanRecv();
 
@@ -1429,6 +1622,14 @@ inline uint32_t tinselCPUIdleCountU();
 // Read cycle counter (upper 8 bits)
 inline uint32_t tinselCycleCountU();
 
+// Performance counter: number of messages emitted by ProgRouter
+// (Only valid from thread zero on each board)
+inline uint32_t tinselProgRouterSent();
+
+// Performance counter: number of inter-board messages emitted by ProgRouter
+// (Only valid from thread zero on each board)
+inline uint32_t tinselProgRouterSentInterBoard();
+
 // Address construction
 inline uint32_t tinselToAddr(
          uint32_t boardX, uint32_t boardY,
@@ -1487,6 +1688,12 @@ class HostLink {
   // Any bytes beyond numBytes up to the next message boundary will be ignored
   void recvMsg(void* msg, uint32_t numBytes);
 
+  // Send a message using routing key (blocking by default)
+  bool keySend(uint32_t key, uint32_t numFlits, void* msg, bool block = true);
+
+  // Try to send using routing key (non-blocking, returns true on success)
+  bool keyTrySend(uint32_t key, uint32_t numFlits, void* msg);
+
   // Bulk send and receive
   // ---------------------
 

From 4da990df5e9677decf756ab1ec88692602ab7026 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Fri, 15 May 2020 11:06:50 +0100
Subject: [PATCH 65/78] Tweaks

---
 README.md | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index 465243ca..631a4a82 100644
--- a/README.md
+++ b/README.md
@@ -66,12 +66,11 @@ Released on 18 May 2020 and maintained in the
 ## 1. Overview
 
 On the [POETS Project](https://poets-project.org/about), we are
-looking at ways to accelerate applications that can be expressed as
-large numbers of small processes communicating by message-passing.
-Our first attempt is based around a manythread RISC-V architecture
-called Tinsel running on an FPGA cluster.  Tinsel aims to support
-irregular applications that have heavy memory and communication
-demands, but fairly modest compute requrements.  The main features are:
+looking at ways to accelerate applications that are naturally
+expressed as a large number of small processes communicating by
+message-passing.  Our first attempt is based around a manythread
+RISC-V architecture called Tinsel, running on an FPGA cluster.  The
+main features are:
 
   * **Multithreading**.  A critical aspect of the design
     is to tolerate latency as cleanly as possible.  This includes the
@@ -80,10 +79,6 @@ demands, but fairly modest compute requrements.  The main features are:
     (keeping Fmax high); and sharing of resources between cores
     (such as caches, mailboxes, and FPUs).
 
-  * **Caches**.  To keep the programming model simple, we have opted
-    to use thread-partitioned data caches to optimise access to
-    off-chip memory rather than DMA. 
-
   * **Message-passing**. Although there is a requirement to support a
     large amount of memory, it is not necessary to provide the
     illusion of a single shared memory space: message-passing is intended
@@ -658,12 +653,12 @@ purposes, resulting in very little overhead on the wire.
 
 ## 7. Tinsel Router
 
-The Tinsel overlay provides a programmable router on each FPGA board
-to support *global* multicasting.  Programmable routers automatically
-propagate messages to any number of destination threads distributed
-throughout the cluster, minimising inter-FPGA bandwidth usage for
-distributed fanouts, and offloading the work from the cores.  Further
-background can be found in [PIP 24](doc/PIP-0024-global-multicast.md).
+Tinsel provides a programmable router on each FPGA board to support
+*global* multicasting.  Programmable routers automatically propagate
+messages to any number of destination threads distributed throughout
+the cluster, minimising inter-FPGA bandwidth usage for distributed
+fanouts, and offloading work from the cores.  Further background can
+be found in [PIP 24](doc/PIP-0024-global-multicast.md).
 
 To support programmable routers, the destination component of a
 message is generalised so that it can be (1) a thread id; or (2) a
@@ -688,7 +683,8 @@ typedef struct {
 } RoutingKey;
 ```
 
-To send a message to a routing key, a new Tinsel API call is provided:
+To send a message using a routing key as the destination, a new Tinsel
+API call is provided:
 
 ```c
 // Send message at addr using given routing key 

From 1bc368f4a6e6c4c1113bb441c1b9315cdde44ac3 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Mon, 18 May 2020 20:32:00 +0100
Subject: [PATCH 66/78] Route in box Y dir before considering host bit

Multiple boxes in the Y direction only has stopped working.  Looking
back at the first commit in the 0.8 branch, it looks like I accidentally
changed the behaviour of inter-board routing by considering the host
bit before knowing that we're on the correct Y coordinate.  Hopefully,
this is the fix...
---
 rtl/ProgRouter.bsv | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv
index 9570ff09..6e531261 100644
--- a/rtl/ProgRouter.bsv
+++ b/rtl/ProgRouter.bsv
@@ -404,12 +404,12 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher);
           // Make routing decision
           RoutingDecision decision = RouteNoC;
           MailboxNetAddr addr = flit.dest.addr;
-          if (addr.host.valid)
+          if (addr.board.y < boardId.y) decision = RouteSouth;
+          else if (addr.board.y > boardId.y) decision = RouteNorth;
+          else if (addr.host.valid)
             decision = addr.host.value == 0 ? RouteWest : RouteEast;
           else if (addr.board.x < boardId.x) decision = RouteWest;
           else if (addr.board.x > boardId.x) decision = RouteEast;
-          else if (addr.board.y < boardId.y) decision = RouteSouth;
-          else if (addr.board.y > boardId.y) decision = RouteNorth;
           // Insert into bypass queue
           flitBypassQueue.enq(RoutedFlit { decision: decision, flit: flit});
         end

From ff6deed7e52b881f1919d71aee1fc3be2dd0e134 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Wed, 27 May 2020 09:14:05 +0100
Subject: [PATCH 67/78] Fix fast-mapping of weights

---
 include/POLite/FastMap/PGraph.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/include/POLite/FastMap/PGraph.h b/include/POLite/FastMap/PGraph.h
index 8ac0c84d..923c34d3 100644
--- a/include/POLite/FastMap/PGraph.h
+++ b/include/POLite/FastMap/PGraph.h
@@ -371,11 +371,9 @@ template <typename DeviceType,
           uint32_t devId = getLocalDeviceId(toDeviceAddr[destId]);
           // Add edge to thread's input table
           uint32_t edgeId = inTable[threadId]->numElems;
-          if (i < inTable[threadId]->numElems) {
-            PInEdge<E> edge;
-            edge.edge = edges->elems[i];
-            inTable[threadId]->append(edge);
-          }
+          PInEdge<E> edge;
+          edge.edge = edges->elems[i];
+          inTable[threadId]->append(edge);
           // Add output table entry
           PRoutingDest rdest;
           rdest.kind = PRDestKindURM1;

From 8a19d07b5c8857b5a06fd08446be285d40f5ac6b Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Wed, 3 Jun 2020 10:39:04 +0100
Subject: [PATCH 68/78] POLite examples: display time conditionally

If POLITE_DUMP_STATS is defined then measuring time from the host
isn't very accurate because it will include the time to transfer a
large number of stats over the slow UART.  To help users become aware
of this, we now don't display the time if POLITE_DUMP_STATS is
enabled.
---
 apps/POLite/asp-gals/ASP.h          | 4 ++--
 apps/POLite/asp-gals/Run.cpp        | 5 ++++-
 apps/POLite/asp-sync/Run.cpp        | 2 ++
 apps/POLite/asp-tiles-sync/Run.cpp  | 4 ++--
 apps/POLite/clocktree-async/Run.cpp | 2 ++
 apps/POLite/hashmin-sync/Run.cpp    | 2 ++
 apps/POLite/heat-cube-sync/Run.cpp  | 2 ++
 apps/POLite/heat-gals/Run.cpp       | 2 ++
 apps/POLite/heat-sync/Run.cpp       | 2 ++
 apps/POLite/izhikevich-gals/Run.cpp | 2 ++
 apps/POLite/izhikevich-sync/Run.cpp | 2 ++
 apps/POLite/sssp-async/Run.cpp      | 2 ++
 apps/POLite/sssp-sync/Run.cpp       | 2 ++
 13 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/apps/POLite/asp-gals/ASP.h b/apps/POLite/asp-gals/ASP.h
index 42462622..f69dfa3d 100644
--- a/apps/POLite/asp-gals/ASP.h
+++ b/apps/POLite/asp-gals/ASP.h
@@ -9,8 +9,8 @@
 #ifndef _ASP_H_
 #define _ASP_H_
 
-//#define POLITE_DUMP_STATS
-//#define POLITE_COUNT_MSGS
+#define POLITE_DUMP_STATS
+#define POLITE_COUNT_MSGS
 
 // Lightweight POETS frontend
 #include <POLite.h>
diff --git a/apps/POLite/asp-gals/Run.cpp b/apps/POLite/asp-gals/Run.cpp
index d50821ce..4c00e1da 100644
--- a/apps/POLite/asp-gals/Run.cpp
+++ b/apps/POLite/asp-gals/Run.cpp
@@ -51,7 +51,8 @@ int main(int argc, char**argv)
   // Create random set of source nodes
   uint32_t numSources = NUM_SOURCES*32;
   uint32_t sources[numSources];
-  randomSet(numSources, sources, graph.numDevices);
+  //randomSet(numSources, sources, graph.numDevices);
+  for (int i = 0; i < numSources; i++) sources[i] = i;
 
   // Initialise devices
   for (PDeviceId i = 0; i < graph.numDevices; i++) {
@@ -102,7 +103,9 @@ int main(int argc, char**argv)
   // Display time
   timersub(&finish, &start, &diff);
   double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
+  #ifndef POLITE_DUMP_STATS
   printf("Time = %lf\n", duration);
+  #endif
 
   return 0;
 }
diff --git a/apps/POLite/asp-sync/Run.cpp b/apps/POLite/asp-sync/Run.cpp
index 287f32db..518a33b5 100644
--- a/apps/POLite/asp-sync/Run.cpp
+++ b/apps/POLite/asp-sync/Run.cpp
@@ -99,7 +99,9 @@ int main(int argc, char**argv)
   // Display time
   timersub(&finish, &start, &diff);
   double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
+  #ifndef POLITE_DUMP_STATS
   printf("Time = %lf\n", duration);
+  #endif
 
   return 0;
 }
diff --git a/apps/POLite/asp-tiles-sync/Run.cpp b/apps/POLite/asp-tiles-sync/Run.cpp
index 049d83a8..cdc2bb14 100644
--- a/apps/POLite/asp-tiles-sync/Run.cpp
+++ b/apps/POLite/asp-tiles-sync/Run.cpp
@@ -135,11 +135,11 @@ int main(int argc, char**argv)
   double duration;
   timersub(&finishCompute, &startCompute, &diff);
   duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
-  printf("Time (compute) = %lf\n", duration);
+  printf("Time (compute, including stats transfer over UART) = %lf\n", duration);
   gettimeofday(&finishAll, NULL);
   timersub(&finishAll, &startAll, &diff);
   duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
-  printf("Time (all) = %lf\n", duration);
+  printf("Time (all, including stats transfer over UART) = %lf\n", duration);
 
   return 0;
 }
diff --git a/apps/POLite/clocktree-async/Run.cpp b/apps/POLite/clocktree-async/Run.cpp
index 270c9b48..02f76723 100644
--- a/apps/POLite/clocktree-async/Run.cpp
+++ b/apps/POLite/clocktree-async/Run.cpp
@@ -93,7 +93,9 @@ int main(int argc, char** argv)
   // Display time
   timersub(&finish, &start, &diff);
   double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
+  #ifndef POLITE_DUMP_STATS
   printf("Time = %lf\n", duration);
+  #endif
 
   return 0;
 }
diff --git a/apps/POLite/hashmin-sync/Run.cpp b/apps/POLite/hashmin-sync/Run.cpp
index cb6a7ced..eab92eff 100644
--- a/apps/POLite/hashmin-sync/Run.cpp
+++ b/apps/POLite/hashmin-sync/Run.cpp
@@ -82,7 +82,9 @@ int main(int argc, char**argv)
   // Display time
   timersub(&finish, &start, &diff);
   double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
+  #ifndef POLITE_DUMP_STATS
   printf("Time = %lf\n", duration);
+  #endif
 
   return 0;
 }
diff --git a/apps/POLite/heat-cube-sync/Run.cpp b/apps/POLite/heat-cube-sync/Run.cpp
index aaa42c39..1163f01b 100644
--- a/apps/POLite/heat-cube-sync/Run.cpp
+++ b/apps/POLite/heat-cube-sync/Run.cpp
@@ -76,7 +76,9 @@ int main()
   // Display time
   timersub(&finish, &start, &diff);
   double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
+  #ifndef POLITE_DUMP_STATS
   printf("Time = %lf\n", duration);
+  #endif
 
   return 0;
 }
diff --git a/apps/POLite/heat-gals/Run.cpp b/apps/POLite/heat-gals/Run.cpp
index eacf449f..44c2f921 100644
--- a/apps/POLite/heat-gals/Run.cpp
+++ b/apps/POLite/heat-gals/Run.cpp
@@ -98,7 +98,9 @@ int main(int argc, char **argv)
   // Display time
   timersub(&finish, &start, &diff);
   double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
+  #ifndef POLITE_DUMP_STATS
   printf("Time = %lf\n", duration);
+  #endif
 
   return 0;
 }
diff --git a/apps/POLite/heat-sync/Run.cpp b/apps/POLite/heat-sync/Run.cpp
index 91652712..ed978e39 100644
--- a/apps/POLite/heat-sync/Run.cpp
+++ b/apps/POLite/heat-sync/Run.cpp
@@ -97,7 +97,9 @@ int main(int argc, char **argv)
   // Display time
   timersub(&finish, &start, &diff);
   double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
+  #ifndef POLITE_DUMP_STATS
   printf("Time = %lf\n", duration);
+  #endif
 
   return 0;
 }
diff --git a/apps/POLite/izhikevich-gals/Run.cpp b/apps/POLite/izhikevich-gals/Run.cpp
index 43fb3d4d..e542881f 100644
--- a/apps/POLite/izhikevich-gals/Run.cpp
+++ b/apps/POLite/izhikevich-gals/Run.cpp
@@ -124,7 +124,9 @@ int main(int argc, char**argv)
   // Display time
   timersub(&finish, &start, &diff);
   double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
+  #ifndef POLITE_DUMP_STATS
   printf("Time = %lf\n", duration);
+  #endif
 
   return 0;
 }
diff --git a/apps/POLite/izhikevich-sync/Run.cpp b/apps/POLite/izhikevich-sync/Run.cpp
index 09efb701..0693b8c3 100644
--- a/apps/POLite/izhikevich-sync/Run.cpp
+++ b/apps/POLite/izhikevich-sync/Run.cpp
@@ -112,7 +112,9 @@ int main(int argc, char**argv)
   // Display time
   timersub(&finish, &start, &diff);
   double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
+  #ifndef POLITE_DUMP_STATS
   printf("Time = %lf\n", duration);
+  #endif
 
   return 0;
 }
diff --git a/apps/POLite/sssp-async/Run.cpp b/apps/POLite/sssp-async/Run.cpp
index b78ccec4..37ffcb4e 100644
--- a/apps/POLite/sssp-async/Run.cpp
+++ b/apps/POLite/sssp-async/Run.cpp
@@ -87,7 +87,9 @@ int main(int argc, char**argv)
   // Display time
   timersub(&finish, &start, &diff);
   double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
+  #ifndef POLITE_DUMP_STATS
   printf("Time = %lf\n", duration);
+  #endif
 
   return 0;
 }
diff --git a/apps/POLite/sssp-sync/Run.cpp b/apps/POLite/sssp-sync/Run.cpp
index b78ccec4..37ffcb4e 100644
--- a/apps/POLite/sssp-sync/Run.cpp
+++ b/apps/POLite/sssp-sync/Run.cpp
@@ -87,7 +87,9 @@ int main(int argc, char**argv)
   // Display time
   timersub(&finish, &start, &diff);
   double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
+  #ifndef POLITE_DUMP_STATS
   printf("Time = %lf\n", duration);
+  #endif
 
   return 0;
 }

From d70c212309952f774b2782c010be32b8cacd0590 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Wed, 3 Jun 2020 15:16:06 +0100
Subject: [PATCH 69/78] Support three mapper variants in POLite

---
 README.md                                  |  19 +-
 include/POLite.h                           |  33 +-
 include/POLite/{FastMap => Dist}/PDevice.h |   0
 include/POLite/{FastMap => Dist}/PGraph.h  |   0
 include/POLite/{ => Hybrid}/PDevice.h      |   0
 include/POLite/{ => Hybrid}/PGraph.h       |   0
 include/POLite/Local/PDevice.h             | 370 +++++++++
 include/POLite/Local/PGraph.h              | 866 +++++++++++++++++++++
 8 files changed, 1273 insertions(+), 15 deletions(-)
 rename include/POLite/{FastMap => Dist}/PDevice.h (100%)
 rename include/POLite/{FastMap => Dist}/PGraph.h (100%)
 rename include/POLite/{ => Hybrid}/PDevice.h (100%)
 rename include/POLite/{ => Hybrid}/PGraph.h (100%)
 create mode 100644 include/POLite/Local/PDevice.h
 create mode 100644 include/POLite/Local/PGraph.h

diff --git a/README.md b/README.md
index 631a4a82..e4fdf72c 100644
--- a/README.md
+++ b/README.md
@@ -1326,7 +1326,15 @@ aspects of POLite behaviour.
   `POLITE_NUM_PINS`   | Max number of pins per vertex (default 1)
   `POLITE_DUMP_STATS` | Dump stats upon completion
   `POLITE_COUNT_MSGS` | Include message counts in stats dump
-  `POLITE_FAST_MAP`   | Use fast mapper (at the expense of application performance)
+
+POLite supports three mapping modes, also controlled via macros:
+
+ 
+  Macro               | Use when graphs have...
+  ---------           | -----------------------
+  `POLITE_MAP_LOCAL`  | ...lots of local connections and few distributed connections
+  `POLITE_MAP_DIST`   | ...lots of distributed connections and few local connections (this mapper is fast)
+  `POLITE_MAP_HYBRID` | ...a mix of local and distributed connections (default)
 
 **POLite dynamic parameters**.  The following environment variables can
 be set, to control some aspects of POLite behaviour.
@@ -1341,10 +1349,11 @@ be set, to control some aspects of POLite behaviour.
   `POLITE_PLACER`      | Use `metis`, `random`, or `direct` placement
 
 **Limitations**. POLite provides several important features of the
-vertex-centric paradigm, but there are some limitations. One of the
-features of the Pregel framework is the ability for vertices to add
-and remove vertices and edges at runtime -- but currently, POLite only
-supports static graphs. 
+vertex-centric paradigm, but there are lots of limitations and quirks;
+it is only intended as a prototype library for hardware evaluation
+purposes. One of the features of the Pregel framework is the ability
+for vertices to add and remove vertices and edges at runtime -- but
+currently, POLite only supports static graphs. 
 
 ## A. DE5-Net Synthesis Report
 
diff --git a/include/POLite.h b/include/POLite.h
index 858b865e..d1d5fbc6 100644
--- a/include/POLite.h
+++ b/include/POLite.h
@@ -4,20 +4,33 @@
 
 #include <stdint.h>
 
+// Select default mapper (only if the user has not chosen one)
+#if !defined(POLITE_MAP_LOCAL) && \
+    !defined(POLITE_MAP_DIST)  && \
+    !defined(POLITE_MAP_HYBRID)
+  // Default mapper
+  #define POLITE_MAP_HYBRID
+#endif
+
 #ifdef TINSEL
   #include <tinsel.h>
-  #ifdef POLITE_FAST_MAP
-    #include <POLite/FastMap/PDevice.h>
-  #else
-    #include <POLite/PDevice.h>
+  #if defined(POLITE_MAP_LOCAL)
+    #include <POLite/Local/PDevice.h>
+  #elif defined(POLITE_MAP_DIST)
+    #include <POLite/Dist/PDevice.h>
+  #elif defined(POLITE_MAP_HYBRID)
+    #include <POLite/Hybrid/PDevice.h>
   #endif
 #else
-  #ifdef POLITE_FAST_MAP
-    #include <POLite/FastMap/PDevice.h>
-    #include <POLite/FastMap/PGraph.h>
-  #else
-    #include <POLite/PDevice.h>
-    #include <POLite/PGraph.h>
+  #if defined(POLITE_MAP_LOCAL)
+    #include <POLite/Local/PDevice.h>
+    #include <POLite/Local/PGraph.h>
+  #elif defined(POLITE_MAP_DIST)
+    #include <POLite/Dist/PDevice.h>
+    #include <POLite/Dist/PGraph.h>
+  #elif defined(POLITE_MAP_HYBRID)
+    #include <POLite/Hybrid/PDevice.h>
+    #include <POLite/Hybrid/PGraph.h>
   #endif
   #include <POLite/Seq.h>
   #include <POLite/Graph.h>
diff --git a/include/POLite/FastMap/PDevice.h b/include/POLite/Dist/PDevice.h
similarity index 100%
rename from include/POLite/FastMap/PDevice.h
rename to include/POLite/Dist/PDevice.h
diff --git a/include/POLite/FastMap/PGraph.h b/include/POLite/Dist/PGraph.h
similarity index 100%
rename from include/POLite/FastMap/PGraph.h
rename to include/POLite/Dist/PGraph.h
diff --git a/include/POLite/PDevice.h b/include/POLite/Hybrid/PDevice.h
similarity index 100%
rename from include/POLite/PDevice.h
rename to include/POLite/Hybrid/PDevice.h
diff --git a/include/POLite/PGraph.h b/include/POLite/Hybrid/PGraph.h
similarity index 100%
rename from include/POLite/PGraph.h
rename to include/POLite/Hybrid/PGraph.h
diff --git a/include/POLite/Local/PDevice.h b/include/POLite/Local/PDevice.h
new file mode 100644
index 00000000..ca806a58
--- /dev/null
+++ b/include/POLite/Local/PDevice.h
@@ -0,0 +1,370 @@
+// SPDX-License-Identifier: BSD-2-Clause
+#ifndef _PDEVICE_H_
+#define _PDEVICE_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <type_traits>
+
+#ifdef TINSEL
+  #include <tinsel.h>
+  #define PTR(t) t*
+#else
+  #include <tinsel-interface.h>
+  #define PTR(t) uint32_t
+#endif
+
+// Use this to align on half-cache-line boundary
+#define ALIGNED __attribute__((aligned(1<<(TinselLogBytesPerLine-1))))
+
+// This is a static limit on the number of pins per device
+#ifndef POLITE_NUM_PINS
+#define POLITE_NUM_PINS 1
+#endif
+
+// Macros for performance stats
+//   POLITE_DUMP_STATS - dump performance stats on termination
+//   POLITE_COUNT_MSGS - include message counts of performance stats
+
+// Thread-local device id
+typedef uint16_t PLocalDeviceId;
+#define InvalidLocalDevId 0xffff
+#define UnusedLocalDevId 0xfffe
+
+// Thread id
+typedef uint32_t PThreadId;
+
+// Device address
+// Bits 17->0: thread id
+// Bit 18: invalid address
+// Bits 31->19: thread-local device id
+typedef uint32_t PDeviceAddr;
+
+// Device address constructors
+inline PDeviceAddr invalidDeviceAddr() { return 0x40000; }
+inline PDeviceAddr makeDeviceAddr(PThreadId t, PLocalDeviceId d) {
+  return ((PDeviceAddr) d << 19) | t;
+}
+
+// Device address deconstructors
+inline bool isValidDeviceAddr(PDeviceAddr addr) { return !(addr & 0x40000); }
+inline PThreadId getThreadId(PDeviceAddr addr) { return addr & 0x3ffff; }
+inline PLocalDeviceId getLocalDeviceId(PDeviceAddr addr) { return addr >> 19; }
+
+// What's the max allowed local device address?
+inline uint32_t maxLocalDeviceId() { return 8192; }
+
+// Routing key
+typedef uint16_t Key;
+#define InvalidKey 0xffff
+
+// Pins
+//   No      - means 'not ready to send'
+//   HostPin - means 'send to host'
+//   Pin(n)  - means 'send to application pin number n'
+typedef uint8_t PPin;
+#define No 0
+#define HostPin 1
+#define Pin(n) ((n)+2)
+
+// For template arguments that are not used
+struct None {};
+
+// Generic device structure
+// Type parameters:
+//   S - State
+//   E - Edge label
+//   M - Message structure
+template <typename S, typename E, typename M> struct PDevice {
+  // State
+  S* s;
+  PPin* readyToSend;
+  uint32_t numVertices;
+  uint16_t time;
+
+  // Handlers
+  void init();
+  void send(volatile M* msg);
+  void recv(M* msg, E* edge);
+  bool step();
+  bool finish(volatile M* msg);
+};
+
+// Generic device state structure
+template <typename S> struct ALIGNED PState {
+  // Pointer to base of neighbours arrays
+  uint16_t pinBase[POLITE_NUM_PINS];
+  // Ready-to-send status
+  PPin readyToSend;
+  // Custom state
+  S state;
+};
+
+// Message structure
+template <typename M> struct PMessage {
+  // Source-based routing key
+  Key key;
+  // Application message
+  M payload;
+};
+
+// An outgoing edge from a device
+struct POutEdge {
+  // Destination mailbox
+  uint16_t mbox;
+  // Routing key
+  uint16_t key;
+  // Destination threads
+  uint32_t threadMaskLow;
+  uint32_t threadMaskHigh;
+};
+
+// An incoming edge to a device (labelled)
+template <typename E> struct PInEdge {
+  // Destination device
+  PLocalDeviceId devId;
+  // Edge info
+  E edge;
+};
+
+// An incoming edge to a device (unlabelled)
+template <> struct PInEdge<None> {
+  union {
+    // Destination device
+    PLocalDeviceId devId;
+    // Unused
+    None edge;
+  };
+};
+
+// Helper function: Count board hops between two threads
+inline uint32_t hopsBetween(uint32_t t0, uint32_t t1) {
+  uint32_t xmask = ((1<<TinselMeshXBits)-1);
+  int32_t y0 = t0 >> (TinselLogThreadsPerBoard + TinselMeshXBits);
+  int32_t x0 = (t0 >> TinselLogThreadsPerBoard) & xmask;
+  int32_t y1 = t1 >> (TinselLogThreadsPerBoard + TinselMeshXBits);
+  int32_t x1 = (t1 >> TinselLogThreadsPerBoard) & xmask;
+  return (abs(x0-x1) + abs(y0-y1));
+}
+
+// Generic thread structure
+template <typename DeviceType,
+          typename S, typename E, typename M> struct PThread {
+
+  // Number of devices handled by thread
+  PLocalDeviceId numDevices;
+  // Number of times step handler has been called
+  uint16_t time;
+  // Number of devices in graph
+  uint32_t numVertices;
+  // Pointer to array of device states
+  PTR(PState<S>) devices;
+  // Pointer to base of routing tables
+  PTR(POutEdge) outTableBase;
+  PTR(PInEdge<E>) inTableBase;
+  // Array of local device ids that are ready to send
+  PTR(PLocalDeviceId) senders;
+  // This array is accessed in a LIFO manner
+  PTR(PLocalDeviceId) sendersTop;
+
+  // Count number of messages sent
+  #ifdef POLITE_COUNT_MSGS
+  // Total messages received
+  uint32_t msgsReceived;
+  // Number of times we wanted to send but couldn't
+  uint32_t blockedSends;
+  // Total messages sent between threads
+  uint32_t interThreadSendCount;
+  // Messages sent between threads on different boards
+  uint32_t interBoardSendCount;
+  #endif
+
+  #ifdef TINSEL
+
+  // Helper function to construct a device
+  INLINE DeviceType getDevice(uint32_t id) {
+    DeviceType dev;
+    dev.s           = &devices[id].state;
+    dev.readyToSend = &devices[id].readyToSend;
+    dev.numVertices = numVertices;
+    dev.time        = time;
+    return dev;
+  }
+
+  // Dump performance counter stats over UART
+  void dumpStats() {
+    tinselPerfCountStop();
+    uint32_t me = tinselId();
+    // Per-cache performance counters
+    uint32_t cacheMask = (1 <<
+      (TinselLogThreadsPerCore + TinselLogCoresPerDCache)) - 1;
+    if ((me & cacheMask) == 0) {
+      printf("H:%x,M:%x,W:%x\n",
+        tinselHitCount(),
+        tinselMissCount(),
+        tinselWritebackCount());
+    }
+    // Per-core performance counters
+    uint32_t coreMask = (1 << (TinselLogThreadsPerCore)) - 1;
+    if ((me & coreMask) == 0) {
+      printf("C:%x %x,I:%x %x\n",
+        tinselCycleCountU(), tinselCycleCount(),
+        tinselCPUIdleCountU(), tinselCPUIdleCount());
+    }
+    // Per-thread performance counters
+    #ifdef POLITE_COUNT_MSGS
+    printf("MS:%x,MR:%x,PR:%x,PRI:%x,BL:%x\n",
+      interThreadSendCount, msgsReceived, 0,
+        interBoardSendCount, blockedSends);
+    #endif
+  }
+
+  // Invoke device handlers (event loop; never returns to the caller)
+  void run() {
+    // Current out-going edge in multicast
+    POutEdge* outEdge;
+
+    // Outgoing edge to host
+    POutEdge outHost[2];
+    outHost[0].mbox = tinselHostId() >> TinselLogThreadsPerMailbox;
+    outHost[0].key = 0;
+    outHost[1].key = InvalidKey;
+    // Initialise outEdge to null terminator
+    outEdge = &outHost[1];
+
+    // Did last call to step handler request a new time step?
+    bool active = true;
+
+    // Reset performance counters
+    tinselPerfCountReset();
+
+    // Initialisation
+    sendersTop = senders;
+    for (uint32_t i = 0; i < numDevices; i++) {
+      DeviceType dev = getDevice(i);
+      // Invoke the initialiser for each device
+      dev.init();
+      // Device ready to send?
+      if (*dev.readyToSend != No) {
+        *(sendersTop++) = i;
+      }
+    }
+
+    // Set number of flits per message
+    tinselSetLen((sizeof(PMessage<M>)-1) >> TinselLogBytesPerFlit);
+
+    // Event loop
+    while (1) {
+      // Step 1: try to send
+      if (outEdge->key != InvalidKey) {
+        if (tinselCanSend()) {
+          PMessage<M>* m = (PMessage<M>*) tinselSendSlot();
+          // Send message
+          m->key = outEdge->key;
+          tinselMulticast(outEdge->mbox, outEdge->threadMaskHigh,
+            outEdge->threadMaskLow, m);
+          #ifdef POLITE_COUNT_MSGS
+          interThreadSendCount++;
+          interBoardSendCount += hopsBetween(
+            outEdge->mbox << TinselLogThreadsPerMailbox, tinselId());
+          #endif
+          // Move to next neighbour
+          outEdge++;
+        }
+        else {
+          #ifdef POLITE_COUNT_MSGS
+          blockedSends++;
+          #endif
+          tinselWaitUntil(TINSEL_CAN_SEND|TINSEL_CAN_RECV);
+        }
+      }
+      else if (sendersTop != senders) {
+        if (tinselCanSend()) {
+          // Start new multicast
+          PLocalDeviceId src = *(--sendersTop);
+          // Lookup device
+          DeviceType dev = getDevice(src);
+          PPin pin = *dev.readyToSend;
+          // Invoke send handler
+          PMessage<M>* m = (PMessage<M>*) tinselSendSlot();
+          dev.send(&m->payload);
+          // Reinsert sender, if it still wants to send
+          if (*dev.readyToSend != No) sendersTop++;
+          // Determine out-edge array for sender
+          if (pin == HostPin)
+            outEdge = outHost;
+          else
+            outEdge = (POutEdge*) &outTableBase[devices[src].pinBase[pin-2]];
+        }
+        else {
+          #ifdef POLITE_COUNT_MSGS
+          blockedSends++;
+          #endif
+          tinselWaitUntil(TINSEL_CAN_SEND|TINSEL_CAN_RECV);
+        }
+      }
+      else {
+        // Idle detection
+        int idle = tinselIdle(!active);
+        if (idle > 1) break;
+        else if (idle) {
+          active = false;
+          for (uint32_t i = 0; i < numDevices; i++) {
+            DeviceType dev = getDevice(i);
+            // Invoke the step handler for each device
+            active = dev.step() || active;
+            // Device ready to send?
+            if (*dev.readyToSend != No) {
+              *(sendersTop++) = i;
+            }
+          }
+          time++;
+        }
+      }
+
+      // Step 2: try to receive
+      while (tinselCanRecv()) {
+        PMessage<M>* inMsg = (PMessage<M>*) tinselRecv();
+        PInEdge<E>* inEdge = &inTableBase[inMsg->key];
+        while (inEdge->devId != InvalidLocalDevId) {
+          // Lookup destination device
+          PLocalDeviceId id = inEdge->devId;
+          DeviceType dev = getDevice(id);
+          // Was it ready to send?
+          PPin oldReadyToSend = *dev.readyToSend;
+          // Invoke receive handler
+          dev.recv(&inMsg->payload, &inEdge->edge);
+          // Insert device into a senders array, if not already there
+          if (*dev.readyToSend != No && oldReadyToSend == No)
+            *(sendersTop++) = id;
+          inEdge++;
+          #ifdef POLITE_COUNT_MSGS
+          msgsReceived++;
+          #endif
+        }
+        tinselFree(inMsg);
+      }
+    }
+
+    // Termination
+    #ifdef POLITE_DUMP_STATS
+      dumpStats();
+    #endif
+
+    // Invoke finish handler for each device
+    for (uint32_t i = 0; i < numDevices; i++) {
+      DeviceType dev = getDevice(i);
+      tinselWaitUntil(TINSEL_CAN_SEND);
+      PMessage<M>* m = (PMessage<M>*) tinselSendSlot();
+      if (dev.finish(&m->payload)) tinselSend(tinselHostId(), m);
+    }
+
+    // Sleep
+    tinselWaitUntil(TINSEL_CAN_RECV); while (1);
+  }
+
+  #endif
+
+};
+
+#endif
diff --git a/include/POLite/Local/PGraph.h b/include/POLite/Local/PGraph.h
new file mode 100644
index 00000000..4181c3da
--- /dev/null
+++ b/include/POLite/Local/PGraph.h
@@ -0,0 +1,866 @@
+// SPDX-License-Identifier: BSD-2-Clause
+#ifndef _PGRAPH_H_
+#define _PGRAPH_H_
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+#include <HostLink.h>
+#include <config.h>
+#include <POLite.h>
+#include <POLite/Seq.h>
+#include <POLite/Graph.h>
+#include <POLite/Placer.h>
+#include <type_traits>
+#include "Seq.h"
+
+// Nodes of a POETS graph are devices
+typedef NodeId PDeviceId;
+
+// This structure holds a group of receiving edges on a thread.
+// All of the edges originate from the same output pin.
+template <typename E> struct PReceiverGroup {
+  // Thread id where all the receivers reside
+  uint32_t threadId;
+  // A sequence of receiving devices on that thread
+  Seq<PInEdge<E>>* receivers;
+};
+
+// POETS graph
+template <typename DeviceType,
+          typename S, typename E, typename M> class PGraph {
+ private:
+  // Align address to 2^n byte boundary
+  inline uint32_t align(uint32_t n, uint32_t addr) {
+    if ((addr & (1<<n)-1) == 0) return addr;
+    return ((addr >> n) + 1) << n;
+  }
+
+  // Align address to 32-bit word boundary
+  uint32_t wordAlign(uint32_t addr) { return align(2, addr); }
+
+  // Align address to cache-line boundary
+  uint32_t cacheAlign(uint32_t addr) {
+    return align(TinselLogBytesPerLine, addr);
+  }
+
+  // Helper function
+  inline uint32_t min(uint32_t x, uint32_t y) { return x < y ? x : y; }
+
+  // Number of FPGA boards available
+  uint32_t meshLenX;
+  uint32_t meshLenY;
+
+  // Number of FPGA boards to use
+  uint32_t numBoardsX;
+  uint32_t numBoardsY;
+
+  // Multicast routing tables:
+  // Sequence of outgoing edges for every (device, pin) pair
+  Seq<POutEdge>*** outTable;
+  // Sequence of incoming edges for every thread
+  Seq<PInEdge<E>>** inTable;
+
+  // Generic constructor
+  void constructor(uint32_t lenX, uint32_t lenY) {
+    meshLenX = lenX;
+    meshLenY = lenY;
+    char* str = getenv("POLITE_BOARDS_X");
+    int nx = str ? atoi(str) : meshLenX;
+    str = getenv("POLITE_BOARDS_Y");
+    int ny = str ? atoi(str) : meshLenY;
+    setNumBoards(nx, ny);
+    numDevices = 0;
+    devices = NULL;
+    toDeviceAddr = NULL;
+    numDevicesOnThread = NULL;
+    fromDeviceAddr = NULL;
+    vertexMem = NULL;
+    vertexMemSize = NULL;
+    vertexMemBase = NULL;
+    inEdgeMem = NULL;
+    inEdgeMemSize = NULL;
+    inEdgeMemBase = NULL;
+    outEdgeMem = NULL;
+    outEdgeMemSize = NULL;
+    outEdgeMemBase = NULL;
+    mapVerticesToDRAM = false;
+    mapInEdgesToDRAM = true;
+    mapOutEdgesToDRAM = true;
+    outTable = NULL;
+    inTable = NULL;
+    chatty = 0;
+  }
+
+ public:
+  // Number of devices
+  uint32_t numDevices;
+
+  // Graph containing device ids and connections
+  Graph graph;
+
+  // Edge labels: has same structure as graph.outgoing
+  Seq<Seq<E>*> edgeLabels;
+
+  // Mapping from device id to device state
+  // (Not valid until the mapper is called)
+  PState<S>** devices;
+
+  // Mapping from thread id to number of devices on that thread
+  // (Not valid until the mapper is called)
+  uint32_t* numDevicesOnThread;
+
+  // Mapping from device id to device address and back
+  // (Not valid until the mapper is called)
+  PDeviceAddr* toDeviceAddr;  // Device id -> device address
+  PDeviceId** fromDeviceAddr; // Device address -> device id
+
+  // Each thread's vertex mem and thread mem regions
+  // (Not valid until the mapper is called)
+  uint8_t** vertexMem;      uint8_t** threadMem;
+  uint32_t* vertexMemSize;  uint32_t* threadMemSize;
+  uint32_t* vertexMemBase;  uint32_t* threadMemBase;
+
+  // Each thread's in-edge and out-edge regions
+  // (Not valid until the mapper is called)
+  uint8_t** inEdgeMem;      uint8_t** outEdgeMem;
+  uint32_t* inEdgeMemSize;  uint32_t* outEdgeMemSize;
+  uint32_t* inEdgeMemBase;  uint32_t* outEdgeMemBase;
+
+  // Where to map the various regions
+  // (If false, map to SRAM instead)
+  bool mapVerticesToDRAM;
+  bool mapInEdgesToDRAM;
+  bool mapOutEdgesToDRAM;
+
+  // Allow mapper to print useful information to stdout
+  uint32_t chatty;
+
+  // Setter for number of boards to use (exits if more than available)
+  void setNumBoards(uint32_t x, uint32_t y) {
+    if (x > meshLenX || y > meshLenY) {
+      printf("Mapper: %d x %d boards requested, %d x %d available\n",
+        x, y, meshLenX, meshLenY);
+      exit(EXIT_FAILURE);
+    }
+    numBoardsX = x;
+    numBoardsY = y;
+  }
+
+  // Create new device
+  inline PDeviceId newDevice() {
+    edgeLabels.append(new SmallSeq<E>);
+    numDevices++;
+    return graph.newNode();
+  }
+
+  // Add a connection between devices
+  inline void addEdge(PDeviceId from, PinId pin, PDeviceId to) {
+    if (pin >= POLITE_NUM_PINS) {
+      printf("addEdge: pin exceeds POLITE_NUM_PINS\n");
+      exit(EXIT_FAILURE);
+    }
+    graph.addEdge(from, pin, to);
+    E edge;
+    edgeLabels.elems[from]->append(edge);
+  }
+
+  // Add labelled edge using given output pin
+  void addLabelledEdge(E edge, PDeviceId x, PinId pin, PDeviceId y) {
+    graph.addEdge(x, pin, y);
+    edgeLabels.elems[x]->append(edge);
+  }
+
+  // Allocate SRAM and DRAM partitions
+  void allocatePartitions() {
+    // Decide a maximum partition size that is reasonable
+    // SRAM: Partition size minus 2048 bytes for the stack
+    uint32_t maxSRAMSize = (1<<TinselLogBytesPerSRAMPartition) - 2048;
+    // DRAM: Partition size minus 65536 bytes for the stack
+    uint32_t maxDRAMSize = (1<<TinselLogBytesPerDRAMPartition) - 65536;
+    // Allocate partition sizes and bases
+    vertexMem = (uint8_t**) calloc(TinselMaxThreads, sizeof(uint8_t*));
+    vertexMemSize = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
+    vertexMemBase = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
+    threadMem = (uint8_t**) calloc(TinselMaxThreads, sizeof(uint8_t*));
+    threadMemSize = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
+    threadMemBase = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
+    inEdgeMem = (uint8_t**) calloc(TinselMaxThreads, sizeof(uint8_t*));
+    inEdgeMemSize = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
+    inEdgeMemBase = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
+    outEdgeMem = (uint8_t**) calloc(TinselMaxThreads, sizeof(uint8_t*));
+    outEdgeMemSize = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
+    outEdgeMemBase = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
+    // Compute partition sizes for each thread
+    for (uint32_t threadId = 0; threadId < TinselMaxThreads; threadId++) {
+      // This variable is used to count the size of the *initialised*
+      // partition.  The total partition size is larger as it includes
+      // uninitialised portions.
+      uint32_t sizeVMem = 0;
+      uint32_t sizeEIMem = 0;
+      uint32_t sizeEOMem = 0;
+      uint32_t sizeTMem = 0;
+      // Add space for thread structure (always stored in SRAM)
+      sizeTMem = cacheAlign(sizeof(PThread<DeviceType, S, E, M>));
+      // Add space for devices
+      uint32_t numDevs = numDevicesOnThread[threadId];
+      for (uint32_t devNum = 0; devNum < numDevs; devNum++) {
+        // Add space for device
+        sizeVMem = sizeVMem + sizeof(PState<S>);
+      }
+      // Add space for incoming edge table
+      if (inTable[threadId]) {
+        sizeEIMem = inTable[threadId]->numElems * sizeof(PInEdge<E>);
+        sizeEIMem = wordAlign(sizeEIMem);
+      }
+      // Add space for outgoing edge table
+      for (uint32_t devNum = 0; devNum < numDevs; devNum++) {
+        PDeviceId id = fromDeviceAddr[threadId][devNum];
+        for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) {
+          Seq<POutEdge>* edges = outTable[id][p];
+          sizeEOMem += sizeof(POutEdge) * edges->numElems;
+        }
+      }
+      sizeEOMem = wordAlign(sizeEOMem);
+      // The total partition size including uninitialised portions
+      uint32_t totalSizeVMem =
+        sizeVMem + wordAlign(sizeof(PLocalDeviceId) * numDevs);
+      // Check that total size is reasonable
+      uint32_t totalSizeSRAM = sizeTMem;
+      uint32_t totalSizeDRAM = 0;
+      if (mapVerticesToDRAM) totalSizeDRAM += totalSizeVMem;
+                        else totalSizeSRAM += totalSizeVMem;
+      if (mapInEdgesToDRAM)  totalSizeDRAM += sizeEIMem;
+                        else totalSizeSRAM += sizeEIMem;
+      if (mapOutEdgesToDRAM) totalSizeDRAM += sizeEOMem;
+                        else totalSizeSRAM += sizeEOMem;
+      if (totalSizeDRAM > maxDRAMSize) {
+        printf("Error: max DRAM partition size exceeded\n");
+        exit(EXIT_FAILURE);
+      }
+      if (totalSizeSRAM > maxSRAMSize) {
+        printf("Error: max SRAM partition size exceeded\n");
+        exit(EXIT_FAILURE);
+      }
+      // Allocate space for the initialised portion of the partition
+      assert((sizeVMem%4) == 0);
+      assert((sizeTMem%4) == 0);
+      assert((sizeEIMem%4) == 0);
+      assert((sizeEOMem%4) == 0);
+      vertexMem[threadId] = (uint8_t*) calloc(sizeVMem, 1);
+      vertexMemSize[threadId] = sizeVMem;
+      threadMem[threadId] = (uint8_t*) calloc(sizeTMem, 1);
+      threadMemSize[threadId] = sizeTMem;
+      inEdgeMem[threadId] = (uint8_t*) calloc(sizeEIMem, 1);
+      inEdgeMemSize[threadId] = sizeEIMem;
+      outEdgeMem[threadId] = (uint8_t*) calloc(sizeEOMem, 1);
+      outEdgeMemSize[threadId] = sizeEOMem;
+      // Tinsel address of base of partition
+      uint32_t partId = threadId & (TinselThreadsPerDRAM-1);
+      uint32_t sramBase = (1 << TinselLogBytesPerSRAM) +
+          (partId << TinselLogBytesPerSRAMPartition);
+      uint32_t dramBase = TinselBytesPerDRAM -
+          ((partId+1) << TinselLogBytesPerDRAMPartition);
+      // Use partition-interleaved region for DRAM
+      dramBase |= 0x80000000;
+      threadMemBase[threadId] = sramBase;
+      sramBase += threadMemSize[threadId];
+      // Determine base addresses of each region
+      if (mapVerticesToDRAM) {
+        vertexMemBase[threadId] = dramBase;
+        dramBase += totalSizeVMem;
+      }
+      else {
+        vertexMemBase[threadId] = sramBase;
+        sramBase += totalSizeVMem;
+      }
+      if (mapInEdgesToDRAM) {
+        inEdgeMemBase[threadId] = dramBase;
+        dramBase += sizeEIMem;
+      }
+      else {
+        inEdgeMemBase[threadId] = sramBase;
+        sramBase += sizeEIMem;
+      }
+      if (mapOutEdgesToDRAM) {
+        outEdgeMemBase[threadId] = dramBase;
+        dramBase += sizeEOMem;
+      }
+      else {
+        outEdgeMemBase[threadId] = sramBase;
+        sramBase += sizeEOMem;
+      }
+    }
+  }
+
+  // Initialise partitions
+  void initialisePartitions() {
+    for (uint32_t threadId = 0; threadId < TinselMaxThreads; threadId++) {
+      // Next pointers for each partition
+      uint32_t nextVMem = 0;
+      uint32_t nextOutIndex = 0;
+      // Pointer to thread structure
+      PThread<DeviceType, S, E, M>* thread =
+        (PThread<DeviceType, S, E, M>*) &threadMem[threadId][0];
+      // Set number of devices on thread
+      thread->numDevices = numDevicesOnThread[threadId];
+      // Set number of devices in graph
+      thread->numVertices = numDevices;
+      // Set tinsel address of array of device states
+      thread->devices = vertexMemBase[threadId];
+      // Set tinsel address of base of edge tables
+      thread->outTableBase = outEdgeMemBase[threadId];
+      thread->inTableBase = inEdgeMemBase[threadId];
+      // Add space for each device on thread
+      uint32_t numDevs = numDevicesOnThread[threadId];
+      for (uint32_t devNum = 0; devNum < numDevs; devNum++) {
+        PState<S>* dev = (PState<S>*) &vertexMem[threadId][nextVMem];
+        PDeviceId id = fromDeviceAddr[threadId][devNum];
+        devices[id] = dev;
+        // Add space for device
+        nextVMem = nextVMem + sizeof(PState<S>);
+      }
+      // Initialise each device and the thread's out edges
+      for (uint32_t devNum = 0; devNum < numDevs; devNum++) {
+        PDeviceId id = fromDeviceAddr[threadId][devNum];
+        PState<S>* dev = devices[id];
+        // Initialise
+        POutEdge* outEdgeArray = (POutEdge*) outEdgeMem[threadId];
+        for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) {
+          dev->pinBase[p] = nextOutIndex;
+          Seq<POutEdge>* edges = outTable[id][p];
+          for (uint32_t i = 0; i < edges->numElems; i++) {
+            outEdgeArray[nextOutIndex] = edges->elems[i];
+            nextOutIndex++;
+          }
+        }
+      }
+      // Initialise thread's in edges
+      PInEdge<E>* inEdgeArray = (PInEdge<E>*) inEdgeMem[threadId];
+      Seq<PInEdge<E>>* edges = inTable[threadId];
+      if (edges)
+        for (uint32_t i = 0; i < edges->numElems; i++) {
+          inEdgeArray[i] = edges->elems[i];
+        }
+      // At this point, check that next pointers line up with heap sizes
+      if (nextVMem != vertexMemSize[threadId]) {
+        printf("Error: vertex mem size does not match pre-computed size\n");
+        exit(EXIT_FAILURE);
+      }
+      if ((nextOutIndex * sizeof(POutEdge)) != outEdgeMemSize[threadId]) {
+        printf("Error: out edge mem size does not match pre-computed size\n");
+        exit(EXIT_FAILURE);
+      }
+      // Set tinsel address of senders array
+      thread->senders = vertexMemBase[threadId] + nextVMem;
+    }
+  }
+
+  // Allocate mapping structures
+  void allocateMapping() {
+    devices = (PState<S>**) calloc(numDevices, sizeof(PState<S>*));
+    toDeviceAddr = (PDeviceAddr*) calloc(numDevices, sizeof(PDeviceAddr));
+    fromDeviceAddr = (PDeviceId**) calloc(TinselMaxThreads, sizeof(PDeviceId*));
+    numDevicesOnThread = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
+  }
+
+  // Allocate routing tables
+  // (Only valid after mapper is called)
+  void allocateRoutingTables() {
+    // Receiver-side tables
+    inTable = (Seq<PInEdge<E>>**)
+      calloc(TinselMaxThreads,sizeof(Seq<PInEdge<E>>*));
+    for (uint32_t t = 0; t < TinselMaxThreads; t++) {
+      if (numDevicesOnThread[t] != 0)
+        inTable[t] = new SmallSeq<PInEdge<E>>;
+    }
+
+    // Sender-side tables
+    outTable = (Seq<POutEdge>***) calloc(numDevices, sizeof(Seq<POutEdge>**));
+    for (uint32_t d = 0; d < numDevices; d++) {
+      outTable[d] = (Seq<POutEdge>**)
+        calloc(POLITE_NUM_PINS, sizeof(Seq<POutEdge>*));
+      for (uint32_t p = 0; p < POLITE_NUM_PINS; p++)
+        outTable[d][p] = new SmallSeq<POutEdge>;
+    }
+  }
+
+  // Pack a receivers array
+  // Input: an in-edge sequence for each thread in a mailbox.
+  // Input array may contain lots of holes (0-element sequences)
+  // Output: a sequence of receiver groups
+  // Output array contains no empty receiver groups
+  void createReceiverGroups(
+        uint32_t mbox,
+        Seq<PInEdge<E>>* receivers,
+        Seq<PReceiverGroup<E>>* groups) {
+    groups->clear();
+    for (uint32_t i = 0; i < 64; i++) {
+      if (receivers[i].numElems > 0) {
+        // Add receiver group
+        PReceiverGroup<E> g;
+        g.threadId = (mbox << TinselLogThreadsPerMailbox) | i;
+        g.receivers = &receivers[i];
+        groups->append(g);
+      }
+    }
+  }
+
+  // Determine routing key for given set of receivers
+  // (The key must be the same for all receivers)
+  uint32_t findKey(Seq<PReceiverGroup<E>>* receivers) { 
+    uint32_t key = 0;
+
+    bool found = false;
+    while (!found) {
+      found = true; 
+      for (uint32_t i = 0; i < receivers->numElems; i++) {
+        PReceiverGroup<E> g = receivers->elems[i];
+        uint32_t numReceivers = g.receivers->numElems;
+        if (numReceivers > 0) {
+          // Lookup thread id of receiver
+          uint32_t t = g.threadId;
+          // Lookup table size for this thread
+          uint32_t tableSize = inTable[t]->numElems;
+          // Move to next receiver when we find a space
+          if (key >= tableSize) continue;
+          // Is there space at the current key?
+          // (Need space for numReceivers plus null terminator)
+          bool space = true;
+          for (int j = 0; j < numReceivers+1; j++) {
+            if ((key+j) >= tableSize) break;
+            if (inTable[t]->elems[key+j].devId != UnusedLocalDevId) {
+              found = false;
+              key = key+j+1;
+              break;
+            }
+          }
+        }
+      }
+    }
+    return key;
+  }
+
+  // Add entries to the input tables for the given receivers
+  // (Only valid after mapper is called)
+  uint32_t addInTableEntries(Seq<PReceiverGroup<E>>* receivers) {
+    uint32_t key = findKey(receivers);
+    if (key >= 0xfffe) {
+      printf("Routing key exceeds 16 bits\n");
+      exit(EXIT_FAILURE);
+    }
+    PInEdge<E> null, unused;
+    null.devId = InvalidLocalDevId;
+    unused.devId = UnusedLocalDevId;
+    // Now that a key with sufficient space has been found, populate the tables
+    for (uint32_t i = 0; i < receivers->numElems; i++) {
+      PReceiverGroup<E> g = receivers->elems[i];
+      uint32_t numReceivers = g.receivers->numElems;
+      if (numReceivers > 0) {
+        // Lookup thread id of receiver
+        uint32_t t = g.threadId;
+        // Lookup table size for this thread
+        uint32_t tableSize = inTable[t]->numElems;
+        // Make sure inTable is big enough for new entries
+        for (uint32_t j = tableSize; j < (key+numReceivers+1); j++)
+          inTable[t]->append(unused);
+        // Add receivers to thread's inTable
+        for (uint32_t j = 0; j < numReceivers; j++) {
+          inTable[t]->elems[key+j] = g.receivers->elems[j];
+        }
+        inTable[t]->elems[key+numReceivers] = null;
+      }
+    }
+    return key;
+  }
+
+  // Compute routing tables
+  // (Only valid after mapper is called)
+  void computeRoutingTables() {
+    // Routing table stats
+    uint64_t totalOutEdges = 0;
+
+    // Sequence of local device ids, for each multicast destination
+    SmallSeq<PInEdge<E>> receivers[64];
+
+    // Sequence of receiver groups
+    // (A more compact representation of the receivers array)
+    SmallSeq<PReceiverGroup<E>> groups;
+
+    // For each device
+    for (uint32_t d = 0; d < numDevices; d++) {
+      // For each pin
+      for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) {
+        Seq<PDeviceId> dests = *(graph.outgoing->elems[d]);
+        Seq<E> edges = *(edgeLabels.elems[d]);
+        // While destinations are remaining
+        while (dests.numElems > 0) {
+          // Clear receivers
+          for (uint32_t i = 0; i < 64; i++) receivers[i].clear();
+          uint32_t threadMaskLow = 0;
+          uint32_t threadMaskHigh = 0;
+          // Current mailbox being considered
+          PDeviceAddr mbox = getThreadId(toDeviceAddr[dests.elems[0]]) >>
+                               TinselLogThreadsPerMailbox;
+          // For each destination
+          uint32_t destsRemaining = 0;
+          for (uint32_t i = 0; i < dests.numElems; i++) {
+            // Determine destination mailbox address and mailbox-local thread
+            PDeviceId destId = dests.elems[i];
+            PDeviceAddr destAddr = toDeviceAddr[destId];
+            uint32_t destMailbox = getThreadId(destAddr) >>
+                                     TinselLogThreadsPerMailbox;
+            uint32_t destThread = getThreadId(destAddr) &
+                                     ((1<<TinselLogThreadsPerMailbox)-1);
+            // Does destination match current destination?
+            if (destMailbox == mbox) {
+              PInEdge<E> edge;
+              edge.devId = getLocalDeviceId(destAddr);
+              if (! std::is_same<E, None>::value) edge.edge = edges.elems[i];
+              receivers[destThread].append(edge);
+              if (destThread < 32) threadMaskLow |= 1 << destThread;
+              if (destThread >= 32) threadMaskHigh |= 1 << (destThread-32);
+            }
+            else {
+              // Add destination back into sequence
+              dests.elems[destsRemaining] = dests.elems[i];
+              edges.elems[destsRemaining] = edges.elems[i];
+              destsRemaining++;
+            }
+          }
+          // Create receiver groups
+          createReceiverGroups(mbox, receivers, &groups);
+          // Add input table entries
+          uint32_t key = addInTableEntries(&groups);
+          // Add output table entry
+          POutEdge edge;
+          edge.mbox = mbox;
+          edge.key = key;
+          edge.threadMaskLow = threadMaskLow;
+          edge.threadMaskHigh = threadMaskHigh;
+          outTable[d][p]->append(edge);
+          // Prepare for new output table entry
+          dests.numElems = destsRemaining;
+          edges.numElems = destsRemaining;
+          totalOutEdges++;
+        }
+        // Add output edge terminator
+        POutEdge term;
+        term.key = InvalidKey;
+        outTable[d][p]->append(term);
+      }
+    }
+    //printf("Average edges per pin: %lu\n",
+    //  totalOutEdges / (numDevices * POLITE_NUM_PINS);
+  }  
+
+  // Release all structures
+  void releaseAll() {
+    if (devices != NULL) {
+      free(devices);
+      free(toDeviceAddr);
+      free(numDevicesOnThread);
+      for (uint32_t t = 0; t < TinselMaxThreads; t++)
+        if (fromDeviceAddr[t] != NULL) free(fromDeviceAddr[t]);
+      free(fromDeviceAddr);
+      for (uint32_t t = 0; t < TinselMaxThreads; t++)
+        if (vertexMem[t] != NULL) free(vertexMem[t]);
+      free(vertexMem);
+      free(vertexMemSize);
+      free(vertexMemBase);
+      for (uint32_t t = 0; t < TinselMaxThreads; t++)
+        if (threadMem[t] != NULL) free(threadMem[t]);
+      free(threadMem);
+      free(threadMemSize);
+      free(threadMemBase);
+      for (uint32_t t = 0; t < TinselMaxThreads; t++)
+        if (inEdgeMem[t] != NULL) free(inEdgeMem[t]);
+      free(inEdgeMem);
+      free(inEdgeMemSize);
+      free(inEdgeMemBase);
+      for (uint32_t t = 0; t < TinselMaxThreads; t++)
+        if (outEdgeMem[t] != NULL) free(outEdgeMem[t]);
+      free(outEdgeMem);
+      free(outEdgeMemSize);
+      free(outEdgeMemBase);
+    }
+    if (inTable != NULL) {
+      for (uint32_t t = 0; t < TinselMaxThreads; t++)
+        if (inTable[t] != NULL) delete inTable[t];
+      free(inTable);
+      inTable = NULL;
+    }
+    if (outTable != NULL) {
+      for (uint32_t d = 0; d < numDevices; d++) {
+        if (outTable[d] == NULL) continue;
+        for (uint32_t p = 0; p < POLITE_NUM_PINS; p++)
+          delete outTable[d][p];
+        free(outTable[d]);
+      }
+      free(outTable);
+      outTable = NULL;
+    }
+  }
+
+  // Implement mapping to tinsel threads
+  void map() {
+    // Let's measure some times
+    struct timeval placementStart, placementFinish;
+    struct timeval routingStart, routingFinish;
+    struct timeval initStart, initFinish;
+
+    // Release all mapping and heap structures
+    releaseAll();
+
+    // Reallocate mapping structures
+    allocateMapping();
+
+    // Start placement timer
+    gettimeofday(&placementStart, NULL);
+
+    // Partition into subgraphs, one per board
+    Placer boards(&graph, numBoardsX, numBoardsY);
+
+    // Place subgraphs onto 2D mesh
+    const uint32_t placerEffort = 8;
+    boards.place(placerEffort);
+
+    // For each board
+    for (uint32_t boardY = 0; boardY < numBoardsY; boardY++) {
+      for (uint32_t boardX = 0; boardX < numBoardsX; boardX++) {
+        // Partition into subgraphs, one per mailbox
+        PartitionId b = boards.mapping[boardY][boardX];
+        Placer boxes(&boards.subgraphs[b], 
+                 TinselMailboxMeshXLen, TinselMailboxMeshYLen);
+        boxes.place(placerEffort);
+
+        // For each mailbox
+        for (uint32_t boxX = 0; boxX < TinselMailboxMeshXLen; boxX++) {
+          for (uint32_t boxY = 0; boxY < TinselMailboxMeshYLen; boxY++) {
+            // Partition into subgraphs, one per thread
+            uint32_t numThreads = 1<<TinselLogThreadsPerMailbox;
+            PartitionId t = boxes.mapping[boxY][boxX];
+            Placer threads(&boxes.subgraphs[t], numThreads, 1);
+
+            // For each thread
+            for (uint32_t threadNum = 0; threadNum < numThreads; threadNum++) {
+              // Determine tinsel thread id
+              uint32_t threadId = boardY;
+              threadId = (threadId << TinselMeshXBits) | boardX;
+              threadId = (threadId << TinselMailboxMeshYBits) | boxY;
+              threadId = (threadId << TinselMailboxMeshXBits) | boxX;
+              threadId = (threadId << (TinselLogCoresPerMailbox +
+                            TinselLogThreadsPerCore)) | threadNum;
+
+              // Get subgraph
+              Graph* g = &threads.subgraphs[threadNum];
+
+              // Populate fromDeviceAddr mapping
+              uint32_t numDevs = g->incoming->numElems;
+              numDevicesOnThread[threadId] = numDevs;
+              fromDeviceAddr[threadId] = (PDeviceId*)
+                malloc(sizeof(PDeviceId) * numDevs);
+              for (uint32_t devNum = 0; devNum < numDevs; devNum++)
+                fromDeviceAddr[threadId][devNum] = g->labels->elems[devNum];
+  
+              // Populate toDeviceAddr mapping
+              assert(numDevs < maxLocalDeviceId());
+              for (uint32_t devNum = 0; devNum < numDevs; devNum++) {
+                PDeviceAddr devAddr =
+                  makeDeviceAddr(threadId, devNum);
+                toDeviceAddr[g->labels->elems[devNum]] = devAddr;
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // Stop placement timer and start routing timer
+    gettimeofday(&placementFinish, NULL);
+    gettimeofday(&routingStart, NULL);
+
+    // Compute send and receive side routing tables
+    allocateRoutingTables();
+    computeRoutingTables();
+
+    // Stop routing timer and start init timer
+    gettimeofday(&routingFinish, NULL);
+    gettimeofday(&initStart, NULL);
+
+    // Reallocate and initialise heap structures
+    allocatePartitions();
+    initialisePartitions();
+
+    // Display times, if chatty
+    gettimeofday(&initFinish, NULL);
+    if (chatty > 0) {
+      struct timeval diff;
+
+      timersub(&placementFinish, &placementStart, &diff);
+      double duration = (double) diff.tv_sec +
+        (double) diff.tv_usec / 1000000.0;
+      printf("POLite mapper profile:\n");
+      printf("  Partitioning and placement: %lfs\n", duration);
+
+      timersub(&routingFinish, &routingStart, &diff);
+      duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
+      printf("  Routing table construction: %lfs\n", duration);
+
+      timersub(&initFinish, &initStart, &diff);
+      duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
+      printf("  Thread state initialisation: %lfs\n", duration);
+    }
+  }
+
+  // Constructor
+  PGraph() {
+    char* str = getenv("HOSTLINK_BOXES_X");
+    int x = str ? atoi(str) : 1;
+    x = x * TinselMeshXLenWithinBox;
+    str = getenv("HOSTLINK_BOXES_Y");
+    int y = str ? atoi(str) : 1;
+    y = y * TinselMeshYLenWithinBox;
+    constructor(x, y);
+  }
+  PGraph(uint32_t numBoxesX, uint32_t numBoxesY) {
+    int x = numBoxesX * TinselMeshXLenWithinBox; 
+    int y = numBoxesY * TinselMeshYLenWithinBox;
+    constructor(x, y);
+  }
+
+  // Deconstructor
+  ~PGraph() {
+    releaseAll();
+    for (uint32_t i = 0; i < edgeLabels.numElems; i++)
+      delete edgeLabels.elems[i];
+  }
+
+  // Write partition to tinsel machine
+  void writeRAM(HostLink* hostLink,
+         uint8_t** heap, uint32_t* heapSize, uint32_t* heapBase) {
+    // Number of bytes written by each thread
+    uint32_t* writeCount = (uint32_t*)
+      calloc(TinselMaxThreads, sizeof(uint32_t));
+
+    // Number of threads completed by each core
+    uint32_t*** threadCount = (uint32_t***)
+      calloc(meshLenX, sizeof(uint32_t**));
+    for (uint32_t x = 0; x < meshLenX; x++) {
+      threadCount[x] = (uint32_t**)
+        calloc(meshLenY, sizeof(uint32_t*));
+      for (uint32_t y = 0; y < meshLenY; y++)
+        threadCount[x][y] = (uint32_t*)
+          calloc(TinselCoresPerBoard, sizeof(uint32_t));
+    }
+
+    // Initialise write addresses
+    for (int x = 0; x < meshLenX; x++)
+      for (int y = 0; y < meshLenY; y++)
+        for (int c = 0; c < TinselCoresPerBoard; c++)
+          hostLink->setAddr(x, y, c, heapBase[hostLink->toAddr(x, y, c, 0)]);
+
+    // Write heaps
+    uint32_t done = false;
+    while (! done) {
+      done = true;
+      for (int x = 0; x < meshLenX; x++) {
+        for (int y = 0; y < meshLenY; y++) {
+          for (int c = 0; c < TinselCoresPerBoard; c++) {
+            uint32_t t = threadCount[x][y][c];
+            if (t < TinselThreadsPerCore) {
+              done = false;
+              uint32_t threadId = hostLink->toAddr(x, y, c, t);
+              uint32_t written = writeCount[threadId];
+              if (written == heapSize[threadId]) {
+                threadCount[x][y][c] = t+1;
+                if ((t+1) < TinselThreadsPerCore)
+                  hostLink->setAddr(x, y, c,
+                    heapBase[hostLink->toAddr(x, y, c, t+1)]);
+              } else {
+                uint32_t send = min((heapSize[threadId] - written)>>2, 15);
+                hostLink->store(x, y, c, send,
+                  (uint32_t*) &heap[threadId][written]);
+                writeCount[threadId] = written + send * sizeof(uint32_t);
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // Release memory
+    free(writeCount);
+    for (uint32_t x = 0; x < meshLenX; x++) {
+      for (uint32_t y = 0; y < meshLenY; y++)
+        free(threadCount[x][y]);
+      free(threadCount[x]);
+    }
+    free(threadCount);
+  }
+
+  // Write graph to tinsel machine
+  void write(HostLink* hostLink) { 
+    // Start timer
+    struct timeval start, finish;
+    gettimeofday(&start, NULL);
+
+    bool useSendBufferOld = hostLink->useSendBuffer;
+    hostLink->useSendBuffer = true;
+    writeRAM(hostLink, vertexMem, vertexMemSize, vertexMemBase);
+    writeRAM(hostLink, threadMem, threadMemSize, threadMemBase);
+    writeRAM(hostLink, inEdgeMem, inEdgeMemSize, inEdgeMemBase);
+    writeRAM(hostLink, outEdgeMem, outEdgeMemSize, outEdgeMemBase);
+    hostLink->flush();
+    hostLink->useSendBuffer = useSendBufferOld;
+
+    // Display time if chatty
+    gettimeofday(&finish, NULL);
+    if (chatty > 0) {
+      struct timeval diff;
+      timersub(&finish, &start, &diff);
+      double duration = (double) diff.tv_sec +
+        (double) diff.tv_usec / 1000000.0;
+      printf("POLite graph upload time: %lfs\n", duration);
+    }
+  }
+
+  // Determine fan-in of given device
+  uint32_t fanIn(PDeviceId id) {
+    return graph.fanIn(id);
+  }
+
+  // Determine fan-out of given device
+  uint32_t fanOut(PDeviceId id) {
+    return graph.fanOut(id);
+  }
+
+};
+
+// Read performance stats and store in file
+inline void politeSaveStats(HostLink* hostLink, const char* filename) {
+  #ifdef POLITE_DUMP_STATS
+  // Open file for performance counters
+  FILE* statsFile = fopen(filename, "wt");
+  if (statsFile == NULL) {
+    printf("Error creating stats file\n");
+    exit(EXIT_FAILURE);
+  }
+  uint32_t meshLenX = hostLink->meshXLen;
+  uint32_t meshLenY = hostLink->meshYLen;
+  // Number of caches
+  uint32_t numLines = meshLenX * meshLenY *
+                        TinselDCachesPerDRAM * TinselDRAMsPerBoard;
+  // Add on number of cores
+  numLines += meshLenX * meshLenY * TinselCoresPerBoard;
+  // Add on number of threads
+  #ifdef POLITE_COUNT_MSGS
+  numLines += meshLenX * meshLenY * TinselThreadsPerBoard;
+  #endif
+  hostLink->dumpStdOut(statsFile, numLines);
+  fclose(statsFile);
+  #endif
+}
+
+#endif

From 7b0529d03a16d7c16059c15657116a001022a84a Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mn416@jennings.cl.cam.ac.uk>
Date: Mon, 8 Jun 2020 08:30:36 +0000
Subject: [PATCH 70/78] POLite: fixes to local mapper

---
 include/POLite.h               | 8 ++++----
 include/POLite/Local/PDevice.h | 4 ++++
 include/POLite/Local/PGraph.h  | 5 ++++-
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/include/POLite.h b/include/POLite.h
index d1d5fbc6..735f8bd3 100644
--- a/include/POLite.h
+++ b/include/POLite.h
@@ -5,8 +5,8 @@
 #include <stdint.h>
 
 // Select default mapper
-#if !defined(POLITE_MAP_LOCAL) || \
-    !defined(POLITE_MAP_DIST)  || \
+#if !defined(POLITE_MAP_LOCAL) && \
+    !defined(POLITE_MAP_DIST)  && \
     !defined(POLITE_MAP_HYBRID)
   // Default mapper
   #define POLITE_MAP_HYBRID
@@ -22,13 +22,13 @@
     #include <POLite/Hybrid/PDevice.h>
   #endif
 #else
-  #if defined(POLITE_FAST_LOCAL)
+  #if defined(POLITE_MAP_LOCAL)
     #include <POLite/Local/PDevice.h>
     #include <POLite/Local/PGraph.h>
   #elif defined(POLITE_MAP_DIST)
     #include <POLite/Dist/PDevice.h>
     #include <POLite/Dist/PGraph.h>
-  #elif defined (POLITE_MAP_HYBRID)
+  #elif defined(POLITE_MAP_HYBRID)
     #include <POLite/Hybrid/PDevice.h>
     #include <POLite/Hybrid/PGraph.h>
   #endif
diff --git a/include/POLite/Local/PDevice.h b/include/POLite/Local/PDevice.h
index ca806a58..9408cfae 100644
--- a/include/POLite/Local/PDevice.h
+++ b/include/POLite/Local/PDevice.h
@@ -273,7 +273,9 @@ template <typename DeviceType,
           outEdge++;
         }
         else {
+          #ifdef POLITE_COUNT_MSGS
           blockedSends++;
+          #endif
           tinselWaitUntil(TINSEL_CAN_SEND|TINSEL_CAN_RECV);
         }
       }
@@ -298,7 +300,9 @@ template <typename DeviceType,
             ];
         }
         else {
+          #ifdef POLITE_COUNT_MSGS
           blockedSends++;
+          #endif
           tinselWaitUntil(TINSEL_CAN_SEND|TINSEL_CAN_RECV);
         }
       }
diff --git a/include/POLite/Local/PGraph.h b/include/POLite/Local/PGraph.h
index 4181c3da..5ded656a 100644
--- a/include/POLite/Local/PGraph.h
+++ b/include/POLite/Local/PGraph.h
@@ -13,7 +13,6 @@
 #include <POLite/Graph.h>
 #include <POLite/Placer.h>
 #include <type_traits>
-#include "Seq.h"
 
 // Nodes of a POETS graph are devices
 typedef NodeId PDeviceId;
@@ -91,6 +90,10 @@ template <typename DeviceType,
     outTable = NULL;
     inTable = NULL;
     chatty = 0;
+    str = getenv("POLITE_CHATTY");
+    if (str != NULL) {
+      chatty = !strcmp(str, "0") ? 0 : 1;
+    }
   }
 
  public:

From a29ec6cfb530be465e411a094c95945d8dfe7eda Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mn416@jennings.cl.cam.ac.uk>
Date: Wed, 10 Jun 2020 14:38:58 +0000
Subject: [PATCH 71/78] New BFS-based partitioner

This is faster than METIS and gives similar results for a few graphs
that I've tried.  It works by picking an unvisited vertex, and then
doing a size-bounded BFS from that vertex with the size-bound equal
to the partition size.  This process is repeated until the partition
is full, at which point we move to a new partition and repeat the
process until all vertices have been visited.  It is just a few lines
of code, and could be easily parallelised in future.
---
 include/POLite/Placer.h | 57 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/include/POLite/Placer.h b/include/POLite/Placer.h
index 4178af50..d2f2378a 100644
--- a/include/POLite/Placer.h
+++ b/include/POLite/Placer.h
@@ -5,6 +5,7 @@
 #include <stdint.h>
 #include <metis.h>
 #include <POLite/Graph.h>
+#include <queue>
 
 typedef uint32_t PartitionId;
 
@@ -15,7 +16,8 @@ struct Placer {
     Default,
     Metis,
     Random,
-    Direct
+    Direct,
+    BFS
   };
   const Method defaultMethod=Metis;
 
@@ -64,6 +66,8 @@ struct Placer {
         method=Random;
       else if (!strcmp(e, "direct"))
         method=Direct;
+      else if (!strcmp(e, "bfs"))
+        method=BFS;
       else if (!strcmp(e, "default") || *e == '\0')
         method=Default;
       else {
@@ -174,6 +178,54 @@ struct Placer {
     }
   }
 
+  // Partition the graph using repeated BFS
+  void partitionBFS() {
+    uint32_t numVertices = graph->incoming->numElems;
+    uint32_t numParts = width * height;
+    uint32_t partSize = (numVertices + numParts) / numParts;
+
+    // Visited bit for each vertex
+    bool* seen = new bool [numVertices];
+    memset(seen, 0, numVertices);
+
+    // Next vertex to visit
+    uint32_t nextUnseen = 0;
+
+    // Next partition id
+    uint32_t nextPart = 0;
+
+    while (nextUnseen < numVertices) {
+      // Frontier
+      std::queue<uint32_t> frontier;
+      uint32_t count = 0;
+
+      while (nextUnseen < numVertices && count < partSize) {
+        // Sized-bounded BFS from nextUnseen
+        frontier.push(nextUnseen);
+        while (count < partSize && !frontier.empty()) {
+          uint32_t v = frontier.front();
+          frontier.pop();
+          if (!seen[v]) {
+            seen[v] = true;
+            partitions[v] = nextPart;
+            count++;
+            // Add unvisited neighbours of v to the frontier
+            Seq<uint32_t>* dests = graph->outgoing->elems[v];
+            for (uint32_t i = 0; i < dests->numElems; i++) {
+              uint32_t w = dests->elems[i];
+              if (!seen[w]) frontier.push(w);
+            }
+          }
+        }
+        while (nextUnseen < numVertices && seen[nextUnseen]) nextUnseen++;
+      }
+
+      nextPart++;
+    }
+
+    delete [] seen;
+  }
+
   void partition()
   {
     switch(method){
@@ -187,6 +239,9 @@ struct Placer {
     case Direct:
       partitionDirect();
       break;
+    case BFS:
+      partitionBFS();
+      break;
     }
   }
 

From cb7f663a28298d740478bab8ae499797b5444a64 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Tue, 23 Jun 2020 09:12:40 +0100
Subject: [PATCH 72/78] Attempt to unify POLite mappers

Compiles but untested...
---
 include/POLite.h                     |  28 +-
 include/POLite/Bitmap.h              |  58 ++
 include/POLite/Dist/PDevice.h        | 302 ----------
 include/POLite/Dist/PGraph.h         | 708 ----------------------
 include/POLite/Hybrid/PDevice.h      | 321 ----------
 include/POLite/Hybrid/PGraph.h       | 854 ---------------------------
 include/POLite/{Local => }/PDevice.h |  91 +--
 include/POLite/{Local => }/PGraph.h  | 529 +++++++++++------
 include/tinsel-interface.h           |   9 +
 include/tinsel.h                     |   6 +-
 10 files changed, 469 insertions(+), 2437 deletions(-)
 create mode 100644 include/POLite/Bitmap.h
 delete mode 100644 include/POLite/Dist/PDevice.h
 delete mode 100644 include/POLite/Dist/PGraph.h
 delete mode 100644 include/POLite/Hybrid/PDevice.h
 delete mode 100644 include/POLite/Hybrid/PGraph.h
 rename include/POLite/{Local => }/PDevice.h (80%)
 rename include/POLite/{Local => }/PGraph.h (62%)

diff --git a/include/POLite.h b/include/POLite.h
index 735f8bd3..f053e440 100644
--- a/include/POLite.h
+++ b/include/POLite.h
@@ -4,34 +4,12 @@
 
 #include <stdint.h>
 
-// Select default mapper
-#if !defined(POLITE_MAP_LOCAL) && \
-    !defined(POLITE_MAP_DIST)  && \
-    !defined(POLITE_MAP_HYBRID)
-  // Default mapper
-  #define POLITE_MAP_HYBRID
-#endif
-
 #ifdef TINSEL
   #include <tinsel.h>
-  #if defined(POLITE_MAP_LOCAL)
-    #include <POLite/Local/PDevice.h>
-  #elif defined(POLITE_MAP_DIST)
-    #include <POLite/Dist/PDevice.h>
-  #elif defined(POLITE_MAP_HYBRID)
-    #include <POLite/Hybrid/PDevice.h>
-  #endif
+  #include <POLite/PDevice.h>
 #else
-  #if defined(POLITE_MAP_LOCAL)
-    #include <POLite/Local/PDevice.h>
-    #include <POLite/Local/PGraph.h>
-  #elif defined(POLITE_MAP_DIST)
-    #include <POLite/Dist/PDevice.h>
-    #include <POLite/Dist/PGraph.h>
-  #elif defined(POLITE_MAP_HYBRID)
-    #include <POLite/Hybrid/PDevice.h>
-    #include <POLite/Hybrid/PGraph.h>
-  #endif
+  #include <POLite/PDevice.h>
+  #include <POLite/PGraph.h>
   #include <POLite/Seq.h>
   #include <POLite/Graph.h>
   #include <POLite/Placer.h>
diff --git a/include/POLite/Bitmap.h b/include/POLite/Bitmap.h
new file mode 100644
index 00000000..262f99af
--- /dev/null
+++ b/include/POLite/Bitmap.h
@@ -0,0 +1,58 @@
+#ifndef _BITMAP_H_
+#define _BITMAP_H_
+
+#include <stdint.h>
+#include <assert.h>
+#include <POLite/Seq.h>
+
+struct Bitmap {
+  // Bitmap contents (growable sequence of 64-bit words; absent words read as 0)
+  Seq<uint64_t>* contents;
+
+  // Index of first non-full word in bitmap (== numElems when all are full)
+  uint32_t firstFree;
+
+  // Constructor: start with an empty bitmap (initial Seq size argument is 16)
+  Bitmap() {
+    contents = new Seq<uint64_t> (16);
+    firstFree = 0;
+  }
+
+  // Destructor: release the word sequence
+  ~Bitmap() {
+    if (contents) delete contents;
+  }
+
+  // Get value of word at given index, return 0 if out-of-bounds
+  inline uint64_t getWord(uint32_t index) {
+    return index >= contents->numElems ? 0ul : contents->elems[index];
+  }
+
+  // Find first word with a clear bit at or after 'start' (numElems if none)
+  inline uint32_t nextFreeWordFrom(uint32_t start) {
+    for (uint32_t i = start; i < contents->numElems; i++)
+      if (~contents->elems[i] != 0ul) return i;
+    return contents->numElems;
+  }
+
+  // Set bit 'bitIndex' (0..63) of word 'wordIndex', growing bitmap as needed
+  inline void setBit(uint32_t wordIndex, uint32_t bitIndex) {
+    for (uint32_t i = contents->numElems; i <= wordIndex; i++)
+      contents->append(0ul);
+    contents->elems[wordIndex] |= (uint64_t) 1 << bitIndex;
+    if (wordIndex == firstFree) {
+      firstFree = nextFreeWordFrom(firstFree);
+    }
+  }
+
+  // Set lowest clear bit of first non-full word; return 64*word + bit offset
+  inline uint32_t grabNextBit() {
+    const uint32_t wordIndex = firstFree;  // saved: setBit may move firstFree
+    const uint64_t freeBits = ~getWord(wordIndex);
+    assert(freeBits != 0ul);  // a word of all ones has no bit left to grab
+    setBit(wordIndex, __builtin_ctzll(freeBits));
+    return 64*wordIndex + __builtin_ctzll(freeBits);
+  }
+};
+
+#endif
diff --git a/include/POLite/Dist/PDevice.h b/include/POLite/Dist/PDevice.h
deleted file mode 100644
index f095eba6..00000000
--- a/include/POLite/Dist/PDevice.h
+++ /dev/null
@@ -1,302 +0,0 @@
-// SPDX-License-Identifier: BSD-2-Clause
-#ifndef _PDEVICE_H_
-#define _PDEVICE_H_
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <type_traits>
-
-#ifdef TINSEL
-  #include <tinsel.h>
-  #define PTR(t) t*
-#else
-  #include <tinsel-interface.h>
-  #define PTR(t) uint32_t
-#endif
-
-// Use this to align on half-cache-line boundary
-#define ALIGNED __attribute__((aligned(1<<(TinselLogBytesPerLine-1))))
-
-// This is a static limit on the number of pins per device
-#ifndef POLITE_NUM_PINS
-#define POLITE_NUM_PINS 1
-#endif
-
-// Macros for performance stats
-//   POLITE_DUMP_STATS - dump performance stats on termination
-//   POLITE_COUNT_MSGS - include message counts of performance stats
-
-// Thread-local device id
-typedef uint16_t PLocalDeviceId;
-
-// Thread id
-typedef uint32_t PThreadId;
-
-// Device address
-// Bits 17->0: thread id
-// Bit 18: invalid address
-// Bits 31->19: thread-local device id
-typedef uint32_t PDeviceAddr;
-
-// Device address constructors
-inline PDeviceAddr invalidDeviceAddr() { return 0x40000; }
-inline PDeviceAddr makeDeviceAddr(PThreadId t, PLocalDeviceId d) {
-  return (d << 19) | t;
-}
-
-// Device address deconstructors
-inline bool isValidDeviceAddr(PDeviceAddr addr) { return !(addr & 0x40000); }
-inline PThreadId getThreadId(PDeviceAddr addr) { return addr & 0x3ffff; }
-inline PLocalDeviceId getLocalDeviceId(PDeviceAddr addr) { return addr >> 19; }
-
-// What's the max allowed local device address?
-inline uint32_t maxLocalDeviceId() { return 8192; }
-
-// Pins
-//   No      - means 'not ready to send'
-//   HostPin - means 'send to host'
-//   Pin(n)  - means 'send to application pin number n'
-typedef uint8_t PPin;
-#define No 0
-#define HostPin 1
-#define Pin(n) ((n)+2)
-
-// For template arguments that are not used
-struct None {};
-
-// Generic device structure
-// Type parameters:
-//   S - State
-//   E - Edge label
-//   M - Message structure
-template <typename S, typename E, typename M> struct PDevice {
-  // State
-  S* s;
-  PPin* readyToSend;
-  uint32_t numVertices;
-  uint16_t time;
-
-  // Handlers
-  void init();
-  void send(volatile M* msg);
-  void recv(M* msg, E* edge);
-  bool step();
-  bool finish(volatile M* msg);
-};
-
-// Generic device state structure
-template <typename S> struct ALIGNED PState {
-  // Board-level routing key for each outgoing pin
-  uint32_t pin[POLITE_NUM_PINS];
-  // Ready-to-send status
-  PPin readyToSend;
-  // Custom state
-  S state;
-};
-
-// Message structure
-template <typename M> struct PMessage {
-  // Destination thread-local device id
-  uint16_t devId;
-  // Id of incoming edge
-  uint16_t edgeId;
-  // Application message
-  M payload;
-};
-
-// An incoming edge to a device
-template <typename E> struct PInEdge {
-  E edge;
-};
-
-// Generic thread structure
-template <typename DeviceType,
-          typename S, typename E, typename M> struct PThread {
-
-  // Number of devices handled by thread
-  PLocalDeviceId numDevices;
-  // Number of times step handler has been called
-  uint16_t time;
-  // Number of devices in graph
-  uint32_t numVertices;
-  // Pointer to array of device states
-  PTR(PState<S>) devices;
-  // Pointer to base of edge table
-  PTR(PInEdge<E>) inTableBase;
-  // Array of local device ids are ready to send
-  PTR(PLocalDeviceId) senders;
-  // This array is accessed in a LIFO manner
-  PTR(PLocalDeviceId) sendersTop;
-
-  // Count number of messages sent
-  #ifdef POLITE_COUNT_MSGS
-  // Total messages sent
-  uint32_t msgsSent;
-  // Total messages received
-  uint32_t msgsReceived;
-  // Number of times we wanted to send but couldn't
-  uint32_t blockedSends;
-  #endif
-
-  #ifdef TINSEL
-
-  // Helper function to construct a device
-  INLINE DeviceType getDevice(uint32_t id) {
-    DeviceType dev;
-    dev.s           = &devices[id].state;
-    dev.readyToSend = &devices[id].readyToSend;
-    dev.numVertices = numVertices;
-    dev.time        = time;
-    return dev;
-  }
-
-  // Dump performance counter stats over UART
-  void dumpStats() {
-    tinselPerfCountStop();
-    uint32_t me = tinselId();
-    // Per-cache performance counters
-    uint32_t cacheMask = (1 <<
-      (TinselLogThreadsPerCore + TinselLogCoresPerDCache)) - 1;
-    if ((me & cacheMask) == 0) {
-      printf("H:%x,M:%x,W:%x\n",
-        tinselHitCount(),
-        tinselMissCount(),
-        tinselWritebackCount());
-    }
-    // Per-core performance counters
-    uint32_t coreMask = (1 << (TinselLogThreadsPerCore)) - 1;
-    if ((me & coreMask) == 0) {
-      printf("C:%x %x,I:%x %x\n",
-        tinselCycleCountU(), tinselCycleCount(),
-        tinselCPUIdleCountU(), tinselCPUIdleCount());
-    }
-    // Per-thread performance counters
-    #ifdef POLITE_COUNT_MSGS
-    uint32_t intraBoardId = me & ((1<<TinselLogThreadsPerBoard) - 1);
-    uint32_t progRouterSent =
-      intraBoardId == 0 ? tinselProgRouterSent() : 0;
-    uint32_t progRouterSentInter =
-      intraBoardId == 0 ? tinselProgRouterSentInterBoard() : 0;
-    printf("MS:%x,MR:%x,PR:%x,PRI:%x,BL:%x\n",
-      msgsSent, msgsReceived, progRouterSent,
-        progRouterSentInter, blockedSends);
-    #endif
-  }
-
-  // Invoke device handlers
-  void run() {
-    // Did last call to step handler request a new time step?
-    bool active = true;
-
-    // Reset performance counters
-    tinselPerfCountReset();
-
-    // Initialisation
-    sendersTop = senders;
-    for (uint32_t i = 0; i < numDevices; i++) {
-      DeviceType dev = getDevice(i);
-      // Invoke the initialiser for each device
-      dev.init();
-      // Device ready to send?
-      if (*dev.readyToSend != No) {
-        *(sendersTop++) = i;
-      }
-    }
-
-    // Set number of flits per message
-    tinselSetLen((sizeof(PMessage<M>)-1) >> TinselLogBytesPerFlit);
-
-    // Event loop
-    while (1) {
-      // Try to send
-      if (sendersTop != senders) {
-        if (tinselCanSend()) {
-          // Get next sender
-          PLocalDeviceId src = *(--sendersTop);
-          // Lookup device
-          DeviceType dev = getDevice(src);
-          PPin pin = *dev.readyToSend;
-          // Invoke send handler
-          PMessage<M>* m = (PMessage<M>*) tinselSendSlot();
-          dev.send(&m->payload);
-          // Reinsert sender, if it still wants to send
-          if (*dev.readyToSend != No) sendersTop++;
-          // Is it a send to the host pin or a user pin?
-          if (pin == HostPin)
-            tinselSend(tinselHostId(), m);
-          else
-            tinselKeySend(devices[src].pin[pin-2], m);
-          #ifdef POLITE_COUNT_MSGS
-            msgsSent++;
-          #endif
-        }
-        else {
-          #ifdef POLITE_COUNT_MSGS
-            blockedSends++;
-          #endif
-          tinselWaitUntil(TINSEL_CAN_SEND|TINSEL_CAN_RECV);
-        }
-      }
-      else {
-        // Idle detection
-        int idle = tinselIdle(!active);
-        if (idle > 1)
-          break;
-        else if (idle) {
-          active = false;
-          for (uint32_t i = 0; i < numDevices; i++) {
-            DeviceType dev = getDevice(i);
-            // Invoke the step handler for each device
-            active = dev.step() || active;
-            // Device ready to send?
-            if (*dev.readyToSend != No) {
-              *(sendersTop++) = i;
-            }
-          }
-          time++;
-        }
-      }
-
-      // Step 2: try to receive
-      while (tinselCanRecv()) {
-        PMessage<M>* inMsg = (PMessage<M>*) tinselRecv();
-        PInEdge<E>* inEdge = &inTableBase[inMsg->edgeId];
-        // Lookup destination device
-        PLocalDeviceId id = inMsg->devId;
-        DeviceType dev = getDevice(id);
-        // Was it ready to send?
-        PPin oldReadyToSend = *dev.readyToSend;
-        // Invoke receive handler
-        dev.recv(&inMsg->payload, &inEdge->edge);
-        // Insert device into a senders array, if not already there
-        if (*dev.readyToSend != No && oldReadyToSend == No)
-          *(sendersTop++) = id;
-        #ifdef POLITE_COUNT_MSGS
-          msgsReceived++;
-        #endif
-        tinselFree(inMsg);
-      }
-    }
-
-    // Termination
-    #ifdef POLITE_DUMP_STATS
-      dumpStats();
-    #endif
-
-    // Invoke finish handler for each device
-    for (uint32_t i = 0; i < numDevices; i++) {
-      DeviceType dev = getDevice(i);
-      tinselWaitUntil(TINSEL_CAN_SEND);
-      PMessage<M>* m = (PMessage<M>*) tinselSendSlot();
-      if (dev.finish(&m->payload)) tinselSend(tinselHostId(), m);
-    }
-
-    // Sleep
-    tinselWaitUntil(TINSEL_CAN_RECV); while (1);
-  }
-
-  #endif
-
-};
-
-#endif
diff --git a/include/POLite/Dist/PGraph.h b/include/POLite/Dist/PGraph.h
deleted file mode 100644
index 923c34d3..00000000
--- a/include/POLite/Dist/PGraph.h
+++ /dev/null
@@ -1,708 +0,0 @@
-// SPDX-License-Identifier: BSD-2-Clause
-#ifndef _PGRAPH_H_
-#define _PGRAPH_H_
-
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <math.h>
-#include <HostLink.h>
-#include <config.h>
-#include <POLite.h>
-#include <POLite/Seq.h>
-#include <POLite/Graph.h>
-#include <POLite/Placer.h>
-#include <POLite/ProgRouters.h>
-#include <type_traits>
-
-// Nodes of a POETS graph are devices
-typedef NodeId PDeviceId;
-
-// POETS graph
-template <typename DeviceType,
-          typename S, typename E, typename M> class PGraph {
- private:
-  // Align address to 2^n byte boundary
-  inline uint32_t align(uint32_t n, uint32_t addr) {
-    if ((addr & (1<<n)-1) == 0) return addr;
-    return ((addr >> n) + 1) << n;
-  }
-
-  // Align address to 32-bit word boundary
-  uint32_t wordAlign(uint32_t addr) { return align(2, addr); }
-
-  // Align address to cache-line boundary
-  uint32_t cacheAlign(uint32_t addr) {
-    return align(TinselLogBytesPerLine, addr);
-  }
-
-  // Helper function
-  inline uint32_t min(uint32_t x, uint32_t y) { return x < y ? x : y; }
-
-  // Number of FPGA boards available
-  uint32_t meshLenX;
-  uint32_t meshLenY;
-
-  // Number of FPGA boards to use
-  uint32_t numBoardsX;
-  uint32_t numBoardsY;
-
-  // Out table (sender-side edge tables)
-  // Sequence of destinations for every (device, pin) pair
-  Seq<PRoutingDest>*** outTable;
-
-  // Key table (sender-side key tables)
-  // Global routing key for every (device, pin) pair
-  uint32_t** keyTable;
-
-  // In table (receiver-side edge tables)
-  // Sequence of incoming edges for every thread
-  Seq<PInEdge<E>>** inTable;
-
-  // Mesh of per-board programmable routers
-  ProgRouterMesh* routingTables;
-
-  // Generic constructor
-  void constructor(uint32_t lenX, uint32_t lenY) {
-    meshLenX = lenX;
-    meshLenY = lenY;
-    char* str = getenv("POLITE_BOARDS_X");
-    int nx = str ? atoi(str) : meshLenX;
-    str = getenv("POLITE_BOARDS_Y");
-    int ny = str ? atoi(str) : meshLenY;
-    setNumBoards(nx, ny);
-    numDevices = 0;
-    devices = NULL;
-    toDeviceAddr = NULL;
-    numDevicesOnThread = NULL;
-    fromDeviceAddr = NULL;
-    vertexMem = NULL;
-    vertexMemSize = NULL;
-    vertexMemBase = NULL;
-    inEdgeMem = NULL;
-    inEdgeMemSize = NULL;
-    inEdgeMemBase = NULL;
-    mapVerticesToDRAM = false;
-    mapInEdgesToDRAM = true;
-    outTable = NULL;
-    keyTable = NULL;
-    inTable = NULL;
-    routingTables = NULL;
-    chatty = 0;
-    str = getenv("POLITE_CHATTY");
-    if (str != NULL) {
-      chatty = !strcmp(str, "0") ? 0 : 1;
-    }
-  }
-
- public:
-  // Number of devices
-  uint32_t numDevices;
-
-  // Graph containing device ids and connections
-  Graph graph;
-
-  // Edge labels: has same structure as graph.outgoing
-  Seq<Seq<E>*> edgeLabels;
-
-  // Mapping from device id to device state
-  // (Not valid until the mapper is called)
-  PState<S>** devices;
-
-  // Mapping from thread id to number of devices on that thread
-  // (Not valid until the mapper is called)
-  uint32_t* numDevicesOnThread;
-
-  // Mapping from device id to device address and back
-  // (Not valid until the mapper is called)
-  PDeviceAddr* toDeviceAddr;  // Device id -> device address
-  PDeviceId** fromDeviceAddr; // Device address -> device id
-
-  // Each thread's vertex mem and thread mem regions
-  // (Not valid until the mapper is called)
-  uint8_t** vertexMem;      uint8_t** threadMem;
-  uint32_t* vertexMemSize;  uint32_t* threadMemSize;
-  uint32_t* vertexMemBase;  uint32_t* threadMemBase;
-
-  // Each thread's in-edge tables
-  // (Not valid until the mapper is called)
-  uint8_t** inEdgeMem;
-  uint32_t* inEdgeMemSize;
-  uint32_t* inEdgeMemBase;
-
-  // Where to map the various regions
-  // (If false, map to SRAM instead)
-  bool mapVerticesToDRAM;
-  bool mapInEdgesToDRAM;
-
-  // Allow mapper to print useful information to stdout
-  uint32_t chatty;
-
-  // Setter for number of boards to use
-  void setNumBoards(uint32_t x, uint32_t y) {
-    if (x > meshLenX || y > meshLenY) {
-      printf("Mapper: %d x %d boards requested, %d x %d available\n",
-        numBoardsX, numBoardsY, meshLenX, meshLenY);
-      exit(EXIT_FAILURE);
-    }
-    numBoardsX = x;
-    numBoardsY = y;
-  }
-
-  // Create new device
-  inline PDeviceId newDevice() {
-    edgeLabels.append(new SmallSeq<E>);
-    numDevices++;
-    return graph.newNode();
-  }
-
-  // Add a connection between devices
-  inline void addEdge(PDeviceId from, PinId pin, PDeviceId to) {
-    if (pin >= POLITE_NUM_PINS) {
-      printf("addEdge: pin exceeds POLITE_NUM_PINS\n");
-      exit(EXIT_FAILURE);
-    }
-    graph.addEdge(from, pin, to);
-    E edge;
-    edgeLabels.elems[from]->append(edge);
-  }
-
-  // Add labelled edge using given output pin
-  void addLabelledEdge(E edge, PDeviceId x, PinId pin, PDeviceId y) {
-    graph.addEdge(x, pin, y);
-    edgeLabels.elems[x]->append(edge);
-  }
-
-  // Allocate SRAM and DRAM partitions
-  void allocatePartitions() {
-    // Decide a maximum partition size that is reasonable
-    // SRAM: Partition size minus 2048 bytes for the stack
-    uint32_t maxSRAMSize = (1<<TinselLogBytesPerSRAMPartition) - 2048;
-    // DRAM: Partition size minus 65536 bytes for the stack
-    uint32_t maxDRAMSize = (1<<TinselLogBytesPerDRAMPartition) - 65536;
-    // Allocate partition sizes and bases
-    vertexMem = (uint8_t**) calloc(TinselMaxThreads, sizeof(uint8_t*));
-    vertexMemSize = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
-    vertexMemBase = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
-    threadMem = (uint8_t**) calloc(TinselMaxThreads, sizeof(uint8_t*));
-    threadMemSize = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
-    threadMemBase = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
-    inEdgeMem = (uint8_t**) calloc(TinselMaxThreads, sizeof(uint8_t*));
-    inEdgeMemSize = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
-    inEdgeMemBase = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
-    // Compute partition sizes for each thread
-    for (uint32_t threadId = 0; threadId < TinselMaxThreads; threadId++) {
-      // This variable is used to count the size of the *initialised*
-      // partition.  The total partition size is larger as it includes
-      // uninitialised portions.
-      uint32_t sizeVMem = 0;
-      uint32_t sizeEIMem = 0;
-      uint32_t sizeTMem = 0;
-      // Add space for thread structure (always stored in SRAM)
-      sizeTMem = cacheAlign(sizeof(PThread<DeviceType, S, E, M>));
-      // Add space for devices
-      uint32_t numDevs = numDevicesOnThread[threadId];
-      for (uint32_t devNum = 0; devNum < numDevs; devNum++) {
-        // Add space for device
-        sizeVMem = sizeVMem + sizeof(PState<S>);
-      }
-      // Add space for incoming edge table
-      if (inTable[threadId]) {
-        sizeEIMem = inTable[threadId]->numElems * sizeof(PInEdge<E>);
-        sizeEIMem = wordAlign(sizeEIMem);
-      }
-      // The total partition size including uninitialised portions
-      uint32_t totalSizeVMem =
-        sizeVMem + wordAlign(sizeof(PLocalDeviceId) * numDevs);
-      // Check that total size is reasonable
-      uint32_t totalSizeSRAM = sizeTMem;
-      uint32_t totalSizeDRAM = 0;
-      if (mapVerticesToDRAM) totalSizeDRAM += totalSizeVMem;
-                        else totalSizeSRAM += totalSizeVMem;
-      if (mapInEdgesToDRAM)  totalSizeDRAM += sizeEIMem;
-                        else totalSizeSRAM += sizeEIMem;
-      if (totalSizeDRAM > maxDRAMSize) {
-        printf("Error: max DRAM partition size exceeded\n");
-        exit(EXIT_FAILURE);
-      }
-      if (totalSizeSRAM > maxSRAMSize) {
-        printf("Error: max SRAM partition size exceeded\n");
-        exit(EXIT_FAILURE);
-      }
-      // Allocate space for the initialised portion of the partition
-      assert((sizeVMem%4) == 0);
-      assert((sizeTMem%4) == 0);
-      assert((sizeEIMem%4) == 0);
-      vertexMem[threadId] = (uint8_t*) calloc(sizeVMem, 1);
-      vertexMemSize[threadId] = sizeVMem;
-      threadMem[threadId] = (uint8_t*) calloc(sizeTMem, 1);
-      threadMemSize[threadId] = sizeTMem;
-      inEdgeMem[threadId] = (uint8_t*) calloc(sizeEIMem, 1);
-      inEdgeMemSize[threadId] = sizeEIMem;
-      // Tinsel address of base of partition
-      uint32_t partId = threadId & (TinselThreadsPerDRAM-1);
-      uint32_t sramBase = (1 << TinselLogBytesPerSRAM) +
-          (partId << TinselLogBytesPerSRAMPartition);
-      uint32_t dramBase = TinselBytesPerDRAM -
-          ((partId+1) << TinselLogBytesPerDRAMPartition);
-      // Use partition-interleaved region for DRAM
-      dramBase |= 0x80000000;
-      threadMemBase[threadId] = sramBase;
-      sramBase += threadMemSize[threadId];
-      // Determine base addresses of each region
-      if (mapVerticesToDRAM) {
-        vertexMemBase[threadId] = dramBase;
-        dramBase += totalSizeVMem;
-      }
-      else {
-        vertexMemBase[threadId] = sramBase;
-        sramBase += totalSizeVMem;
-      }
-      if (mapInEdgesToDRAM) {
-        inEdgeMemBase[threadId] = dramBase;
-        dramBase += sizeEIMem;
-      }
-      else {
-        inEdgeMemBase[threadId] = sramBase;
-        sramBase += sizeEIMem;
-      }
-    }
-  }
-
-  // Initialise partitions
-  void initialisePartitions() {
-    for (uint32_t threadId = 0; threadId < TinselMaxThreads; threadId++) {
-      // Next pointers for each partition
-      uint32_t nextVMem = 0;
-      // Pointer to thread structure
-      PThread<DeviceType, S, E, M>* thread =
-        (PThread<DeviceType, S, E, M>*) &threadMem[threadId][0];
-      // Set number of devices on thread
-      thread->numDevices = numDevicesOnThread[threadId];
-      // Set number of devices in graph
-      thread->numVertices = numDevices;
-      // Set tinsel address of array of device states
-      thread->devices = vertexMemBase[threadId];
-      // Set tinsel address of base of in-edge table
-      thread->inTableBase = inEdgeMemBase[threadId];
-      // Add space for each device on thread
-      uint32_t numDevs = numDevicesOnThread[threadId];
-      for (uint32_t devNum = 0; devNum < numDevs; devNum++) {
-        PState<S>* dev = (PState<S>*) &vertexMem[threadId][nextVMem];
-        PDeviceId id = fromDeviceAddr[threadId][devNum];
-        devices[id] = dev;
-        // Add space for device
-        nextVMem = nextVMem + sizeof(PState<S>);
-      }
-      // Initialise each device and the thread's out edges
-      for (uint32_t devNum = 0; devNum < numDevs; devNum++) {
-        PDeviceId id = fromDeviceAddr[threadId][devNum];
-        PState<S>* dev = devices[id];
-        // Initialise
-        for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) {
-          dev->pin[p] = keyTable[id][p];
-        }
-      }
-      // Intialise thread's in edges
-      PInEdge<E>* inEdgeArray = (PInEdge<E>*) inEdgeMem[threadId];
-      Seq<PInEdge<E>>* edges = inTable[threadId];
-      if (edges)
-        for (uint32_t i = 0; i < edges->numElems; i++) {
-          inEdgeArray[i] = edges->elems[i];
-        }
-      // At this point, check that next pointers line up with heap sizes
-      if (nextVMem != vertexMemSize[threadId]) {
-        printf("Error: vertex mem size does not match pre-computed size\n");
-        exit(EXIT_FAILURE);
-      }
-      // Set tinsel address of senders array
-      thread->senders = vertexMemBase[threadId] + nextVMem;
-    }
-  }
-
-  // Allocate mapping structures
-  void allocateMapping() {
-    devices = (PState<S>**) calloc(numDevices, sizeof(PState<S>*));
-    toDeviceAddr = (PDeviceAddr*) calloc(numDevices, sizeof(PDeviceAddr));
-    fromDeviceAddr = (PDeviceId**) calloc(TinselMaxThreads, sizeof(PDeviceId*));
-    numDevicesOnThread = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
-  }
-
-  // Allocate thread edge input and output tables
-  // (Only valid after mapper is called)
-  void allocateInOutTables() {
-    // Receiver-side tables
-    inTable = (Seq<PInEdge<E>>**)
-      calloc(TinselMaxThreads,sizeof(Seq<PInEdge<E>>*));
-    for (uint32_t t = 0; t < TinselMaxThreads; t++) {
-      if (numDevicesOnThread[t] != 0)
-        inTable[t] = new SmallSeq<PInEdge<E>>;
-    }
-
-    // Sender-side tables
-    outTable = (Seq<PRoutingDest>***)
-      calloc(numDevices, sizeof(Seq<PRoutingDest>**));
-    for (uint32_t d = 0; d < numDevices; d++) {
-      outTable[d] = (Seq<PRoutingDest>**)
-        calloc(POLITE_NUM_PINS, sizeof(Seq<PRoutingDest>*));
-      for (uint32_t p = 0; p < POLITE_NUM_PINS; p++)
-        outTable[d][p] = new SmallSeq<PRoutingDest>;
-    }
-
-    keyTable = new uint32_t* [numDevices];
-    for (uint32_t d = 0; d < numDevices; d++)
-      keyTable[d] = new uint32_t [POLITE_NUM_PINS];
-  }
-
-  // Compute thread edge input and output tables
-  // (Only valid after mapper is called)
-  void computeInOutTables() {
-    // For each device
-    for (uint32_t d = 0; d < numDevices; d++) {
-      // For each pin
-      for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) {
-        Seq<PDeviceId>* dests = graph.outgoing->elems[d];
-        Seq<E>* edges = edgeLabels.elems[d];
-        for (uint32_t i = 0; i < dests->numElems; i++) {
-          PDeviceId destId = dests->elems[i];
-          // Destination thread id
-          uint32_t threadId = getThreadId(toDeviceAddr[destId]);
-          // Thread-local device id
-          uint32_t devId = getLocalDeviceId(toDeviceAddr[destId]);
-          // Add edge to thread's input table
-          uint32_t edgeId = inTable[threadId]->numElems;
-          PInEdge<E> edge;
-          edge.edge = edges->elems[i];
-          inTable[threadId]->append(edge);
-          // Add output table entry
-          PRoutingDest rdest;
-          rdest.kind = PRDestKindURM1;
-          rdest.mbox = threadId >> TinselLogThreadsPerMailbox;
-          rdest.urm1.key = devId | (edgeId << 16);
-          rdest.urm1.threadId = threadId &
-            ((1<<TinselLogThreadsPerMailbox) - 1);
-          outTable[d][p]->append(rdest);
-        }
-      }
-    }
-  }
-
-  // Release all structures
-  void releaseAll() {
-    if (devices != NULL) {
-      free(devices);
-      free(toDeviceAddr);
-      free(numDevicesOnThread);
-      for (uint32_t t = 0; t < TinselMaxThreads; t++)
-        if (fromDeviceAddr[t] != NULL) free(fromDeviceAddr[t]);
-      free(fromDeviceAddr);
-      for (uint32_t t = 0; t < TinselMaxThreads; t++)
-        if (vertexMem[t] != NULL) free(vertexMem[t]);
-      free(vertexMem);
-      free(vertexMemSize);
-      free(vertexMemBase);
-      for (uint32_t t = 0; t < TinselMaxThreads; t++)
-        if (threadMem[t] != NULL) free(threadMem[t]);
-      free(threadMem);
-      free(threadMemSize);
-      free(threadMemBase);
-      for (uint32_t t = 0; t < TinselMaxThreads; t++)
-        if (inEdgeMem[t] != NULL) free(inEdgeMem[t]);
-      free(inEdgeMem);
-      free(inEdgeMemSize);
-      free(inEdgeMemBase);
-    }
-    if (inTable != NULL) {
-      for (uint32_t t = 0; t < TinselMaxThreads; t++)
-        if (inTable[t] != NULL) delete inTable[t];
-      free(inTable);
-      inTable = NULL;
-    }
-    if (outTable != NULL) {
-      for (uint32_t d = 0; d < numDevices; d++) {
-        if (outTable[d] == NULL) continue;
-        for (uint32_t p = 0; p < POLITE_NUM_PINS; p++)
-          delete outTable[d][p];
-        free(outTable[d]);
-      }
-      free(outTable);
-      outTable = NULL;
-    }
-    if (keyTable != NULL) {
-      for (uint32_t d = 0; d < numDevices; d++) delete [] keyTable[d];
-      delete [] keyTable;
-      keyTable = NULL;
-    }
-    if (routingTables != NULL) delete routingTables;
-  }
-
-  // Implement mapping to tinsel threads
-  void map() {
-    // Let's measure some times
-    struct timeval placementStart, placementFinish;
-    struct timeval routingStart, routingFinish;
-    struct timeval initStart, initFinish;
-
-    // Release all mapping and heap structures
-    releaseAll();
-
-    // Reallocate mapping structures
-    allocateMapping();
-
-    // Start placement timer
-    gettimeofday(&placementStart, NULL);
-
-    // Partition into subgraphs, one per board
-    Placer boards(&graph, numBoardsX, numBoardsY);
-
-    // Place subgraphs onto 2D mesh
-    const uint32_t placerEffort = 8;
-    boards.place(placerEffort);
-
-    // For each board
-    for (uint32_t boardY = 0; boardY < numBoardsY; boardY++) {
-      for (uint32_t boardX = 0; boardX < numBoardsX; boardX++) {
-        // Partition into subgraphs, one per mailbox
-        PartitionId b = boards.mapping[boardY][boardX];
-        Placer boxes(&boards.subgraphs[b], 
-                 TinselMailboxMeshXLen, TinselMailboxMeshYLen);
-        boxes.place(placerEffort);
-
-        // For each mailbox
-        for (uint32_t boxX = 0; boxX < TinselMailboxMeshXLen; boxX++) {
-          for (uint32_t boxY = 0; boxY < TinselMailboxMeshYLen; boxY++) {
-            // Partition into subgraphs, one per thread
-            uint32_t numThreads = 1<<TinselLogThreadsPerMailbox;
-            PartitionId t = boxes.mapping[boxY][boxX];
-            Placer threads(&boxes.subgraphs[t], numThreads, 1);
-
-            // For each thread
-            for (uint32_t threadNum = 0; threadNum < numThreads; threadNum++) {
-              // Determine tinsel thread id
-              uint32_t threadId = boardY;
-              threadId = (threadId << TinselMeshXBits) | boardX;
-              threadId = (threadId << TinselMailboxMeshYBits) | boxY;
-              threadId = (threadId << TinselMailboxMeshXBits) | boxX;
-              threadId = (threadId << (TinselLogCoresPerMailbox +
-                            TinselLogThreadsPerCore)) | threadNum;
-
-              // Get subgraph
-              Graph* g = &threads.subgraphs[threadNum];
-
-              // Populate fromDeviceAddr mapping
-              uint32_t numDevs = g->incoming->numElems;
-              numDevicesOnThread[threadId] = numDevs;
-              fromDeviceAddr[threadId] = (PDeviceId*)
-                malloc(sizeof(PDeviceId) * numDevs);
-              for (uint32_t devNum = 0; devNum < numDevs; devNum++)
-                fromDeviceAddr[threadId][devNum] = g->labels->elems[devNum];
-  
-              // Populate toDeviceAddr mapping
-              assert(numDevs < maxLocalDeviceId());
-              for (uint32_t devNum = 0; devNum < numDevs; devNum++) {
-                PDeviceAddr devAddr =
-                  makeDeviceAddr(threadId, devNum);
-                toDeviceAddr[g->labels->elems[devNum]] = devAddr;
-              }
-            }
-          }
-        }
-      }
-    }
-
-    // Stop placement timer and start In/Out table timer
-    gettimeofday(&placementFinish, NULL);
-    gettimeofday(&routingStart, NULL);
-
-    // Compute send and receive side routing tables
-    allocateInOutTables();
-    computeInOutTables();
-
-    // Compute per-board programmable routing tables
-    routingTables = new ProgRouterMesh(numBoardsX, numBoardsY);
-    for (uint32_t d = 0; d < numDevices; d++) {
-      uint32_t src = getThreadId(toDeviceAddr[d]) >>
-        TinselLogThreadsPerMailbox;
-      for (uint32_t p = 0; p < POLITE_NUM_PINS; p++)
-        keyTable[d][p] = routingTables->addDestsFromBoard(src, outTable[d][p]);
-   }
-
-    // Stop routing timer and start init timer
-    gettimeofday(&routingFinish, NULL);
-    gettimeofday(&initStart, NULL);
-
-    // Reallocate and initialise heap structures
-    allocatePartitions();
-    initialisePartitions();
-
-    // Display times, if chatty
-    gettimeofday(&initFinish, NULL);
-    if (chatty > 0) {
-      struct timeval diff;
-
-      timersub(&placementFinish, &placementStart, &diff);
-      double duration = (double) diff.tv_sec +
-        (double) diff.tv_usec / 1000000.0;
-      printf("POLite mapper profile:\n");
-      printf("  Partitioning and placement: %lfs\n", duration);
-
-      timersub(&routingFinish, &routingStart, &diff);
-      duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
-      printf("  In/Out table construction: %lfs\n", duration);
-
-      timersub(&initFinish, &initStart, &diff);
-      duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
-      printf("  Thread state initialisation: %lfs\n", duration);
-    }
-  }
-
-  // Constructor
-  PGraph() {
-    char* str = getenv("HOSTLINK_BOXES_X");
-    int x = str ? atoi(str) : 1;
-    x = x * TinselMeshXLenWithinBox;
-    str = getenv("HOSTLINK_BOXES_Y");
-    int y = str ? atoi(str) : 1;
-    y = y * TinselMeshYLenWithinBox;
-    constructor(x, y);
-  }
-  PGraph(uint32_t numBoxesX, uint32_t numBoxesY) {
-    int x = numBoxesX * TinselMeshXLenWithinBox; 
-    int y = numBoxesY * TinselMeshYLenWithinBox;
-    constructor(x, y);
-  }
-
-  // Deconstructor
-  ~PGraph() {
-    releaseAll();
-    for (uint32_t i = 0; i < edgeLabels.numElems; i++)
-      delete edgeLabels.elems[i];
-  }
-
-  // Write partition to tinsel machine
-  void writeRAM(HostLink* hostLink,
-         uint8_t** heap, uint32_t* heapSize, uint32_t* heapBase) {
-    // Number of bytes written by each thread
-    uint32_t* writeCount = (uint32_t*)
-      calloc(TinselMaxThreads, sizeof(uint32_t));
-
-    // Number of threads completed by each core
-    uint32_t*** threadCount = (uint32_t***)
-      calloc(meshLenX, sizeof(uint32_t**));
-    for (uint32_t x = 0; x < meshLenX; x++) {
-      threadCount[x] = (uint32_t**)
-        calloc(meshLenY, sizeof(uint32_t*));
-      for (uint32_t y = 0; y < meshLenY; y++)
-        threadCount[x][y] = (uint32_t*)
-          calloc(TinselCoresPerBoard, sizeof(uint32_t));
-    }
-
-    // Initialise write addresses
-    for (int x = 0; x < meshLenX; x++)
-      for (int y = 0; y < meshLenY; y++)
-        for (int c = 0; c < TinselCoresPerBoard; c++)
-          hostLink->setAddr(x, y, c, heapBase[hostLink->toAddr(x, y, c, 0)]);
-
-    // Write heaps
-    uint32_t done = false;
-    while (! done) {
-      done = true;
-      for (int x = 0; x < meshLenX; x++) {
-        for (int y = 0; y < meshLenY; y++) {
-          for (int c = 0; c < TinselCoresPerBoard; c++) {
-            uint32_t t = threadCount[x][y][c];
-            if (t < TinselThreadsPerCore) {
-              done = false;
-              uint32_t threadId = hostLink->toAddr(x, y, c, t);
-              uint32_t written = writeCount[threadId];
-              if (written == heapSize[threadId]) {
-                threadCount[x][y][c] = t+1;
-                if ((t+1) < TinselThreadsPerCore)
-                  hostLink->setAddr(x, y, c,
-                    heapBase[hostLink->toAddr(x, y, c, t+1)]);
-              } else {
-                uint32_t send = min((heapSize[threadId] - written)>>2, 15);
-                hostLink->store(x, y, c, send,
-                  (uint32_t*) &heap[threadId][written]);
-                writeCount[threadId] = written + send * sizeof(uint32_t);
-              }
-            }
-          }
-        }
-      }
-    }
-
-    // Release memory
-    free(writeCount);
-    for (uint32_t x = 0; x < meshLenX; x++) {
-      for (uint32_t y = 0; y < meshLenY; y++)
-        free(threadCount[x][y]);
-      free(threadCount[x]);
-    }
-    free(threadCount);
-  }
-
-  // Write graph to tinsel machine
-  void write(HostLink* hostLink) { 
-    // Start timer
-    struct timeval start, finish;
-    gettimeofday(&start, NULL);
-
-    bool useSendBufferOld = hostLink->useSendBuffer;
-    hostLink->useSendBuffer = true;
-    writeRAM(hostLink, vertexMem, vertexMemSize, vertexMemBase);
-    writeRAM(hostLink, threadMem, threadMemSize, threadMemBase);
-    writeRAM(hostLink, inEdgeMem, inEdgeMemSize, inEdgeMemBase);
-    routingTables->write(hostLink);
-    hostLink->flush();
-    hostLink->useSendBuffer = useSendBufferOld;
-
-    // Display time if chatty
-    gettimeofday(&finish, NULL);
-    if (chatty > 0) {
-      struct timeval diff;
-      timersub(&finish, &start, &diff);
-      double duration = (double) diff.tv_sec +
-        (double) diff.tv_usec / 1000000.0;
-      printf("POLite graph upload time: %lfs\n", duration);
-    }
-  }
-
-  // Determine fan-in of given device
-  uint32_t fanIn(PDeviceId id) {
-    return graph.fanIn(id);
-  }
-
-  // Determine fan-out of given device
-  uint32_t fanOut(PDeviceId id) {
-    return graph.fanOut(id);
-  }
-
-};
-
-// Read performance stats and store in file
-inline void politeSaveStats(HostLink* hostLink, const char* filename) {
-  #ifdef POLITE_DUMP_STATS
-  // Open file for performance counters
-  FILE* statsFile = fopen(filename, "wt");
-  if (statsFile == NULL) {
-    printf("Error creating stats file\n");
-    exit(EXIT_FAILURE);
-  }
-  uint32_t meshLenX = hostLink->meshXLen;
-  uint32_t meshLenY = hostLink->meshYLen;
-  // Number of caches
-  uint32_t numLines = meshLenX * meshLenY *
-                        TinselDCachesPerDRAM * TinselDRAMsPerBoard;
-  // Add on number of cores
-  numLines += meshLenX * meshLenY * TinselCoresPerBoard;
-  // Add on number of threads
-  #ifdef POLITE_COUNT_MSGS
-  numLines += meshLenX * meshLenY * TinselThreadsPerBoard;
-  #endif
-  hostLink->dumpStdOut(statsFile, numLines);
-  fclose(statsFile);
-  #endif
-}
-
-#endif
diff --git a/include/POLite/Hybrid/PDevice.h b/include/POLite/Hybrid/PDevice.h
deleted file mode 100644
index d46bd7b4..00000000
--- a/include/POLite/Hybrid/PDevice.h
+++ /dev/null
@@ -1,321 +0,0 @@
-// SPDX-License-Identifier: BSD-2-Clause
-#ifndef _PDEVICE_H_
-#define _PDEVICE_H_
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <type_traits>
-
-#ifdef TINSEL
-  #include <tinsel.h>
-  #define PTR(t) t*
-#else
-  #include <tinsel-interface.h>
-  #define PTR(t) uint32_t
-#endif
-
-// Use this to align on half-cache-line boundary
-#define ALIGNED __attribute__((aligned(1<<(TinselLogBytesPerLine-1))))
-
-// This is a static limit on the number of pins per device
-#ifndef POLITE_NUM_PINS
-#define POLITE_NUM_PINS 1
-#endif
-
-// Macros for performance stats
-//   POLITE_DUMP_STATS - dump performance stats on termination
-//   POLITE_COUNT_MSGS - include message counts of performance stats
-
-// Thread-local device id
-typedef uint16_t PLocalDeviceId;
-#define InvalidLocalDevId 0xffff
-#define UnusedLocalDevId 0xfffe
-
-// Thread id
-typedef uint32_t PThreadId;
-
-// Device address
-// Bits 17->0: thread id
-// Bit 18: invalid address
-// Bits 31->19: thread-local device id
-typedef uint32_t PDeviceAddr;
-
-// Device address constructors
-inline PDeviceAddr invalidDeviceAddr() { return 0x40000; }
-inline PDeviceAddr makeDeviceAddr(PThreadId t, PLocalDeviceId d) {
-  return (d << 19) | t;
-}
-
-// Device address deconstructors
-inline bool isValidDeviceAddr(PDeviceAddr addr) { return !(addr & 0x40000); }
-inline PThreadId getThreadId(PDeviceAddr addr) { return addr & 0x3ffff; }
-inline PLocalDeviceId getLocalDeviceId(PDeviceAddr addr) { return addr >> 19; }
-
-// What's the max allowed local device address?
-inline uint32_t maxLocalDeviceId() { return 8192; }
-
-// Index into the per-thread in-edge table
-typedef uint16_t InTableKey;
-
-// Pins
-//   No      - means 'not ready to send'
-//   HostPin - means 'send to host'
-//   Pin(n)  - means 'send to application pin number n'
-typedef uint8_t PPin;
-#define No 0
-#define HostPin 1
-#define Pin(n) ((n)+2)
-
-// For template arguments that are not used
-struct None {};
-
-// Generic device structure
-// Type parameters:
-//   S - State
-//   E - Edge label
-//   M - Message structure
-template <typename S, typename E, typename M> struct PDevice {
-  // State
-  S* s;
-  PPin* readyToSend;
-  uint32_t numVertices;
-  uint16_t time;
-
-  // Handlers
-  void init();
-  void send(volatile M* msg);
-  void recv(M* msg, E* edge);
-  bool step();
-  bool finish(volatile M* msg);
-};
-
-// Generic device state structure
-template <typename S> struct ALIGNED PState {
-  // Board-level routing key for each outgoing pin
-  uint32_t pin[POLITE_NUM_PINS];
-  // Ready-to-send status
-  PPin readyToSend;
-  // Custom state
-  S state;
-};
-
-// Message structure
-template <typename M> struct PMessage {
-  // Source-based routing key
-  InTableKey key;
-  // Application message
-  M payload;
-};
-
-// An incoming edge to a device (labelleled)
-template <typename E> struct PInEdge {
-  // Destination device
-  PLocalDeviceId devId;
-  // Edge info
-  E edge;
-};
-
-// An incoming edge to a device (unlabelleled)
-template <> struct PInEdge<None> {
-  union {
-    // Destination device
-    PLocalDeviceId devId;
-    // Unused
-    None edge;
-  };
-};
-
-// Generic thread structure
-template <typename DeviceType,
-          typename S, typename E, typename M> struct PThread {
-
-  // Number of devices handled by thread
-  PLocalDeviceId numDevices;
-  // Number of times step handler has been called
-  uint16_t time;
-  // Number of devices in graph
-  uint32_t numVertices;
-  // Pointer to array of device states
-  PTR(PState<S>) devices;
-  // Pointer to base of in table
-  PTR(PInEdge<E>) inTableBase;
-  // Array of local device ids are ready to send
-  PTR(PLocalDeviceId) senders;
-  // This array is accessed in a LIFO manner
-  PTR(PLocalDeviceId) sendersTop;
-
-  // Count number of messages sent
-  #ifdef POLITE_COUNT_MSGS
-  // Total messages sent
-  uint32_t msgsSent;
-  // Total messages received
-  uint32_t msgsReceived;
-  // Number of times we wanted to send but couldn't
-  uint32_t blockedSends;
-  #endif
-
-  #ifdef TINSEL
-
-  // Helper function to construct a device
-  INLINE DeviceType getDevice(uint32_t id) {
-    DeviceType dev;
-    dev.s           = &devices[id].state;
-    dev.readyToSend = &devices[id].readyToSend;
-    dev.numVertices = numVertices;
-    dev.time        = time;
-    return dev;
-  }
-
-  // Dump performance counter stats over UART
-  void dumpStats() {
-    tinselPerfCountStop();
-    uint32_t me = tinselId();
-    // Per-cache performance counters
-    uint32_t cacheMask = (1 <<
-      (TinselLogThreadsPerCore + TinselLogCoresPerDCache)) - 1;
-    if ((me & cacheMask) == 0) {
-      printf("H:%x,M:%x,W:%x\n",
-        tinselHitCount(),
-        tinselMissCount(),
-        tinselWritebackCount());
-    }
-    // Per-core performance counters
-    uint32_t coreMask = (1 << (TinselLogThreadsPerCore)) - 1;
-    if ((me & coreMask) == 0) {
-      printf("C:%x %x,I:%x %x\n",
-        tinselCycleCountU(), tinselCycleCount(),
-        tinselCPUIdleCountU(), tinselCPUIdleCount());
-    }
-    // Per-thread performance counters
-    #ifdef POLITE_COUNT_MSGS
-    uint32_t intraBoardId = me & ((1<<TinselLogThreadsPerBoard) - 1);
-    uint32_t progRouterSent =
-      intraBoardId == 0 ? tinselProgRouterSent() : 0;
-    uint32_t progRouterSentInter =
-      intraBoardId == 0 ? tinselProgRouterSentInterBoard() : 0;
-    printf("MS:%x,MR:%x,PR:%x,PRI:%x,BL:%x\n",
-      msgsSent, msgsReceived, progRouterSent,
-        progRouterSentInter, blockedSends);
-    #endif
-  }
-
-  // Invoke device handlers
-  void run() {
-    // Did last call to step handler request a new time step?
-    bool active = true;
-
-    // Reset performance counters
-    tinselPerfCountReset();
-
-    // Initialisation
-    sendersTop = senders;
-    for (uint32_t i = 0; i < numDevices; i++) {
-      DeviceType dev = getDevice(i);
-      // Invoke the initialiser for each device
-      dev.init();
-      // Device ready to send?
-      if (*dev.readyToSend != No) {
-        *(sendersTop++) = i;
-      }
-    }
-
-    // Set number of flits per message
-    tinselSetLen((sizeof(PMessage<M>)-1) >> TinselLogBytesPerFlit);
-
-    // Event loop
-    while (1) {
-      // Try to send
-      if (sendersTop != senders) {
-        if (tinselCanSend()) {
-          // Get next sender
-          PLocalDeviceId src = *(--sendersTop);
-          // Lookup device
-          DeviceType dev = getDevice(src);
-          PPin pin = *dev.readyToSend;
-          // Invoke send handler
-          PMessage<M>* m = (PMessage<M>*) tinselSendSlot();
-          dev.send(&m->payload);
-          // Reinsert sender, if it still wants to send
-          if (*dev.readyToSend != No) sendersTop++;
-          // Is it a send to the host pin or a user pin?
-          if (pin == HostPin)
-            tinselSend(tinselHostId(), m);
-          else
-            tinselKeySend(devices[src].pin[pin-2], m);
-          #ifdef POLITE_COUNT_MSGS
-            msgsSent++;
-          #endif
-        }
-        else {
-          #ifdef POLITE_COUNT_MSGS
-            blockedSends++;
-          #endif
-          tinselWaitUntil(TINSEL_CAN_SEND|TINSEL_CAN_RECV);
-        }
-      }
-      else {
-        // Idle detection
-        int idle = tinselIdle(!active);
-        if (idle > 1)
-          break;
-        else if (idle) {
-          active = false;
-          for (uint32_t i = 0; i < numDevices; i++) {
-            DeviceType dev = getDevice(i);
-            // Invoke the step handler for each device
-            active = dev.step() || active;
-            // Device ready to send?
-            if (*dev.readyToSend != No) {
-              *(sendersTop++) = i;
-            }
-          }
-          time++;
-        }
-      }
-
-      // Step 2: try to receive
-      while (tinselCanRecv()) {
-        PMessage<M>* inMsg = (PMessage<M>*) tinselRecv();
-        PInEdge<E>* inEdge = &inTableBase[inMsg->key];
-        while (inEdge->devId != InvalidLocalDevId) {
-          // Lookup destination device
-          PLocalDeviceId id = inEdge->devId;
-          DeviceType dev = getDevice(id);
-          // Was it ready to send?
-          PPin oldReadyToSend = *dev.readyToSend;
-          // Invoke receive handler
-          dev.recv(&inMsg->payload, &inEdge->edge);
-          // Insert device into a senders array, if not already there
-          if (*dev.readyToSend != No && oldReadyToSend == No)
-            *(sendersTop++) = id;
-          inEdge++;
-          #ifdef POLITE_COUNT_MSGS
-            msgsReceived++;
-          #endif
-        }
-        tinselFree(inMsg);
-      }
-    }
-
-    // Termination
-    #ifdef POLITE_DUMP_STATS
-      dumpStats();
-    #endif
-
-    // Invoke finish handler for each device
-    for (uint32_t i = 0; i < numDevices; i++) {
-      DeviceType dev = getDevice(i);
-      tinselWaitUntil(TINSEL_CAN_SEND);
-      PMessage<M>* m = (PMessage<M>*) tinselSendSlot();
-      if (dev.finish(&m->payload)) tinselSend(tinselHostId(), m);
-    }
-
-    // Sleep
-    tinselWaitUntil(TINSEL_CAN_RECV); while (1);
-  }
-
-  #endif
-
-};
-
-#endif
diff --git a/include/POLite/Hybrid/PGraph.h b/include/POLite/Hybrid/PGraph.h
deleted file mode 100644
index 126eaa97..00000000
--- a/include/POLite/Hybrid/PGraph.h
+++ /dev/null
@@ -1,854 +0,0 @@
-// SPDX-License-Identifier: BSD-2-Clause
-#ifndef _PGRAPH_H_
-#define _PGRAPH_H_
-
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <math.h>
-#include <HostLink.h>
-#include <config.h>
-#include <POLite.h>
-#include <POLite/Seq.h>
-#include <POLite/Graph.h>
-#include <POLite/Placer.h>
-#include <POLite/ProgRouters.h>
-#include <type_traits>
-
-// Nodes of a POETS graph are devices
-typedef NodeId PDeviceId;
-
-// This structure holds a group of receiving edges on a thread.
-// All of the edges originate from the same output pin.
-template <typename E> struct PReceiverGroup {
-  // Thread id where all the receivers reside
-  uint32_t threadId;
-  // A sequence of receiving devices on that thread
-  Seq<PInEdge<E>>* receivers;
-};
-
-// POETS graph
-template <typename DeviceType,
-          typename S, typename E, typename M> class PGraph {
- private:
-  // Align address to 2^n byte boundary
-  inline uint32_t align(uint32_t n, uint32_t addr) {
-    if ((addr & (1<<n)-1) == 0) return addr;
-    return ((addr >> n) + 1) << n;
-  }
-
-  // Align address to 32-bit word boundary
-  uint32_t wordAlign(uint32_t addr) { return align(2, addr); }
-
-  // Align address to cache-line boundary
-  uint32_t cacheAlign(uint32_t addr) {
-    return align(TinselLogBytesPerLine, addr);
-  }
-
-  // Helper function
-  inline uint32_t min(uint32_t x, uint32_t y) { return x < y ? x : y; }
-
-  // Number of FPGA boards available
-  uint32_t meshLenX;
-  uint32_t meshLenY;
-
-  // Number of FPGA boards to use
-  uint32_t numBoardsX;
-  uint32_t numBoardsY;
-
-  // Out table (sender-side edge tables)
-  // Sequence of destinations for every (device, pin) pair
-  Seq<PRoutingDest>*** outTable;
-
-  // Key table (sender-side key tables)
-  // Global routing key for every (device, pin) pair
-  uint32_t** keyTable;
-
-  // In table (receiver-side edge tables)
-  // Sequence of incoming edges for every thread
-  Seq<PInEdge<E>>** inTable;
-
-  // Mesh of per-board programmable routers
-  ProgRouterMesh* routingTables;
-
-  // Generic constructor
-  void constructor(uint32_t lenX, uint32_t lenY) {
-    meshLenX = lenX;
-    meshLenY = lenY;
-    char* str = getenv("POLITE_BOARDS_X");
-    int nx = str ? atoi(str) : meshLenX;
-    str = getenv("POLITE_BOARDS_Y");
-    int ny = str ? atoi(str) : meshLenY;
-    setNumBoards(nx, ny);
-    numDevices = 0;
-    devices = NULL;
-    toDeviceAddr = NULL;
-    numDevicesOnThread = NULL;
-    fromDeviceAddr = NULL;
-    vertexMem = NULL;
-    vertexMemSize = NULL;
-    vertexMemBase = NULL;
-    inEdgeMem = NULL;
-    inEdgeMemSize = NULL;
-    inEdgeMemBase = NULL;
-    mapVerticesToDRAM = false;
-    mapInEdgesToDRAM = true;
-    outTable = NULL;
-    keyTable = NULL;
-    inTable = NULL;
-    routingTables = NULL;
-    chatty = 0;
-    str = getenv("POLITE_CHATTY");
-    if (str != NULL) {
-      chatty = !strcmp(str, "0") ? 0 : 1;
-    }
-  }
-
- public:
-  // Number of devices
-  uint32_t numDevices;
-
-  // Graph containing device ids and connections
-  Graph graph;
-
-  // Edge labels: has same structure as graph.outgoing
-  Seq<Seq<E>*> edgeLabels;
-
-  // Mapping from device id to device state
-  // (Not valid until the mapper is called)
-  PState<S>** devices;
-
-  // Mapping from thread id to number of devices on that thread
-  // (Not valid until the mapper is called)
-  uint32_t* numDevicesOnThread;
-
-  // Mapping from device id to device address and back
-  // (Not valid until the mapper is called)
-  PDeviceAddr* toDeviceAddr;  // Device id -> device address
-  PDeviceId** fromDeviceAddr; // Device address -> device id
-
-  // Each thread's vertex mem and thread mem regions
-  // (Not valid until the mapper is called)
-  uint8_t** vertexMem;      uint8_t** threadMem;
-  uint32_t* vertexMemSize;  uint32_t* threadMemSize;
-  uint32_t* vertexMemBase;  uint32_t* threadMemBase;
-
-  // Each thread's in-edge tables
-  // (Not valid until the mapper is called)
-  uint8_t** inEdgeMem;
-  uint32_t* inEdgeMemSize;
-  uint32_t* inEdgeMemBase;
-
-  // Where to map the various regions
-  // (If false, map to SRAM instead)
-  bool mapVerticesToDRAM;
-  bool mapInEdgesToDRAM;
-
-  // Allow mapper to print useful information to stdout
-  uint32_t chatty;
-
-  // Setter for number of boards to use
-  void setNumBoards(uint32_t x, uint32_t y) {
-    if (x > meshLenX || y > meshLenY) {
-      printf("Mapper: %d x %d boards requested, %d x %d available\n",
-        numBoardsX, numBoardsY, meshLenX, meshLenY);
-      exit(EXIT_FAILURE);
-    }
-    numBoardsX = x;
-    numBoardsY = y;
-  }
-
-  // Create new device
-  inline PDeviceId newDevice() {
-    edgeLabels.append(new SmallSeq<E>);
-    numDevices++;
-    return graph.newNode();
-  }
-
-  // Add a connection between devices
-  inline void addEdge(PDeviceId from, PinId pin, PDeviceId to) {
-    if (pin >= POLITE_NUM_PINS) {
-      printf("addEdge: pin exceeds POLITE_NUM_PINS\n");
-      exit(EXIT_FAILURE);
-    }
-    graph.addEdge(from, pin, to);
-    E edge;
-    edgeLabels.elems[from]->append(edge);
-  }
-
-  // Add labelled edge using given output pin
-  void addLabelledEdge(E edge, PDeviceId x, PinId pin, PDeviceId y) {
-    if (pin >= POLITE_NUM_PINS) {
-      printf("addEdge: pin exceeds POLITE_NUM_PINS\n");
-      exit(EXIT_FAILURE);
-    }
-    graph.addEdge(x, pin, y);
-    edgeLabels.elems[x]->append(edge);
-  }
-
-  // Allocate SRAM and DRAM partitions
-  void allocatePartitions() {
-    // Decide a maximum partition size that is reasonable
-    // SRAM: Partition size minus 2048 bytes for the stack
-    uint32_t maxSRAMSize = (1<<TinselLogBytesPerSRAMPartition) - 2048;
-    // DRAM: Partition size minus 65536 bytes for the stack
-    uint32_t maxDRAMSize = (1<<TinselLogBytesPerDRAMPartition) - 65536;
-    // Allocate partition sizes and bases
-    vertexMem = (uint8_t**) calloc(TinselMaxThreads, sizeof(uint8_t*));
-    vertexMemSize = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
-    vertexMemBase = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
-    threadMem = (uint8_t**) calloc(TinselMaxThreads, sizeof(uint8_t*));
-    threadMemSize = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
-    threadMemBase = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
-    inEdgeMem = (uint8_t**) calloc(TinselMaxThreads, sizeof(uint8_t*));
-    inEdgeMemSize = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
-    inEdgeMemBase = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
-    // Compute partition sizes for each thread
-    for (uint32_t threadId = 0; threadId < TinselMaxThreads; threadId++) {
-      // This variable is used to count the size of the *initialised*
-      // partition.  The total partition size is larger as it includes
-      // uninitialised portions.
-      uint32_t sizeVMem = 0;
-      uint32_t sizeEIMem = 0;
-      uint32_t sizeTMem = 0;
-      // Add space for thread structure (always stored in SRAM)
-      sizeTMem = cacheAlign(sizeof(PThread<DeviceType, S, E, M>));
-      // Add space for devices
-      uint32_t numDevs = numDevicesOnThread[threadId];
-      for (uint32_t devNum = 0; devNum < numDevs; devNum++) {
-        // Add space for device
-        sizeVMem = sizeVMem + sizeof(PState<S>);
-      }
-      // Add space for incoming edge table
-      if (inTable[threadId]) {
-        sizeEIMem = inTable[threadId]->numElems * sizeof(PInEdge<E>);
-        sizeEIMem = wordAlign(sizeEIMem);
-      }
-      // The total partition size including uninitialised portions
-      uint32_t totalSizeVMem =
-        sizeVMem + wordAlign(sizeof(PLocalDeviceId) * numDevs);
-      // Check that total size is reasonable
-      uint32_t totalSizeSRAM = sizeTMem;
-      uint32_t totalSizeDRAM = 0;
-      if (mapVerticesToDRAM) totalSizeDRAM += totalSizeVMem;
-                        else totalSizeSRAM += totalSizeVMem;
-      if (mapInEdgesToDRAM)  totalSizeDRAM += sizeEIMem;
-                        else totalSizeSRAM += sizeEIMem;
-      if (totalSizeDRAM > maxDRAMSize) {
-        printf("Error: max DRAM partition size exceeded\n");
-        exit(EXIT_FAILURE);
-      }
-      if (totalSizeSRAM > maxSRAMSize) {
-        printf("Error: max SRAM partition size exceeded\n");
-        exit(EXIT_FAILURE);
-      }
-      // Allocate space for the initialised portion of the partition
-      assert((sizeVMem%4) == 0);
-      assert((sizeTMem%4) == 0);
-      assert((sizeEIMem%4) == 0);
-      vertexMem[threadId] = (uint8_t*) calloc(sizeVMem, 1);
-      vertexMemSize[threadId] = sizeVMem;
-      threadMem[threadId] = (uint8_t*) calloc(sizeTMem, 1);
-      threadMemSize[threadId] = sizeTMem;
-      inEdgeMem[threadId] = (uint8_t*) calloc(sizeEIMem, 1);
-      inEdgeMemSize[threadId] = sizeEIMem;
-      // Tinsel address of base of partition
-      uint32_t partId = threadId & (TinselThreadsPerDRAM-1);
-      uint32_t sramBase = (1 << TinselLogBytesPerSRAM) +
-          (partId << TinselLogBytesPerSRAMPartition);
-      uint32_t dramBase = TinselBytesPerDRAM -
-          ((partId+1) << TinselLogBytesPerDRAMPartition);
-      // Use partition-interleaved region for DRAM
-      dramBase |= 0x80000000;
-      threadMemBase[threadId] = sramBase;
-      sramBase += threadMemSize[threadId];
-      // Determine base addresses of each region
-      if (mapVerticesToDRAM) {
-        vertexMemBase[threadId] = dramBase;
-        dramBase += totalSizeVMem;
-      }
-      else {
-        vertexMemBase[threadId] = sramBase;
-        sramBase += totalSizeVMem;
-      }
-      if (mapInEdgesToDRAM) {
-        inEdgeMemBase[threadId] = dramBase;
-        dramBase += sizeEIMem;
-      }
-      else {
-        inEdgeMemBase[threadId] = sramBase;
-        sramBase += sizeEIMem;
-      }
-    }
-  }
-
-  // Initialise partitions
-  void initialisePartitions() {
-    for (uint32_t threadId = 0; threadId < TinselMaxThreads; threadId++) {
-      // Next pointers for each partition
-      uint32_t nextVMem = 0;
-      // Pointer to thread structure
-      PThread<DeviceType, S, E, M>* thread =
-        (PThread<DeviceType, S, E, M>*) &threadMem[threadId][0];
-      // Set number of devices on thread
-      thread->numDevices = numDevicesOnThread[threadId];
-      // Set number of devices in graph
-      thread->numVertices = numDevices;
-      // Set tinsel address of array of device states
-      thread->devices = vertexMemBase[threadId];
-      // Set tinsel address of base of in-edge table
-      thread->inTableBase = inEdgeMemBase[threadId];
-      // Add space for each device on thread
-      uint32_t numDevs = numDevicesOnThread[threadId];
-      for (uint32_t devNum = 0; devNum < numDevs; devNum++) {
-        PState<S>* dev = (PState<S>*) &vertexMem[threadId][nextVMem];
-        PDeviceId id = fromDeviceAddr[threadId][devNum];
-        devices[id] = dev;
-        // Add space for device
-        nextVMem = nextVMem + sizeof(PState<S>);
-      }
-      // Initialise each device and the thread's out edges
-      for (uint32_t devNum = 0; devNum < numDevs; devNum++) {
-        PDeviceId id = fromDeviceAddr[threadId][devNum];
-        PState<S>* dev = devices[id];
-        // Initialise
-        for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) {
-          dev->pin[p] = keyTable[id][p];
-        }
-      }
-      // Intialise thread's in edges
-      PInEdge<E>* inEdgeArray = (PInEdge<E>*) inEdgeMem[threadId];
-      Seq<PInEdge<E>>* edges = inTable[threadId];
-      if (edges)
-        for (uint32_t i = 0; i < edges->numElems; i++) {
-          inEdgeArray[i] = edges->elems[i];
-        }
-      // At this point, check that next pointers line up with heap sizes
-      if (nextVMem != vertexMemSize[threadId]) {
-        printf("Error: vertex mem size does not match pre-computed size\n");
-        exit(EXIT_FAILURE);
-      }
-      // Set tinsel address of senders array
-      thread->senders = vertexMemBase[threadId] + nextVMem;
-    }
-  }
-
-  // Allocate mapping structures
-  void allocateMapping() {
-    devices = (PState<S>**) calloc(numDevices, sizeof(PState<S>*));
-    toDeviceAddr = (PDeviceAddr*) calloc(numDevices, sizeof(PDeviceAddr));
-    fromDeviceAddr = (PDeviceId**) calloc(TinselMaxThreads, sizeof(PDeviceId*));
-    numDevicesOnThread = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
-  }
-
-  // Allocate thread edge input and output tables
-  // (Only valid after mapper is called)
-  void allocateInOutTables() {
-    // Receiver-side tables
-    inTable = (Seq<PInEdge<E>>**)
-      calloc(TinselMaxThreads,sizeof(Seq<PInEdge<E>>*));
-    for (uint32_t t = 0; t < TinselMaxThreads; t++) {
-      if (numDevicesOnThread[t] != 0)
-        inTable[t] = new SmallSeq<PInEdge<E>>;
-    }
-
-    // Sender-side tables
-    outTable = (Seq<PRoutingDest>***)
-      calloc(numDevices, sizeof(Seq<PRoutingDest>**));
-    for (uint32_t d = 0; d < numDevices; d++) {
-      outTable[d] = (Seq<PRoutingDest>**)
-        calloc(POLITE_NUM_PINS, sizeof(Seq<PRoutingDest>*));
-      for (uint32_t p = 0; p < POLITE_NUM_PINS; p++)
-        outTable[d][p] = new SmallSeq<PRoutingDest>;
-    }
-
-    keyTable = new uint32_t* [numDevices];
-    for (uint32_t d = 0; d < numDevices; d++)
-      keyTable[d] = new uint32_t [POLITE_NUM_PINS];
-  }
-
-  // Pack a receivers array
-  // Input: an in-edge sequence for each thread in a mailbox.
-  // Input array may contain lots of holes (0-element sequences)
-  // Output: a sequence of receiver groups
-  // Output array contains no empty receiver groups
-  void createReceiverGroups(
-        uint32_t mbox,
-        Seq<PInEdge<E>>* receivers,
-        Seq<PReceiverGroup<E>>* groups) {
-    groups->clear();
-    for (uint32_t i = 0; i < 64; i++) {
-      if (receivers[i].numElems > 0) {
-        // Add receiver group
-        PReceiverGroup<E> g;
-        g.threadId = (mbox << TinselLogThreadsPerMailbox) | i;
-        g.receivers = &receivers[i];
-        groups->append(g);
-      }
-    }
-  }
-
-  // Determine in-table key for given set of receivers
-  // (The key must be the same for all receivers)
-  uint32_t findInTableKey(Seq<PReceiverGroup<E>>* receivers) { 
-    uint32_t key = 0;
-
-    bool found = false;
-    while (!found) {
-      found = true; 
-      for (uint32_t i = 0; i < receivers->numElems; i++) {
-        PReceiverGroup<E> g = receivers->elems[i];
-        uint32_t numReceivers = g.receivers->numElems;
-        if (numReceivers > 0) {
-          // Lookup thread id of receiver
-          uint32_t t = g.threadId;
-          // Lookup table size for this thread
-          uint32_t tableSize = inTable[t]->numElems;
-          // Move to next receiver when we find a space
-          if (key >= tableSize) continue;
-          // Is there space at the current key?
-          // (Need space for numReceivers plus null terminator)
-          bool space = true;
-          for (int j = 0; j < numReceivers+1; j++) {
-            if ((key+j) >= tableSize) break;
-            if (inTable[t]->elems[key+j].devId != UnusedLocalDevId) {
-              found = false;
-              key = key+j+1;
-              break;
-            }
-          }
-        }
-      }
-    }
-    return key;
-  }
-
-  // Add entries to the input tables for the given receivers
-  // (Only valid after mapper is called)
-  uint32_t addInTableEntries(Seq<PReceiverGroup<E>>* receivers) {
-    uint32_t key = findInTableKey(receivers);
-    if (key >= 0xfffe) {
-      printf("In-table routing key exceeds 16 bits\n");
-      exit(EXIT_FAILURE);
-    }
-    PInEdge<E> null, unused;
-    null.devId = InvalidLocalDevId;
-    unused.devId = UnusedLocalDevId;
-    // Now that a key with sufficient space has been found, populate the tables
-    for (uint32_t i = 0; i < receivers->numElems; i++) {
-      PReceiverGroup<E> g = receivers->elems[i];
-      uint32_t numReceivers = g.receivers->numElems;
-      if (numReceivers > 0) {
-        // Lookup thread id of receiver
-        uint32_t t = g.threadId;
-        // Lookup table size for this thread
-        uint32_t tableSize = inTable[t]->numElems;
-        // Make sure inTable is big enough for new entries
-        for (uint32_t j = tableSize; j < (key+numReceivers+1); j++)
-          inTable[t]->append(unused);
-        // Add receivers to thread's inTable
-        for (uint32_t j = 0; j < numReceivers; j++) {
-          inTable[t]->elems[key+j] = g.receivers->elems[j];
-        }
-        inTable[t]->elems[key+numReceivers] = null;
-      }
-    }
-    return key;
-  }
-
-  // Compute thread edge input and output tables
-  // (Only valid after mapper is called)
-  void computeInOutTables() {
-    // Routing table stats
-    uint64_t totalOutEdges = 0;
-
-    // Sequence of local device ids, for each multicast destiation
-    SmallSeq<PInEdge<E>> receivers[64];
-
-    // Sequence of receiver groups
-    // (A more compact representation of the receivers array)
-    SmallSeq<PReceiverGroup<E>> groups;
-
-    // For each device
-    for (uint32_t d = 0; d < numDevices; d++) {
-      // For each pin
-      for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) {
-        Seq<PDeviceId> dests = *(graph.outgoing->elems[d]);
-        Seq<E> edges = *(edgeLabels.elems[d]);
-        // While destinations are remaining
-        while (dests.numElems > 0) {
-          // Clear receivers
-          for (uint32_t i = 0; i < 64; i++) receivers[i].clear();
-          uint32_t threadMaskLow = 0;
-          uint32_t threadMaskHigh = 0;
-          // Current mailbox being considered
-          PDeviceAddr mbox = getThreadId(toDeviceAddr[dests.elems[0]]) >>
-                               TinselLogThreadsPerMailbox;
-          // For each destination
-          uint32_t destsRemaining = 0;
-          for (uint32_t i = 0; i < dests.numElems; i++) {
-            // Determine destination mailbox address and mailbox-local thread
-            PDeviceId destId = dests.elems[i];
-            PDeviceAddr destAddr = toDeviceAddr[destId];
-            uint32_t destMailbox = getThreadId(destAddr) >>
-                                     TinselLogThreadsPerMailbox;
-            uint32_t destThread = getThreadId(destAddr) &
-                                     ((1<<TinselLogThreadsPerMailbox)-1);
-            // Does destination match current destination?
-            if (destMailbox == mbox) {
-              PInEdge<E> edge;
-              edge.devId = getLocalDeviceId(destAddr);
-              if (! std::is_same<E, None>::value) edge.edge = edges.elems[i];
-              receivers[destThread].append(edge);
-              if (destThread < 32) threadMaskLow |= 1 << destThread;
-              if (destThread >= 32) threadMaskHigh |= 1 << (destThread-32);
-            }
-            else {
-              // Add destination back into sequence
-              dests.elems[destsRemaining] = dests.elems[i];
-              edges.elems[destsRemaining] = edges.elems[i];
-              destsRemaining++;
-            }
-          }
-          // Create receiver groups
-          createReceiverGroups(mbox, receivers, &groups);
-          // Add input table entries
-          uint32_t key = addInTableEntries(&groups);
-          // Add output table entry
-          PRoutingDest edge;
-          edge.kind = PRDestKindMRM;
-          edge.mbox = mbox;
-          edge.mrm.key = key;
-          edge.mrm.threadMaskLow = threadMaskLow;
-          edge.mrm.threadMaskHigh = threadMaskHigh;
-          outTable[d][p]->append(edge);
-          // Prepare for new output table entry
-          dests.numElems = destsRemaining;
-          edges.numElems = destsRemaining;
-          totalOutEdges++;
-        }
-      }
-    }
-    //printf("Average edges per pin: %lu\n",
-    //  totalOutEdges / (numDevices * POLITE_NUM_PINS);
-  }  
-
-  // Release all structures
-  void releaseAll() {
-    if (devices != NULL) {
-      free(devices);
-      free(toDeviceAddr);
-      free(numDevicesOnThread);
-      for (uint32_t t = 0; t < TinselMaxThreads; t++)
-        if (fromDeviceAddr[t] != NULL) free(fromDeviceAddr[t]);
-      free(fromDeviceAddr);
-      for (uint32_t t = 0; t < TinselMaxThreads; t++)
-        if (vertexMem[t] != NULL) free(vertexMem[t]);
-      free(vertexMem);
-      free(vertexMemSize);
-      free(vertexMemBase);
-      for (uint32_t t = 0; t < TinselMaxThreads; t++)
-        if (threadMem[t] != NULL) free(threadMem[t]);
-      free(threadMem);
-      free(threadMemSize);
-      free(threadMemBase);
-      for (uint32_t t = 0; t < TinselMaxThreads; t++)
-        if (inEdgeMem[t] != NULL) free(inEdgeMem[t]);
-      free(inEdgeMem);
-      free(inEdgeMemSize);
-      free(inEdgeMemBase);
-    }
-    if (inTable != NULL) {
-      for (uint32_t t = 0; t < TinselMaxThreads; t++)
-        if (inTable[t] != NULL) delete inTable[t];
-      free(inTable);
-      inTable = NULL;
-    }
-    if (outTable != NULL) {
-      for (uint32_t d = 0; d < numDevices; d++) {
-        if (outTable[d] == NULL) continue;
-        for (uint32_t p = 0; p < POLITE_NUM_PINS; p++)
-          delete outTable[d][p];
-        free(outTable[d]);
-      }
-      free(outTable);
-      outTable = NULL;
-    }
-    if (keyTable != NULL) {
-      for (uint32_t d = 0; d < numDevices; d++) delete [] keyTable[d];
-      delete [] keyTable;
-      keyTable = NULL;
-    }
-    if (routingTables != NULL) delete routingTables;
-  }
-
-  // Implement mapping to tinsel threads
-  void map() {
-    // Let's measure some times
-    struct timeval placementStart, placementFinish;
-    struct timeval routingStart, routingFinish;
-    struct timeval initStart, initFinish;
-
-    // Release all mapping and heap structures
-    releaseAll();
-
-    // Reallocate mapping structures
-    allocateMapping();
-
-    // Start placement timer
-    gettimeofday(&placementStart, NULL);
-
-    // Partition into subgraphs, one per board
-    Placer boards(&graph, numBoardsX, numBoardsY);
-
-    // Place subgraphs onto 2D mesh
-    const uint32_t placerEffort = 8;
-    boards.place(placerEffort);
-
-    // For each board
-    for (uint32_t boardY = 0; boardY < numBoardsY; boardY++) {
-      for (uint32_t boardX = 0; boardX < numBoardsX; boardX++) {
-        // Partition into subgraphs, one per mailbox
-        PartitionId b = boards.mapping[boardY][boardX];
-        Placer boxes(&boards.subgraphs[b], 
-                 TinselMailboxMeshXLen, TinselMailboxMeshYLen);
-        boxes.place(placerEffort);
-
-        // For each mailbox
-        for (uint32_t boxX = 0; boxX < TinselMailboxMeshXLen; boxX++) {
-          for (uint32_t boxY = 0; boxY < TinselMailboxMeshYLen; boxY++) {
-            // Partition into subgraphs, one per thread
-            uint32_t numThreads = 1<<TinselLogThreadsPerMailbox;
-            PartitionId t = boxes.mapping[boxY][boxX];
-            Placer threads(&boxes.subgraphs[t], numThreads, 1);
-
-            // For each thread
-            for (uint32_t threadNum = 0; threadNum < numThreads; threadNum++) {
-              // Determine tinsel thread id
-              uint32_t threadId = boardY;
-              threadId = (threadId << TinselMeshXBits) | boardX;
-              threadId = (threadId << TinselMailboxMeshYBits) | boxY;
-              threadId = (threadId << TinselMailboxMeshXBits) | boxX;
-              threadId = (threadId << (TinselLogCoresPerMailbox +
-                            TinselLogThreadsPerCore)) | threadNum;
-
-              // Get subgraph
-              Graph* g = &threads.subgraphs[threadNum];
-
-              // Populate fromDeviceAddr mapping
-              uint32_t numDevs = g->incoming->numElems;
-              numDevicesOnThread[threadId] = numDevs;
-              fromDeviceAddr[threadId] = (PDeviceId*)
-                malloc(sizeof(PDeviceId) * numDevs);
-              for (uint32_t devNum = 0; devNum < numDevs; devNum++)
-                fromDeviceAddr[threadId][devNum] = g->labels->elems[devNum];
-  
-              // Populate toDeviceAddr mapping
-              assert(numDevs < maxLocalDeviceId());
-              for (uint32_t devNum = 0; devNum < numDevs; devNum++) {
-                PDeviceAddr devAddr =
-                  makeDeviceAddr(threadId, devNum);
-                toDeviceAddr[g->labels->elems[devNum]] = devAddr;
-              }
-            }
-          }
-        }
-      }
-    }
-
-    // Stop placement timer and start In/Out table timer
-    gettimeofday(&placementFinish, NULL);
-    gettimeofday(&routingStart, NULL);
-
-    // Compute send and receive side routing tables
-    allocateInOutTables();
-    computeInOutTables();
-
-    // Compute per-board programmable routing tables
-    routingTables = new ProgRouterMesh(numBoardsX, numBoardsY);
-    for (uint32_t d = 0; d < numDevices; d++) {
-      uint32_t src = getThreadId(toDeviceAddr[d]) >>
-        TinselLogThreadsPerMailbox;
-      for (uint32_t p = 0; p < POLITE_NUM_PINS; p++)
-        keyTable[d][p] = routingTables->addDestsFromBoard(src, outTable[d][p]);
-   }
-
-    // Stop routing timer and start init timer
-    gettimeofday(&routingFinish, NULL);
-    gettimeofday(&initStart, NULL);
-
-    // Reallocate and initialise heap structures
-    allocatePartitions();
-    initialisePartitions();
-
-    // Display times, if chatty
-    gettimeofday(&initFinish, NULL);
-    if (chatty > 0) {
-      struct timeval diff;
-
-      timersub(&placementFinish, &placementStart, &diff);
-      double duration = (double) diff.tv_sec +
-        (double) diff.tv_usec / 1000000.0;
-      printf("POLite mapper profile:\n");
-      printf("  Partitioning and placement: %lfs\n", duration);
-
-      timersub(&routingFinish, &routingStart, &diff);
-      duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
-      printf("  In/Out table construction: %lfs\n", duration);
-
-      timersub(&initFinish, &initStart, &diff);
-      duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0;
-      printf("  Thread state initialisation: %lfs\n", duration);
-    }
-  }
-
-  // Constructor
-  PGraph() {
-    char* str = getenv("HOSTLINK_BOXES_X");
-    int x = str ? atoi(str) : 1;
-    x = x * TinselMeshXLenWithinBox;
-    str = getenv("HOSTLINK_BOXES_Y");
-    int y = str ? atoi(str) : 1;
-    y = y * TinselMeshYLenWithinBox;
-    constructor(x, y);
-  }
-  PGraph(uint32_t numBoxesX, uint32_t numBoxesY) {
-    int x = numBoxesX * TinselMeshXLenWithinBox; 
-    int y = numBoxesY * TinselMeshYLenWithinBox;
-    constructor(x, y);
-  }
-
-  // Deconstructor
-  ~PGraph() {
-    releaseAll();
-    for (uint32_t i = 0; i < edgeLabels.numElems; i++)
-      delete edgeLabels.elems[i];
-  }
-
-  // Write partition to tinsel machine
-  void writeRAM(HostLink* hostLink,
-         uint8_t** heap, uint32_t* heapSize, uint32_t* heapBase) {
-    // Number of bytes written by each thread
-    uint32_t* writeCount = (uint32_t*)
-      calloc(TinselMaxThreads, sizeof(uint32_t));
-
-    // Number of threads completed by each core
-    uint32_t*** threadCount = (uint32_t***)
-      calloc(meshLenX, sizeof(uint32_t**));
-    for (uint32_t x = 0; x < meshLenX; x++) {
-      threadCount[x] = (uint32_t**)
-        calloc(meshLenY, sizeof(uint32_t*));
-      for (uint32_t y = 0; y < meshLenY; y++)
-        threadCount[x][y] = (uint32_t*)
-          calloc(TinselCoresPerBoard, sizeof(uint32_t));
-    }
-
-    // Initialise write addresses
-    for (int x = 0; x < meshLenX; x++)
-      for (int y = 0; y < meshLenY; y++)
-        for (int c = 0; c < TinselCoresPerBoard; c++)
-          hostLink->setAddr(x, y, c, heapBase[hostLink->toAddr(x, y, c, 0)]);
-
-    // Write heaps
-    uint32_t done = false;
-    while (! done) {
-      done = true;
-      for (int x = 0; x < meshLenX; x++) {
-        for (int y = 0; y < meshLenY; y++) {
-          for (int c = 0; c < TinselCoresPerBoard; c++) {
-            uint32_t t = threadCount[x][y][c];
-            if (t < TinselThreadsPerCore) {
-              done = false;
-              uint32_t threadId = hostLink->toAddr(x, y, c, t);
-              uint32_t written = writeCount[threadId];
-              if (written == heapSize[threadId]) {
-                threadCount[x][y][c] = t+1;
-                if ((t+1) < TinselThreadsPerCore)
-                  hostLink->setAddr(x, y, c,
-                    heapBase[hostLink->toAddr(x, y, c, t+1)]);
-              } else {
-                uint32_t send = min((heapSize[threadId] - written)>>2, 15);
-                hostLink->store(x, y, c, send,
-                  (uint32_t*) &heap[threadId][written]);
-                writeCount[threadId] = written + send * sizeof(uint32_t);
-              }
-            }
-          }
-        }
-      }
-    }
-
-    // Release memory
-    free(writeCount);
-    for (uint32_t x = 0; x < meshLenX; x++) {
-      for (uint32_t y = 0; y < meshLenY; y++)
-        free(threadCount[x][y]);
-      free(threadCount[x]);
-    }
-    free(threadCount);
-  }
-
-  // Write graph to tinsel machine
-  void write(HostLink* hostLink) { 
-    // Start timer
-    struct timeval start, finish;
-    gettimeofday(&start, NULL);
-
-    bool useSendBufferOld = hostLink->useSendBuffer;
-    hostLink->useSendBuffer = true;
-    writeRAM(hostLink, vertexMem, vertexMemSize, vertexMemBase);
-    writeRAM(hostLink, threadMem, threadMemSize, threadMemBase);
-    writeRAM(hostLink, inEdgeMem, inEdgeMemSize, inEdgeMemBase);
-    routingTables->write(hostLink);
-    hostLink->flush();
-    hostLink->useSendBuffer = useSendBufferOld;
-
-    // Display time if chatty
-    gettimeofday(&finish, NULL);
-    if (chatty > 0) {
-      struct timeval diff;
-      timersub(&finish, &start, &diff);
-      double duration = (double) diff.tv_sec +
-        (double) diff.tv_usec / 1000000.0;
-      printf("POLite graph upload time: %lfs\n", duration);
-    }
-  }
-
-  // Determine fan-in of given device
-  uint32_t fanIn(PDeviceId id) {
-    return graph.fanIn(id);
-  }
-
-  // Determine fan-out of given device
-  uint32_t fanOut(PDeviceId id) {
-    return graph.fanOut(id);
-  }
-
-};
-
-// Read performance stats and store in file
-inline void politeSaveStats(HostLink* hostLink, const char* filename) {
-  #ifdef POLITE_DUMP_STATS
-  // Open file for performance counters
-  FILE* statsFile = fopen(filename, "wt");
-  if (statsFile == NULL) {
-    printf("Error creating stats file\n");
-    exit(EXIT_FAILURE);
-  }
-  uint32_t meshLenX = hostLink->meshXLen;
-  uint32_t meshLenY = hostLink->meshYLen;
-  // Number of caches
-  uint32_t numLines = meshLenX * meshLenY *
-                        TinselDCachesPerDRAM * TinselDRAMsPerBoard;
-  // Add on number of cores
-  numLines += meshLenX * meshLenY * TinselCoresPerBoard;
-  // Add on number of threads
-  #ifdef POLITE_COUNT_MSGS
-  numLines += meshLenX * meshLenY * TinselThreadsPerBoard;
-  #endif
-  hostLink->dumpStdOut(statsFile, numLines);
-  fclose(statsFile);
-  #endif
-}
-
-#endif
diff --git a/include/POLite/Local/PDevice.h b/include/POLite/PDevice.h
similarity index 80%
rename from include/POLite/Local/PDevice.h
rename to include/POLite/PDevice.h
index 9408cfae..6ba3be83 100644
--- a/include/POLite/Local/PDevice.h
+++ b/include/POLite/PDevice.h
@@ -22,14 +22,22 @@
 #define POLITE_NUM_PINS 1
 #endif
 
-// Macros for performance stats
+// The local-multicast key points to a list of incoming edges.  Some
+// of those edges are stored in a header, the rest in an array at a
+// different location.  The number stored in the header is controlled
+// by the following parameter.  If it's too low, we risk wasting
+// memory bandwidth.  If it's too high, we risk wasting memory.  
+// The minimum value is 0.  For large edge state sizes, use 0.
+#ifndef POLITE_EDGES_PER_HEADER
+#define POLITE_EDGES_PER_HEADER 6
+#endif
+
+// Macros for performance stats:
 //   POLITE_DUMP_STATS - dump performance stats on termination
-//   POLITE_COUNT_MSGS - include message counts of performance stats
+//   POLITE_COUNT_MSGS - include message counts in performance stats
 
 // Thread-local device id
 typedef uint16_t PLocalDeviceId;
-#define InvalidLocalDevId 0xffff
-#define UnusedLocalDevId 0xfffe
 
 // Thread id
 typedef uint32_t PThreadId;
@@ -54,7 +62,7 @@ inline PLocalDeviceId getLocalDeviceId(PDeviceAddr addr) { return addr >> 19; }
 // What's the max allowed local device address?
 inline uint32_t maxLocalDeviceId() { return 8192; }
 
-// Routing key
+// Local multicast key
 typedef uint16_t Key;
 #define InvalidKey 0xffff
 
@@ -102,8 +110,8 @@ template <typename S> struct ALIGNED PState {
 
 // Message structure
 template <typename M> struct PMessage {
-  // Source-based routing key
-  Key key;
+  // Destination key
+  uint16_t destKey;
   // Application message
   M payload;
 };
@@ -119,34 +127,26 @@ struct POutEdge {
   uint32_t threadMaskHigh;
 };
 
-// An incoming edge to a device (labelleled)
+// An incoming edge to a device
 template <typename E> struct PInEdge {
   // Destination device
   PLocalDeviceId devId;
-  // Edge info
+  // Edge data
   E edge;
 };
 
-// An incoming edge to a device (unlabelleled)
-template <> struct PInEdge<None> {
-  union {
-    // Destination device
-    PLocalDeviceId devId;
-    // Unused
-    None edge;
-  };
+// Header for a list of incoming edges (fixed size structure to
+// support fast construction/packing of local-multicast tables)
+template <typename E> struct PInHeader {
+  // Total number of receivers (edges in the header plus overflow edges)
+  uint16_t numReceivers;
+  // Index into inTableRest of the first edge that does not fit in the
+  // header; the receive loop uses it once the header edges are exhausted
+  uint16_t restIndex;
+  // Edges stored in the header, to make good use of cached data
+  PInEdge<E> edges[POLITE_EDGES_PER_HEADER];
 };
 
-// Helper function: Count board hops between two threads
-inline uint32_t hopsBetween(uint32_t t0, uint32_t t1) {
-  uint32_t xmask = ((1<<TinselMeshXBits)-1);
-  int32_t y0 = t0 >> (TinselLogThreadsPerBoard + TinselMeshXBits);
-  int32_t x0 = (t0 >> TinselLogThreadsPerBoard) & xmask;
-  int32_t y1 = t1 >> (TinselLogThreadsPerBoard + TinselMeshXBits);
-  int32_t x1 = (t1 >> TinselLogThreadsPerBoard) & xmask;
-  return (abs(x0-x1) + abs(y0-y1));
-}
-
 // Generic thread structure
 template <typename DeviceType,
           typename S, typename E, typename M> struct PThread {
@@ -161,7 +161,8 @@ template <typename DeviceType,
   PTR(PState<S>) devices;
   // Pointer to base of routing tables
   PTR(POutEdge) outTableBase;
-  PTR(PInEdge<E>) inTableBase;
+  PTR(PInHeader<E>) inTableHeaderBase;
+  PTR(PInEdge<E>) inTableRestBase;
   // Array of local device ids are ready to send
   PTR(PLocalDeviceId) senders;
   // This array is accessed in a LIFO manner
@@ -169,14 +170,12 @@ template <typename DeviceType,
 
   // Count number of messages sent
   #ifdef POLITE_COUNT_MSGS
-  // Total message received
+  // Total messages sent
+  uint32_t msgsSent;
+  // Total messages received
   uint32_t msgsReceived;
   // Number of times we wanted to send but couldn't
   uint32_t blockedSends;
-  // Total messages sent between threads
-  uint32_t interThreadSendCount;
-  // Messages sent between threads on different boards
-  uint32_t interBoardSendCount;
   #endif
 
   #ifdef TINSEL
@@ -213,9 +212,14 @@ template <typename DeviceType,
     }
     // Per-thread performance counters
     #ifdef POLITE_COUNT_MSGS
+    uint32_t intraBoardId = me & ((1<<TinselLogThreadsPerBoard) - 1);
+    uint32_t progRouterSent =
+      intraBoardId == 0 ? tinselProgRouterSent() : 0;
+    uint32_t progRouterSentInter =
+      intraBoardId == 0 ? tinselProgRouterSentInterBoard() : 0;
     printf("MS:%x,MR:%x,PR:%x,PRI:%x,BL:%x\n",
-      interThreadSendCount, msgsReceived, 0,
-        interBoardSendCount, blockedSends);
+      msgsSent, msgsReceived, progRouterSent,
+        progRouterSentInter, blockedSends);
     #endif
   }
 
@@ -260,14 +264,11 @@ template <typename DeviceType,
         if (tinselCanSend()) {
           PMessage<M>* m = (PMessage<M>*) tinselSendSlot();
           // Send message
-          m->key = outEdge->key;
+          m->destKey = outEdge->key;
           tinselMulticast(outEdge->mbox, outEdge->threadMaskHigh,
             outEdge->threadMaskLow, m);
           #ifdef POLITE_COUNT_MSGS
-          interThreadSendCount++;
-          interBoardSendCount +=
-            hopsBetween(outEdge->mbox << TinselLogThreadsPerMailbox,
-              tinselId());
+          msgsSent++;
           #endif
           // Move to next neighbour
           outEdge++;
@@ -329,8 +330,14 @@ template <typename DeviceType,
       // Step 2: try to receive
       while (tinselCanRecv()) {
         PMessage<M>* inMsg = (PMessage<M>*) tinselRecv();
-        PInEdge<E>* inEdge = &inTableBase[inMsg->key];
-        while (inEdge->devId != InvalidLocalDevId) {
+        PInHeader<E>* inHeader = &inTableHeaderBase[inMsg->destKey];
+        // Determine number and location of edges/receivers
+        uint32_t numReceivers = inHeader->numReceivers;
+        PInEdge<E>* inEdge = inHeader->edges;
+        // For each receiver
+        for (uint32_t i = 0; i < numReceivers; i++) {
+          if (i == POLITE_EDGES_PER_HEADER)
+            inEdge = &inTableRestBase[inHeader->restIndex];
           // Lookup destination device
           PLocalDeviceId id = inEdge->devId;
           DeviceType dev = getDevice(id);
diff --git a/include/POLite/Local/PGraph.h b/include/POLite/PGraph.h
similarity index 62%
rename from include/POLite/Local/PGraph.h
rename to include/POLite/PGraph.h
index 5ded656a..c5e5b41f 100644
--- a/include/POLite/Local/PGraph.h
+++ b/include/POLite/PGraph.h
@@ -12,7 +12,10 @@
 #include <POLite/Seq.h>
 #include <POLite/Graph.h>
 #include <POLite/Placer.h>
+#include <POLite/Bitmap.h>
+#include <POLite/ProgRouters.h>
 #include <type_traits>
+#include <tinsel-interface.h>
 
 // Nodes of a POETS graph are devices
 typedef NodeId PDeviceId;
@@ -23,9 +26,27 @@ template <typename E> struct PReceiverGroup {
   // Thread id where all the receivers reside
   uint32_t threadId;
   // A sequence of receiving devices on that thread
-  Seq<PInEdge<E>>* receivers;
+  SmallSeq<PInEdge<E>> receivers;
 };
 
+// Info about one edge destination (filled by splitDests, sorted by qsort)
+struct PEdgeDest {
+  // Index of edge in outgoing edge list (computeTables picks the label by it)
+  uint32_t index;
+  // Destination device (an entry of graph.outgoing for the source device)
+  PDeviceId dest;
+  // Address where destination is located (toDeviceAddr[dest])
+  PDeviceAddr addr;
+};
+
+// qsort() comparator for PEdgeDest: orders by destination thread id,
+// highest first.  Returns <0, 0 or >0 as qsort requires (not a boolean).
+inline int cmpEdgeDest(const void* e0, const void* e1) {
+  uint32_t t0 = getThreadId(((const PEdgeDest*) e0)->addr);
+  uint32_t t1 = getThreadId(((const PEdgeDest*) e1)->addr);
+  return (t0 < t1) - (t0 > t1);
+}
+
 // POETS graph
 template <typename DeviceType,
           typename S, typename E, typename M> class PGraph {
@@ -58,8 +79,19 @@ template <typename DeviceType,
   // Multicast routing tables:
   // Sequence of outgoing edges for every (device, pin) pair
   Seq<POutEdge>*** outTable;
-  // Sequence of incoming edges for every thread
-  Seq<PInEdge<E>>** inTable;
+  // Sequence of in-edge headers, for each thread
+  Seq<PInHeader<E>>** inTableHeaders;
+  // Remaining in-edges that don't fit in the header table, for each thread
+  Seq<PInEdge<E>>** inTableRest;
+  // Bitmap denoting used space in header table, for each thread
+  Bitmap** inTableBitmaps;
+
+  // Programmable routing tables
+  ProgRouterMesh* progRouterTables;
+
+  // Receiver groups (used internally by some methods, but declared once
+  // to avoid repeated allocation)
+  PReceiverGroup<E> groups[TinselThreadsPerMailbox];
 
   // Generic constructor
   void constructor(uint32_t lenX, uint32_t lenY) {
@@ -78,17 +110,24 @@ template <typename DeviceType,
     vertexMem = NULL;
     vertexMemSize = NULL;
     vertexMemBase = NULL;
-    inEdgeMem = NULL;
-    inEdgeMemSize = NULL;
-    inEdgeMemBase = NULL;
+    inEdgeHeaderMem = NULL;
+    inEdgeHeaderMemSize = NULL;
+    inEdgeHeaderMemBase = NULL;
+    inEdgeRestMem = NULL;
+    inEdgeRestMemSize = NULL;
+    inEdgeRestMemBase = NULL;
     outEdgeMem = NULL;
     outEdgeMemSize = NULL;
     outEdgeMemBase = NULL;
     mapVerticesToDRAM = false;
-    mapInEdgesToDRAM = true;
+    mapInEdgeHeadersToDRAM = true;
+    mapInEdgeRestToDRAM = true;
     mapOutEdgesToDRAM = true;
     outTable = NULL;
-    inTable = NULL;
+    inTableHeaders = NULL;
+    inTableRest = NULL;
+    inTableBitmaps = NULL;
+    progRouterTables = NULL;
     chatty = 0;
     str = getenv("POLITE_CHATTY");
     if (str != NULL) {
@@ -127,14 +166,18 @@ template <typename DeviceType,
 
   // Each thread's in-edge and out-edge regions
   // (Not valid until the mapper is called)
-  uint8_t** inEdgeMem;      uint8_t** outEdgeMem;
-  uint32_t* inEdgeMemSize;  uint32_t* outEdgeMemSize;
-  uint32_t* inEdgeMemBase;  uint32_t* outEdgeMemBase;
+  uint8_t** inEdgeHeaderMem;      uint8_t** inEdgeRestMem;
+  uint32_t* inEdgeHeaderMemSize;  uint32_t* inEdgeRestMemSize;
+  uint32_t* inEdgeHeaderMemBase;  uint32_t* inEdgeRestMemBase;
+  uint8_t** outEdgeMem;
+  uint32_t* outEdgeMemSize;
+  uint32_t* outEdgeMemBase;
 
   // Where to map the various regions
   // (If false, map to SRAM instead)
   bool mapVerticesToDRAM;
-  bool mapInEdgesToDRAM;
+  bool mapInEdgeHeadersToDRAM;
+  bool mapInEdgeRestToDRAM;
   bool mapOutEdgesToDRAM;
 
   // Allow mapper to print useful information to stdout
@@ -189,9 +232,14 @@ template <typename DeviceType,
     threadMem = (uint8_t**) calloc(TinselMaxThreads, sizeof(uint8_t*));
     threadMemSize = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
     threadMemBase = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
-    inEdgeMem = (uint8_t**) calloc(TinselMaxThreads, sizeof(uint8_t*));
-    inEdgeMemSize = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
-    inEdgeMemBase = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
+    inEdgeHeaderMem = (uint8_t**) calloc(TinselMaxThreads, sizeof(uint8_t*));
+    inEdgeHeaderMemSize =
+      (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
+    inEdgeHeaderMemBase =
+      (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
+    inEdgeRestMem = (uint8_t**) calloc(TinselMaxThreads, sizeof(uint8_t*));
+    inEdgeRestMemSize = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
+    inEdgeRestMemBase = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
     outEdgeMem = (uint8_t**) calloc(TinselMaxThreads, sizeof(uint8_t*));
     outEdgeMemSize = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
     outEdgeMemBase = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t));
@@ -201,7 +249,8 @@ template <typename DeviceType,
       // partition.  The total partition size is larger as it includes
       // uninitialised portions.
       uint32_t sizeVMem = 0;
-      uint32_t sizeEIMem = 0;
+      uint32_t sizeEIHeaderMem = 0;
+      uint32_t sizeEIRestMem = 0;
       uint32_t sizeEOMem = 0;
       uint32_t sizeTMem = 0;
       // Add space for thread structure (always stored in SRAM)
@@ -212,10 +261,15 @@ template <typename DeviceType,
         // Add space for device
         sizeVMem = sizeVMem + sizeof(PState<S>);
       }
-      // Add space for incoming edge table
-      if (inTable[threadId]) {
-        sizeEIMem = inTable[threadId]->numElems * sizeof(PInEdge<E>);
-        sizeEIMem = wordAlign(sizeEIMem);
+      // Add space for incoming edge tables
+      if (inTableHeaders[threadId]) {
+        sizeEIHeaderMem = inTableHeaders[threadId]->numElems *
+                            sizeof(PInHeader<E>);
+        sizeEIHeaderMem = wordAlign(sizeEIHeaderMem);
+      }
+      if (inTableRest[threadId]) {
+        sizeEIRestMem = inTableRest[threadId]->numElems * sizeof(PInEdge<E>);
+        sizeEIRestMem = wordAlign(sizeEIRestMem);
       }
       // Add space for outgoing edge table
       for (uint32_t devNum = 0; devNum < numDevs; devNum++) {
@@ -234,8 +288,10 @@ template <typename DeviceType,
       uint32_t totalSizeDRAM = 0;
       if (mapVerticesToDRAM) totalSizeDRAM += totalSizeVMem;
                         else totalSizeSRAM += totalSizeVMem;
-      if (mapInEdgesToDRAM)  totalSizeDRAM += sizeEIMem;
-                        else totalSizeSRAM += sizeEIMem;
+      if (mapInEdgeHeadersToDRAM) totalSizeDRAM += sizeEIHeaderMem;
+                             else totalSizeSRAM += sizeEIHeaderMem;
+      if (mapInEdgeRestToDRAM) totalSizeDRAM += sizeEIRestMem;
+                          else totalSizeSRAM += sizeEIRestMem;
       if (mapOutEdgesToDRAM) totalSizeDRAM += sizeEOMem;
                         else totalSizeSRAM += sizeEOMem;
       if (totalSizeDRAM > maxDRAMSize) {
@@ -249,14 +305,17 @@ template <typename DeviceType,
       // Allocate space for the initialised portion of the partition
       assert((sizeVMem%4) == 0);
       assert((sizeTMem%4) == 0);
-      assert((sizeEIMem%4) == 0);
+      assert((sizeEIHeaderMem%4) == 0);
+      assert((sizeEIRestMem%4) == 0);
       assert((sizeEOMem%4) == 0);
       vertexMem[threadId] = (uint8_t*) calloc(sizeVMem, 1);
       vertexMemSize[threadId] = sizeVMem;
       threadMem[threadId] = (uint8_t*) calloc(sizeTMem, 1);
       threadMemSize[threadId] = sizeTMem;
-      inEdgeMem[threadId] = (uint8_t*) calloc(sizeEIMem, 1);
-      inEdgeMemSize[threadId] = sizeEIMem;
+      inEdgeHeaderMem[threadId] = (uint8_t*) calloc(sizeEIHeaderMem, 1);
+      inEdgeHeaderMemSize[threadId] = sizeEIHeaderMem;
+      inEdgeRestMem[threadId] = (uint8_t*) calloc(sizeEIRestMem, 1);
+      inEdgeRestMemSize[threadId] = sizeEIRestMem;
       outEdgeMem[threadId] = (uint8_t*) calloc(sizeEOMem, 1);
       outEdgeMemSize[threadId] = sizeEOMem;
       // Tinsel address of base of partition
@@ -278,13 +337,21 @@ template <typename DeviceType,
         vertexMemBase[threadId] = sramBase;
         sramBase += totalSizeVMem;
       }
-      if (mapInEdgesToDRAM) {
-        inEdgeMemBase[threadId] = dramBase;
-        dramBase += sizeEIMem;
+      if (mapInEdgeHeadersToDRAM) {
+        inEdgeHeaderMemBase[threadId] = dramBase;
+        dramBase += sizeEIHeaderMem;
       }
       else {
-        inEdgeMemBase[threadId] = sramBase;
-        sramBase += sizeEIMem;
+        inEdgeHeaderMemBase[threadId] = sramBase;
+        sramBase += sizeEIHeaderMem;
+      }
+      if (mapInEdgeRestToDRAM) {
+        inEdgeRestMemBase[threadId] = dramBase;
+        dramBase += sizeEIRestMem;
+      }
+      else {
+        inEdgeRestMemBase[threadId] = sramBase;
+        sramBase += sizeEIRestMem;
       }
       if (mapOutEdgesToDRAM) {
         outEdgeMemBase[threadId] = dramBase;
@@ -314,7 +381,8 @@ template <typename DeviceType,
       thread->devices = vertexMemBase[threadId];
       // Set tinsel address of base of edge tables
       thread->outTableBase = outEdgeMemBase[threadId];
-      thread->inTableBase = inEdgeMemBase[threadId];
+      thread->inTableHeaderBase = inEdgeHeaderMemBase[threadId];
+      thread->inTableRestBase = inEdgeRestMemBase[threadId];
       // Add space for each device on thread
       uint32_t numDevs = numDevicesOnThread[threadId];
       for (uint32_t devNum = 0; devNum < numDevs; devNum++) {
@@ -340,11 +408,18 @@ template <typename DeviceType,
         }
       }
       // Intialise thread's in edges
-      PInEdge<E>* inEdgeArray = (PInEdge<E>*) inEdgeMem[threadId];
-      Seq<PInEdge<E>>* edges = inTable[threadId];
+      PInHeader<E>* inEdgeHeaderArray =
+        (PInHeader<E>*) inEdgeHeaderMem[threadId];
+      Seq<PInHeader<E>>* headers = inTableHeaders[threadId];
+      if (headers)
+        for (uint32_t i = 0; i < headers->numElems; i++) {
+          inEdgeHeaderArray[i] = headers->elems[i];
+        }
+      PInEdge<E>* inEdgeRestArray = (PInEdge<E>*) inEdgeRestMem[threadId];
+      Seq<PInEdge<E>>* edges = inTableRest[threadId];
       if (edges)
         for (uint32_t i = 0; i < edges->numElems; i++) {
-          inEdgeArray[i] = edges->elems[i];
+          inEdgeRestArray[i] = edges->elems[i];
         }
       // At this point, check that next pointers line up with heap sizes
       if (nextVMem != vertexMemSize[threadId]) {
@@ -371,12 +446,27 @@ template <typename DeviceType,
   // Allocate routing tables
   // (Only valid after mapper is called)
   void allocateRoutingTables() {
-    // Receiver-side tables
-    inTable = (Seq<PInEdge<E>>**)
+    // Receiver-side tables (headers)
+    inTableHeaders = (Seq<PInHeader<E>>**)
+      calloc(TinselMaxThreads,sizeof(Seq<PInHeader<E>>*));
+    for (uint32_t t = 0; t < TinselMaxThreads; t++) {
+      if (numDevicesOnThread[t] != 0)
+        inTableHeaders[t] = new SmallSeq<PInHeader<E>>;
+    }
+
+    // Receiver-side tables (rest)
+    inTableRest = (Seq<PInEdge<E>>**)
       calloc(TinselMaxThreads,sizeof(Seq<PInEdge<E>>*));
     for (uint32_t t = 0; t < TinselMaxThreads; t++) {
       if (numDevicesOnThread[t] != 0)
-        inTable[t] = new SmallSeq<PInEdge<E>>;
+        inTableRest[t] = new SmallSeq<PInEdge<E>>;
+    }
+
+    // Receiver-side tables (bitmaps)
+    inTableBitmaps = (Bitmap**) calloc(TinselMaxThreads,sizeof(Bitmap*));
+    for (uint32_t t = 0; t < TinselMaxThreads; t++) {
+      if (numDevicesOnThread[t] != 0)
+        inTableBitmaps[t] = new Bitmap;
     }
 
     // Sender-side tables
@@ -389,174 +479,233 @@ template <typename DeviceType,
     }
   }
 
-  // Pack a receivers array
-  // Input: an in-edge sequence for each thread in a mailbox.
-  // Input array may contain lots of holes (0-element sequences)
-  // Output: a sequence of receiver groups
-  // Output array contains no empty receiver groups
-  void createReceiverGroups(
-        uint32_t mbox,
-        Seq<PInEdge<E>>* receivers,
-        Seq<PReceiverGroup<E>>* groups) {
-    groups->clear();
-    for (uint32_t i = 0; i < 64; i++) {
-      if (receivers[i].numElems > 0) {
-        // Add receiver group
-        PReceiverGroup<E> g;
-        g.threadId = (mbox << TinselLogThreadsPerMailbox) | i;
-        g.receivers = &receivers[i];
-        groups->append(g);
-      }
+  // Determine local-multicast routing key for given set of receivers
+  // (The key must be the same for all receivers)
+  uint32_t findKey(uint32_t numGroups) { 
+    // Fast path (single receiver)
+    if (numGroups == 1) {
+      Bitmap* bm = inTableBitmaps[groups[0].threadId];
+      return bm->grabNextBit();
     }
-  }
 
-  // Determine routing key for given set of receivers
-  // (The key must be the same for all receivers)
-  uint32_t findKey(Seq<PReceiverGroup<E>>* receivers) { 
-    uint32_t key = 0;
-
-    bool found = false;
-    while (!found) {
-      found = true; 
-      for (uint32_t i = 0; i < receivers->numElems; i++) {
-        PReceiverGroup<E> g = receivers->elems[i];
-        uint32_t numReceivers = g.receivers->numElems;
-        if (numReceivers > 0) {
-          // Lookup thread id of receiver
-          uint32_t t = g.threadId;
-          // Lookup table size for this thread
-          uint32_t tableSize = inTable[t]->numElems;
-          // Move to next receiver when we find a space
-          if (key >= tableSize) continue;
-          // Is there space at the current key?
-          // (Need space for numReceivers plus null terminator)
-          bool space = true;
-          for (int j = 0; j < numReceivers+1; j++) {
-            if ((key+j) >= tableSize) break;
-            if (inTable[t]->elems[key+j].devId != UnusedLocalDevId) {
-              found = false;
-              key = key+j+1;
-              break;
-            }
-          }
-        }
+    // Determine starting index for key search
+    uint32_t index = 0;
+    for (uint32_t i = 0; i < numGroups; i++) {
+      PReceiverGroup<E>* g = &groups[i];
+      Bitmap* bm = inTableBitmaps[g->threadId];
+      if (bm->firstFree > index) index = bm->firstFree;
+    }
+
+    // Find key that is available for all receivers
+    uint64_t mask;
+    retry:
+      mask = 0ul;
+      for (uint32_t i = 0; i < numGroups; i++) {
+        PReceiverGroup<E>* g = &groups[i];
+        Bitmap* bm = inTableBitmaps[g->threadId];
+        mask |= bm->getWord(index);
+        if (~mask == 0ul) { index++; goto retry; }
       }
+
+    // Mark key as taken in each bitmap
+    uint32_t bit = __builtin_ctzll(~mask);
+    for (uint32_t i = 0; i < numGroups; i++) {
+      PReceiverGroup<E>* g = &groups[i];
+      Bitmap* bm = inTableBitmaps[g->threadId];
+      bm->setBit(index, bit);
     }
-    return key;
+    return 64*index + bit;
   }
 
   // Add entries to the input tables for the given receivers
   // (Only valid after mapper is called)
-  uint32_t addInTableEntries(Seq<PReceiverGroup<E>>* receivers) {
-    uint32_t key = findKey(receivers);
-    if (key >= 0xfffe) {
+  uint32_t addInTableEntries(uint32_t numGroups) {
+    uint32_t key = findKey(numGroups);
+    if (key >= 0xffff) {
       printf("Routing key exceeds 16 bits\n");
       exit(EXIT_FAILURE);
     }
-    PInEdge<E> null, unused;
-    null.devId = InvalidLocalDevId;
-    unused.devId = UnusedLocalDevId;
-    // Now that a key with sufficient space has been found, populate the tables
-    for (uint32_t i = 0; i < receivers->numElems; i++) {
-      PReceiverGroup<E> g = receivers->elems[i];
-      uint32_t numReceivers = g.receivers->numElems;
-      if (numReceivers > 0) {
-        // Lookup thread id of receiver
-        uint32_t t = g.threadId;
-        // Lookup table size for this thread
-        uint32_t tableSize = inTable[t]->numElems;
-        // Make sure inTable is big enough for new entries
-        for (uint32_t j = tableSize; j < (key+numReceivers+1); j++)
-          inTable[t]->append(unused);
-        // Add receivers to thread's inTable
-        for (uint32_t j = 0; j < numReceivers; j++) {
-          inTable[t]->elems[key+j] = g.receivers->elems[j];
+    // Populate inTableHeaders and inTableRest using the key
+    for (uint32_t i = 0; i < numGroups; i++) {
+      PReceiverGroup<E>* g = &groups[i];
+      uint32_t numEdges = g->receivers.numElems;
+      PInEdge<E>* edgePtr = g->receivers.elems;
+      if (numEdges > 0) {
+        // Determine thread id of receiver
+        uint32_t t = g->threadId;
+        // Extend table
+        Seq<PInHeader<E>>* headers = inTableHeaders[t];
+        if (key >= headers->numElems)
+          headers->extendBy(key + 1 - headers->numElems);
+        // Fill in header
+        PInHeader<E>* header = &inTableHeaders[t]->elems[key];
+        header->numReceivers = numEdges;
+        if (inTableRest[t]->numElems > 0xffff) {
+          printf("In-table index exceeds 16 bits\n");
+          exit(EXIT_FAILURE);
+        }
+        header->restIndex = inTableRest[t]->numElems;
+        uint32_t numHeaderEdges = numEdges < POLITE_EDGES_PER_HEADER ?
+          numEdges : POLITE_EDGES_PER_HEADER;
+        for (uint32_t j = 0; j < numHeaderEdges; j++) {
+          header->edges[j] = *edgePtr;
+          edgePtr++;
+        }
+        numEdges -= numHeaderEdges;
+        // Overflow into rest memory if header not big enough
+        for (uint32_t j = 0; j < numEdges; j++) {
+          inTableRest[t]->append(*edgePtr);
+          edgePtr++;
         }
-        inTable[t]->elems[key+numReceivers] = null;
       }
     }
     return key;
   }
 
+  // Split the edges leaving devId on pin pinId into destinations on the
+  // sender's board (local) and on other boards (nonLocal), then sort each
+  // list by destination thread id (only valid after mapper is called)
+  void splitDests(PDeviceId devId, PinId pinId,
+                    Seq<PEdgeDest>* local, Seq<PEdgeDest>* nonLocal) {
+    local->clear();
+    nonLocal->clear();
+    PDeviceAddr devAddr = toDeviceAddr[devId];
+    uint32_t devBoard = getThreadId(devAddr) >> TinselLogThreadsPerBoard;
+    // Split destinations into local/non-local
+    Seq<PDeviceId>* dests = graph.outgoing->elems[devId];
+    Seq<PinId>* pinIds = graph.pins->elems[devId];
+    // e.index must be the position in the full edge list (labels use it)
+    for (uint32_t d = 0; d < dests->numElems; d++) {
+      if (pinIds->elems[d] == pinId) {
+        PEdgeDest e;
+        e.index = d;
+        e.dest = dests->elems[d];
+        e.addr = toDeviceAddr[e.dest];
+        uint32_t destBoard = getThreadId(e.addr) >> TinselLogThreadsPerBoard;
+        if (devBoard == destBoard)
+          local->append(e);
+        else
+          nonLocal->append(e);
+      }
+    }
+    // Sort local list
+    qsort(local->elems, local->numElems, sizeof(PEdgeDest), cmpEdgeDest);
+    // Sort non-local list
+    qsort(nonLocal->elems, nonLocal->numElems, sizeof(PEdgeDest), cmpEdgeDest);
+  }
+
+  // Build receiver tables for the sorted destinations of device d and emit
+  // one PRoutingDest per mailbox into out (only valid after mapper is called)
+  void computeTables(Seq<PEdgeDest>* local, uint32_t d,
+         Seq<PRoutingDest>* out) {
+    out->clear();
+    uint32_t index = 0;
+    while (index < local->numElems) {
+      // New set of receiver groups on same mailbox
+      uint32_t threadMaskLow = 0;
+      uint32_t threadMaskHigh = 0;
+      uint32_t nextGroup = 0;
+      // Current mailbox & thread being considered
+      PDeviceAddr mbox = getThreadId(local->elems[index].addr) >>
+                           TinselLogThreadsPerMailbox;
+      uint32_t thread = getThreadId(local->elems[index].addr) &
+                          ((1<<TinselLogThreadsPerMailbox)-1);
+      // Determine edges targeting same mailbox
+      while (index < local->numElems) {
+        PEdgeDest* edge = &local->elems[index];
+        // Determine destination mailbox address and mailbox-local thread
+        uint32_t destMailbox = getThreadId(edge->addr) >>
+                                 TinselLogThreadsPerMailbox;
+        uint32_t destThread = getThreadId(edge->addr) &
+                                 ((1<<TinselLogThreadsPerMailbox)-1);
+        // Does destination match current destination?
+        if (destMailbox == mbox) {
+          if (destThread == thread) {
+            // Add to current receiver group
+            PInEdge<E> in;
+            in.devId = getLocalDeviceId(edge->addr);
+            Seq<E>* edges = edgeLabels.elems[d];
+            if (! std::is_same<E, None>::value)
+              in.edge = edges->elems[edge->index];
+            // Group is keyed by full thread id (in-tables are per thread)
+            groups[nextGroup].receivers.append(in);
+            groups[nextGroup].threadId = getThreadId(edge->addr);
+            if (thread < 32) threadMaskLow |= 1 << thread;
+            if (thread >= 32) threadMaskHigh |= 1 << (thread-32);
+            index++;
+          }
+          else {
+            // Start new receiver group
+            thread = destThread;
+            nextGroup++;
+            assert(nextGroup < TinselThreadsPerMailbox);
+          }
+        }
+        else break;
+      }
+      // Add input table entries
+      uint32_t key = addInTableEntries(nextGroup+1);
+      // Add output entry
+      PRoutingDest dest;
+      dest.kind = PRDestKindMRM;
+      dest.mbox = mbox;
+      dest.mrm.key = key;
+      dest.mrm.threadMaskLow = threadMaskLow;
+      dest.mrm.threadMaskHigh = threadMaskHigh;
+      out->append(dest);
+      // Clear receiver groups, for a new iteration
+      for (uint32_t i = 0; i <= nextGroup; i++) groups[i].receivers.clear();
+    }
+  }
+
   // Compute routing tables
   // (Only valid after mapper is called)
   void computeRoutingTables() {
-    // Routing table stats
-    uint64_t totalOutEdges = 0;
+    // Edge destinations (local to sender board, or not)
+    Seq<PEdgeDest> local;
+    Seq<PEdgeDest> nonLocal;
 
-    // Sequence of local device ids, for each multicast destiation
-    SmallSeq<PInEdge<E>> receivers[64];
+    // Routing destinations
+    Seq<PRoutingDest> dests;
 
-    // Sequence of receiver groups
-    // (A more compact representation of the receivers array)
-    SmallSeq<PReceiverGroup<E>> groups;
+    // Allocate per-board programmable routing tables
+    progRouterTables = new ProgRouterMesh(numBoardsX, numBoardsY);
 
     // For each device
     for (uint32_t d = 0; d < numDevices; d++) {
       // For each pin
       for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) {
-        Seq<PDeviceId> dests = *(graph.outgoing->elems[d]);
-        Seq<E> edges = *(edgeLabels.elems[d]);
-        // While destinations are remaining
-        while (dests.numElems > 0) {
-          // Clear receivers
-          for (uint32_t i = 0; i < 64; i++) receivers[i].clear();
-          uint32_t threadMaskLow = 0;
-          uint32_t threadMaskHigh = 0;
-          // Current mailbox being considered
-          PDeviceAddr mbox = getThreadId(toDeviceAddr[dests.elems[0]]) >>
-                               TinselLogThreadsPerMailbox;
-          // For each destination
-          uint32_t destsRemaining = 0;
-          for (uint32_t i = 0; i < dests.numElems; i++) {
-            // Determine destination mailbox address and mailbox-local thread
-            PDeviceId destId = dests.elems[i];
-            PDeviceAddr destAddr = toDeviceAddr[destId];
-            uint32_t destMailbox = getThreadId(destAddr) >>
-                                     TinselLogThreadsPerMailbox;
-            uint32_t destThread = getThreadId(destAddr) &
-                                     ((1<<TinselLogThreadsPerMailbox)-1);
-            // Does destination match current destination?
-            if (destMailbox == mbox) {
-              PInEdge<E> edge;
-              edge.devId = getLocalDeviceId(destAddr);
-              if (! std::is_same<E, None>::value) edge.edge = edges.elems[i];
-              receivers[destThread].append(edge);
-              if (destThread < 32) threadMaskLow |= 1 << destThread;
-              if (destThread >= 32) threadMaskHigh |= 1 << (destThread-32);
-            }
-            else {
-              // Add destination back into sequence
-              dests.elems[destsRemaining] = dests.elems[i];
-              edges.elems[destsRemaining] = edges.elems[i];
-              destsRemaining++;
-            }
-          }
-          // Create receiver groups
-          createReceiverGroups(mbox, receivers, &groups);
-          // Add input table entries
-          uint32_t key = addInTableEntries(&groups);
-          // Add output table entry
+        // Split edge lists into local/non-local and sort by target thread id
+        splitDests(d, p, &local, &nonLocal);
+        // Deal with board-local connections
+        computeTables(&local, d, &dests);
+        for (uint32_t i = 0; i < dests.numElems; i++) {
+          PRoutingDest dest = dests.elems[i];
           POutEdge edge;
-          edge.mbox = mbox;
-          edge.key = key;
-          edge.threadMaskLow = threadMaskLow;
-          edge.threadMaskHigh = threadMaskHigh;
+          edge.mbox = dest.mbox;
+          edge.key = dest.mrm.key;
+          edge.threadMaskLow = dest.mrm.threadMaskLow;
+          edge.threadMaskHigh = dest.mrm.threadMaskHigh;
           outTable[d][p]->append(edge);
-          // Prepare for new output table entry
-          dests.numElems = destsRemaining;
-          edges.numElems = destsRemaining;
-          totalOutEdges++;
         }
-        // Add output edge terminator
+        // Deal with non-board-local connections
+        computeTables(&nonLocal, d, &dests);
+        uint32_t src = getThreadId(toDeviceAddr[d]) >>
+          TinselLogThreadsPerMailbox;
+        uint32_t key = progRouterTables->addDestsFromBoard(src, &dests);
+        POutEdge edge;
+        edge.mbox = tinselUseRoutingKey();
+        edge.key = 0;
+        edge.threadMaskLow = key;
+        edge.threadMaskHigh = 0; 
+        outTable[d][p]->append(edge);
+        // Add output list terminator
         POutEdge term;
         term.key = InvalidKey;
         outTable[d][p]->append(term);
       }
     }
-    //printf("Average edges per pin: %lu\n",
-    //  totalOutEdges / (numDevices * POLITE_NUM_PINS);
-  }  
+  }
 
   // Release all structures
   void releaseAll() {
@@ -578,21 +727,38 @@ template <typename DeviceType,
       free(threadMemSize);
       free(threadMemBase);
       for (uint32_t t = 0; t < TinselMaxThreads; t++)
-        if (inEdgeMem[t] != NULL) free(inEdgeMem[t]);
-      free(inEdgeMem);
-      free(inEdgeMemSize);
-      free(inEdgeMemBase);
+        if (inEdgeHeaderMem[t] != NULL) free(inEdgeHeaderMem[t]);
+      free(inEdgeHeaderMem);
+      free(inEdgeHeaderMemSize);
+      free(inEdgeHeaderMemBase);
+      for (uint32_t t = 0; t < TinselMaxThreads; t++)
+        if (inEdgeRestMem[t] != NULL) free(inEdgeRestMem[t]);
+      free(inEdgeRestMem);
+      free(inEdgeRestMemSize);
+      free(inEdgeRestMemBase);
       for (uint32_t t = 0; t < TinselMaxThreads; t++)
         if (outEdgeMem[t] != NULL) free(outEdgeMem[t]);
       free(outEdgeMem);
       free(outEdgeMemSize);
       free(outEdgeMemBase);
     }
-    if (inTable != NULL) {
+    if (inTableHeaders != NULL) {
+      for (uint32_t t = 0; t < TinselMaxThreads; t++)
+        if (inTableHeaders[t] != NULL) delete inTableHeaders[t];
+      free(inTableHeaders);
+      inTableHeaders = NULL;
+    }
+    if (inTableRest != NULL) {
       for (uint32_t t = 0; t < TinselMaxThreads; t++)
-        if (inTable[t] != NULL) delete inTable[t];
-      free(inTable);
-      inTable = NULL;
+        if (inTableRest[t] != NULL) delete inTableRest[t];
+      free(inTableRest);
+      inTableRest = NULL;
+    }
+    if (inTableBitmaps != NULL) {
+      for (uint32_t t = 0; t < TinselMaxThreads; t++)
+        if (inTableBitmaps[t] != NULL) delete inTableBitmaps[t];
+      free(inTableBitmaps);
+      inTableBitmaps = NULL;
     }
     if (outTable != NULL) {
       for (uint32_t d = 0; d < numDevices; d++) {
@@ -604,6 +770,7 @@ template <typename DeviceType,
       free(outTable);
       outTable = NULL;
     }
+    delete progRouterTables; progRouterTables = NULL;  // avoid double free
   }
 
   // Implement mapping to tinsel threads
@@ -813,8 +980,11 @@ template <typename DeviceType,
     hostLink->useSendBuffer = true;
     writeRAM(hostLink, vertexMem, vertexMemSize, vertexMemBase);
     writeRAM(hostLink, threadMem, threadMemSize, threadMemBase);
-    writeRAM(hostLink, inEdgeMem, inEdgeMemSize, inEdgeMemBase);
+    writeRAM(hostLink, inEdgeHeaderMem,
+               inEdgeHeaderMemSize, inEdgeHeaderMemBase);
+    writeRAM(hostLink, inEdgeRestMem, inEdgeRestMemSize, inEdgeRestMemBase);
     writeRAM(hostLink, outEdgeMem, outEdgeMemSize, outEdgeMemBase);
+    progRouterTables->write(hostLink);
     hostLink->flush();
     hostLink->useSendBuffer = useSendBufferOld;
 
@@ -838,7 +1008,6 @@ template <typename DeviceType,
   uint32_t fanOut(PDeviceId id) {
     return graph.fanOut(id);
   }
-
 };
 
 // Read performance stats and store in file
diff --git a/include/tinsel-interface.h b/include/tinsel-interface.h
index 352b4461..21dfdfcb 100644
--- a/include/tinsel-interface.h
+++ b/include/tinsel-interface.h
@@ -175,4 +175,13 @@ INLINE uint32_t tinselAccId(
   return addr;
 }
 
+// Special destination address that signifies use of a routing key
+INLINE uint32_t tinselUseRoutingKey()
+{
+  // Exactly one bit is set, at the index given by the sum below; the
+  // unsigned literal keeps the shift well-defined for any bit index < 32
+  return 1u << (TinselMailboxMeshYBits + TinselMailboxMeshXBits +
+                TinselMeshXBits + TinselMeshYBits + 2);
+}
+
 #endif
diff --git a/include/tinsel.h b/include/tinsel.h
index ab29fbae..0b88844d 100644
--- a/include/tinsel.h
+++ b/include/tinsel.h
@@ -193,11 +193,7 @@ INLINE void tinselSend(int dest, volatile void* addr)
 // Send message at addr using given routing key
 INLINE void tinselKeySend(int key, volatile void* addr)
 {
-  // Special address to signify use of routing key
-  uint32_t useRoutingKey = 1 <<
-    (TinselMailboxMeshYBits + TinselMailboxMeshXBits +
-     TinselMeshXBits + TinselMeshYBits + 2);
-  tinselMulticast(useRoutingKey, 0, key, addr);
+  tinselMulticast(tinselUseRoutingKey(), 0, key, addr);
 }
 
 // Receive message

From 01de961bca3f3743f2ec038e7f89c896018ceb77 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mn416@jennings.cl.cam.ac.uk>
Date: Tue, 23 Jun 2020 09:04:34 +0000
Subject: [PATCH 73/78] Silly mistakes

---
 include/POLite/Bitmap.h | 4 ++--
 include/POLite/PGraph.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/POLite/Bitmap.h b/include/POLite/Bitmap.h
index 262f99af..0a165896 100644
--- a/include/POLite/Bitmap.h
+++ b/include/POLite/Bitmap.h
@@ -47,9 +47,9 @@ struct Bitmap {
 
   // Find index of next zero bit, and flip that bit
   inline uint32_t grabNextBit() {
-    uint64_t word = getWord(firstFree);
+    uint64_t word = ~getWord(firstFree);
     assert(word != 0ul);
-    uint32_t bit = __builtin_ctzll(~word);
+    uint32_t bit = __builtin_ctzll(word);
     setBit(firstFree, bit);
     return 64*firstFree + bit;
   }
diff --git a/include/POLite/PGraph.h b/include/POLite/PGraph.h
index c5e5b41f..5efaa15a 100644
--- a/include/POLite/PGraph.h
+++ b/include/POLite/PGraph.h
@@ -629,7 +629,7 @@ template <typename DeviceType,
               in.edge = edges->elems[edge->index];
             // Update current receiver group
             groups[nextGroup].receivers.append(in);
-            groups[nextGroup].threadId = thread;
+            groups[nextGroup].threadId = getThreadId(edge->addr);
             if (thread < 32) threadMaskLow |= 1 << thread;
             if (thread >= 32) threadMaskHigh |= 1 << (thread-32);
             index++;

From 8d3807f8fc5ea63c08a6c07f28c6c4655f9f15c5 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mn416@jennings.cl.cam.ac.uk>
Date: Tue, 23 Jun 2020 10:13:14 +0000
Subject: [PATCH 74/78] Another mistake

---
 include/POLite/PGraph.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/POLite/PGraph.h b/include/POLite/PGraph.h
index 5efaa15a..909db3ec 100644
--- a/include/POLite/PGraph.h
+++ b/include/POLite/PGraph.h
@@ -574,11 +574,10 @@ template <typename DeviceType,
     // Split destinations into local/non-local
     Seq<PDeviceId>* dests = graph.outgoing->elems[devId];
     Seq<PinId>* pinIds = graph.pins->elems[devId];
-    uint32_t index = 0;
     for (uint32_t d = 0; d < dests->numElems; d++) {
       if (pinIds->elems[d] == pinId) {
         PEdgeDest e;
-        e.index = index++;
+        e.index = d;
         e.dest = dests->elems[d];
         e.addr = toDeviceAddr[e.dest];
         uint32_t destBoard = getThreadId(e.addr) >> TinselLogThreadsPerBoard;

From d309559f37895d5ea7b0f228910b2faec5373060 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mn416@jennings.cl.cam.ac.uk>
Date: Tue, 23 Jun 2020 15:53:55 +0000
Subject: [PATCH 75/78] More fixes and tweaks

---
 include/POLite/Bitmap.h  |  9 +++++----
 include/POLite/PDevice.h | 10 ++++++++++
 include/POLite/PGraph.h  | 12 ++++++------
 3 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/include/POLite/Bitmap.h b/include/POLite/Bitmap.h
index 0a165896..9271bc07 100644
--- a/include/POLite/Bitmap.h
+++ b/include/POLite/Bitmap.h
@@ -47,11 +47,12 @@ struct Bitmap {
 
   // Find index of next zero bit, and flip that bit
   inline uint32_t grabNextBit() {
-    uint64_t word = ~getWord(firstFree);
-    assert(word != 0ul);
-    uint32_t bit = __builtin_ctzll(word);
+    uint64_t word = getWord(firstFree);
+    assert(~word != 0ul);
+    uint32_t bit = __builtin_ctzll(~word);
+    uint32_t result = 64*firstFree + bit;
     setBit(firstFree, bit);
-    return 64*firstFree + bit;
+    return result;
   }
 };
 
diff --git a/include/POLite/PDevice.h b/include/POLite/PDevice.h
index 6ba3be83..508207bd 100644
--- a/include/POLite/PDevice.h
+++ b/include/POLite/PDevice.h
@@ -135,6 +135,16 @@ template <typename E> struct PInEdge {
   E edge;
 };
 
+// An incoming edge to a device (unlabelled)
+template <> struct PInEdge<None> {
+  union {
+    // Destination device
+    PLocalDeviceId devId;
+    // Unused
+    None edge;
+  };
+};
+
 // Header for a list of incoming edges (fixed size structure to
 // support fast construction/packing of local-multicast tables)
 template <typename E> struct PInHeader {
diff --git a/include/POLite/PGraph.h b/include/POLite/PGraph.h
index 909db3ec..a1ecc739 100644
--- a/include/POLite/PGraph.h
+++ b/include/POLite/PGraph.h
@@ -595,23 +595,23 @@ template <typename DeviceType,
 
   // Compute table updates for destinations for given device
   // (Only valid after mapper is called)
-  void computeTables(Seq<PEdgeDest>* local, uint32_t d, 
+  void computeTables(Seq<PEdgeDest>* dests, uint32_t d,
          Seq<PRoutingDest>* out) {
     out->clear();
     uint32_t index = 0;
-    while (index < local->numElems) {
+    while (index < dests->numElems) {
       // New set of receiver groups on same mailbox
       uint32_t threadMaskLow = 0;
       uint32_t threadMaskHigh = 0;
       uint32_t nextGroup = 0;
       // Current mailbox & thread being considered
-      PDeviceAddr mbox = getThreadId(local->elems[index].addr) >>
+      PDeviceAddr mbox = getThreadId(dests->elems[index].addr) >>
                            TinselLogThreadsPerMailbox;
-      uint32_t thread = getThreadId(local->elems[index].addr) &
+      uint32_t thread = getThreadId(dests->elems[index].addr) &
                           ((1<<TinselLogThreadsPerMailbox)-1);
       // Determine edges targetting same mailbox
-      while (index < local->numElems) {
-        PEdgeDest* edge = &local->elems[index];
+      while (index < dests->numElems) {
+        PEdgeDest* edge = &dests->elems[index];
         // Determine destination mailbox address and mailbox-local thread
         uint32_t destMailbox = getThreadId(edge->addr) >>
                                  TinselLogThreadsPerMailbox;

From 0bb89acac23dabbfcef8fa9e4934eabf7da8c5cb Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mattfn@gmail.com>
Date: Tue, 23 Jun 2020 20:24:35 +0100
Subject: [PATCH 76/78] Update README

---
 README.md | 57 +++++++++++++++++++++++++++----------------------------
 1 file changed, 28 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index e4fdf72c..77883461 100644
--- a/README.md
+++ b/README.md
@@ -1299,13 +1299,21 @@ by each thread.
 After mapping, POLite writes the graph into cluster memory and
 triggers execution.  By default, vertex states are written into the
 off-chip QDRII+ SRAMs, and edge lists are written in the DDR3 DRAMs.
-This default behaviour can be modified by setting the boolean flags
-`graph.mapVerticesToDRAM`, `graph.mapInEdgesToDRAM`,
-`graph.mapOutEdgesToDRAM` accordingly (true means "map to DRAM" and
-false means "map to SRAM").  Once the application is up and running,
-the host and the graph vertices can continue to communicate: any
-vertex can send messages to the host via the `HostPin` or the `finish`
-handler, and the host can send messages to any vertex.
+This default behaviour can be modified by adjusting the following
+flags of the `PGraph` class.
+
+  Flag                     | Default
+  ------------------------ | -------
+  `mapVerticesToDRAM`      | `false`
+  `mapInEdgeHeadersToDRAM` | `true`
+  `mapInEdgeRestToDRAM`    | `true`
+  `mapOutEdgesToDRAM`      | `true`
+
+A value of `true` means "map to DRAM", while `false` means "map to
+(off-chip) SRAM".  Once the application is up and running, the host
+and the graph vertices can continue to communicate: any vertex can
+send messages to the host via the `HostPin` or the `finish` handler,
+and the host can send messages to any vertex.
 
 **Softswitch**. Central to POLite is an event loop running on each
 Tinsel thread, which we call the softswitch as it effectively
@@ -1321,20 +1329,12 @@ required, to meet the semantics of the POLite library.
 before the first instance of `#include <POLite.h>`, to control some
 aspects of POLite behaviour.
 
-  Macro               | Meaning
-  ---------           | -------
-  `POLITE_NUM_PINS`   | Max number of pins per vertex (default 1)
-  `POLITE_DUMP_STATS` | Dump stats upon completion
-  `POLITE_COUNT_MSGS` | Include message counts in stats dump
-
-POLite supports three mapping modes, also controlled via macros:
-
- 
-  Macro               | Use when graphs have...
-  ---------           | -----------------------
-  `POLITE_MAP_LOCAL`  | ...lots of local connections and few distributed connections
-  `POLITE_MAP_DIST`   | ...lots of distributed connections and few local connections (this mapper is fast)
-  `POLITE_MAP_HYBRID` | ...a mix of local and distributed connections (default)
+  Macro                     | Meaning
+  ---------                 | -------
+  `POLITE_NUM_PINS`         | Max number of pins per vertex (default 1)
+  `POLITE_DUMP_STATS`       | Dump stats upon completion
+  `POLITE_COUNT_MSGS`       | Include message counts in stats dump
+  `POLITE_EDGES_PER_HEADER` | Lower this for large edge states (default 6)
 
 **POLite dynamic parameters**.  The following environment variables can
 be set, to control some aspects of POLite behaviour.
@@ -1346,14 +1346,13 @@ be set, to control some aspects of POLite behaviour.
   `POLITE_BOARDS_X`    | Size of board mesh to use in X dimension
   `POLITE_BOARDS_Y`    | Size of board mesh to use in Y dimension
   `POLITE_CHATTY`      | Set to `1` to enable emission of mapper stats
-  `POLITE_PLACER`      | Use `metis`, `random`, or `direct` placement
-
-**Limitations**. POLite provides several important features of the
-vertex-centric paradigm, but there are lots of limitations and quirks;
-it is only intended as a prototype library for hardware evaluation
-purposes. One of the features of the Pregel framework is the ability
-for vertices to add and remove vertices and edges at runtime -- but
-currently, POLite only supports static graphs. 
+  `POLITE_PLACER`      | Use `metis`, `random`, `bfs`, or `direct` placement
+
+**Limitations**. POLite is primarily intended as a prototype library
+for hardware evaluation purposes. It occupies a single, simple point
+in a wider, richer design space.  In particular, it doesn't support
+dynamic creation of vertices and edges, and it hasn't been optimised
+to deal with highly non-uniform fanouts.
 
 ## A. DE5-Net Synthesis Report
 

From f2fda2c8598c3a7a9e02d475d3a1fd85a3f5d1ab Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mn416@jennings.cl.cam.ac.uk>
Date: Wed, 24 Jun 2020 08:25:55 +0000
Subject: [PATCH 77/78] Use OpenMP during hierarchical partitioning

---
 apps/POLite/util/polite.mk |  2 +-
 include/POLite/PGraph.h    |  1 +
 include/POLite/Placer.h    | 13 ++++++++++---
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/apps/POLite/util/polite.mk b/apps/POLite/util/polite.mk
index a1d96f83..4abe32ee 100644
--- a/apps/POLite/util/polite.mk
+++ b/apps/POLite/util/polite.mk
@@ -51,7 +51,7 @@ $(HL)/%.o:
 
 $(BUILD)/run: $(RUN_CPP) $(RUN_H) $(HL)/*.o
 	g++ -std=c++11 -O2 -I $(INC) -I $(HL) -o $(BUILD)/run $(RUN_CPP) $(HL)/*.o \
-	  -lmetis -fno-exceptions
+	  -lmetis -fno-exceptions -fopenmp
 
 $(BUILD)/sim: $(RUN_CPP) $(RUN_H) $(HL)/sim/*.o
 	g++ -O2 -I $(INC) -I $(HL) -o $(BUILD)/sim $(RUN_CPP) $(HL)/sim/*.o \
diff --git a/include/POLite/PGraph.h b/include/POLite/PGraph.h
index a1ecc739..1a67e2ef 100644
--- a/include/POLite/PGraph.h
+++ b/include/POLite/PGraph.h
@@ -796,6 +796,7 @@ template <typename DeviceType,
     boards.place(placerEffort);
 
     // For each board
+    #pragma omp parallel for collapse(2)
     for (uint32_t boardY = 0; boardY < numBoardsY; boardY++) {
       for (uint32_t boardX = 0; boardX < numBoardsX; boardX++) {
         // Partition into subgraphs, one per mailbox
diff --git a/include/POLite/Placer.h b/include/POLite/Placer.h
index d2f2378a..57c00444 100644
--- a/include/POLite/Placer.h
+++ b/include/POLite/Placer.h
@@ -6,6 +6,7 @@
 #include <metis.h>
 #include <POLite/Graph.h>
 #include <queue>
+#include <omp.h>
 
 typedef uint32_t PartitionId;
 
@@ -52,6 +53,11 @@ struct Placer {
   uint32_t* yCoordSaved;
   uint64_t savedCost;
 
+  // Random numbers
+  unsigned int seed;
+  void setRand(unsigned int s) { seed = s; };
+  int getRand() { return rand_r(&seed); }
+
   // Controls which strategy is used
   Method method = Default;
 
@@ -160,9 +166,8 @@ struct Placer {
     uint32_t numParts = width * height;
 
     // Populate result array
-    srand(0);
     for (uint32_t i = 0; i < numVertices; i++) {
-      partitions[i] = rand() % numParts;
+      partitions[i] = getRand() % numParts;
     }
   }
 
@@ -308,7 +313,7 @@ struct Placer {
     // Random mapping
     for (uint32_t y = 0; y < height; y++) {
       for (uint32_t x = 0; x < width; x++) {
-        int index = rand() % numPartitions;
+        int index = getRand() % numPartitions;
         PartitionId p = pids[index];
         mapping[y][x] = p;
         xCoord[p] = x;
@@ -424,6 +429,8 @@ struct Placer {
     graph = g;
     width = w;
     height = h;
+    // Random seed
+    setRand(1 + omp_get_thread_num());
     // Allocate the partitions array
     partitions = new PartitionId [g->incoming->numElems];
     // Allocate subgraphs

From 473d9061d9772e04d9978cf3d8aba6277cb65de9 Mon Sep 17 00:00:00 2001
From: Matthew Naylor <mn416@jennings.cl.cam.ac.uk>
Date: Wed, 24 Jun 2020 12:27:13 +0000
Subject: [PATCH 78/78] Set release date

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 77883461..a66aed56 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ Released on 2 Dec 2019 and maintained in the
 [tinsel-0.7.1 branch](https://github.com/POETSII/tinsel/tree/tinsel-0.7.1).
 (Local hardware multicast.)
 * [v0.8](https://github.com/POETSII/tinsel/releases/tag/v0.8):
-Released on 18 May 2020 and maintained in the
+Released on 24 Jun 2020 and maintained in the
 [master branch](https://github.com/POETSII/tinsel/).
 (Global hardware multicast.)
 
@@ -74,9 +74,9 @@ main features are:
 
   * **Multithreading**.  A critical aspect of the design
     is to tolerate latency as cleanly as possible.  This includes the
-    latencies arising from: floating-point on Stratix V FPGAs
-    (tens of cycles); off-chip memories; deep pipelines
-    (keeping Fmax high); and sharing of resources between cores
+    latencies arising from floating-point on Stratix V FPGAs
+    (tens of cycles), off-chip memories, deep pipelines
+    (keeping Fmax high), and sharing of resources between cores
     (such as caches, mailboxes, and FPUs).
 
   * **Message-passing**. Although there is a requirement to support a