From d0a4acfc698be54a5c574d78021977fd9e6344b1 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 14 Jan 2020 14:24:47 +0000 Subject: [PATCH 01/78] Per-board router A 2D mesh of board routers (one per board) now connects together the 2D NoCs on each board. This means that messages can pass over a board without traversing the NoC. The per-board router is not yet programmable, but that is the direction we are heading. --- rtl/Network.bsv | 221 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 148 insertions(+), 73 deletions(-) diff --git a/rtl/Network.bsv b/rtl/Network.bsv index 3efbb480..bd435a11 100644 --- a/rtl/Network.bsv +++ b/rtl/Network.bsv @@ -146,11 +146,8 @@ module mkMeshRouter#(MailboxId m) (MeshRouter); // Routing function function Route route(NetAddr a); - if (a.addr.board.y < b.y) return Down; - else if (a.addr.board.y > b.y) return Up; - else if (a.addr.host.valid) return a.addr.host.value == 0 ? Left : Right; - else if (a.addr.board.x < b.x) return Left; - else if (a.addr.board.x > b.x) return Right; + if (a.addr.board != b) return Down; + else if (a.addr.host.valid) return Down; else if (a.addr.mbox.y < m.y) return Down; else if (a.addr.mbox.y > m.y) return Up; else if (a.addr.mbox.x < m.x) return Left; @@ -221,6 +218,104 @@ module mkMeshRouter#(MailboxId m) (MeshRouter); endmodule +// ============================================================================= +// Board router +// ============================================================================= + +// Similar to a mesh router, but: (1) different routing function, +// which routes between boards rather than mailboxes; (2) no loopback +// in the sense that packets coming from mailbox mesh never get routed back +// onto mailbox mesh. This is a first step towards supporting +// programmable board routers. 
+module mkBoardRouter(MeshRouter); + + // Board id + Wire#(BoardId) b <- mkDWire(?); + + // Ports + InPort#(Flit) leftInPort <- mkInPort; + OutPort#(Flit) leftOutPort <- mkOutPort; + InPort#(Flit) rightInPort <- mkInPort; + OutPort#(Flit) rightOutPort <- mkOutPort; + InPort#(Flit) topInPort <- mkInPort; + OutPort#(Flit) topOutPort <- mkOutPort; + InPort#(Flit) bottomInPort <- mkInPort; + OutPort#(Flit) bottomOutPort <- mkOutPort; + InPort#(Flit) fromMailboxPort <- mkInPort; + OutPort#(Flit) toMailboxPort <- mkOutPort; + + // Routing function + function Route route(NetAddr a); + if (a.addr.host.valid) return a.addr.host.value == 0 ? Left : Right; + else if (a.addr.board.x < b.x) return Left; + else if (a.addr.board.x > b.x) return Right; + else if (a.addr.board.y < b.y) return Down; + else if (a.addr.board.y > b.y) return Up; + else return Mailbox; + endfunction + + // Route to the mailbox + mkRouterMux( + route, + Mailbox, + toMailboxPort, + vector(FromLeft, FromRight, FromTop, FromBottom), + vector(leftInPort, rightInPort, topInPort, bottomInPort) + ); + + // Route left + mkRouterMux( + route, + Left, + leftOutPort, + vector(FromRight, FromTop, FromBottom, FromMailbox), + vector(rightInPort, topInPort, bottomInPort, fromMailboxPort) + ); + + // Route right + mkRouterMux( + route, + Right, + rightOutPort, + vector(FromLeft, FromTop, FromBottom, FromMailbox), + vector(leftInPort, topInPort, bottomInPort, fromMailboxPort) + ); + + // Route up + mkRouterMux( + route, + Up, + topOutPort, + vector(FromLeft, FromRight, FromBottom, FromMailbox), + vector(leftInPort, rightInPort, bottomInPort, fromMailboxPort) + ); + + // Route down + mkRouterMux( + route, + Down, + bottomOutPort, + vector(FromLeft, FromRight, FromTop, FromMailbox), + vector(leftInPort, rightInPort, topInPort, fromMailboxPort) + ); + + method Action setBoardId(BoardId id); + b <= id; + endmethod + + // Interface + interface In leftIn = leftInPort.in; + interface Out leftOut = leftOutPort.out; + interface 
In rightIn = rightInPort.in; + interface Out rightOut = rightOutPort.out; + interface In topIn = topInPort.in; + interface Out topOut = topOutPort.out; + interface In bottomIn = bottomInPort.in; + interface Out bottomOut = bottomOutPort.out; + interface In fromMailbox = fromMailboxPort.in; + interface Out toMailbox = toMailboxPort.out; +endmodule + // ============================================================================= // Flit-sized reliable links // ============================================================================= @@ -362,79 +457,59 @@ module mkMailboxMesh#( routers[y+1][x].bottomOut, routers[y][x].topIn); end - // Connect north links - // ------------------- - - // Extract mesh top inputs and outputs - List#(In#(Flit)) topInList = Nil; - List#(Out#(Flit)) topOutList = Nil; - for (Integer x = `MailboxMeshXLen-1; x >= 0; x=x-1) begin - topOutList = Cons(routers[`MailboxMeshYLen-1][x].topOut, topOutList); - topInList = Cons(routers[`MailboxMeshYLen-1][x].topIn, topInList); - end + // Board router + // ------------ - // Connect the outgoing links - function In#(Flit) getFlitIn(BoardLink link) = link.flitIn; - reduceConnect(mkFlitMerger, - topOutList, List::map(getFlitIn, toList(northLink))); - - // Connect the incoming links - function Out#(Flit) getFlitOut(BoardLink link) = link.flitOut; - expandConnect(List::map(getFlitOut, toList(northLink)), topInList); + // For routing messages between boards + MeshRouter boardRouter <- mkBoardRouter; - // Connect south links - // ------------------- + // Set board id for board router + rule setBoardRouterId; + boardRouter.setBoardId(boardId); + endrule - // Extract mesh bottom inputs and outputs - List#(In#(Flit)) botInList = Nil; + // Connect board router to north link + connectUsing(mkUGShiftQueue1(QueueOptFmax), + boardRouter.topOut, northLink[0].flitIn); + connectUsing(mkUGShiftQueue1(QueueOptFmax), + northLink[0].flitOut, boardRouter.topIn); + + // Connect board router to south link + 
connectUsing(mkUGShiftQueue1(QueueOptFmax), + boardRouter.bottomOut, southLink[0].flitIn); + connectUsing(mkUGShiftQueue1(QueueOptFmax), + southLink[0].flitOut, boardRouter.bottomIn); + + // Connect board router to east link + connectUsing(mkUGShiftQueue1(QueueOptFmax), + boardRouter.rightOut, eastLink[0].flitIn); + connectUsing(mkUGShiftQueue1(QueueOptFmax), + eastLink[0].flitOut, boardRouter.rightIn); + + // Connect board router to west link + connectUsing(mkUGShiftQueue1(QueueOptFmax), + boardRouter.leftOut, westLink[0].flitIn); + connectUsing(mkUGShiftQueue1(QueueOptFmax), + westLink[0].flitOut, boardRouter.leftIn); + + // Connect mailbox mesh south rim to board router + function List#(t) single(t elem) = List::cons(elem, Nil); List#(Out#(Flit)) botOutList = Nil; - for (Integer x = `MailboxMeshXLen-1; x >= 0; x=x-1) begin + for (Integer x = `MailboxMeshXLen-1; x >= 0; x=x-1) botOutList = Cons(routers[0][x].bottomOut, botOutList); - botInList = Cons(routers[0][x].bottomIn, botInList); - end - - // Connect the outgoing links - reduceConnect(mkFlitMerger, botOutList, - List::map(getFlitIn, toList(southLink))); - - // Connect the incoming links - expandConnect(List::map(getFlitOut, toList(southLink)), botInList); - - // Connect east links - // ------------------ - - // Extract mesh right inputs and outputs - List#(In#(Flit)) rightInList = Nil; - List#(Out#(Flit)) rightOutList = Nil; - for (Integer y = `MailboxMeshYLen-1; y >= 0; y=y-1) begin - rightOutList = Cons(routers[y][`MailboxMeshXLen-1].rightOut, rightOutList); - rightInList = Cons(routers[y][`MailboxMeshXLen-1].rightIn, rightInList); - end - - // Connect the outgoing links - reduceConnect(mkFlitMerger, - rightOutList, List::map(getFlitIn, toList(eastLink))); - - // Connect the incoming links - expandConnect(List::map(getFlitOut, toList(eastLink)), rightInList); - - // Connect west links - // ------------------ - - // Extract mesh right inputs and outputs - List#(In#(Flit)) leftInList = Nil; - 
List#(Out#(Flit)) leftOutList = Nil; - for (Integer y = `MailboxMeshYLen-1; y >= 0; y=y-1) begin - leftOutList = Cons(routers[y][0].leftOut, leftOutList); - leftInList = Cons(routers[y][0].leftIn, leftInList); - end - - // Connect the outgoing links - reduceConnect(mkFlitMerger, - leftOutList, List::map(getFlitIn, toList(westLink))); - - // Connect the incoming links - expandConnect(List::map(getFlitOut, toList(westLink)), leftInList); + function In#(Flit) getFlitIn(BoardLink link) = link.flitIn; + reduceConnect(mkFlitMerger, botOutList, single(boardRouter.fromMailbox)); + + // Connect board router to mailbox mesh south rim + function In#(Flit) getBottomIn(MeshRouter r) = r.bottomIn; + Vector#(`MailboxMeshXLen, In#(Flit)) southRimInPorts = + map(getBottomIn, routers[0]); + function Bit#(`MailboxMeshXBits) flitGetX(Flit flit) = + flit.dest.addr.mbox.x; + let southRimDistributor <- mkResponseDistributor(flitGetX, + mkUGShiftQueue1(QueueOptFmax), southRimInPorts); + connectUsing(mkUGShiftQueue1(QueueOptFmax), boardRouter.toMailbox, + southRimDistributor); // Detect inter-board activity // --------------------------- From af7228266b44ab9b0b0308ea64b25a861a092f5c Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Wed, 15 Jan 2020 10:48:07 +0000 Subject: [PATCH 02/78] New NoC diagram --- README.md | 4 +-- doc/figures/fpga.png | Bin 17842 -> 17084 bytes doc/figures/fpga.tex | 84 ++++++++++++++++--------------------------- 3 files changed, 33 insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index 00f6a84b..01e744a0 100644 --- a/README.md +++ b/README.md @@ -136,8 +136,8 @@ accelerators](doc/custom) in tiles. Each FPGA contains two *Tinsel Slices*, with each slice typically comprising eight tiles connected to one 4GB DDR3 DIMM and two 8MB QDRII+ SRAMs. All tiles are connected together via a routers to form -a 2D NoC. At the edges of the NoC are the inter-FPGA reliable -links. +a 2D NoC. 
The NoC is connected to the inter-FPGA links using a +per-board router. diff --git a/doc/figures/fpga.png b/doc/figures/fpga.png index f4d60fbbef0d5c072ad8fe7a4dd52857311c251e..c05a5e99609a3857048baae3a4ab2a67d6efd9b7 100644 GIT binary patch literal 17084 zcmb`vb9iN6x9_=QR-9B-tcq=`V%w^?V%s(1y)?%h<2yfNj2$K~EA|N<8y*A#eUcCtRs?~-z5x3tu;9So`|fcs;2)f=xP}7= zgwFZTFIY^EDi{bv2$B#MRCZlC({a&OQhVWNPel|J@~!IqMXLp7mldi~O3ZOpzOYc@ zRbC&hr=_M2>awahP^EMo*eF3I>ls0S%3)2}vQJ6&(l;bs|n7-$v6_{2ELs{sPgPCpT}y z`A10MPf#qGBE|q6Or403UyoU|Fj;&OolePPv-{o24=n+QgE2c>Ti?pJw>NY&G->&v z1ZuS%+xBh|d;6N&+M7$|1(cO?cQG_r$ol&F(Ik2pkmY*I-9Qvh??T+n>@2(G(iX5C z9yZ!;_edqtC1=GRPNc6^8^R!kNXg2Q#fdvPIrS=OeE->a{W<%!u&=N0_V#vaN}l#u zPnE)lDNcN?)92%Sy;TL=E*@=WeEjV3YF8?eX1Zc?JFAkYS}P5UAR;`x#eQGX$jC@n zH)Dq^bFxsexVI$3uyz?2DMXrVb$OYUmGyXjk$@#IB*fI%`03?k*Ontp*dRp+yOQx} z(K@6|WTnl`L0lXbq(AbTax{?^r-3!Cvhw2oY?TO>I8NN-{%>_{Eps;(C9tuwVuH5L z&odGegZm9a$~%HbtmnDWlBJPgVdY^N&F3gY`q|bn(DCsr9-Yy`AgPDva4DIX6gdYB ztJQmFH~)vq33?PCFi)f7Z*lGd+B2c4^pp z{Ccb6%;n>Pl9H2!@>N#CYja%$#Y*vTL)y%4eDV4m;yhtuxBUb)q}R*ShvYNp7dO2! 
z$?HG(yZR&ZdU~z2G&I_EX4A-Gm0yP)@0|0fn`v9U=|x^gk*&fR?DP~V;ftl;?=wQ6 z7tb{{3+wj5V6kK6Np;$oKU2;s+&?v%y8VeG`MT?z30cc=HCdlu`**IN#4BI!cn z znPs8ZSi;yl66!ofs}p|?H%ghA9pWf~!fJEu`jPF1o~CSV-y&=~H23)n?2-mLw2yH^ zhdr9@m*8(;ZIJ>Cjxc-T3(B63gy}+k@ zIN~v%3@ra#SYgFiwUCg-o|{Qs+vns(Iz(RAgMs>>n7vmq7#XEyIw|<(oSkaOQ$;km z7pdn~M~wEh+19-#AjS05P6|Z$oEPX5xQVP=n*2nNc6yUe6v>;@+Six99a-^qN>#2s z$l7l9U5wq+xUO8OsWOAQ#xr;r)7-8EYb8@SS976tMOTwvw@#VRmU-kXhZQGie|yP) z>y8SsqTw1|NO8S(hwwZ!5!Cvf8MP8_d#v2J=)6BEML;k^;7ZWW%9V0 zB;w41`wS88nke&@P);;x@N=ZSa8bW!{!u?=s!8`g%JN+>P;9yb3#%))(#Xl9W?4hR zYA%Hz5e@vAy`=GV4OfOxTWuq@o~jp-K?`izJJzjum}}&xsIEFPa1bFZ#q(6SMKZ4T zs7=)68Q#Y?R0dn$cP2187?5ux^WD?o_i!&J6wuU-{K#r@NzvA>|JQ&ut-oRuhl@!L zn#7{7xw&>k4fL4^L}o3p9E#6p1I_4ZCQ(23lU$}~Y$M28W5bp-4An1Lg91|tE+9cm zA7Vl6pX?O;NBbBK{#HjrVd5Q>g4CRtX&)q4bpBw?=HhK}(6YszU&!JR7@V5hY#$3J zV|3$g3*m78QrM(g^QyC4=edyB9?RoUjS*)XlFF?cDsH;B%hsK+`p1>B_cH@F=iL^+ z5@%fkH9>Qm&qgP*;CQE1mr`}!Kqi>4*sQ+NDY9jUf*P~!RGN&Mi88X*Gz1gJH`6My zngD}?64sm$L41m8IvexNJbrqJL*%f2(=QjPL*44?LJJe4s{uDF)|RK>CuMxJy%0)O z&Q>!|^|K&2X(G0dlgVfgoDFliMtJ7P2~JZD^q}LNMI6icID!b1jxZP~^qYNs4hZC^ zoooPaTf5j~CoV?s4-?IzVvr$=A-s1xlNJ+gKe8_n5BN)IpPPu9a-4a`JBPeJaqeBO z6Cf@$oAqbN-s&qv{{F|og!zu8)mP#~$ucjz$|>TGM&?{eY)j{+RCq^Y5Dd)iEkpE$ z6!V!Vw}XNB^icI9Est4x$SYrn6L7E~yc*Wj?ddYR*+{%Zo|eB1JF0ziJ8CC@164U! 
z6rM-WJ5{dk^#188(--Lno;uLdORf;LfeznK*#R5yrg-t*%UyOs+oB=@_)_9#C9+^iMk>k(PZQ%<6(jW7AI@MWGbKpH z>Z2Xu#a=CvF=|hSlH(rCaKGN9s~=;J3??vg+KV&meT+u5ys08T7(<5^I=;`)S!wUp z+QrE!dB{1%`!3Q>v%>&GJf2}`*;-eK-l;`EoG`4_Ox$~KwzP_po{Xic%){USJenN@ z3kfw9pYJF6rP;6oJqbg`SRla4ndUqTtvOCRoXKYi;K4z`&&B=64n`44wtydnve?=4 zjN$FR2QJcnP9j;~TzC&nAZ|&{mpyv{EkGhf9*BPzueCP+!)zKI5CGwMc5#sz8>{ax zRH5CB7IOND?pJ(#dU|?DVGSTO=DBC&XUbB){P%=dfoQF@zvGQCvE9e}TdT{(IgWE` zbgZ?t^-qz62cz=KAz?CJA`%@4*V1f8tze;2g7aVS~K4;FMr?+{w+V~wsm^kkK@3Nt( zsi{A$BCwg8Q-QkH*Li7aKSlH-X?cB0SO-_i)gT~D;r#R>Yopo`Uqfq87!HSwtmerb zChVVJWFC$2^=mQiZnZiMvwfadAv?PR_HQ3-ZH(_`jT|!R4Ldsq{gW>b7s#vK!D(qe z(Sn%pK~GOlWE6V=Fvu8-^Yin)sv5wtM?7fo{~C%P*i#;cdMYXjt*zW3eL@dJ53$}c z&v+U@UdmHIEIY&Z7Nr^?ARq+z`-4DabcNLl>gq>_hcI$lIK7T`D`IFb_;oOX@-8t( zfc6lNi%%h2^xVwXpb}9~QE>tu_6V+)@3GP7zA zz7bv1EUqH)9Zz|Jm%<5A#S=7#ux1>e9EyK+$@m&}d2f4r(^V11%r# zKxI{-ILF0{`~j4jcB$Jlf&=9TK=?#Za|6ES<{{--#*ks~`K*V{VOo+}El+BFz?raj ztt=z@_ctYNbYe%Yj>4R{7-eIth50oAQ#wni#$U!J(5GY~t5v^GJ!I1A%vjsJ-wl1+S z33wyg$YS9UTr=1*nBmPuG>yM4lxOFA<5&`~6>(3^`R7@#nhHuUWVmq*EK$BUZ)l_> zy3lrFf(YaNg4wdqpOB4=RKK#GjOazA3B{EKtukN2_s?Y&uZCqEBRL9TX+%3YfF%{W z=TIRbjHhnp*~ia*^Z#DZ(IwC|k;3puU3-@h@7v2jqesQ(K$3j!iwkEL?Q|m|O}IXt zxVJAX@%0c^iDcm48b<+-Q^2$6?8HcpEuzM`LanGVVR)q3vgIWWF1n*)PA0+fLaF}~i(OHwiT${rExD&QbKi}QVj^D7K zYvrEwNu|O_$6IkX1ttzDR+FUK95e5E~q13<_d3eOe;Pdg;psUDOzR$yI^be*;>0 z@)PmcxMC;JB?m)p6TidgNfYbW31b9+>g}W93;?}S)nR6oriraut!(yZ4ElN&YuhVZ_QvhWFtQ1y|m5_gejXxObe_hE(%-+LRKbXTuN9bYJ9Vy{0mCO%uGvv ztQ|B))Wb*L;a6L3h84cZ@&tY4dD3DG-)Fd%1R3JlX$tNBsOyy3tdF79p7z+pni=Bw z!|)0W(^-y>$t?eChp=7xTVU#2poL7o%jG!gtDt1m;=Ov+9tq=BF%03w|Mm9A^$&wJ(yV0%v ziE4w1Fc6LRyTqm-Aum%YD6dB&C5Ck{=&&x?E|_IGC4%rpVa;ufv&Sm z!{q-}HpY%Y^w2@&SBTpgCws=Rwe%erbmU64h7n}*!FI#Z)&PS8PxFsmVuHM;gC7Zh zU1xo!jXw8%GCO4Qfn5E@6?OUcJe6#yZc<3k7U6jvPXSqqx#YO5^AH?~v^*R3vKU?V z^Q6LZZ!M2fQwwFb}FNDz${1{4(Tgt+XbsYNK3)Z&KjZ=XqokFxk zrs|8?!Iy2w5Pn{vm8ejI9ku9N3v+!A0|HS<2?caB z?iAL4hz|y1G2-}P0L`epYPY-JRncZ~Ivj9taC9vKaT^W}4v3H=3DiG&&_DwJzayWN 
zM=vl?SXkKc@o{!`w&qOdEzL3rL>$-T^YOmk>hf^bnex9D9u}W5F`f2D5{8C`+S+*f zW%x4X@)bi=cn3{tIdPGar5kE%EzHel_{`XpVUdw1M@Ft69tO8<(Zc#zf>?A7rz+Qx zG$QKUZVqB&Q9xb4DU}jwwb(2z{yTc4#D$b885iYplCDT0SCo>9O^LeOM7FnMHkI}5 z_vk+5Dlz6FJHkhc#$%H1|3;`NIsb=Hse7Hv>+p^+fIY)1ES^=>enpO3czbyWIn)0x zD;vF|Nr7>2c$k_Mi;T}BudSx0hOA}$DSJKV1|WKbzJ3{*CJft@VZMy4CV%||sN!+P zT8u4ZzSQv+3dPk0kTd?(sNEQ~w4Ck#NdP*Qszur?w#2{{uf*kGRaeBNcKQ_o=dS#J zD3iy_ZMU1xxXsEgwOLlauXzA)nw!3duW-u?q@kga78h@_-S!uTe)H^jcK2QdEr$XV z51r&p+E&Z4)4A1ZwIMV#v{zz>|VLAT=~%I1cv3 z)p?GMGQQO~-e5+(Hg*h&C2qS;^cWMo-(qNRI!HBh`_QwJkeM{?+(hUZ{!0Z>;Xs8( zfbWhP?hD1uMPk$*a`Y^~<+e-Zg`JO7Wvnh_5`5jg0X_+co#eQVNYj>7Z<^z+FhIwU zBt_MSmbObd78b2**Uijr%s!xa<=GyXkN`wUM#&{;TAj3mQ|Sjllz3*LK)?bR5M7Sd z_)Ms$MpED)VD>NR$vbX3eA%F7u#az;Ydha{{l zqX6)6XF_e=uSey7*E8GfOT<|FpuLR#ls5>?$Ts%Cwt15~Py9b2q(Dwl z05Wu1h5sqC_NMuN$B!U1iTn1$ko8KW()aA|opb%c&RcIsye=RgWL_sMC;bcYe2iHk zKx==8(R-6W6R?!#O~)a;M^T;@3Qyp<{^YChYT8dSZpl;rk%nwcpe$j#`vWIJwx*se zqiqAU*{m^DaGzz>0O9uEW=LEme+!+G6OACF@fP=;s7BnQ+eKM*aCb_6l~~{~enCMO zukxUXNzSN9}F(C0~NYKCo%`zQX(Dr}Jobg6D z8v!T*)qxZdat&_?GYWekXkUOT!IXFB_retj1d!RTQ3M-PNew)`i1`M44~%aNfg;&D zvc=N6s)I_QF88Xtqb2mh(x3gTF#Pc;QDdsJQU5FmgFujAA_SUtwDAEJPvxi`?Yo_% z;Lu1@dSqSuAkd|A0pEwGcy%=*ILIAe(v4dQ6Vp?h!7$b%;%(^Z_1RulsxQ>Xk%g}B ze&2xzQbr9~OmVdd0yJa8K9Rn~Vu48A*%n~pr^T`yB;7ZG-RoaTNxJsxN)GA*9N1s0 z&ktBD+s`R<*A3flxfKrgD0rN&~D=?|7*#o$X#!2V~CGKT~@?lm`W#0@N=&6 z{_V_wHq19pa^Il1O^jyMW#_m9U2dD7QV&6-rky-v=O4VJAXUpZGdiM;J3K1!>Bba6 z?RKLvh=mF1ylsQdytBOS=(zVn3S)6ut_k=7uq%vS**)nx$i--BSL|i=Hhp@L*i=TT zv1W|Cf>q#Opb49d1;82rKSl!s(gDx1%EV?Z&q2;9M)N;y z5nzOX+T{8!XCF;2itTVs?+bf`OuzVd7oT;A!S_`M^s|LfAoqotd#o6Ph- z5J|LfB0wY=f7e$(7$?f`{0HB{LH*}h*Me5-jL%`d^B!~7?vW5R;L|??@ob)JHH`FC zy9}}}7DEWNHi6uTPY3ume1CX2e;p;bM&>_2u6c~$4-UYUE*;q>KoBXM!K^Ep-Y24O zph*zyO*N8%Rzai-De9R0T{OB=slpaY>CdKB)*Bi7SNAl? 
z0do0Y$8sGa!vPfK`Nd(EI1?#j$rlPfz##<<3xYP~FSs6#vwJiqrfI=ly5=kt1jnO9 zIk37kh6wZhHm%n)zZT3YuWDi-QB<=v0@a-(fE+6P@4V9gDGZPPiu2$j(~>Y#!7J;1 zS3xStb$&x@8{WU(&VthkJr8c97!wjlr-V05g1}zM7Wly;hAouhq|}?n>41yt{<#y# zospB0{>h4^R;%dSb`OZy+cPpXJ=`uXBdXK7i)Q)Fa!M8qWbi)WumKwK@#%?HyXkT! zUkXr>%?^jof0z7vw*f*WL8_>uvkoK=r^2yK!UJoY{iYUXq$N zz>(@+mA8{RH69nodpcR(luQ3yUfT9I0+&#iC^!(0i;Ej!#ER#j89QfT3Q-X{tu;_b_2NHKSR{R z;ycR+>N&ij{QWuFk<0V(vL^^3pUG_k-TEi(_)?z(u;a@zkyO5-imZ?@UTb zGN{;VK-?3?r>)mnAR(0~M%J-rgDyPk9!nV!_4|4(Me*xgjvYRG-~*2$ab)kAb7u0e zG^+15MQcQ4!8fuzK}Yn)6SA}fDdS6bgE$m{r<&!z1Y5b;-XYI6;Jg-ZR{w>o-0#tA z?eYfL(q-!mxMP1HLS_Ja7fBVWhhNjEMm508fi~BgFq-s`ts<$5} zwN3rcS^#(vMF)lD-fLL^pzD?$DAw~uVGOp=WttNg&Dl2Sht5cx{ac#Zbt8fV_L)0` zsTSYV7f#5Gata1nb=-%OwMOo2^O}}42(_{iIrEsBS9S?eS#)@DC1wR<#qG+F_x45w z2innCqYBeMF|vhbm$m&DJ$918U#aCE`ah-i?h)IB7+KA@V9^9;@s0v%YH&{v5OA?l z*t;e6$qJ<)L!`a@{Cv|{e_X1WoKlm6h5S%Q>l%eMl5r7;4&)eYdHOHHsu=Y@h1EI7 z#iRV{{BVXPmhqW(rMwMwe1gNW;{zu;u9r@6D%R7IMyK+5VAC=B73*4SkrmwNPtU@ z0*_!u3keWI!1`F~D0p#p;Nuqm&85qbIKLR}&lNo*H*S zq5oS#m35f?yOCK%p&KBa(9MlYF=>;P^oMoAk&j|xCc@HOqy-VaBt-qy)~Y%xth7)6 zUkH^Vj`~+r%{^u9mnM1o!v-V%?Y7jNuLS1Y{2LNy;_UO_rVG)5S;r>|C@Dimu{-LE z!pYYDmrnU$S=c9awjvi^TH++TTN9^eIrpm~XW^aRk6vLrTrY6!FK~+~;TKz_0X>jc!$9s7s5vm);7?DXcGs~u!c3n9lV;@$AL zC!PBf0XtLb5>(*B$o?z8MglHUB`t@o=r$5alX3zMib#?%_Y@_rOlxIEt1ZGab>0a| zuz_RY)<5MNR8XOY=G)YruF>8O9S?ZW2jT5-xzt)ohp%)lq`#r5d^!;~!CJg)UslNo z$W@Cgt-k|%m}Psx zamHhoze#7&?oe{!Rj9VvuW&ps#>bQq=)C(GI9bMfljxaV@54TN@``MG=}$WP;t4NB zHP=!K^!Yp3LqeZM1~qP0Q|l9U=oON4zSwGXS|uoJOydozk=Nr&`i(y|d_+87BN#7> zs1Un_$WD6SBf8Nn)pHtJKAZF5`srIO&Z;6~uo4D#qtF=fdcsB|N%g<&6*K2C9as8X zaA!Ip&ezmbfPs{9584x}L`4=k+S|wV9g~pFVdkwO@-c6MYM5_E^ck_BI9A{uO;9) zKDZC(eh)RfBeTu1 zB;}&&bS^pV@p=(FnHnh59jfdxijsqY{umxyQ{7vfC*1S=NBATl1Xer0Y z`zLr^I4~7A)ymANF6w;b71l@Nze~zl(Eb$xfrG5fi#(4X4?zdDL0L#bx1sSKp+v7<%G$M zwkZd{DkWm2sxR6eYBy&)ohtSg+icjfU33XuyaccT0l#~oVFQAyI1s~5TJPm=?ly6l zI~#AKs*%D!4k21K6VMFzYQPk@+3V(y7&W1?s=F(fRykbYxVRpzWE;-Wikph43IW~4 
zpreICp{o2`d#`DDgbli^lqhAcm1A?j!Ek})Ffb5?2W3^av$fs^OFo-@NHiqggM9ff z-ZmPuXolH<_`+%2=t#djUc9-Nuqn~$QPf^)Y;|4X@DQU&OS6vW`D}HY$MiGHLwJ+F z{q5uc>dNM{LMOZF#eQRXscRTf>IFDb*Xs`v1{|OFS(jPoV@DXoGHUJ{Idn@W1LFFU zu0tt>gXI*pjr4NXFTOr%cSUtHFu9hGN6pPOb1mxI8cp0YPHGHD4X8+~b{A;Sl=}n5 z_-Uqnmb+NP#!D|1VYGg~WxE$RNFa+F9G1BCQR_Lz&F>ilB&%BhHH!ZI@4A?8V>L{y zcRa2Ny(;d?)$?yB4x0DJyNe6{jux(;9{aU3Cf0^kgA@TyysYLMAvX_SBo?tCC50aU z!Vy(cZDL5c(;dETpKsmVN75SYAiU6*6-OTDVnJ4ud`$4>wr9m=irH7+Knm!tDuRJ1 zO6koH*qkp{k#*O!GpE`@P)5h>o_Mt~u97COs*jl8-a{`j2hfXAuXg+0x>id3j| z{Cq5-lYJhx3tqTJ(F6jSU6@;6m{V5Lcj$Ot&5dUwE;v1}eCo*P zf`su3mg5gX#Pk8nQ_IAF&jq!iuD`_Ba+`)v&wF++!?9w-t5CurA#oLGP>Xqr%1_YX z7``APMW_|$K=xFWA~P>cO1#8fx-}j_jr>T~zIQoK9^S6?X}fg0(8umA&o0LsSCT4Qdx~a@c|3ojNrQP88aOb;3i6dT zF4*K{A#qng;jX!$F*BJMPMBrabG4QJk!IhlO}0oWiyuo8sJa-!eC*H=1`rpt9o1l?t`e?_pAN3 zS`Y|4<5qG{o37Hz@^TJAV-TLIcI|Jwn)K$}l;&GzEdgV>`e;GNWKQdQx<^Z^*p~Ul z{CY39`<+0i-l>2~@)xd~)!IzQ%T<+;~_&l5>>b}l5`)6p`p3mb+?sgB4-=?u5V z5fA(OXYK)eoysQ7tJMCLzVlpQtlGtIZF{4p>LWTBCg-C0dMLk|>-llM2|wx%cT78- z+M|j)D;0ly-G%tmT`}2B;4QI9$J;jcr%?aR2${{fbiz1M{l6M($0Ju4}_te?)cMY<3(*ph)bKKOHrf32p=nFj?2#+%%Ba*srqm1Z6erb zIC`6!`HvbK^1m=@AJ^VUQf3CBr3*A~XWDco-VCL;_I{mcKOUa#L`ag{8r!-OqFjf% zD%!A?u^@hn9}Oh-BMuyLx94Rf&c6tiv~j5_#;_KcFvWYhPp_Tia0*WX3ZlPO&`BdV zUG@FA;*%9LCuxjWCQp*p_;5KtX1G{tyXNklii0jXg}rD>j26{(fdOST72D;K<%9}T|3*6z1D|MtHTO}Z^uzT9h+Xf!hghz z_cw;qLsyev`b6EQ!t$4Lyh%M_3kbOV4id_t#E-oEy2_sTNDh-f_}D`ymq>sM0nm7L za4<1te79CzTWjr0mj2Izpc>bW=Z|!U=O{`XDr|z{IIAxL@f9k{WmECt6%HfT%bL|k zb-DnE+`Re%o2#%$Plbc30|5eG>&4WC1U0>1ro}kK+JA^K_PH<~DhoYYUfukpfIZ5y z_&Z48Q)Z}8GB)fC-!#c_R~fAHSaN5T#myXvf98oDNP0nu>c-%GO4FlI{}tg1yW|L{nV9yPV7oqmuUxbx5LGN{B~nb?T2WE@}qu z9vnA$&^T4eNHi`ZQb2c^r&C0ul7NcRPYY9YP}jtYoxu@}3k-pnR--Imh%Y&9@~dv3 z9B!O|w{raAlqzeFv60EW1l6#;>fasx7z&&cTHLfEQ9;ol!p?*7r?aSuo`q6$z*F$fLP9{M_;(Hmh_QDyd;e-q-K}@|%Iqp#Ck-kooi=aJn9{a z$$J_eXFZQTF$A3J3nTA8dvi^!0~(dKnqNZt6|d@wo^s!I&dSm$uo7dQZ*Tsws#=v6 z>)gkaF)SIWz%q(kI>8WWI#z)=6kjtVi!*9`<*ofMRptj`7Vq4PDLqFRwvM>zfj=Y~ 
zoPW6US$XC;Yl2YQQtP~=^>a>aV}q?QAOaJ7w8e(U{r0k)af0F;KY^-9R~_M+T4D@R zn<{`>AgV-45(c8xZX5O{qkVyjB%rnxmIIuJziYqoSf$2xTvI#Uuus9nuWw$IeW;x>3<80#^!IySwabW|Z$YSK2v4>N= zu!u&V#$)*K40$4qBdUy&8sDe6sx`ar0-qE&;rwpF$Ba(_S=H5$)-1#3Z+?he`$G%K zJjvJ$JGPu5hwNLU@{mo-HNs!bL~q8lv`+0`n>OtZHMvJEgE$_47%7cEbbjs=gw$ts zzc4)qh2&;Z{?0o-)ojjkbRu;LK?MaNWofM(V~}8Onl-zPvAN&(BJV=h_w$z5r`!{& z5WJDvo^pbG>sm{%7m<&)mB2u4;o2ZUmQ00Dhn+#pq^*yw6YIZP*${{F(&f&h*W5@R z+NZ)Z7nlqWh15OIj#+-;X${KRR^JKS71;WE&Ua*+^?t4q7`jN9w77 zKu4aQYGfx1bubbI82h8f*e~R_muh+bNUPHl4UCj**xg?Mt4&W-O{~YDO6#*%1Xd;n5nIn}4 z=^8v3=!*igdn1t`y{yS8H1USP;VFMEV`u@k1F(aBg+*V-!X-O&?;J;Y`!g3ypHZC# zdbO`yY!t(KC*C0G)ndjx;g^c&CY*-U=9h&%b{gC^LYhRiI2h2D8fGjmkEIa#E^RVW zd}6}}=kRzZEGRatg%(!~<*O`Y3rXB|U+ACUT*fA-q;EP_3*A?)Qd(^ZI%3nTaG4}U zUVFb@`GWO-?6Zr`O%r?G&pExTmY8y6NM8IbS7`0hJ`1Elgb!n5bWDsd=f)SCvhcZ; zsytmpU!+z=dsm`F3N)_`TIEQKNDwEabjG$2+wCWCd0zOgPikzm$w>T(cQ%^X4y3~4U zy|2Ad$ci>PiM@w~oDXU%@vo>6+`sOcwfw2OO+mm6w8nvs)~%yT5YPT8!sY@>fd8G4 zQ5VRXrHmh*D)M3=N|2lu`of}L%|$EB)^A=S&(E(Shx^4Ty@03JQWU$kws2C{?Esjlk@;g$U^d0~OtUYlp#k^yE7cp$M*} zAD%-cJ~0Q+nYYE|87yw$Jo@AoQzCV zRP^LUu~_-}Xgas6OMo`U%)OHGJ1N?yIuWMsEFc}^l<9S!OqanB+X%GReMhp_FYQdmnH8t&(cm4N;|x0`YH!6+P;yOa5+n?vdW!_4Vslpcl`=((*60si^1xC?5)>uB8=&&3dv_t$W>^>HYlk zYNuawMlP9YALux|yDJ%liNxc+pUxHA-`|IBV<9IeXJo7{E*|!MNMV|D_w$Z6^1l%Q zs=z;va$}yY>glwbfZJAAS9f)H2M4LsU;&k5b#=Im27N%s;dby)PfySEbevFjtT-ty z(&5QT3-GwW1_6WA6;r8EO9o^(ffof7mVrP#_#X>eHpNy1_V)G#a+wuXA^~86C^QU_ znAECiJZ_3o?$}W#riBN>d`!t>2GxijKaGs`+C3iro}GPitz7Q@tH!3Tp+T8Lm@GZ8 zd$l#tSXd}FjdHQs*?AGLzP>)0!K)Ol`Q+JA5Ud3SbXv+~^3_*WUEbZz{|*+)?h4=m zdNzS3$dMZ3QDwBTO#V)rODkhzR-27BgwJ$EpZXGV5whirh^;(4xRXjVeng9t{u(yg z2h}t*U`+^G4VxifoSvS}J2$;NV+M4~JHkAAc4*TjjqZc{$sXF>JiP`7LxX&U5{gK^ zBJ}t7Gn+QnuG1qUBY(~oMB#BSjEqq6^6Jd40#C~Mb0!aX zqwg0!AE^vKgn-Ti5096V`BG+PW_sSgr(flQj4eSeRMKuU!V7@K?1;bHH?k- zSL!WPQaWx9CkgnyUnluLv|F96BqSs@)iQx%$X;ZA&)cKv=etvzwWjR$-#}OB;n|sv zmKFj?TuSQZ>S}mshyzG%hoO_}dYsjFR6IT%W{Qc5dSCT_2KiQYzCQz2eRp^FkG*!D 
zFhKoz==kmH>kG^;~Y_!{Kb~v3bjA!!uELP|!C@731G{Sp7Zii%KWB@M*7>VN7 z;_B)vz)RRSr}b`c*){>qxxEv!YincrMVyEz5&eo3^izM>*x7sins*4xLwYH z8v@EEzmbxzwYtbfI4_iGXxdO`$N&u)B9ZS!TsGZsOBIiFKlbWeUD$ML_$OiBs2pI11*e{nR#|_5aylO z)$%**rM%{94PeFFF4H{##r6G*~zo7%U$5mXBn(zTJu0YjW~tEf*-GfL%Fe!ih^uOG`;n)6uE^ znFb0ufclGN%X{vNwp`l;V5S071}#iD$O@0&+jDmyDy2q696=BTE8?Gs0GJOLDEwk3 zP>$~j=+UuUsS6V(F4ybSm69ky>;}51u_E$^IyXPxLE%?s=nTxv*bheI0q<;NXvkOV z7;t?wIxV*w?H=T^XImBt#HRA0-PrnEaF&@N3m{6coM?g{O;^)W6bsJygFD zfV%vl5Z>Q#tL%DxfO8<@{W)ESbTMG4s#PU7N2z*|H^NB=RHD4BoJ z`O#5Z>mVsPLf`qSsH|)R9T%r7h9yOcCi9*5yFU;xE;?Q++S=9tFS)Yf{&2nym=a(h z3=9mkwB>nufrGpK0|P6os{~w*;-qK_VP3&@0LfDQ=X(JtF!=fTeSAKEkj!D!>G^aG z+!Yjfx0;@pilQP=6%r5-fa#Y8{+V8v$MsU*%j>PybP|Y0uWxS&AJ3!oooT!t4HXs9 zlZPH29;LUQAP`_K6%`d)4VI{3!ivSW+r1$`lt|&QBaY~|cXDdC+Z_O0&-Kj>1UUHC zQcX+CGcer1eFL$xp`jt%7kvPz!3Ii>)fE(o;>3l%e+L5@4n#`G$1o7#kBa3G5@l!w(tRJ32bb$jAWoIPw%Ul$5|MRIAhlOe!Znes+H} zNk>O#bYw(QO3LTutTCP2g^QC@2dFFtn)?AKx3RI|?d{#s(V^%4WbESd0Nh5hbiuVg z``?i8q-Y@gclkj89%b9U+0M?6|MjxB-eM8xFf%hd0=yWVZrjqv20jMH@8sm$vDH5@0@%)+l`+LAqF4vm2HZ}eEy*g#aGEXGgUJ#t8SbYD) zswC>+(RvLIn=HMsv0-gt!TpDcg^f*BRkfw5=}RxumdW_aO3UVNgX*T%R-AKeO7_Uh zOdw|7OKWOr9Xqxp@99Y-xl{A@yKr|%CH^fUJ>q|qHJOyw< zh%=JAC@uM{FRJnxXbfv{Wo2d3<;5a?22=XR#`(@S%|zTr#!MPoS~xA#aQzIo#4@PK z+oo)QNAVNzZ9xy;1_4cIR0$sF2RBa4H5`0!5sp@B}n7$?(QK#a0~9zxVvkx5Zv9}-C=g#_ndpr z{l0V8%&avty%x|bf!>eQQ}wHV)!rcra^k4S1jrx|2vt%-R1pM%;so9wBSHh;RbnAY zzz-xl33W#h2#fpg7gSU?7zzY>50VrWR(4xB)^^vzGJWuklRUnzm}m&oa%D7VWZ$>35vc>Y54Y2vN2PqDY*jw~yrKT7PA;0ew zu2lEWEFAgV@4v%RF|W$MBqg_gy8PqVdU`2v`Y6C8R|bY*D4e!{C>ANxLcycp!{cGV z3S)flS$C?ijLcIkQuwG`q>Nv~tcNeMcJ79NfWXVkJB&j#k$?;f9U}UT1rY}Rr=JW} zjT(KPb}tNwhnJTHtJe=oC3?_EB>PP4uvp=vDEZqyY!v)4_z+R@cqu55g1r1F^UKJ| ziH$(!>E@>4bb~S&ywc(-|JvNq;VUEar%bcq_wV0qcp;)WQBm?DBG54sQ;b>)3ejC@84;`S}`?;R*Mhk#t_waxHF;t37`VdJM$&#}nRosq2Q0XH*cV$4*^c zT|(kV)kRZN6Q|7*KJaxkUsG3shli)zXy-q)&8X8l_w$xSz^k#U>O9?b#ooaI8uT+h z{{G=1RwD1E-Bt7*ZJl>>FXUrh^UmL zq7Rsf@skUk&%((FDSj&XjFBG!1k} 
zO!ga8A;Nx(i&JuNaBy?G+Zj%cSwtu1QC3sC^L@Sb1qL<{pZv*`ii+x>WAEwyvP89F z+pm6iOC#9jlIWSIzgd*Djs7l65}?j3qk1Qw@qIoe={OWA`#nUO>IZ1eN5KSDjv`Rc#j;lm&qpFE=||TU)nr zzBn`y?T?x%(Zxulq^3r^D~pT6Lg`7N8y+4`Pfu@~I%)Ik-z2j=#TpqIAwY&G(l}Td z;K^hwOwG>rF!Qyh3yaKvfXo11?=9BqHM^<$j&o7Us@=46=6A)=$*AuVXWmivM zTYdfY=f_(pkmJrD7PYLpy6c5HOF~W?SP-B4g=n?{EF9eaPryJQhIC(YydlNKzeh($ zySnseO4SKWRY)6bS2F~>A6}jxUmsUr<h~(?G;! z*SWe%T2xF+OS=M?bV^D}Mf<}+0J^|f#+9O$mePp1)rxTCoh8q zpP)cz)2h1hiHX6%!6nlN2O<6rJaeMbfc&TelrfeWN@JDQAUEQbm5xt=VQmeOyzb$x>atc9M5>2)KF z2?Uz0(Cu*ApU8ZupOuwH}hogItrdSL5|sWy0k zi_<9OmufX#tiHZ9yu{(rt7X}$8umwuX)s54{$0$pv^3A#6Bhh%!RLK>rFZwi?wlNQkY8{xLR9}@+iWod zF| za(;B&CU!iDDSe^DOy_f-NQ$?J3&j^QGb`Nr91TB@53J~8$LsUy5L1Vfi;IBcPI#38 zEVQs`GmXEH-}{drK8R)7<|C}5=-`F*KHeN_wz}DCX%S7Z{|F8BxI0@14g;fZd#lNC zimuPSaS|s-Py24!a?OOmG|}X~6XnAB)*u$B69`pr-n?P|;`Z|7PFb4ceF{XE*c-xH z1VeT>r!TumyKllk#MU1UGU_P$Bv#(m=fM ze!RiK#Z@etLc*fTD=1iPbys?i@jW;gu-OgZumMgnJ3Cv8@fHXt$H&LOUS4bs%20h` zWBw#>Qo3WvHmAKvY~f zmH~rjVkpE94i6U>7Xc?eZ&g2X%*eo$#Ra?C1po*LE`rZjN`{8H6&1F?VufH)11lgN zMY23M_rohJF)=YJYIMCPxTd@#HV_}_aix8^etGO29eF)m(Zmg2ou9MpydOHGfD>odzhR+o@qp>e|krw|up zc|cihZQ30f6Zgoh$v}uK?f(c-lzfk&p`no^LzOH;vXZjDvlFCF1CL_Xpvcz;hk{?Z z_ZC1*W;y^6YZQSudf-qf82DV~PfJlf>+p_0pkd%OgQXy>TxGo%;Lx6Re3T$yLr3rl zE)wTOQK;u&NtyLgk?8dB6C^14`T3vI>8H`@0H-cPrL-!*qSvX1j0g*zE>|$Q57^bS z25uPVGaL}elR^wSneDi^xvwp4l8e9=zyGiT7f-S&#I0Av44E>e<7st#n}U|Q218X! zW(wyo#Ow`UEje8Nk|Oh5Mt{X3vVrAY4UOtmi9|l$8Qf(A7bC$|O4Dg|7VStx0R zop>+F)WV{8$C!qO28a|CahuAtD!d5YKrH+!VFZ900I?2#^q4*57noc1H9dG0=-TYo zyMgc$txSN849U*cH1;7FqRB63f4^RR&W^!gG>AQFDNJ-@jCS6ZgB8)@` zk}Db7*v-9*a&vQCz?#IbD?>w;Ks3T*(&2O18bCZfproS8hd``X+xZ0rN$Z{`hEq8a zU}3H1Duv_7jE#^j2d=^NxRy7T2XHnvHdO9?Yy@SAz7*u-7Z(>mye31OkefD^l3$+5 zrWh|QEG#W89TboZAUd({#L4-W&)3%Uj5JeXDCleV%l2CeQEM4w<>c}e zi-6$*kO!cKX1&#To&>4+^yN&M=4hs%u7U!yu zF96M%OMO;o%hy93rFs8&#=u12Z}lSHm^9XW#HGI2=nea1I`aL!*_xAs)^VTR9u!d! 
zMY@?|_r~Qt;8-MFF8oVuDjV7|5g5$a4E>5NO**Es`YVRN0dcQH*`oQjHlUz{p<923+p(yHx{of%UfPEj| zf{+J4c)pU;`iz5_vtK(!TMopjXSZPf7p$qs&akh#s`fte8p99{qUtJxkvN>>S@A-eTFx(%WCKYFP_0mLvV(d0IaF*7?mlANOO7 zSxYSdemW3-S7}p3EjNt@8&iLM>+7f{rMo;_TzA?;taqeVaZB24v69=cmTi@c`Vf}< zyHl?_FY6Ko0V~lH3F^L}N;9LqYqxr3G~MRa9B@E{?H!yuOTv*yq0(Fid|B!!L+ODs z493pp+5DiC8ia(FXE|x>7WZ-b@`GoT>K^vZbrwU$hJ8!ruq(DUd9eHnKA3kSN7xym zh+)`gLPud-eg5OaK3Hi*2?`VJex{tI>!6d?LKl~2Uq)8YxH;SZgj3U6Dymo@0V>hq zKr8Fc7hVgf6t{nxg;#^Td-IgD?ek&D@w{?SGF zai*Xqg<8NdxIi}+aowd9I^N~RkIUU#u2uBC}R*I1BScY0S4pWkQGg}5jFeV+^`kWh)x&W6xN-CIIs1TBGRp9)xed@SxdDJt#7pz$T&OwEb| zEp5R<#wvkzb_Z?$fsy7K6&q4pdc zcEp&5kN98KtIS=MwvQ!^Yy8p@*3CVKQPo!VYz2X;P`-3Oyksy0uLo+lT70-l3e^U;Ru;ucv|LHzu25(PV-m@aN-rn7 zCg??Gl@+P7K}w0t&i5oad*dq^;Nm#>g;Y z2d!z!=9i?=YU&r~Y}>CY1$1#<4Zk{nqokx;+p0Qp1UI-92g{T{tZVT1plNW#eg1$I zi#az-dY(AEHTH2Vb&}8LkA;k&Qu+gr>pTith4w)odFF2$0udf>ezQsH=0={_+<+uV z-SUEy&11dQwH6a&ixN@^8T850@{Ya<1Xo%6QZIh~roUX^^+l?!%K(LTiUE8hi;KoJ z_M^`vf%x%#clfmC1IcNuxvk{`B+Nxj{nzI74h%|(>GN}WmjN~=NzgPhmPrl<8By|& z)0oZ&-@r#+uWCjkKhc|$q@w;dKS5e6a`eRJ^ z?<|q^SHjNzt?qPSO1`rpc?j49>>A4XQpY&rgQPuZb)m#mS_dzi~JL}g{?oh*? zl2A&*ukt=TW-*)htd%f=^oFWImo~NFaMuOcL_6OhRp~`AY)adedtus2ddW9>8xRm< zm%Yl3>l_zDg)D1HE)ACbW%Yr$Wa){M&f4(Td>rB@QYy(Y~lS56gGUinv$! 
zz=#gsFihfK1nUVbAwJleX}w)Afx@J(>VN}Xc?tU>pWeok}`RHIGY_ZC+Zm8lscGrzcDNjCfI91~7prS( zzTOQ25>ZS{43Ky^@puC$Rv?#NP1fSzOay7$@l@O^^{0p8%y0O3ESH*CBHYq*yuDp# zyV?#U_$1s8M&A)AF|n{FCnf+vCnGa6idMi3tI7Rx$Is6Xm5>9-3S;O?$!0Yt9odtn zHpb1`$_&GQ{1_M-dV_`*^j@M!nP6+cE-+FwHz%jMx|*DVqOJ^30LS;8equ-#OfD}i z(bCZc1OzxbIDGk8IhNl*9y1V2QXxU9_)3z}5gQkGe0213UEvEUC^((32^M;9Mu(76 z^}36$0UMDr8OTd9?Dm5rvEF9c<7BA`1_6!F_r+^Cg?+(!Y0RgnKI8-tdHN%XfqW5= zG$X^pYH;+6rVnt14PDkh44VVrjwfG$48Vie?Rf%7ki)}KKtcdEdU!OKl+fYFWGNN_ z=_TXGk06j%yH`_XB}QObTuaL%AmL(&-`0!@C>1GFaCjI2n=^(#3S@$8{m#zL6yO7c zs>ZxLQAdy07jJA#b@ls};~qyxNAc2DD{0%Rb09?;i|oQ@^4DkYP%>E=i71z;E(l9U zg)|%xW4;Tgf92d<(X$N-gPE5?k`g74aS2zL%Qah*)_`NJ9p~l}CPt-G$Z0mwDDqiV zrG7jEw6G{rK>_QTZva0f7yRUrkiJRAJB$>WKdcl9Mo*zrkGc;YnL%tuXkKMZlwEFh zlJTuUxRKmywBMYVoHR7p7+JOyaBKEhvs$k%;#GyqW!zGwz;w4@xy26malri6&i$=Q7*TvJ0>k)FtyPk9{K#NaNhrJ z8|Gg+e46P%rZ8QD@^qXN%J*{Qd${tJXY(GRQ1f?FwU0vQ8ZaMVxm-;vo0y+lI73;UBBWl-HpTQ58 z40JX>U23vWpS5tD=#Wczt;KY}D%v&AzJSX$&XrWkY&=o^;Rgfy6<7%#JtsJ{KFAQ{ znBrycorT0A4P<1Z@S5GrjD-FgrxI9dZEZKXoQ#*OFAz2vQ`cEFq`RoDKj3hc7&n0Q z)ta`h4z=#X@|_I1_D}_KLxD&V-~1=3cK9<@A;l14NsgKER9iB9%ac4>_Cd4_~s~7La&}^jDp6^Oeu9R0&`tg%3hm8!ANVdq~*!?*AC>uW=AuFG9QMm+k?yM9L zJR^!oJos_Bjf^ltVAs=+BzqLMv|KH$22LmX=zdxhE$#i(0Kf^ocV+j%$wk^@v2$`T zL-c#ldN%=E;mP~5>Bu+w5f1qZ65j*k#QAMww)k;mkR7F@Z@3+bP8@Vf&is#E8jLzq~jBI=fq^sZq6%md+qy#X`phnuj6g?s#PCm%jtU()Y=F% zR2D0t3MEDLm{QZ}$9oj+IT^;TY96VMCLZzYWGl_h2GqNS_8)h1tbTP0Arf%UwdYc& zAy9UWE@gX^-N~-)n6_+^KR`7T27bBR){fjK(;0ys9*W2O**@w*Ov?dGgQ_r5-vokLqW=xODBh` z_C*Uo{we|Qz(C{gVmbm3HY`h@syZ5zXGEkmzXsmAg*h$g{{pCb^T}Uu z^RR-ZnM>PW2;DzR>@tIwyTZ;xjhQiLZg0I}bO0gFJVDAbvO;h}9>r~bNtWc4@9BF) z<+s82OlBqQl;l*E_cqCMT-=mO?&{)zzX3$gwJahzNWg_Z`iP`(P z%!zJTST&*regkfH^E#}epS+WHc2-Xjxc&g$KP8c<{^_r<8)~g9+ z8Hc^buC)5>qMhk2i&ic0<_boi%bUSf6G?grFMhnbA6M-ve+UOgn~BQVi_4vdjk>Ln zj}yUwSZz7h?K=IxuyIiLfp6d%D6o=Ha$dq~3{?+lDEUxIa4I$O6^I8)d7{;?b1qAr z>ElaoOu=vep;Q*O5ss1BE^?=4u4q)9+5DQ~ueJB76N%|w3_Pg|5Z)+(+^>iZ5UFfs z!YKsJs)_eZK`(smWUAUFZmn&~_$i$xrwSzza+-#$KR9Y}p0@0RmPv}VAk5Z*-$jZ} 
zSXi;k&w~*Vp`lD=jPfUGV!PpGr-HmlK|+=1O+LUjfU6Z3rtB=Qsc*}PEkBb?O}rzb z<4+#Smm@N)$~gE-q@t#c9iM$?OVP=e)UgkXPWm{X#&Gx-DhHwlE1w306`>b{MA1NY zd01seARua~Gdsd1%1TS=X{Jd4C~K_lXYDLome;J*lb#!(wpXpl%etN_yT1N{hU;ab zQ9asxua){nLMC;s+S5524sO&~Daso||9r$ozREMj&XB2(@81g4fTgX!7TXy6=Rd) z0j~U4fBh@O_Ec6X28#IGPlE6KkcR5j^min}X;K{{J7w3t4k4tz$3*oKqNZbx!h;D~ zKRM6O&ledBgXjM6p?7l9(dZ&Z3R137C*|1e^X!3*`HqAnBRblT=o3DQ^*3<>0Adg< zAb5;gb2X;uu)+Yq{@;)~y@BfLTT4hiq@l5|ul>vT$;8#^s1Fw1^%8x+ggk6B8E~ z{NQ+MBY-qNK9ex9e+AT$kC&Q&0!i!biYK7z11c|Za<@ppO3F$CTCFZn1Bi}^ zfdUnlmC4G>1LaAvvhP4*zd&on>QWpx)6}7!QKWpZKUwew8R$1jWYT4%rx(>f+&VdF zH|$6IHU4~5GpbCBlfqD}GUv251wNi!6iZ|;Xb5Axb>K<6!bS1gUicqC%0WPe8zwEK zb0X*YA4IBN*|A?nO3T~|r{8mJ01>P*w|GWeQR>;k1M7)no zWt_xJzu*vq_s*a|l}3ZH0Cm3c#0HeDAllqe8d0B@`&~vx#$nzb5{D6(YoWwy5slH@ej2|neJ65V5xgU zl_jRbxrfYZC;`#2vc{NI7mAMKvH6rX%YF@x#Eoj>Av6*hQo-jkKxRa^D2J|^>B2v?Ffx*HA;XkDkWI>48j;8I}uF>pa^Sr9oIDPT?TpS}!V7u?;gU(wDQMF>=xF$_4D0 zCf30lwgD8L;mCFr0V&ItuYlXFnpjfoVR&@wNWAYxa$8C~4U$E6ga+|3;yr}t*lQij zRGe9fYyqD?=88-pD}K`>8qF5c_BaXzN}G`C!e%;_-&5|)^2PL%t{bkuQFLar9XAa+ zR4Uv7!Ii~Zh-|E=Ie=5=;5TJ|Fi%@)hsuG9=cO%fEfgrv0Y~>^jhiO_)*kk|la)Uq z&eZ1SzB|Kx5&L2T_TDA1{9a?(mw--ep5ywQGdaVuRm>{^M*dg$bOdch$IP)Ibpx_y z5M-d~WNI#Y6_t|qe{$yH+c6>2hP;L{C;&x8&F~pZg#!-ICUn*Ss`)yCjT};GcTu>@ z5ZhYt4G?|J2L;3#r?p2cr)gSE${W=3OSeM`!d(Tt+Ur$5l5&u{=3yCrR%2tm^5D~? z4Q5)l{aVrm#Gh}i_|h)?Q#4lc0W-t9xM1gED8x(qyCb6yrHbe{={7PKo?*`i_pxpL zbWX^R9%H!j=Z)HPJG`NcF+J3_Y-rSUV!)tc<4pw3O=~`mL~do;w5bI-v{drb4s_n6 zUT!RDCvWlhTR?%N;yrrlE4&2{iUUPx*zJdmkKMsG-`Q|@o-&C{D%OF_cp7gg5_=ZvD&-F zZM*$T@L+mQB-1N_OG{lsj40d4bvi5_Ew-$Z3vtS+A5`)`Gz zXpDm?8bu&ve&h1Qj)lrhL;Nbk((zI*P?Oa~5eP{BKstC(!)!IM*t;pLg;-Lq9$pCz zS`#ANkmQ3Wya1u4o6BF^<~_PkK!M7jlfz&nSEXdvbxJDt-N4OarB~|S>s_uRjXaSD z6-Kf|%j}xY)_qdX{N^~Q9)HB;e5*zjcnSRap=@0(>Z>8?;1J zaXnj@Sel45UuSeQT<&GX2VszQBLlS~mcb8o+C+G~TaCq%PR}fv#_~^xBC9#Ya;jAK zHX;7!bU>LRAL8@>g}RxdzU~3!r$CMBg^jLVp?zKNZ2&Ua_>Z)W$g~Z-%=1>U9Bz1! 
z(TY{~x*tr0nN?0bAVE(X(5PiRP1F&WUwd4F05NSA!AmhUH z()SRaxmKreIPXy4-?TWU37l*+-7BmC=w(P|iT8a(oG@j&OA~v+gdN$Jj zF}!UtS2-}6w8|T;q5dh-g60^&!XLe;>{t-#_Z2N#* zl%e`)*=4e5E(R&H^bKIkf==}&YeQj8R)2YyK2=U^lK1&G=zRBtU6axf29Ro+Xc5zk zieFk_I`6>as_&eQD<_u6mC5$So0$KArd;P7Z zJwk;#KYXYJsIi+QP#G;ZVMN%EA|1fu&IDs9GlgYg0;algo@s*PVp*pNK<5PgNNNRC zB{cyZK%GMS^S1n<9T!B|F^LwP$Zj;jV)ZJD6Dtm;o$+OAwz=zW=Kd+j}RdTIWb&AzRIUHEmj3@VM@C6FU7V9})zzuY867DX*qs)@=ZgQI%9w@Z75{+b zjEt0u(G55TIj9Q09!AVZK{hA}d=D8D6wV z*tFmb`h4qDu zH#eYT51fPc!;wtv=apS?;Q|krAEz`Sit5VB*aSd#wouf82E8f_&|GZN#nkZx4YET| zJGcMRB$h$?467qC8Bg>L&>#PTaJ7>L zApp;+6*|j4dEMWNHG8m_RxkH_)DaF!uU}Hw@-Mhw6^X(xMo;OhdEcp?cPb$q zym=X$CwL>E0JYiP##nU+$hA%xEvANxeP=i)^2K7fzDF~!Kd)mieGp5P=W;9cRPs=T z8w>z`MGbjcYTF?&^dv*j`;=dyXXNux*|qqcgHE=>Iu{9 zWvctb|5ex=-cxPpr?O*dI8FF2MfJV~iS;OJ`x7@Erg-Yz8L!xJLe3<1IiQ{V1E4@j z9}r3dr_*PYmY+i9E}sCobibz{2*SE@Y8>hcO>(>4q7PJKF=zk?O>X}*E2Nl*R^4J* zNrw8rXHc=}gnbC%W&v6{A>Lz|$M6se&(J}$b+6RWp=W7v^OM{=jnXMS12a`BZh@(N z2z<#@$<59xy%`KhVw<+Tl5}de?uWI=R-Y&<^-OvmkTV~TtpAzdsZuYmADc(; zVLdtj(-IR8LhMJcX#KMm`od6&Wj#H4{Uj{-b~rjQytqI#gVq%!V>j&=*+wgrZbpN| zMFuh;^+G;85;ZBia#yGnoTW6a7$jFGP-n2N+@IYqfU~{kek{evg`?_^4~!k2TNM$t zJ9UVa!4s~Yy}2-;pTAZpsN_f0;LHRR#v=IlfBM&J;^Hf%_?4ZSSq4uTZc28gZ%8V} zXpRnrHTilsUG!5=KQ@b^WwwrUrIli%0?2J`eiL`|Ep+7fmvQ-q6ktRkP{W%=Qa+r$ z#5BC!li!r4T9yMeCwJE@z5t=FQ#$<1qLot5(rBjznzH0H^a)+MgzuExP5o`vwQsju zXXxC^(#d;E-ZFlcFlCqb+Ch`=p!^3zlkCf0_Hx8^2=_?hW$$zi}`gxeroU(`rf{yc>7{NXT3>L?ati;#mg^H{!TeSrd^dV z-N;k{TN@Japfx+88UroIQpJE&IBP5z;+R8hOaaB3!I*R}Ib1XqHyHimxmu)u%r$ zJjw14Wn|!{%N|dpi)W61gl$rUC=|<1AvkrfJ$?UI(iGPDA}QQnBVRb3l{lE$S4(RB z9>Wgc!8P4~c+`j`P;uGN$ct;Le}&xXrn>O=e<0~iIl5U7J{s{qjX&wked8WpD7UVa z*~@D&?nYq4z}`DRAq)lTLk9uIZ#%0D0=tcshL!%#o+3Rd;m8mozg$h<8Pv=st+UPi z4t#!_AH*AMcg%2I+Ynd!p+b<#VXVW&*G}1WxA^_PHFurT7IqKW)WpatvKR`v)0H|y zh8MMei}ygm{r2E}*yKAP&~2%pRnn-*I}gjv`ZZ*@__Ugo!j*iW3b-52Ki?Z2(;pbk zHa_`LowWua;_IVnUT+Pu7@%GP8faI7AjACbCcB|^q31V^>A=#5I-qTVW-~o=+BG%( 
z0Qo85*rJwhdS|O_S<$i0jSP5qBh`t+4tL0-Fx#|$_qn3I!}0U<~=6ER_AtIC?I^x^{ot4V+5Q22lTQ16eR zh1C%+6VOmd5ifN-Io{1j_wy>$`2muoHCEK{FX`E$0i-u{B^@vIK}dKC_7Ej?+_G0K z*oU0K5P|`#k8^`j58ort#DcQuvwxg3RMwqH)9U2V3DA|wqi(uVdUsnU@E%7g-9J}l zE^l~HK)&u%(?95B&Q#s^Klqt>f`>gJ^gLzvjP1-)R58rjrbyv8k{dLMEmw%Y8dVMk zO^UCxT#G2Q=fcp>{FpwGOj{ zN*5f+RxstRNqyI!8_l5x;q$n@lt!1+Gnn}DG?#q1<0C{iul^|Zc`N~##S_qyloTF; z)ZWoTsNTW=T%5zZP4uHhf1Y8b&2r6w<|PNlr#1%SBOmjUN~BNYI?+lF&SN)utZy?j zS3;8uSrEmJJ>{*wsYsQy8&{S&orcIJnR)i(nbu2gWB$arY$Puz+<~I{sdLs^cX5!b z1orODx{>A;gPwfM;qvqRaKz4;Z`#mUqcnB0epL1yr?)POmLSZtDNKKtna+U)^#0Rh z<%q5eWeQCC^wL*bp03Hh(Sol!l)S1%D#Y4l@`ss-9t$YAIs)F$@xg-TQA08B9X5Ui zfGz*vU50h_NKJilr51%ABVV)_3RmL20 zd@28#KEl25sC{XWP<^EB+y1N_m8Y~@F<#y-Ywau*r%8)$dWz)3bWar@Mz~;+DZr!v z10&Z?IBD4T9Zpc|Rq@f_hb!f*ceqB`2xIfqhnHn=x7=~aI(v}iz(L!?lL;;Z3(u476b<2F?-ROb*4z7=Sfs%%=A{oEAOTZbOVf2`OvmsOu#uxe~Dh~$H0?0k>ega}d?S5Tf;E>hNvhbv4{A6|v8WQ8wIXG9ZqC?e5; zK-iepa#yLc$IcJuqs%r_Q|Ff_2`@aq2g8OphnqH*#EGv=G*1jV(FUsu_RZ)y9 z?phOXzNCXt!zei9y@LeiW4PeAKe}VxyjL z|$E_$6=WE5t$yn3L>D!CT^ixU)d&MVY4$E2}-J^q^OjGeT`--dC zMCWJ`9wXuaGw}F@H8^(GRebxJ)K`E*611 zOXWwz;pAAj81{by;4(qC`Vp@vV5NIUy97&4c4oVtWcztV?>^ZBNg|(L#a)>s5|EB}P{1w;EN_=cDa| z)BSEZ&{l%6c}fM~p;Xx81#HWBfXQx;k0#H*-uDw`?sMrb;GYtH;-}tWX{s9G%9Cvx4RRSzZ>cIINev%>p>PZk;eKJ|>f*#MBO{ zvQSc9rB^bbSQltdL2O3PnJ?b4@yD!(F1x~<&Rh1f=6m!n9T6!4cQ}VWD9}bfRl>|R z#Vg6E)GVwt9ig=E;oxm_NJoeaoaHS(`$h3iJX99i4KnA=3b3l0fq0S654|0u$1C-G z=^{_sX0#k`JYnlRfGxKsLAo<;2K8&4bC3oO%XeSDT`7Ixc4w~>Bo)a(uj-fX!~#6Z z4GmiGp>g|%7$MPzI1nfgpCp(Q+ig#`6BWVEVsb`%rVcC`E;&ZB^KO^Wk!sh>SZUvO zS-XX1bLT+Z27)}eJfT|EhvVH%& z|FA*poa;wgJ7kjJeyF8W+ez;q@F={&t6DRvB=M_XHsT)OOWXcvQ0|i)ABcBi?YW=< zu^Hr7wepq#XsuoHaL^AoEp*9L`n2J3JwE!09QPv|S;DH|JlL+_uDHXraK|a!6o0~f z(k9Zb5ifIQ>P3s9Vq*aT%m-|4V)}^OZIMkGG%%(jc{x5tsqx=#6X94f3Hhy}$ zBKbLrnu*9ZZNwB!XBgl1kM^&>Xs8CK5Za*XOF4m@yalLCDqNfih^ZSI5V)xONqzhaqbU3@d z5MzFAgLOis8Hag)SibD~`+3s*e2=F4#hAjnVH{gI_n|~$%6MtIR4EXMCJrpQMefPl zf8JiTa$<<4yHtYo)gl@{29uPvXWITJaWZ&TkxcdXU3%;HXi;H#%|=}-NAb;r*%w^G 
zD~m)mKT*~*uBWHQ?G+nCuQ~rpsrjX^9Yz`pfEQ0EUMf!ctq~~uLPCajLaN#s%Kg>0 z4_4OE`6_FEK5^fC(M)v}Kdv9i^=R&VJ(x7bb%FYdI(Ujr(6ENS z@;^u<#2NTac0VW0d`v$@bSGgwBgmUFB9SL@V(%~MXxwrc$aLe6Dj*P8rrFA@OcK`Q zt`MCP_GIyZ*>L5FIhk13a6D;iQf{3}Re$|Ev=Vn?V37cqhsH+D%NoCV@X1i*Vir%p z;?!c?y|LgECyh@aI`4pgUo1_YLsW?rD zaT1x#T<_+B9wEN6D-lR_u6ZMiJcvb3GNSKI)4G7iW7HB(NM_dr#mDoEB>}ZjmppEL z;6`7S8jfrR;dx$3QUj7`mT;GA!tX)-dFP6ro}N7EHeC%ap|F4RAIQO&z0>fJBRf~J zKvkLpAU`I*ee-8%6#3zkjzw(z!u&k&tb{OM%2Gw(@ClSK{W>t@%nzQ|Imz1IgabhhO@D1O8SZU%)MxPp0r2AjW4 z?zAXf0xZCrJ*NMk`#%5efoBEEDy?F%FhEaI7lF_pEP3)6W*xmwKe_iiA*RfgS6cYY zS=c|^ff=>3gCY7rN4%dNYqA?3i5bUYv=l@ndn~+38u<9i*pP+!HZ_MQhq`Pj2rx9@ yPH(C>eZFo#eF-6|I7v#(h%cWtzQVquvr37sd|!g>0}o*VNs7sdmW$|p`#%7XRTMS= diff --git a/doc/figures/fpga.tex b/doc/figures/fpga.tex index 02922a0f..21901fdd 100644 --- a/doc/figures/fpga.tex +++ b/doc/figures/fpga.tex @@ -14,15 +14,6 @@ \definecolor{myorange}{RGB}{197,90,17} \definecolor{mygreen}{RGB}{84,130,53} - \node[fill=gray!20,rounded corners, - minimum width=6.3cm,minimum height=4.8cm] (border0) - at (4.5,2.0) {}; - \node[fill=white,rounded corners, - minimum width=5.8cm,minimum height=4.1cm] (border1) - at (4.5,1.8) {}; - \node[fill=none,color=black] at (4.5,6.4) - {\footnotesize{inter-FPGA reliable links}}; - \node[fill=myblue,rounded corners] (tile00) at (0,0) {\footnotesize{tile}}; \node[rectangle,sharp corners,fill=black] (router00) @@ -123,16 +114,16 @@ \draw[arrows=-,color=mygreen] (tile13) to (mem13); \node[rounded corners,fill=mygreen] - (ram0) at (1.7,-1.6) {\footnotesize{off-chip RAM}}; + (ram0) at (1.3,-1.8) {\footnotesize{off-chip RAM}}; - \draw[arrows=-,color=mygreen] (mem00) to ([xshift=-7mm]ram0.north); - \draw[arrows=-,color=mygreen] (mem01) to ([xshift=-5mm]ram0.north); - \draw[arrows=-,color=mygreen] (mem02) to ([xshift=-3mm]ram0.north); - \draw[arrows=-,color=mygreen] (mem03) to ([xshift=-1mm]ram0.north); - \draw[arrows=-,color=mygreen] (mem10) to ([xshift=7mm]ram0.north); - \draw[arrows=-,color=mygreen] (mem11) to 
([xshift=5mm]ram0.north); - \draw[arrows=-,color=mygreen] (mem12) to ([xshift=3mm]ram0.north); - \draw[arrows=-,color=mygreen] (mem13) to ([xshift=1mm]ram0.north); + \draw[arrows=-,color=mygreen] (mem00) to ([xshift=-3mm]ram0.north); + \draw[arrows=-,color=mygreen] (mem01) to ([xshift=-1mm]ram0.north); + \draw[arrows=-,color=mygreen] (mem02) to ([xshift=1mm]ram0.north); + \draw[arrows=-,color=mygreen] (mem03) to ([xshift=3mm]ram0.north); + \draw[arrows=-,color=mygreen] (mem10) to ([xshift=11mm]ram0.north); + \draw[arrows=-,color=mygreen] (mem11) to ([xshift=9mm]ram0.north); + \draw[arrows=-,color=mygreen] (mem12) to ([xshift=7mm]ram0.north); + \draw[arrows=-,color=mygreen] (mem13) to ([xshift=5mm]ram0.north); \coordinate[] (south0b) at (4.3, -0.9) {}; \coordinate[] (south0a) at (-0.83, -0.9) {}; @@ -282,16 +273,16 @@ \draw[arrows=-,color=mygreen] (tile33) to (memb13); \node[rounded corners,fill=mygreen] - (ram1) at (7.57,-1.6) {\footnotesize{off-chip RAM}}; + (ram1) at (7.97,-1.8) {\footnotesize{off-chip RAM}}; - \draw[arrows=-,color=mygreen] (memb00) to ([xshift=-7mm]ram1.north); - \draw[arrows=-,color=mygreen] (memb01) to ([xshift=-5mm]ram1.north); - \draw[arrows=-,color=mygreen] (memb02) to ([xshift=-3mm]ram1.north); - \draw[arrows=-,color=mygreen] (memb03) to ([xshift=-1mm]ram1.north); - \draw[arrows=-,color=mygreen] (memb10) to ([xshift=7mm]ram1.north); - \draw[arrows=-,color=mygreen] (memb11) to ([xshift=5mm]ram1.north); - \draw[arrows=-,color=mygreen] (memb12) to ([xshift=3mm]ram1.north); - \draw[arrows=-,color=mygreen] (memb13) to ([xshift=1mm]ram1.north); + \draw[arrows=-,color=mygreen] (memb00) to ([xshift=-11mm]ram1.north); + \draw[arrows=-,color=mygreen] (memb01) to ([xshift=-9mm]ram1.north); + \draw[arrows=-,color=mygreen] (memb02) to ([xshift=-7mm]ram1.north); + \draw[arrows=-,color=mygreen] (memb03) to ([xshift=-5mm]ram1.north); + \draw[arrows=-,color=mygreen] (memb10) to ([xshift=3mm]ram1.north); + \draw[arrows=-,color=mygreen] (memb11) to 
([xshift=1mm]ram1.north); + \draw[arrows=-,color=mygreen] (memb12) to ([xshift=-1mm]ram1.north); + \draw[arrows=-,color=mygreen] (memb13) to ([xshift=-3mm]ram1.north); @@ -359,33 +350,20 @@ \coordinate[] (south2c) at (4.7, -2.3) {}; \draw[arrows=-,color=black] (south2b) to (south2c); - \draw[arrows=-,color=black] (router00.west) to - ([xshift=-2.3mm]router00.west); - \draw[arrows=-,color=black] (router01.west) to - ([xshift=-2.3mm]router01.west); - \draw[arrows=-,color=black] (router02.west) to - ([xshift=-2.3mm]router02.west); - \draw[arrows=-,color=black] (router03.west) to - ([xshift=-2.3mm]router03.west); - - \draw[arrows=-,color=black] (router30.east) to - ([xshift=14.4mm]router30.east); - \draw[arrows=-,color=black] (router31.east) to - ([xshift=14.4mm]router31.east); - \draw[arrows=-,color=black] (router32.east) to - ([xshift=14.4mm]router32.east); - \draw[arrows=-,color=black] (router33.east) to - ([xshift=14.4mm]router33.east); - - \draw[arrows=-,color=black] (router03.north) to - ([yshift=2mm]router03.north); - \draw[arrows=-,color=black] (router13.north) to - ([yshift=2mm]router13.north); - \draw[arrows=-,color=black] (router23.north) to - ([yshift=2mm]router23.north); - \draw[arrows=-,color=black] (router33.north) to - ([yshift=2mm]router33.north); + \node[rounded corners,fill=myorange,minimum height=0.5cm] (boardrouter) + at (4.63cm,-1.8cm) {\footnotesize{board}\\[-1mm]\footnotesize{router}}; + + \node[rounded corners,fill=gray!20, text=black,minimum width=5.25cm] (links) + at (4.63cm, -3.2cm) {\footnotesize{inter-FPGA reliable links}}; + + \draw[arrows=-,color=black] (links.north) to (boardrouter.south); + + % Is the board router connected to off-chip RAM? 
+ %\draw[arrows=-,color=black] (ram0.east) to (boardrouter.west); + %\draw[arrows=-,color=black] (ram1.west) to (boardrouter.east); + \end{tikzpicture} + \end{document} From c1a492c9b411a79dcd99a0866844a73a46487637 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Mon, 20 Jan 2020 16:55:30 +0000 Subject: [PATCH 03/78] Board router now has interfaces to off-chip RAM (Untested.) --- doc/figures/fpga.png | Bin 17084 -> 17166 bytes doc/figures/fpga.tex | 4 +-- rtl/Connections.bsv | 82 +++++++++++++++++++++++++++++++++++++++++++ rtl/DCache.bsv | 62 +------------------------------- rtl/DE5Top.bsv | 30 ++++++++-------- rtl/DRAM.bsv | 5 ++- rtl/Interface.bsv | 37 +++++-------------- rtl/NarrowSRAM.bsv | 2 +- rtl/Network.bsv | 44 +++++++++++++++++------ 9 files changed, 149 insertions(+), 117 deletions(-) create mode 100644 rtl/Connections.bsv diff --git a/doc/figures/fpga.png b/doc/figures/fpga.png index c05a5e99609a3857048baae3a4ab2a67d6efd9b7..2ea48fdcc2aa8e3ffc60dd4168ddba0d49ff1cb7 100644 GIT binary patch literal 17166 zcmb`v1ymhfx2C&scXtgg!JXg|+}$m>yAy&-fZ!H%6w`_F&6@9EJ! zy8Dh>BY`4e@2Xn0YR+f9@0>eQSy37Vkq{9C0-?ytNT`B95ct6U5j-UD^{!{!ANYpg zB%|XB0%7s}^A{q%R}%sRA_vJzh^l)npBi}Tt7$!d;>kpo5(}#7OJvrAaQ+$LPQ}a6 zHMgwWSlu{(X?tu_Uaf0W-sE^}yRWy=)~HKwQ@forlvo5=7d{3pBR2H%>dZCpJ(PM? 
zg!2CRa%caq%USEB(ByU2b=KpzT^Bx)V95SoL88$h7{u8G<)0!X2ryAvFuchd=+M@I z#%VXDPt0&?bcprn(-Kj=i+e#$GIYxL;|W9NSJOpuIG?nt4TbEGzMQSLqN1W+GfeI`EBaZ1R_IypHJ6ins{^(bFYfQA$`Fj%|V8-)PrH9K?$ygt>q7v9|5 zpb&BM^71}BJ~HdKTmbJ&L$}-Ny6OTj zB*e+7F88E`GmWtBrfh%Fjyt@MB5Bye!()7W+~@Wd4rOa=YiVie?d^@1SdlUz zt@!(ZE%H<23WaR~x8pi7CMF~(joo6T&SEso9%cXdxZVHR13pBCdOAnQ7r0pY8p+SV zrl0@=YH4o=7ZsV*E}E2A=hf-m#!1j5K|w-di^ULfeO9SO;pqjye-oJvQ&zCRDNS)V?_lMH+NHE8Dn=x zeMuQ(b&Rd{go*IHsNLuyIc%n+l;+UU(NRoHj7B*y59`QWTu5Dkx76UiL>cuHqvQCM z&ApuKphTOIS_c9F$Bm4PboTTlYdM^@4oxuBEO@ma__oL(ujMEmLVjRqJ*YLvh>Rsx zDQ&NJWaZITT2uU$A z<_B;2c;}CMN7dD9xHve>x(x?|W?Hf4Z9W3idmZxz;iKs9SK^tJ@~f%pQDO{9-R=9q zJ&o-t^Yr%Kpb%JUY+1bXG4T<%AO3G^OQe>&G6UQOuTL`Bg0*ZC5FoqPy*zbG#T_|< z=3L4%Nl;J+``KQHCN#&Y>o&r+AvB2BwV<@HY&7P^!fs}6G6B`7jAPnUll#(@%ynFb zaU-XcQ}u5yLULm-Bq!NPQb@7utE3qHb! zIUD5Ji7yqsQQwL)8SgjX67L5{|>xr1#R{El>*xtJL8+`Z6g26b2~ z8iRt=F>Z>f>G>Oan#5k#Q1QnP=Qx!vZctE*@ql9{B~4iAAzpv@a+Ttpc|D!&zV|mD zz4VH#(c<>W?9GGY&ua}sgA-55F0{WD@OWy5ME!PE{g0?q>zgy$U?z1Z&u7Z-I`LL{ zE`41Ek#gPMlGh+XPW7}($X13O8~KTrXLlvRsZ)0!K0UR?NN`Uk>D48>B3Y2B%R0S^ z3PdZ`HPYm?&BJfBsw|T3Srs%<>hsT?#u~J;6VR2N@Yk^H#%8yEwG?R^<)a$tnFwpy zX|@3cNzfPx_S@grx+HU&mZB$Y3*Ag{QNHO6o~W#SW$tDXSJs)VT_`Dz7T4EMb+@O2 zg))Uam>%m`PxKP9{M}u+ykp_Lcz?Af&gcoE(VoE76gVT~!eC zKbm4f6nx0Xhb~EWl}9f~Vhy{Mq}q{z=ITKrl_T_83h=j-!^<6Q2>2_L&IkTJol2Se zH*Q^0z$M0}ErJgd7V->rcZ4SR>CThZ5PkVR1n7IVD<_wifZE9Mqi#i0>`FOZ zFa;Chsmo`Jt1F)zv5tD4OhXNS3ez_DiZ|Sw&yij+D6zj9Xdpr4@N`dqqHNQN^_jm! 
zU-)ys$x#|_f?hQs^-)0}N2{aj$#Cpvb8^tn{k$K|92zP;gD@Pht<8(^S(DZEE_zf_ z9e<`;&{wo3zoBy)KoiryYR+^ZDAo_wATsb<~$dq)(nHG<09(o@=tNLb{f52sL_vT%uH?mfY=XtnN*%@=O@@=?Zd zPI<8;!(YB&oe67pIdt6@I$5d`mupcI?G~x%?yBoQwAI4-t}9)}pfwtxpBH)* zX4(XEAo-WZ$|Z0ayy8~URJ0e#DwnFWq}(}Tco9qdHuDm)vHVRN%Fe@bC0ZDN6EN+u z*8?kI>adqT6fm@OqgJ|X&dIJg<$db8S(3!6srT}%2?JBZ3`a*z9B-{VQdp3M4qZX# z#iCRZ@EsIcG+>mwROi&&E^nK%4X%5Ft*Yo@GkDddR&c!*OOVG?0f9^-^(*N>AQG+O zG|1HxE(hZ2sp!pMDuv`UFcu63Unl2&K|z)a-=AGc5NKxOCExB?3OIFQdba5auUvj0 zFR#BjYQVGptweDPD|ohd0U&!y@QXXEm|16?|o z#S>^af`V8zq4BBo_+u}YGP`8(I(*Y4SPZ9^?9<@Y^wCL8+o;-e8=gEtRpvRsiKWcS=V^}y+Mu$@6k?w zcV=};Z-!~}R_10r*e2Muo|b5Z<@E6P`9ipi#T9qeP>m4b9k+y4@i!bCv@v2Sc7s*W z_)t_7nlTjiom>8)*0eUVxf1R2OnONdI*-APz5q6zvb0z*q&xM$cVLbH{vt=5Q)%zI z!S{b84Hs`MDMDat%>Rf)5ne_I+e-Wg7Xb#bfNvUox+0VC??m?T<|fGK`T3canmX<0 zcSfzM-&;8_c*DcP6B84CYC7UynvY1V>8$4!kPi%y z>C{QI2ZVp^3j6KQ$c&*XS3mJT=dI=EC-H9=7Z)cdc7cI`#Hh3hGV*FNJ6y1@%ctKP zWyB@x9c;L=s;kfMPFE=4DHCLT?@ns#>p6OG>4A-l3kS4$c9xTp64q}TUfC5k;t*Uq zRe^@ioHgJN=3(5BUBcx<|DQuv3IX4A*G#4!Z|uk?6qYD}x3Y>A^<-ruxA!}J@54@w z#U0L~&6K~1nTU!8BOrRezxnsOD=3WDXw0K8GxNuXLjLPkq`}^-FKQVWoSvPbg7Ak( z`>YB~EW2b=Cim6oegxYId^4FiYUCxH8tHjrWu)(jRU4 zD|rc6t8V`{R^gDWLXvIQ`7~290Zv;G_RvEMhJl^t9rB|)z8F;)E^b1_07e2h+ z$?vuMlBVGm@qvCVE$N7$1n!Ue74r58v`I#L?2`Ky3nvW=x*hyk->~I=Rge0S4Igxo%)3_zw^jvOg-N`k;d|tL?S}a}6yFqe^XYNV@04yqW+k;_@Vy z9b0DbXt@J|orBE6A{ zPk&w6Ehxdjm?zVA?3B(HdUAm4e;>{_=MP|2Vj0UT@!`wb-+&qXvD$$0k~RH?=>eQW z)#h4~p^rS}U@D7Y?1fPH?6~t%yKO3sB|PCNdXU->0hUalZ$@TZc#YApBUeGti-&CZ zr94l!KAEO#seIKuIAL3xl|8pcZ)Q;|VgX4^`+tO`JRyyyU%P=UAIIAKt`bRq-e@gHvz@4e6Z6eW$gRcnpdu-GYGJ%vCjR0|dyN z{m&xHtk8RK%osy~rop3af*M_7ql+(`VO|;`r@H_K=UTAh5anpC1Iy->x{`XoN!}w? zmb!yWo21jwEFM1mq7D{pphZ)c3#Jo(*dAlQl>_w*$#*J>iCTxQ2!uj`<;0FGI{F=A z6Pw@vW}GrAxCkKKWS+q#2c;*W9Of4k0StrD2-!kqgMr57^5hMK7 zP952`a^%Wxn;*ja4AZHq0JU^Je(9$@^Qz?`XXznFYb40_lFZM0|Mc}P}<;BKrbCdwfUL#A+ z_8r3^lJksY4wM@?@v&>OG{MCHTW}M}Od;|uo8z;-}yE!69kw0!2 zPSntR^W%h`asv8kxHAPLo4m~2-_On!iO5WkZ5+1_CDO5bh_}Ru1QMtI(5(7b_^tV! 
z%WjPo@@iJga1YgI{LiUE-6G2ZLL3QNK`Z{xgl1pMxFg;%Ht}(%=~7pc(p|P6Apdql=s@3>z|1aNGQHM(v@XW{1m}+?50%?-di|`HGoRk_)Sa`h(=xBq zY!&)Fuja5a)tJ8{v{sh#y7$yb@!4kj{;&}CZo~))!hxxdTonAZHqG?Ua_QDs<>pI5 z4ZaMPKP4scv8bJ#t21?zC86vhzdJGSv|CJ7rVs%JpTB`M)DJZlIQ{YCWV>;5`PIc@ zf$34z?mrHNsTlTagZ*}jkEU!fEv>PNROF4`EURg;9dOa6Qc1xVgmhFCrlGL`aSeVn zL^`C7FhKGycp0;N2$uMD=C!gRlOEV52r8|pDDnDDKaR85G&>p7CSI-N!ku^+_NXoW zcz7tIjU4(jBl&o<@mg@aJeP3K>MM3`D?2W2Tb+(MG)ETcj&aPyei=2|^|Gsj;LKXu z5fn0hqgnkByvul>{Zvn^$zLy*z5z*L8RzjnhNa7<_xxd&2%J-Hk)U1whR*s$+(*i*2Ur0 zzar4g2B4>bMMd)5rHvl)>NEPL;h)Rr_nw2F_nQ(M`91gAvn|9#3 ze1-&JVqwk9%!rGNHx>r&>C{1hBq*KV-(K52&(Br@xkxD1R#z3Xcx%mvF;AaJP@f1f z+vl3*Sr{0O0I?|{A>r}yF)GtfIQMfAYPnC`xTPyEF?+h5mDR%PYK8wFyApI#(wyw< z)ARHAzAb9ZNNYsvj)9*>?Id-$*4KZ>DJUpFLG)_HOnOari}OxhU0o~9j^X4H5|pP) z^@9LMLF*L)+w815D5$;NS67!1H=3mBiIAaP<6Op^osWF8pQS{lK#HD$TaB>?S8)db z53biAqkHr#lsF4KQEzQ3H>rkE2xw?pegcsaA<5=8ao7X|rNle6+DvL5g#gPmI}tX% z%ocV9M4Q5&;(|+OG*z!cWRBN6efvCWD0q0ZoSCr^kq{Aq8}WR9&RAt@V-r?c7xu?t z${RpO^j{*AN*rkyKO}yo5pvv#kS7nLSgrvJQRL+uwLF>S;UxUK3Y~Kc?xw@-2DG4? 
zBAxJabuI+dLM@amLMHmSWd7lPXrZ$I1|}}MLoYDZWS#1P1vUObFNLH`BePR?h*jQ& zp-c=6pJikKdJK_(eRb*jt1J2pT8ilNtH7c-K23UulYnh$vM9Q(ClPiKRO# zxT~qN_%zGp20SgjePWng{UWye41DExEjZj!C#zqnn=og;#D4y^;(|^wQUx%FZ}Ant z0>Hw7T)Pq)24*~itNdq${e2|A#O@g54xnjpoHyIa-?CK)neL4wuQWMU<30ih)4I~D z-fadHs&ipX*Imj&2^Hgot-6zN;%gUbrW`kh4NRT3#}S%00V7=UQ71vqt-AQYfRKuR zI4C=ObSxs0pz7gedjbDn3{yq}v7P_x;mGToGBp=f{|R4kjeGCG0j_m^JA1``(`{<_ zz{5)iyTiaK*s_pyC;hBA_p^YvDS$CwQfhIX-QA)c@I2F^H8TW!o=NlLu_u7bE>kSE zKP%-VjQ-th#V|gU5M$K_3M&6cQc7f*7pF1Jwvz>cGGNZgk_>=gb(GfHvmb*ZY?A@g zMr|hW-6YZMKh>nRJE+X%1QRY+zvFkSsslEEvW%*(+)smF)QQI7BMdKTQDh>xseSi%#2 z-qolI5SiMci?;^WaWcQ#h!TNBXrN+G#jr38qqD>kDw4~k%@)TFxsdz`R@{1iX>r$( z3;{r}CjhKgks3e#qaUShux-3ThJX93V{~=qYz&m?yQ>7lM(Y9uF&J`L5Jqvf*Z20#0V|DWk)?di(8!a(|Ks>%zzn%W} zKQl8}yX}Sq+KxQYCtSlCX2c(n+C93bv(Rhc?~5qN^R49mInU*A0g3rQL`aGY6?>|rt z3c?f4#Ud%p*@W-==e;E~K8ssH&@Vuky*TP!Nmn(TTrh34Up16OKsHhNlB&j#l)7+@ zy`9~;bW?VOgfSy&O!V!i*5}`5c3+^2xiCN=QIV-|P5sf`O&d|Y(DV9BBTNV=xPxlU zO&1Ud5TtSgg$oz8VIe?5Xhj{_rBR5&ENN^*o&5p3egWCZD}-#@-{Js0yUT|q^yeX< zh}sZ&f<&Gwi@mY3EaOdZ)ZM0;oT6#kzeg@EnQ32eeGAn?KZgb_zDpN%qByIBjQ;-k zH>)=8TXI>wlvMr$C+w`E&N}m{lZmc;yhqSZ!=gxAw}8E0L}hvBH!zlkDzs}TlKqK{ zlz67oeeIR|btmmNj-+kvPRb?nYMmKM@b>cHw~XY4jO(=PfG8`!lG6|`+D0|KJMO== zwY^bMfX2FglOXkKv$4T4p8${lkCrsz+h6BTn|mVg2>q6H)u>Fp;(LWa3Wsrt-^h7P z)mbstpU{yt`!TS|MUu=|P0Q?-CBm6(`U%!pgG>VgSm{34Nx55U2KeNj` z_dVUH-<1s&Gz24JPPvKLOuDrR?h_?OjT>R!xx|N%;&P)e;K%@Ny2QvL{<-ph=}R^9 zy%%ueAIW=Zv(yIdEZsyf_sQhWO zr9KwWtwu#DMRDK~T>@BENVBxaRL4d)p4(VEgrR~42`Dsd!EUd_!#Ro)-^ug{a#_(n zR?{!Fsj7{w zpVbt#diHXmv(pvVkA4iv{1r!h+?nHBPDAE(+1UoJuJwlgucG2!US1v^P2Js}P{WgC zsEg?fFA2W}oO%rD7Ut#a@H7%mEG&3F-<_gEC&&~E`Q9xqEu|Yr{{Xg(3`CHVr>Byf zT%T!epQe$xMf0!OFZ1rZsapE~!ZY=))NYlVh1mRV&R6yFw2KR;{$}71+OoxeWdH^p zVIz(1XQ!85_VjT@0;}BTX3Q?!gAW&(71hA@C12(hP+D7A<3w=~zrn!301oKu^F7W2 z7Z+Eb=076cf#83lIN>jAGSJrbBld`#G(QQSi#b5hs_BCFY|RyMRu^Y`2zg^5&x};8 ze$IR5djpkxRct@u?zXn&wYARHg66`uV+Z!J5Rd?=1{m|12PURIW9+}_2|1xI>47k^m??tMm43x#i#ZpRXuT%FQpmt%t 
zF3mxJ03*?|hpSo@8xW{?9M^r6m59lcZKRuJRImX_7LZjb=n7+F6?AoVUw+Gi&o5wQ zB=whm0}2w<9|@WbbF+M@-h;2Jgsvixx=Gk@x_pP;fAHTlOlFJ;dO))Rus2%1-yZw~J%EChmOw=?>BgA( z&>G+RpC%xnR^?nT+7XwQ8i27X7g5dAGUsLz>dd2T#X=Pa=K%^t7GNu9b#b@lz;OYV zej7P|i&JdHE3>{-Qn&tREr3|UtiJl)&-_t9KgTO?Rj*W{>aVRD1!usQab_{&=ewy2 z;6lCnDmdwXgaRqw{$NcQasQOkSWJ*!a`<0ez9CIIaa2I~y5p4rgaUxh+|a32iI1W9 z9$mgWH>)e`{9CzVbhOren+pLPGkI9Tm2YkPlgA0ZN%?sc@8 zk~#vDjI%2v=6m|wYyM}@(neZG#gf7v^VL5DC>+|L`0MbU%+MRs+ zg`$PYz;{XvG1qw~Ul|apK7Fy8l5Lv*t+qh8CBTEYGB=@HM=}HmRh_krS~O6S$p&!q z-#ThJJpUlec=B2O=3QJu_k7Vr0ev6ZjRv=z3Tsv+N-^Qss4nHjS7BUA?O&@sd*E0I z4H^O}5bC8P5&U2q&ENahl>#@~Z#?+-WJZCXte?|L6zz!T zB#YA;$x|95^ZJvC{uhI7mK}blmb%fWRVT~){nGvPLnQnFG!4L62JL{)K2FjJh_m=} zZWylA61?+9DF26En$&l*wF-z-DA2E!^rx3=iv9t1c+i12-QNE~pE8&FT__H?aJ-nn ze6XC;(3nZw`2T_{jlsznd6J6#jqylajMZHGZ2Pa+lMl)(Fqeup(ru-b;QwAq9K5e#B5)#098am>5t}>h{zufMSea2bfi*E(~%&Ys+ z8VS|5cA~I%2vr>wYc>u6LP`PC^>BIYzuYc`-+vwM{)%eb*v+6^*UncRKx)L#4DgwB65tmBj*Fq7&X&>^n4F0r9`-8E#2z%QAHs*4jq$P&r1-%nG|zszX=)3+Uya$n%x~l!=I$Rmd#CVR~8S)BnO83X&{NCG7^EsAr z!M=!#Oo-8c@zDUVA8a*^EKrM7Ml+irPbh9%;4cJs zOTgJ69=x3#t=XVuhYp>`l^ZFM%#cS?hL=>Q*C5HRLdTIrm5JZ~=O!Txs=K50e&oVS zP*;Cs^U|w$PMRI#$`gmhmZ~@Fprn1##wo59}-bTWB?uS3XHDfNOu$ zov*WKtndDAVlQfKf=C1P$Pa9h<>l;1#e?(9D`zAdl50KCD=TCe`;Y*uUO z2L6}omVJ={#Rhqt79e!h^Iq?FcliNJ*U$@drh#7tWgnrD(o4{uSz+aBOd^+$jmjRK zphFrB9DSjbXM@mT#cRIbUQsX^vf$h&fhWF?6uv+EZakjsl%yoPqkz%~x!z+#xft38 z4)-L>yR!TC8Vf9z@jkuZ?(IBLGM_jTwzHk}peCh!_R`OAbX(}lJiVXjy(a;`VS#>C zGkc`9NGR(qd)#w^ehJ+gg|w;4Dm4#g8qMdd4p*~KFSmJhbt=(dxL&b*q_k>S<6Coi zg`cAu6Ao}*C|JR#T;g}?2%Tj66)pU7NX9jeQ)2nN?RR&Xn6cW>9H@5pL5TPexboTP zh6QbBQMKL<(_0Q5(ft{@ZV+hsESsS9Sjy3sxte{0+k<7PzkB;R5E8^6T@B-(L2p;O zt?_rK^7Nru@uD1w0^TLN`Ed*H+GpkHC_w4@{lRT7aKTfki}84Gr;~^Thy(F#LHIoMM642kxmv=Q5C zZIti(giz(nc9*e04E`zJ#ve`p&V!JVT)RqQA4#D#p|-@AgbQ4RE!MyG)T+)L&9yB@ zT{8K98w}w^Z2!L4o*ycV@aVWLz~}OUM4a6f=Vk_3Z}rJfehgtV!7fa#xs>lZCa>zY zDuiLx$bE2{gzMcTFRFp)HKt%f70-XF5}I{6byE9_meR#GFM7!;T+^;V&nCq}gx=_C 
z=37ZxpaCKzddcXFb#4AxpR{pie$0{(6aW$K5USg3;6?PnTPbLp77T>OK#>5pz|(x>CwW3{5yom>`QKR=;YAIO@> zLJJ{lqGo8x?JIz1x*iL7LASR1mA1Rs$sdEBFw2l0mB)Su1+ znjsL%$Rkd+9S=YK{cxlP;(biVWl}=EToQ)th!qOD$=odoXx;u3#{;hDGr6R#dUZOV zN}iKlDbGo1bjl17k|L-$E>*eyWLPsO)7C_;jM=*{51*drr$&7^KJwnT*0aBst~c~M z_0C9JjbKK)lk|7oi@2Uu97RcMc_5X0@Xz{so?@v0uab?w zcd>jX)FuXI(+CqNr6#NfmsehwN|c3-D3Z@Q$`PE)=7{&|ac@<7cQrQuU9@^Bg9M3; z&WXP)n=ZiU*_){fuD{e#_#1gHt)11`MgAOhKKf)|khG^-D?e+-6OfR$u*GWu6MVLt z_<|880Ry^enoA|~TG$z9JxrJ(p<(_Y5LbgrIEl}t=b1pdfOnX~%i=V0+lvGWiaSe* zJDZOROVs;-7ubDS-_xep?Rt>(UimSxO8R*)x7*Xs@4^jUgmwRy@Q5Ne1N6BFCqt1_>%>Z3yy2n zeiEBrGkjUH#mCfO!5IEP+5cd;)a+PpzM|t+RDXcDKCQVrH}7*y9ojGVpf3TfImHDE z3ZjiF0@8jcweFk(c*z~9CF;h;ojE1SM^en}fDj}CW}i7H&LHTdiy&F@#O36;pB*{O zWL^DK^--}^pC#!Sn-*;-K}@g!YPA=!yskh>{->^XPTLz+^cE%_XMgaSpqU#3C`hWQ zE|1+ZM$vy-TUR%;`s(3E1u1tFMRRknb?R>fA`oy3`{E38hks|6U-K88VtG_DdsPB)vbW{GqTfjQi4Fi z;gD+RV%}1FzyinaQ8EXFraG4aOS1i6I7X8q`U^~Lw&K2xKT}g*AC?~yW}f*~b^H_5 zX)ZlczS(mvnyH)Y3DN3#8jK8H{Fw+otK&5p-x}S((OBztdgqNaW^}k_>eA zJ`G$Kdf8v=0|JqE@7bqw(z9bp6{-R0XQda%zd0ROnf-Hoyki`xcOHlj0{1uB;W|t4 z)xFt68Go<3&vL!ylh5t=-CwN)_n&knl`f7dklI^J3Rll5N*7p_Je8N{}`+RiX8-|z2*sSf#jX<5K0u%OMO zuQ7BRmkeupRn6dycWAh%Sci5IqoP5Jkzy$MXreHph!5|w=gItpr>G+{l8cc! 
z6v)4-5LncQA# zIEZiy^QGeUm$8M5j@_@_E^&CCD3%UklW#cR6e+b2gk9iar8vX-jX$!!&o+sKMbIX| zoA<-xya?GKt4u1eqeBn7N0|Ed_gX6Yx7_3J^~Msjy|v%iME2Qr-3Ns11D*`TLrTmG|iImm0?o{DlOOF$B3f^1X zYw|Cbj!ZCnOkai${Yhl$&RPw8=!MN`!WTSlMF)&|r#q#!XhO$+iQu;&Ub+)_ms4ZC z>GpqNf+p|37}^VQ+Z%LD(U*_(MS?_M{Tg$Z>9ucy3oRNo)BU3w0{Vxv4jpyIb`N}R zt-o0$9v_|10G@|$-ODEa&$d?N_z&n#5_<{Fw%_*u#`;QQd=m%Nd96RaXZt=?(i3&b zz01eiF+f$|;v(Udp(Z5zdwz3FjN3ii7Nlg7+WWg;MS4_Xi-W^>Q<#6edX?!q*a?JG zfi^t|kV@B%E2+^!$#H6OYy?085NBKA^cX0AAmh@|g#p%xxxqJ>(jKX4$P~88<49Wjun7qok1^b} z*&jaNg8*5^DY`dVG&>Px@*f>=wlDsIMh?wOsKn|`1U#9XR)1wlUCiJt>~`wkt_>)$ zvcEC5hDCE~Y*iOewvV!gFm^uTkBt!Mb!0=9WsWYlWk7&D=Hsx$t=a`1c$eGUFZrMa zt%rBfDE+o0ps)ItUk2`uFj%NKToqMhQ|is|;Cv;Pl1ra9Qt;ugyJ^hUx2 z%hPL7_%l_STDxSzXw0lnBY1oQOKJApwY@Oo;+ZG6Ur4Xmu)4TjRF~&ONu; zFG{PNb|>lwET=EJl;)Coo=E%_F5>xk@FJXbjdwU`3>PlQ+FZv;T<_^t3MOs{y!>&B zy*1Q&?H`45Ifc{(n1kTgm6nBRKGrHl7X}dZgG~0w7Ib{f$oWdvq)ey{pg>H++-s0k zX9bHiRmZ^kD#~I6UVg~qN&uNfhwuCG9#AR^83&p~b9y}WNoLZoPnq2%tTu-us90Dh z+2ui%SO5jFKuCMCva=dwE}`la<1kKp`gx1jrkE(ZLWK5izjq zl^YwjG*F11&f?GBkB&bv!Wgz`MNvU&MJT|ws%&tqr`(3h9pi+`{%vlfGFt(T^^D8n z9)F9tLQNg_V@l_4blmg4L>A5b51jZe-NhO0c~3OVxX<@K`x~6OCC`i5sY8D>vJ5t(=_Eu>PJjl!gy$3 zKGOhK3h-I5-XsL5P-so#{W*eE7zgyLd<+U*;+kO2>rH@Hye_)?xP#f-wl7G2`SxLA zD0|^r7{2nwo0-FdG_-#qTajw1*8K$^195j$*n-e9OVr9^sIrO0qgFx`T7dgA1 zy@(Aha;n&pxysMI5Td%MP)43D+}x>|VB6iNJilC82WNI_iFwr-#?z5`mU*D5=8}FV zWc{$*YR1g8r)upeZ3gHF7v1@l2zC3k(2?!8>kA#GPg1rFP;;d;03_0|bePW8v{h~C;EFrc6#3`EeE zNkwp-bZ;d(Caum`_=w0_fHE%o=e$Uxi>cJ#?QkH1?Qm^MUKd}S9-XphUmDX!5cXtw8pT>2xSkUp) z=-r1uB5k7%jG)lgevHL%Z$iu!b~dOCb56zQhaOy8Y-s&e*oeol;(Mdd{8#e-tVc-s$`Fmh;<4TdLq1$c8Wa5W)yb&U)Ht zG@9WMK;*~%P1;Joy@!83pDoTrTZ&1uin*_`{-KJ7gEDU>$NCYK+(N8@awPK?nknJr zdXh~3Gj&H}_VEkzzeF02#eqU?m^cm+Z@-OC5&puz!%a8w{}@9d`cHX0|2QK~9*E*F zfO?rHoe%I323$3Hl4-->pT!Yc28#y5CHkaZ8O-C1#Cv8^K&u^SvPrLiLHrpg>i+NL zSRZlG58Ot~**R^<5(snBOx*~@F+L#C?kwzx5SLE0s&>hm!Hgu2(a$Q^Oesp*OTs-p}`#=1Odc2%9J_@aR$G7~cqZ){5jN79 z_4TeVQ~6+}+$w#bWpO0Ay8F#XN@@V;?aRo>cp6DhNMIAOh5*ga&lgC=@&~*SJg!cD 
zdhhVMG-W;n%cr}%KHY}F&h0=A#F0E-Z1qJ&Awn5uWM*cjr7fr~aOo6j@5;r$7**Z9o$y1i0 z#zYNwb$2&tb}#_?AA_ObHU}3fbg+ntU;bt~ZaeXUK&0`3``MneO=zvHtutUHt;PJF zAaZzI{DCMOM#XIYq7Oo`QuXy}u9Rfy@|LyQVV#`p>{oz&dUtmR+p%zBe8)&sQBi@d z@?D;0=hDk*XJuuj{7VGTOu2F2larGJ3=D&GrJt~oT3=;ITU#5gQjWH^_QTVY&1dmE zGP8zkkSOCR?1_lN;Hb*1LjA$hM4PKYqW`l9of#*Q0sWFLCCbxop zNLE%BaIb;J2S`v|eZ3A-N^I=W6amoaI5jmj;>pg=UaXMG?Qx>b$XI@Jmim+ZT5*E!I0u8t%JPxZv2~?dO9b)3*V1>*kptO^K!*X_h z9*7`S+LtkIZ-MX6M&ja7#GYGDEkKjUIZ*hx*5;0c%e2^NtEH~~!)32Bge>r-wz-*y zipp%I$(|w5#>R%<=cYuTW#5UHmzaf-5y=DS(7=TiA2$ioW=awBjvQzxDuO``1v-1< z$v)*|WRz)!wY0PV7CTTA0Nm7-$;rv3rN^&m!Vz(C0k5|U5FlVwfw`FTDX`1s;bxdN zcPxWTjso1?{=8Ug1_cEr9ZQ&;n3%5(gN2RF>9{Tk60x(hQ&pV?#=KO$j4~I9B4^8u z$3PPaFR{aHi3$P&g6H}ATz)o}?L3{|W*^!oLH8r<_`$E=UhY;Lfxz)&<$8WzlQcdx zCkItJaWI~2a%iZgw)XPwuFL%x_ee%6{7RLM30ICXp`)Y2_;?CiEbCKXPLFYxusjgM}4C=k&Sx5-E|G zo_@Hq0~LrN*px@;MOSIB0|c>u;>_El6?-}pI{brQSJ7U*+A{XnTt!7id}N>7hb z63|%EtRhW#bKYb9oZs8qySZu1gi8brxYY#l0}XLdPfFgZlJb|fbM(;`=N-3P1Q~gG zc{w>odU}<#snu53q@<)JyF1_WPOww**w`4*WrGCj2xXP^|=Zn+^+g0j-6+fYun47q2=i5EAzXsCMw?-qu%I>%sOdfH_$ zj-*_(a%5;INbeAbQIpA_?FI;Qz_YRI#H%|3aY{u2)$(38zm4;W9vgABmW7Ea6bcMH zfAopjLcW?G$Uwi2f!IH}9~4XU#`y9nSVUQeHvg)HUEjdq)ALC^ka6feGhiTr#zqbG zMX@|(I&6jxc+f}3^$s<#qP2_G-U86(ty6Dl(CQ*NC9UNKlPm|@b#`*{vK3Ay{ICgy z83~QnHM(~Nj4Xm;@t4r@@^W8aU+H+#%9oSbQZ*zbB%E70DJdzVOZ5@aN@V|PaWS!h zSfbRlGY^&STY2grv&FasW1U?)X}IygAk+S(e3FM_13+}!i)>u-PZ#3>UZ#PWc~`^V>J zASw4!_-bm-A50a%K=v#c1sdq;j_hB5{rVLrI13Vw)lksu!dP6q4@fla?d?F}ck=ra zFls>CdsA0e&g4FDLFG4oAP_KJs;a7h7>yArp;~&=^A!%r5*fVCl+pbz?(Y9|{R5TY zh=_=gkdT{;b;ZShfdhW{@BujErlzK-AgqCG0Z&v?L2Vr!pfenZOb{Tmff!jOB~}&| z2RpkCAV^i{)O!i812>pV&~ zIX@{$O;akoNK97K(pWn{j9!@O`sI)07V*q~JOdEE!LPt;UK2wzm_x|SQmjTsY z7{(h`?&Ly_tezfGw&Fq^*m(2147E#ZM`h*s`uchaD?7+bFlX2tbtnO7RjFY`#O}kz zM(iIMvH9s=*yW4_{ERrF1{o%hn4Fwk%GF4xj7$gs!K{NYk9l~7JdLfRqqOITJp3@j zLV8M$moZr3Tvl{=Wlu3SVq+5%(IQo@@vjgPin@|$jiQ%`P$A?7Pwek##3ercXMy{v Rzz1A_WF-|Ps>F;#{vV?@oK*k- literal 17084 
zcmb`vb9iN6x9_=QR-9B-tcq=`V%w^?V%s(1y)?%h<2yfNj2$K~EA|N<8y*A#eUcCtRs?~-z5x3tu;9So`|fcs;2)f=xP}7= zgwFZTFIY^EDi{bv2$B#MRCZlC({a&OQhVWNPel|J@~!IqMXLp7mldi~O3ZOpzOYc@ zRbC&hr=_M2>awahP^EMo*eF3I>ls0S%3)2}vQJ6&(l;bs|n7-$v6_{2ELs{sPgPCpT}y z`A10MPf#qGBE|q6Or403UyoU|Fj;&OolePPv-{o24=n+QgE2c>Ti?pJw>NY&G->&v z1ZuS%+xBh|d;6N&+M7$|1(cO?cQG_r$ol&F(Ik2pkmY*I-9Qvh??T+n>@2(G(iX5C z9yZ!;_edqtC1=GRPNc6^8^R!kNXg2Q#fdvPIrS=OeE->a{W<%!u&=N0_V#vaN}l#u zPnE)lDNcN?)92%Sy;TL=E*@=WeEjV3YF8?eX1Zc?JFAkYS}P5UAR;`x#eQGX$jC@n zH)Dq^bFxsexVI$3uyz?2DMXrVb$OYUmGyXjk$@#IB*fI%`03?k*Ontp*dRp+yOQx} z(K@6|WTnl`L0lXbq(AbTax{?^r-3!Cvhw2oY?TO>I8NN-{%>_{Eps;(C9tuwVuH5L z&odGegZm9a$~%HbtmnDWlBJPgVdY^N&F3gY`q|bn(DCsr9-Yy`AgPDva4DIX6gdYB ztJQmFH~)vq33?PCFi)f7Z*lGd+B2c4^pp z{Ccb6%;n>Pl9H2!@>N#CYja%$#Y*vTL)y%4eDV4m;yhtuxBUb)q}R*ShvYNp7dO2! z$?HG(yZR&ZdU~z2G&I_EX4A-Gm0yP)@0|0fn`v9U=|x^gk*&fR?DP~V;ftl;?=wQ6 z7tb{{3+wj5V6kK6Np;$oKU2;s+&?v%y8VeG`MT?z30cc=HCdlu`**IN#4BI!cn z znPs8ZSi;yl66!ofs}p|?H%ghA9pWf~!fJEu`jPF1o~CSV-y&=~H23)n?2-mLw2yH^ zhdr9@m*8(;ZIJ>Cjxc-T3(B63gy}+k@ zIN~v%3@ra#SYgFiwUCg-o|{Qs+vns(Iz(RAgMs>>n7vmq7#XEyIw|<(oSkaOQ$;km z7pdn~M~wEh+19-#AjS05P6|Z$oEPX5xQVP=n*2nNc6yUe6v>;@+Six99a-^qN>#2s z$l7l9U5wq+xUO8OsWOAQ#xr;r)7-8EYb8@SS976tMOTwvw@#VRmU-kXhZQGie|yP) z>y8SsqTw1|NO8S(hwwZ!5!Cvf8MP8_d#v2J=)6BEML;k^;7ZWW%9V0 zB;w41`wS88nke&@P);;x@N=ZSa8bW!{!u?=s!8`g%JN+>P;9yb3#%))(#Xl9W?4hR zYA%Hz5e@vAy`=GV4OfOxTWuq@o~jp-K?`izJJzjum}}&xsIEFPa1bFZ#q(6SMKZ4T zs7=)68Q#Y?R0dn$cP2187?5ux^WD?o_i!&J6wuU-{K#r@NzvA>|JQ&ut-oRuhl@!L zn#7{7xw&>k4fL4^L}o3p9E#6p1I_4ZCQ(23lU$}~Y$M28W5bp-4An1Lg91|tE+9cm zA7Vl6pX?O;NBbBK{#HjrVd5Q>g4CRtX&)q4bpBw?=HhK}(6YszU&!JR7@V5hY#$3J zV|3$g3*m78QrM(g^QyC4=edyB9?RoUjS*)XlFF?cDsH;B%hsK+`p1>B_cH@F=iL^+ z5@%fkH9>Qm&qgP*;CQE1mr`}!Kqi>4*sQ+NDY9jUf*P~!RGN&Mi88X*Gz1gJH`6My zngD}?64sm$L41m8IvexNJbrqJL*%f2(=QjPL*44?LJJe4s{uDF)|RK>CuMxJy%0)O z&Q>!|^|K&2X(G0dlgVfgoDFliMtJ7P2~JZD^q}LNMI6icID!b1jxZP~^qYNs4hZC^ zoooPaTf5j~CoV?s4-?IzVvr$=A-s1xlNJ+gKe8_n5BN)IpPPu9a-4a`JBPeJaqeBO 
z6Cf@$oAqbN-s&qv{{F|og!zu8)mP#~$ucjz$|>TGM&?{eY)j{+RCq^Y5Dd)iEkpE$ z6!V!Vw}XNB^icI9Est4x$SYrn6L7E~yc*Wj?ddYR*+{%Zo|eB1JF0ziJ8CC@164U! z6rM-WJ5{dk^#188(--Lno;uLdORf;LfeznK*#R5yrg-t*%UyOs+oB=@_)_9#C9+^iMk>k(PZQ%<6(jW7AI@MWGbKpH z>Z2Xu#a=CvF=|hSlH(rCaKGN9s~=;J3??vg+KV&meT+u5ys08T7(<5^I=;`)S!wUp z+QrE!dB{1%`!3Q>v%>&GJf2}`*;-eK-l;`EoG`4_Ox$~KwzP_po{Xic%){USJenN@ z3kfw9pYJF6rP;6oJqbg`SRla4ndUqTtvOCRoXKYi;K4z`&&B=64n`44wtydnve?=4 zjN$FR2QJcnP9j;~TzC&nAZ|&{mpyv{EkGhf9*BPzueCP+!)zKI5CGwMc5#sz8>{ax zRH5CB7IOND?pJ(#dU|?DVGSTO=DBC&XUbB){P%=dfoQF@zvGQCvE9e}TdT{(IgWE` zbgZ?t^-qz62cz=KAz?CJA`%@4*V1f8tze;2g7aVS~K4;FMr?+{w+V~wsm^kkK@3Nt( zsi{A$BCwg8Q-QkH*Li7aKSlH-X?cB0SO-_i)gT~D;r#R>Yopo`Uqfq87!HSwtmerb zChVVJWFC$2^=mQiZnZiMvwfadAv?PR_HQ3-ZH(_`jT|!R4Ldsq{gW>b7s#vK!D(qe z(Sn%pK~GOlWE6V=Fvu8-^Yin)sv5wtM?7fo{~C%P*i#;cdMYXjt*zW3eL@dJ53$}c z&v+U@UdmHIEIY&Z7Nr^?ARq+z`-4DabcNLl>gq>_hcI$lIK7T`D`IFb_;oOX@-8t( zfc6lNi%%h2^xVwXpb}9~QE>tu_6V+)@3GP7zA zz7bv1EUqH)9Zz|Jm%<5A#S=7#ux1>e9EyK+$@m&}d2f4r(^V11%r# zKxI{-ILF0{`~j4jcB$Jlf&=9TK=?#Za|6ES<{{--#*ks~`K*V{VOo+}El+BFz?raj ztt=z@_ctYNbYe%Yj>4R{7-eIth50oAQ#wni#$U!J(5GY~t5v^GJ!I1A%vjsJ-wl1+S z33wyg$YS9UTr=1*nBmPuG>yM4lxOFA<5&`~6>(3^`R7@#nhHuUWVmq*EK$BUZ)l_> zy3lrFf(YaNg4wdqpOB4=RKK#GjOazA3B{EKtukN2_s?Y&uZCqEBRL9TX+%3YfF%{W z=TIRbjHhnp*~ia*^Z#DZ(IwC|k;3puU3-@h@7v2jqesQ(K$3j!iwkEL?Q|m|O}IXt zxVJAX@%0c^iDcm48b<+-Q^2$6?8HcpEuzM`LanGVVR)q3vgIWWF1n*)PA0+fLaF}~i(OHwiT${rExD&QbKi}QVj^D7K zYvrEwNu|O_$6IkX1ttzDR+FUK95e5E~q13<_d3eOe;Pdg;psUDOzR$yI^be*;>0 z@)PmcxMC;JB?m)p6TidgNfYbW31b9+>g}W93;?}S)nR6oriraut!(yZ4ElN&YuhVZ_QvhWFtQ1y|m5_gejXxObe_hE(%-+LRKbXTuN9bYJ9Vy{0mCO%uGvv ztQ|B))Wb*L;a6L3h84cZ@&tY4dD3DG-)Fd%1R3JlX$tNBsOyy3tdF79p7z+pni=Bw z!|)0W(^-y>$t?eChp=7xTVU#2poL7o%jG!gtDt1m;=Ov+9tq=BF%03w|Mm9A^$&wJ(yV0%v ziE4w1Fc6LRyTqm-Aum%YD6dB&C5Ck{=&&x?E|_IGC4%rpVa;ufv&Sm z!{q-}HpY%Y^w2@&SBTpgCws=Rwe%erbmU64h7n}*!FI#Z)&PS8PxFsmVuHM;gC7Zh zU1xo!jXw8%GCO4Qfn5E@6?OUcJe6#yZc<3k7U6jvPXSqqx#YO5^AH?~v^*R3vKU?V z^Q6LZZ!M2fQwwFb}FNDz${1{4(Tgt+XbsYNK3)Z&KjZ=XqokFxk 
zrs|8?!Iy2w5Pn{vm8ejI9ku9N3v+!A0|HS<2?caB z?iAL4hz|y1G2-}P0L`epYPY-JRncZ~Ivj9taC9vKaT^W}4v3H=3DiG&&_DwJzayWN zM=vl?SXkKc@o{!`w&qOdEzL3rL>$-T^YOmk>hf^bnex9D9u}W5F`f2D5{8C`+S+*f zW%x4X@)bi=cn3{tIdPGar5kE%EzHel_{`XpVUdw1M@Ft69tO8<(Zc#zf>?A7rz+Qx zG$QKUZVqB&Q9xb4DU}jwwb(2z{yTc4#D$b885iYplCDT0SCo>9O^LeOM7FnMHkI}5 z_vk+5Dlz6FJHkhc#$%H1|3;`NIsb=Hse7Hv>+p^+fIY)1ES^=>enpO3czbyWIn)0x zD;vF|Nr7>2c$k_Mi;T}BudSx0hOA}$DSJKV1|WKbzJ3{*CJft@VZMy4CV%||sN!+P zT8u4ZzSQv+3dPk0kTd?(sNEQ~w4Ck#NdP*Qszur?w#2{{uf*kGRaeBNcKQ_o=dS#J zD3iy_ZMU1xxXsEgwOLlauXzA)nw!3duW-u?q@kga78h@_-S!uTe)H^jcK2QdEr$XV z51r&p+E&Z4)4A1ZwIMV#v{zz>|VLAT=~%I1cv3 z)p?GMGQQO~-e5+(Hg*h&C2qS;^cWMo-(qNRI!HBh`_QwJkeM{?+(hUZ{!0Z>;Xs8( zfbWhP?hD1uMPk$*a`Y^~<+e-Zg`JO7Wvnh_5`5jg0X_+co#eQVNYj>7Z<^z+FhIwU zBt_MSmbObd78b2**Uijr%s!xa<=GyXkN`wUM#&{;TAj3mQ|Sjllz3*LK)?bR5M7Sd z_)Ms$MpED)VD>NR$vbX3eA%F7u#az;Ydha{{l zqX6)6XF_e=uSey7*E8GfOT<|FpuLR#ls5>?$Ts%Cwt15~Py9b2q(Dwl z05Wu1h5sqC_NMuN$B!U1iTn1$ko8KW()aA|opb%c&RcIsye=RgWL_sMC;bcYe2iHk zKx==8(R-6W6R?!#O~)a;M^T;@3Qyp<{^YChYT8dSZpl;rk%nwcpe$j#`vWIJwx*se zqiqAU*{m^DaGzz>0O9uEW=LEme+!+G6OACF@fP=;s7BnQ+eKM*aCb_6l~~{~enCMO zukxUXNzSN9}F(C0~NYKCo%`zQX(Dr}Jobg6D z8v!T*)qxZdat&_?GYWekXkUOT!IXFB_retj1d!RTQ3M-PNew)`i1`M44~%aNfg;&D zvc=N6s)I_QF88Xtqb2mh(x3gTF#Pc;QDdsJQU5FmgFujAA_SUtwDAEJPvxi`?Yo_% z;Lu1@dSqSuAkd|A0pEwGcy%=*ILIAe(v4dQ6Vp?h!7$b%;%(^Z_1RulsxQ>Xk%g}B ze&2xzQbr9~OmVdd0yJa8K9Rn~Vu48A*%n~pr^T`yB;7ZG-RoaTNxJsxN)GA*9N1s0 z&ktBD+s`R<*A3flxfKrgD0rN&~D=?|7*#o$X#!2V~CGKT~@?lm`W#0@N=&6 z{_V_wHq19pa^Il1O^jyMW#_m9U2dD7QV&6-rky-v=O4VJAXUpZGdiM;J3K1!>Bba6 z?RKLvh=mF1ylsQdytBOS=(zVn3S)6ut_k=7uq%vS**)nx$i--BSL|i=Hhp@L*i=TT zv1W|Cf>q#Opb49d1;82rKSl!s(gDx1%EV?Z&q2;9M)N;y z5nzOX+T{8!XCF;2itTVs?+bf`OuzVd7oT;A!S_`M^s|LfAoqotd#o6Ph- z5J|LfB0wY=f7e$(7$?f`{0HB{LH*}h*Me5-jL%`d^B!~7?vW5R;L|??@ob)JHH`FC zy9}}}7DEWNHi6uTPY3ume1CX2e;p;bM&>_2u6c~$4-UYUE*;q>KoBXM!K^Ep-Y24O zph*zyO*N8%Rzai-De9R0T{OB=slpaY>CdKB)*Bi7SNAl? 
z0do0Y$8sGa!vPfK`Nd(EI1?#j$rlPfz##<<3xYP~FSs6#vwJiqrfI=ly5=kt1jnO9 zIk37kh6wZhHm%n)zZT3YuWDi-QB<=v0@a-(fE+6P@4V9gDGZPPiu2$j(~>Y#!7J;1 zS3xStb$&x@8{WU(&VthkJr8c97!wjlr-V05g1}zM7Wly;hAouhq|}?n>41yt{<#y# zospB0{>h4^R;%dSb`OZy+cPpXJ=`uXBdXK7i)Q)Fa!M8qWbi)WumKwK@#%?HyXkT! zUkXr>%?^jof0z7vw*f*WL8_>uvkoK=r^2yK!UJoY{iYUXq$N zz>(@+mA8{RH69nodpcR(luQ3yUfT9I0+&#iC^!(0i;Ej!#ER#j89QfT3Q-X{tu;_b_2NHKSR{R z;ycR+>N&ij{QWuFk<0V(vL^^3pUG_k-TEi(_)?z(u;a@zkyO5-imZ?@UTb zGN{;VK-?3?r>)mnAR(0~M%J-rgDyPk9!nV!_4|4(Me*xgjvYRG-~*2$ab)kAb7u0e zG^+15MQcQ4!8fuzK}Yn)6SA}fDdS6bgE$m{r<&!z1Y5b;-XYI6;Jg-ZR{w>o-0#tA z?eYfL(q-!mxMP1HLS_Ja7fBVWhhNjEMm508fi~BgFq-s`ts<$5} zwN3rcS^#(vMF)lD-fLL^pzD?$DAw~uVGOp=WttNg&Dl2Sht5cx{ac#Zbt8fV_L)0` zsTSYV7f#5Gata1nb=-%OwMOo2^O}}42(_{iIrEsBS9S?eS#)@DC1wR<#qG+F_x45w z2innCqYBeMF|vhbm$m&DJ$918U#aCE`ah-i?h)IB7+KA@V9^9;@s0v%YH&{v5OA?l z*t;e6$qJ<)L!`a@{Cv|{e_X1WoKlm6h5S%Q>l%eMl5r7;4&)eYdHOHHsu=Y@h1EI7 z#iRV{{BVXPmhqW(rMwMwe1gNW;{zu;u9r@6D%R7IMyK+5VAC=B73*4SkrmwNPtU@ z0*_!u3keWI!1`F~D0p#p;Nuqm&85qbIKLR}&lNo*H*S zq5oS#m35f?yOCK%p&KBa(9MlYF=>;P^oMoAk&j|xCc@HOqy-VaBt-qy)~Y%xth7)6 zUkH^Vj`~+r%{^u9mnM1o!v-V%?Y7jNuLS1Y{2LNy;_UO_rVG)5S;r>|C@Dimu{-LE z!pYYDmrnU$S=c9awjvi^TH++TTN9^eIrpm~XW^aRk6vLrTrY6!FK~+~;TKz_0X>jc!$9s7s5vm);7?DXcGs~u!c3n9lV;@$AL zC!PBf0XtLb5>(*B$o?z8MglHUB`t@o=r$5alX3zMib#?%_Y@_rOlxIEt1ZGab>0a| zuz_RY)<5MNR8XOY=G)YruF>8O9S?ZW2jT5-xzt)ohp%)lq`#r5d^!;~!CJg)UslNo z$W@Cgt-k|%m}Psx zamHhoze#7&?oe{!Rj9VvuW&ps#>bQq=)C(GI9bMfljxaV@54TN@``MG=}$WP;t4NB zHP=!K^!Yp3LqeZM1~qP0Q|l9U=oON4zSwGXS|uoJOydozk=Nr&`i(y|d_+87BN#7> zs1Un_$WD6SBf8Nn)pHtJKAZF5`srIO&Z;6~uo4D#qtF=fdcsB|N%g<&6*K2C9as8X zaA!Ip&ezmbfPs{9584x}L`4=k+S|wV9g~pFVdkwO@-c6MYM5_E^ck_BI9A{uO;9) zKDZC(eh)RfBeTu1 zB;}&&bS^pV@p=(FnHnh59jfdxijsqY{umxyQ{7vfC*1S=NBATl1Xer0Y z`zLr^I4~7A)ymANF6w;b71l@Nze~zl(Eb$xfrG5fi#(4X4?zdDL0L#bx1sSKp+v7<%G$M zwkZd{DkWm2sxR6eYBy&)ohtSg+icjfU33XuyaccT0l#~oVFQAyI1s~5TJPm=?ly6l zI~#AKs*%D!4k21K6VMFzYQPk@+3V(y7&W1?s=F(fRykbYxVRpzWE;-Wikph43IW~4 
zpreICp{o2`d#`DDgbli^lqhAcm1A?j!Ek})Ffb5?2W3^av$fs^OFo-@NHiqggM9ff z-ZmPuXolH<_`+%2=t#djUc9-Nuqn~$QPf^)Y;|4X@DQU&OS6vW`D}HY$MiGHLwJ+F z{q5uc>dNM{LMOZF#eQRXscRTf>IFDb*Xs`v1{|OFS(jPoV@DXoGHUJ{Idn@W1LFFU zu0tt>gXI*pjr4NXFTOr%cSUtHFu9hGN6pPOb1mxI8cp0YPHGHD4X8+~b{A;Sl=}n5 z_-Uqnmb+NP#!D|1VYGg~WxE$RNFa+F9G1BCQR_Lz&F>ilB&%BhHH!ZI@4A?8V>L{y zcRa2Ny(;d?)$?yB4x0DJyNe6{jux(;9{aU3Cf0^kgA@TyysYLMAvX_SBo?tCC50aU z!Vy(cZDL5c(;dETpKsmVN75SYAiU6*6-OTDVnJ4ud`$4>wr9m=irH7+Knm!tDuRJ1 zO6koH*qkp{k#*O!GpE`@P)5h>o_Mt~u97COs*jl8-a{`j2hfXAuXg+0x>id3j| z{Cq5-lYJhx3tqTJ(F6jSU6@;6m{V5Lcj$Ot&5dUwE;v1}eCo*P zf`su3mg5gX#Pk8nQ_IAF&jq!iuD`_Ba+`)v&wF++!?9w-t5CurA#oLGP>Xqr%1_YX z7``APMW_|$K=xFWA~P>cO1#8fx-}j_jr>T~zIQoK9^S6?X}fg0(8umA&o0LsSCT4Qdx~a@c|3ojNrQP88aOb;3i6dT zF4*K{A#qng;jX!$F*BJMPMBrabG4QJk!IhlO}0oWiyuo8sJa-!eC*H=1`rpt9o1l?t`e?_pAN3 zS`Y|4<5qG{o37Hz@^TJAV-TLIcI|Jwn)K$}l;&GzEdgV>`e;GNWKQdQx<^Z^*p~Ul z{CY39`<+0i-l>2~@)xd~)!IzQ%T<+;~_&l5>>b}l5`)6p`p3mb+?sgB4-=?u5V z5fA(OXYK)eoysQ7tJMCLzVlpQtlGtIZF{4p>LWTBCg-C0dMLk|>-llM2|wx%cT78- z+M|j)D;0ly-G%tmT`}2B;4QI9$J;jcr%?aR2${{fbiz1M{l6M($0Ju4}_te?)cMY<3(*ph)bKKOHrf32p=nFj?2#+%%Ba*srqm1Z6erb zIC`6!`HvbK^1m=@AJ^VUQf3CBr3*A~XWDco-VCL;_I{mcKOUa#L`ag{8r!-OqFjf% zD%!A?u^@hn9}Oh-BMuyLx94Rf&c6tiv~j5_#;_KcFvWYhPp_Tia0*WX3ZlPO&`BdV zUG@FA;*%9LCuxjWCQp*p_;5KtX1G{tyXNklii0jXg}rD>j26{(fdOST72D;K<%9}T|3*6z1D|MtHTO}Z^uzT9h+Xf!hghz z_cw;qLsyev`b6EQ!t$4Lyh%M_3kbOV4id_t#E-oEy2_sTNDh-f_}D`ymq>sM0nm7L za4<1te79CzTWjr0mj2Izpc>bW=Z|!U=O{`XDr|z{IIAxL@f9k{WmECt6%HfT%bL|k zb-DnE+`Re%o2#%$Plbc30|5eG>&4WC1U0>1ro}kK+JA^K_PH<~DhoYYUfukpfIZ5y z_&Z48Q)Z}8GB)fC-!#c_R~fAHSaN5T#myXvf98oDNP0nu>c-%GO4FlI{}tg1yW|L{nV9yPV7oqmuUxbx5LGN{B~nb?T2WE@}qu z9vnA$&^T4eNHi`ZQb2c^r&C0ul7NcRPYY9YP}jtYoxu@}3k-pnR--Imh%Y&9@~dv3 z9B!O|w{raAlqzeFv60EW1l6#;>fasx7z&&cTHLfEQ9;ol!p?*7r?aSuo`q6$z*F$fLP9{M_;(Hmh_QDyd;e-q-K}@|%Iqp#Ck-kooi=aJn9{a z$$J_eXFZQTF$A3J3nTA8dvi^!0~(dKnqNZt6|d@wo^s!I&dSm$uo7dQZ*Tsws#=v6 z>)gkaF)SIWz%q(kI>8WWI#z)=6kjtVi!*9`<*ofMRptj`7Vq4PDLqFRwvM>zfj=Y~ 
zoPW6US$XC;Yl2YQQtP~=^>a>aV}q?QAOaJ7w8e(U{r0k)af0F;KY^-9R~_M+T4D@R zn<{`>AgV-45(c8xZX5O{qkVyjB%rnxmIIuJziYqoSf$2xTvI#Uuus9nuWw$IeW;x>3<80#^!IySwabW|Z$YSK2v4>N= zu!u&V#$)*K40$4qBdUy&8sDe6sx`ar0-qE&;rwpF$Ba(_S=H5$)-1#3Z+?he`$G%K zJjvJ$JGPu5hwNLU@{mo-HNs!bL~q8lv`+0`n>OtZHMvJEgE$_47%7cEbbjs=gw$ts zzc4)qh2&;Z{?0o-)ojjkbRu;LK?MaNWofM(V~}8Onl-zPvAN&(BJV=h_w$z5r`!{& z5WJDvo^pbG>sm{%7m<&)mB2u4;o2ZUmQ00Dhn+#pq^*yw6YIZP*${{F(&f&h*W5@R z+NZ)Z7nlqWh15OIj#+-;X${KRR^JKS71;WE&Ua*+^?t4q7`jN9w77 zKu4aQYGfx1bubbI82h8f*e~R_muh+bNUPHl4UCj**xg?Mt4&W-O{~YDO6#*%1Xd;n5nIn}4 z=^8v3=!*igdn1t`y{yS8H1USP;VFMEV`u@k1F(aBg+*V-!X-O&?;J;Y`!g3ypHZC# zdbO`yY!t(KC*C0G)ndjx;g^c&CY*-U=9h&%b{gC^LYhRiI2h2D8fGjmkEIa#E^RVW zd}6}}=kRzZEGRatg%(!~<*O`Y3rXB|U+ACUT*fA-q;EP_3*A?)Qd(^ZI%3nTaG4}U zUVFb@`GWO-?6Zr`O%r?G&pExTmY8y6NM8IbS7`0hJ`1Elgb!n5bWDsd=f)SCvhcZ; zsytmpU!+z=dsm`F3N)_`TIEQKNDwEabjG$2+wCWCd0zOgPikzm$w>T(cQ%^X4y3~4U zy|2Ad$ci>PiM@w~oDXU%@vo>6+`sOcwfw2OO+mm6w8nvs)~%yT5YPT8!sY@>fd8G4 zQ5VRXrHmh*D)M3=N|2lu`of}L%|$EB)^A=S&(E(Shx^4Ty@03JQWU$kws2C{?Esjlk@;g$U^d0~OtUYlp#k^yE7cp$M*} zAD%-cJ~0Q+nYYE|87yw$Jo@AoQzCV zRP^LUu~_-}Xgas6OMo`U%)OHGJ1N?yIuWMsEFc}^l<9S!OqanB+X%GReMhp_FYQdmnH8t&(cm4N;|x0`YH!6+P;yOa5+n?vdW!_4Vslpcl`=((*60si^1xC?5)>uB8=&&3dv_t$W>^>HYlk zYNuawMlP9YALux|yDJ%liNxc+pUxHA-`|IBV<9IeXJo7{E*|!MNMV|D_w$Z6^1l%Q zs=z;va$}yY>glwbfZJAAS9f)H2M4LsU;&k5b#=Im27N%s;dby)PfySEbevFjtT-ty z(&5QT3-GwW1_6WA6;r8EO9o^(ffof7mVrP#_#X>eHpNy1_V)G#a+wuXA^~86C^QU_ znAECiJZ_3o?$}W#riBN>d`!t>2GxijKaGs`+C3iro}GPitz7Q@tH!3Tp+T8Lm@GZ8 zd$l#tSXd}FjdHQs*?AGLzP>)0!K)Ol`Q+JA5Ud3SbXv+~^3_*WUEbZz{|*+)?h4=m zdNzS3$dMZ3QDwBTO#V)rODkhzR-27BgwJ$EpZXGV5whirh^;(4xRXjVeng9t{u(yg z2h}t*U`+^G4VxifoSvS}J2$;NV+M4~JHkAAc4*TjjqZc{$sXF>JiP`7LxX&U5{gK^ zBJ}t7Gn+QnuG1qUBY(~oMB#BSjEqq6^6Jd40#C~Mb0!aX zqwg0!AE^vKgn-Ti5096V`BG+PW_sSgr(flQj4eSeRMKuU!V7@K?1;bHH?k- zSL!WPQaWx9CkgnyUnluLv|F96BqSs@)iQx%$X;ZA&)cKv=etvzwWjR$-#}OB;n|sv zmKFj?TuSQZ>S}mshyzG%hoO_}dYsjFR6IT%W{Qc5dSCT_2KiQYzCQz2eRp^FkG*!D 
zFhKoz==kmH>kG^;~Y_!{Kb~v3bjA!!uELP|!C@731G{Sp7Zii%KWB@M*7>VN7 z;_B)vz)RRSr}b`c*){>qxxEv!YincrMVyEz5&eo3^izM>*x7sins*4xLwYH z8v@EEzmbxzwYtbfI4_iGXxdO`$N&u)B9ZS!TsGZsOBIiFKlbWeUD$ML_$OiBs2pI11*e{nR#|_5aylO z)$%**rM%{94PeFFF4H{##r6G*~zo7%U$5mXBn(zTJu0YjW~tEf*-GfL%Fe!ih^uOG`;n)6uE^ znFb0ufclGN%X{vNwp`l;V5S071}#iD$O@0&+jDmyDy2q696=BTE8?Gs0GJOLDEwk3 zP>$~j=+UuUsS6V(F4ybSm69ky>;}51u_E$^IyXPxLE%?s=nTxv*bheI0q<;NXvkOV z7;t?wIxV*w?H=T^XImBt#HRA0-PrnEaF&@N3m{6coM?g{O;^)W6bsJygFD zfV%vl5Z>Q#tL%DxfO8<@{W)ESbTMG4s#PU7N2z*|H^NB=RHD4BoJ z`O#5Z>mVsPLf`qSsH|)R9T%r7h9yOcCi9*5yFU;xE;?Q++S=9tFS)Yf{&2nym=a(h z3=9mkwB>nufrGpK0|P6os{~w*;-qK_VP3&@0LfDQ=X(JtF!=fTeSAKEkj!D!>G^aG z+!Yjfx0;@pilQP=6%r5-fa#Y8{+V8v$MsU*%j>PybP|Y0uWxS&AJ3!oooT!t4HXs9 zlZPH29;LUQAP`_K6%`d)4VI{3!ivSW+r1$`lt|&QBaY~|cXDdC+Z_O0&-Kj>1UUHC zQcX+CGcer1eFL$xp`jt%7kvPz!3Ii>)fE(o;>3l%e+L5@4n#`G$1o7#kBa3G5@l!w(tRJ32bb$jAWoIPw%Ul$5|MRIAhlOe!Znes+H} zNk>O#bYw(QO3LTutTCP2g^QC@2dFFtn)?AKx3RI|?d{#s(V^%4WbESd0Nh5hbiuVg z``?i8q-Y@gclkj89%b9U+0M?6|MjxB-eM8xFf%hd0=yWVZrjqv20jMH@8sm$vDH5@0@%)+l`+LAqF4vm2HZ}eEy*g#aGEXGgUJ#t8SbYD) zswC>+(RvLIn=HMsv0-gt!TpDcg^f*BRkfw5=}RxumdW_aO3UVNgX*T%R-AKeO7_Uh zOdw|7OKWOr9Xqxp@99Y-xl{A@yKr|%CH^fUJ>q|qHJOyw< zh%=JAC@uM{FRJnxXbfv{Wo2d3<;5a?22=XR#`(@S%|zTr#!MPoS~xA#aQzIo#4@PK z+oo)QNAVNzZ9xy;1_4cIR0$ Core connections +// ============================================================================ + +module connectCoresToDCache#( + Vector#(`CoresPerDCache, DCacheClient) clients, + DCache dcache) (); + + // Connect requests + function getDCacheReqOut(client) = client.dcacheReqOut; + let dcacheReqs <- mkMergeTree(Fair, + mkUGShiftQueue1(QueueOptFmax), + map(getDCacheReqOut, clients)); + connectUsing(mkUGQueue, dcacheReqs, dcache.reqIn); + + // Connect responses + function Bit#(`LogCoresPerDCache) getDCacheRespKey(DCacheResp resp) = + truncateLSB(resp.id); + function getDCacheRespIn(client) = client.dcacheRespIn; + let dcacheResps <- mkResponseDistributor( + getDCacheRespKey, + 
mkUGShiftQueue1(QueueOptFmax), + map(getDCacheRespIn, clients)); + connectDirect(dcache.respOut, dcacheResps); + + // Connect performance-counter wires + rule connectPerfCountWires; + clients[0].incMissCount(dcache.incMissCount); + clients[0].incHitCount(dcache.incHitCount); + clients[0].incWritebackCount(dcache.incWritebackCount); + for (Integer i = 1; i < `CoresPerDCache; i=i+1) begin + clients[i].incMissCount(False); + clients[i].incHitCount(False); + clients[i].incWritebackCount(False); + end + endrule + +endmodule + +// ============================================================================ +// Off-chip RAM connections +// ============================================================================ + +module connectClientsToOffChipRAM#( + // Data caches + Vector#(`DCachesPerDRAM, DCache) caches, + // Programmable per-board router, reqs and resps + BOut#(DRAMReq) routerReqs, In#(DRAMResp) routerResps, + // Off-chip memory + OffChipRAM ram) (); + + // Connect requests + function getReqOut(cache) = cache.reqOut; + let reqs <- mkMergeTreeB(Fair, + mkUGShiftQueue1(QueueOptFmax), + append(map(getReqOut, caches), + cons(routerReqs, nil))); + connectUsing(mkUGQueue, reqs, ram.reqIn); + + // Connect load responses + function DRAMClientId getRespKey(DRAMResp resp) = resp.id; + function getRespIn(cache) = cache.respIn; + let ramResps <- mkResponseDistributor( + getRespKey, + mkUGShiftQueue2(QueueOptFmax), + append(map(getRespIn, caches), + cons(routerResps, nil))); + connectDirect(ram.respOut, ramResps); + +endmodule + +endpackage diff --git a/rtl/DCache.bsv b/rtl/DCache.bsv index 3162aade..b99d4667 100644 --- a/rtl/DCache.bsv +++ b/rtl/DCache.bsv @@ -496,7 +496,7 @@ module mkDCache#(DCacheId myId) (DCache); // Create memory request DRAMReq memReq; memReq.isStore = !isLoad; - memReq.id = myId; + memReq.id = zeroExtend(myId); memReq.addr = {isLoad ? readLineAddr : writeLineAddr, reqBeat}; memReq.data = isLoad ? 
{?, pack(info)} : dataMem.dataOutA; memReq.burst = isLoad ? `BeatsPerLine : 1; @@ -589,66 +589,6 @@ interface DCacheClient; method Action incWritebackCount(Bool inc); endinterface -// ============================================================================ -// Connections -// ============================================================================ - -module connectCoresToDCache#( - Vector#(`CoresPerDCache, DCacheClient) clients, - DCache dcache) (); - - // Connect requests - function getDCacheReqOut(client) = client.dcacheReqOut; - let dcacheReqs <- mkMergeTree(Fair, - mkUGShiftQueue1(QueueOptFmax), - map(getDCacheReqOut, clients)); - connectUsing(mkUGQueue, dcacheReqs, dcache.reqIn); - - // Connect responses - function Bit#(`LogCoresPerDCache) getDCacheRespKey(DCacheResp resp) = - truncateLSB(resp.id); - function getDCacheRespIn(client) = client.dcacheRespIn; - let dcacheResps <- mkResponseDistributor( - getDCacheRespKey, - mkUGShiftQueue1(QueueOptFmax), - map(getDCacheRespIn, clients)); - connectDirect(dcache.respOut, dcacheResps); - - // Connect performance-counter wires - rule connectPerfCountWires; - clients[0].incMissCount(dcache.incMissCount); - clients[0].incHitCount(dcache.incHitCount); - clients[0].incWritebackCount(dcache.incWritebackCount); - for (Integer i = 1; i < `CoresPerDCache; i=i+1) begin - clients[i].incMissCount(False); - clients[i].incHitCount(False); - clients[i].incWritebackCount(False); - end - endrule - -endmodule - -module connectDCachesToOffChipRAM#( - Vector#(`DCachesPerDRAM, DCache) caches, OffChipRAM ram) (); - - // Connect requests - function getReqOut(cache) = cache.reqOut; - let reqs <- mkMergeTreeB(Fair, - mkUGShiftQueue1(QueueOptFmax), - map(getReqOut, caches)); - connectUsing(mkUGQueue, reqs, ram.reqIn); - - // Connect load responses - function DCacheId getRespKey(DRAMResp resp) = resp.id; - function getRespIn(cache) = cache.respIn; - let ramResps <- mkResponseDistributor( - getRespKey, - mkUGShiftQueue2(QueueOptFmax), - 
map(getRespIn, caches)); - connectDirect(ram.respOut, ramResps); - -endmodule - // ============================================================================ // Dummy cache // ============================================================================ diff --git a/rtl/DE5Top.bsv b/rtl/DE5Top.bsv index 2173526d..0e5672fa 100644 --- a/rtl/DE5Top.bsv +++ b/rtl/DE5Top.bsv @@ -22,6 +22,7 @@ import InstrMem :: *; import NarrowSRAM :: *; import OffChipRAM :: *; import IdleDetector :: *; +import Connections :: *; // ============================================================================ // Interface @@ -114,10 +115,6 @@ module de5Top (DE5Top); for (Integer j = 0; j < `DCachesPerDRAM; j=j+1) connectCoresToDCache(map(dcacheClient, cores[i][j]), dcaches[i][j]); - // Connect data caches to DRAM - for (Integer i = 0; i < `DRAMsPerBoard; i=i+1) - connectDCachesToOffChipRAM(dcaches[i], rams[i]); - // Create FPUs Vector#(`FPUsPerBoard, FPU) fpus; for (Integer i = 0; i < `FPUsPerBoard; i=i+1) @@ -167,13 +164,18 @@ module de5Top (DE5Top); connectCoresToMailbox(map(mailboxClient, cs), mailboxes[y][x]); end - // Create mesh of mailboxes + // Create network-on-chip function MailboxNet mailboxNet(Mailbox mbox) = mbox.net; - ExtNetwork net <- mkMailboxMesh( - debugLink.getBoardId(), - debugLink.linkEnable, - map(map(mailboxNet), mailboxes), - idle); + NoC noc <- mkNoC( + debugLink.getBoardId(), + debugLink.linkEnable, + map(map(mailboxNet), mailboxes), + idle); + + // Connections to off-chip RAMs + for (Integer i = 0; i < `DRAMsPerBoard; i=i+1) + connectClientsToOffChipRAM(dcaches[i], + noc.dramReqs[i], noc.dramResps[i], rams[i]); // Set board ids rule setBoardIds; @@ -199,10 +201,10 @@ module de5Top (DE5Top); interface dramIfcs = map(getDRAMExtIfc, rams); interface sramIfcs = concat(map(getSRAMExtIfcs, rams)); interface jtagIfc = debugLink.jtagAvalon; - interface northMac = net.north; - interface southMac = net.south; - interface eastMac = net.east; - interface westMac = 
net.west; + interface northMac = noc.north; + interface southMac = noc.south; + interface eastMac = noc.east; + interface westMac = noc.west; method Action setBoardId(Bit#(4) id); localBoardId <= id; endmethod diff --git a/rtl/DRAM.bsv b/rtl/DRAM.bsv index b9bab54e..e5d4a33e 100644 --- a/rtl/DRAM.bsv +++ b/rtl/DRAM.bsv @@ -5,8 +5,11 @@ package DRAM; // Types // ============================================================================ +// DRAM client id +typedef Bit#(TAdd#(`LogDCachesPerDRAM, 1)) DRAMClientId; + // DRAM request id -typedef DCacheId DRAMReqId; +typedef DRAMClientId DRAMReqId; // DRAM request typedef struct { diff --git a/rtl/Interface.bsv b/rtl/Interface.bsv index c3d16860..dffd8ac2 100644 --- a/rtl/Interface.bsv +++ b/rtl/Interface.bsv @@ -248,6 +248,14 @@ function BOut#(t) enableBOut(Bool en, BOut#(t) out) = method t value = out.value; endinterface; +// Convert queue to BOut interface +function BOut#(t) queueToBOut(SizedQueue#(n, t) q) = + interface BOut + method Action get = q.deq; + method Bool valid = q.canDeq && q.canPeek; + method t value = q.dataOut; + endinterface; + // ============================================================================= // Merge unit // ============================================================================= @@ -578,7 +586,7 @@ module mkDeserialiser (Deserialiser#(typeIn, typeOut)) endmodule // ============================================================================= -// Expansion and reduction connectors +// Reduction connectors // ============================================================================= // Reduce a list of interfaces down to a given number of interfaces, @@ -651,31 +659,4 @@ module reduceConnect#( endmodule -// Connect 'from' ports to 'to' ports, -// where 'length(from)' may be less than 'length(to)'. -// Works by wiring null to any unused 'to' ports. 
-module expandConnect#(List#(Out#(t)) from, List#(In#(t)) to) () - provisos (Bits#(t, twidth)); - - // Count inputs and outputs - Integer numFrom = List::length(from); - Integer numTo = List::length(to); - Integer q = numTo/numFrom; - - for (Integer i = 0; i < numTo; i=i+1) begin - if (q == 0) begin - // Connect input - connectUsing(mkUGShiftQueue1(QueueOptFmax), from[i], to[i]); - end else if ((i%q) == 0) begin - // Connect input - connectUsing(mkUGShiftQueue1(QueueOptFmax), from[i/q], to[i]); - end else begin - // Connect terminator - BOut#(t) nullOut <- mkNullBOut; - connectDirect(nullOut, to[i]); - end - end - -endmodule - endpackage diff --git a/rtl/NarrowSRAM.bsv b/rtl/NarrowSRAM.bsv index d0651392..0fbd34fa 100644 --- a/rtl/NarrowSRAM.bsv +++ b/rtl/NarrowSRAM.bsv @@ -9,7 +9,7 @@ import Util :: *; // ============================================================================ // SRAM request id -typedef Bit#(`LogDCachesPerDRAM) SRAMReqId; +typedef Bit#(TAdd#(`LogDCachesPerDRAM, 1)) SRAMReqId; // SRAM load request typedef struct { diff --git a/rtl/Network.bsv b/rtl/Network.bsv index bd435a11..642acfcb 100644 --- a/rtl/Network.bsv +++ b/rtl/Network.bsv @@ -23,6 +23,8 @@ import Socket :: *; import Util :: *; import IdleDetector :: *; import FlitMerger :: *; +import OffChipRAM :: *; +import DRAM :: *; // ============================================================================= // Mesh Router @@ -366,27 +368,30 @@ module mkBoardLink#(Bool en, SocketId id) (BoardLink); endmodule // ============================================================================= -// Mailbox Mesh +// Network-on-chip // ============================================================================= -// Interface to external (off-board) network -interface ExtNetwork; -`ifndef SIMULATE - // Avalon interfaces to 10G MACs +// NoC interface +interface NoC; + `ifndef SIMULATE + // Avalon interfaces to 10G MACs (inter-FPGA links) interface Vector#(`NumNorthSouthLinks, AvalonMac) north; 
interface Vector#(`NumNorthSouthLinks, AvalonMac) south; interface Vector#(`NumEastWestLinks, AvalonMac) east; interface Vector#(`NumEastWestLinks, AvalonMac) west; -`endif + `endif + // Connections to off-chip memory (for the programmable router) + interface Vector#(`DRAMsPerBoard, BOut#(DRAMReq)) dramReqs; + interface Vector#(`DRAMsPerBoard, In#(DRAMResp)) dramResps; endinterface -module mkMailboxMesh#( +module mkNoC#( BoardId boardId, Vector#(4, Bool) linkEnable, Vector#(`MailboxMeshYLen, Vector#(`MailboxMeshXLen, MailboxNet)) mailboxes, IdleDetector idle) - (ExtNetwork); + (NoC); // Create off-board links Vector#(`NumNorthSouthLinks, BoardLink) northLink <- @@ -398,6 +403,14 @@ module mkMailboxMesh#( Vector#(`NumEastWestLinks, BoardLink) westLink <- mapM(mkBoardLink(linkEnable[3]), westSocket); + // Responses from off-chip memory + Vector#(`DRAMsPerBoard, InPort#(DRAMResp)) dramRespPort <- + replicateM(mkInPort); + + // Requests to off-chip memory + Vector#(`DRAMsPerBoard, Queue1#(DRAMReq)) dramReqQueues <- + replicateM(mkUGShiftQueue1(QueueOptFmax)); + // Create mailbox routers Vector#(`MailboxMeshYLen, Vector#(`MailboxMeshXLen, MeshRouter)) routers = @@ -540,13 +553,24 @@ module mkMailboxMesh#( idle.idle.interBoardActivity(activityReg); endrule -`ifndef SIMULATE + // Interfaces + // ---------- + + function In#(t) getIn(InPort#(t) p) = p.in; + + `ifndef SIMULATE function AvalonMac getMac(BoardLink link) = link.avalonMac; interface north = Vector::map(getMac, northLink); interface south = Vector::map(getMac, southLink); interface east = Vector::map(getMac, eastLink); interface west = Vector::map(getMac, westLink); -`endif + `endif + + // Requests to off-chip memory + interface dramReqs = Vector::map(queueToBOut, dramReqQueues); + + // Responses from off-chip memory + interface dramResps = Vector::map(getIn, dramRespPort); endmodule From 48bd4510cfce23dbf88fa1215f471c2d8a05288c Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Thu, 30 Jan 2020 17:15:15 +0000 
Subject: [PATCH 04/78] Destinations can now be routing keys Support routing-key-based send operations, but programmable routers don't yet interpret routing keys. --- README.md | 19 ++++++++++++++++ doc/custom/ExampleAccelerator.sv | 1 + doc/custom/README.md | 1 + doc/figures/fpga.png | Bin 17166 -> 17154 bytes doc/figures/fpga.tex | 4 ++-- hostlink/HostLink.cpp | 37 +++++++++++++++++++++++++++---- hostlink/HostLink.h | 10 +++++++++ include/tinsel-interface.h | 2 +- include/tinsel.h | 10 +++++++++ rtl/DE5BridgeTop.bsv | 13 ++++++++--- rtl/Globals.bsv | 6 +++++ rtl/IdleDetector.bsv | 2 ++ 12 files changed, 95 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 01e744a0..b001abcd 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,22 @@ +TODO, document the following: + +```c++ +// Tinsel API +// ========== + +// Send message at addr using given routing key +inline void tinselKeySend(int key, volatile void* addr); + +// HostLink API +// ============ + +// Send a message using routing key (blocking by default) +bool keySend(uint32_t key, uint32_t numFlits, void* msg, bool block = true); + +// Try to send using routing key (non-blocking, returns true on success) +bool keyTrySend(uint32_t key, uint32_t numFlits, void* msg); +``` + # Tinsel 0.7.1 Tinsel is a [RISC-V](https://riscv.org/)-based manythread diff --git a/doc/custom/ExampleAccelerator.sv b/doc/custom/ExampleAccelerator.sv index 34a97fc2..acc73455 100644 --- a/doc/custom/ExampleAccelerator.sv +++ b/doc/custom/ExampleAccelerator.sv @@ -5,6 +5,7 @@ typedef struct packed { logic acc; + logic isKey; logic host; logic hostDir; logic [`TinselMeshYBits-1:0] boardY; diff --git a/doc/custom/README.md b/doc/custom/README.md index c380f9c9..fde29010 100644 --- a/doc/custom/README.md +++ b/doc/custom/README.md @@ -74,6 +74,7 @@ custom accelerator or a mailbox. 
```sv typedef struct packed { logic acc; + logic isKey; logic host; logic hostDir; logic [`TinselMeshYBits-1:0] boardY; diff --git a/doc/figures/fpga.png b/doc/figures/fpga.png index 2ea48fdcc2aa8e3ffc60dd4168ddba0d49ff1cb7..71a4c97f0c8b413775eb60c379461ee91daa082e 100644 GIT binary patch literal 17154 zcmbW91yo$&lIJgOA-Dv$;E>=hArRc%9fCt}_u%gCPH@-Y?hXNhyIb(RdGF2anK?VV zGusDgINg13f49D>fBmYeL*!&ckrD9_0RTW27ZX+h00><0{vkXh`17`F%oF^DU@N9> z4*(ckf4?E3x>X)Umi;z!MPb`Bbgk5&9BRMB+$K0|; zeP#Xpg~gG1S*3%^*tqhtsi(Sy%k zPk)-5)7?+)C$4z+cul=XQ>0+!vIUtw>P?af`a(KVU{cG83i)m-L&zL(Kz~{$#-;Qu z^&08I4DF^5V2#QB=){%HfR2dy#h!vWrOn|$;eKMSOciAXGL^@_=w zPpCqx89m4yTZ@8Mb;QOXSLLs!dFmERkhLM{u}Uf{)eW`uO=o-LK+4JtDf)3 zZ{NQ4^72B$VQllbuR%a`eR;YkUOD z8F?=DMLBSj3L~T!3hwRkQmrTrDhCbe>aSlinY_(LL+B?Dgl`}4&|CY=E~<-*i|JG= zgui^*-PswkxxDwFP2es+#UEwzsd<@qGUD*~P_0MJ2JbQ{Zh7Mp$1V zP5uR*zt@S=pho^!8b?q-000z9rPNq1)#z*X+&?^6uXhmok)VgvIiKl(Zx+KcCN?%! zK~WI`I6XaWbw2AisP0kG{cPNLNEjkT2?q^57>LVe9^YnwZykz&f|BIT6C&)FU}PSN z`R-i_?pC#0`No0R=dX>?3bn!GD;4R-j-p77-Zh(SqoS$Ue3!iq5Hm?@`E}WhQOT@y$LS0?m($dmS zSyji?R9{)u`rSfRdXGoB|6*eGr#I|j;pNeDb8`m=2W2WhAOR!3O{1i5L@Ex->=(!* z9zxVCKQq{uv#e#QQj=-`Ks1UdA|%Al*_pWMVA?DoMn@&@*{VNqaadZ_M%0gF-%ETz zm5TNahR_dDE6qbwrMly4X#%5z`ttg&))iG}vLf?kDyB0xxGl z;cxw)wXn)!c?$qI_r4%E zTFm(D)bp$DNkxIm%`21o%P57k(&GO6r5@AyZ8sK!VEomwp3BNYBoSB=5iPWlKnRZrT zzp$5RHpx%0nNY!_xzZLLjxJS9<#9s`=S`?Okt=z}6)P`&=jN5>73sj&*}l$dL^eN^ z2n}Sb|9)XwGPcVti%c<1`Zg*;ck_dQ`d;Vx?uY9KCtS z*uEW(kl)ES#_%(va72-Cl`KchfRUD^^k9u{V}CB(^Xj%+}?t zIGL9m|HWJpMBdjoE;R6zw(LG8q}$K2D#J%m5=W)t?yXS=_32Zp;sV}0A$36Ik%pVY z#5%6_Tk@PHjR@c6B~(Ly+|WEUN#{N75gRB#tRTBuoAb}NZ*wJ5Vd+9KbqQrbHcBZW z!{JsfgsZu}GJ`dQgHyf^eLS89DRLBZ@tI%xer1}hDG`YIs1^9GCG!DAN1m^J=q3+m_MFyMjKL@Wu${3mo zwy$?B&Xy#6?7`LgzK!Zq-G`CMf-QLrLn9%v%;QYM3$Oqx&gdU2#opeJY|puW~9KA*vqQF_@#ns&vFch+bCfc5Qm$i}E7ibnkO=qeH0IrDPIwy17W%;2u8xn!b& zbR^%X6r&ClEiaa^73?&#K{_|^V$`A zw^S4}oy4298@VRJQxjp{63m4T3wio>dzjMqL14s3m~6zYm!@nF0*KDAXJvEYQ5-(H z*C=m@SfTsiOG=M;@=YSY=F%-wtIfVKRY%#A)Sv~v`~~}3BG@GyIpSv>B_u!sKlnI# 
zX;Jb`x9l1%S6@T?4}^%ekZOb@Rng0>%xNPz6UocKl-Wu2}87#7&(#>J@g ziAolG4e~E-lT%HoE2hGnjleac!=65ZbuGD?LVOue;D$Ho`Cb1sTNsdlVy zJa@>I3e}6W&50l!`uY{)luM_>y5lb2)%QDUx89{|#gJtqR%V$s*AwU8#c}IJ4S((`5fSw?-^z(`63x`b zi;HrR!mG)jYw2V?e1fD%#Pjxa8tFzjWB#FjnH747-&6xT;BaFR!#XyGB%+`VI>qY{ zV{;J*V84mE5A(N4rQKF^qz*I+fmvxUZw7Vn;6VWwG2u@9kPqbtNFe!T7e_>sqx9f# zPb?(HssC|y>czo+DwHo{zbhA)<_h#M=nyGf7P=gG87AfTS^1oa??krZx>LTeX+IOt z@Q0TS)xrfwDSD0;yFmdS)nq?$m%tb@v34y&%RvwO;L%*Yt;HW__2nV-vX&|KphDtu z(p>-@&^zIkUB%Ho)M@MsV)9k*3JqQFq=jF4$dn&p$p(2lZ>_*i+~Ra}UqN)7u~w?Bw!t0y_5t>a-n(T_hY?l5bas}zCT)5 zmAmPi(j!(FhM)aTUH$^X(x*F9hV@wjFUi# z1PKvf5QTGi&5IYcYGSa$K#I0e2+F+o&D&1ss#rJ z_h)&1U4WFd3p!A%`P=)qcl1qSGBQq1PDGi*ctWkpXJVhlYpo5$Bqdj>^!>qGR@OQw zEk5XWSo#TN3V3@BL=timFmiFF2$Rh%Ec}&o1*@reV=V~NWB*%wu^qD${vWj0T!XL} z!x(qgM3F*CPidxM?GnM;ASud~r6o2F4vVuR6s+!^p4yt4r{`xo&U7_;c4^Xx1>9?k z#?SL)$XH`7C3ChN9UUu;HbEr8!el3_tqbGhaXwkT z8L8}iH{@{h_N3;L=)+n{7)jq*qbc3`>f>Y^^%w*It|6rPA0FLZ(!O_VQw$n)Ma-~Q%P)Vddm%_3v z@`80&bBe-+T8^|*)(i9Gd=>tOjzzH)o97)g1$6|GSWsMrlNyOO?zakzmYjld4O`Xm zU{4I^l-i%v2i$V*16EV}JA~SFP`eG7 zCOa3P!B7GK%Ch$cAyRjS{70JZs4=zWkwr=A&Bj_NqM>2j(|FQYq0Pngjo}tbGjlyL ztOTL&R*^+-p1h@>KqT-yJ;B&x*BYL1a|7K{ww9B8>Gql`ZNDtY-W zH$U7ypagBp?_H-=XmW8R0SW)ClVc54nut2j1l{b)u+fed`>N+Dw5YD?VW9}W(|lz- z52goUf^&5YK${8ZSe_wTZqJ&&0EX?%82*qyH#mt(AyW<>OE zIQ$5XOpu!h0AOu=b}tKMh$Hb6@KY7kbnO`>mx%m~$xMUoeO}uhUKuGfcubptkzj7r z;kJbgpUMas89*}7qQ0dg9)vVw;8q7kP}B!I{k&?n_1RMUgaM4Vab(HlT3-!Cnemnb z+#{1J)6V#4(o--84za}HTvZgq(l0HDam-DuEu3ifo-rptDGPHUquj5K{jpAP(%}aF z#F8z1<;kf;X1scBg=*FNaOg!*_O}3y4wA^UXHn+Y4K>m9#7P(26Pvu2+v)_mhu^J2 zc<bKNOV&;%RZB1Ui%jK-bQ_RJUxx(SmcvyujP=DVrkYXcAI;kR7Mo#e z0b`=@_O)XHvYbrH8(X$Y)f%=qCOgBlQ?U<{&qA#+I$!D>iE(PVcEb8TZtk*kNBGR{ zHrpDqUlP%WzLSRSw#W_-2u5Rn;F`Wnu{3gR+etkc+kiudXx*~%CO3Y$zXbAY zYH~Nxidd5jR}=(T4&S!(4=srm;_n&~?i3Y~>ocMHnX8ohFN|NYR(hGoXCW(i?1rkP z?+=^AppDwPh5IvcXMsuRsKrF{g_N^lDauE^*~l?6@tW{ctXAKi$r2$|`|-07>8ypr$YC&3hPTSe 
z$jV+;8RJza#JBasj}lH@RSlokf)t`c2y*`LcNk(HT&9&(2Fk3MY|lRX@QjUwY#sB6sH<(`w4#i&F?nF$FyXJ=788@&eAW{75O{n@&$gf-Y^SG!}Rq@;ikjbb6aW`pJ8 z{QpFb{BRQ`iyMM~_~7&@0*_5Xr#eLgp}`jKE^Dd$ph${RatWKJvqSyQ!dQ3bwStt3 zOViVeI0bq+g`$1_;GkF^Zd9K^N)7pmKy7p2Oz}<;<;{&ICb+Jg(ish#n>fN6cW3Lw z4gSbnTwIqHIdaKaSvGz`{4VE$UfPO^ES3pupsM)?HUePXyi&UYiYOraXOpYpAVZW) zJ{~3hVAG$xO-B@omC0PJG97dKP+KhxY7#&=!Gy3wagKsBS%uBkyegkl<&XjgG*yW}t-Mq|rEI$Y_)17OBygTqs_9srU8PawpJzOt&0fuu1~~5iym`Ld8pc zB{R-3crZn?e~e+vg!k)G@}Z#)L?3wl8?DxL7%stsRqod7j9Db4RjheZ><)^Eg^&$Y zkQ=K=1b_3JG%Z+z|<7Ow6L?W+Jx27hp6z2`)1=`*F3UgV!Hh- ztI`4q4wnzt_W6b>{TjVEPQ|%1~JOrD(>A9Rfn8)EM%2 z6Alo-!sW`fJ%x68kW?@CQY-+lBK;DeY^9fRm}9c{pLmDwG0TeNAl~mp(nB$j85#dx zQ4532;RKGop1j9g=R9BW1D7G>fsL#^l5NpJvUkz$)r~ar7q->SLu)<)amAARW`z($ zf}`3kwO6+hVqkKWoU!~^BD`jX=`{$;q$gQPWkAtgp!W!-mKhuB*p z%yYPG{$OxuS?f(KOJn%ahL~M-?IhFd!#vSHLctxVxQHV2L-rj06Gyt{nw|NJkv^U1 z7dWx^E|PpVcr`jWZL_$y0)P@s)&?YLVHbvPh$7P}6+{vGEJX|Se@Pq+lwb#7KgHpsXZ7}ro0gVognyed8_xrMT#n%1$*if%g~0iyCl=5V zA+h+Yb1Z22xua!BH%@@HVaofM3T$ z7(FRTM0L~!Q!z*{KWL7Mgvaq+%=jzZn|2nATk@3*(ou|w6(nr8^m)Q$YU(L7JJ#UW zo7E>PZnLc#pj;1>8abSHKm=D+bM(6462oqho?v!Puvr>gY=O^&i;5vu=}H@2saKWzyB^cm z$-~nLi7aw<{{H$o|HKdy!_gZ$(ROcxSsT5qKt1yRsnNKQTglGVVi{>4AC41sVo*th zAM^8Ks0%Gm_qFTDSBeY6lF9W+exC|fSWbUrq=!Nep+7<^D$j%inqpTFb+rFm0Ld~B z^?wSG@JIfe014bGR$FfYvDQ^q8^4)jrt+synN%3-DI*+aG#>XiNF9aIwn&>9008$= zT-Qj-C}Ep|+n>%dU!ASsIzK3enbq&JOf&NT36j_>{wt7FZ){^ogilvq%xRd$C_`*E zTY39@qCgnp6C-BiiLP#wmCpU%jPP!FH`h}INgpluJR z^zMy7jaq~{-AgqvaXS*x4`1WLfq4Hbfj!DqHG5@yGU$#SuiC|+%h&RpW(*0Uv4UmD zk6l)>#>k**Awf3>QlX^>{?UPlFf4dgY89m?5D5)FJO#+@jvK%H{G~@R6IfJh2a7vd z&Qw12a=mWD`sVzN515COG#C%Nh@yAVtFD7sr!@&qYsjiywfo)g+kAwa(Ww{qFQr>2 z2fcZko%?F!r z=EquV`PMQ{^v_FJ{aqa*<ojj7S>sucD8JX$)gj}vb8>S2xxZ)7YC6B%8KI=! 
zzQ5XYK3??g*@%sc6Q@w{^n5;7nwMMtQ8HDYMzYEqBgSj{`)4Mfdpx9F9QyRw*va-# zoMa;XRK@y6b|q=Grj=yC>~#6n|5tf0Cs&=L9&dbM!TIs_1Qj|)EWh3TZh2`bMK3HK zyrrfh0=CZ1>+S9w*kObX4|ue#Foj=gE66+Bo?8B)&(J%cKqY65{ZepHq|p};6g1MA zAxzdK!OSKa8Xm3`exXL6D9$zg!jh)R8PW1Oo6kR&RcQDRIrsP2Y}?yyoUqE&n7ey> zY1z>zM7+_u3$wFtH2;=QbFcoz0RzndP@@G*>?u-zAUtpYwOAk$4v5p!6|B_>Cw{Mv zU=d%?n#=yU|Vj;yf-^SmylIxifz& zT&2OC+JX{91BSLqsYr;mKLshnt=o z#B5%c3T=}AAxG>R$ryD2-m7z!8d(C!pE|XQnb2n}tl#0NLoqj~IK3OYLMt!jcJ+jB z5Rv!;Vmh$WxRqjUj3H1I4|*@{37a#Hh%SPor2>;JG3gWTjIVv5O8Zs*?<$SEp(0Py zXfZ^+ba3j&So{c>`+2*krfF0l*V{4dmHt~V04Rmf?d6wxLK*W?D|E_t(f`4%S{Sp; z$%?0K8}x#^#ZT1UT%WOZ*>$}E1plSZ)aFx7jB*PH>Q~){KvjHWo8PpkPNJFfhKoWW zWqFGP^w3~boB0$iW>=1~y*=`&za5<|qUfLKs<^G7^{Qg05dz3pm|SF><@45w!bRh1 zK2$0ck;xHT9zG#-2{bUDbU>x>E999(P&eY5AN!9oOv%^RCxcD@T-D@=jtU}3Uk&|d z8N8A7|AShca-RM7Dsyl52Mv0;=GkI8iCxYJW>SrhyH8Gz)UWS3>%A5JOSR-QAA2IG z1Cpu;Z7u~G>hfdmCdTcE3@(2Y!Ilv&^8T*z?@e2c=75hHttM}u)x^cO0#=YNQ6><+ z>~o{pIPeS9+o&r1Te0o6!PmCj!oUmUSQ5?(F8&wU*Zq&;D+AD!=HgIcIa_rl8cvJ* zg50vy?}#tU3Apb)!34}_<_;}DgKGeq3H-aNgkYx$zM~gUqu`Kjb=U1`82W=v}IQInXfdpU!vo7=T0&f3DeS5?hW zy$}*L5tibnC=Bx6aI(-u&e190tGz?}uk^}43Saf)q+eUyFas!=-{o`8v&k2x zk$7mE0n|a<|4E2A$p<}E62Hokb&fOcm?VAsQSwKo!_C0n9pA{sU29JIhj~nGL$j#- zE?A=eN4YCCcx?4}%)e@O@h|H6p&OQOW=(_^=}4$rm<$Q|i|sV$-<1bb%ea(Lf2)U9 zF;u)FC6v|}($!dS?Pzf!es9CIqPpTpboI&x8<@n_hf7*2M}qHrb#Sz+Y8mIyx26#EJUI zV{KxXY@Hl+1M)7tpT!wiSl)UkgZZ4j^Y1*;4{PmWqPQw>Ls1Ng)-ZkJt-C$SWxcVD z!!hWwrJe5eJ*gF#*1*H$MCcc~PJd@CYz5`hrlEp zIn*8hKdV;uri%?l7@fMr(?dsPHYaAgxJ^YqW4Tqh!g(s^me}{bMKOgR_DFiSK_CTo zYLHmBH2Dr1$x~Y)x${Kt~O*{NRrbiSDGcQ^yaP(?S}$M`cbt zI(00BDu`tzDr;6S;ELaWn4EkI6RP^Y?z1oFL(6i2Y}R#y8F{;;EqUE4m(`-IbhfJ~ z+I}P|wlepjm`UW?{IaOKB24|WqJt-%B+cU~VUL$N9GzXW`=Qfp#^mI<)|A7+Ksp;~ zi%X1M-{2p9cSk*4Rs2%4*_crjzn0k2&%~(`mP1xFYY7x_+5Wr5Ohnd3hNEV<_|ccgq0HUmX3WW9>&}4{Nuv;^Sy9kgW{RF zCar||H_0O;8!Y4O7NQM;?KA9M^P=5R>F0hy_eZz4g=fMeN|Bl(??CV6^>Q@QSXeF# zFWYbMhPM>)b5(Sx(xNw$njf;{tj?>#SP(gC`H9;KzE2x$z(-uaz`7(9{R?JsT>;m_ 
zHz7)wot<~OgWSI6>hcSKPl^21x|mvCzD2r@PTk=OmzEBXMq%ln`^pdC+0<7Jm>u^w z!SP6G(C*rqbxeV^+WK_?xOM0B2U!5a_(lm5e_AK`BsT+ywu&z zsXAMHz8-Df-+GlaMa8+tpmF7$oTen=i+^?5`>=v;5ez80HR z2(p!hBB@h_T6}Zrd``>#aIhf;H{+bEli%JSij<1Ji&d62gvgzx^8y1*I=vnbYIC@2 zTDT^-HdNaWF}`4}zhVHB>JEJSGjH@Iu_~ESY^(-4J!5sXfltv#ZDUa#$qOqgs&|jp zuTc|9FV3E-2YhB9h;2?j8jR%x;SYqm{w*|2Cl1<9-)X*#-xJ=DF@PuahiC7OCg$pz z$g>+3l#*AY`A+-dbwlvhb^#SY6$STgrW<6W2&;r zqZB3Qm+)M>mmk|v;Uw?wDf`*XMw5^4seY}2doc_E(0$kTLYxpd1zvC7myUTEMl(W) zo?(|X?1Y@2kzmlob%Q91Iip?+=lyjzf9Gd!n(fiNPFU6RMxdi-7C;p?k-L*gyq zBXCA2&p<6%6XC_6Aal>KD(Pf@*yxJoJbg-Zx^JzW)^_ytyrqOB=?s_Uh0hnFguYLd z^HAh$YU#!IqP<#AcYHK7%7cT?nWAsz{qk~AhB6aK)>ULfKyZRnnaUYW$h7Qjko_px zGbv424{oY{*!J7`cw~u#&uzI>f$!1I^BRGO|*8)Bb zt(`M%I`L3*JT)h4#3~SW&~zp%CyD?7xr!!;y41Hm?L8_vTLjo%GSjPN;%?S@Lc8F* z0|y*K^3UJ;atLi?)Ps69Wp!RRUbf20;u^pUVX?su;~vhewxF1|>)obT8RKB}DVri- zA_bNC^F^W9ue6lApjQ8hR+h%-kPj!FRfWujQEq&}g!j&;>L)OPPrR{V8B{<`%e(|2 z==xb6sj(z(-EM!n!*BTU*~vDI3)%d&?satcReQNI3D-|Y2PbQWd{fdGJx5GVAlV}liUoTRD)Ue#2@6vW}{CWB1w8nkaSl!@L($=i^ z@)xQR3Qq<3D9{Odw!Iyop@VmzO4^S4NI?r^H~#2&`rQh85Amq<(~o8}OLhT{lP>+m zOD$na8oI^qn@vAyI1m{j0qzz}@QV&F)CNcQ;dyrg_K5rBbVK%~v0JQ#=!DpN!KhtB4;7iEso?S^` z$BNZ6>~{;|az`w!&5=QUkTylTppe(Ce4?x@!jI}1(c~rT5~4;&we97v7CZQ4rJN+@ z)1_yt)~0~<9@RGIm#ag_?~CN!Xbk`3^Y zF)m!?<7IJ^N99p)#AYSmyBv;h!&>K%gNBVfwSP$5<#RW>oPYeDUxbf_pzIhifQC-4P=#x$aE#t$ zM~Ox~^M&E7Wo5JapQP!nuH#|^QkK(rEhnR<>U~Bqx)jgV!}!*G>o+0Z`-!DHA*i)L z-(vCEy*@iBN!iWwY4D&lQb|RZIwtg4`?J0#@-42ioa_3~ymrzh6K|b;HdrE1v)>+e z4pc^A)2wN zrdL@pH-3=o?(~`rg!f*0^CLA&*)`s;ka9D|*Q2f>Q|iyU_`p zPFrI)8T^|TO48L6&wdzL$x!V!cGtbtSgE;+AINiwBNj=+`QRog{^AT?e=pHbbN=A2 znZVcF_(+hWP-YR+bO)XWg~Cu^`Bqg+Alb`0m#I>Fu)f*d+VR9+VF6dLU_W9r^w~gJ z`!4v;>!W*Tj052YMv}G@24H)l?-qaNQgAw1`-SH5ZU}86ky1l`UmH?e^IMN*{#3^W zven#W7FNW2ai*||b%8<}Ps^W1rF|@^jibK=`k`eQONIiy*BCqUKMLu-4T6ItjS8mX zEBlW1Jtv>JM2;dl2vxmc0fSsYNmMD*5iV`yH|)01e8|}v&t1IY2X7kSXUxW7_q7kc zECKe}7G^&Cx7K)1w}bc_aGxPuMUIs<<+ZEjD~QuIir*KO5M|7*P#r78s5s6o0*-_m 
zgbT|{y+;Soz-+zB@Rrd{3)n8Vfa>$530lB{@1t6Y6dwLZbw~IOla0Ows`9GbNpWbT zRnfOyKZE(s9EtF7rzl@(1O#6{(~Op^fTD` z9WeS+5Vr~O!ttGJD<{;8#{E}%Xp;N$!GtH)z5yF7U5OBPBuLa>{^7T2E_d>j(4Q`6 zIx#CDpfN?&sVP#Qy5O^_ORIas2r!AZkXrfFy=>pet!jNAK!@kd-5gxG@w>X}ZzqrH zCI!@aZJuApxvsznxnAH|_@yDqcL1#)nvxc7eL*&@ketcPX`Be{S;q0}bbV~-j+?@? zqs;w}=B>4ynPkG%AAk?_`o&TK2kYU3u`zkB1t93-{wHSCMtA>V<37HrG2L=?hvA1u z@R`E+88O46GsV^V2Y$IAjDxByuZNOI>pWJfmG{5-cUVVp+oPp&BX7f!#QHkdpZey9 z^~aM7i0IdKVzTS64xTPX_|&|Fd8-==gv|WPbAF`ykW&Uskp6}z=li0{n)s3aq%cZj zf7Hksp$QOV$-`-0cD;0W1$-r7=HB^MsjnHF|vPzfYSeJbQDmP-!NWedPblS znaVjQzYcV&^w_eUSu(jX%v2BWc4tHxBDog;MjQXwK z$)(e`{E-dT+8%EqWHp%Y{o%7X$$X!R*(^y%pSYz$G}k3sem?J)_- zA?Ws};lkhi4hp9BV?(yc9wU-K_O5izwwbN=+4=Na2!CXe*x zv2R74N1mRZxa9gu`=o8VR=LGG@6W?AwzOnGIrX*mry_ISY(+z461*ji$56QUE?ltX zsW(HZucNhvA6uXa2t}#%)nb2WCuhp05|w0N3qpgvLaH-lW{Y2#1l8^b920*Lh`lv$ zl*bic007Av=|-QjN9u-u+DzHXwu4sUd5Bbk4C@rs6o%cu! z<$Y{i{d7y8V2|pZ;qD`#rr3aN$)8iJVWW}@CJmnx&H6&G33b=x@{Mt4wE+DK)J%9? zYHDanJx;oTb!^taBhx(n1|}Ne!Z3+KT*w}EyFSz>`hdADDMPC7_@x$}ND>QO-_gcD z6;?g8*4G|Pf}7x&h&f(1>@Jw%N&E|zFk~`jr(jCc0u|uWvi}Y29 zTU9r{#V1fD5%YN*wsR1I2~c9klnY*u-J!sHA+M_hUHyfc>#&XOr=8d*{gsTF43P!C zwdIj;ahlf&3A$I#av0>bq^S?Q1kX}A6wGsa4rb9bC!oA5&k=2`GSqZ&msAF_?%S3;0crT$!Ju}PHV@} za9X~YkH2v{W=|5opMwE>Dh8Yja*3qfqw^8vJETghy@gP`rOEc|2s4cE;|wrT7Z+B) zlg2^-r~w;%Lugl=ZNpmHhO`!mhb{<+b1|@e&&(~{=-B?NE&Dumf9igYt;HW{2c?ve2Pzm_sNgLlwuE)*p5 zwgQf88JPq5t!9tZ-e8AyvD@QMG6$yp$zOc`np~$*VuY2$4Yn`i_oO~02KwU!^j*?k z0l?ECWUPOX^uAHs`x*V|=?0#n=KCZ705x&o7Cc!`h@dkCb2={Cj4dl{BZ=a#Rck_m zsK&ZJ%_T={ZHy}}zZOx(Az}k~Y!xp<;fx>MMY38rmIZF#w@R+p8Z_#WM(1P(yR)Oe zVth8`S!&s`gKobS?YwO9VzF!Yc)t-QVMn5rR$9gqgcS@O>rqtg#H#%uzu@0V3W`wW zRX$%*1rmA{vVF3}s$Sm6>w7W>f<6tiO0>1RI?)0?dN>}`$f#_TQ6+7z9HlxW>p4;o(l*y?uIqf^7>%q+~V2NQ6y1?K)qo}e! zcPtl#B*o*?6MHL9*A(>fICpdcS}l?X#EaFKMUm|2bU9;bDPU&U(?htc)lSZYc)ktCAGcY%Ak9B%#nI!DX6AULlI!C(DZpLHw*enJ*rm zb1fcG--OQIC5CSxoT6-reK|(@ycX!95>$bU6=jK8N^~h7li^owqIyp%cO#Ao%eXI&4*Q?I! 
z&lBD+c_Xef%XH(qUiTuj3D%2M5c8fE7G*&}{ad~=8QeF2F1F(1F$Fr4#y201Dm!ax zYcVh|$jQlXJWGVhT(9@#_4Kk6MOoVy;z`j%V3)~K^yI*jQp<$Es}_zd?w}^vGENHr zu%8#Txw#1z3QfmSv9PclUtgX_N2P$D;3VMD;i1>-^WA&N#5S-adaFO8)$Qic4+d#_ zi|OR#1RN-Hd$>LTdxKOfwC=$(C@L!YpxIDfQX;NAetmrnA%O%`H8c>hna_q`ObqUM zz1;8hh2dC;3m6Z_gFS4ltklHBADxaD!6`UpB_-VR>GbsU-rnBbT@$D^i($DVmdvZEHqqr;;kumlJ%&|xx``uq29j&%==pdJX|+~tYqtSzs>b8u)Vh2OK~ z7dA31^rxzyA63fL>>ldazr4YVFflFK73OD788xVW<8EeTv<((Ak55jJT`HHlj#b&! z)YXUbNJ^B3w=RD7H#Rn^uAvfh+6vxuWoBj;$!5`h`QgE@tJw1+prxgSN;VTTtUnJA z<`SQCiU41M|;B~Vv%pA(DZ(oQSlU96XYN~6e z)HInKJ;aDvoyn)XtjynDPLp}L=kfb@*m^>1@~=I&I$4Q}ha8qoWb!?N4-;^^{`&lI!(shv zQb*$P?(FXNwxX;I3OHVCUl|yn;^TYn4Z$dq$#C0>;84=O2>Z|x93K93Ir0GlSogRZ z14jn%@bF%4D!p&Y-YWXIo=GCW!@Gb>r5`bUaQ#EGF%mT2?ZFrw9i5z<9B=_N+ud%$ z-?5lmSm5L1tCqWb;ePFWeXh*OfrxCi<@^O!+HcR+JHfpNgM@v&Tu(+ zT_%r*?KoL*7ralJSzYx!Uaa=@_09VdAt5dvQ|b4&0g%In2q0l+XQ!e0yQ!6El`|*u zM5CNemL2!T2?^eEn9mjm1O&L>otjOJgWZDL{O21%#F_k_sRLVGLSBz2jmE~tp(m~u z7IaYql9H1C8IYu z@pLXI;Pz!_kK08>#`M$LwBF$!2XxH4pBeLDG@TY=Xn zMcRa6oaA(F$I{|rYAULMkr9izQst_jpc^YFD5!G3ySdrf-d0jl`suCkI0wF6ROkpI zJ`H+oQzN54fByW`g^seKh9g3>&;-r={&$x@XkH*WL4t=_9U`oLuy;My&d{tgP3AqO89T^YTRvw0vP< zVO!gCu(M*_>(Od92KL$AcDbWCJDY@)6F00676xW4g$1qy0Zy;VnEDoDJ5`0y;2no` z`Y8)UNJt1c^2`X1(WFiNYPOGyi(9h1bw6vTa!vrZD!6CRLWG;GGI(9Tfj&94CP5PE z6DoFC2B+QTU<^3~aCtTzFv0&QMUQQ>To)ot_CvQ*M^c;?xogXo6I}X<&UKs;$SBnb zMnf~RX2XF_nJa3mk4~)q( z2I(tE7Bfg^nIlJyiMRs}2PbpbtR2i0u{D=%9#yFwm#f!S=(N+G<&CuZg}#Tu(?qX7M0PHnBdgv2nR8SXWRaMS;OEHK@y`jx5K{N=g&Pe zMY64}tO;Ie`J(ctw(57`E%h`=b3$!SLx z*7wcPvE6Q~A6(AMt1Bo-$lr@Kg@wC|)rR1?g67`P&~WL4(ZAa2j6%q*q^=H5gMa?~ z83Hiu4;Pn|1pSS^6G~B$K$gx&8e7n3pYH#pq|r}!Ogt|=|01s`pw2K%dA zT~qTet){XPR4H($cr@(c=UIO|I&P(wmzVb)S`ztSpn|}@q9G%24Q_95zj($#{M@>b zqO=!`ga&+qJms@-8?xo7Ra8`x fKZk>nCxMnEcjLSZX(sSPEdX&58Q}^cJ-`12z7Lzp literal 17166 zcmb`v1ymhfx2C&scXtgg!JXg|+}$m>yAy&-fZ!H%6w`_F&6@9EJ! 
zy8Dh>BY`4e@2Xn0YR+f9@0>eQSy37Vkq{9C0-?ytNT`B95ct6U5j-UD^{!{!ANYpg zB%|XB0%7s}^A{q%R}%sRA_vJzh^l)npBi}Tt7$!d;>kpo5(}#7OJvrAaQ+$LPQ}a6 zHMgwWSlu{(X?tu_Uaf0W-sE^}yRWy=)~HKwQ@forlvo5=7d{3pBR2H%>dZCpJ(PM? zg!2CRa%caq%USEB(ByU2b=KpzT^Bx)V95SoL88$h7{u8G<)0!X2ryAvFuchd=+M@I z#%VXDPt0&?bcprn(-Kj=i+e#$GIYxL;|W9NSJOpuIG?nt4TbEGzMQSLqN1W+GfeI`EBaZ1R_IypHJ6ins{^(bFYfQA$`Fj%|V8-)PrH9K?$ygt>q7v9|5 zpb&BM^71}BJ~HdKTmbJ&L$}-Ny6OTj zB*e+7F88E`GmWtBrfh%Fjyt@MB5Bye!()7W+~@Wd4rOa=YiVie?d^@1SdlUz zt@!(ZE%H<23WaR~x8pi7CMF~(joo6T&SEso9%cXdxZVHR13pBCdOAnQ7r0pY8p+SV zrl0@=YH4o=7ZsV*E}E2A=hf-m#!1j5K|w-di^ULfeO9SO;pqjye-oJvQ&zCRDNS)V?_lMH+NHE8Dn=x zeMuQ(b&Rd{go*IHsNLuyIc%n+l;+UU(NRoHj7B*y59`QWTu5Dkx76UiL>cuHqvQCM z&ApuKphTOIS_c9F$Bm4PboTTlYdM^@4oxuBEO@ma__oL(ujMEmLVjRqJ*YLvh>Rsx zDQ&NJWaZITT2uU$A z<_B;2c;}CMN7dD9xHve>x(x?|W?Hf4Z9W3idmZxz;iKs9SK^tJ@~f%pQDO{9-R=9q zJ&o-t^Yr%Kpb%JUY+1bXG4T<%AO3G^OQe>&G6UQOuTL`Bg0*ZC5FoqPy*zbG#T_|< z=3L4%Nl;J+``KQHCN#&Y>o&r+AvB2BwV<@HY&7P^!fs}6G6B`7jAPnUll#(@%ynFb zaU-XcQ}u5yLULm-Bq!NPQb@7utE3qHb! zIUD5Ji7yqsQQwL)8SgjX67L5{|>xr1#R{El>*xtJL8+`Z6g26b2~ z8iRt=F>Z>f>G>Oan#5k#Q1QnP=Qx!vZctE*@ql9{B~4iAAzpv@a+Ttpc|D!&zV|mD zz4VH#(c<>W?9GGY&ua}sgA-55F0{WD@OWy5ME!PE{g0?q>zgy$U?z1Z&u7Z-I`LL{ zE`41Ek#gPMlGh+XPW7}($X13O8~KTrXLlvRsZ)0!K0UR?NN`Uk>D48>B3Y2B%R0S^ z3PdZ`HPYm?&BJfBsw|T3Srs%<>hsT?#u~J;6VR2N@Yk^H#%8yEwG?R^<)a$tnFwpy zX|@3cNzfPx_S@grx+HU&mZB$Y3*Ag{QNHO6o~W#SW$tDXSJs)VT_`Dz7T4EMb+@O2 zg))Uam>%m`PxKP9{M}u+ykp_Lcz?Af&gcoE(VoE76gVT~!eC zKbm4f6nx0Xhb~EWl}9f~Vhy{Mq}q{z=ITKrl_T_83h=j-!^<6Q2>2_L&IkTJol2Se zH*Q^0z$M0}ErJgd7V->rcZ4SR>CThZ5PkVR1n7IVD<_wifZE9Mqi#i0>`FOZ zFa;Chsmo`Jt1F)zv5tD4OhXNS3ez_DiZ|Sw&yij+D6zj9Xdpr4@N`dqqHNQN^_jm! 
zU-)ys$x#|_f?hQs^-)0}N2{aj$#Cpvb8^tn{k$K|92zP;gD@Pht<8(^S(DZEE_zf_ z9e<`;&{wo3zoBy)KoiryYR+^ZDAo_wATsb<~$dq)(nHG<09(o@=tNLb{f52sL_vT%uH?mfY=XtnN*%@=O@@=?Zd zPI<8;!(YB&oe67pIdt6@I$5d`mupcI?G~x%?yBoQwAI4-t}9)}pfwtxpBH)* zX4(XEAo-WZ$|Z0ayy8~URJ0e#DwnFWq}(}Tco9qdHuDm)vHVRN%Fe@bC0ZDN6EN+u z*8?kI>adqT6fm@OqgJ|X&dIJg<$db8S(3!6srT}%2?JBZ3`a*z9B-{VQdp3M4qZX# z#iCRZ@EsIcG+>mwROi&&E^nK%4X%5Ft*Yo@GkDddR&c!*OOVG?0f9^-^(*N>AQG+O zG|1HxE(hZ2sp!pMDuv`UFcu63Unl2&K|z)a-=AGc5NKxOCExB?3OIFQdba5auUvj0 zFR#BjYQVGptweDPD|ohd0U&!y@QXXEm|16?|o z#S>^af`V8zq4BBo_+u}YGP`8(I(*Y4SPZ9^?9<@Y^wCL8+o;-e8=gEtRpvRsiKWcS=V^}y+Mu$@6k?w zcV=};Z-!~}R_10r*e2Muo|b5Z<@E6P`9ipi#T9qeP>m4b9k+y4@i!bCv@v2Sc7s*W z_)t_7nlTjiom>8)*0eUVxf1R2OnONdI*-APz5q6zvb0z*q&xM$cVLbH{vt=5Q)%zI z!S{b84Hs`MDMDat%>Rf)5ne_I+e-Wg7Xb#bfNvUox+0VC??m?T<|fGK`T3canmX<0 zcSfzM-&;8_c*DcP6B84CYC7UynvY1V>8$4!kPi%y z>C{QI2ZVp^3j6KQ$c&*XS3mJT=dI=EC-H9=7Z)cdc7cI`#Hh3hGV*FNJ6y1@%ctKP zWyB@x9c;L=s;kfMPFE=4DHCLT?@ns#>p6OG>4A-l3kS4$c9xTp64q}TUfC5k;t*Uq zRe^@ioHgJN=3(5BUBcx<|DQuv3IX4A*G#4!Z|uk?6qYD}x3Y>A^<-ruxA!}J@54@w z#U0L~&6K~1nTU!8BOrRezxnsOD=3WDXw0K8GxNuXLjLPkq`}^-FKQVWoSvPbg7Ak( z`>YB~EW2b=Cim6oegxYId^4FiYUCxH8tHjrWu)(jRU4 zD|rc6t8V`{R^gDWLXvIQ`7~290Zv;G_RvEMhJl^t9rB|)z8F;)E^b1_07e2h+ z$?vuMlBVGm@qvCVE$N7$1n!Ue74r58v`I#L?2`Ky3nvW=x*hyk->~I=Rge0S4Igxo%)3_zw^jvOg-N`k;d|tL?S}a}6yFqe^XYNV@04yqW+k;_@Vy z9b0DbXt@J|orBE6A{ zPk&w6Ehxdjm?zVA?3B(HdUAm4e;>{_=MP|2Vj0UT@!`wb-+&qXvD$$0k~RH?=>eQW z)#h4~p^rS}U@D7Y?1fPH?6~t%yKO3sB|PCNdXU->0hUalZ$@TZc#YApBUeGti-&CZ zr94l!KAEO#seIKuIAL3xl|8pcZ)Q;|VgX4^`+tO`JRyyyU%P=UAIIAKt`bRq-e@gHvz@4e6Z6eW$gRcnpdu-GYGJ%vCjR0|dyN z{m&xHtk8RK%osy~rop3af*M_7ql+(`VO|;`r@H_K=UTAh5anpC1Iy->x{`XoN!}w? zmb!yWo21jwEFM1mq7D{pphZ)c3#Jo(*dAlQl>_w*$#*J>iCTxQ2!uj`<;0FGI{F=A z6Pw@vW}GrAxCkKKWS+q#2c;*W9Of4k0StrD2-!kqgMr57^5hMK7 zP952`a^%Wxn;*ja4AZHq0JU^Je(9$@^Qz?`XXznFYb40_lFZM0|Mc}P}<;BKrbCdwfUL#A+ z_8r3^lJksY4wM@?@v&>OG{MCHTW}M}Od;|uo8z;-}yE!69kw0!2 zPSntR^W%h`asv8kxHAPLo4m~2-_On!iO5WkZ5+1_CDO5bh_}Ru1QMtI(5(7b_^tV! 
z%WjPo@@iJga1YgI{LiUE-6G2ZLL3QNK`Z{xgl1pMxFg;%Ht}(%=~7pc(p|P6Apdql=s@3>z|1aNGQHM(v@XW{1m}+?50%?-di|`HGoRk_)Sa`h(=xBq zY!&)Fuja5a)tJ8{v{sh#y7$yb@!4kj{;&}CZo~))!hxxdTonAZHqG?Ua_QDs<>pI5 z4ZaMPKP4scv8bJ#t21?zC86vhzdJGSv|CJ7rVs%JpTB`M)DJZlIQ{YCWV>;5`PIc@ zf$34z?mrHNsTlTagZ*}jkEU!fEv>PNROF4`EURg;9dOa6Qc1xVgmhFCrlGL`aSeVn zL^`C7FhKGycp0;N2$uMD=C!gRlOEV52r8|pDDnDDKaR85G&>p7CSI-N!ku^+_NXoW zcz7tIjU4(jBl&o<@mg@aJeP3K>MM3`D?2W2Tb+(MG)ETcj&aPyei=2|^|Gsj;LKXu z5fn0hqgnkByvul>{Zvn^$zLy*z5z*L8RzjnhNa7<_xxd&2%J-Hk)U1whR*s$+(*i*2Ur0 zzar4g2B4>bMMd)5rHvl)>NEPL;h)Rr_nw2F_nQ(M`91gAvn|9#3 ze1-&JVqwk9%!rGNHx>r&>C{1hBq*KV-(K52&(Br@xkxD1R#z3Xcx%mvF;AaJP@f1f z+vl3*Sr{0O0I?|{A>r}yF)GtfIQMfAYPnC`xTPyEF?+h5mDR%PYK8wFyApI#(wyw< z)ARHAzAb9ZNNYsvj)9*>?Id-$*4KZ>DJUpFLG)_HOnOari}OxhU0o~9j^X4H5|pP) z^@9LMLF*L)+w815D5$;NS67!1H=3mBiIAaP<6Op^osWF8pQS{lK#HD$TaB>?S8)db z53biAqkHr#lsF4KQEzQ3H>rkE2xw?pegcsaA<5=8ao7X|rNle6+DvL5g#gPmI}tX% z%ocV9M4Q5&;(|+OG*z!cWRBN6efvCWD0q0ZoSCr^kq{Aq8}WR9&RAt@V-r?c7xu?t z${RpO^j{*AN*rkyKO}yo5pvv#kS7nLSgrvJQRL+uwLF>S;UxUK3Y~Kc?xw@-2DG4? 
zBAxJabuI+dLM@amLMHmSWd7lPXrZ$I1|}}MLoYDZWS#1P1vUObFNLH`BePR?h*jQ& zp-c=6pJikKdJK_(eRb*jt1J2pT8ilNtH7c-K23UulYnh$vM9Q(ClPiKRO# zxT~qN_%zGp20SgjePWng{UWye41DExEjZj!C#zqnn=og;#D4y^;(|^wQUx%FZ}Ant z0>Hw7T)Pq)24*~itNdq${e2|A#O@g54xnjpoHyIa-?CK)neL4wuQWMU<30ih)4I~D z-fadHs&ipX*Imj&2^Hgot-6zN;%gUbrW`kh4NRT3#}S%00V7=UQ71vqt-AQYfRKuR zI4C=ObSxs0pz7gedjbDn3{yq}v7P_x;mGToGBp=f{|R4kjeGCG0j_m^JA1``(`{<_ zz{5)iyTiaK*s_pyC;hBA_p^YvDS$CwQfhIX-QA)c@I2F^H8TW!o=NlLu_u7bE>kSE zKP%-VjQ-th#V|gU5M$K_3M&6cQc7f*7pF1Jwvz>cGGNZgk_>=gb(GfHvmb*ZY?A@g zMr|hW-6YZMKh>nRJE+X%1QRY+zvFkSsslEEvW%*(+)smF)QQI7BMdKTQDh>xseSi%#2 z-qolI5SiMci?;^WaWcQ#h!TNBXrN+G#jr38qqD>kDw4~k%@)TFxsdz`R@{1iX>r$( z3;{r}CjhKgks3e#qaUShux-3ThJX93V{~=qYz&m?yQ>7lM(Y9uF&J`L5Jqvf*Z20#0V|DWk)?di(8!a(|Ks>%zzn%W} zKQl8}yX}Sq+KxQYCtSlCX2c(n+C93bv(Rhc?~5qN^R49mInU*A0g3rQL`aGY6?>|rt z3c?f4#Ud%p*@W-==e;E~K8ssH&@Vuky*TP!Nmn(TTrh34Up16OKsHhNlB&j#l)7+@ zy`9~;bW?VOgfSy&O!V!i*5}`5c3+^2xiCN=QIV-|P5sf`O&d|Y(DV9BBTNV=xPxlU zO&1Ud5TtSgg$oz8VIe?5Xhj{_rBR5&ENN^*o&5p3egWCZD}-#@-{Js0yUT|q^yeX< zh}sZ&f<&Gwi@mY3EaOdZ)ZM0;oT6#kzeg@EnQ32eeGAn?KZgb_zDpN%qByIBjQ;-k zH>)=8TXI>wlvMr$C+w`E&N}m{lZmc;yhqSZ!=gxAw}8E0L}hvBH!zlkDzs}TlKqK{ zlz67oeeIR|btmmNj-+kvPRb?nYMmKM@b>cHw~XY4jO(=PfG8`!lG6|`+D0|KJMO== zwY^bMfX2FglOXkKv$4T4p8${lkCrsz+h6BTn|mVg2>q6H)u>Fp;(LWa3Wsrt-^h7P z)mbstpU{yt`!TS|MUu=|P0Q?-CBm6(`U%!pgG>VgSm{34Nx55U2KeNj` z_dVUH-<1s&Gz24JPPvKLOuDrR?h_?OjT>R!xx|N%;&P)e;K%@Ny2QvL{<-ph=}R^9 zy%%ueAIW=Zv(yIdEZsyf_sQhWO zr9KwWtwu#DMRDK~T>@BENVBxaRL4d)p4(VEgrR~42`Dsd!EUd_!#Ro)-^ug{a#_(n zR?{!Fsj7{w zpVbt#diHXmv(pvVkA4iv{1r!h+?nHBPDAE(+1UoJuJwlgucG2!US1v^P2Js}P{WgC zsEg?fFA2W}oO%rD7Ut#a@H7%mEG&3F-<_gEC&&~E`Q9xqEu|Yr{{Xg(3`CHVr>Byf zT%T!epQe$xMf0!OFZ1rZsapE~!ZY=))NYlVh1mRV&R6yFw2KR;{$}71+OoxeWdH^p zVIz(1XQ!85_VjT@0;}BTX3Q?!gAW&(71hA@C12(hP+D7A<3w=~zrn!301oKu^F7W2 z7Z+Eb=076cf#83lIN>jAGSJrbBld`#G(QQSi#b5hs_BCFY|RyMRu^Y`2zg^5&x};8 ze$IR5djpkxRct@u?zXn&wYARHg66`uV+Z!J5Rd?=1{m|12PURIW9+}_2|1xI>47k^m??tMm43x#i#ZpRXuT%FQpmt%t 
zF3mxJ03*?|hpSo@8xW{?9M^r6m59lcZKRuJRImX_7LZjb=n7+F6?AoVUw+Gi&o5wQ zB=whm0}2w<9|@WbbF+M@-h;2Jgsvixx=Gk@x_pP;fAHTlOlFJ;dO))Rus2%1-yZw~J%EChmOw=?>BgA( z&>G+RpC%xnR^?nT+7XwQ8i27X7g5dAGUsLz>dd2T#X=Pa=K%^t7GNu9b#b@lz;OYV zej7P|i&JdHE3>{-Qn&tREr3|UtiJl)&-_t9KgTO?Rj*W{>aVRD1!usQab_{&=ewy2 z;6lCnDmdwXgaRqw{$NcQasQOkSWJ*!a`<0ez9CIIaa2I~y5p4rgaUxh+|a32iI1W9 z9$mgWH>)e`{9CzVbhOren+pLPGkI9Tm2YkPlgA0ZN%?sc@8 zk~#vDjI%2v=6m|wYyM}@(neZG#gf7v^VL5DC>+|L`0MbU%+MRs+ zg`$PYz;{XvG1qw~Ul|apK7Fy8l5Lv*t+qh8CBTEYGB=@HM=}HmRh_krS~O6S$p&!q z-#ThJJpUlec=B2O=3QJu_k7Vr0ev6ZjRv=z3Tsv+N-^Qss4nHjS7BUA?O&@sd*E0I z4H^O}5bC8P5&U2q&ENahl>#@~Z#?+-WJZCXte?|L6zz!T zB#YA;$x|95^ZJvC{uhI7mK}blmb%fWRVT~){nGvPLnQnFG!4L62JL{)K2FjJh_m=} zZWylA61?+9DF26En$&l*wF-z-DA2E!^rx3=iv9t1c+i12-QNE~pE8&FT__H?aJ-nn ze6XC;(3nZw`2T_{jlsznd6J6#jqylajMZHGZ2Pa+lMl)(Fqeup(ru-b;QwAq9K5e#B5)#098am>5t}>h{zufMSea2bfi*E(~%&Ys+ z8VS|5cA~I%2vr>wYc>u6LP`PC^>BIYzuYc`-+vwM{)%eb*v+6^*UncRKx)L#4DgwB65tmBj*Fq7&X&>^n4F0r9`-8E#2z%QAHs*4jq$P&r1-%nG|zszX=)3+Uya$n%x~l!=I$Rmd#CVR~8S)BnO83X&{NCG7^EsAr z!M=!#Oo-8c@zDUVA8a*^EKrM7Ml+irPbh9%;4cJs zOTgJ69=x3#t=XVuhYp>`l^ZFM%#cS?hL=>Q*C5HRLdTIrm5JZ~=O!Txs=K50e&oVS zP*;Cs^U|w$PMRI#$`gmhmZ~@Fprn1##wo59}-bTWB?uS3XHDfNOu$ zov*WKtndDAVlQfKf=C1P$Pa9h<>l;1#e?(9D`zAdl50KCD=TCe`;Y*uUO z2L6}omVJ={#Rhqt79e!h^Iq?FcliNJ*U$@drh#7tWgnrD(o4{uSz+aBOd^+$jmjRK zphFrB9DSjbXM@mT#cRIbUQsX^vf$h&fhWF?6uv+EZakjsl%yoPqkz%~x!z+#xft38 z4)-L>yR!TC8Vf9z@jkuZ?(IBLGM_jTwzHk}peCh!_R`OAbX(}lJiVXjy(a;`VS#>C zGkc`9NGR(qd)#w^ehJ+gg|w;4Dm4#g8qMdd4p*~KFSmJhbt=(dxL&b*q_k>S<6Coi zg`cAu6Ao}*C|JR#T;g}?2%Tj66)pU7NX9jeQ)2nN?RR&Xn6cW>9H@5pL5TPexboTP zh6QbBQMKL<(_0Q5(ft{@ZV+hsESsS9Sjy3sxte{0+k<7PzkB;R5E8^6T@B-(L2p;O zt?_rK^7Nru@uD1w0^TLN`Ed*H+GpkHC_w4@{lRT7aKTfki}84Gr;~^Thy(F#LHIoMM642kxmv=Q5C zZIti(giz(nc9*e04E`zJ#ve`p&V!JVT)RqQA4#D#p|-@AgbQ4RE!MyG)T+)L&9yB@ zT{8K98w}w^Z2!L4o*ycV@aVWLz~}OUM4a6f=Vk_3Z}rJfehgtV!7fa#xs>lZCa>zY zDuiLx$bE2{gzMcTFRFp)HKt%f70-XF5}I{6byE9_meR#GFM7!;T+^;V&nCq}gx=_C 
z=37ZxpaCKzddcXFb#4AxpR{pie$0{(6aW$K5USg3;6?PnTPbLp77T>OK#>5pz|(x>CwW3{5yom>`QKR=;YAIO@> zLJJ{lqGo8x?JIz1x*iL7LASR1mA1Rs$sdEBFw2l0mB)Su1+ znjsL%$Rkd+9S=YK{cxlP;(biVWl}=EToQ)th!qOD$=odoXx;u3#{;hDGr6R#dUZOV zN}iKlDbGo1bjl17k|L-$E>*eyWLPsO)7C_;jM=*{51*drr$&7^KJwnT*0aBst~c~M z_0C9JjbKK)lk|7oi@2Uu97RcMc_5X0@Xz{so?@v0uab?w zcd>jX)FuXI(+CqNr6#NfmsehwN|c3-D3Z@Q$`PE)=7{&|ac@<7cQrQuU9@^Bg9M3; z&WXP)n=ZiU*_){fuD{e#_#1gHt)11`MgAOhKKf)|khG^-D?e+-6OfR$u*GWu6MVLt z_<|880Ry^enoA|~TG$z9JxrJ(p<(_Y5LbgrIEl}t=b1pdfOnX~%i=V0+lvGWiaSe* zJDZOROVs;-7ubDS-_xep?Rt>(UimSxO8R*)x7*Xs@4^jUgmwRy@Q5Ne1N6BFCqt1_>%>Z3yy2n zeiEBrGkjUH#mCfO!5IEP+5cd;)a+PpzM|t+RDXcDKCQVrH}7*y9ojGVpf3TfImHDE z3ZjiF0@8jcweFk(c*z~9CF;h;ojE1SM^en}fDj}CW}i7H&LHTdiy&F@#O36;pB*{O zWL^DK^--}^pC#!Sn-*;-K}@g!YPA=!yskh>{->^XPTLz+^cE%_XMgaSpqU#3C`hWQ zE|1+ZM$vy-TUR%;`s(3E1u1tFMRRknb?R>fA`oy3`{E38hks|6U-K88VtG_DdsPB)vbW{GqTfjQi4Fi z;gD+RV%}1FzyinaQ8EXFraG4aOS1i6I7X8q`U^~Lw&K2xKT}g*AC?~yW}f*~b^H_5 zX)ZlczS(mvnyH)Y3DN3#8jK8H{Fw+otK&5p-x}S((OBztdgqNaW^}k_>eA zJ`G$Kdf8v=0|JqE@7bqw(z9bp6{-R0XQda%zd0ROnf-Hoyki`xcOHlj0{1uB;W|t4 z)xFt68Go<3&vL!ylh5t=-CwN)_n&knl`f7dklI^J3Rll5N*7p_Je8N{}`+RiX8-|z2*sSf#jX<5K0u%OMO zuQ7BRmkeupRn6dycWAh%Sci5IqoP5Jkzy$MXreHph!5|w=gItpr>G+{l8cc! 
z6v)4-5LncQA# zIEZiy^QGeUm$8M5j@_@_E^&CCD3%UklW#cR6e+b2gk9iar8vX-jX$!!&o+sKMbIX| zoA<-xya?GKt4u1eqeBn7N0|Ed_gX6Yx7_3J^~Msjy|v%iME2Qr-3Ns11D*`TLrTmG|iImm0?o{DlOOF$B3f^1X zYw|Cbj!ZCnOkai${Yhl$&RPw8=!MN`!WTSlMF)&|r#q#!XhO$+iQu;&Ub+)_ms4ZC z>GpqNf+p|37}^VQ+Z%LD(U*_(MS?_M{Tg$Z>9ucy3oRNo)BU3w0{Vxv4jpyIb`N}R zt-o0$9v_|10G@|$-ODEa&$d?N_z&n#5_<{Fw%_*u#`;QQd=m%Nd96RaXZt=?(i3&b zz01eiF+f$|;v(Udp(Z5zdwz3FjN3ii7Nlg7+WWg;MS4_Xi-W^>Q<#6edX?!q*a?JG zfi^t|kV@B%E2+^!$#H6OYy?085NBKA^cX0AAmh@|g#p%xxxqJ>(jKX4$P~88<49Wjun7qok1^b} z*&jaNg8*5^DY`dVG&>Px@*f>=wlDsIMh?wOsKn|`1U#9XR)1wlUCiJt>~`wkt_>)$ zvcEC5hDCE~Y*iOewvV!gFm^uTkBt!Mb!0=9WsWYlWk7&D=Hsx$t=a`1c$eGUFZrMa zt%rBfDE+o0ps)ItUk2`uFj%NKToqMhQ|is|;Cv;Pl1ra9Qt;ugyJ^hUx2 z%hPL7_%l_STDxSzXw0lnBY1oQOKJApwY@Oo;+ZG6Ur4Xmu)4TjRF~&ONu; zFG{PNb|>lwET=EJl;)Coo=E%_F5>xk@FJXbjdwU`3>PlQ+FZv;T<_^t3MOs{y!>&B zy*1Q&?H`45Ifc{(n1kTgm6nBRKGrHl7X}dZgG~0w7Ib{f$oWdvq)ey{pg>H++-s0k zX9bHiRmZ^kD#~I6UVg~qN&uNfhwuCG9#AR^83&p~b9y}WNoLZoPnq2%tTu-us90Dh z+2ui%SO5jFKuCMCva=dwE}`la<1kKp`gx1jrkE(ZLWK5izjq zl^YwjG*F11&f?GBkB&bv!Wgz`MNvU&MJT|ws%&tqr`(3h9pi+`{%vlfGFt(T^^D8n z9)F9tLQNg_V@l_4blmg4L>A5b51jZe-NhO0c~3OVxX<@K`x~6OCC`i5sY8D>vJ5t(=_Eu>PJjl!gy$3 zKGOhK3h-I5-XsL5P-so#{W*eE7zgyLd<+U*;+kO2>rH@Hye_)?xP#f-wl7G2`SxLA zD0|^r7{2nwo0-FdG_-#qTajw1*8K$^195j$*n-e9OVr9^sIrO0qgFx`T7dgA1 zy@(Aha;n&pxysMI5Td%MP)43D+}x>|VB6iNJilC82WNI_iFwr-#?z5`mU*D5=8}FV zWc{$*YR1g8r)upeZ3gHF7v1@l2zC3k(2?!8>kA#GPg1rFP;;d;03_0|bePW8v{h~C;EFrc6#3`EeE zNkwp-bZ;d(Caum`_=w0_fHE%o=e$Uxi>cJ#?QkH1?Qm^MUKd}S9-XphUmDX!5cXtw8pT>2xSkUp) z=-r1uB5k7%jG)lgevHL%Z$iu!b~dOCb56zQhaOy8Y-s&e*oeol;(Mdd{8#e-tVc-s$`Fmh;<4TdLq1$c8Wa5W)yb&U)Ht zG@9WMK;*~%P1;Joy@!83pDoTrTZ&1uin*_`{-KJ7gEDU>$NCYK+(N8@awPK?nknJr zdXh~3Gj&H}_VEkzzeF02#eqU?m^cm+Z@-OC5&puz!%a8w{}@9d`cHX0|2QK~9*E*F zfO?rHoe%I323$3Hl4-->pT!Yc28#y5CHkaZ8O-C1#Cv8^K&u^SvPrLiLHrpg>i+NL zSRZlG58Ot~**R^<5(snBOx*~@F+L#C?kwzx5SLE0s&>hm!Hgu2(a$Q^Oesp*OTs-p}`#=1Odc2%9J_@aR$G7~cqZ){5jN79 z_4TeVQ~6+}+$w#bWpO0Ay8F#XN@@V;?aRo>cp6DhNMIAOh5*ga&lgC=@&~*SJg!cD 
zdhhVMG-W;n%cr}%KHY}F&h0=A#F0E-Z1qJ&Awn5uWM*cjr7fr~aOo6j@5;r$7**Z9o$y1i0 z#zYNwb$2&tb}#_?AA_ObHU}3fbg+ntU;bt~ZaeXUK&0`3``MneO=zvHtutUHt;PJF zAaZzI{DCMOM#XIYq7Oo`QuXy}u9Rfy@|LyQVV#`p>{oz&dUtmR+p%zBe8)&sQBi@d z@?D;0=hDk*XJuuj{7VGTOu2F2larGJ3=D&GrJt~oT3=;ITU#5gQjWH^_QTVY&1dmE zGP8zkkSOCR?1_lN;Hb*1LjA$hM4PKYqW`l9of#*Q0sWFLCCbxop zNLE%BaIb;J2S`v|eZ3A-N^I=W6amoaI5jmj;>pg=UaXMG?Qx>b$XI@Jmim+ZT5*E!I0u8t%JPxZv2~?dO9b)3*V1>*kptO^K!*X_h z9*7`S+LtkIZ-MX6M&ja7#GYGDEkKjUIZ*hx*5;0c%e2^NtEH~~!)32Bge>r-wz-*y zipp%I$(|w5#>R%<=cYuTW#5UHmzaf-5y=DS(7=TiA2$ioW=awBjvQzxDuO``1v-1< z$v)*|WRz)!wY0PV7CTTA0Nm7-$;rv3rN^&m!Vz(C0k5|U5FlVwfw`FTDX`1s;bxdN zcPxWTjso1?{=8Ug1_cEr9ZQ&;n3%5(gN2RF>9{Tk60x(hQ&pV?#=KO$j4~I9B4^8u z$3PPaFR{aHi3$P&g6H}ATz)o}?L3{|W*^!oLH8r<_`$E=UhY;Lfxz)&<$8WzlQcdx zCkItJaWI~2a%iZgw)XPwuFL%x_ee%6{7RLM30ICXp`)Y2_;?CiEbCKXPLFYxusjgM}4C=k&Sx5-E|G zo_@Hq0~LrN*px@;MOSIB0|c>u;>_El6?-}pI{brQSJ7U*+A{XnTt!7id}N>7hb z63|%EtRhW#bKYb9oZs8qySZu1gi8brxYY#l0}XLdPfFgZlJb|fbM(;`=N-3P1Q~gG zc{w>odU}<#snu53q@<)JyF1_WPOww**w`4*WrGCj2xXP^|=Zn+^+g0j-6+fYun47q2=i5EAzXsCMw?-qu%I>%sOdfH_$ zj-*_(a%5;INbeAbQIpA_?FI;Qz_YRI#H%|3aY{u2)$(38zm4;W9vgABmW7Ea6bcMH zfAopjLcW?G$Uwi2f!IH}9~4XU#`y9nSVUQeHvg)HUEjdq)ALC^ka6feGhiTr#zqbG zMX@|(I&6jxc+f}3^$s<#qP2_G-U86(ty6Dl(CQ*NC9UNKlPm|@b#`*{vK3Ay{ICgy z83~QnHM(~Nj4Xm;@t4r@@^W8aU+H+#%9oSbQZ*zbB%E70DJdzVOZ5@aN@V|PaWS!h zSfbRlGY^&STY2grv&FasW1U?)X}IygAk+S(e3FM_13+}!i)>u-PZ#3>UZ#PWc~`^V>J zASw4!_-bm-A50a%K=v#c1sdq;j_hB5{rVLrI13Vw)lksu!dP6q4@fla?d?F}ck=ra zFls>CdsA0e&g4FDLFG4oAP_KJs;a7h7>yArp;~&=^A!%r5*fVCl+pbz?(Y9|{R5TY zh=_=gkdT{;b;ZShfdhW{@BujErlzK-AgqCG0Z&v?L2Vr!pfenZOb{Tmff!jOB~}&| z2RpkCAV^i{)O!i812>pV&~ zIX@{$O;akoNK97K(pWn{j9!@O`sI)07V*q~JOdEE!LPt;UK2wzm_x|SQmjTsY z7{(h`?&Ly_tezfGw&Fq^*m(2147E#ZM`h*s`uchaD?7+bFlX2tbtnO7RjFY`#O}kz zM(iIMvH9s=*yW4_{ERrF1{o%hn4Fwk%GF4xj7$gs!K{NYk9l~7JdLfRqqOITJp3@j zLV8M$moZr3Tvl{=Wlu3SVq+5%(IQo@@vjgPin@|$jiQ%`P$A?7Pwek##3ercXMy{v Rzz1A_WF-|Ps>F;#{vV?@oK*k- diff --git a/doc/figures/fpga.tex b/doc/figures/fpga.tex index 
12f0bcf5..9eafda95 100644 --- a/doc/figures/fpga.tex +++ b/doc/figures/fpga.tex @@ -359,8 +359,8 @@ \draw[arrows=-,color=black] (links.north) to (boardrouter.south); % Is the board router connected to off-chip RAM? - \draw[arrows=-,color=black] (ram0.east) to (boardrouter.west); - \draw[arrows=-,color=black] (ram1.west) to (boardrouter.east); + \draw[arrows=-,color=mygreen] (ram0.east) to (boardrouter.west); + \draw[arrows=-,color=mygreen] (ram1.west) to (boardrouter.east); \end{tikzpicture} diff --git a/hostlink/HostLink.cpp b/hostlink/HostLink.cpp index aa4d3af6..4708457e 100644 --- a/hostlink/HostLink.cpp +++ b/hostlink/HostLink.cpp @@ -218,8 +218,9 @@ void HostLink::fromAddr(uint32_t addr, uint32_t* meshX, uint32_t* meshY, *meshY = addr; } -// Inject a message via PCIe (blocking by default) -bool HostLink::send(uint32_t dest, uint32_t numFlits, void* payload, bool block) +// Internal helper for sending messages +bool HostLink::sendHelper(uint32_t dest, uint32_t numFlits, void* payload, + bool block, uint32_t key) { assert(useSendBuffer ? 
block : true); @@ -242,7 +243,7 @@ bool HostLink::send(uint32_t dest, uint32_t numFlits, void* payload, bool block) buffer[0] = dest; buffer[1] = 0; buffer[2] = (numFlits-1) << 24; - buffer[3] = 0; + buffer[3] = key; // Fill in message payload memcpy(&buffer[4], payload, numFlits*16); @@ -285,6 +286,13 @@ bool HostLink::send(uint32_t dest, uint32_t numFlits, void* payload, bool block) } } + +// Inject a message via PCIe (blocking by default) +bool HostLink::send(uint32_t dest, uint32_t numFlits, void* msg, bool block) +{ + return sendHelper(dest, numFlits, msg, block, 0); +} + // Flush the send buffer void HostLink::flush() { @@ -298,7 +306,28 @@ void HostLink::flush() // Try to send a message (non-blocking, returns true on success) bool HostLink::trySend(uint32_t dest, uint32_t numFlits, void* msg) { - return send(dest, numFlits, msg, false); + return sendHelper(dest, numFlits, msg, false, 0); +} + +// Send a message using routing key (blocking by default) +bool HostLink::keySend(uint32_t key, uint32_t numFlits, + void* msg, bool block) +{ + uint32_t useRoutingKey = 1 << ( + TinselLogThreadsPerCore + TinselLogCoresPerMailbox + + TinselMailboxMeshXBits + TinselMailboxMeshYBits + + TinselMeshXBits + TinselMeshYBits + 2); + return sendHelper(useRoutingKey, numFlits, msg, block, key); +} + +// Try to send using routing key (non-blocking, returns true on success) +bool HostLink::keyTrySend(uint32_t key, uint32_t numFlits, void* msg) +{ + uint32_t useRoutingKey = 1 << ( + TinselLogThreadsPerCore + TinselLogCoresPerMailbox + + TinselMailboxMeshXBits + TinselMailboxMeshYBits + + TinselMeshXBits + TinselMeshYBits + 2); + return sendHelper(useRoutingKey, numFlits, msg, false, key); } // Receive a message via PCIe (blocking) diff --git a/hostlink/HostLink.h b/hostlink/HostLink.h index 81c9b32f..f6a7a71c 100644 --- a/hostlink/HostLink.h +++ b/hostlink/HostLink.h @@ -35,6 +35,10 @@ class HostLink { // Internal constructor void constructor(uint32_t numBoxesX, uint32_t 
numBoxesY); + + // Internal helper for sending messages + bool sendHelper(uint32_t dest, uint32_t numFlits, void* payload, + bool block, uint32_t key); public: // Dimensions of board mesh int meshXLen; @@ -65,6 +69,12 @@ class HostLink { // Try to send a message (non-blocking, returns true on success) bool trySend(uint32_t dest, uint32_t numFlits, void* msg); + // Send a message using routing key (blocking by default) + bool keySend(uint32_t key, uint32_t numFlits, void* msg, bool block = true); + + // Try to send using routing key (non-blocking, returns true on success) + bool keyTrySend(uint32_t key, uint32_t numFlits, void* msg); + // Receive a max-sized message (blocking) void recv(void* msg); diff --git a/include/tinsel-interface.h b/include/tinsel-interface.h index 93b5ec96..352b4461 100644 --- a/include/tinsel-interface.h +++ b/include/tinsel-interface.h @@ -166,7 +166,7 @@ INLINE uint32_t tinselAccId( uint32_t tileX, uint32_t tileY) { uint32_t addr; - addr = 0x4; + addr = 0x8; addr = (addr << TinselMeshYBits) | boardY; addr = (addr << TinselMeshXBits) | boardX; addr = (addr << TinselMailboxMeshYBits) | tileY; diff --git a/include/tinsel.h b/include/tinsel.h index 9ebd8451..ec26b849 100644 --- a/include/tinsel.h +++ b/include/tinsel.h @@ -176,6 +176,16 @@ INLINE void tinselSend(int dest, volatile void* addr) tinselMulticast(dest >> 6, high, low, addr); } +// Send message at addr using given routing key +INLINE void tinselKeySend(int key, volatile void* addr) +{ + // Special address to signify use of routing key + uint32_t useRoutingKey = 1 << + (TinselMailboxMeshYBits + TinselMailboxMeshXBits + + TinselMeshXBits + TinselMeshYBits + 2); + tinselMulticast(useRoutingKey, 0, key, addr); +} + // Receive message INLINE volatile void* tinselRecv() { diff --git a/rtl/DE5BridgeTop.bsv b/rtl/DE5BridgeTop.bsv index 5dce9e25..15e2ba8f 100644 --- a/rtl/DE5BridgeTop.bsv +++ b/rtl/DE5BridgeTop.bsv @@ -12,9 +12,10 @@ // 1. DA: Destination address (4 bytes) // 2. 
NM: Number of messages that follow minus one (4 bytes) // 3. FM: Number of flit payloads per message minus one (1 byte) -// 4. Padding (7 bytes) -// 5. (NM+1)*(FM+1) flit payloads ((NM+1)*(FM+1)*BytesPerFlit bytes) -// 6. Goto step 1 +// 4. Padding (3 bytes) +// 5. Routing key (optional, 4 bytes) +// 6. (NM+1)*(FM+1) flit payloads ((NM+1)*(FM+1)*BytesPerFlit bytes) +// 7. Goto step 1 // // The format of the data stream in the FPGA->PC direction is simply // raw flit payloads. @@ -161,6 +162,7 @@ module de5BridgeTop (DE5BridgeTop); Reg#(Bit#(32)) fromPCIeDA <- mkConfigRegU; Reg#(Bit#(32)) fromPCIeNM <- mkConfigRegU; Reg#(Bit#(8)) fromPCIeFM <- mkConfigRegU; + Reg#(Bit#(32)) fromPCIeKey <- mkConfigRegU; Reg#(Bit#(1)) toLinkState <- mkConfigReg(0); Reg#(Bit#(32)) messageCount <- mkConfigReg(0); @@ -182,6 +184,7 @@ module de5BridgeTop (DE5BridgeTop); fromPCIeDA <= data[31:0]; fromPCIeNM <= data[63:32]; fromPCIeFM <= data[95:88]; + fromPCIeKey <= data[127:96]; toLinkState <= 1; fromPCIe.get; end @@ -203,6 +206,10 @@ module de5BridgeTop (DE5BridgeTop); Flit flit; flit.dest.addr = unpack(truncate(fromPCIeDA[31:`LogThreadsPerMailbox])); flit.dest.threads = pack(destThreads); + // If address says to use routing key, then use it + if (flit.dest.addr.isKey) begin + flit.dest.threads = zeroExtend(fromPCIeKey); + end flit.payload = fromPCIe.value; flit.notFinalFlit = True; flit.isIdleToken = False; diff --git a/rtl/Globals.bsv b/rtl/Globals.bsv index a2648a23..42ea3c70 100644 --- a/rtl/Globals.bsv +++ b/rtl/Globals.bsv @@ -20,10 +20,13 @@ typedef struct { // destination board, it is routed either left or right depending // the contents of the host bit. This is to support bridge boards // connected at the east/west rims of the FPGA mesh. +// The 'isKey' bit means that the destination is a routing key, held +// in the botom 32 bits of the 'NetAddr'. // The 'acc' bit means message is routed to a custom accelerator rather // than a mailbox. 
typedef struct { Bool acc; + Bool isKey; Option#(Bit#(1)) host; BoardId board; MailboxId mbox; @@ -42,6 +45,9 @@ typedef struct { function MailboxId getMailboxId(NetAddr addr) = addr.addr.mbox; +// Extract routing key from network address +function Bit#(32) getRoutingKey(NetAddr addr) = truncate(pack(addr)); + // ============================================================================ // Messages // ============================================================================ diff --git a/rtl/IdleDetector.bsv b/rtl/IdleDetector.bsv index 0307f198..4cb3ccc5 100644 --- a/rtl/IdleDetector.bsv +++ b/rtl/IdleDetector.bsv @@ -221,6 +221,7 @@ module mkIdleDetector (IdleDetector); NetAddr { addr: MailboxNetAddr { acc: False, + isKey: False, host: option(True, 0), board: BoardId { y: 0, x: 0 }, mbox: MailboxId { y: 0, x: 0 } @@ -538,6 +539,7 @@ module mkIdleDetectMaster (IdleDetectMaster); NetAddr { addr: MailboxNetAddr { acc: False, + isKey: False, host: option(False, 0), board: BoardId { y: truncate(boardY), x: truncate(boardX) }, mbox: MailboxId { y: 0, x: 0 } From ec7dd777c7cac19f993a10d981976a72d111e0b4 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 4 Feb 2020 16:07:23 +0000 Subject: [PATCH 05/78] New types for routing keys, records, etc. 
--- rtl/Globals.bsv | 2 +- rtl/ProgRouting.bsv | 111 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 1 deletion(-) create mode 100644 rtl/ProgRouting.bsv diff --git a/rtl/Globals.bsv b/rtl/Globals.bsv index 42ea3c70..914852e9 100644 --- a/rtl/Globals.bsv +++ b/rtl/Globals.bsv @@ -46,7 +46,7 @@ typedef struct { function MailboxId getMailboxId(NetAddr addr) = addr.addr.mbox; // Extract routing key from network address -function Bit#(32) getRoutingKey(NetAddr addr) = truncate(pack(addr)); +function Bit#(32) getRoutingKeyRaw(NetAddr addr) = truncate(pack(addr)); // ============================================================================ // Messages diff --git a/rtl/ProgRouting.bsv b/rtl/ProgRouting.bsv new file mode 100644 index 00000000..b3819180 --- /dev/null +++ b/rtl/ProgRouting.bsv @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: BSD-2-Clause +package ProgRouting; + +// Functions and data types for programmable routers + +// ============================================================================= +// Routing keys and beats +// ============================================================================= + +// A routing record is either 40 bits or 80 bits in size (aligned on a +// 40-bit or 80-bit boundary respectively). Multiple records are +// packed into a 256-bit DRAM beat (aligned on a 256-bit boundary). +// The most significant 16 bits of the beat contain a count of the +// number of records in the beat (in the range 1 to 6 inclusive). The +// remaining 240 bits contain records. The first record lies in the +// least-significant bits of the beat. The size portion of the routing +// key contains the number of contiguous DRAM beats holding all +// records for the key. 
+ +// 256-bit routing beat +typedef struct { + // Number of 40-bit record chunks present + Bit#(16) size; + // The 40-bit record chunks + Vector#(6, Bit#(40)) chunks; +} RoutingBeat deriving (Bits); + +// 32-bit routing key +typedef struct { + // Pointer to array of routing beats containing routing records + Bit#(26) ptr; + // Number of beats in the array + Bit#(6) numBeats; +} RoutingKey deriving (Bits); + +// ============================================================================= +// Types of routing record +// ============================================================================= + +typedef enum { + URM1 = 3'd0, // 40-bit Unicast Router-to-Mailbox + URM2 = 3'd1, // 80-bit Unicast Router-to-Mailbox + RR = 3'd2, // 40-bit Router-to-Router + MRM = 3'd3, // 80-bit Multicast Router-to-Mailbox + IND = 3'd4 // 40-bit Indirection +} RoutingRecordTag; + +// 40-bit Unicast Router-to-Mailbox (URM1) record +typedef struct { + // Record type + RoutingRecordTag tag; + // Mailbox destination + Bit#(4) mbox; + // Mailbox-local thread identifier + Bit#(6) thread; + // Local key. The first word of the message + // payload is overwritten with this. + Bit#(27) localKey; +} URM1Record deriving (Bits); + +// 80-bit Unicast Router-to-Mailbox (URM2) record +typedef struct { + // Record type + RoutingRecordTag tag; + // Mailbox destination + Bit#(4) mbox; + // Mailbox-local thread identifier + Bit#(6) thread; + // Currently unused + Bit#(3) unused; + // Local key. The first two words of the message + // payload is overwritten with this. 
+ Bit#(64) localKey; +} URM2Record deriving (Bits); + +// 40-bit Router-to-Router (RR) record +typedef struct { + // Record type + RoutingRecordTag tag; + // Direction (N, S, E, or W) + Bit#(2) dir; + // Currently unused + Bit#(3) unused; + // New 32-bit routing key that will replace the one in the + // current message for the next hop of the message's journey + Bit#(32) newKey; +} RRRecord deriving (Bits); + +// 80-bit Multicast Router-to-Mailbox (MRM) record +typedef struct { + // Record type + RoutingRecordTag tag; + // Mailbox destination + Bit#(4) mbox; + // Currently unused + Bit#(9) unused; + // Mailbox-local destination mask + Bit#(64) destMask; +} MRMRecord deriving (Bits); + +// 40-bit Indirection (IND) record: +typedef struct { + // Record type + RoutingRecordTag tag; + // Currently unused + Bit#(5) unused; + // New 32-bit routing key for new set of records on current router + Bit#(32) newKey; +} MRMRecord deriving (Bits); + +endpackage From 57346c9f760da882943419ea1fda3d27451d38f1 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Thu, 19 Mar 2020 11:29:22 +0000 Subject: [PATCH 06/78] Give prog-router multiple ports to each DRAM --- config.py | 4 ++++ rtl/Connections.bsv | 9 ++++----- rtl/Network.bsv | 25 +++++++++++++++++-------- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/config.py b/config.py index 74c7f63e..9021804c 100755 --- a/config.py +++ b/config.py @@ -360,6 +360,10 @@ def quoted(s): return "'\"" + s + "\"'" # Number of FPGA boards per box (including bridge board) p["BoardsPerBox"] = p["MeshXLenWithinBox"] * p["MeshYLenWithinBox"] + 1 +# Number of fetchers in the per-board programmable router +# (Currently assumed to be 4) +p["FetchersPerProgRouter"] = 4 + #============================================================================== # Main #============================================================================== diff --git a/rtl/Connections.bsv b/rtl/Connections.bsv index 4de1c4d8..013224c9 100644 --- 
a/rtl/Connections.bsv +++ b/rtl/Connections.bsv @@ -55,7 +55,8 @@ module connectClientsToOffChipRAM#( // Data caches Vector#(`DCachesPerDRAM, DCache) caches, // Programmable per-board router, reqs and resps - BOut#(DRAMReq) routerReqs, In#(DRAMResp) routerResps, + Vector#(`FetchersPerProgRouter, BOut#(DRAMReq)) routerReqs, + Vector#(`FetchersPerProgRouter, In#(DRAMResp)) routerResps, // Off-chip memory OffChipRAM ram) (); @@ -63,8 +64,7 @@ module connectClientsToOffChipRAM#( function getReqOut(cache) = cache.reqOut; let reqs <- mkMergeTreeB(Fair, mkUGShiftQueue1(QueueOptFmax), - append(map(getReqOut, caches), - cons(routerReqs, nil))); + append(map(getReqOut, caches), routerReqs)); connectUsing(mkUGQueue, reqs, ram.reqIn); // Connect load responses @@ -73,8 +73,7 @@ module connectClientsToOffChipRAM#( let ramResps <- mkResponseDistributor( getRespKey, mkUGShiftQueue2(QueueOptFmax), - append(map(getRespIn, caches), - cons(routerResps, nil))); + append(map(getRespIn, caches), routerResps)); connectDirect(ram.respOut, ramResps); endmodule diff --git a/rtl/Network.bsv b/rtl/Network.bsv index 642acfcb..99229c96 100644 --- a/rtl/Network.bsv +++ b/rtl/Network.bsv @@ -381,8 +381,10 @@ interface NoC; interface Vector#(`NumEastWestLinks, AvalonMac) west; `endif // Connections to off-chip memory (for the programmable router) - interface Vector#(`DRAMsPerBoard, BOut#(DRAMReq)) dramReqs; - interface Vector#(`DRAMsPerBoard, In#(DRAMResp)) dramResps; + interface Vector#(`DRAMsPerBoard, + Vector#(`FetchersPerProgRouter, BOut#(DRAMReq))) dramReqs; + interface Vector#(`DRAMsPerBoard, + Vector#(`FetchersPerProgRouter, In#(DRAMResp))) dramResps; endinterface module mkNoC#( @@ -404,12 +406,17 @@ module mkNoC#( mapM(mkBoardLink(linkEnable[3]), westSocket); // Responses from off-chip memory - Vector#(`DRAMsPerBoard, InPort#(DRAMResp)) dramRespPort <- - replicateM(mkInPort); + Vector#(`DRAMsPerBoard, + Vector#(`FetchersPerProgRouter, InPort#(DRAMResp))) dramRespPort <- + 
replicateM(replicateM(mkInPort)); // Requests to off-chip memory - Vector#(`DRAMsPerBoard, Queue1#(DRAMReq)) dramReqQueues <- - replicateM(mkUGShiftQueue1(QueueOptFmax)); + Vector#(`DRAMsPerBoard, + Vector#(`FetchersPerProgRouter, Queue1#(DRAMReq))) dramReqQueues <- + replicateM(replicateM(mkUGShiftQueue1(QueueOptFmax))); + + // Dimension-ordered routers + // ------------------------- // Create mailbox routers Vector#(`MailboxMeshYLen, @@ -567,10 +574,12 @@ module mkNoC#( `endif // Requests to off-chip memory - interface dramReqs = Vector::map(queueToBOut, dramReqQueues); + interface dramReqs = + Vector::map(Vector::map(queueToBOut), dramReqQueues); // Responses from off-chip memory - interface dramResps = Vector::map(getIn, dramRespPort); + interface dramResps = + Vector::map(Vector::map(getIn), dramRespPort); endmodule From 77c62f77a4d7353b9865aac7deb8c334d893f840 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 24 Mar 2020 17:33:19 +0000 Subject: [PATCH 07/78] Generalise DRAMResp Need to support inflight request info for programmable routers now, in addition to caches. Therefore, generalise the info field in a DRAMResp. Need to check that the optimiser does what I expect in this case, i.e. throws away unused bits in the info field. 
--- config.py | 6 ++- rtl/DCache.bsv | 9 ++-- rtl/DRAM.bsv | 19 ++++--- rtl/NarrowSRAM.bsv | 9 ++-- rtl/{ProgRouting.bsv => ProgRouter.bsv} | 68 ++++++++++++++++++++++--- rtl/WideSRAM.bsv | 1 + 6 files changed, 87 insertions(+), 25 deletions(-) rename rtl/{ProgRouting.bsv => ProgRouter.bsv} (58%) diff --git a/config.py b/config.py index 9021804c..7f931637 100755 --- a/config.py +++ b/config.py @@ -361,8 +361,10 @@ def quoted(s): return "'\"" + s + "\"'" p["BoardsPerBox"] = p["MeshXLenWithinBox"] * p["MeshYLenWithinBox"] + 1 # Number of fetchers in the per-board programmable router -# (Currently assumed to be 4) -p["FetchersPerProgRouter"] = 4 +# Parameters for programmable routers +# (and the routing-record fetchers they contain) +p["FetchersPerProgRouter"] = 5 +p["LogFetcherFlitBufferSize"] = 5 #============================================================================== # Main diff --git a/rtl/DCache.bsv b/rtl/DCache.bsv index b99d4667..bbed742d 100644 --- a/rtl/DCache.bsv +++ b/rtl/DCache.bsv @@ -437,9 +437,10 @@ module mkDCache#(DCacheId myId) (DCache); // This rule either consumes a flush request or a memory response let flush = flushQueue.dataOut; let resp = respPort.value; + InflightDCacheReqInfo info = unpack(truncate(resp.info)); lineWriteDataWire <= resp.data; - lineWriteIndexWire <= beatIndex(resp.info.beat, resp.info.req.id, - resp.info.req.addr, resp.info.way); + lineWriteIndexWire <= beatIndex(truncate(resp.beat), info.req.id, + info.req.addr, info.way); // Ready to consume flush queue? 
if (flushQueue.canDeq && flushQueue.canPeek) begin flush.req.cmd.isFlush = False; @@ -453,14 +454,14 @@ module mkDCache#(DCacheId myId) (DCache); // Remove item from fill queue and feed associated request (which // will definitely hit if it starts again from the beginning of // the pipeline) back to beginning of the pipeline - if (allHigh(resp.info.beat)) + if (allHigh(resp.beat)) feedbackTrigger <= True; // Write new line data to dataMem // (The write parameters are set outside condition for better timing) lineWriteReqWire <= True; respPort.get; // Set feedback request - feedbackReq <= resp.info.req; + feedbackReq <= info.req; end endrule diff --git a/rtl/DRAM.bsv b/rtl/DRAM.bsv index e5d4a33e..d188afbd 100644 --- a/rtl/DRAM.bsv +++ b/rtl/DRAM.bsv @@ -25,8 +25,13 @@ typedef struct { typedef struct { DRAMReqId id; Bit#(`BeatWidth) data; - InflightDCacheReqInfo info; + // Which beat is it? Bool finalBeat; + Bit#(`BeatBurstWidth) beat; + // Data from original load request + // (Can be largely ignored and optimised away, but + // can also hold useful info about the original request) + Bit#(`BeatWidth) info; } DRAMResp deriving (Bits); // DRAM identifier @@ -83,7 +88,6 @@ import Util :: *; import Interface :: *; import Queue :: *; import Assert :: *; -import DCacheTypes :: *; // Types // ----- @@ -154,8 +158,8 @@ module mkDRAM#(RAMId id) (DRAM); DRAMResp resp; resp.id = req.id; resp.data = pack(elems); - resp.info = unpack(truncate(req.data)); - resp.info.beat = truncate(burstCount); + resp.info = req.data; + resp.beat = burstCount; resp.finalBeat = finalBeat; resps.enq(resp); decOutstanding.send; @@ -222,7 +226,6 @@ import Interface :: *; import Assert :: *; import Util :: *; import Assert :: *; -import DCacheTypes :: *; // Types // ----- @@ -247,7 +250,7 @@ endinterface typedef struct { DRAMReqId id; Bit#(`BeatBurstWidth) burst; - InflightDCacheReqInfo info; + Bit#(`BeatWidth) info; } DRAMInFlightReq deriving (Bits); // Implementation @@ -312,7 +315,7 @@ module 
mkDRAM#(t id) (DRAM); DRAMInFlightReq inflightReq; inflightReq.id = req.id; inflightReq.burst = req.burst; - inflightReq.info = unpack(truncate(req.data)); + inflightReq.info = req.data; inFlight.enq(inflightReq); inFlightCount.incBy(zeroExtend(req.burst)); end @@ -339,7 +342,7 @@ module mkDRAM#(t id) (DRAM); DRAMResp resp; resp.id = inFlight.dataOut.id; resp.info = inFlight.dataOut.info; - resp.info.beat = truncate(burstCount-1); + resp.beat = truncate(burstCount-1); resp.data = respBuffer.dataOut; resp.finalBeat = burstCount == inFlight.dataOut.burst; return resp; diff --git a/rtl/NarrowSRAM.bsv b/rtl/NarrowSRAM.bsv index 0fbd34fa..dde0e08a 100644 --- a/rtl/NarrowSRAM.bsv +++ b/rtl/NarrowSRAM.bsv @@ -1,8 +1,7 @@ // SPDX-License-Identifier: BSD-2-Clause package NarrowSRAM; -import DCacheTypes :: *; -import Util :: *; +import Util :: *; // ============================================================================ // Types @@ -16,7 +15,7 @@ typedef struct { SRAMReqId id; Bit#(`SRAMAddrWidth) addr; Bit#(`SRAMBurstWidth) burst; - InflightDCacheReqInfo info; + Bit#(`BeatWidth) info; } SRAMLoadReq deriving (Bits); // SRAM store request @@ -31,7 +30,7 @@ typedef struct { typedef struct { SRAMReqId id; Bit#(`SRAMDataWidth) data; - InflightDCacheReqInfo info; + Bit#(`BeatWidth) info; } SRAMResp deriving (Bits); // ============================================================================ @@ -243,7 +242,7 @@ endinterface typedef struct { SRAMReqId id; Bit#(`SRAMBurstWidth) burst; - InflightDCacheReqInfo info; + Bit#(`BeatWidth) info; } SRAMInFlightReq deriving (Bits); // SRAM Implementation diff --git a/rtl/ProgRouting.bsv b/rtl/ProgRouter.bsv similarity index 58% rename from rtl/ProgRouting.bsv rename to rtl/ProgRouter.bsv index b3819180..8796182a 100644 --- a/rtl/ProgRouting.bsv +++ b/rtl/ProgRouter.bsv @@ -1,7 +1,6 @@ // SPDX-License-Identifier: BSD-2-Clause -package ProgRouting; - -// Functions and data types for programmable routers +// Functions, data types, and 
modules for programmable routers +package ProgRouter; // ============================================================================= // Routing keys and beats @@ -27,12 +26,18 @@ typedef struct { // 32-bit routing key typedef struct { + // Which off-chip RAM? + Bit#(`LogDRAMsPerBoard) ram // Pointer to array of routing beats containing routing records - Bit#(26) ptr; + Bit#(`LogBeatsPerDRAM) ptr; // Number of beats in the array - Bit#(6) numBeats; + Bit#(`LogRoutingEntryLen) numBeats; } RoutingKey deriving (Bits); +// Extract routing key from an address +function RoutingKey getRoutingKey(NetAddr addr) = + unpack(getRoutingKeyRaw(addr)); + // ============================================================================= // Types of routing record // ============================================================================= @@ -106,6 +111,57 @@ typedef struct { Bit#(5) unused; // New 32-bit routing key for new set of records on current router Bit#(32) newKey; -} MRMRecord deriving (Bits); +} INDRecord deriving (Bits); + +// ============================================================================= +// Design +// ============================================================================= + +// ============================================================================= +// Fetcher +// ============================================================================= + +// Address in a fetcher's flit buffer +typedef Bit#(TSub#(`LogFetcherFlitBufferSize, `LogMaxFlitsPerMsg)) + FetcherFlitBufferMsgAddr; + +// This structure contains information about an in-flight memory +// request from a fetcher. When a fetcher issues a memory load +// request, this info is packed into the unused data field of the +// request. When the memory subsystem responds, it passes back the +// same info in an extra field inside the memory response structure. 
+// Maintaining info about an inflight request inside the request +// itself provides an easy way to handle out-of-order responses from +// memory. +typedef struct { + // Message address in the fetcher's flit buffer + FetcherFlitBufferMsgAddr msgAddr; + // Is this the final routing beat for the key being fetched? + Bool finalBeat; +} InflightFetcherReqInfo deriving (Bits); + +// ============================================================================= +// Programmable router +// ============================================================================= + +interface ProgRouter; + // Incoming and outgoing flits + interface Vector#(`FetchersPerProgRouter, In#(Flit) flitIn); + interface Vector#(`FetchersPerProgRouter, Out#(Flit) flitOut); + + // Interface to off-chip memory + interface Vector#(`DRAMsPerBoard, + Vector#(`FetchersPerProgRouter, BOut#(DRAMReq))) ramReqs; + interface Vector#(`DRAMsPerBoard, + Vector#(`FetchersPerProgRouter, In#(DRAMResp))) ramResps; +endinterface + +module mkProgRouter (ProgRouter); + + // Flit input ports + Vector#(`FetchersPerProgRouter, InPort#(Flit)) flitInPort <- + replicateM(mkInPort); + +endmodule endpackage diff --git a/rtl/WideSRAM.bsv b/rtl/WideSRAM.bsv index a3816a38..04af1dc7 100644 --- a/rtl/WideSRAM.bsv +++ b/rtl/WideSRAM.bsv @@ -108,6 +108,7 @@ module mkWideSRAM#(RAMId id) (WideSRAM); respOut.data = pack(data); respOut.info = respIn.info; respOut.finalBeat = True; + respOut.beat = 0; respQueue.enq(respOut); respCount <= 0; end From 52c4317e1df6efdca37a7f2e6a2bcc86512db57d Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 24 Mar 2020 21:31:19 +0000 Subject: [PATCH 08/78] Fixes to previous commit --- rtl/DCache.bsv | 6 +++--- rtl/DCacheTypes.bsv | 1 - rtl/DRAM.bsv | 2 +- rtl/NarrowSRAM.bsv | 3 +-- 4 files changed, 5 insertions(+), 7 deletions(-) diff --git a/rtl/DCache.bsv b/rtl/DCache.bsv index bbed742d..e972a858 100644 --- a/rtl/DCache.bsv +++ b/rtl/DCache.bsv @@ -438,8 +438,9 @@ module mkDCache#(DCacheId myId) 
(DCache); let flush = flushQueue.dataOut; let resp = respPort.value; InflightDCacheReqInfo info = unpack(truncate(resp.info)); + Bit#(`LogBeatsPerLine) beat = truncate(resp.beat); lineWriteDataWire <= resp.data; - lineWriteIndexWire <= beatIndex(truncate(resp.beat), info.req.id, + lineWriteIndexWire <= beatIndex(beat, info.req.id, info.req.addr, info.way); // Ready to consume flush queue? if (flushQueue.canDeq && flushQueue.canPeek) begin @@ -454,7 +455,7 @@ module mkDCache#(DCacheId myId) (DCache); // Remove item from fill queue and feed associated request (which // will definitely hit if it starts again from the beginning of // the pipeline) back to beginning of the pipeline - if (allHigh(resp.beat)) + if (allHigh(beat)) feedbackTrigger <= True; // Write new line data to dataMem // (The write parameters are set outside condition for better timing) @@ -493,7 +494,6 @@ module mkDCache#(DCacheId myId) (DCache); InflightDCacheReqInfo info; info.req = miss.req; info.way = miss.evictWay; - info.beat = ?; // Create memory request DRAMReq memReq; memReq.isStore = !isLoad; diff --git a/rtl/DCacheTypes.bsv b/rtl/DCacheTypes.bsv index fa6ba407..4ddd809f 100644 --- a/rtl/DCacheTypes.bsv +++ b/rtl/DCacheTypes.bsv @@ -43,7 +43,6 @@ typedef struct { typedef struct { DCacheReq req; Way way; - Bit#(`LogBeatsPerLine) beat; } InflightDCacheReqInfo deriving (Bits); endpackage diff --git a/rtl/DRAM.bsv b/rtl/DRAM.bsv index d188afbd..406cfe89 100644 --- a/rtl/DRAM.bsv +++ b/rtl/DRAM.bsv @@ -6,7 +6,7 @@ package DRAM; // ============================================================================ // DRAM client id -typedef Bit#(TAdd#(`LogDCachesPerDRAM, 1)) DRAMClientId; +typedef Bit#(TLog#(TAdd#(`DCachesPerDRAM,`FetchersPerProgRouter))) DRAMClientId; // DRAM request id typedef DRAMClientId DRAMReqId; diff --git a/rtl/NarrowSRAM.bsv b/rtl/NarrowSRAM.bsv index dde0e08a..4e51be85 100644 --- a/rtl/NarrowSRAM.bsv +++ b/rtl/NarrowSRAM.bsv @@ -8,7 +8,7 @@ import Util :: *; // 
============================================================================ // SRAM request id -typedef Bit#(TAdd#(`LogDCachesPerDRAM, 1)) SRAMReqId; +typedef Bit#(TLog#(TAdd#(`DCachesPerDRAM,`FetchersPerProgRouter))) SRAMReqId; // SRAM load request typedef struct { @@ -139,7 +139,6 @@ module mkSRAM#(RAMId id) (SRAM); resp.id = req.id; resp.data = pack(elems); resp.info = req.info; - resp.info.beat = truncate(loadBurstCount); resps.enq(resp); inFlightCount.dec; end From b512bf969313032ecb5d0c6b18596bbcaa279c1c Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Wed, 25 Mar 2020 13:49:51 +0000 Subject: [PATCH 09/78] Add some design notes --- rtl/ProgRouter.bsv | 50 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index 8796182a..4cbc75f1 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -117,6 +117,56 @@ typedef struct { // Design // ============================================================================= +// In the following diagram N/S/E/W are the inter-FPGA links and +// L0..L3 are links at one edge of the NoC. Depending on the NoC +// dimensions, there may be more or less than four links on a single +// NoC edge, but the diagram assumes four. 
+ +// +// N S E W L0..L3/Loop Input flits +// | | | | | | +// +---+ +---+ +---+ +---+ +---+ | +// | F | | F | | F | | F | | F | | Fetchers +// +---+ +---+ +---+ +---+ +---+ | +// | | | | | | +// +---------------------------+ | +// | Crossbar | | Preliminary routing +// +---------------------------+ | +// | | | | | | +// N/L0 S/L1 E/L2 W/L3 Ind-----+ Output queues +// | | | | +// +---------------------------+ +// | Expander | Final expansion +// +---------------------------+ +// | | | | | | | | +// N S E W L0 L1 L2 L3 Output flits +// + +// The core functionality is implemented in the fetchers, which: +// (1) extract routing keys from incoming flits; +// (2) lookup the keys in RAM; +// (3) interpret the resulting routing records; and +// (4) emit the interpreted flits. + +// The key property of these fetchers is that they act entirely +// independently of each other: each one can make progress even if +// another is blocked. Unfortunately, this leads to duplicated +// logic resources, but is necessary to avoid deadlock. + +// Note that, as the routers are fully programmable, it is possible +// for the programmer to introduce deadlock using an ill-defined +// routing scheme, e.g. where a flit arrives in on (say) link N and +// requires a flit to be sent back along the same direction N. +// However, the hardware does guarantee deadlock-freedom if the +// routing scheme is based on dimension-ordered routing. + +// After the fetchers have interpreted the flits, they are fed to a +// fair crossbar which organises them by destination into output +// queues. To reduce logic, we allow each inter-board link to share +// an output queue with a local link, as this does not compromise +// forward progress. Finally the queues are expanded to provide an +// output stream for each possible destination.
+ // ============================================================================= // Fetcher // ============================================================================= From 1c7ed6f1254acd576492625b60025a3e8eb3331b Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Wed, 25 Mar 2020 18:11:06 +0000 Subject: [PATCH 10/78] First stage of fetcher --- rtl/ProgRouter.bsv | 159 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 154 insertions(+), 5 deletions(-) diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index 4cbc75f1..8f203277 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -171,9 +171,11 @@ typedef struct { // Fetcher // ============================================================================= -// Address in a fetcher's flit buffer -typedef Bit#(TSub#(`LogFetcherFlitBufferSize, `LogMaxFlitsPerMsg)) - FetcherFlitBufferMsgAddr; +// Flit address in a fetcher's flit buffer +typedef Bit#(`FetcherLogFlitBufferSize) FetcherFlitBufferAddr; + +// Message address in a fetcher's flit buffer +typedef Bit#(`FetcherLogMsgsPerFlitBuffer) FetcherFlitBufferMsgAddr; // This structure contains information about an in-flight memory // request from a fetcher. When a fetcher issues a memory load @@ -186,10 +188,157 @@ typedef Bit#(TSub#(`LogFetcherFlitBufferSize, `LogMaxFlitsPerMsg)) typedef struct { // Message address in the fetcher's flit buffer FetcherFlitBufferMsgAddr msgAddr; - // Is this the final routing beat for the key being fetched? - Bool finalBeat; + // How many beats in the burst? + Bit#(`BeatBurstWidth) burst; + // Is this the final burst of routing records for the current key? 
+ Bool finalBurst; } InflightFetcherReqInfo deriving (Bits); +// Fetcher interface +interface Fetcher; + // Incoming and outgoing flits + interface In#(Flit) flitIn; + interface Out#(Flit) flitOut; + // Off-chip RAM connections + Vector#(`DRAMsPerBoard, BOut#(DRAMReq)) ramReqs; + Vector#(`DRAMsPerBoard, In#(DRAMResp)) ramResps; +endinterface + +// Fetcher module +module mkFetcher; + + // Flit input port + InPort#(Flit)) flitInPort <- mkInPort; + + // RAM response ports + Vector#(`DRAMsPerBoard, InPort#(DRAMResp)) ramRespPort <- + replicateM(mkInPort); + + // RAM request queues + Vector#(`DRAMsPerBoard, Queue1#(DRAMReq)) ramReqQueue <- + replicateM(mkUGShiftQueue(QueueOptFmax)); + + // Flit buffer + BlockRamOpts flitBufferOpts = + BlockRamOpts { + readDuringWrite: DontCare, + style: "AUTO", + registerDataOut: False, + initFile: Invalid + }; + BlockRam#(FetcherFlitBufferAddr, Flit) flitBuffer <- mkBlockRam; + + // Beat buffer + SizedQueue#(`LogProgRouterBeatBufferSize, RoutingBeat)) beatBuffer <- + replicateM(mkUGSizedQueue); + + // Stage 1: consume input message + // ------------------------------ + + // Consumer state + // State 0: pass through flits that don't contain routing keys + // State 1: buffer flits that do contain routing keys + // State 2: fetch routing beats + Reg#(Bit#(2)) consumeState <- mkReg(0); + + // Count number of flits of message consumed so far + Reg#(Bit#(`LogFlitsPerMsg)) consumeFlitCount <- mkReg(0); + + // Flit slot allocator + Vector#(`FetcherMsgsPerFlitBuffer, SetReset) flitBufferUsedSlots <- + mkSetReset(False); + + // Chosen message slot + Reg#(FetcherFlitBufferMsgAddr) chosenReg <- mkRegU; + + // Routing key of message consumed + Reg#(RoutingKey) consumeKey <- mkRegU; + + // Maintain count of routing beats fetched so far + Reg#(Bit#(`LogRoutingEntryLen)) fetchBeatCount <- mkReg(0); + + // State 0: pass through flits that don't contain routing keys + rule consumeMessage0 (consumeState == 0); + Flit flit = flitInPort.value; + // Find 
unused message slot + Bool found = False; + FetcherFlitBufferMsgAddr chosen = ?; + for (Integer i = 0; i < `FetcherMsgsPerFlitBuffer; i=i+1) + if (flitBufferUsedSlots[i].value == 0) begin + found = True; + chosen = fromInteger(i); + end + chosenReg <= chosen; + // Initialise counters for subsequent states + flitCount <= 0; + fetchBeatCount<= 0; + // Consume flit + if (flitInPort.canGet) begin + if (flit.dest.addr.isKey) begin + if (found) begin + consumeState <= 1; + end + end else if (flitQueue.notFull) begin + // TODO: avoid conflict with interpreter stage + flitOutQueue.enq(flit); + flitInPort.get; + end + end + endrule + + // State 1: buffer flits that do contain routing keys + rule consumeMessage1 (consumeState == 1); + Flit flit = flitInPort.value; + if (flitInPort.canGet) begin + flitInPort.get; + consumeKey <= getRoutingKey(flit.dest.addr); + // Write to flit buffer + flitBuffer.write({chosenReg, consumeFlitCount}, flit); + consumeFlitCount <= consumeFlitCount + 1; + // On final flit, move to fetch state + if (! flit.notFinalFlit) begin + consumeState <= 2; + // Claim chosen slot + flitBufferUsedSlots[chosenReg].set; + end + end + endrule + + // State 2: fetch routing beats + rule consumeMessage2 (consumeState == 2); + // Have we finished fetching beats? 
+ Bool finished = fetchBeatCount + `ProgRouterMaxBurst >= consumeKey.len; + // Prepare inflight RAM request info + // (to handle out of order resps from the RAMs) + InflightFetcherReqInfo info; + info.msgAddr = chosenReg; + info.burst = min(consumeKey.len - fetchBeatCount, `ProgRouterMaxBurst); + info.finalBurst = finished; + // Prepare RAM request + DRAMReq req; + req.isStore = False; + req.id = fromInteger(`DCachesPerDRAM + myId); + req.addr = {1'b0, consumeKey.ptr + fetchBeatCount}; + req.data = zeroExtend(pack(info)); + req.burst = info.burst; + // Don't overfetch (beat buffer has finite size) + if (ramReqQueue[consumeKey.ram].notFull && + beatBufferLen.available >= zeroExtend(req.burst)) begin + ramReqQueue[consumeKey.ram].enq(req); + fetchBeatCount <= fetchBeatCount + req.burst; + beatBufferLen.incBy(zeroExtend(req.burst)); + if (finished) consumeState <= 0; + end + end + endrule + + // Stage 2: consume RAM responses + // ------------------------------ + + + +endmodule + // ============================================================================= // Programmable router // ============================================================================= From 210449a886b3d7cebb227553753732fb769a9b91 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Fri, 27 Mar 2020 11:43:32 +0000 Subject: [PATCH 11/78] Fetcher complete (But doesn't compile yet) --- rtl/ProgRouter.bsv | 290 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 274 insertions(+), 16 deletions(-) diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index 8f203277..c5245254 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -24,6 +24,14 @@ typedef struct { Vector#(6, Bit#(40)) chunks; } RoutingBeat deriving (Bits); +// Routing beat, tagged with the beat number in the DRAM burst +typedef struct { + // Beat + RoutingBeat beat; + // Beat number + Bit#(`BeatBurstWidth) beatNum; +} NumberedRoutingBeat deriving (Bits); + // 32-bit routing key typedef struct { // Which off-chip RAM? 
@@ -50,6 +58,13 @@ typedef enum { IND = 3'd4 // 40-bit Indirection } RoutingRecordTag; +typedef enum { + NORTH = 2'd0, + SOUTH = 2'd1, + EAST = 2'd2, + WEST = 2'd3, +} RoutingDir; + // 40-bit Unicast Router-to-Mailbox (URM1) record typedef struct { // Record type @@ -83,7 +98,7 @@ typedef struct { // Record type RoutingRecordTag tag; // Direction (N, S, E, or W) - Bit#(2) dir; + RoutingDir dir; // Currently unused Bit#(3) unused; // New 32-bit routing key that will replace the one in the @@ -113,6 +128,25 @@ typedef struct { Bit#(32) newKey; } INDRecord deriving (Bits); +// It is sometimes convenient (though redundant) to record a routing +// decision for a flit internally within the programmable router +typedef struct { + // Normal flit + Flit flit; + // Routing decision for flit + RoutingDecision decision; +} RoutedFlit deriving (Bits); + +// Routing decision +typedef enum { + RouteNorth, + RouteSouth, + RouteEast, + RouteWest, + RouteNoC, + RouteLoop +} RoutingDecision deriving (Bits, Eq); + // ============================================================================= // Design // ============================================================================= @@ -130,7 +164,7 @@ typedef struct { // +---+ +---+ +---+ +---+ +---+ | // | | | | | | // +---------------------------+ | -// | Crossbar | | Preliminary routing +// | Crossbar | | Routing // +---------------------------+ | // | | | | | | // N/L0 S/L1 E/L2 W/L3 Ind-----+ Output queues @@ -198,22 +232,18 @@ typedef struct { interface Fetcher; // Incoming and outgoing flits interface In#(Flit) flitIn; - interface Out#(Flit) flitOut; + interface BOut#(RoutedFlit) flitOut; // Off-chip RAM connections Vector#(`DRAMsPerBoard, BOut#(DRAMReq)) ramReqs; Vector#(`DRAMsPerBoard, In#(DRAMResp)) ramResps; endinterface // Fetcher module -module mkFetcher; +module mkFetcher#(BoardId boardId) (Fetcher); // Flit input port InPort#(Flit)) flitInPort <- mkInPort; - // RAM response ports - Vector#(`DRAMsPerBoard, 
InPort#(DRAMResp)) ramRespPort <- - replicateM(mkInPort); - // RAM request queues Vector#(`DRAMsPerBoard, Queue1#(DRAMReq)) ramReqQueue <- replicateM(mkUGShiftQueue(QueueOptFmax)); @@ -229,8 +259,21 @@ module mkFetcher; BlockRam#(FetcherFlitBufferAddr, Flit) flitBuffer <- mkBlockRam; // Beat buffer - SizedQueue#(`LogProgRouterBeatBufferSize, RoutingBeat)) beatBuffer <- - replicateM(mkUGSizedQueue); + SizedQueue#(`LogProgRouterBeatBufferSize, NumberedRoutingBeat)) + beatBuffer <- replicateM(mkUGSizedQueue); + + // Track length of beat buffer, so that we don't overfetch + Count#(TAdd#(`LogProgRouterBeatBufferSize, 1)) beatBufferLen <- + mkCount(2 ** `LogProgRouterBeatBufferSize); + + // For flits whose destinations are *not* routing keys + Queue1#(RoutedFlit) flitBypassQueue <- mkUGShiftQueue(QueueOptFmax); + + // For flits whose destinations are routing keys + Queue1#(RoutedFlit) flitProcessedQueue <- mkUGShiftQueue(QueueOptFmax); + + // Final output queue for flits + Queue1#(RoutedFlit) flitOutQueue <- mkUGShiftQueue(QueueOptFmax); // Stage 1: consume input message // ------------------------------ @@ -278,10 +321,19 @@ module mkFetcher; if (found) begin consumeState <= 1; end - end else if (flitQueue.notFull) begin - // TODO: avoid conflict with interpreter stage - flitOutQueue.enq(flit); + end else if (flitBypassQueue.notFull) begin flitInPort.get; + // Make routing decision + RoutingDecision decision = RouteLocal; + MailboxNetAddr = flit.dest.addr; + if (a.addr.host.valid) + decision = addr.host.value == 0 ? 
RouteWest : RouteEast; + else if (addr.board.x < boardId.x) decision = RouteWest; + else if (addr.board.x > boardId.x) decision = RouteEast; + else if (addr.board.y < boardId.y) decision = RouteSouth; + else if (addr.board.y > boardId.y) decision = RouteNorth; + // Insert into bypass queue + flitBypassQueue.enq(RoutedFlit { decision: decision, flit: flit}); end end endrule @@ -319,7 +371,7 @@ module mkFetcher; req.isStore = False; req.id = fromInteger(`DCachesPerDRAM + myId); req.addr = {1'b0, consumeKey.ptr + fetchBeatCount}; - req.data = zeroExtend(pack(info)); + req.data = {?, pack(info)}; req.burst = info.burst; // Don't overfetch (beat buffer has finite size) if (ramReqQueue[consumeKey.ram].notFull && @@ -332,10 +384,214 @@ module mkFetcher; end endrule - // Stage 2: consume RAM responses - // ------------------------------ + // Stage 2: interpret routing beats + // -------------------------------- + + // Merge responses from each RAM + staticAssert(`DRAMsPerBoard == 2, + "Fetcher: need to generalise number of RAMs used") + MergeUnit#(NumberedRoutingBeat) ramRespMerger <- mkMergeUnitFair; + + // Convert a RAM response to a numbered routing beat + function NumberedRoutingBeat fromDRAMResp(DRAMResp resp) = + NumberedRoutingBeat { + beat: unpack(resp.data) + , beatNum: resp.beat + }; + + // Create RAM response input interfaces for this module + In#(DRAMResp) respA <- onIn(fromDRAMResp, ramRespMerger.inA); + In#(DRAMResp) respB <- onIn(fromDRAMResp, ramRespMerger.inB); + Vector#(`DRAMsPerBoard, In#(DRAMResp)) ramResps = vector(respA, respB); + + // Connect the merger to the beat buffer + connectToQueue(ramRespMerger.out, beatBuffer); + + // Count number of flits of message emitted so far + Reg#(Bit#(`LogFlitsPerMsg)) emitFlitCount <- mkReg(0); + + // Count number of records processed so far in current beat + Reg#(Bit#(3)) recordCount <- mkReg(0); + + // (Shift) register holding current routing beat + Reg#(NumberedRoutingBeat) beatReg <- mkRegU; + + // Interpreter 
state + // 0: register the routing beat and fetch first flit + // 1: interpret flits + Reg#(Bit#(1)) interpreterState <- mkReg(0); + + // State 0: register the routing beat and fetch first flit + rule interpreter0 (interpreterState == 0); + let beat = beatBuffer.dataOut.beat; + InflightFetcherReqInfo info = unpack(truncate(beat.info)); + // Consume beat + if (beatBuffer.canDeq && beatBuffer.canPeek) begin + beatReg <= beatBuffer.dataOut; + beatBuffer.deq; + interpreterState <= 1; + end + // Load first flit + flitBuffer.load({info.msgAddr, 0}); + emitFlitCount <= 0; + recordCount <= 0; + endrule + + // State 1: interpret flits + rule interpreter1 (interpreterState == 1); + // Extract details of registered routing beat + let beat = beatReg.beat; + let beatNum = beatReg.beatNum; + InflightFetcherReqInfo info = unpack(truncate(beat.info)); + // Extract tag from next record + RoutingRecordTag tag = beat.chunks[5].tag; + // Is this the first flit of a message? + Bool firstFlit = emitFlitCount == 0; + // Modify flit by interpreting routing key + RoutingDecision decision = ?; + Flit flit = flitBuffer.dataOut; + case (tag) + // 40-bit Unicast Router-to-Mailbox + URM1: begin + URM1Record rec = unpack(beat.chunks[5]); + flit.dest.addr.isKey = False; + flit.dest.addr.mbox = rec.mbox; + Vector#(`ThreadsPerMailbox, Bool) threadMask = newVector; + for (Integer j = 0; j < `ThreadsPerMailbox; j=j+1) + threadMask = rec.thread == fromInteger(j); + flit.dest.threads = pack(threadMask); + // Replace first word of message with local key + if (firstFlit) + flit.payload = {truncateLSB(flit.payload), 5'b0, rec.localKey}; + decision = RouteLocal; + end + // 80-bit Unicast Router-to-Mailbox + URM2: begin + URM2Record rec = unpack({beat.chunks[5], beat.chunks[4]}); + flit.dest.addr.isKey = False; + flit.dest.addr.mbox = rec.mbox; + Vector#(`ThreadsPerMailbox, Bool) threadMask = newVector; + for (Integer j = 0; j < `ThreadsPerMailbox; j=j+1) + threadMask = rec.thread == fromInteger(j); + 
flit.dest.threads = pack(threadMask); + // Replace first two words of message with local key + if (firstFlit) + flit.payload = {truncateLSB(flit.payload), rec.localKey}; + decision = RouteLocal; + end + // 40-bit Router-to-Router + RR: begin + RRRecord rec = unpack(beat.chunks[5]); + case (rec.dir) + NORTH: begin + decision = RouteNorth; + flit.dest.board = BoardId {x: boardId.x, y: boardId.y+1}; + end + SOUTH: begin + decision = RouteSouth; + flit.dest.board = BoardId {x: boardId.x, y: boardId.y-1}; + end + EAST: begin + decision = RouteEast; + flit.dest.board = BoardId {x: boardId.x+1, y: boardId.y}; + end + WEST: begin + decision = RouteWest; + flit.dest.board = BoardId {x: boardId.x-1, y: boardId.y}; + end + endcase + flit.dest.threads = {?, rec.newKey}; + end + // 80-bit Multicast Router-to-Mailbox + MRM: begin + MRMRecord rec = unpack({beat.chunks[5], beat.chunks[4]}); + flit.dest.addr.isKey = False; + flit.dest.addr.mbox = rec.mbox; + flit.dest.threads = rec.destMask; + decision = RouteLocal; + end + // 40-bit Indirection + IND: begin + INDRecord rec = unpack(beat.chunks[5]); + flit.dest.threads = {?, rec.newKey}; + decision = RouteLoop; + end + end + // Is output queue ready for new flit? + Bool emit = flitProcessedQueue.notFull; + Bool newFlitCount = emitFlitCount; + // Consume routing record + if (emit) begin + flitProcessedQueue.enq(RoutedFlit { decision: decision, flit: flit }); + // Move to next record + recordCount <= recordCount + 1; + // Shift beat to point to next record + RoutingBeat newBeat = beat; + Bool doubleChunk = unpack(pack(tag)[0]); + if (doubleChunk) begin + for (Integer i = 5; i > 2; i=i-2) begin + newBeat.chunks[i] = beat.chunks[i-2]; + newBeat.chunks[i-1] = beat.chunks[i-3]; + end + end else begin + for (Integer i = 5; i > 0; i=i-1) + newBeat.chunks[i] = beat.chunks[i-1]; + end + beatReg <= NumberedRoutingBeat { beatNum: beatNum, beat: newBeat }; + // Is this the final record in the beat? 
+ if ((recordCount+1) == beat.size) begin + interpreterState <= 0; + // Have we finished with this message yet? + if (info.finalBurst && info.burst == (beatNum+1)) begin + // Reclaim message slot in flit buffer + flitBufferUsedSlots[info.msgAddr].clear; + end + end + // Is this the final flit in the message? + if (flit.notFinalFlit) + newFlitCount = emitFlitCount + 1; + else + newFlitCount = 0; + end + // Issue flit load request + flitBuffer.load({info.msgAddr, newFlitCount}); + emitFlitCount <= newFlitCount; + endrule + + // Stage 3: merge output queues + // ---------------------------- + + // We want to merge messages, not flits + // Are we in the middle of consuming a message? + Reg#(Bool) mergeInProgress <- mkReg(False); + Reg#(Bool) prevFromBypass <- mkReg(False); + + rule merge (flitOutQueue.notFull); + // Favour the bypass queue + Bool chooseBypass = mergeInProgress ? prevFromBypass : + flitBypassQueue.canDeq; + if (chooseBypass) begin + if (flitBypassQueue.canDeq) begin + flitBypassQueue.deq; + flitOutQueue.enq(flitBypassQueue.dataOut); + mergeInProgress <= flitBypassQueue.dataOut.flit.notFinalFlit; + prevFromBypass = True; + end + end else if (flitProcessedQueue.canDeq) begin + flitProcessedQueue.deq; + flitOutQueue.enq(flitProcessedQueue.dataOut); + mergeInProgress <= flitProcessedQueue.dataOut.flit.notFinalFlit; + prevFromBypass = False; + end + endrule; + // Interfaces + // ----------- + interface flitIn = flitInPort.in; + interface flitOut = queueToBOut(flitOutQueue); + interface ramReqs = map(queueToBOut, ramReqQueue); + interface ramResps = ramResps; endmodule @@ -355,6 +611,7 @@ interface ProgRouter; Vector#(`FetchersPerProgRouter, In#(DRAMResp))) ramResps; endinterface +/* module mkProgRouter (ProgRouter); // Flit input ports @@ -362,5 +619,6 @@ module mkProgRouter (ProgRouter); replicateM(mkInPort); endmodule +*/ endpackage From c294bebfa647e6e7a8858e00badb9998cbcda7b3 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Fri, 27 Mar 2020 14:43:42 
+0000 Subject: [PATCH 12/78] Fetcher compiles --- config.py | 9 +++ rtl/Network.bsv | 1 + rtl/ProgRouter.bsv | 153 ++++++++++++++++++++++++--------------------- 3 files changed, 93 insertions(+), 70 deletions(-) diff --git a/config.py b/config.py index 7f931637..919ab4a8 100755 --- a/config.py +++ b/config.py @@ -161,6 +161,15 @@ def quoted(s): return "'\"" + s + "\"'" p["SRAMLogMaxInFlight"] = 5 p["SRAMStoreLatency"] = 2 +# Programmable router parameters: +p["LogRoutingEntryLen"] = 5 # Number of beats in a routing table entry +p["ProgRouterMaxBurst"] = 4 +p["FetcherLogBeatBufferSize"] = 5 +p["FetcherLogFlitBufferSize"] = 5 +p["FetcherLogMsgsPerFlitBuffer"] = ( + p["FetcherLogFlitBufferSize"] - p["LogMaxFlitsPerMsg"]) +p["FetcherMsgsPerFlitBuffer"] = 2 ** p["FetcherLogMsgsPerFlitBuffer"] + # Enable performance counters p["EnablePerfCount"] = True diff --git a/rtl/Network.bsv b/rtl/Network.bsv index 99229c96..6706d00f 100644 --- a/rtl/Network.bsv +++ b/rtl/Network.bsv @@ -25,6 +25,7 @@ import IdleDetector :: *; import FlitMerger :: *; import OffChipRAM :: *; import DRAM :: *; +import ProgRouter :: *; // ============================================================================= // Mesh Router diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index c5245254..b5de42ef 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -2,6 +2,15 @@ // Functions, data types, and modules for programmable routers package ProgRouter; +import Globals :: *; +import Util :: *; +import DRAM :: *; +import Vector :: *; +import Queue :: *; +import Interface :: *; +import BlockRam :: *; +import Assert :: *; + // ============================================================================= // Routing keys and beats // ============================================================================= @@ -24,18 +33,10 @@ typedef struct { Vector#(6, Bit#(40)) chunks; } RoutingBeat deriving (Bits); -// Routing beat, tagged with the beat number in the DRAM burst -typedef struct { - // Beat - 
RoutingBeat beat; - // Beat number - Bit#(`BeatBurstWidth) beatNum; -} NumberedRoutingBeat deriving (Bits); - // 32-bit routing key typedef struct { // Which off-chip RAM? - Bit#(`LogDRAMsPerBoard) ram + Bit#(`LogDRAMsPerBoard) ram; // Pointer to array of routing beats containing routing records Bit#(`LogBeatsPerDRAM) ptr; // Number of beats in the array @@ -56,14 +57,14 @@ typedef enum { RR = 3'd2, // 40-bit Router-to-Router MRM = 3'd3, // 80-bit Multicast Router-to-Mailbox IND = 3'd4 // 40-bit Indirection -} RoutingRecordTag; +} RoutingRecordTag deriving (Bits, Eq); typedef enum { NORTH = 2'd0, SOUTH = 2'd1, EAST = 2'd2, - WEST = 2'd3, -} RoutingDir; + WEST = 2'd3 +} RoutingDir deriving (Bits, Eq); // 40-bit Unicast Router-to-Mailbox (URM1) record typedef struct { @@ -228,21 +229,31 @@ typedef struct { Bool finalBurst; } InflightFetcherReqInfo deriving (Bits); +// Routing beat, tagged with the beat number in the DRAM burst +typedef struct { + // Beat + RoutingBeat beat; + // Beat number + Bit#(`BeatBurstWidth) beatNum; + // Inflight request info + InflightFetcherReqInfo info; +} NumberedRoutingBeat deriving (Bits); + // Fetcher interface interface Fetcher; // Incoming and outgoing flits interface In#(Flit) flitIn; interface BOut#(RoutedFlit) flitOut; // Off-chip RAM connections - Vector#(`DRAMsPerBoard, BOut#(DRAMReq)) ramReqs; - Vector#(`DRAMsPerBoard, In#(DRAMResp)) ramResps; + interface Vector#(`DRAMsPerBoard, BOut#(DRAMReq)) ramReqs; + interface Vector#(`DRAMsPerBoard, In#(DRAMResp)) ramResps; endinterface // Fetcher module -module mkFetcher#(BoardId boardId) (Fetcher); +module mkFetcher#(Integer fetcherId, BoardId boardId) (Fetcher); // Flit input port - InPort#(Flit)) flitInPort <- mkInPort; + InPort#(Flit) flitInPort <- mkInPort; // RAM request queues Vector#(`DRAMsPerBoard, Queue1#(DRAMReq)) ramReqQueue <- @@ -259,12 +270,12 @@ module mkFetcher#(BoardId boardId) (Fetcher); BlockRam#(FetcherFlitBufferAddr, Flit) flitBuffer <- mkBlockRam; // Beat buffer - 
SizedQueue#(`LogProgRouterBeatBufferSize, NumberedRoutingBeat)) - beatBuffer <- replicateM(mkUGSizedQueue); + SizedQueue#(`FetcherLogBeatBufferSize, NumberedRoutingBeat) + beatBuffer <- mkUGSizedQueue; // Track length of beat buffer, so that we don't overfetch - Count#(TAdd#(`LogProgRouterBeatBufferSize, 1)) beatBufferLen <- - mkCount(2 ** `LogProgRouterBeatBufferSize); + Count#(TAdd#(`FetcherLogBeatBufferSize, 1)) beatBufferLen <- + mkCount(2 ** `FetcherLogBeatBufferSize); // For flits whose destinations are *not* routing keys Queue1#(RoutedFlit) flitBypassQueue <- mkUGShiftQueue(QueueOptFmax); @@ -285,11 +296,11 @@ module mkFetcher#(BoardId boardId) (Fetcher); Reg#(Bit#(2)) consumeState <- mkReg(0); // Count number of flits of message consumed so far - Reg#(Bit#(`LogFlitsPerMsg)) consumeFlitCount <- mkReg(0); + Reg#(Bit#(`LogMaxFlitsPerMsg)) consumeFlitCount <- mkReg(0); // Flit slot allocator Vector#(`FetcherMsgsPerFlitBuffer, SetReset) flitBufferUsedSlots <- - mkSetReset(False); + replicateM(mkSetReset(False)); // Chosen message slot Reg#(FetcherFlitBufferMsgAddr) chosenReg <- mkRegU; @@ -307,14 +318,14 @@ module mkFetcher#(BoardId boardId) (Fetcher); Bool found = False; FetcherFlitBufferMsgAddr chosen = ?; for (Integer i = 0; i < `FetcherMsgsPerFlitBuffer; i=i+1) - if (flitBufferUsedSlots[i].value == 0) begin + if (! 
flitBufferUsedSlots[i].value) begin found = True; chosen = fromInteger(i); end chosenReg <= chosen; // Initialise counters for subsequent states - flitCount <= 0; - fetchBeatCount<= 0; + consumeFlitCount <= 0; + fetchBeatCount <= 0; // Consume flit if (flitInPort.canGet) begin if (flit.dest.addr.isKey) begin @@ -324,9 +335,9 @@ module mkFetcher#(BoardId boardId) (Fetcher); end else if (flitBypassQueue.notFull) begin flitInPort.get; // Make routing decision - RoutingDecision decision = RouteLocal; - MailboxNetAddr = flit.dest.addr; - if (a.addr.host.valid) + RoutingDecision decision = RouteNoC; + MailboxNetAddr addr = flit.dest.addr; + if (addr.host.valid) decision = addr.host.value == 0 ? RouteWest : RouteEast; else if (addr.board.x < boardId.x) decision = RouteWest; else if (addr.board.x > boardId.x) decision = RouteEast; @@ -343,7 +354,7 @@ module mkFetcher#(BoardId boardId) (Fetcher); Flit flit = flitInPort.value; if (flitInPort.canGet) begin flitInPort.get; - consumeKey <= getRoutingKey(flit.dest.addr); + consumeKey <= getRoutingKey(flit.dest); // Write to flit buffer flitBuffer.write({chosenReg, consumeFlitCount}, flit); consumeFlitCount <= consumeFlitCount + 1; @@ -359,28 +370,28 @@ module mkFetcher#(BoardId boardId) (Fetcher); // State 2: fetch routing beats rule consumeMessage2 (consumeState == 2); // Have we finished fetching beats? 
- Bool finished = fetchBeatCount + `ProgRouterMaxBurst >= consumeKey.len; + Bool finished = fetchBeatCount+`ProgRouterMaxBurst >= consumeKey.numBeats; // Prepare inflight RAM request info // (to handle out of order resps from the RAMs) InflightFetcherReqInfo info; info.msgAddr = chosenReg; - info.burst = min(consumeKey.len - fetchBeatCount, `ProgRouterMaxBurst); + info.burst = truncate( + min(consumeKey.numBeats - fetchBeatCount, `ProgRouterMaxBurst)); info.finalBurst = finished; // Prepare RAM request DRAMReq req; req.isStore = False; - req.id = fromInteger(`DCachesPerDRAM + myId); - req.addr = {1'b0, consumeKey.ptr + fetchBeatCount}; + req.id = fromInteger(`DCachesPerDRAM + fetcherId); + req.addr = {1'b0, consumeKey.ptr + zeroExtend(fetchBeatCount)}; req.data = {?, pack(info)}; req.burst = info.burst; // Don't overfetch (beat buffer has finite size) if (ramReqQueue[consumeKey.ram].notFull && beatBufferLen.available >= zeroExtend(req.burst)) begin - ramReqQueue[consumeKey.ram].enq(req); - fetchBeatCount <= fetchBeatCount + req.burst; - beatBufferLen.incBy(zeroExtend(req.burst)); - if (finished) consumeState <= 0; - end + ramReqQueue[consumeKey.ram].enq(req); + fetchBeatCount <= fetchBeatCount + zeroExtend(req.burst); + beatBufferLen.incBy(zeroExtend(req.burst)); + if (finished) consumeState <= 0; end endrule @@ -389,7 +400,7 @@ module mkFetcher#(BoardId boardId) (Fetcher); // Merge responses from each RAM staticAssert(`DRAMsPerBoard == 2, - "Fetcher: need to generalise number of RAMs used") + "Fetcher: need to generalise number of RAMs used"); MergeUnit#(NumberedRoutingBeat) ramRespMerger <- mkMergeUnitFair; // Convert a RAM response to a numbered routing beat @@ -397,18 +408,19 @@ module mkFetcher#(BoardId boardId) (Fetcher); NumberedRoutingBeat { beat: unpack(resp.data) , beatNum: resp.beat + , info: unpack(truncate(resp.info)) }; // Create RAM response input interfaces for this module In#(DRAMResp) respA <- onIn(fromDRAMResp, ramRespMerger.inA); In#(DRAMResp) 
respB <- onIn(fromDRAMResp, ramRespMerger.inB); - Vector#(`DRAMsPerBoard, In#(DRAMResp)) ramResps = vector(respA, respB); + Vector#(`DRAMsPerBoard, In#(DRAMResp)) ramRespsOut = vector(respA, respB); // Connect the merger to the beat buffer connectToQueue(ramRespMerger.out, beatBuffer); // Count number of flits of message emitted so far - Reg#(Bit#(`LogFlitsPerMsg)) emitFlitCount <- mkReg(0); + Reg#(Bit#(`LogMaxFlitsPerMsg)) emitFlitCount <- mkReg(0); // Count number of records processed so far in current beat Reg#(Bit#(3)) recordCount <- mkReg(0); @@ -423,16 +435,16 @@ module mkFetcher#(BoardId boardId) (Fetcher); // State 0: register the routing beat and fetch first flit rule interpreter0 (interpreterState == 0); - let beat = beatBuffer.dataOut.beat; - InflightFetcherReqInfo info = unpack(truncate(beat.info)); + let beat = beatBuffer.dataOut; + InflightFetcherReqInfo info = beat.info; // Consume beat if (beatBuffer.canDeq && beatBuffer.canPeek) begin - beatReg <= beatBuffer.dataOut; + beatReg <= beat; beatBuffer.deq; interpreterState <= 1; end // Load first flit - flitBuffer.load({info.msgAddr, 0}); + flitBuffer.read({info.msgAddr, 0}); emitFlitCount <= 0; recordCount <= 0; endrule @@ -442,9 +454,9 @@ module mkFetcher#(BoardId boardId) (Fetcher); // Extract details of registered routing beat let beat = beatReg.beat; let beatNum = beatReg.beatNum; - InflightFetcherReqInfo info = unpack(truncate(beat.info)); + let info = beatReg.info; // Extract tag from next record - RoutingRecordTag tag = beat.chunks[5].tag; + RoutingRecordTag tag = unpack(truncateLSB(beat.chunks[5])); // Is this the first flit of a message? 
Bool firstFlit = emitFlitCount == 0; // Modify flit by interpreting routing key @@ -455,29 +467,29 @@ module mkFetcher#(BoardId boardId) (Fetcher); URM1: begin URM1Record rec = unpack(beat.chunks[5]); flit.dest.addr.isKey = False; - flit.dest.addr.mbox = rec.mbox; + flit.dest.addr.mbox = unpack(rec.mbox); Vector#(`ThreadsPerMailbox, Bool) threadMask = newVector; for (Integer j = 0; j < `ThreadsPerMailbox; j=j+1) - threadMask = rec.thread == fromInteger(j); + threadMask[j] = rec.thread == fromInteger(j); flit.dest.threads = pack(threadMask); // Replace first word of message with local key if (firstFlit) flit.payload = {truncateLSB(flit.payload), 5'b0, rec.localKey}; - decision = RouteLocal; + decision = RouteNoC; end // 80-bit Unicast Router-to-Mailbox URM2: begin URM2Record rec = unpack({beat.chunks[5], beat.chunks[4]}); flit.dest.addr.isKey = False; - flit.dest.addr.mbox = rec.mbox; + flit.dest.addr.mbox = unpack(rec.mbox); Vector#(`ThreadsPerMailbox, Bool) threadMask = newVector; for (Integer j = 0; j < `ThreadsPerMailbox; j=j+1) - threadMask = rec.thread == fromInteger(j); + threadMask[j] = rec.thread == fromInteger(j); flit.dest.threads = pack(threadMask); // Replace first two words of message with local key if (firstFlit) flit.payload = {truncateLSB(flit.payload), rec.localKey}; - decision = RouteLocal; + decision = RouteNoC; end // 40-bit Router-to-Router RR: begin @@ -485,19 +497,19 @@ module mkFetcher#(BoardId boardId) (Fetcher); case (rec.dir) NORTH: begin decision = RouteNorth; - flit.dest.board = BoardId {x: boardId.x, y: boardId.y+1}; + flit.dest.addr.board = BoardId {x: boardId.x, y: boardId.y+1}; end SOUTH: begin decision = RouteSouth; - flit.dest.board = BoardId {x: boardId.x, y: boardId.y-1}; + flit.dest.addr.board = BoardId {x: boardId.x, y: boardId.y-1}; end EAST: begin decision = RouteEast; - flit.dest.board = BoardId {x: boardId.x+1, y: boardId.y}; + flit.dest.addr.board = BoardId {x: boardId.x+1, y: boardId.y}; end WEST: begin decision = 
RouteWest; - flit.dest.board = BoardId {x: boardId.x-1, y: boardId.y}; + flit.dest.addr.board = BoardId {x: boardId.x-1, y: boardId.y}; end endcase flit.dest.threads = {?, rec.newKey}; @@ -506,9 +518,9 @@ module mkFetcher#(BoardId boardId) (Fetcher); MRM: begin MRMRecord rec = unpack({beat.chunks[5], beat.chunks[4]}); flit.dest.addr.isKey = False; - flit.dest.addr.mbox = rec.mbox; + flit.dest.addr.mbox = unpack(rec.mbox); flit.dest.threads = rec.destMask; - decision = RouteLocal; + decision = RouteNoC; end // 40-bit Indirection IND: begin @@ -516,10 +528,10 @@ module mkFetcher#(BoardId boardId) (Fetcher); flit.dest.threads = {?, rec.newKey}; decision = RouteLoop; end - end + endcase // Is output queue ready for new flit? Bool emit = flitProcessedQueue.notFull; - Bool newFlitCount = emitFlitCount; + let newFlitCount = emitFlitCount; // Consume routing record if (emit) begin flitProcessedQueue.enq(RoutedFlit { decision: decision, flit: flit }); @@ -537,9 +549,10 @@ module mkFetcher#(BoardId boardId) (Fetcher); for (Integer i = 5; i > 0; i=i-1) newBeat.chunks[i] = beat.chunks[i-1]; end - beatReg <= NumberedRoutingBeat { beatNum: beatNum, beat: newBeat }; + beatReg <= NumberedRoutingBeat { + beat: newBeat, beatNum: beatNum, info: info }; // Is this the final record in the beat? - if ((recordCount+1) == beat.size) begin + if ((recordCount+1) == truncate(beat.size)) begin interpreterState <= 0; // Have we finished with this message yet? 
if (info.finalBurst && info.burst == (beatNum+1)) begin @@ -554,7 +567,7 @@ module mkFetcher#(BoardId boardId) (Fetcher); newFlitCount = 0; end // Issue flit load request - flitBuffer.load({info.msgAddr, newFlitCount}); + flitBuffer.read({info.msgAddr, newFlitCount}); emitFlitCount <= newFlitCount; endrule @@ -575,15 +588,15 @@ module mkFetcher#(BoardId boardId) (Fetcher); flitBypassQueue.deq; flitOutQueue.enq(flitBypassQueue.dataOut); mergeInProgress <= flitBypassQueue.dataOut.flit.notFinalFlit; - prevFromBypass = True; + prevFromBypass <= True; end end else if (flitProcessedQueue.canDeq) begin flitProcessedQueue.deq; flitOutQueue.enq(flitProcessedQueue.dataOut); mergeInProgress <= flitProcessedQueue.dataOut.flit.notFinalFlit; - prevFromBypass = False; + prevFromBypass <= False; end - endrule; + endrule // Interfaces // ----------- @@ -591,7 +604,7 @@ module mkFetcher#(BoardId boardId) (Fetcher); interface flitIn = flitInPort.in; interface flitOut = queueToBOut(flitOutQueue); interface ramReqs = map(queueToBOut, ramReqQueue); - interface ramResps = ramResps; + interface ramResps = ramRespsOut; endmodule @@ -601,8 +614,8 @@ endmodule interface ProgRouter; // Incoming and outgoing flits - interface Vector#(`FetchersPerProgRouter, In#(Flit) flitIn); - interface Vector#(`FetchersPerProgRouter, Out#(Flit) flitOut); + interface Vector#(`FetchersPerProgRouter, In#(Flit)) flitIn; + interface Vector#(`FetchersPerProgRouter, Out#(Flit)) flitOut; // Interface to off-chip memory interface Vector#(`DRAMsPerBoard, From 129c22cedab4693ee2f4e9053142af1a11d59f80 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Sat, 28 Mar 2020 09:49:32 +0000 Subject: [PATCH 13/78] Fix to previous commit --- rtl/ProgRouter.bsv | 1 + 1 file changed, 1 insertion(+) diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index b5de42ef..3bbfa613 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -441,6 +441,7 @@ module mkFetcher#(Integer fetcherId, BoardId boardId) (Fetcher); if 
(beatBuffer.canDeq && beatBuffer.canPeek) begin beatReg <= beat; beatBuffer.deq; + beatBufferLen.dec; interpreterState <= 1; end // Load first flit From 6551259d9bd161eb2c0f10c2206edd7810fdab7c Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 31 Mar 2020 11:29:31 +0100 Subject: [PATCH 14/78] ProgRouter module complete (Compiles but untested) --- config.py | 1 - rtl/ProgRouter.bsv | 198 ++++++++++++++++++++++++++++++++++++++++++--- rtl/Util.bsv | 20 +++++ 3 files changed, 208 insertions(+), 11 deletions(-) diff --git a/config.py b/config.py index 919ab4a8..50bc3480 100755 --- a/config.py +++ b/config.py @@ -369,7 +369,6 @@ def quoted(s): return "'\"" + s + "\"'" # Number of FPGA boards per box (including bridge board) p["BoardsPerBox"] = p["MeshXLenWithinBox"] * p["MeshYLenWithinBox"] + 1 -# Number of fetchers in the per-board programmable router # Parameters for programmable routers # (and the routing-record fetchers they contain) p["FetchersPerProgRouter"] = 5 diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index 3bbfa613..6a038e9d 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -10,6 +10,7 @@ import Queue :: *; import Interface :: *; import BlockRam :: *; import Assert :: *; +import Util :: *; // ============================================================================= // Routing keys and beats @@ -171,7 +172,7 @@ typedef enum { // N/L0 S/L1 E/L2 W/L3 Ind-----+ Output queues // | | | | // +---------------------------+ -// | Expander | Final expansion +// | Splitter | Final splitting // +---------------------------+ // | | | | | | | | // N S E W L0 L1 L2 L3 Output flits @@ -199,7 +200,7 @@ typedef enum { // fair crossbar which organises them by destination into output // queues. To reduce logic, we allow each inter-board link to share // an output queue with a local link, as this does not compromise -// forward progress. Finally the queues are expanded to provide an +// forward progress. 
Finally the queues are split to provide an // output stream for each possible destination. // ============================================================================= @@ -250,7 +251,7 @@ interface Fetcher; endinterface // Fetcher module -module mkFetcher#(Integer fetcherId, BoardId boardId) (Fetcher); +module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); // Flit input port InPort#(Flit) flitInPort <- mkInPort; @@ -609,6 +610,125 @@ module mkFetcher#(Integer fetcherId, BoardId boardId) (Fetcher); endmodule +// ============================================================================= +// Crossbar +// ============================================================================= + +// Selector function for a mux in the programmable router crossbar +typedef function Bool selector(RoutedFlit flit) SelectorFunc; + +module mkProgRouterCrossbar#( + Vector#(n, SelectorFunc) f, + Vector#(n, BOut#(RoutedFlit)) out) + (Vector#(n, BOut#(RoutedFlit))) + provisos (Add#(a_, 1, n)); + + // Input ports + Vector#(n, InPort#(RoutedFlit)) inPort <- replicateM(mkInPort); + + // Connect up input ports + for (Integer i = 0; i < valueOf(n); i=i+1) + connectDirect(out[i], inPort[i].in); + + // Cosume wires, for each input port + Vector#(n, PulseWire) consumeWire<- replicateM(mkPulseWireOR); + + // Keep track of service history for flit sources (for fair selection) + Vector#(n, Reg#(Bit#(n))) hist <- replicateM(mkReg(0)); + + // Current choice of flit source + Vector#(n, Reg#(Bit#(n))) choiceReg <- replicateM(mkReg(0)); + + // Output queue + Vector#(n, Queue1#(RoutedFlit)) outQueue <- + replicateM(mkUGShiftQueue(QueueOptFmax)); + + // Selector mux for each out queue + for (Integer i = 0; i < valueOf(n); i=i+1) begin + + rule select; + // Vector of input flits and available flits + Vector#(n, RoutedFlit) flits = newVector; + Vector#(n, Bool) avails = newVector; + for (Integer i = 0; i < valueOf(n); i=i+1) begin + flits[i] = inPort[i].value; + avails[i] = 
f[i](inPort[i].value) && inPort[i].canGet; + end + Bit#(n) avail = pack(avails); + // Choose a new source using fair scheduler + match {.newHist, .choice} = sched(hist[i], avail); + // Select a flit + RoutedFlit flit = + oneHotSelect(unpack(choiceReg[i]), flits); + // Consume a flit + if (choiceReg[i] != 0) begin + if (outQueue[i].notFull) begin + // Pass chosen flit to out queue + outQueue[i].enq(flit); + // On final flit of message + if (!flit.flit.notFinalFlit) begin + if (choice != choiceReg[i]) begin + choiceReg[i] <= choice; + hist[i] <= newHist; + end else + choiceReg[i] <= 0; + end + end + end else begin + choiceReg[i] <= choice; + hist[i] <= newHist; + end + // Consume from chosen source + for (Integer j = 0; j < valueOf(n); j=j+1) + if (outQueue[i].notFull && choiceReg[i][j] == 1) + consumeWire[j].send; + endrule + + end + + // Consume from flit sources + rule consumeFlitSources; + for (Integer j = 0; j < valueOf(n); j=j+1) + if (consumeWire[j]) inPort[j].get; + endrule + + return map(queueToBOut, outQueue); +endmodule + + +// ============================================================================= +// Splitter +// ============================================================================= + +// Split a single stream in two based on a predicate +module splitFlits#(SelectorFunc f, BOut#(RoutedFlit) out) + (Tuple2#(BOut#(Flit), BOut#(Flit))); + + // Consume wire + PulseWire consumeWire <- mkPulseWireOR; + + // Output streams + BOut#(Flit) outYes = + interface BOut + method Action get = consumeWire.send; + method Bool valid = out.valid && f(out.value); + method Flit value = out.value.flit; + endinterface; + BOut#(Flit) outNo = + interface BOut + method Action get = consumeWire.send; + method Bool valid = out.valid && !f(out.value); + method Flit value = out.value.flit; + endinterface; + + // Consume + rule consume; + if (consumeWire) out.get; + endrule + + return tuple2(outYes, outNo); +endmodule + // 
============================================================================= // Programmable router // ============================================================================= @@ -616,7 +736,8 @@ endmodule interface ProgRouter; // Incoming and outgoing flits interface Vector#(`FetchersPerProgRouter, In#(Flit)) flitIn; - interface Vector#(`FetchersPerProgRouter, Out#(Flit)) flitOut; + interface Vector#(`FetchersPerProgRouter, BOut#(Flit)) flitOut; + interface Vector#(`MailboxMeshXLen, BOut#(Flit)) nocFlitOut; // Interface to off-chip memory interface Vector#(`DRAMsPerBoard, @@ -625,14 +746,71 @@ interface ProgRouter; Vector#(`FetchersPerProgRouter, In#(DRAMResp))) ramResps; endinterface -/* -module mkProgRouter (ProgRouter); +module mkProgRouter#(BoardId boardId) (ProgRouter); + + // Fetchers + Vector#(`FetchersPerProgRouter, Fetcher) fetchers = newVector; + for (Integer i = 0; i < `FetchersPerProgRouter; i=i+1) + fetchers[i] <- mkFetcher(boardId, i); + + // Crossbar routing functions + function Bit#(2) xcoord(RoutedFlit rf) = + zeroExtend(rf.flit.dest.addr.mbox.x); + function Bool routeN(RoutedFlit rf) = + rf.decision == RouteNorth || (rf.decision == RouteNoC && xcoord(rf) == 0); + function Bool routeS(RoutedFlit rf) = + rf.decision == RouteSouth || (rf.decision == RouteNoC && xcoord(rf) == 1); + function Bool routeE(RoutedFlit rf) = + rf.decision == RouteEast || (rf.decision == RouteNoC && xcoord(rf) == 2); + function Bool routeW(RoutedFlit rf) = + rf.decision == RouteWest || (rf.decision == RouteNoC && xcoord(rf) == 3); + function Bool routeLoop(RoutedFlit rf) = rf.decision == RouteLoop; + Vector#(`FetchersPerProgRouter, SelectorFunc) funcs = + vector(routeN, routeS, routeE, routeW, routeLoop); + + // Crossbar + function BOut#(RoutedFlit) getFetcherFlitOut(Fetcher f) = f.flitOut; + Vector#(`FetchersPerProgRouter, BOut#(RoutedFlit)) fetcherOuts = + map(getFetcherFlitOut, fetchers); + Vector#(`FetchersPerProgRouter, BOut#(RoutedFlit)) + crossbarOuts <- 
mkProgRouterCrossbar(funcs, fetcherOuts); + + // Flit input interfaces + Vector#(`FetchersPerProgRouter, In#(Flit)) flitInIfc = newVector; + for (Integer i = 0; i < `FetchersPerProgRouter; i=i+1) + flitInIfc[i] = fetchers[i].flitIn; + + // Flit output interfaces + Vector#(`FetchersPerProgRouter, BOut#(Flit)) flitOutIfc = newVector; + Vector#(`MailboxMeshXLen, BOut#(Flit)) nocFlitOutIfc = newVector; + + // Strands + function Bool forNoC(RoutedFlit rf) = rf.decision == RouteNoC; + for (Integer i = 0; i < 4; i=i+1) begin + match {.noc, .other} <- splitFlits(forNoC, crossbarOuts[i]); + flitOutIfc[i] = other; + if (i < `MailboxMeshXLen) nocFlitOutIfc[i] = noc; + end + function Flit toFlit (RoutedFlit rf) = rf.flit; + flitOutIfc[4] <- onBOut(toFlit, crossbarOuts[4]); + + // RAM interfaces + Vector#(`DRAMsPerBoard, Vector#(`FetchersPerProgRouter, In#(DRAMResp))) + ramRespIfc = replicate(newVector); + Vector#(`DRAMsPerBoard, Vector#(`FetchersPerProgRouter, BOut#(DRAMReq))) + ramReqIfc = replicate(newVector); + for (Integer i = 0; i < `DRAMsPerBoard; i=i+1) + for (Integer j = 0; j < `FetchersPerProgRouter; j=j+1) begin + ramReqIfc[i][j] = fetchers[j].ramReqs[i]; + ramRespIfc[i][j] = fetchers[j].ramResps[i]; + end - // Flit input ports - Vector#(`FetchersPerProgRouter, InPort#(Flit)) flitInPort <- - replicateM(mkInPort); + interface flitIn = flitInIfc; + interface flitOut = flitOutIfc; + interface nocFlitOut = nocFlitOutIfc; + interface ramReqs = ramReqIfc; + interface ramResps = ramRespIfc; endmodule -*/ endpackage diff --git a/rtl/Util.bsv b/rtl/Util.bsv index 7ac885c3..507d1ef2 100644 --- a/rtl/Util.bsv +++ b/rtl/Util.bsv @@ -254,4 +254,24 @@ module mkBuffer#(Integer n, dataT init, dataT inp) (dataT) return regs[n-1]; endmodule +// Isolate first hot bit +function Bit#(n) firstHot(Bit#(n) x) = x & (~x + 1); + +// Function for fair scheduling of n tasks +function Tuple2#(Bit#(n), Bit#(n)) sched(Bit#(n) hist, Bit#(n) avail); + // First choice: an available bit that's not in 
the history + Bit#(n) first = firstHot(avail & ~hist); + // Second choice: any available bit + Bit#(n) second = firstHot(avail); + + // Return new history, and chosen bit + if (first != 0) begin + // Return first choice, and update history + return tuple2(hist | first, first); + end else begin + // Return second choice, and reset history + return tuple2(second, second); + end +endfunction + endpackage From 2ed587effd9dfe8d7d0af7923e432c055cf7670c Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 31 Mar 2020 15:34:06 +0100 Subject: [PATCH 15/78] Use ProgRouter instead of BoardRouter --- rtl/Interface.bsv | 8 +++ rtl/Network.bsv | 143 ++++++---------------------------------------- 2 files changed, 25 insertions(+), 126 deletions(-) diff --git a/rtl/Interface.bsv b/rtl/Interface.bsv index dffd8ac2..0484cb41 100644 --- a/rtl/Interface.bsv +++ b/rtl/Interface.bsv @@ -212,6 +212,14 @@ module onBOut#(function u f(t x), BOut#(t) out) (BOut#(u)); method u value = f(out.value); endmodule +// Convert BOut to Out +function Out#(t) fromBOut(BOut#(t) out) = + interface Out + method Action tryGet = out.get; + method Bool valid = out.valid; + method t value = out.value; + endinterface; + // A null In port accepts and discards all inputs module mkNullIn (In#(t)); method Action tryPut(u val); endmethod diff --git a/rtl/Network.bsv b/rtl/Network.bsv index 6706d00f..00066cce 100644 --- a/rtl/Network.bsv +++ b/rtl/Network.bsv @@ -221,104 +221,6 @@ module mkMeshRouter#(MailboxId m) (MeshRouter); endmodule -// ============================================================================= -// Board router -// ============================================================================= - -// Similar to a mesh router, but: (1) different routing function, -// which routes between boards rather than mailboxes; (2) no loopback -// in the sense that packets coming from mailbox mesh never get routed back -// onto mailbox mesh. 
This is a first step towards supporting -// programmable board routers. -module mkBoardRouter(MeshRouter); - - // Board id - Wire#(BoardId) b <- mkDWire(?); - - // Ports - InPort#(Flit) leftInPort <- mkInPort; - OutPort#(Flit) leftOutPort <- mkOutPort; - InPort#(Flit) rightInPort <- mkInPort; - OutPort#(Flit) rightOutPort <- mkOutPort; - InPort#(Flit) topInPort <- mkInPort; - OutPort#(Flit) topOutPort <- mkOutPort; - InPort#(Flit) bottomInPort <- mkInPort; - OutPort#(Flit) bottomOutPort <- mkOutPort; - InPort#(Flit) fromMailboxPort <- mkInPort; - OutPort#(Flit) toMailboxPort <- mkOutPort; - - // Routing function - function Route route(NetAddr a); - if (a.addr.host.valid) return a.addr.host.value == 0 ? Left : Right; - else if (a.addr.board.x < b.x) return Left; - else if (a.addr.board.x > b.x) return Right; - else if (a.addr.board.y < b.y) return Down; - else if (a.addr.board.y > b.y) return Up; - else return Mailbox; - endfunction - - // Route to the mailbox - mkRouterMux( - route, - Mailbox, - toMailboxPort, - vector(FromLeft, FromRight, FromTop, FromBottom), - vector(leftInPort, rightInPort, topInPort, bottomInPort) - ); - - // Route left - mkRouterMux( - route, - Left, - leftOutPort, - vector(FromRight, FromTop, FromBottom, FromMailbox), - vector(rightInPort, topInPort, bottomInPort, fromMailboxPort) - ); - - // Route right - mkRouterMux( - route, - Right, - rightOutPort, - vector(FromLeft, FromTop, FromBottom, FromMailbox), - vector(leftInPort, topInPort, bottomInPort, fromMailboxPort) - ); - - // Route up - mkRouterMux( - route, - Up, - topOutPort, - vector(FromLeft, FromRight, FromBottom, FromMailbox), - vector(leftInPort, rightInPort, bottomInPort, fromMailboxPort) - ); - - // Route down - mkRouterMux( - route, - Down, - bottomOutPort, - vector(FromLeft, FromRight, FromTop, FromMailbox), - vector(leftInPort, rightInPort, topInPort, fromMailboxPort) - ); - - method Action setBoardId(BoardId id); - b <= id; - endmethod - - // Interface - interface In leftIn = 
leftInPort.in; - interface Out leftOut = leftOutPort.out; - interface In rightIn = rightInPort.in; - interface Out rightOut = rightOutPort.out; - interface In topIn = topInPort.in; - interface Out topOut = topOutPort.out; - interface In bottomIn = bottomInPort.in; - interface Out bottomOut = bottomOutPort.out; - interface In fromMailbox = fromMailboxPort.in; - interface Out toMailbox = toMailboxPort.out; -endmodule - // ============================================================================= // Flit-sized reliable links // ============================================================================= @@ -478,59 +380,48 @@ module mkNoC#( routers[y+1][x].bottomOut, routers[y][x].topIn); end - // Board router - // ------------ + // Programmable board router + // ------------------------- - // For routing messages between boards - MeshRouter boardRouter <- mkBoardRouter; - - // Set board id for board router - rule setBoardRouterId; - boardRouter.setBoardId(boardId); - endrule + // Programmable router + ProgRouter boardRouter <- mkProgRouter(boardId); // Connect board router to north link + connectDirect(boardRouter.flitOut[0], northLink[0].flitIn); connectUsing(mkUGShiftQueue1(QueueOptFmax), - boardRouter.topOut, northLink[0].flitIn); - connectUsing(mkUGShiftQueue1(QueueOptFmax), - northLink[0].flitOut, boardRouter.topIn); + northLink[0].flitOut, boardRouter.flitIn[0]); // Connect board router to south link + connectDirect(boardRouter.flitOut[1], southLink[0].flitIn); connectUsing(mkUGShiftQueue1(QueueOptFmax), - boardRouter.bottomOut, southLink[0].flitIn); - connectUsing(mkUGShiftQueue1(QueueOptFmax), - southLink[0].flitOut, boardRouter.bottomIn); + southLink[0].flitOut, boardRouter.flitIn[1]); // Connect board router to east link + connectDirect(boardRouter.flitOut[2], eastLink[0].flitIn); connectUsing(mkUGShiftQueue1(QueueOptFmax), - boardRouter.rightOut, eastLink[0].flitIn); - connectUsing(mkUGShiftQueue1(QueueOptFmax), - eastLink[0].flitOut, 
boardRouter.rightIn); + eastLink[0].flitOut, boardRouter.flitIn[2]); // Connect board router to west link + connectDirect(boardRouter.flitOut[3], westLink[0].flitIn); connectUsing(mkUGShiftQueue1(QueueOptFmax), - boardRouter.leftOut, westLink[0].flitIn); - connectUsing(mkUGShiftQueue1(QueueOptFmax), - westLink[0].flitOut, boardRouter.leftIn); + westLink[0].flitOut, boardRouter.flitIn[3]); // Connect mailbox mesh south rim to board router function List#(t) single(t elem) = List::cons(elem, Nil); List#(Out#(Flit)) botOutList = Nil; for (Integer x = `MailboxMeshXLen-1; x >= 0; x=x-1) botOutList = Cons(routers[0][x].bottomOut, botOutList); + // Also include loopback connection to board router to implement IND records + botOutList = Cons(fromBOut(boardRouter.flitOut[4]), botOutList); function In#(Flit) getFlitIn(BoardLink link) = link.flitIn; - reduceConnect(mkFlitMerger, botOutList, single(boardRouter.fromMailbox)); + reduceConnect(mkFlitMerger, botOutList, single(boardRouter.flitIn[4])); // Connect board router to mailbox mesh south rim function In#(Flit) getBottomIn(MeshRouter r) = r.bottomIn; Vector#(`MailboxMeshXLen, In#(Flit)) southRimInPorts = map(getBottomIn, routers[0]); - function Bit#(`MailboxMeshXBits) flitGetX(Flit flit) = - flit.dest.addr.mbox.x; - let southRimDistributor <- mkResponseDistributor(flitGetX, - mkUGShiftQueue1(QueueOptFmax), southRimInPorts); - connectUsing(mkUGShiftQueue1(QueueOptFmax), boardRouter.toMailbox, - southRimDistributor); + for (Integer i = 0; i < `MailboxMeshXLen; i=i+1) + connectDirect(boardRouter.nocFlitOut[i], southRimInPorts[i]); // Detect inter-board activity // --------------------------- From 04846495dfae6a76aa7887ed619a5574693318d6 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 31 Mar 2020 15:38:08 +0100 Subject: [PATCH 16/78] Wire up ProgRouter's off-chip RAM interfaces --- rtl/Network.bsv | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/rtl/Network.bsv b/rtl/Network.bsv index 
00066cce..5820209a 100644 --- a/rtl/Network.bsv +++ b/rtl/Network.bsv @@ -308,16 +308,6 @@ module mkNoC#( Vector#(`NumEastWestLinks, BoardLink) westLink <- mapM(mkBoardLink(linkEnable[3]), westSocket); - // Responses from off-chip memory - Vector#(`DRAMsPerBoard, - Vector#(`FetchersPerProgRouter, InPort#(DRAMResp))) dramRespPort <- - replicateM(replicateM(mkInPort)); - - // Requests to off-chip memory - Vector#(`DRAMsPerBoard, - Vector#(`FetchersPerProgRouter, Queue1#(DRAMReq))) dramReqQueues <- - replicateM(replicateM(mkUGShiftQueue1(QueueOptFmax))); - // Dimension-ordered routers // ------------------------- @@ -466,12 +456,10 @@ module mkNoC#( `endif // Requests to off-chip memory - interface dramReqs = - Vector::map(Vector::map(queueToBOut), dramReqQueues); + interface dramReqs = boardRouter.ramReqs; // Responses from off-chip memory - interface dramResps = - Vector::map(Vector::map(getIn), dramRespPort); + interface dramResps = boardRouter.ramResps; endmodule From a8e3c8f3226d96a064aefaedbffe8492f284bfc1 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Wed, 1 Apr 2020 11:39:49 +0100 Subject: [PATCH 17/78] Fix ProgRouter's crossbar --- rtl/ProgRouter.bsv | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index 6a038e9d..17fd574f 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -334,6 +334,7 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); consumeState <= 1; end end else if (flitBypassQueue.notFull) begin +$display("ProgRouter: bypass"); flitInPort.get; // Make routing decision RoutingDecision decision = RouteNoC; @@ -469,7 +470,8 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); URM1: begin URM1Record rec = unpack(beat.chunks[5]); flit.dest.addr.isKey = False; - flit.dest.addr.mbox = unpack(rec.mbox); + flit.dest.addr.mbox.x = unpack(truncate(rec.mbox[1:0])); + flit.dest.addr.mbox.y = 
unpack(truncate(rec.mbox[3:2])); Vector#(`ThreadsPerMailbox, Bool) threadMask = newVector; for (Integer j = 0; j < `ThreadsPerMailbox; j=j+1) threadMask[j] = rec.thread == fromInteger(j); @@ -483,7 +485,8 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); URM2: begin URM2Record rec = unpack({beat.chunks[5], beat.chunks[4]}); flit.dest.addr.isKey = False; - flit.dest.addr.mbox = unpack(rec.mbox); + flit.dest.addr.mbox.x = unpack(truncate(rec.mbox[1:0])); + flit.dest.addr.mbox.y = unpack(truncate(rec.mbox[3:2])); Vector#(`ThreadsPerMailbox, Bool) threadMask = newVector; for (Integer j = 0; j < `ThreadsPerMailbox; j=j+1) threadMask[j] = rec.thread == fromInteger(j); @@ -520,7 +523,8 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); MRM: begin MRMRecord rec = unpack({beat.chunks[5], beat.chunks[4]}); flit.dest.addr.isKey = False; - flit.dest.addr.mbox = unpack(rec.mbox); + flit.dest.addr.mbox.x = unpack(truncate(rec.mbox[1:0])); + flit.dest.addr.mbox.y = unpack(truncate(rec.mbox[3:2])); flit.dest.threads = rec.destMask; decision = RouteNoC; end @@ -587,6 +591,7 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); flitBypassQueue.canDeq; if (chooseBypass) begin if (flitBypassQueue.canDeq) begin +$display("ProgRouter: consuming from bypass queue"); flitBypassQueue.deq; flitOutQueue.enq(flitBypassQueue.dataOut); mergeInProgress <= flitBypassQueue.dataOut.flit.notFinalFlit; @@ -649,38 +654,37 @@ module mkProgRouterCrossbar#( rule select; // Vector of input flits and available flits Vector#(n, RoutedFlit) flits = newVector; - Vector#(n, Bool) avails = newVector; - for (Integer i = 0; i < valueOf(n); i=i+1) begin - flits[i] = inPort[i].value; - avails[i] = f[i](inPort[i].value) && inPort[i].canGet; + Vector#(n, Bool) nextAvails = newVector; + Bool avail = False; + for (Integer j = 0; j < valueOf(n); j=j+1) begin + flits[j] = inPort[j].value; + nextAvails[j] = inPort[j].canGet && f[i](inPort[j].value) + && choiceReg[i][j] == 
0; + avail = avail || (choiceReg[i][j] == 1 && inPort[j].canGet); end - Bit#(n) avail = pack(avails); + Bit#(n) nextAvail = pack(nextAvails); // Choose a new source using fair scheduler - match {.newHist, .choice} = sched(hist[i], avail); + match {.newHist, .nextChoice} = sched(hist[i], nextAvail); // Select a flit - RoutedFlit flit = - oneHotSelect(unpack(choiceReg[i]), flits); + RoutedFlit flit = oneHotSelect(unpack(choiceReg[i]), flits); // Consume a flit - if (choiceReg[i] != 0) begin + if (avail) begin if (outQueue[i].notFull) begin // Pass chosen flit to out queue outQueue[i].enq(flit); // On final flit of message if (!flit.flit.notFinalFlit) begin - if (choice != choiceReg[i]) begin - choiceReg[i] <= choice; - hist[i] <= newHist; - end else - choiceReg[i] <= 0; + choiceReg[i] <= nextChoice; + hist[i] <= newHist; end end - end else begin - choiceReg[i] <= choice; + end else if (choiceReg[i] == 0) begin + choiceReg[i] <= nextChoice; hist[i] <= newHist; end // Consume from chosen source for (Integer j = 0; j < valueOf(n); j=j+1) - if (outQueue[i].notFull && choiceReg[i][j] == 1) + if (inPort[j].canGet && choiceReg[i][j] == 1 && outQueue[i].notFull) consumeWire[j].send; endrule From 53f1acf60c5a8ac640ca01f7e3b700c1f07908f9 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Wed, 1 Apr 2020 11:42:29 +0100 Subject: [PATCH 18/78] Drop debug statements --- rtl/ProgRouter.bsv | 2 -- 1 file changed, 2 deletions(-) diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index 17fd574f..230c1170 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -334,7 +334,6 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); consumeState <= 1; end end else if (flitBypassQueue.notFull) begin -$display("ProgRouter: bypass"); flitInPort.get; // Make routing decision RoutingDecision decision = RouteNoC; @@ -591,7 +590,6 @@ $display("ProgRouter: bypass"); flitBypassQueue.canDeq; if (chooseBypass) begin if (flitBypassQueue.canDeq) begin -$display("ProgRouter: consuming 
from bypass queue"); flitBypassQueue.deq; flitOutQueue.enq(flitBypassQueue.dataOut); mergeInProgress <= flitBypassQueue.dataOut.flit.notFinalFlit; From 7cecf664286d4985fa6fbb59530bc5890cca5316 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Thu, 2 Apr 2020 14:22:38 +0100 Subject: [PATCH 19/78] Simple bare-metal test for the ProgRouter (Works, but only tests one type of routing record so far) --- Makefile | 1 + apps/progrouter/Makefile | 53 ++++++++++++ apps/progrouter/entry.S | 3 + apps/progrouter/genld.sh | 32 ++++++++ apps/progrouter/progrouter.cpp | 142 +++++++++++++++++++++++++++++++++ apps/progrouter/run.cpp | 12 +++ rtl/Globals.bsv | 2 +- rtl/Network.bsv | 1 + rtl/ProgRouter.bsv | 19 ++--- 9 files changed, 255 insertions(+), 10 deletions(-) create mode 100644 apps/progrouter/Makefile create mode 100644 apps/progrouter/entry.S create mode 100755 apps/progrouter/genld.sh create mode 100644 apps/progrouter/progrouter.cpp create mode 100644 apps/progrouter/run.cpp diff --git a/Makefile b/Makefile index 5b2608a3..d52882f7 100644 --- a/Makefile +++ b/Makefile @@ -24,6 +24,7 @@ clean: make -C apps/multiprog clean make -C apps/sync clean make -C apps/temps clean + make -C apps/progrouter clean make -C apps/POLite/heat-gals clean make -C apps/POLite/heat-sync clean make -C apps/POLite/asp-gals clean diff --git a/apps/progrouter/Makefile b/apps/progrouter/Makefile new file mode 100644 index 00000000..b58478f2 --- /dev/null +++ b/apps/progrouter/Makefile @@ -0,0 +1,53 @@ +# Tinsel root +TINSEL_ROOT=../.. 
+ +ifndef QUARTUS_ROOTDIR + $(error Please set QUARTUS_ROOTDIR) +endif + +include $(TINSEL_ROOT)/globals.mk + +# RISC-V compiler flags +CFLAGS = $(RV_CFLAGS) -O2 -I $(INC) +LDFLAGS = -melf32lriscv -G 0 + +.PHONY: all +all: code.v data.v run + +code.v: progrouter.elf + checkelf.sh progrouter.elf + $(RV_OBJCOPY) -O verilog --only-section=.text progrouter.elf code.v + +data.v: progrouter.elf + $(RV_OBJCOPY) -O verilog --remove-section=.text \ + --set-section-flags .bss=alloc,load,contents \ + progrouter.elf data.v + +progrouter.elf: progrouter.cpp link.ld $(INC)/config.h $(INC)/tinsel.h entry.o + $(RV_CPPC) $(CFLAGS) -Wall -c -o progrouter.o progrouter.cpp + $(RV_LD) $(LDFLAGS) -T link.ld -o progrouter.elf entry.o progrouter.o $(LIB)/lib.o + +entry.o: + $(RV_CPPC) $(CFLAGS) -Wall -c -o entry.o entry.S + +link.ld: genld.sh + ./genld.sh > link.ld + +$(LIB)/lib.o: + make -C $(LIB) + +$(INC)/config.h: $(TINSEL_ROOT)/config.py + make -C $(INC) + +$(HL)/%.o: + make -C $(HL) + +run: run.cpp $(HL)/*.o + g++ -O2 -I $(INC) -I $(HL) -o run run.cpp $(HL)/*.o + +sim: run.cpp $(HL)/sim/*.o + g++ -O2 -I $(INC) -I $(HL) -o sim run.cpp $(HL)/sim/*.o + +.PHONY: clean +clean: + rm -f *.o *.elf link.ld *.v run sim diff --git a/apps/progrouter/entry.S b/apps/progrouter/entry.S new file mode 100644 index 00000000..18cd8d27 --- /dev/null +++ b/apps/progrouter/entry.S @@ -0,0 +1,3 @@ +# We assume the boot loader has already setup the stack. +# All we need to do is jump to main. +j main diff --git a/apps/progrouter/genld.sh b/apps/progrouter/genld.sh new file mode 100755 index 00000000..cfe144c4 --- /dev/null +++ b/apps/progrouter/genld.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# Load config parameters +while read -r EXPORT; do + eval $EXPORT +done <<< `python ../../config.py envs` + +# Compute space available for instructions +MaxInstrBytes=$((4 * 2**$LogInstrsPerCore - $MaxBootImageBytes)) + +cat - << EOF +/* THIS FILE HAS BEEN GENERATED AUTOMATICALLY. */ +/* DO NOT MODIFY. 
INSTEAD, MODIFY THE genld.sh SCRIPT. */ + +OUTPUT_ARCH( "riscv" ) + +MEMORY +{ + instrs : ORIGIN = $MaxBootImageBytes, LENGTH = $MaxInstrBytes + globals : ORIGIN = $DRAMBase, LENGTH = $DRAMGlobalsLength +} + +SECTIONS +{ + .text : { *.o(.text*) } > instrs + .bss : { *.o(.bss*) } > globals = 0 + .rodata : { *.o(.rodata*) } > globals + .sdata : { *.o(.sdata*) } > globals + .data : { *.o(.data*) } > globals + __heapBase = ALIGN(.); +} +EOF diff --git a/apps/progrouter/progrouter.cpp b/apps/progrouter/progrouter.cpp new file mode 100644 index 00000000..b1740082 --- /dev/null +++ b/apps/progrouter/progrouter.cpp @@ -0,0 +1,142 @@ +#include <tinsel.h> + +// Simplest possible example involving programmable routers + +/* +Byte ordering in a routing beat: + + 31: Upper byte of length (i.e. number of records in beat) + 30: Lower byte of length + 29: Upper byte of first chunk + 28: + 27: + 26: + 25: Lower byte of first chunk + 24: Upper byte of second chunk + 23: + 22: + 21: + 20: Lower byte of second chunk + 19: Upper byte of third chunk + 18: + 17: + 16: + 15: Lower byte of third chunk + 14: Upper byte of fourth chunk + 13: + 12: + 11: + 10: Lower byte of fourth chunk + 9: Upper byte of fifth chunk + 8: + 7: + 6: + 5: Lower byte of fifth chunk + 4: Upper byte of sixth chunk + 3: + 2: + 1: + 0: Lower byte of sixth chunk + +Need to fold this into the docs eventually. 
+*/ + +// Use this to align on beat boundary +#define ALIGNED __attribute__((aligned(32))) + +// A single RAM beat +struct ALIGNED Beat { + uint8_t bytes[32]; +}; + +// Routing table, with methods to aid construction +template <int NumBeats> struct RoutingTable { + // Raw beats comprising the table + Beat beats[NumBeats]; + + // Number of chunks used so far in current beat + uint32_t numChunks; + + // Index of beat currently being filled + uint32_t currentBeat; + + // Constructor + RoutingTable() { + currentBeat = 0; + } + + // Pointer to current beat being filled + uint8_t* currentPointer() { + return beats[currentBeat].bytes; + } + + // Move on to next the beat + void next() { + beats[currentBeat].bytes[31] = 0; + beats[currentBeat].bytes[30] = numChunks; + numChunks = 0; + currentBeat++; + } + + // Add a URM1 record to the table + void addURM1(uint32_t mboxX, uint32_t mboxY, + uint32_t mboxThread, uint32_t localKey) { + uint8_t* ptr = beats[currentBeat].bytes + 5*(5-numChunks); + ptr[0] = localKey; + ptr[1] = localKey >> 8; + ptr[2] = localKey >> 16; + ptr[3] = ((mboxThread&0x1f) << 3) | ((localKey >> 24) & 0x7); + ptr[4] = (mboxY << 3) | (mboxX << 1) | (mboxThread >> 5); + numChunks++; + if (numChunks == 6) next(); + } +}; + +// Create global routing table of 16 beats +RoutingTable<16> table; + +int main() +{ + // Get thread id + int me = tinselId(); + + // Sample outgoing message + volatile uint32_t* msgOut = (uint32_t*) tinselSendSlot(); + msgOut[0] = 0x10; + msgOut[1] = 0x20; + msgOut[2] = 0x30; + msgOut[3] = 0x40; + + // On thread 0 + if (me == 0) { + // Add an URM1 record + uint8_t* entry = table.currentPointer(); + table.addURM1(0, 0, 10, 0xff); + table.next(); + + // Cache flush, to write table into RAM + tinselCacheFlush(); + // Wait until flush done, by issuing a load + volatile uint32_t* dummyPtr = (uint32_t*) entry; dummyPtr[0]; + + // Construct key + uint32_t key = (uint32_t) entry; + key = key | 1; // Entry is 1 beat long + + // Send message to key + 
tinselWaitUntil(TINSEL_CAN_SEND); + tinselKeySend(key, msgOut); + + while (1); + } + + // On other threads, print anything received + while (me != 0) { + tinselWaitUntil(TINSEL_CAN_RECV); + volatile uint32_t* msgIn = (uint32_t*) tinselRecv(); + printf("%x %x %x %x\n", msgIn[0], msgIn[1], msgIn[2], msgIn[3]); + tinselFree(msgIn); + } + + return 0; +} diff --git a/apps/progrouter/run.cpp b/apps/progrouter/run.cpp new file mode 100644 index 00000000..a198a064 --- /dev/null +++ b/apps/progrouter/run.cpp @@ -0,0 +1,12 @@ +#include <HostLink.h> + +int main() +{ + HostLink hostLink; + + hostLink.boot("code.v", "data.v"); + hostLink.go(); + hostLink.dumpStdOut(); + + return 0; +} diff --git a/rtl/Globals.bsv b/rtl/Globals.bsv index 914852e9..d240aa2c 100644 --- a/rtl/Globals.bsv +++ b/rtl/Globals.bsv @@ -69,7 +69,7 @@ typedef struct { Bool notFinalFlit; // Is this a special packet for idle-detection? Bool isIdleToken; -} Flit deriving (Bits); +} Flit deriving (Bits, FShow); // A padded flit is a multiple of 64 bits // (i.e. 
the data width of the 10G MAC interface) diff --git a/rtl/Network.bsv b/rtl/Network.bsv index 5820209a..4ee2e69b 100644 --- a/rtl/Network.bsv +++ b/rtl/Network.bsv @@ -150,6 +150,7 @@ module mkMeshRouter#(MailboxId m) (MeshRouter); // Routing function function Route route(NetAddr a); if (a.addr.board != b) return Down; + else if (a.addr.isKey) return Down; else if (a.addr.host.valid) return Down; else if (a.addr.mbox.y < m.y) return Down; else if (a.addr.mbox.y > m.y) return Up; diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index 230c1170..48c29f15 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -32,7 +32,7 @@ typedef struct { Bit#(16) size; // The 40-bit record chunks Vector#(6, Bit#(40)) chunks; -} RoutingBeat deriving (Bits); +} RoutingBeat deriving (Bits, FShow); // 32-bit routing key typedef struct { @@ -42,7 +42,7 @@ typedef struct { Bit#(`LogBeatsPerDRAM) ptr; // Number of beats in the array Bit#(`LogRoutingEntryLen) numBeats; -} RoutingKey deriving (Bits); +} RoutingKey deriving (Bits, FShow); // Extract routing key from an address function RoutingKey getRoutingKey(NetAddr addr) = @@ -58,7 +58,7 @@ typedef enum { RR = 3'd2, // 40-bit Router-to-Router MRM = 3'd3, // 80-bit Multicast Router-to-Mailbox IND = 3'd4 // 40-bit Indirection -} RoutingRecordTag deriving (Bits, Eq); +} RoutingRecordTag deriving (Bits, Eq, FShow); typedef enum { NORTH = 2'd0, @@ -78,7 +78,7 @@ typedef struct { // Local key. The first word of the message // payload is overwritten with this. 
Bit#(27) localKey; -} URM1Record deriving (Bits); +} URM1Record deriving (Bits, FShow); // 80-bit Unicast Router-to-Mailbox (URM2) record typedef struct { @@ -137,7 +137,7 @@ typedef struct { Flit flit; // Routing decision for flit RoutingDecision decision; -} RoutedFlit deriving (Bits); +} RoutedFlit deriving (Bits, FShow); // Routing decision typedef enum { @@ -147,7 +147,7 @@ typedef enum { RouteWest, RouteNoC, RouteLoop -} RoutingDecision deriving (Bits, Eq); +} RoutingDecision deriving (Bits, Eq, FShow); // ============================================================================= // Design @@ -228,7 +228,7 @@ typedef struct { Bit#(`BeatBurstWidth) burst; // Is this the final burst of routing records for the current key? Bool finalBurst; -} InflightFetcherReqInfo deriving (Bits); +} InflightFetcherReqInfo deriving (Bits, FShow); // Routing beat, tagged with the beat number in the DRAM burst typedef struct { @@ -238,7 +238,7 @@ typedef struct { Bit#(`BeatBurstWidth) beatNum; // Inflight request info InflightFetcherReqInfo info; -} NumberedRoutingBeat deriving (Bits); +} NumberedRoutingBeat deriving (Bits, FShow); // Fetcher interface interface Fetcher; @@ -268,7 +268,8 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); registerDataOut: False, initFile: Invalid }; - BlockRam#(FetcherFlitBufferAddr, Flit) flitBuffer <- mkBlockRam; + BlockRam#(FetcherFlitBufferAddr, Flit) flitBuffer <- + mkBlockRamOpts(flitBufferOpts); // Beat buffer SizedQueue#(`FetcherLogBeatBufferSize, NumberedRoutingBeat) From e13246bf76f1e3bf13c81b5d58b6a949738a9312 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Fri, 3 Apr 2020 16:45:54 +0100 Subject: [PATCH 20/78] Testing URM2, MRM, and IND records These work. Still need to test RR records, and multi-beat records. 
--- apps/progrouter/Makefile | 2 +- apps/progrouter/progrouter.cpp | 72 ++++++++++++++++++++++++++++++++-- 2 files changed, 69 insertions(+), 5 deletions(-) diff --git a/apps/progrouter/Makefile b/apps/progrouter/Makefile index b58478f2..76c728f5 100644 --- a/apps/progrouter/Makefile +++ b/apps/progrouter/Makefile @@ -23,7 +23,7 @@ data.v: progrouter.elf --set-section-flags .bss=alloc,load,contents \ progrouter.elf data.v -progrouter.elf: progrouter.cpp link.ld $(INC)/config.h $(INC)/tinsel.h entry.o +progrouter.elf: progrouter.cpp link.ld $(INC)/config.h $(INC)/tinsel.h entry.o $(LIB)/lib.o $(RV_CPPC) $(CFLAGS) -Wall -c -o progrouter.o progrouter.cpp $(RV_LD) $(LDFLAGS) -T link.ld -o progrouter.elf entry.o progrouter.o $(LIB)/lib.o diff --git a/apps/progrouter/progrouter.cpp b/apps/progrouter/progrouter.cpp index b1740082..9c78f2f8 100644 --- a/apps/progrouter/progrouter.cpp +++ b/apps/progrouter/progrouter.cpp @@ -81,6 +81,7 @@ template struct RoutingTable { // Add a URM1 record to the table void addURM1(uint32_t mboxX, uint32_t mboxY, uint32_t mboxThread, uint32_t localKey) { + if (numChunks == 6) next(); uint8_t* ptr = beats[currentBeat].bytes + 5*(5-numChunks); ptr[0] = localKey; ptr[1] = localKey >> 8; @@ -88,7 +89,63 @@ template struct RoutingTable { ptr[3] = ((mboxThread&0x1f) << 3) | ((localKey >> 24) & 0x7); ptr[4] = (mboxY << 3) | (mboxX << 1) | (mboxThread >> 5); numChunks++; + } + + // Add a URM2 record to the table + void addURM2(uint32_t mboxX, uint32_t mboxY, uint32_t mboxThread, + uint32_t localKeyHigh, uint32_t localKeyLow) { + if (numChunks >= 5) next(); + uint8_t* ptr = beats[currentBeat].bytes + 5*(4-numChunks); + ptr[0] = localKeyLow; + ptr[1] = localKeyLow >> 8; + ptr[2] = localKeyLow >> 16; + ptr[3] = localKeyLow >> 24; + ptr[4] = localKeyHigh; + ptr[5] = localKeyHigh >> 8; + ptr[6] = localKeyHigh >> 16; + ptr[7] = localKeyHigh >> 24; + ptr[8] = (mboxThread&0x1f) << 3; + ptr[9] = (1 << 5) | (mboxY << 3) | (mboxX << 1) | (mboxThread >> 5); + 
numChunks += 2; + } + + // Add an MRM record to the table + void addMRM(uint32_t mboxX, uint32_t mboxY, + uint32_t threadsHigh, uint32_t threadsLow) { + if (numChunks >= 5) next(); + uint8_t* ptr = beats[currentBeat].bytes + 5*(4-numChunks); + ptr[0] = threadsLow; + ptr[1] = threadsLow >> 8; + ptr[2] = threadsLow >> 16; + ptr[3] = threadsLow >> 24; + ptr[4] = threadsHigh; + ptr[5] = threadsHigh >> 8; + ptr[6] = threadsHigh >> 16; + ptr[7] = threadsHigh >> 24; + ptr[9] = (3 << 5) | (mboxY << 3) | (mboxX << 1); + numChunks += 2; + } + + // Add an IND record to the table + // Return a pointer to the indirection key, + // so it can be set later by the caller + uint8_t* addIND() { if (numChunks == 6) next(); + uint8_t* ptr = beats[currentBeat].bytes + 5*(5-numChunks); + ptr[4] = 4 << 5; + numChunks++; + return ptr; + } + + // Set indirection key + void setIND(uint8_t* ind, bool upperRam, + uint8_t* beatPtr, uint32_t numBeats) { + uint32_t key = (uint32_t) beatPtr | numBeats; + if (upperRam) key |= 0x80000000; + ind[0] = key; + ind[1] = key >> 8; + ind[2] = key >> 16; + ind[3] = key >> 24; } }; @@ -110,17 +167,24 @@ int main() // On thread 0 if (me == 0) { // Add an URM1 record - uint8_t* entry = table.currentPointer(); - table.addURM1(0, 0, 10, 0xff); + uint8_t* entry1 = table.currentPointer(); + table.addURM1(0, 0, 10, 0xfff); + table.addURM2(0, 0, 60, 0xff1, 0xff0); + //table.addMRM(1, 0, 0x22222222, 0x11111111); + uint8_t* ind = table.addIND(); + table.next(); + uint8_t* entry2 = table.currentPointer(); + table.addURM1(0, 0, 20, 0x111); table.next(); + table.setIND(ind, 0, entry2, 1); // Cache flush, to write table into RAM tinselCacheFlush(); // Wait until flush done, by issuing a load - volatile uint32_t* dummyPtr = (uint32_t*) entry; dummyPtr[0]; + volatile uint32_t* dummyPtr = (uint32_t*) entry1; dummyPtr[0]; // Construct key - uint32_t key = (uint32_t) entry; + uint32_t key = (uint32_t) entry1; key = key | 1; // Entry is 1 beat long // Send message to key From 
82a2e7f02d4de4214c73788325927b54ef193701 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Mon, 6 Apr 2020 09:56:05 +0100 Subject: [PATCH 21/78] Testing multi-beat records Working fine but needed to clarify the docs: the size field in a beat is the number of records, not the number of chunks. --- apps/progrouter/progrouter.cpp | 28 ++++++++++++++++++++++++++-- rtl/ProgRouter.bsv | 2 +- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/apps/progrouter/progrouter.cpp b/apps/progrouter/progrouter.cpp index 9c78f2f8..2cf55711 100644 --- a/apps/progrouter/progrouter.cpp +++ b/apps/progrouter/progrouter.cpp @@ -57,12 +57,16 @@ template struct RoutingTable { // Number of chunks used so far in current beat uint32_t numChunks; + // Number of records used so far in current beat + uint32_t numRecords; + // Index of beat currently being filled uint32_t currentBeat; // Constructor RoutingTable() { currentBeat = 0; + numChunks = numRecords = 0; } // Pointer to current beat being filled @@ -73,8 +77,9 @@ template struct RoutingTable { // Move on to next the beat void next() { beats[currentBeat].bytes[31] = 0; - beats[currentBeat].bytes[30] = numChunks; + beats[currentBeat].bytes[30] = numRecords; numChunks = 0; + numRecords = 0; currentBeat++; } @@ -89,6 +94,7 @@ template struct RoutingTable { ptr[3] = ((mboxThread&0x1f) << 3) | ((localKey >> 24) & 0x7); ptr[4] = (mboxY << 3) | (mboxX << 1) | (mboxThread >> 5); numChunks++; + numRecords++; } // Add a URM2 record to the table @@ -107,6 +113,7 @@ template struct RoutingTable { ptr[8] = (mboxThread&0x1f) << 3; ptr[9] = (1 << 5) | (mboxY << 3) | (mboxX << 1) | (mboxThread >> 5); numChunks += 2; + numRecords++; } // Add an MRM record to the table @@ -124,6 +131,7 @@ template struct RoutingTable { ptr[7] = threadsHigh >> 24; ptr[9] = (3 << 5) | (mboxY << 3) | (mboxX << 1); numChunks += 2; + numRecords++; } // Add an IND record to the table @@ -134,6 +142,7 @@ template struct RoutingTable { uint8_t* ptr = 
beats[currentBeat].bytes + 5*(5-numChunks); ptr[4] = 4 << 5; numChunks++; + numRecords++; return ptr; } @@ -147,6 +156,19 @@ template struct RoutingTable { ind[2] = key >> 16; ind[3] = key >> 24; } + + // Add an RR record to the table + void addRR(uint32_t dir, uint32_t key) { + if (numChunks == 6) next(); + uint8_t* ptr = beats[currentBeat].bytes + 5*(5-numChunks); + ptr[0] = key; + ptr[1] = key >> 8; + ptr[2] = key >> 16; + ptr[3] = key >> 24; + ptr[4] = (2 << 5) | (dir << 3); + numChunks++; + numRecords++; + } }; // Create global routing table of 16 beats @@ -170,6 +192,8 @@ int main() uint8_t* entry1 = table.currentPointer(); table.addURM1(0, 0, 10, 0xfff); table.addURM2(0, 0, 60, 0xff1, 0xff0); + table.addURM2(0, 0, 60, 0xff3, 0xff2); + table.addURM2(0, 0, 60, 0xff5, 0xff4); //table.addMRM(1, 0, 0x22222222, 0x11111111); uint8_t* ind = table.addIND(); table.next(); @@ -185,7 +209,7 @@ int main() // Construct key uint32_t key = (uint32_t) entry1; - key = key | 1; // Entry is 1 beat long + key = key | 2; // Entry is 2 beats long // Send message to key tinselWaitUntil(TINSEL_CAN_SEND); diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index 48c29f15..f63b997c 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -28,7 +28,7 @@ import Util :: *; // 256-bit routing beat typedef struct { - // Number of 40-bit record chunks present + // Number of records present Bit#(16) size; // The 40-bit record chunks Vector#(6, Bit#(40)) chunks; From 3177e016dcb2a28259f8171dcbf77387e5cff6d1 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 7 Apr 2020 08:06:39 +0100 Subject: [PATCH 22/78] Move to 48-bit record chunks --- apps/progrouter/progrouter.cpp | 78 ++++++++++++++++++---------------- rtl/ProgRouter.bsv | 74 ++++++++++++++++++-------------- 2 files changed, 82 insertions(+), 70 deletions(-) diff --git a/apps/progrouter/progrouter.cpp b/apps/progrouter/progrouter.cpp index 2cf55711..6b764ac5 100644 --- a/apps/progrouter/progrouter.cpp +++ 
b/apps/progrouter/progrouter.cpp @@ -7,36 +7,36 @@ Byte ordering in a routing beat: 31: Upper byte of length (i.e. number of records in beat) 30: Lower byte of length - 29: Upper byte of first chunk + 29: Upper byte of first chunk 28: 27: 26: - 25: Lower byte of first chunk - 24: Upper byte of second chunk - 23: + 25: + 24: Lower byte of first chunk + 23: Upper byte of second chunk 22: 21: - 20: Lower byte of second chunk - 19: Upper byte of third chunk - 18: - 17: + 20: + 19: + 18: Lower byte of second chunk + 17: Upper byte of third chunk 16: - 15: Lower byte of third chunk - 14: Upper byte of fourth chunk + 15: + 14: 13: - 12: - 11: - 10: Lower byte of fourth chunk - 9: Upper byte of fifth chunk + 12: Lower byte of third chunk + 11: Upper byte of fourth chunk + 10: + 9: 8: 7: - 6: - 5: Lower byte of fifth chunk - 4: Upper byte of sixth chunk + 6: Lower byte of fourth chunk + 5: Upper byte of fifth chunk + 4: 3: 2: 1: - 0: Lower byte of sixth chunk + 0: Lower byte of fifth chunk Need to fold this into the docs eventually. 
*/ @@ -86,13 +86,14 @@ template struct RoutingTable { // Add a URM1 record to the table void addURM1(uint32_t mboxX, uint32_t mboxY, uint32_t mboxThread, uint32_t localKey) { - if (numChunks == 6) next(); - uint8_t* ptr = beats[currentBeat].bytes + 5*(5-numChunks); + if (numChunks == 5) next(); + uint8_t* ptr = beats[currentBeat].bytes + 6*(4-numChunks); ptr[0] = localKey; ptr[1] = localKey >> 8; ptr[2] = localKey >> 16; - ptr[3] = ((mboxThread&0x1f) << 3) | ((localKey >> 24) & 0x7); - ptr[4] = (mboxY << 3) | (mboxX << 1) | (mboxThread >> 5); + ptr[3] = localKey >> 24; + ptr[4] = (mboxThread&0x1f) << 3; + ptr[5] = (mboxY << 3) | (mboxX << 1) | (mboxThread >> 5); numChunks++; numRecords++; } @@ -100,8 +101,8 @@ template struct RoutingTable { // Add a URM2 record to the table void addURM2(uint32_t mboxX, uint32_t mboxY, uint32_t mboxThread, uint32_t localKeyHigh, uint32_t localKeyLow) { - if (numChunks >= 5) next(); - uint8_t* ptr = beats[currentBeat].bytes + 5*(4-numChunks); + if (numChunks >= 4) next(); + uint8_t* ptr = beats[currentBeat].bytes + 6*(3-numChunks); ptr[0] = localKeyLow; ptr[1] = localKeyLow >> 8; ptr[2] = localKeyLow >> 16; @@ -110,17 +111,18 @@ template struct RoutingTable { ptr[5] = localKeyHigh >> 8; ptr[6] = localKeyHigh >> 16; ptr[7] = localKeyHigh >> 24; - ptr[8] = (mboxThread&0x1f) << 3; - ptr[9] = (1 << 5) | (mboxY << 3) | (mboxX << 1) | (mboxThread >> 5); + ptr[10] = (mboxThread&0x1f) << 3; + ptr[11] = (1 << 5) | (mboxY << 3) | (mboxX << 1) | (mboxThread >> 5); numChunks += 2; numRecords++; } // Add an MRM record to the table void addMRM(uint32_t mboxX, uint32_t mboxY, - uint32_t threadsHigh, uint32_t threadsLow) { - if (numChunks >= 5) next(); - uint8_t* ptr = beats[currentBeat].bytes + 5*(4-numChunks); + uint32_t threadsHigh, uint32_t threadsLow, + uint16_t localKey) { + if (numChunks >= 4) next(); + uint8_t* ptr = beats[currentBeat].bytes + 6*(3-numChunks); ptr[0] = threadsLow; ptr[1] = threadsLow >> 8; ptr[2] = threadsLow >> 16; @@ 
-129,7 +131,9 @@ template struct RoutingTable { ptr[5] = threadsHigh >> 8; ptr[6] = threadsHigh >> 16; ptr[7] = threadsHigh >> 24; - ptr[9] = (3 << 5) | (mboxY << 3) | (mboxX << 1); + ptr[8] = localKey; + ptr[9] = localKey >> 8; + ptr[11] = (3 << 5) | (mboxY << 3) | (mboxX << 1); numChunks += 2; numRecords++; } @@ -138,9 +142,9 @@ template struct RoutingTable { // Return a pointer to the indirection key, // so it can be set later by the caller uint8_t* addIND() { - if (numChunks == 6) next(); - uint8_t* ptr = beats[currentBeat].bytes + 5*(5-numChunks); - ptr[4] = 4 << 5; + if (numChunks == 5) next(); + uint8_t* ptr = beats[currentBeat].bytes + 6*(4-numChunks); + ptr[5] = 4 << 5; numChunks++; numRecords++; return ptr; @@ -159,13 +163,13 @@ template struct RoutingTable { // Add an RR record to the table void addRR(uint32_t dir, uint32_t key) { - if (numChunks == 6) next(); - uint8_t* ptr = beats[currentBeat].bytes + 5*(5-numChunks); + if (numChunks == 5) next(); + uint8_t* ptr = beats[currentBeat].bytes + 6*(4-numChunks); ptr[0] = key; ptr[1] = key >> 8; ptr[2] = key >> 16; ptr[3] = key >> 24; - ptr[4] = (2 << 5) | (dir << 3); + ptr[5] = (2 << 5) | (dir << 3); numChunks++; numRecords++; } @@ -194,7 +198,7 @@ int main() table.addURM2(0, 0, 60, 0xff1, 0xff0); table.addURM2(0, 0, 60, 0xff3, 0xff2); table.addURM2(0, 0, 60, 0xff5, 0xff4); - //table.addMRM(1, 0, 0x22222222, 0x11111111); + //table.addMRM(1, 0, 0x22222222, 0x11111111, 0x2222); uint8_t* ind = table.addIND(); table.next(); uint8_t* entry2 = table.currentPointer(); diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index f63b997c..f6712ba1 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -16,11 +16,11 @@ import Util :: *; // Routing keys and beats // ============================================================================= -// A routing record is either 40 bits or 80 bits in size (aligned on a -// 40-bit or 80-bit boundary respectively). 
Multiple records are +// A routing record is either 48 bits or 96 bits in size (aligned on a +// 48-bit or 96-bit boundary respectively). Multiple records are // packed into a 256-bit DRAM beat (aligned on a 256-bit boundary). // The most significant 16 bits of the beat contain a count of the -// number of records in the beat (in the range 1 to 6 inclusive). The +// number of records in the beat (in the range 1 to 5 inclusive). The // remaining 240 bits contain records. The first record lies in the // least-significant bits of the beat. The size portion of the routing // key contains the number of contiguous DRAM beats holding all @@ -30,8 +30,8 @@ import Util :: *; typedef struct { // Number of records present Bit#(16) size; - // The 40-bit record chunks - Vector#(6, Bit#(40)) chunks; + // The 48-bit record chunks + Vector#(5, Bit#(48)) chunks; } RoutingBeat deriving (Bits, FShow); // 32-bit routing key @@ -53,11 +53,11 @@ function RoutingKey getRoutingKey(NetAddr addr) = // ============================================================================= typedef enum { - URM1 = 3'd0, // 40-bit Unicast Router-to-Mailbox - URM2 = 3'd1, // 80-bit Unicast Router-to-Mailbox - RR = 3'd2, // 40-bit Router-to-Router - MRM = 3'd3, // 80-bit Multicast Router-to-Mailbox - IND = 3'd4 // 40-bit Indirection + URM1 = 3'd0, // 48-bit Unicast Router-to-Mailbox + URM2 = 3'd1, // 96-bit Unicast Router-to-Mailbox + RR = 3'd2, // 48-bit Router-to-Router + MRM = 3'd3, // 96-bit Multicast Router-to-Mailbox + IND = 3'd4 // 48-bit Indirection } RoutingRecordTag deriving (Bits, Eq, FShow); typedef enum { @@ -67,7 +67,7 @@ typedef enum { WEST = 2'd3 } RoutingDir deriving (Bits, Eq); -// 40-bit Unicast Router-to-Mailbox (URM1) record +// 48-bit Unicast Router-to-Mailbox (URM1) record typedef struct { // Record type RoutingRecordTag tag; @@ -75,12 +75,14 @@ typedef struct { Bit#(4) mbox; // Mailbox-local thread identifier Bit#(6) thread; + // Unused + Bit#(3) unused; // Local key. 
The first word of the message // payload is overwritten with this. - Bit#(27) localKey; + Bit#(32) localKey; } URM1Record deriving (Bits, FShow); -// 80-bit Unicast Router-to-Mailbox (URM2) record +// 96-bit Unicast Router-to-Mailbox (URM2) record typedef struct { // Record type RoutingRecordTag tag; @@ -89,26 +91,26 @@ typedef struct { // Mailbox-local thread identifier Bit#(6) thread; // Currently unused - Bit#(3) unused; + Bit#(19) unused; // Local key. The first two words of the message // payload is overwritten with this. Bit#(64) localKey; } URM2Record deriving (Bits); -// 40-bit Router-to-Router (RR) record +// 48-bit Router-to-Router (RR) record typedef struct { // Record type RoutingRecordTag tag; // Direction (N, S, E, or W) RoutingDir dir; // Currently unused - Bit#(3) unused; + Bit#(11) unused; // New 32-bit routing key that will replace the one in the // current message for the next hop of the message's journey Bit#(32) newKey; } RRRecord deriving (Bits); -// 80-bit Multicast Router-to-Mailbox (MRM) record +// 96-bit Multicast Router-to-Mailbox (MRM) record typedef struct { // Record type RoutingRecordTag tag; @@ -116,16 +118,19 @@ typedef struct { Bit#(4) mbox; // Currently unused Bit#(9) unused; + // Local key. 
The least-significant half-word + // of the message is replaced with this + Bit#(16) localKey; // Mailbox-local destination mask Bit#(64) destMask; } MRMRecord deriving (Bits); -// 40-bit Indirection (IND) record: +// 48-bit Indirection (IND) record: typedef struct { // Record type RoutingRecordTag tag; // Currently unused - Bit#(5) unused; + Bit#(13) unused; // New 32-bit routing key for new set of records on current router Bit#(32) newKey; } INDRecord deriving (Bits); @@ -459,16 +464,16 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); let beatNum = beatReg.beatNum; let info = beatReg.info; // Extract tag from next record - RoutingRecordTag tag = unpack(truncateLSB(beat.chunks[5])); + RoutingRecordTag tag = unpack(truncateLSB(beat.chunks[4])); // Is this the first flit of a message? Bool firstFlit = emitFlitCount == 0; // Modify flit by interpreting routing key RoutingDecision decision = ?; Flit flit = flitBuffer.dataOut; case (tag) - // 40-bit Unicast Router-to-Mailbox + // 48-bit Unicast Router-to-Mailbox URM1: begin - URM1Record rec = unpack(beat.chunks[5]); + URM1Record rec = unpack(beat.chunks[4]); flit.dest.addr.isKey = False; flit.dest.addr.mbox.x = unpack(truncate(rec.mbox[1:0])); flit.dest.addr.mbox.y = unpack(truncate(rec.mbox[3:2])); @@ -478,12 +483,12 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); flit.dest.threads = pack(threadMask); // Replace first word of message with local key if (firstFlit) - flit.payload = {truncateLSB(flit.payload), 5'b0, rec.localKey}; + flit.payload = {truncateLSB(flit.payload), rec.localKey}; decision = RouteNoC; end - // 80-bit Unicast Router-to-Mailbox + // 96-bit Unicast Router-to-Mailbox URM2: begin - URM2Record rec = unpack({beat.chunks[5], beat.chunks[4]}); + URM2Record rec = unpack({beat.chunks[4], beat.chunks[3]}); flit.dest.addr.isKey = False; flit.dest.addr.mbox.x = unpack(truncate(rec.mbox[1:0])); flit.dest.addr.mbox.y = unpack(truncate(rec.mbox[3:2])); @@ -496,9 +501,9 @@ 
module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); flit.payload = {truncateLSB(flit.payload), rec.localKey}; decision = RouteNoC; end - // 40-bit Router-to-Router + // 48-bit Router-to-Router RR: begin - RRRecord rec = unpack(beat.chunks[5]); + RRRecord rec = unpack(beat.chunks[4]); case (rec.dir) NORTH: begin decision = RouteNorth; @@ -519,18 +524,21 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); endcase flit.dest.threads = {?, rec.newKey}; end - // 80-bit Multicast Router-to-Mailbox + // 96-bit Multicast Router-to-Mailbox MRM: begin - MRMRecord rec = unpack({beat.chunks[5], beat.chunks[4]}); + MRMRecord rec = unpack({beat.chunks[4], beat.chunks[3]}); flit.dest.addr.isKey = False; flit.dest.addr.mbox.x = unpack(truncate(rec.mbox[1:0])); flit.dest.addr.mbox.y = unpack(truncate(rec.mbox[3:2])); flit.dest.threads = rec.destMask; + // Replace first half-word of message with local key + if (firstFlit) + flit.payload = {truncateLSB(flit.payload), rec.localKey}; decision = RouteNoC; end - // 40-bit Indirection + // 48-bit Indirection IND: begin - INDRecord rec = unpack(beat.chunks[5]); + INDRecord rec = unpack(beat.chunks[4]); flit.dest.threads = {?, rec.newKey}; decision = RouteLoop; end @@ -547,12 +555,12 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); RoutingBeat newBeat = beat; Bool doubleChunk = unpack(pack(tag)[0]); if (doubleChunk) begin - for (Integer i = 5; i > 2; i=i-2) begin + for (Integer i = 4; i > 2; i=i-2) begin newBeat.chunks[i] = beat.chunks[i-2]; newBeat.chunks[i-1] = beat.chunks[i-3]; end end else begin - for (Integer i = 5; i > 0; i=i-1) + for (Integer i = 4; i > 0; i=i-1) newBeat.chunks[i] = beat.chunks[i-1]; end beatReg <= NumberedRoutingBeat { From 4928d5ab05865b9f89a8cd9b1cff51235c612f0c Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 7 Apr 2020 08:37:40 +0100 Subject: [PATCH 23/78] Forward port recent changes to POLite app suite (From 0.6.3) --- apps/POLite/asp-pc/Makefile | 9 +- 
apps/POLite/asp-pc/asp-push.cpp | 180 ++++++++++++++++++ apps/POLite/asp-sync/Run.cpp | 3 +- apps/POLite/heat-gals/Heat.h | 10 +- apps/POLite/heat-gals/Makefile | 4 +- apps/POLite/heat-gals/Run.cpp | 109 +++++------ .../{heat-gals => heat-grid-sync}/Colours.cpp | 0 .../{heat-gals => heat-grid-sync}/Colours.h | 0 apps/POLite/heat-grid-sync/Heat.cpp | 23 +++ apps/POLite/heat-grid-sync/Heat.h | 71 +++++++ apps/POLite/heat-grid-sync/Makefile | 7 + apps/POLite/heat-grid-sync/Run.cpp | 119 ++++++++++++ apps/POLite/heat-pc/.asp.cpp.swp | Bin 0 -> 16384 bytes apps/POLite/heat-pc/Makefile | 11 ++ apps/POLite/heat-pc/heat.cpp | 63 ++++++ apps/POLite/heat-sync/Colours.cpp | 71 ------- apps/POLite/heat-sync/Colours.h | 10 - apps/POLite/heat-sync/Heat.h | 21 +- apps/POLite/heat-sync/Makefile | 4 +- apps/POLite/heat-sync/Run.cpp | 111 +++++------ apps/POLite/izhikevich-gals/Izhikevich.cpp | 23 +++ apps/POLite/izhikevich-gals/Izhikevich.h | 115 +++++++++++ apps/POLite/izhikevich-gals/Makefile | 6 + apps/POLite/izhikevich-gals/RNG.h | 23 +++ apps/POLite/izhikevich-gals/Run.cpp | 130 +++++++++++++ apps/POLite/izhikevich-pc/Izhikevich.cpp | 139 ++++++++++++++ apps/POLite/izhikevich-pc/Makefile | 6 + apps/POLite/izhikevich-pc/RNG.h | 27 +++ apps/POLite/izhikevich-sync/Izhikevich.cpp | 23 +++ apps/POLite/izhikevich-sync/Izhikevich.h | 72 +++++++ apps/POLite/izhikevich-sync/Makefile | 6 + apps/POLite/izhikevich-sync/RNG.h | 23 +++ apps/POLite/izhikevich-sync/Run.cpp | 117 ++++++++++++ apps/POLite/pagerank-sync/Run.cpp | 1 + apps/POLite/sssp-async/Run.cpp | 1 + apps/POLite/sssp-pc/.asp.cpp.swp | Bin 0 -> 16384 bytes apps/POLite/sssp-pc/Makefile | 11 ++ apps/POLite/sssp-pc/sssp.cpp | 92 +++++++++ apps/POLite/sssp-sync/Run.cpp | 1 + apps/POLite/util/sumstats.awk | 2 +- 40 files changed, 1412 insertions(+), 232 deletions(-) create mode 100644 apps/POLite/asp-pc/asp-push.cpp rename apps/POLite/{heat-gals => heat-grid-sync}/Colours.cpp (100%) rename apps/POLite/{heat-gals => 
heat-grid-sync}/Colours.h (100%) create mode 100644 apps/POLite/heat-grid-sync/Heat.cpp create mode 100644 apps/POLite/heat-grid-sync/Heat.h create mode 100644 apps/POLite/heat-grid-sync/Makefile create mode 100644 apps/POLite/heat-grid-sync/Run.cpp create mode 100644 apps/POLite/heat-pc/.asp.cpp.swp create mode 100644 apps/POLite/heat-pc/Makefile create mode 100644 apps/POLite/heat-pc/heat.cpp delete mode 100644 apps/POLite/heat-sync/Colours.cpp delete mode 100644 apps/POLite/heat-sync/Colours.h create mode 100644 apps/POLite/izhikevich-gals/Izhikevich.cpp create mode 100644 apps/POLite/izhikevich-gals/Izhikevich.h create mode 100644 apps/POLite/izhikevich-gals/Makefile create mode 100644 apps/POLite/izhikevich-gals/RNG.h create mode 100644 apps/POLite/izhikevich-gals/Run.cpp create mode 100644 apps/POLite/izhikevich-pc/Izhikevich.cpp create mode 100644 apps/POLite/izhikevich-pc/Makefile create mode 100644 apps/POLite/izhikevich-pc/RNG.h create mode 100644 apps/POLite/izhikevich-sync/Izhikevich.cpp create mode 100644 apps/POLite/izhikevich-sync/Izhikevich.h create mode 100644 apps/POLite/izhikevich-sync/Makefile create mode 100644 apps/POLite/izhikevich-sync/RNG.h create mode 100644 apps/POLite/izhikevich-sync/Run.cpp create mode 100644 apps/POLite/sssp-pc/.asp.cpp.swp create mode 100644 apps/POLite/sssp-pc/Makefile create mode 100644 apps/POLite/sssp-pc/sssp.cpp diff --git a/apps/POLite/asp-pc/Makefile b/apps/POLite/asp-pc/Makefile index 0cf7448f..bf9439f3 100644 --- a/apps/POLite/asp-pc/Makefile +++ b/apps/POLite/asp-pc/Makefile @@ -1,10 +1,10 @@ # SPDX-License-Identifier: BSD-2-Clause -all: asp GenHypercube GenTree GenGeoGraph +all: asp GenHypercube GenTree INC=../../../../include asp: asp.cpp - g++ -fopenmp -D_DEFAULT_SOURCE -I$(INC) -O3 asp.cpp -o asp + g++ -I$(INC) -O3 asp.cpp -o asp GenHypercube: GenHypercube.hs ghc -O2 --make GenHypercube.hs @@ -12,8 +12,5 @@ GenHypercube: GenHypercube.hs GenTree: GenTree.hs ghc -O2 --make GenTree.hs -GenGeoGraph: 
GenGeoGraph.cpp - g++ -O2 -lstdc++ GenGeoGraph.cpp -o GenGeoGraph - clean: - rm -f asp GenHypercube GenTree GenGeoGraph *.hi *.o + rm -f asp GenHypercube GenTree *.hi *.o diff --git a/apps/POLite/asp-pc/asp-push.cpp b/apps/POLite/asp-pc/asp-push.cpp new file mode 100644 index 00000000..a75f6628 --- /dev/null +++ b/apps/POLite/asp-pc/asp-push.cpp @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: BSD-2-Clause +#include "RandomSet.h" + +#include +#include +#include +#include +#include +#include + +// Number of nodes and edges +uint32_t numNodes; +uint32_t numEdges; + +// Mapping from node id to array of neighbouring node ids +// First element of each array holds the number of neighbours +uint32_t** neighbours; + +// Mapping from node id to bit vector of reaching nodes +uint64_t** reaching; +uint64_t** reachingNext; + +// Number of 64-bit words in reaching vector +const uint64_t vectorSize = 1; + +void readGraph(const char* filename, bool undirected) +{ + // Read edges + FILE* fp = fopen(filename, "rt"); + if (fp == NULL) { + fprintf(stderr, "Can't open '%s'\n", filename); + exit(EXIT_FAILURE); + } + + // Note: we use a "pull" algorithm (rather than "push") to + // avoid parallel writes to the same address, hence we reverse + // the direction of the edges here. + + // Count number of nodes and edges + numEdges = 0; + numNodes = 0; + int ret; + while (1) { + uint32_t src, dst; + ret = fscanf(fp, "%d %d", &dst, &src); + if (ret == EOF) break; + numEdges++; + numNodes = src >= numNodes ? src+1 : numNodes; + numNodes = dst >= numNodes ? 
dst+1 : numNodes; + } + rewind(fp); + + // Create mapping from node id to number of neighbours + uint32_t* count = (uint32_t*) calloc(numNodes, sizeof(uint32_t)); + for (int i = 0; i < numEdges; i++) { + uint32_t src, dst; + ret = fscanf(fp, "%d %d", &dst, &src); + count[src]++; + if (undirected) count[dst]++; + } + + // Create mapping from node id to neighbours + neighbours = (uint32_t**) calloc(numNodes, sizeof(uint32_t*)); + rewind(fp); + for (int i = 0; i < numNodes; i++) { + neighbours[i] = (uint32_t*) calloc(count[i]+1, sizeof(uint32_t)); + neighbours[i][0] = count[i]; + } + for (int i = 0; i < numEdges; i++) { + uint32_t src, dst; + ret = fscanf(fp, "%d %d", &dst, &src); + neighbours[src][count[src]--] = dst; + if (undirected) neighbours[dst][count[dst]--] = src; + } + + // Create mapping from node id to bit vector of reaching nodes + reaching = (uint64_t**) calloc(numNodes, sizeof(uint64_t*)); + reachingNext = (uint64_t**) calloc(numNodes, sizeof(uint64_t*)); + for (int i = 0; i < numNodes; i++) { + reaching[i] = (uint64_t*) calloc(vectorSize, sizeof(uint64_t)); + reachingNext[i] = (uint64_t*) calloc(vectorSize, sizeof(uint64_t)); + } + + // Release + free(count); + fclose(fp); +} + +// Compute sum of all shortest paths from given sources +uint64_t ssp(uint32_t numSources, uint32_t* sources) +{ + // Sum of distances + uint64_t sum = 0; + + // Initialise reaching vector for each node + for (int i = 0; i < numNodes; i++) { + for (int j = 0; j < vectorSize; j++) { + reaching[i][j] = 0; + reachingNext[i][j] = 0; + } + } + for (int i = 0; i < numSources; i++) { + uint32_t src = sources[i]; + reaching[src][i/64] |= 1ul << (i%64); + } + + int* queue = new int [numNodes]; + int queueSize = 0; + for (int i = 0; i < numNodes; i++) queue[queueSize++] = i; + + // Distance increases on each iteration + uint32_t dist = 1; + + while (queueSize > 0) { + // For each node + for (int i = 0; i < queueSize; i++) { + int me = queue[i]; + // For each neighbour + uint32_t 
numNeighbours = neighbours[me][0]; + for (int j = 1; j <= numNeighbours; j++) { + uint32_t n = neighbours[me][j]; + // For each chunk + for (int k = 0; k < vectorSize; k++) { + if (reaching[me][k] & ~reachingNext[n][k]) + reachingNext[n][k] |= reaching[me][k]; + } + } + } + + // For each node, update reaching vector + queueSize = 0; + for (int i = 0; i < numNodes; i++) { + for (int k = 0; k < vectorSize; k++) { + uint64_t diff = reachingNext[i][k] & ~reaching[i][k]; + if (diff) { + queue[queueSize++] = i; + uint32_t n = __builtin_popcountll(diff); + sum += n * dist; + reaching[i][k] |= reachingNext[i][k]; + } + } + } + dist++; + } + + return sum; +} + +int main(int argc, char**argv) +{ + if (argc != 2) { + printf("Specify edges file\n"); + exit(EXIT_FAILURE); + } + bool undirected = false; + readGraph(argv[1], undirected); + printf("Nodes: %u. Edges: %u\n", numNodes, numEdges); + + uint32_t numSources = 64*vectorSize; + assert(numSources < numNodes); + uint32_t sources[numSources]; + for (int i = 0; i < numSources; i++) sources[i] = i; + //randomSet(numSources, sources, numNodes); + + struct timeval start, finish, diff; + + uint64_t sum = 0; + const int nodesPerVector = 64 * vectorSize; + gettimeofday(&start, NULL); + sum = ssp(numSources, sources); + gettimeofday(&finish, NULL); + + printf("Sum of subset of shortest paths = %lu\n", sum); + + timersub(&finish, &start, &diff); + double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + printf("Time = %lf\n", duration); + + return 0; +} diff --git a/apps/POLite/asp-sync/Run.cpp b/apps/POLite/asp-sync/Run.cpp index 25082646..3264d114 100644 --- a/apps/POLite/asp-sync/Run.cpp +++ b/apps/POLite/asp-sync/Run.cpp @@ -19,9 +19,10 @@ int main(int argc, char**argv) // Read network EdgeList net; net.read(argv[1]); - + // Print max fan-out printf("Max fan-out = %d\n", net.maxFanOut()); + assert(net.minFanOut() > 0); // Check that parameters make sense assert(32*N <= net.numNodes); diff --git 
a/apps/POLite/heat-gals/Heat.h b/apps/POLite/heat-gals/Heat.h index 12ca9574..600b4d00 100644 --- a/apps/POLite/heat-gals/Heat.h +++ b/apps/POLite/heat-gals/Heat.h @@ -2,6 +2,8 @@ #ifndef _HEAT_H_ #define _HEAT_H_ +#define POLITE_DUMP_STATS +#define POLITE_COUNT_MSGS #include struct HeatMessage { @@ -10,7 +12,7 @@ struct HeatMessage { // Time step uint32_t time; // Temperature at sender - uint32_t val; + float val; }; struct HeatState { @@ -21,9 +23,9 @@ struct HeatState { // Current time step of device uint32_t time; // Current temperature of device - uint32_t val; + float val; // Accumulator for temperatures received at times t and t+1 - uint32_t acc, accNext; + float acc, accNext; // Count messages sent and received uint8_t sent, received, receivedNext; // Is the temperature of this device constant? @@ -45,7 +47,7 @@ struct HeatDevice : PDevice { // Proceed to next time step? if (s->sent && s->received == s->fanIn) { s->time--; - if (!s->isConstant) s->val = s->acc >> 2; + if (!s->isConstant) s->val = s->acc / (float) s->fanIn; s->acc = s->accNext; s->received = s->receivedNext; s->accNext = s->receivedNext = 0; diff --git a/apps/POLite/heat-gals/Makefile b/apps/POLite/heat-gals/Makefile index 0c343edd..86430b66 100644 --- a/apps/POLite/heat-gals/Makefile +++ b/apps/POLite/heat-gals/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: BSD-2-Clause APP_CPP = Heat.cpp APP_HDR = Heat.h -RUN_CPP = Run.cpp Colours.cpp -RUN_H = Colours.h +RUN_CPP = Run.cpp +RUN_H = include ../util/polite.mk diff --git a/apps/POLite/heat-gals/Run.cpp b/apps/POLite/heat-gals/Run.cpp index 0a08505b..eacf449f 100644 --- a/apps/POLite/heat-gals/Run.cpp +++ b/apps/POLite/heat-gals/Run.cpp @@ -1,17 +1,31 @@ // SPDX-License-Identifier: BSD-2-Clause #include "Heat.h" -#include "Colours.h" #include #include +#include #include -int main() +int main(int argc, char **argv) { // Parameters - const uint32_t width = 256; - const uint32_t height = 256; - const uint32_t time = 1000; + const uint32_t 
time = 1000; + + // Read in the example edge list and create data structure + if (argc != 2) { + printf("Specify edge file\n"); + exit(EXIT_FAILURE); + } + + // Load in the edge list file + printf("Loading in the graph..."); fflush(stdout); + EdgeList net; + net.read(argv[1]); + printf(" done\n"); + + // Print max fan-out + printf("Min fan-out = %d\n", net.minFanOut()); + printf("Max fan-out = %d\n", net.maxFanOut()); // Connection to tinsel machine HostLink hostLink; @@ -19,58 +33,32 @@ int main() // Create POETS graph PGraph graph; - // Create 2D mesh of devices - PDeviceId **mesh = new PDeviceId* [height]; - for (uint32_t y = 0; y < height; y++) { - mesh[y] = new PDeviceId [width]; - for (uint32_t x = 0; x < width; x++) - mesh[y][x] = graph.newDevice(); + // Create nodes in POETS graph + for (uint32_t i = 0; i < net.numNodes; i++) { + PDeviceId id = graph.newDevice(); + assert(i == id); } - // Add edges - for (uint32_t y = 0; y < height; y++) - for (uint32_t x = 0; x < width; x++) { - if (x < width-1) { - graph.addEdge(mesh[y][x], 0, mesh[y][x+1]); - graph.addEdge(mesh[y][x+1], 0, mesh[y][x]); - } - if (y < height-1) { - graph.addEdge(mesh[y][x], 0, mesh[y+1][x]); - graph.addEdge(mesh[y+1][x], 0, mesh[y][x]); - } - } + // Create connections in POETS graph + for (uint32_t i = 0; i < net.numNodes; i++) { + uint32_t numNeighbours = net.neighbours[i][0]; + for (uint32_t j = 0; j < numNeighbours; j++) + graph.addEdge(i, 0, net.neighbours[i][j+1]); + } // Prepare mapping from graph to hardware graph.map(); - // Set device ids - for (uint32_t y = 0; y < height; y++) - for (uint32_t x = 0; x < width; x++) - graph.devices[mesh[y][x]]->state.id = mesh[y][x]; - - // Initialise time and fanIn fields + // Specify number of time steps to run on each device + srand(1); for (PDeviceId i = 0; i < graph.numDevices; i++) { + int r = rand() % 255; + graph.devices[i]->state.id = i; graph.devices[i]->state.time = time; + graph.devices[i]->state.val = (float) r; + 
graph.devices[i]->state.isConstant = false; graph.devices[i]->state.fanIn = graph.fanIn(i); } - - // Apply constant heat at north edge - // Apply constant cool at south edge - for (uint32_t x = 0; x < width; x++) { - graph.devices[mesh[0][x]]->state.val = 255 << 16; - graph.devices[mesh[0][x]]->state.isConstant = true; - graph.devices[mesh[height-1][x]]->state.val = 40 << 16; - graph.devices[mesh[height-1][x]]->state.isConstant = true; - } - - // Apply constant heat at west edge - // Apply constant cool at east edge - for (uint32_t y = 0; y < height; y++) { - graph.devices[mesh[y][0]]->state.val = 255 << 16; - graph.devices[mesh[y][0]]->state.isConstant = true; - graph.devices[mesh[y][width-1]]->state.val = 40 << 16; - graph.devices[mesh[y][width-1]]->state.isConstant = true; - } // Write graph down to tinsel machine via HostLink graph.write(&hostLink); @@ -84,8 +72,11 @@ int main() struct timeval start, finish, diff; gettimeofday(&start, NULL); + // Consume performance stats + politeSaveStats(&hostLink, "stats.txt"); + // Allocate array to contain final value of each device - uint32_t* pixels = new uint32_t [graph.numDevices]; + float* pixels = new float [graph.numDevices]; // Receive final value of each device for (uint32_t i = 0; i < graph.numDevices; i++) { @@ -97,25 +88,17 @@ int main() pixels[msg.payload.from] = msg.payload.val; } + // Display final values of first ten devices + for (uint32_t i = 0; i < 10; i++) { + if (i < graph.numDevices) { + printf("%d: %f\n", i, pixels[i]); + } + } + // Display time timersub(&finish, &start, &diff); double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; printf("Time = %lf\n", duration); - // Emit image - FILE* fp = fopen("out.ppm", "wt"); - if (fp == NULL) { - printf("Can't open output file for writing\n"); - return -1; - } - fprintf(fp, "P3\n%d %d\n255\n", width, height); - for (uint32_t y = 0; y < height; y++) - for (uint32_t x = 0; x < width; x++) { - uint32_t val = (pixels[mesh[y][x]] >> 16) & 
0xff; - fprintf(fp, "%d %d %d\n", - colours[val*3], colours[val*3+1], colours[val*3+2]); - } - fclose(fp); - return 0; } diff --git a/apps/POLite/heat-gals/Colours.cpp b/apps/POLite/heat-grid-sync/Colours.cpp similarity index 100% rename from apps/POLite/heat-gals/Colours.cpp rename to apps/POLite/heat-grid-sync/Colours.cpp diff --git a/apps/POLite/heat-gals/Colours.h b/apps/POLite/heat-grid-sync/Colours.h similarity index 100% rename from apps/POLite/heat-gals/Colours.h rename to apps/POLite/heat-grid-sync/Colours.h diff --git a/apps/POLite/heat-grid-sync/Heat.cpp b/apps/POLite/heat-grid-sync/Heat.cpp new file mode 100644 index 00000000..b2b4fc3e --- /dev/null +++ b/apps/POLite/heat-grid-sync/Heat.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: BSD-2-Clause +#include "Heat.h" + +#include +#include + +typedef PThread< + HeatDevice, + HeatState, // State + None, // Edge label + HeatMessage // Message + > HeatThread; + +int main() +{ + // Point thread structure at base of thread's heap + HeatThread* thread = (HeatThread*) tinselHeapBaseSRAM(); + + // Invoke interpreter + thread->run(); + + return 0; +} diff --git a/apps/POLite/heat-grid-sync/Heat.h b/apps/POLite/heat-grid-sync/Heat.h new file mode 100644 index 00000000..b3a63a93 --- /dev/null +++ b/apps/POLite/heat-grid-sync/Heat.h @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef _HEAT_H_ +#define _HEAT_H_ + +#include + +struct HeatMessage { + // Sender id + uint32_t from; + // Time step + uint32_t time; + // Temperature at sender + uint32_t val; +}; + +struct HeatState { + // Device id + uint32_t id; + // Current time step of device + uint32_t time; + // Current temperature of device + uint32_t val, acc; + // Is the temperature of this device constant? 
+ bool isConstant; +}; + +struct HeatDevice : PDevice { + + // Called once by POLite at start of execution + inline void init() { + *readyToSend = Pin(0); + } + + // Send handler + inline void send(volatile HeatMessage* msg) { + msg->from = s->id; + msg->time = s->time; + msg->val = s->val; + *readyToSend = No; + } + + // Receive handler + inline void recv(HeatMessage* msg, None* edge) { + s->acc += msg->val; + } + + // Called by POLite when system becomes idle + inline bool step() { + // Execution complete? + if (s->time == 0) { + *readyToSend = No; + return false; + } + else { + s->time--; + if (!s->isConstant) s->val = s->acc >> 2; + s->acc = 0; + *readyToSend = Pin(0); + return true; + } + } + + // Optionally send message to host on termination + inline bool finish(volatile HeatMessage* msg) { + msg->from = s->id; + msg->val = s->val; + return true; + } +}; + +#endif diff --git a/apps/POLite/heat-grid-sync/Makefile b/apps/POLite/heat-grid-sync/Makefile new file mode 100644 index 00000000..0c343edd --- /dev/null +++ b/apps/POLite/heat-grid-sync/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: BSD-2-Clause +APP_CPP = Heat.cpp +APP_HDR = Heat.h +RUN_CPP = Run.cpp Colours.cpp +RUN_H = Colours.h + +include ../util/polite.mk diff --git a/apps/POLite/heat-grid-sync/Run.cpp b/apps/POLite/heat-grid-sync/Run.cpp new file mode 100644 index 00000000..a938a446 --- /dev/null +++ b/apps/POLite/heat-grid-sync/Run.cpp @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: BSD-2-Clause +#include "Heat.h" +#include "Colours.h" + +#include +#include +#include + +int main() +{ + // Parameters + const uint32_t width = 256; + const uint32_t height = 256; + const uint32_t time = 1000; + + // Connection to tinsel machine + HostLink hostLink; + + // Create POETS graph + PGraph graph; + + // Create 2D mesh of devices + PDeviceId **mesh = new PDeviceId* [height]; + for (uint32_t y = 0; y < height; y++) { + mesh[y] = new PDeviceId [width]; + for (uint32_t x = 0; x < width; x++) + mesh[y][x] = 
graph.newDevice(); + } + + // Add edges + for (uint32_t y = 0; y < height; y++) + for (uint32_t x = 0; x < width; x++) { + if (x < width-1) { + graph.addEdge(mesh[y][x], 0, mesh[y][x+1]); + graph.addEdge(mesh[y][x+1], 0, mesh[y][x]); + } + if (y < height-1) { + graph.addEdge(mesh[y][x], 0, mesh[y+1][x]); + graph.addEdge(mesh[y+1][x], 0, mesh[y][x]); + } + } + + // Prepare mapping from graph to hardware + graph.map(); + + // Set device ids + for (uint32_t y = 0; y < height; y++) + for (uint32_t x = 0; x < width; x++) + graph.devices[mesh[y][x]]->state.id = mesh[y][x]; + + // Specify number of time steps to run on each device + for (PDeviceId i = 0; i < graph.numDevices; i++) + graph.devices[i]->state.time = time; + + // Apply constant heat at north edge + // Apply constant cool at south edge + for (uint32_t x = 0; x < width; x++) { + graph.devices[mesh[0][x]]->state.val = 255 << 16; + graph.devices[mesh[0][x]]->state.isConstant = true; + graph.devices[mesh[height-1][x]]->state.val = 40 << 16; + graph.devices[mesh[height-1][x]]->state.isConstant = true; + } + + // Apply constant heat at west edge + // Apply constant cool at east edge + for (uint32_t y = 0; y < height; y++) { + graph.devices[mesh[y][0]]->state.val = 255 << 16; + graph.devices[mesh[y][0]]->state.isConstant = true; + graph.devices[mesh[y][width-1]]->state.val = 40 << 16; + graph.devices[mesh[y][width-1]]->state.isConstant = true; + } + + // Write graph down to tinsel machine via HostLink + graph.write(&hostLink); + + // Load code and trigger execution + hostLink.boot("code.v", "data.v"); + hostLink.go(); + printf("Starting\n"); + + // Start timer + struct timeval start, finish, diff; + gettimeofday(&start, NULL); + + // Allocate array to contain final value of each device + uint32_t* pixels = new uint32_t [graph.numDevices]; + + // Receive final value of each device + for (uint32_t i = 0; i < graph.numDevices; i++) { + // Receive message + PMessage msg; + hostLink.recvMsg(&msg, sizeof(msg)); + if (i == 
0) gettimeofday(&finish, NULL); + // Save final value + pixels[msg.payload.from] = msg.payload.val; + } + + // Display time + timersub(&finish, &start, &diff); + double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + printf("Time = %lf\n", duration); + + // Emit image + FILE* fp = fopen("out.ppm", "wt"); + if (fp == NULL) { + printf("Can't open output file for writing\n"); + return -1; + } + fprintf(fp, "P3\n%d %d\n255\n", width, height); + for (uint32_t y = 0; y < height; y++) + for (uint32_t x = 0; x < width; x++) { + uint32_t val = (pixels[mesh[y][x]] >> 16) & 0xff; + fprintf(fp, "%d %d %d\n", + colours[val*3], colours[val*3+1], colours[val*3+2]); + } + fclose(fp); + + return 0; +} diff --git a/apps/POLite/heat-pc/.asp.cpp.swp b/apps/POLite/heat-pc/.asp.cpp.swp new file mode 100644 index 0000000000000000000000000000000000000000..2d2936e2dd2badda5887be7d43054d6c78666e79 GIT binary patch literal 16384 zcmeI2U2Ggz700KE3#1_}N`Z$;NH?3>*4E#jdnPkgmO3PM6c2oWmLH!4vHsCYoY8~=0feC*oZ zxTz2VF)RJNo;&yB+;h(TIQMRI{E5ZK*aM^E20kA#jL%+K{#@stsex;zVRZb-`V^(_ z^J#92z~3hiy}qg{9E)6Ea&OEGLpgTt@g+CnW2Ovip+#S#Rv5mAs){Uy1_}+_jt1h$ zsXcJSI5|;QL;ZN^7`ylAh1;p2$XjTj&_JPqLIZ^c3Jnw*C^S%LpwPhoxCWx|fbj+N z;Xu-ltI78rJH8Jl&ojw4m7!0XPd^oZ3k?(+C^S%LpwK{}fkFd?1_})n8YnbSXrRzQ zp@IKK4VboJlpxp?9RT3|KaKxy-eVX)1wR3=fNS7OAOsh|0+O>xGhhxJ0)PFmVf-2V349;;;8E}hD1oEk0Qmc1 z!*~b$8oUIa2Uox%_&7KU2Eak^i@OZtS+EWs1)l(i!8?Zx<8AN<@EVAK15SZ|e#kI> z2Yv&-2fhh5z{B7WxDyP3>mP(}z-e$F`0WP_;~U^>;5l#wbig`jfQ#UM@a~=HC%6f& zgEznnU>mG}6W|DVcK~vNx4{p=w}Au!@WFZTAQ%7#!J9a&`5CwlUIj0LuLB>T5@K4!H|zs2n}zfoaU;Nw>!F`vn@#AxN@#CpVhwB&QQ)ouo{kZW#Z z?NTGnmWgu-nd&p_5}QTlxU-z)M;b(8my>$)TDvptYGFKUp}P;66soeUZC9(pg(|4& zz~*v>xp}L{Lt(Z$CJQ$u&O(&IsmY4AYSuO6M9UIZw>a6I zUiXoERcGo_yKYtg&0AmQPe)3n=ntk_ z^xeocJy-HfSxIZE*J>2`$B`Ec;ikSgT2UaS$aCj+4h*r$VE)nQm^Ymj>|*Je?`nr@VO z56qV}M>%A!dYXr0-*yG2aBf$+=UH1ujWuPsHTt(!qiFy`QV-H>DwKLa>x?wEbw(7? 
z83iX5InpwH2R<>vN`p2Vv`Zsw2zl@qW%d~OBP^+*k4fh3R?jS}2YQ!OSSH3vVl9}S zJXm-CawnlxCbwK#V7T66JIbU@*GHnA^S0DIRChAd3HtPFoMVn&(VSoynT+pQ0tDau#B;J!OTMz}# z#~#u<2^ui$6r~K;*>t|bW2S>YMeqly^AR4P1Q|9P^o;5%|E_SVNHzHjepPB zNj4Jod=G(!sOm1U$CGCz6MC+m)%=esn=+>AQ+hCcn{gU?N}3*`g(IlMAd8s+r{=;?tBG zV{tTDDU3a@gf*!ZpsIEU3dD*8Ge_8f&|BFqRZxFE4*1hBXfBEAz>e zPuK5|e zc9kz%)bmvlZP#aL;a<*6HZ@tJRbVR+w#0rgUj%e1gki}pqFT+~(ULZ^*9%qmWAAIR z$wqp$YY?jX7F>Z|V?WU0a5lJ)+T@r*nA@x!co3UBq{|7jZ9Vn0pESCB_mHOrGFcnp zxv4xNnEMsT*) z&NLfF*p=qF&0T43KuLKUr4`5%bXGZvmq;J0=gvM^TXHRqU)$FfZK%|BTrQ^B!>ea& z6SXs*iQ!{V{9i?^JAqi3;{Wu1|5u3Re-7RNuY(uBv)~N44-A06AfA66d=Z%76u2Aw z4Ke%;@DlhA_!hVZo(EqAKInjT&;Stc=g}DCr z;0NF}@LeFm1#l946x<8mMBi?LR{;m7!Kc6^Q2pLN;4N|#8YnbSXrRzQp@BjJg$DkQ zH9*lZ0>@bR6dmFr37SghvF%0NG3YrePO@WtzO3RMJBXV&#ZJ?Zt*nzO2pAn_G}>I3 z++xGKld(y)*chvitIwzkNNIyDWZWzdIj--@Hk}(qrl9j9s*PIgk}Ej|MSh9XCjThoaiEvqY9kyYo zufh;{j*BPMr3utBHOZ>or^;w8jmz@WT@oP@u@y0ra?`|NtnPlQ4$^|^h)9Eiy(AsU z;u|G%76>IBS<5@84%>AQ(^XN|4M%SUc|p~qY95#NIE*?Yn`VRYC}S$}qW_g4P=R`> zBvh|d19^aKiq_^@y@BKE{J4@Vvm-=C^N#6Bb&sS^b^3+=sS~e;+;W}myo1jSlvOyN?n69s`!06qGj@hduUPYgOCqo4;)6@4sTMrVP&DZLxek~sz z*3(%K`jXz26a49j;WMGj-FFmmodq5Bc#h&b8oj)NcMF za|OC>TVHR+u7~@$^)Lw40ioxW({Yy6M9NV~*{x50Po%RX_f_g^Xh@BEIyUK&J@a#_ z2<`5coDH$7X{zpFDspxTdy9+6?A9;6m{KBU0`&AbNzJOg^-WIyyBu`F#yG`sSb1n@ zI=RBj(}t&MQ0ptwL_s*VA%!7;Uq~-Y>9_%&9@=!mpY}hQB>Fi@?h3i#yWIiTtZ +#include +#include +#include +#include +#include +#include + +int main(int argc, char**argv) +{ + if (argc != 2) { + printf("Specify edges file\n"); + exit(EXIT_FAILURE); + } + + // Read network + EdgeList net; + net.read(argv[1]); + + // Create states + float* heat = new float [net.numNodes]; + float* heatNext = new float [net.numNodes]; + srand(1); + for (int i = 0; i < net.numNodes; i++) { + int r = rand() % 255; + heat[i] = (float) r; + } + + // Start timer + printf("Started\n"); + struct timeval start, finish, diff; + gettimeofday(&start, NULL); + + for (int t = 0; t < 100; t++) { + for (int i = 0; i < net.numNodes; i++) { + 
uint32_t numNeighbours = net.neighbours[i][0]; + float acc = 0.0; + for (uint32_t j = 0; j < numNeighbours; j++) { + uint32_t neighbour = net.neighbours[i][j+1]; + acc += heat[neighbour]; + } + heatNext[i] = acc / (float) numNeighbours; + } + float* tmp = heat; heat = heatNext; heatNext = tmp; + } + + // Stop timer + gettimeofday(&finish, NULL); + + // Display final values of first ten devices + for (uint32_t i = 0; i < 10; i++) { + if (i < net.numNodes) + printf("%d: %f\n", i, heat[i]); + } + + // Display time + timersub(&finish, &start, &diff); + double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + printf("Time = %lf\n", duration); + + return 0; +} diff --git a/apps/POLite/heat-sync/Colours.cpp b/apps/POLite/heat-sync/Colours.cpp deleted file mode 100644 index 93b49740..00000000 --- a/apps/POLite/heat-sync/Colours.cpp +++ /dev/null @@ -1,71 +0,0 @@ -// SPDX-License-Identifier: BSD-2-Clause -#include - -// 256 x RGB colours representing heat intensities -uint8_t colours[] = { - 0x00, 0x00, 0x76, 0x00, 0x00, 0x7a, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x83, - 0x00, 0x00, 0x88, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x91, 0x00, 0x00, 0x95, - 0x00, 0x00, 0x9a, 0x00, 0x00, 0x9e, 0x00, 0x00, 0xa3, 0x00, 0x00, 0xa3, - 0x00, 0x00, 0xa7, 0x00, 0x00, 0xac, 0x00, 0x00, 0xb0, 0x00, 0x00, 0xb5, - 0x00, 0x00, 0xb9, 0x00, 0x00, 0xbe, 0x00, 0x00, 0xc2, 0x00, 0x00, 0xc7, - 0x00, 0x00, 0xcb, 0x00, 0x00, 0xd0, 0x00, 0x00, 0xd4, 0x00, 0x00, 0xd9, - 0x00, 0x00, 0xde, 0x00, 0x00, 0xe2, 0x00, 0x00, 0xe7, 0x00, 0x00, 0xeb, - 0x00, 0x00, 0xf0, 0x00, 0x00, 0xf4, 0x00, 0x00, 0xf9, 0x00, 0x00, 0xfd, - 0x00, 0x03, 0xff, 0x00, 0x07, 0xff, 0x00, 0x0c, 0xff, 0x00, 0x10, 0xff, - 0x00, 0x15, 0xff, 0x00, 0x19, 0xff, 0x00, 0x1e, 0xff, 0x00, 0x22, 0xff, - 0x00, 0x27, 0xff, 0x00, 0x2b, 0xff, 0x00, 0x30, 0xff, 0x00, 0x34, 0xff, - 0x00, 0x39, 0xff, 0x00, 0x3d, 0xff, 0x00, 0x42, 0xff, 0x00, 0x47, 0xff, - 0x00, 0x4b, 0xff, 0x00, 0x50, 0xff, 0x00, 0x54, 0xff, 0x00, 0x59, 0xff, - 0x00, 0x5d, 0xff, 
0x00, 0x62, 0xff, 0x00, 0x66, 0xff, 0x00, 0x6b, 0xff, - 0x00, 0x6f, 0xff, 0x00, 0x74, 0xff, 0x00, 0x78, 0xff, 0x00, 0x7d, 0xff, - 0x00, 0x81, 0xff, 0x00, 0x86, 0xff, 0x00, 0x8a, 0xff, 0x00, 0x8f, 0xff, - 0x00, 0x93, 0xff, 0x00, 0x98, 0xff, 0x00, 0x9c, 0xff, 0x00, 0xa1, 0xff, - 0x00, 0xa5, 0xff, 0x00, 0xaa, 0xff, 0x00, 0xaf, 0xff, 0x00, 0xb3, 0xff, - 0x00, 0xb8, 0xff, 0x00, 0xbc, 0xff, 0x00, 0xc1, 0xff, 0x00, 0xc5, 0xff, - 0x00, 0xca, 0xff, 0x00, 0xce, 0xff, 0x00, 0xd3, 0xff, 0x00, 0xd7, 0xff, - 0x00, 0xdc, 0xff, 0x00, 0xe0, 0xff, 0x00, 0xe5, 0xff, 0x00, 0xe9, 0xff, - 0x00, 0xee, 0xff, 0x00, 0xf2, 0xff, 0x00, 0xf7, 0xff, 0x00, 0xfb, 0xff, - 0x00, 0xff, 0xff, 0x00, 0xff, 0xfa, 0x00, 0xff, 0xf5, 0x00, 0xff, 0xf1, - 0x00, 0xff, 0xec, 0x00, 0xff, 0xe7, 0x00, 0xff, 0xe3, 0x00, 0xff, 0xde, - 0x00, 0xff, 0xda, 0x00, 0xff, 0xd5, 0x00, 0xff, 0xd1, 0x00, 0xff, 0xcc, - 0x00, 0xff, 0xc8, 0x00, 0xff, 0xc3, 0x00, 0xff, 0xbf, 0x00, 0xff, 0xba, - 0x00, 0xff, 0xb6, 0x00, 0xff, 0xb1, 0x00, 0xff, 0xad, 0x00, 0xff, 0xa8, - 0x00, 0xff, 0xa4, 0x00, 0xff, 0x9f, 0x00, 0xff, 0x9b, 0x00, 0xff, 0x96, - 0x00, 0xff, 0x92, 0x00, 0xff, 0x8d, 0x00, 0xff, 0x89, 0x00, 0xff, 0x84, - 0x00, 0xff, 0x80, 0x00, 0xff, 0x7b, 0x00, 0xff, 0x76, 0x00, 0xff, 0x72, - 0x00, 0xff, 0x6d, 0x00, 0xff, 0x69, 0x00, 0xff, 0x64, 0x00, 0xff, 0x60, - 0x00, 0xff, 0x5b, 0x00, 0xff, 0x57, 0x00, 0xff, 0x52, 0x00, 0xff, 0x4e, - 0x00, 0xff, 0x49, 0x00, 0xff, 0x45, 0x00, 0xff, 0x40, 0x00, 0xff, 0x3c, - 0x00, 0xff, 0x37, 0x00, 0xff, 0x33, 0x00, 0xff, 0x2e, 0x00, 0xff, 0x2a, - 0x00, 0xff, 0x25, 0x00, 0xff, 0x21, 0x00, 0xff, 0x1c, 0x00, 0xff, 0x18, - 0x00, 0xff, 0x13, 0x00, 0xff, 0x0e, 0x00, 0xff, 0x0a, 0x00, 0xff, 0x05, - 0x00, 0xff, 0x01, 0x04, 0xff, 0x00, 0x08, 0xff, 0x00, 0x0d, 0xff, 0x00, - 0x11, 0xff, 0x00, 0x16, 0xff, 0x00, 0x1a, 0xff, 0x00, 0x1f, 0xff, 0x00, - 0x23, 0xff, 0x00, 0x28, 0xff, 0x00, 0x2c, 0xff, 0x00, 0x31, 0xff, 0x00, - 0x35, 0xff, 0x00, 0x3a, 0xff, 0x00, 0x3e, 0xff, 0x00, 0x43, 0xff, 0x00, - 0x47, 0xff, 0x00, 
0x4c, 0xff, 0x00, 0x50, 0xff, 0x00, 0x55, 0xff, 0x00, - 0x5a, 0xff, 0x00, 0x5e, 0xff, 0x00, 0x63, 0xff, 0x00, 0x67, 0xff, 0x00, - 0x6c, 0xff, 0x00, 0x70, 0xff, 0x00, 0x75, 0xff, 0x00, 0x79, 0xff, 0x00, - 0x7e, 0xff, 0x00, 0x82, 0xff, 0x00, 0x87, 0xff, 0x00, 0x8b, 0xff, 0x00, - 0x90, 0xff, 0x00, 0x94, 0xff, 0x00, 0x99, 0xff, 0x00, 0x9d, 0xff, 0x00, - 0xa2, 0xff, 0x00, 0xa6, 0xff, 0x00, 0xab, 0xff, 0x00, 0xaf, 0xff, 0x00, - 0xb4, 0xff, 0x00, 0xb8, 0xff, 0x00, 0xbd, 0xff, 0x00, 0xc2, 0xff, 0x00, - 0xc6, 0xff, 0x00, 0xcb, 0xff, 0x00, 0xcf, 0xff, 0x00, 0xd4, 0xff, 0x00, - 0xd8, 0xff, 0x00, 0xdd, 0xff, 0x00, 0xe1, 0xff, 0x00, 0xe6, 0xff, 0x00, - 0xea, 0xff, 0x00, 0xef, 0xff, 0x00, 0xf3, 0xff, 0x00, 0xf8, 0xff, 0x00, - 0xfc, 0xff, 0x00, 0xff, 0xfd, 0x00, 0xff, 0xf9, 0x00, 0xff, 0xf4, 0x00, - 0xff, 0xf0, 0x00, 0xff, 0xeb, 0x00, 0xff, 0xe7, 0x00, 0xff, 0xe2, 0x00, - 0xff, 0xde, 0x00, 0xff, 0xd9, 0x00, 0xff, 0xd5, 0x00, 0xff, 0xd0, 0x00, - 0xff, 0xcb, 0x00, 0xff, 0xc7, 0x00, 0xff, 0xc2, 0x00, 0xff, 0xbe, 0x00, - 0xff, 0xb9, 0x00, 0xff, 0xb5, 0x00, 0xff, 0xb0, 0x00, 0xff, 0xac, 0x00, - 0xff, 0xa7, 0x00, 0xff, 0xa3, 0x00, 0xff, 0x9e, 0x00, 0xff, 0x9a, 0x00, - 0xff, 0x95, 0x00, 0xff, 0x91, 0x00, 0xff, 0x8c, 0x00, 0xff, 0x88, 0x00, - 0xff, 0x83, 0x00, 0xff, 0x7f, 0x00, 0xff, 0x7a, 0x00, 0xff, 0x76, 0x00, - 0xff, 0x71, 0x00, 0xff, 0x6d, 0x00, 0xff, 0x68, 0x00, 0xff, 0x63, 0x00, - 0xff, 0x5f, 0x00, 0xff, 0x5a, 0x00, 0xff, 0x56, 0x00, 0xff, 0x51, 0x00, - 0xff, 0x4d, 0x00, 0xff, 0x48, 0x00, 0xff, 0x44, 0x00, 0xff, 0x3f, 0x00, - 0xff, 0x3b, 0x00, 0xff, 0x36, 0x00, 0xff, 0x32, 0x00, 0xff, 0x2d, 0x00, - 0xff, 0x29, 0x00, 0xff, 0x24, 0x00, 0xff, 0x20, 0x00, 0xff, 0x1b, 0x00, - 0xff, 0x17, 0x00, 0xff, 0x12, 0x00, 0xff, 0x0e, 0x00, 0xff, 0x09, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 -}; diff --git a/apps/POLite/heat-sync/Colours.h b/apps/POLite/heat-sync/Colours.h deleted file mode 100644 index fc34e04c..00000000 --- a/apps/POLite/heat-sync/Colours.h 
+++ /dev/null @@ -1,10 +0,0 @@ -// SPDX-License-Identifier: BSD-2-Clause -#ifndef _COLOURS_H_ -#define _COLOURS_H_ - -#include - -// 256 x RGB colours representing heat intensities -extern uint8_t colours[]; - -#endif diff --git a/apps/POLite/heat-sync/Heat.h b/apps/POLite/heat-sync/Heat.h index b3a63a93..8dc926b3 100644 --- a/apps/POLite/heat-sync/Heat.h +++ b/apps/POLite/heat-sync/Heat.h @@ -2,24 +2,26 @@ #ifndef _HEAT_H_ #define _HEAT_H_ +#define POLITE_DUMP_STATS +#define POLITE_COUNT_MSGS #include struct HeatMessage { // Sender id uint32_t from; - // Time step - uint32_t time; // Temperature at sender - uint32_t val; + float val; }; struct HeatState { // Device id uint32_t id; - // Current time step of device - uint32_t time; // Current temperature of device - uint32_t val, acc; + float val, acc; + // Time step + uint16_t time; + // Number of neighbours + uint16_t numNeighbours; // Is the temperature of this device constant? bool isConstant; }; @@ -34,7 +36,6 @@ struct HeatDevice : PDevice { // Send handler inline void send(volatile HeatMessage* msg) { msg->from = s->id; - msg->time = s->time; msg->val = s->val; *readyToSend = No; } @@ -42,6 +43,7 @@ struct HeatDevice : PDevice { // Receive handler inline void recv(HeatMessage* msg, None* edge) { s->acc += msg->val; + s->numNeighbours++; } // Called by POLite when system becomes idle @@ -53,8 +55,9 @@ struct HeatDevice : PDevice { } else { s->time--; - if (!s->isConstant) s->val = s->acc >> 2; - s->acc = 0; + if (!s->isConstant) s->val = s->acc / (float) s->numNeighbours; + s->acc = 0.0; + s->numNeighbours = 0; *readyToSend = Pin(0); return true; } diff --git a/apps/POLite/heat-sync/Makefile b/apps/POLite/heat-sync/Makefile index 0c343edd..f44d5b09 100644 --- a/apps/POLite/heat-sync/Makefile +++ b/apps/POLite/heat-sync/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: BSD-2-Clause APP_CPP = Heat.cpp APP_HDR = Heat.h -RUN_CPP = Run.cpp Colours.cpp -RUN_H = Colours.h +RUN_CPP = Run.cpp +RUN_H = include 
../util/polite.mk diff --git a/apps/POLite/heat-sync/Run.cpp b/apps/POLite/heat-sync/Run.cpp index a938a446..c3db2fbf 100644 --- a/apps/POLite/heat-sync/Run.cpp +++ b/apps/POLite/heat-sync/Run.cpp @@ -1,17 +1,31 @@ // SPDX-License-Identifier: BSD-2-Clause #include "Heat.h" -#include "Colours.h" #include #include +#include #include -int main() +int main(int argc, char **argv) { - // Parameters - const uint32_t width = 256; - const uint32_t height = 256; - const uint32_t time = 1000; + const uint32_t time = 1000; + + // Read in the example edge list and create data structure + if (argc != 2) { + printf("Specify edge file\n"); + exit(EXIT_FAILURE); + } + + // Load in the edge list file + printf("Loading in the graph..."); fflush(stdout); + EdgeList net; + net.read(argv[1]); + printf(" done\n"); + + // Print max fan-out + printf("Min fan-out = %d\n", net.minFanOut()); + printf("Max fan-out = %d\n", net.maxFanOut()); + assert(net.minFanOut() > 0); // Connection to tinsel machine HostLink hostLink; @@ -19,55 +33,31 @@ int main() // Create POETS graph PGraph graph; - // Create 2D mesh of devices - PDeviceId **mesh = new PDeviceId* [height]; - for (uint32_t y = 0; y < height; y++) { - mesh[y] = new PDeviceId [width]; - for (uint32_t x = 0; x < width; x++) - mesh[y][x] = graph.newDevice(); + // Create nodes in POETS graph + for (uint32_t i = 0; i < net.numNodes; i++) { + PDeviceId id = graph.newDevice(); + assert(i == id); } - // Add edges - for (uint32_t y = 0; y < height; y++) - for (uint32_t x = 0; x < width; x++) { - if (x < width-1) { - graph.addEdge(mesh[y][x], 0, mesh[y][x+1]); - graph.addEdge(mesh[y][x+1], 0, mesh[y][x]); - } - if (y < height-1) { - graph.addEdge(mesh[y][x], 0, mesh[y+1][x]); - graph.addEdge(mesh[y+1][x], 0, mesh[y][x]); - } - } + // Create connections in POETS graph + for (uint32_t i = 0; i < net.numNodes; i++) { + uint32_t numNeighbours = net.neighbours[i][0]; + for (uint32_t j = 0; j < numNeighbours; j++) + graph.addEdge(i, 0, 
net.neighbours[i][j+1]); + } // Prepare mapping from graph to hardware graph.map(); - // Set device ids - for (uint32_t y = 0; y < height; y++) - for (uint32_t x = 0; x < width; x++) - graph.devices[mesh[y][x]]->state.id = mesh[y][x]; - // Specify number of time steps to run on each device - for (PDeviceId i = 0; i < graph.numDevices; i++) + srand(1); + for (PDeviceId i = 0; i < graph.numDevices; i++) { + int r = rand() % 255; + graph.devices[i]->state.id = i; graph.devices[i]->state.time = time; - - // Apply constant heat at north edge - // Apply constant cool at south edge - for (uint32_t x = 0; x < width; x++) { - graph.devices[mesh[0][x]]->state.val = 255 << 16; - graph.devices[mesh[0][x]]->state.isConstant = true; - graph.devices[mesh[height-1][x]]->state.val = 40 << 16; - graph.devices[mesh[height-1][x]]->state.isConstant = true; - } - - // Apply constant heat at west edge - // Apply constant cool at east edge - for (uint32_t y = 0; y < height; y++) { - graph.devices[mesh[y][0]]->state.val = 255 << 16; - graph.devices[mesh[y][0]]->state.isConstant = true; - graph.devices[mesh[y][width-1]]->state.val = 40 << 16; - graph.devices[mesh[y][width-1]]->state.isConstant = true; + graph.devices[i]->state.val = (float) r; + graph.devices[i]->state.isConstant = false; + //graph.devices[i]->state.fanOut = graph.fanOut(i); } // Write graph down to tinsel machine via HostLink @@ -82,8 +72,11 @@ int main() struct timeval start, finish, diff; gettimeofday(&start, NULL); + // Consume performance stats + politeSaveStats(&hostLink, "stats.txt"); + // Allocate array to contain final value of each device - uint32_t* pixels = new uint32_t [graph.numDevices]; + float* pixels = new float [graph.numDevices]; // Receive final value of each device for (uint32_t i = 0; i < graph.numDevices; i++) { @@ -95,25 +88,17 @@ int main() pixels[msg.payload.from] = msg.payload.val; } + // Display final values of first ten devices + for (uint32_t i = 0; i < 10; i++) { + if (i < graph.numDevices) { 
+ printf("%d: %f\n", i, pixels[i]); + } + } + // Display time timersub(&finish, &start, &diff); double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; printf("Time = %lf\n", duration); - // Emit image - FILE* fp = fopen("out.ppm", "wt"); - if (fp == NULL) { - printf("Can't open output file for writing\n"); - return -1; - } - fprintf(fp, "P3\n%d %d\n255\n", width, height); - for (uint32_t y = 0; y < height; y++) - for (uint32_t x = 0; x < width; x++) { - uint32_t val = (pixels[mesh[y][x]] >> 16) & 0xff; - fprintf(fp, "%d %d %d\n", - colours[val*3], colours[val*3+1], colours[val*3+2]); - } - fclose(fp); - return 0; } diff --git a/apps/POLite/izhikevich-gals/Izhikevich.cpp b/apps/POLite/izhikevich-gals/Izhikevich.cpp new file mode 100644 index 00000000..8533062a --- /dev/null +++ b/apps/POLite/izhikevich-gals/Izhikevich.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: BSD-2-Clause +#include "Izhikevich.h" + +#include +#include + +typedef PThread< + IzhikevichDevice, + IzhikevichState, // State + Weight, // Edge label + IzhikevichMsg // Message + > IzhikevichThread; + +int main() +{ + // Point thread structure at base of thread's heap + IzhikevichThread* thread = (IzhikevichThread*) tinselHeapBaseSRAM(); + + // Invoke interpreter + thread->run(); + + return 0; +} diff --git a/apps/POLite/izhikevich-gals/Izhikevich.h b/apps/POLite/izhikevich-gals/Izhikevich.h new file mode 100644 index 00000000..701af341 --- /dev/null +++ b/apps/POLite/izhikevich-gals/Izhikevich.h @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: BSD-2-Clause +// (Based on code by David Thomas) +#ifndef _Izhikevich_H_ +#define _Izhikevich_H_ + +#define POLITE_DUMP_STATS +#define POLITE_COUNT_MSGS +#include +#include "RNG.h" + +// Number of time steps to run for +#define NUM_STEPS 100 + +// Vertex state +struct IzhikevichState { + // Random-number-generator state + uint32_t rng; + // Neuron state + float u, v, I, acc, accNext; + uint32_t spikeCount; + // Protocol + bool sent; + 
uint16_t received, receivedNext, fanIn, time; + // Neuron properties + float a, b, c, d, Ir; +}; + +// Edge weight type +typedef float Weight; + +// Message type +struct IzhikevichMsg { + // Did the sender spike or not? + bool spike; + // Time step of sender + uint16_t time; + // Number of times sender has spiked + uint32_t spikeCount; +}; + +// Vertex behaviour +struct IzhikevichDevice : PDevice { + inline void init() { + s->v = -65.0f; + s->u = s->b * s->v; + s->I = s->Ir * grng(s->rng); + *readyToSend = Pin(0); + } + + // We call this on every state change + inline void change() { + // Execution complete? + if (s->time == NUM_STEPS) return; + + // Proceed to next time step? + if (s->sent && s->received == s->fanIn) { + s->time++; + s->I += s->acc; + s->acc = s->accNext; + s->accNext = 0; + s->received = s->receivedNext; + s->receivedNext = 0; + s->sent = false; + *readyToSend = s->time == (NUM_STEPS+1) ? No : Pin(0); + } + } + + // Send handler + inline void send(volatile IzhikevichMsg* msg) { + bool spike = false; + float &v = s->v; + float &u = s->u; + float &I = s->I; + v = v+0.5*(0.04*v*v+5*v+140-u+I); // Step 0.5 ms + v = v+0.5*(0.04*v*v+5*v+140-u+I); // for numerical + u = u + s->a*(s->b*v-u); // stability + if (v >= 30.0) { + v = s->c; + u += s->d; + s->spikeCount++; + spike = true; + } + s->I = s->Ir * grng(s->rng); + msg->time = s->time; + msg->spike = spike; + msg->spikeCount = s->spikeCount; + s->sent = true; + *readyToSend = No; + change(); + } + + // Receive handler + inline void recv(IzhikevichMsg* msg, Weight* weight) { + if (msg->time == s->time) { + if (msg->spike) s->acc += *weight; + s->received++; + change(); + } + else { + if (msg->spike) s->accNext += *weight; + s->receivedNext++; + } + } + + inline bool step() { + return false; + } + + inline bool finish(IzhikevichMsg* msg) { + msg->spikeCount = s->spikeCount; + return true; + } +}; + +#endif diff --git a/apps/POLite/izhikevich-gals/Makefile b/apps/POLite/izhikevich-gals/Makefile new file 
mode 100644 index 00000000..5ba3d9e3 --- /dev/null +++ b/apps/POLite/izhikevich-gals/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: BSD-2-Clause +APP_CPP = Izhikevich.cpp +APP_HDR = Izhikevich.h +RUN_CPP = Run.cpp + +include ../util/polite.mk diff --git a/apps/POLite/izhikevich-gals/RNG.h b/apps/POLite/izhikevich-gals/RNG.h new file mode 100644 index 00000000..61b719b3 --- /dev/null +++ b/apps/POLite/izhikevich-gals/RNG.h @@ -0,0 +1,23 @@ +#ifndef _RNG_H_ +#define _RNG_H_ + +inline uint32_t urng(uint32_t &state) { + state = state*1664525+1013904223; + return state; +} + +// World's crappiest gaussian (courtesy of dt10!) +inline float grng(uint32_t &state) { + uint32_t u=urng(state); + int32_t acc=0; + for(unsigned i=0;i<8;i++){ + acc += u&0xf; + u=u>>4; + } + // a four-bit uniform has mean 7.5 and variance ((15-0+1)^2-1)/12 = 85/4 + // sum of four uniforms has mean 8*7.5=60 and variance of 8*85/4=170 + const float scale=0.07669649888473704; // == 1/sqrt(170) + return (acc-60.0f) * scale; +} + +#endif diff --git a/apps/POLite/izhikevich-gals/Run.cpp b/apps/POLite/izhikevich-gals/Run.cpp new file mode 100644 index 00000000..43fb3d4d --- /dev/null +++ b/apps/POLite/izhikevich-gals/Run.cpp @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: BSD-2-Clause +#include "Izhikevich.h" + +#include +#include + +#include +#include +#include +#include + +inline double urand() { return (double) rand() / RAND_MAX; } + +int main(int argc, char**argv) +{ + if (argc != 2) { + printf("Specify edges file\n"); + exit(EXIT_FAILURE); + } + + // Read network + EdgeList net; + net.read(argv[1]); + + // Connection to tinsel machine + HostLink hostLink; + + // Create POETS graph + PGraph graph; + + // Create nodes in POETS graph + for (uint32_t i = 0; i < net.numNodes; i++) { + PDeviceId id = graph.newDevice(); + assert(i == id); + } + + // Ratio of excitatory to inhibitory neurons + double excitatory = 0.8; + + // Mark each neuron as excitatory (or inhibiatory) + srand(1); + bool* excite = 
new bool [net.numNodes]; + for (int i = 0; i < net.numNodes; i++) + excite[i] = urand() < excitatory; + + // Create connections in POETS graph + for (uint32_t i = 0; i < net.numNodes; i++) { + uint32_t numNeighbours = net.neighbours[i][0]; + for (uint32_t j = 0; j < numNeighbours; j++) { + float weight = excite[i] ? 0.5 * urand() : -urand(); + graph.addLabelledEdge(weight, i, 0, net.neighbours[i][j+1]); + } + } + + // Add zero-weight back-edges for any directed edges + // (For GALS synchronisation) + for (uint32_t i = 0; i < net.numNodes; i++) { + for (uint32_t j = 0; j < net.neighbours[i][0]; j++) { + uint32_t n = net.neighbours[i][j+1]; + // TODO: can be more efficient here + bool needBackEdge = true; + for (uint32_t k = 0; k < net.neighbours[n][0]; k++) + if (net.neighbours[n][k+1] == i) needBackEdge = false; + if (needBackEdge) graph.addLabelledEdge(0.0, n, 0, i); + } + } + + // Prepare mapping from graph to hardware + graph.map(); + + srand(2); + // Initialise devices + for (PDeviceId i = 0; i < graph.numDevices; i++) { + IzhikevichState* n = &graph.devices[i]->state; + n->rng = (int32_t) (urand()*((double) (1<<31))); + n->fanIn = graph.fanIn(i); + if (excite[i]) { + float re = (float) urand(); + n->a = 0.02; + n->b = 0.2; + n->c = -65+15*re*re; + n->d = 8-6*re*re; + n->Ir = 5; + } + else { + float ri = (float) urand(); + n->a = 0.02+0.08*ri; + n->b = 0.25-0.05*ri; + n->c = -65; + n->d = 2; + n->Ir = 2; + } + } + + // Write graph down to tinsel machine via HostLink + graph.write(&hostLink); + + // Load code and trigger execution + hostLink.boot("code.v", "data.v"); + hostLink.go(); + + // Timer + printf("Started\n"); + struct timeval start, finish, diff; + gettimeofday(&start, NULL); + + // Consume performance stats + politeSaveStats(&hostLink, "stats.txt"); + + int64_t sum = 0; + // Receive final distance to each vertex + for (uint32_t i = 0; i < graph.numDevices; i++) { + // Receive message + PMessage msg; + hostLink.recvMsg(&msg, sizeof(msg)); + if (i == 0) 
gettimeofday(&finish, NULL); + // Accumulate + sum += msg.payload.spikeCount; + } + + // Emit result + printf("Total spikes = %ld\n", sum); + + // Display time + timersub(&finish, &start, &diff); + double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + printf("Time = %lf\n", duration); + + return 0; +} diff --git a/apps/POLite/izhikevich-pc/Izhikevich.cpp b/apps/POLite/izhikevich-pc/Izhikevich.cpp new file mode 100644 index 00000000..b4f03ed5 --- /dev/null +++ b/apps/POLite/izhikevich-pc/Izhikevich.cpp @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: BSD-2-Clause +// (Based on code by David Thomas) + +#include +#include +#include +#include "RNG.h" + +#define NUM_STEPS 100 + +// Neuron +struct Neuron { + // Random-number-generator state + uint32_t rng; + // Neuron state + float u, v, I, spikeCount; + // Neuron properties + float a, b, c, d, Ir; +}; + +int main(int argc, char**argv) +{ + if (argc != 2) { + printf("Specify edges file\n"); + exit(EXIT_FAILURE); + } + + // Read network + EdgeList net; + net.read(argv[1]); + + // Ratio of excitatory to inhibitory neurons + double excitatory = 0.8; + + // Mark each neuron as excitatory (or inhibiatory) + srand(1); + bool* excite = new bool [net.numNodes]; + for (int i = 0; i < net.numNodes; i++) { + excite[i] = urand() < excitatory; + } + + // Edge weights + float** weight = new float* [net.numNodes]; + for (int i = 0; i < net.numNodes; i++) { + uint32_t numEdges = net.neighbours[i][0]; + weight[i] = new float [numEdges]; + for (int j = 0; j < numEdges; j++) { + weight[i][j] = excite[i] ? 
0.5 * urand() : -urand(); + } + } + + // State for each neuron + srand(2); + Neuron* neuron = new Neuron [net.numNodes]; + for (int i = 0; i < net.numNodes; i++) { + Neuron* n = &neuron[i]; + n->rng = (int32_t) (urand()*((double) (1<<31))); + if (excite[i]) { + float re = (float) urand(); + n->a = 0.02; + n->b = 0.2; + n->c = -65+15*re*re; + n->d = 8-6*re*re; + n->Ir = 5; + } + else { + float ri = (float) urand(); + n->a = 0.02+0.08*ri; + n->b = 0.25-0.05*ri; + n->c = -65; + n->d = 2; + n->Ir = 2; + } + } + + // Spike array + bool* spike = new bool [net.numNodes]; + + // Initialisation + for (int i = 0; i < net.numNodes; i++) { + Neuron* n = &neuron[i]; + n->v = -65.0; + n->u = n->b * n->v; + n->I = n->Ir * grng(n->rng); + } + + // Timer + printf("Started\n"); + struct timeval start, finish, diff; + gettimeofday(&start, NULL); + + // Simulation + int64_t totalSpikes = 0; + for (int t = 0; t <= NUM_STEPS; t++) { + // Update state + for (int i = 0; i < net.numNodes; i++) { + spike[i] = false; + Neuron* n = &neuron[i]; + float &v = n->v; + float &u = n->u; + float &I = n->I; + v = v+0.5*(0.04*v*v+5*v+140-u+I); // Step 0.5 ms + v = v+0.5*(0.04*v*v+5*v+140-u+I); // for numerical + u = u + n->a*(n->b*v-u); // stability + if (v >= 30.0) { + n->v = n->c; + n->u += n->d; + spike[i] = true; + } + n->I = n->Ir * grng(n->rng); + } + // Update I-values + uint32_t spikes = 0; + for (int i = 0; i < net.numNodes; i++) { + Neuron* n = &neuron[i]; + if (spike[i]) { + spikes++; + n->spikeCount++; + uint32_t numEdges = net.neighbours[i][0]; + uint32_t* dst = &net.neighbours[i][1]; + for (int j = 0; j < numEdges; j++) { + neuron[dst[j]].I += weight[i][j]; + } + } + } + //printf("%d: %d\n", t, spikes); + totalSpikes += spikes; + } + gettimeofday(&finish, NULL); + + printf("Total spikes: %ld\n", totalSpikes); + + // Display time + timersub(&finish, &start, &diff); + double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + printf("Time = %lf\n", duration); + + return 
0; +} diff --git a/apps/POLite/izhikevich-pc/Makefile b/apps/POLite/izhikevich-pc/Makefile new file mode 100644 index 00000000..52c92c74 --- /dev/null +++ b/apps/POLite/izhikevich-pc/Makefile @@ -0,0 +1,6 @@ +Izhikevich: Izhikevich.cpp RNG.h + g++ -I../../../include -O2 Izhikevich.cpp -o Izhikevich + +.PHONY: clean +clean: + rm Izhikevich diff --git a/apps/POLite/izhikevich-pc/RNG.h b/apps/POLite/izhikevich-pc/RNG.h new file mode 100644 index 00000000..decc32f1 --- /dev/null +++ b/apps/POLite/izhikevich-pc/RNG.h @@ -0,0 +1,27 @@ +#ifndef _RNG_H_ +#define _RNG_H_ + +inline uint32_t urng(uint32_t &state) { + state = state*1664525+1013904223; + return state; +} + +// World's crappiest gaussian (courtesy of dt10!) +inline float grng(uint32_t &state) { + uint32_t u=urng(state); + int32_t acc=0; + for(unsigned i=0;i<8;i++){ + acc += u&0xf; + u=u>>4; + } + // a four-bit uniform has mean 7.5 and variance ((15-0+1)^2-1)/12 = 85/4 + // sum of four uniforms has mean 8*7.5=60 and variance of 8*85/4=170 + const float scale=0.07669649888473704; // == 1/sqrt(170) + return (acc-60.0f) * scale; +} + +inline double urand() { + return (double) rand() / RAND_MAX; +} + +#endif diff --git a/apps/POLite/izhikevich-sync/Izhikevich.cpp b/apps/POLite/izhikevich-sync/Izhikevich.cpp new file mode 100644 index 00000000..8533062a --- /dev/null +++ b/apps/POLite/izhikevich-sync/Izhikevich.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: BSD-2-Clause +#include "Izhikevich.h" + +#include +#include + +typedef PThread< + IzhikevichDevice, + IzhikevichState, // State + Weight, // Edge label + IzhikevichMsg // Message + > IzhikevichThread; + +int main() +{ + // Point thread structure at base of thread's heap + IzhikevichThread* thread = (IzhikevichThread*) tinselHeapBaseSRAM(); + + // Invoke interpreter + thread->run(); + + return 0; +} diff --git a/apps/POLite/izhikevich-sync/Izhikevich.h b/apps/POLite/izhikevich-sync/Izhikevich.h new file mode 100644 index 00000000..150a4afa --- /dev/null +++ 
b/apps/POLite/izhikevich-sync/Izhikevich.h @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: BSD-2-Clause +// (Based on code by David Thomas) +#ifndef _Izhikevich_H_ +#define _Izhikevich_H_ + +#define POLITE_DUMP_STATS +#define POLITE_COUNT_MSGS + +#include +#include "RNG.h" + +// Number of time steps to run for +#define NUM_STEPS 100 + +// Vertex state +struct IzhikevichState { + // Random-number-generator state + uint32_t rng; + // Neuron state + float u, v, I; + uint32_t spikeCount; + // Neuron properties + float a, b, c, d, Ir; +}; + +// Edge weight type +typedef float Weight; + +// Message type +struct IzhikevichMsg { + // Number of times sender has spiked + uint32_t spikeCount; +}; + +// Vertex behaviour +struct IzhikevichDevice : PDevice { + inline void init() { + s->v = -65.0f; + s->u = s->b * s->v; + s->I = s->Ir * grng(s->rng); + *readyToSend = No; + } + inline void send(IzhikevichMsg* msg) { + s->spikeCount++; + msg->spikeCount = s->spikeCount; + *readyToSend = No; + } + inline void recv(IzhikevichMsg* msg, Weight* weight) { + s->I += *weight; + } + inline bool step() { + float &v = s->v; + float &u = s->u; + float &I = s->I; + v = v+0.5*(0.04*v*v+5*v+140-u+I); // Step 0.5 ms + v = v+0.5*(0.04*v*v+5*v+140-u+I); // for numerical + u = u + s->a*(s->b*v-u); // stability + if (v >= 30.0) { + v = s->c; + u += s->d; + *readyToSend = Pin(0); + } + s->I = s->Ir * grng(s->rng); + return (time < NUM_STEPS); + } + inline bool finish(IzhikevichMsg* msg) { + msg->spikeCount = s->spikeCount; + return true; + } +}; + +#endif diff --git a/apps/POLite/izhikevich-sync/Makefile b/apps/POLite/izhikevich-sync/Makefile new file mode 100644 index 00000000..5ba3d9e3 --- /dev/null +++ b/apps/POLite/izhikevich-sync/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: BSD-2-Clause +APP_CPP = Izhikevich.cpp +APP_HDR = Izhikevich.h +RUN_CPP = Run.cpp + +include ../util/polite.mk diff --git a/apps/POLite/izhikevich-sync/RNG.h b/apps/POLite/izhikevich-sync/RNG.h new file mode 100644 index 
00000000..61b719b3 --- /dev/null +++ b/apps/POLite/izhikevich-sync/RNG.h @@ -0,0 +1,23 @@ +#ifndef _RNG_H_ +#define _RNG_H_ + +inline uint32_t urng(uint32_t &state) { + state = state*1664525+1013904223; + return state; +} + +// World's crappiest gaussian (courtesy of dt10!) +inline float grng(uint32_t &state) { + uint32_t u=urng(state); + int32_t acc=0; + for(unsigned i=0;i<8;i++){ + acc += u&0xf; + u=u>>4; + } + // a four-bit uniform has mean 7.5 and variance ((15-0+1)^2-1)/12 = 85/4 + // sum of four uniforms has mean 8*7.5=60 and variance of 8*85/4=170 + const float scale=0.07669649888473704; // == 1/sqrt(170) + return (acc-60.0f) * scale; +} + +#endif diff --git a/apps/POLite/izhikevich-sync/Run.cpp b/apps/POLite/izhikevich-sync/Run.cpp new file mode 100644 index 00000000..dd1ac79e --- /dev/null +++ b/apps/POLite/izhikevich-sync/Run.cpp @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: BSD-2-Clause +#include "Izhikevich.h" + +#include +#include + +#include +#include +#include +#include + +inline double urand() { return (double) rand() / RAND_MAX; } + +int main(int argc, char**argv) +{ + if (argc != 2) { + printf("Specify edges file\n"); + exit(EXIT_FAILURE); + } + + // Read network + EdgeList net; + net.read(argv[1]); + assert(net.minFanOut() > 0); + + // Connection to tinsel machine + HostLink hostLink; + + // Create POETS graph + PGraph graph; + + // Create nodes in POETS graph + for (uint32_t i = 0; i < net.numNodes; i++) { + PDeviceId id = graph.newDevice(); + assert(i == id); + } + + // Ratio of excitatory to inhibitory neurons + double excitatory = 0.8; + + // Mark each neuron as excitatory (or inhibiatory) + srand(1); + bool* excite = new bool [net.numNodes]; + for (int i = 0; i < net.numNodes; i++) + excite[i] = urand() < excitatory; + + // Create connections in POETS graph + for (uint32_t i = 0; i < net.numNodes; i++) { + uint32_t numNeighbours = net.neighbours[i][0]; + for (uint32_t j = 0; j < numNeighbours; j++) { + float weight = excite[i] ? 
0.5 * urand() : -urand(); + graph.addLabelledEdge(weight, i, 0, net.neighbours[i][j+1]); + } + } + + // Prepare mapping from graph to hardware + graph.map(); + + srand(2); + // Initialise devices + for (PDeviceId i = 0; i < graph.numDevices; i++) { + IzhikevichState* n = &graph.devices[i]->state; + n->rng = (int32_t) (urand()*((double) (1<<31))); + if (excite[i]) { + float re = (float) urand(); + n->a = 0.02; + n->b = 0.2; + n->c = -65+15*re*re; + n->d = 8-6*re*re; + n->Ir = 5; + } + else { + float ri = (float) urand(); + n->a = 0.02+0.08*ri; + n->b = 0.25-0.05*ri; + n->c = -65; + n->d = 2; + n->Ir = 2; + } + } + + // Write graph down to tinsel machine via HostLink + graph.write(&hostLink); + + // Load code and trigger execution + hostLink.boot("code.v", "data.v"); + hostLink.go(); + + // Timer + printf("Started\n"); + struct timeval start, finish, diff; + gettimeofday(&start, NULL); + + // Consume performance stats + politeSaveStats(&hostLink, "stats.txt"); + + int64_t sum = 0; + // Receive final distance to each vertex + for (uint32_t i = 0; i < graph.numDevices; i++) { + // Receive message + PMessage msg; + hostLink.recvMsg(&msg, sizeof(msg)); + if (i == 0) gettimeofday(&finish, NULL); + // Accumulate + sum += msg.payload.spikeCount; + } + + // Emit result + printf("Total spikes = %ld\n", sum); + + // Display time + timersub(&finish, &start, &diff); + double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + printf("Time = %lf\n", duration); + + return 0; +} diff --git a/apps/POLite/pagerank-sync/Run.cpp b/apps/POLite/pagerank-sync/Run.cpp index 435a0750..3ce786b5 100644 --- a/apps/POLite/pagerank-sync/Run.cpp +++ b/apps/POLite/pagerank-sync/Run.cpp @@ -27,6 +27,7 @@ int main(int argc, char **argv) EdgeList net; net.read(argv[1]); printf(" done\n"); + assert(net.minFanOut() > 0); // Print max fan-out printf("Max fan-out = %d\n", net.maxFanOut()); diff --git a/apps/POLite/sssp-async/Run.cpp b/apps/POLite/sssp-async/Run.cpp index 
c7953795..b9c174a3 100644 --- a/apps/POLite/sssp-async/Run.cpp +++ b/apps/POLite/sssp-async/Run.cpp @@ -22,6 +22,7 @@ int main(int argc, char**argv) // Print max fan-out printf("Max fan-out = %d\n", net.maxFanOut()); + assert(net.minFanOut() > 0); // Connection to tinsel machine HostLink hostLink; diff --git a/apps/POLite/sssp-pc/.asp.cpp.swp b/apps/POLite/sssp-pc/.asp.cpp.swp new file mode 100644 index 0000000000000000000000000000000000000000..2d2936e2dd2badda5887be7d43054d6c78666e79 GIT binary patch literal 16384 zcmeI2U2Ggz700KE3#1_}N`Z$;NH?3>*4E#jdnPkgmO3PM6c2oWmLH!4vHsCYoY8~=0feC*oZ zxTz2VF)RJNo;&yB+;h(TIQMRI{E5ZK*aM^E20kA#jL%+K{#@stsex;zVRZb-`V^(_ z^J#92z~3hiy}qg{9E)6Ea&OEGLpgTt@g+CnW2Ovip+#S#Rv5mAs){Uy1_}+_jt1h$ zsXcJSI5|;QL;ZN^7`ylAh1;p2$XjTj&_JPqLIZ^c3Jnw*C^S%LpwPhoxCWx|fbj+N z;Xu-ltI78rJH8Jl&ojw4m7!0XPd^oZ3k?(+C^S%LpwK{}fkFd?1_})n8YnbSXrRzQ zp@IKK4VboJlpxp?9RT3|KaKxy-eVX)1wR3=fNS7OAOsh|0+O>xGhhxJ0)PFmVf-2V349;;;8E}hD1oEk0Qmc1 z!*~b$8oUIa2Uox%_&7KU2Eak^i@OZtS+EWs1)l(i!8?Zx<8AN<@EVAK15SZ|e#kI> z2Yv&-2fhh5z{B7WxDyP3>mP(}z-e$F`0WP_;~U^>;5l#wbig`jfQ#UM@a~=HC%6f& zgEznnU>mG}6W|DVcK~vNx4{p=w}Au!@WFZTAQ%7#!J9a&`5CwlUIj0LuLB>T5@K4!H|zs2n}zfoaU;Nw>!F`vn@#AxN@#CpVhwB&QQ)ouo{kZW#Z z?NTGnmWgu-nd&p_5}QTlxU-z)M;b(8my>$)TDvptYGFKUp}P;66soeUZC9(pg(|4& zz~*v>xp}L{Lt(Z$CJQ$u&O(&IsmY4AYSuO6M9UIZw>a6I zUiXoERcGo_yKYtg&0AmQPe)3n=ntk_ z^xeocJy-HfSxIZE*J>2`$B`Ec;ikSgT2UaS$aCj+4h*r$VE)nQm^Ymj>|*Je?`nr@VO z56qV}M>%A!dYXr0-*yG2aBf$+=UH1ujWuPsHTt(!qiFy`QV-H>DwKLa>x?wEbw(7? 
z83iX5InpwH2R<>vN`p2Vv`Zsw2zl@qW%d~OBP^+*k4fh3R?jS}2YQ!OSSH3vVl9}S zJXm-CawnlxCbwK#V7T66JIbU@*GHnA^S0DIRChAd3HtPFoMVn&(VSoynT+pQ0tDau#B;J!OTMz}# z#~#u<2^ui$6r~K;*>t|bW2S>YMeqly^AR4P1Q|9P^o;5%|E_SVNHzHjepPB zNj4Jod=G(!sOm1U$CGCz6MC+m)%=esn=+>AQ+hCcn{gU?N}3*`g(IlMAd8s+r{=;?tBG zV{tTDDU3a@gf*!ZpsIEU3dD*8Ge_8f&|BFqRZxFE4*1hBXfBEAz>e zPuK5|e zc9kz%)bmvlZP#aL;a<*6HZ@tJRbVR+w#0rgUj%e1gki}pqFT+~(ULZ^*9%qmWAAIR z$wqp$YY?jX7F>Z|V?WU0a5lJ)+T@r*nA@x!co3UBq{|7jZ9Vn0pESCB_mHOrGFcnp zxv4xNnEMsT*) z&NLfF*p=qF&0T43KuLKUr4`5%bXGZvmq;J0=gvM^TXHRqU)$FfZK%|BTrQ^B!>ea& z6SXs*iQ!{V{9i?^JAqi3;{Wu1|5u3Re-7RNuY(uBv)~N44-A06AfA66d=Z%76u2Aw z4Ke%;@DlhA_!hVZo(EqAKInjT&;Stc=g}DCr z;0NF}@LeFm1#l946x<8mMBi?LR{;m7!Kc6^Q2pLN;4N|#8YnbSXrRzQp@BjJg$DkQ zH9*lZ0>@bR6dmFr37SghvF%0NG3YrePO@WtzO3RMJBXV&#ZJ?Zt*nzO2pAn_G}>I3 z++xGKld(y)*chvitIwzkNNIyDWZWzdIj--@Hk}(qrl9j9s*PIgk}Ej|MSh9XCjThoaiEvqY9kyYo zufh;{j*BPMr3utBHOZ>or^;w8jmz@WT@oP@u@y0ra?`|NtnPlQ4$^|^h)9Eiy(AsU z;u|G%76>IBS<5@84%>AQ(^XN|4M%SUc|p~qY95#NIE*?Yn`VRYC}S$}qW_g4P=R`> zBvh|d19^aKiq_^@y@BKE{J4@Vvm-=C^N#6Bb&sS^b^3+=sS~e;+;W}myo1jSlvOyN?n69s`!06qGj@hduUPYgOCqo4;)6@4sTMrVP&DZLxek~sz z*3(%K`jXz26a49j;WMGj-FFmmodq5Bc#h&b8oj)NcMF za|OC>TVHR+u7~@$^)Lw40ioxW({Yy6M9NV~*{x50Po%RX_f_g^Xh@BEIyUK&J@a#_ z2<`5coDH$7X{zpFDspxTdy9+6?A9;6m{KBU0`&AbNzJOg^-WIyyBu`F#yG`sSb1n@ zI=RBj(}t&MQ0ptwL_s*VA%!7;Uq~-Y>9_%&9@=!mpY}hQB>Fi@?h3i#yWIiTtZ +#include +#include +#include +#include +#include +#include + +int main(int argc, char**argv) +{ + if (argc != 2) { + printf("Specify edges file\n"); + exit(EXIT_FAILURE); + } + + // Read network + EdgeList net; + net.read(argv[1]); + + // Create weights + srand(1); + uint32_t** weights = new uint32_t* [net.numNodes]; + for (uint32_t i = 0; i < net.numNodes; i++) { + uint32_t numNeighbours = net.neighbours[i][0]; + weights[i] = new uint32_t [numNeighbours]; + for (uint32_t j = 0; j < numNeighbours; j++) { + weights[i][j] = rand() % 100; + } + } + + // Create states + uint32_t* dist = new uint32_t [net.numNodes]; + int* queue = new 
int [net.numNodes]; + int queueSize = 0; + int* queueNext = new int [net.numNodes]; + int queueSizeNext = 0; + bool* inQueue = new bool [net.numNodes]; + for (int i = 0; i < net.numNodes; i++) { + inQueue[i] = false; + dist[i] = 0x7fffffff; + } + + // Set source vertex + dist[2] = 0; + queue[queueSize++] = 2; + + // Start timer + printf("Started\n"); + struct timeval start, finish, diff; + gettimeofday(&start, NULL); + + int iters = 0; + while (queueSize > 0) { + for (int i = 0; i < queueSize; i++) { + uint32_t me = queue[i]; + uint32_t numNeighbours = net.neighbours[me][0]; + for (uint32_t j = 0; j < numNeighbours; j++) { + uint32_t neighbour = net.neighbours[me][j+1]; + uint32_t newDist = dist[me] + weights[me][j]; + if (newDist < dist[neighbour]) { + dist[neighbour] = newDist; + if (!inQueue[neighbour]) { + queueNext[queueSizeNext++] = neighbour; + inQueue[neighbour] = true; + } + } + } + } + queueSize = queueSizeNext; + queueSizeNext = 0; + int32_t* tmp = queue; queue = queueNext; queueNext = tmp; + for (int i = 0; i < queueSize; i++) inQueue[queue[i]] = false; + iters++; + } + + // Stop timer + gettimeofday(&finish, NULL); + + uint64_t sum = 0; + for (int i = 0; i < net.numNodes; i++) + sum += dist[i]; + printf("Sum of distances = %ld\n", sum); + printf("Iterations = %d\n", iters); + + // Display time + timersub(&finish, &start, &diff); + double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + printf("Time = %lf\n", duration); + + return 0; +} diff --git a/apps/POLite/sssp-sync/Run.cpp b/apps/POLite/sssp-sync/Run.cpp index c7953795..b9c174a3 100644 --- a/apps/POLite/sssp-sync/Run.cpp +++ b/apps/POLite/sssp-sync/Run.cpp @@ -22,6 +22,7 @@ int main(int argc, char**argv) // Print max fan-out printf("Max fan-out = %d\n", net.maxFanOut()); + assert(net.minFanOut() > 0); // Connection to tinsel machine HostLink hostLink; diff --git a/apps/POLite/util/sumstats.awk b/apps/POLite/util/sumstats.awk index 4d037cca..f1f70329 100755 --- 
a/apps/POLite/util/sumstats.awk +++ b/apps/POLite/util/sumstats.awk @@ -13,7 +13,7 @@ BEGIN { intraThreadSendCount = 0; interThreadSendCount = 0; interBoardSendCount = 0; - fmax = 225000000; + fmax = 250000000; if (boardsX == "" || boardsY == "") { boardsX = 3; boardsY = 2; From 8d76d9bbd26e7e7df78b8c51ba37381bee2119cc Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 7 Apr 2020 08:40:18 +0100 Subject: [PATCH 24/78] Drop some accidently committed files --- apps/POLite/heat-pc/.asp.cpp.swp | Bin 16384 -> 0 bytes apps/POLite/sssp-pc/.asp.cpp.swp | Bin 16384 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 apps/POLite/heat-pc/.asp.cpp.swp delete mode 100644 apps/POLite/sssp-pc/.asp.cpp.swp diff --git a/apps/POLite/heat-pc/.asp.cpp.swp b/apps/POLite/heat-pc/.asp.cpp.swp deleted file mode 100644 index 2d2936e2dd2badda5887be7d43054d6c78666e79..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16384 zcmeI2U2Ggz700KE3#1_}N`Z$;NH?3>*4E#jdnPkgmO3PM6c2oWmLH!4vHsCYoY8~=0feC*oZ zxTz2VF)RJNo;&yB+;h(TIQMRI{E5ZK*aM^E20kA#jL%+K{#@stsex;zVRZb-`V^(_ z^J#92z~3hiy}qg{9E)6Ea&OEGLpgTt@g+CnW2Ovip+#S#Rv5mAs){Uy1_}+_jt1h$ zsXcJSI5|;QL;ZN^7`ylAh1;p2$XjTj&_JPqLIZ^c3Jnw*C^S%LpwPhoxCWx|fbj+N z;Xu-ltI78rJH8Jl&ojw4m7!0XPd^oZ3k?(+C^S%LpwK{}fkFd?1_})n8YnbSXrRzQ zp@IKK4VboJlpxp?9RT3|KaKxy-eVX)1wR3=fNS7OAOsh|0+O>xGhhxJ0)PFmVf-2V349;;;8E}hD1oEk0Qmc1 z!*~b$8oUIa2Uox%_&7KU2Eak^i@OZtS+EWs1)l(i!8?Zx<8AN<@EVAK15SZ|e#kI> z2Yv&-2fhh5z{B7WxDyP3>mP(}z-e$F`0WP_;~U^>;5l#wbig`jfQ#UM@a~=HC%6f& zgEznnU>mG}6W|DVcK~vNx4{p=w}Au!@WFZTAQ%7#!J9a&`5CwlUIj0LuLB>T5@K4!H|zs2n}zfoaU;Nw>!F`vn@#AxN@#CpVhwB&QQ)ouo{kZW#Z z?NTGnmWgu-nd&p_5}QTlxU-z)M;b(8my>$)TDvptYGFKUp}P;66soeUZC9(pg(|4& zz~*v>xp}L{Lt(Z$CJQ$u&O(&IsmY4AYSuO6M9UIZw>a6I zUiXoERcGo_yKYtg&0AmQPe)3n=ntk_ z^xeocJy-HfSxIZE*J>2`$B`Ec;ikSgT2UaS$aCj+4h*r$VE)nQm^Ymj>|*Je?`nr@VO z56qV}M>%A!dYXr0-*yG2aBf$+=UH1ujWuPsHTt(!qiFy`QV-H>DwKLa>x?wEbw(7? 
z83iX5InpwH2R<>vN`p2Vv`Zsw2zl@qW%d~OBP^+*k4fh3R?jS}2YQ!OSSH3vVl9}S zJXm-CawnlxCbwK#V7T66JIbU@*GHnA^S0DIRChAd3HtPFoMVn&(VSoynT+pQ0tDau#B;J!OTMz}# z#~#u<2^ui$6r~K;*>t|bW2S>YMeqly^AR4P1Q|9P^o;5%|E_SVNHzHjepPB zNj4Jod=G(!sOm1U$CGCz6MC+m)%=esn=+>AQ+hCcn{gU?N}3*`g(IlMAd8s+r{=;?tBG zV{tTDDU3a@gf*!ZpsIEU3dD*8Ge_8f&|BFqRZxFE4*1hBXfBEAz>e zPuK5|e zc9kz%)bmvlZP#aL;a<*6HZ@tJRbVR+w#0rgUj%e1gki}pqFT+~(ULZ^*9%qmWAAIR z$wqp$YY?jX7F>Z|V?WU0a5lJ)+T@r*nA@x!co3UBq{|7jZ9Vn0pESCB_mHOrGFcnp zxv4xNnEMsT*) z&NLfF*p=qF&0T43KuLKUr4`5%bXGZvmq;J0=gvM^TXHRqU)$FfZK%|BTrQ^B!>ea& z6SXs*iQ!{V{9i?^JAqi3;{Wu1|5u3Re-7RNuY(uBv)~N44-A06AfA66d=Z%76u2Aw z4Ke%;@DlhA_!hVZo(EqAKInjT&;Stc=g}DCr z;0NF}@LeFm1#l946x<8mMBi?LR{;m7!Kc6^Q2pLN;4N|#8YnbSXrRzQp@BjJg$DkQ zH9*lZ0>@bR6dmFr37SghvF%0NG3YrePO@WtzO3RMJBXV&#ZJ?Zt*nzO2pAn_G}>I3 z++xGKld(y)*chvitIwzkNNIyDWZWzdIj--@Hk}(qrl9j9s*PIgk}Ej|MSh9XCjThoaiEvqY9kyYo zufh;{j*BPMr3utBHOZ>or^;w8jmz@WT@oP@u@y0ra?`|NtnPlQ4$^|^h)9Eiy(AsU z;u|G%76>IBS<5@84%>AQ(^XN|4M%SUc|p~qY95#NIE*?Yn`VRYC}S$}qW_g4P=R`> zBvh|d19^aKiq_^@y@BKE{J4@Vvm-=C^N#6Bb&sS^b^3+=sS~e;+;W}myo1jSlvOyN?n69s`!06qGj@hduUPYgOCqo4;)6@4sTMrVP&DZLxek~sz z*3(%K`jXz26a49j;WMGj-FFmmodq5Bc#h&b8oj)NcMF za|OC>TVHR+u7~@$^)Lw40ioxW({Yy6M9NV~*{x50Po%RX_f_g^Xh@BEIyUK&J@a#_ z2<`5coDH$7X{zpFDspxTdy9+6?A9;6m{KBU0`&AbNzJOg^-WIyyBu`F#yG`sSb1n@ zI=RBj(}t&MQ0ptwL_s*VA%!7;Uq~-Y>9_%&9@=!mpY}hQB>Fi@?h3i#yWIiTtZ*4E#jdnPkgmO3PM6c2oWmLH!4vHsCYoY8~=0feC*oZ zxTz2VF)RJNo;&yB+;h(TIQMRI{E5ZK*aM^E20kA#jL%+K{#@stsex;zVRZb-`V^(_ z^J#92z~3hiy}qg{9E)6Ea&OEGLpgTt@g+CnW2Ovip+#S#Rv5mAs){Uy1_}+_jt1h$ zsXcJSI5|;QL;ZN^7`ylAh1;p2$XjTj&_JPqLIZ^c3Jnw*C^S%LpwPhoxCWx|fbj+N z;Xu-ltI78rJH8Jl&ojw4m7!0XPd^oZ3k?(+C^S%LpwK{}fkFd?1_})n8YnbSXrRzQ zp@IKK4VboJlpxp?9RT3|KaKxy-eVX)1wR3=fNS7OAOsh|0+O>xGhhxJ0)PFmVf-2V349;;;8E}hD1oEk0Qmc1 z!*~b$8oUIa2Uox%_&7KU2Eak^i@OZtS+EWs1)l(i!8?Zx<8AN<@EVAK15SZ|e#kI> z2Yv&-2fhh5z{B7WxDyP3>mP(}z-e$F`0WP_;~U^>;5l#wbig`jfQ#UM@a~=HC%6f& 
zgEznnU>mG}6W|DVcK~vNx4{p=w}Au!@WFZTAQ%7#!J9a&`5CwlUIj0LuLB>T5@K4!H|zs2n}zfoaU;Nw>!F`vn@#AxN@#CpVhwB&QQ)ouo{kZW#Z z?NTGnmWgu-nd&p_5}QTlxU-z)M;b(8my>$)TDvptYGFKUp}P;66soeUZC9(pg(|4& zz~*v>xp}L{Lt(Z$CJQ$u&O(&IsmY4AYSuO6M9UIZw>a6I zUiXoERcGo_yKYtg&0AmQPe)3n=ntk_ z^xeocJy-HfSxIZE*J>2`$B`Ec;ikSgT2UaS$aCj+4h*r$VE)nQm^Ymj>|*Je?`nr@VO z56qV}M>%A!dYXr0-*yG2aBf$+=UH1ujWuPsHTt(!qiFy`QV-H>DwKLa>x?wEbw(7? z83iX5InpwH2R<>vN`p2Vv`Zsw2zl@qW%d~OBP^+*k4fh3R?jS}2YQ!OSSH3vVl9}S zJXm-CawnlxCbwK#V7T66JIbU@*GHnA^S0DIRChAd3HtPFoMVn&(VSoynT+pQ0tDau#B;J!OTMz}# z#~#u<2^ui$6r~K;*>t|bW2S>YMeqly^AR4P1Q|9P^o;5%|E_SVNHzHjepPB zNj4Jod=G(!sOm1U$CGCz6MC+m)%=esn=+>AQ+hCcn{gU?N}3*`g(IlMAd8s+r{=;?tBG zV{tTDDU3a@gf*!ZpsIEU3dD*8Ge_8f&|BFqRZxFE4*1hBXfBEAz>e zPuK5|e zc9kz%)bmvlZP#aL;a<*6HZ@tJRbVR+w#0rgUj%e1gki}pqFT+~(ULZ^*9%qmWAAIR z$wqp$YY?jX7F>Z|V?WU0a5lJ)+T@r*nA@x!co3UBq{|7jZ9Vn0pESCB_mHOrGFcnp zxv4xNnEMsT*) z&NLfF*p=qF&0T43KuLKUr4`5%bXGZvmq;J0=gvM^TXHRqU)$FfZK%|BTrQ^B!>ea& z6SXs*iQ!{V{9i?^JAqi3;{Wu1|5u3Re-7RNuY(uBv)~N44-A06AfA66d=Z%76u2Aw z4Ke%;@DlhA_!hVZo(EqAKInjT&;Stc=g}DCr z;0NF}@LeFm1#l946x<8mMBi?LR{;m7!Kc6^Q2pLN;4N|#8YnbSXrRzQp@BjJg$DkQ zH9*lZ0>@bR6dmFr37SghvF%0NG3YrePO@WtzO3RMJBXV&#ZJ?Zt*nzO2pAn_G}>I3 z++xGKld(y)*chvitIwzkNNIyDWZWzdIj--@Hk}(qrl9j9s*PIgk}Ej|MSh9XCjThoaiEvqY9kyYo zufh;{j*BPMr3utBHOZ>or^;w8jmz@WT@oP@u@y0ra?`|NtnPlQ4$^|^h)9Eiy(AsU z;u|G%76>IBS<5@84%>AQ(^XN|4M%SUc|p~qY95#NIE*?Yn`VRYC}S$}qW_g4P=R`> zBvh|d19^aKiq_^@y@BKE{J4@Vvm-=C^N#6Bb&sS^b^3+=sS~e;+;W}myo1jSlvOyN?n69s`!06qGj@hduUPYgOCqo4;)6@4sTMrVP&DZLxek~sz z*3(%K`jXz26a49j;WMGj-FFmmodq5Bc#h&b8oj)NcMF za|OC>TVHR+u7~@$^)Lw40ioxW({Yy6M9NV~*{x50Po%RX_f_g^Xh@BEIyUK&J@a#_ z2<`5coDH$7X{zpFDspxTdy9+6?A9;6m{KBU0`&AbNzJOg^-WIyyBu`F#yG`sSb1n@ zI=RBj(}t&MQ0ptwL_s*VA%!7;Uq~-Y>9_%&9@=!mpY}hQB>Fi@?h3i#yWIiTtZ Date: Tue, 7 Apr 2020 09:38:02 +0100 Subject: [PATCH 25/78] Forward port EdgeList.h (From 0.6.3) --- include/EdgeList.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/include/EdgeList.h 
b/include/EdgeList.h index 7d03bb8f..efb65c73 100644 --- a/include/EdgeList.h +++ b/include/EdgeList.h @@ -16,7 +16,7 @@ struct EdgeList { uint32_t** neighbours; // Read network from file - void read(const char* filename) + void read(const char* filename, bool warn = true) { // Read edges FILE* fp = fopen(filename, "rt"); @@ -62,6 +62,11 @@ struct EdgeList { // Release free(count); fclose(fp); + + if (warn && minFanOut() == 0) { + printf("Warning: some vertices have no outgoing edges and\n"); + printf(" some POLite apps do not handle this case.\n"); + } } // Determine max fan-out @@ -73,6 +78,17 @@ struct EdgeList { } return max; } + + // Determine min fan-out + uint32_t minFanOut() { + uint32_t min = ~0; + for (uint32_t i = 0; i < numNodes; i++) { + uint32_t numNeighbours = neighbours[i][0]; + if (numNeighbours < min) min = numNeighbours; + } + return min; + } + }; #endif From 8f3a15a46a0afad0c85659ee391c5b65b7046d00 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 7 Apr 2020 09:38:38 +0100 Subject: [PATCH 26/78] Renaming routing tables to thread routing tables This is to distinguish (existing) thread-level routing tables from board-level routing tables (to come). 
--- include/POLite/PGraph.h | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/include/POLite/PGraph.h b/include/POLite/PGraph.h index 4181c3da..3ac93eae 100644 --- a/include/POLite/PGraph.h +++ b/include/POLite/PGraph.h @@ -56,7 +56,7 @@ template *** outTable; // Sequence of incoming edges for every thread @@ -365,9 +365,9 @@ template >**) calloc(TinselMaxThreads,sizeof(Seq>*)); @@ -407,9 +407,9 @@ template >* receivers) { + uint32_t findThreadKey(Seq>* receivers) { uint32_t key = 0; bool found = false; @@ -445,9 +445,9 @@ template >* receivers) { - uint32_t key = findKey(receivers); + uint32_t key = findThreadKey(receivers); if (key >= 0xfffe) { - printf("Routing key exceeds 16 bits\n"); + printf("Thread routing key exceeds 16 bits\n"); exit(EXIT_FAILURE); } PInEdge null, unused; @@ -475,9 +475,9 @@ template Date: Fri, 10 Apr 2020 19:04:00 +0100 Subject: [PATCH 27/78] Update POLite to use ProgRouters (Builds but untested) --- Makefile | 9 +- apps/POLite/ping-test/Makefile | 6 - apps/POLite/ping-test/Run.cpp | 57 ----- apps/POLite/ping-test/ping.cpp | 23 --- apps/POLite/ping-test/ping.h | 54 ----- apps/POLite/util/genld.sh | 2 +- config.py | 6 + include/POLite/PDevice.h | 82 ++------ include/POLite/PGraph.h | 149 ++++++------- include/POLite/ProgRouters.h | 367 +++++++++++++++++++++++++++++++++ include/POLite/Seq.h | 20 +- rtl/ProgRouter.bsv | 14 +- 12 files changed, 487 insertions(+), 302 deletions(-) delete mode 100644 apps/POLite/ping-test/Makefile delete mode 100644 apps/POLite/ping-test/Run.cpp delete mode 100644 apps/POLite/ping-test/ping.cpp delete mode 100644 apps/POLite/ping-test/ping.h create mode 100644 include/POLite/ProgRouters.h diff --git a/Makefile b/Makefile index d52882f7..d95602f9 100644 --- a/Makefile +++ b/Makefile @@ -27,13 +27,18 @@ clean: make -C apps/progrouter clean make -C apps/POLite/heat-gals clean make -C apps/POLite/heat-sync clean + make -C apps/POLite/heat-cube-sync clean + make 
-C apps/POLite/heat-grid-sync clean make -C apps/POLite/asp-gals clean make -C apps/POLite/asp-sync clean - make -C apps/POLite/asp-pc clean make -C apps/POLite/pagerank-sync clean make -C apps/POLite/pagerank-gals clean + make -C apps/POLite/sssp-sync clean make -C apps/POLite/sssp-async clean - make -C apps/POLite/ping-test clean make -C apps/POLite/clocktree-async clean + make -C apps/POLite/izhikevich-gals clean + make -C apps/POLite/izhikevich-sync clean + make -C apps/POLite/pressure-sync clean + make -C apps/POLite/hashmin-sync clean make -C bin clean make -C tests clean diff --git a/apps/POLite/ping-test/Makefile b/apps/POLite/ping-test/Makefile deleted file mode 100644 index 7e85d2c6..00000000 --- a/apps/POLite/ping-test/Makefile +++ /dev/null @@ -1,6 +0,0 @@ -# SPDX-License-Identifier: BSD-2-Clause -APP_CPP = ping.cpp -APP_HDR = ping.h -RUN_CPP = Run.cpp - -include ../util/polite.mk diff --git a/apps/POLite/ping-test/Run.cpp b/apps/POLite/ping-test/Run.cpp deleted file mode 100644 index 57ac5441..00000000 --- a/apps/POLite/ping-test/Run.cpp +++ /dev/null @@ -1,57 +0,0 @@ -// SPDX-License-Identifier: BSD-2-Clause -#include "ping.h" - -#include -#include -#include -#include -#include -#include - -int main(int argc, char**argv) -{ - // Connection to tinsel machine - HostLink hostLink; - - // Create POETS graph - PGraph graph; - - // Create single ping device - PDeviceId id = graph.newDevice(); - - // Prepare mapping from graph to hardware - graph.map(); - - // Write graph down to tinsel machine via HostLink - graph.write(&hostLink); - - // Load code and trigger execution - hostLink.boot("code.v", "data.v"); - hostLink.go(); - - printf("Ping started\n"); - - // Consume performance stats - //politeSaveStats(&hostLink, "stats.txt"); - - int test = 0; - int deviceAddr = graph.toDeviceAddr[id]; - printf("deviceAddr = %d\n", deviceAddr); - while (test < 100) { - // Send ping - PMessage sendMsg; - sendMsg.devId = getLocalDeviceId(deviceAddr); - sendMsg.payload.test 
= test; - hostLink.send(getThreadId(deviceAddr), 1, &sendMsg); - printf("Sent %d to device\n", sendMsg.payload.test); - - // Receive pong - PMessage recvMsg; - hostLink.recvMsg(&recvMsg, sizeof(recvMsg)); - printf("Received %d from device\n", recvMsg.payload.test); - - test++; - } - - return 0; -} diff --git a/apps/POLite/ping-test/ping.cpp b/apps/POLite/ping-test/ping.cpp deleted file mode 100644 index 74960d36..00000000 --- a/apps/POLite/ping-test/ping.cpp +++ /dev/null @@ -1,23 +0,0 @@ -// SPDX-License-Identifier: BSD-2-Clause -#include "ping.h" - -#include -#include - -typedef PThread< - PingDevice, - PingState, // State - None, // Edge label - PingMessage // Message - > PingThread; - -int main() -{ - // Point thread structure at base of thread's heap - PingThread* thread = (PingThread*) tinselHeapBaseSRAM(); - - // Invoke interpreter - thread->run(); - - return 0; -} diff --git a/apps/POLite/ping-test/ping.h b/apps/POLite/ping-test/ping.h deleted file mode 100644 index 3d4c17de..00000000 --- a/apps/POLite/ping-test/ping.h +++ /dev/null @@ -1,54 +0,0 @@ -// SPDX-License-Identifier: BSD-2-Clause -// Test messaging between host and threads. 
- -#ifndef _ping_H_ -#define _ping_H_ - -//#define POLITE_DUMP_STATS -//#define POLITE_COUNT_MSGS - -// Lightweight POETS frontend -#include - -struct PingMessage { - uint32_t test; -}; - -struct PingState { - // Number received to be sent back to host - uint32_t test; -}; - -struct PingDevice : PDevice { - // Called once by POLite at start of execution - void init() { - // Do nothing until a message is received from the host - *readyToSend = No; - } - - // Receive handler - inline void recv(PingMessage* msg, None* edge) { - // Store number from host to send back to host - s->test = msg->test; - *readyToSend = HostPin; - } - - // Send handler - inline void send(volatile PingMessage* msg) { - // Put received value back in message for host to check - msg->test = s->test; - *readyToSend = No; - } - - // Called by POLite when system becomes idle - inline bool step() { - return true; // Never terminate - } - - // Optionally send message to host on termination - inline bool finish(volatile PingMessage* msg) { - return false; - } -}; - -#endif diff --git a/apps/POLite/util/genld.sh b/apps/POLite/util/genld.sh index 0350108e..474e5694 100755 --- a/apps/POLite/util/genld.sh +++ b/apps/POLite/util/genld.sh @@ -18,7 +18,7 @@ OUTPUT_ARCH( "riscv" ) MEMORY { instrs : ORIGIN = $MaxBootImageBytes, LENGTH = $MaxInstrBytes - globals : ORIGIN = $DRAMBase, LENGTH = $DRAMGlobalsLength + globals : ORIGIN = $DRAMBase, LENGTH = $POLiteDRAMGlobalsLength } SECTIONS diff --git a/config.py b/config.py index 50bc3480..a4099e7e 100755 --- a/config.py +++ b/config.py @@ -365,6 +365,12 @@ def quoted(s): return "'\"" + s + "\"'" # DRAM base and length p["DRAMBase"] = 3 * (2 ** p["LogBytesPerSRAM"]) p["DRAMGlobalsLength"] = 2 ** (p["LogBytesPerDRAM"] - 1) - p["DRAMBase"] +p["POLiteDRAMGlobalsLength"] = 2 ** 14 +p["POLiteProgRouterBase"] = p["DRAMBase"] + p["POLiteDRAMGlobalsLength"] +p["POLiteProgRouterLength"] = (p["DRAMGlobalsLength"] - + p["POLiteDRAMGlobalsLength"]) + +# POLite globals # 
Number of FPGA boards per box (including bridge board) p["BoardsPerBox"] = p["MeshXLenWithinBox"] * p["MeshYLenWithinBox"] + 1 diff --git a/include/POLite/PDevice.h b/include/POLite/PDevice.h index 9eefda3a..b5f99340 100644 --- a/include/POLite/PDevice.h +++ b/include/POLite/PDevice.h @@ -54,9 +54,8 @@ inline PLocalDeviceId getLocalDeviceId(PDeviceAddr addr) { return addr >> 19; } // What's the max allowed local device address? inline uint32_t maxLocalDeviceId() { return 8192; } -// Routing key -typedef uint16_t Key; -#define InvalidKey 0xffff +// Index into the per-thread in-edge table +typedef uint16_t InTableKey; // Pins // No - means 'not ready to send' @@ -92,8 +91,8 @@ template struct PDevice { // Generic device state structure template struct ALIGNED PState { - // Pointer to base of neighbours arrays - uint16_t pinBase[POLITE_NUM_PINS]; + // Board-level routing key for each outgoing pin + uint32_t pin[POLITE_NUM_PINS]; // Ready-to-send status PPin readyToSend; // Custom state @@ -103,22 +102,11 @@ template struct ALIGNED PState { // Message structure template struct PMessage { // Source-based routing key - Key key; + InTableKey key; // Application message M payload; }; -// An outgoing edge from a device -struct POutEdge { - // Destination mailbox - uint16_t mbox; - // Routing key - uint16_t key; - // Destination threads - uint32_t threadMaskLow; - uint32_t threadMaskHigh; -}; - // An incoming edge to a device (labelleled) template struct PInEdge { // Destination device @@ -137,16 +125,6 @@ template <> struct PInEdge { }; }; -// Helper function: Count board hops between two threads -inline uint32_t hopsBetween(uint32_t t0, uint32_t t1) { - uint32_t xmask = ((1<> (TinselLogThreadsPerBoard + TinselMeshXBits); - int32_t x0 = (t0 >> TinselLogThreadsPerBoard) & xmask; - int32_t y1 = t1 >> (TinselLogThreadsPerBoard + TinselMeshXBits); - int32_t x1 = (t1 >> TinselLogThreadsPerBoard) & xmask; - return (abs(x0-x1) + abs(y0-y1)); -} - // Generic thread structure 
template struct PThread { @@ -159,8 +137,7 @@ template ) devices; - // Pointer to base of routing tables - PTR(POutEdge) outTableBase; + // Pointer to base of in table PTR(PInEdge) inTableBase; // Array of local device ids are ready to send PTR(PLocalDeviceId) senders; @@ -218,17 +195,6 @@ template > TinselLogThreadsPerMailbox; - outHost[0].key = 0; - outHost[1].key = InvalidKey; - // Initialise outEdge to null terminator - outEdge = &outHost[1]; - // Did last call to step handler request a new time step? bool active = true; @@ -252,29 +218,10 @@ template key != InvalidKey) { - if (tinselCanSend()) { - PMessage* m = (PMessage*) tinselSendSlot(); - // Send message - m->key = outEdge->key; - tinselMulticast(outEdge->mbox, outEdge->threadMaskHigh, - outEdge->threadMaskLow, m); - #ifdef POLITE_COUNT_MSGS - interThreadSendCount++; - interBoardSendCount += - hopsBetween(outEdge->mbox << TinselLogThreadsPerMailbox, - tinselId()); - #endif - // Move to next neighbour - outEdge++; - } - else - tinselWaitUntil(TINSEL_CAN_SEND|TINSEL_CAN_RECV); - } - else if (sendersTop != senders) { + // Try to send + if (sendersTop != senders) { if (tinselCanSend()) { - // Start new multicast + // Get next sender PLocalDeviceId src = *(--sendersTop); // Lookup device DeviceType dev = getDevice(src); @@ -284,13 +231,14 @@ template payload); // Reinsert sender, if it still wants to send if (*dev.readyToSend != No) sendersTop++; - // Determine out-edge array for sender + // Is it a send to the host pin or a user pin? 
if (pin == HostPin) - outEdge = outHost; + tinselSend(tinselHostId(), m); else - outEdge = (POutEdge*) &outTableBase[ - devices[src].pinBase[pin-2] - ]; + tinselKeySend(devices[src].pin[pin-2], m); + #ifdef POLITE_COUNT_MSGS + interThreadSendCount++; + #endif } else tinselWaitUntil(TINSEL_CAN_SEND|TINSEL_CAN_RECV); diff --git a/include/POLite/PGraph.h b/include/POLite/PGraph.h index 3ac93eae..b6a1245a 100644 --- a/include/POLite/PGraph.h +++ b/include/POLite/PGraph.h @@ -12,8 +12,8 @@ #include #include #include +#include #include -#include "Seq.h" // Nodes of a POETS graph are devices typedef NodeId PDeviceId; @@ -56,12 +56,21 @@ template *** outTable; + // Out table (sender-side edge tables) + // Sequence of destinations for every (device, pin) pair + Seq*** outTable; + + // Key table (sender-side key tables) + // Global routing key for every (device, pin) pair + uint32_t** keyTable; + + // In table (receiver-side edge tables) // Sequence of incoming edges for every thread Seq>** inTable; + // Mesh of per-board programmable routers + ProgRouterMesh* routingTables; + // Generic constructor void constructor(uint32_t lenX, uint32_t lenY) { meshLenX = lenX; @@ -82,14 +91,12 @@ template )); @@ -214,15 +216,6 @@ template numElems * sizeof(PInEdge); sizeEIMem = wordAlign(sizeEIMem); } - // Add space for outgoing edge table - for (uint32_t devNum = 0; devNum < numDevs; devNum++) { - PDeviceId id = fromDeviceAddr[threadId][devNum]; - for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) { - Seq* edges = outTable[id][p]; - sizeEOMem += sizeof(POutEdge) * edges->numElems; - } - } - sizeEOMem = wordAlign(sizeEOMem); // The total partition size including uninitialised portions uint32_t totalSizeVMem = sizeVMem + wordAlign(sizeof(PLocalDeviceId) * numDevs); @@ -233,8 +226,6 @@ template maxDRAMSize) { printf("Error: max DRAM partition size exceeded\n"); exit(EXIT_FAILURE); @@ -247,15 +238,12 @@ template * thread = (PThread*) &threadMem[threadId][0]; @@ -309,8 +288,7 @@ template 
numVertices = numDevices; // Set tinsel address of array of device states thread->devices = vertexMemBase[threadId]; - // Set tinsel address of base of edge tables - thread->outTableBase = outEdgeMemBase[threadId]; + // Set tinsel address of base of in-edge table thread->inTableBase = inEdgeMemBase[threadId]; // Add space for each device on thread uint32_t numDevs = numDevicesOnThread[threadId]; @@ -326,14 +304,8 @@ template * dev = devices[id]; // Initialise - POutEdge* outEdgeArray = (POutEdge*) outEdgeMem[threadId]; for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) { - dev->pinBase[p] = nextOutIndex; - Seq* edges = outTable[id][p]; - for (uint32_t i = 0; i < edges->numElems; i++) { - outEdgeArray[nextOutIndex] = edges->elems[i]; - nextOutIndex++; - } + dev->pin[p] = keyTable[id][p]; } } // Intialise thread's in edges @@ -348,10 +320,6 @@ template senders = vertexMemBase[threadId] + nextVMem; } @@ -365,9 +333,9 @@ template >**) calloc(TinselMaxThreads,sizeof(Seq>*)); @@ -377,13 +345,18 @@ template ***) calloc(numDevices, sizeof(Seq**)); + outTable = (Seq***) + calloc(numDevices, sizeof(Seq**)); for (uint32_t d = 0; d < numDevices; d++) { - outTable[d] = (Seq**) - calloc(POLITE_NUM_PINS, sizeof(Seq*)); + outTable[d] = (Seq**) + calloc(POLITE_NUM_PINS, sizeof(Seq*)); for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) - outTable[d][p] = new SmallSeq; + outTable[d][p] = new SmallSeq; } + + keyTable = new uint32_t* [numDevices]; + for (uint32_t d = 0; d < numDevices; d++) + keyTable[d] = new uint32_t [POLITE_NUM_PINS]; } // Pack a receivers array @@ -407,9 +380,9 @@ template >* receivers) { + uint32_t findInTableKey(Seq>* receivers) { uint32_t key = 0; bool found = false; @@ -445,9 +418,9 @@ template >* receivers) { - uint32_t key = findThreadKey(receivers); + uint32_t key = findInTableKey(receivers); if (key >= 0xfffe) { - printf("Thread routing key exceeds 16 bits\n"); + printf("In-table routing key exceeds 16 bits\n"); exit(EXIT_FAILURE); } PInEdge null, unused; @@ -475,9 
+448,9 @@ template append(term); } } //printf("Average edges per pin: %lu\n", @@ -579,11 +548,6 @@ template > + TinselLogThreadsPerMailbox; + for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) + keyTable[d][p] = routingTables->addDestsFromBoard(src, outTable[d][p]); + } // Stop routing timer and start init timer - gettimeofday(&threadRoutingFinish, NULL); + gettimeofday(&routingFinish, NULL); gettimeofday(&initStart, NULL); // Reallocate and initialise heap structures @@ -704,9 +683,9 @@ template write(hostLink); hostLink->flush(); hostLink->useSendBuffer = useSendBufferOld; diff --git a/include/POLite/ProgRouters.h b/include/POLite/ProgRouters.h new file mode 100644 index 00000000..34f62694 --- /dev/null +++ b/include/POLite/ProgRouters.h @@ -0,0 +1,367 @@ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef _PROGROUTERS_H_ +#define _PROGROUTERS_H_ + +#include +#include +#include +#include +#include +#include + +// ============================= +// Per-board programmable router +// ============================= + +class ProgRouter { + + // Number of chunks used so far in current beat + uint32_t numChunks; + + // Number of records used so far in current beat + uint32_t numRecords; + + // Number of beats associated with current key + uint32_t numBeats; + + // Index of RAM currently being used + uint32_t currentRAM; + + // Pointer to previously created indirection + // (We need indirections to handle record sequences of 31 beats or more) + uint8_t* prevInd; + + // Move on to next the beat + void nextBeat() { + // Set number of records in current beat + uint32_t beatBase = table[currentRAM]->numElems - 32; + uint8_t* beat = &table[currentRAM]->elems[beatBase]; + beat[31] = 0; + beat[30] = numRecords; + numChunks = numRecords = 0; + // Allocate new beat, and check for overflow + numBeats++; + table[currentRAM]->extendBy(32); + if (table[currentRAM]->numElems >= (TinselPOLiteProgRouterLength-1024)) { + printf("ProgRouter out of memory\n"); + exit(EXIT_FAILURE); + } + // We 
need indirections to handle sequences of 31 beats or more + if ((numBeats % 31) == 0) { + // Set previous indirection, if there is one + if (prevInd) { + uint32_t key = TinselPOLiteProgRouterBase + + table[currentRAM]->numElems - 31*32; + if (currentRAM) key |= 0x80000000; + key |= 31; + setIND(prevInd, key); + } + prevInd = addIND(); + } + } + + // Get current record pointer for 48-bit entry + inline uint8_t* currentRecord48() { + uint32_t beatBase = (table[currentRAM]->numElems-32) + 6*(4-numChunks); + return &table[currentRAM]->elems[beatBase]; + } + + // Get current record pointer for 96-bit entry + inline uint8_t* currentRecord96() { + uint32_t beatBase = (table[currentRAM]->numElems-32) + 6*(3-numChunks); + return &table[currentRAM]->elems[beatBase]; + } + + public: + + // A table holding encoded routing beats for each RAM + Seq** table; + + // Constructor + ProgRouter() { + // Currently we assume two RAMs per board + assert(TinselDRAMsPerBoard == 2); + // Initialise member variables + prevInd = NULL; + numBeats = 1; + numChunks = numRecords = currentRAM = 0; + // Allocate one sequence per RAM + table = new Seq* [TinselDRAMsPerBoard]; + // Initially each sequence is 32MB + for (int i = 0; i < TinselDRAMsPerBoard; i++) { + table[i] = new Seq (1 << 15); + // Allocate first beat + table[i]->extendBy(32); + } + } + + // Destructor + ~ProgRouter() { + for (int i = 0; i < TinselDRAMsPerBoard; i++) delete table[i]; + delete [] table; + } + + // Generate a new key for the records added + uint32_t genKey() { + // Determine index of first beat in record sequence + uint32_t index = table[currentRAM]->numElems - numBeats*32; + // Determine final key length + uint32_t finalKeyLen = prevInd ? 
31 : numBeats; + // Insert outstanding indirection, if there is one + if (prevInd) { + // Set previous indirection to latest block of beats + uint32_t indKey = TinselPOLiteProgRouterBase + + table[currentRAM]->numElems - (numBeats%31)*32; + if (currentRAM) indKey |= 0x80000000; + indKey |= (numBeats%31); + setIND(prevInd, indKey); + } + // Determine final key + uint32_t key = TinselPOLiteProgRouterBase + index; + if (currentRAM) key |= 0x80000000; + key |= finalKeyLen; + // Move to next beat + nextBeat(); + numBeats = 1; + prevInd = NULL; + // Pick smaller RAM for next key + currentRAM = table[0]->numElems < table[1]->numElems ? 0 : 1; + return key; + } + + // Add an IND record to the table + // Return a pointer to the indirection key, + // so it can be set later by the caller + uint8_t* addIND() { + if (numChunks == 5) nextBeat(); + uint8_t* ptr = currentRecord48(); + ptr[5] = 4 << 5; + numChunks++; + numRecords++; + return ptr; + } + + // Set indirection key + void setIND(uint8_t* ind, uint32_t key) { + ind[0] = key; + ind[1] = key >> 8; + ind[2] = key >> 16; + ind[3] = key >> 24; + } + + // Add an MRM record to the table + void addMRM(uint32_t mboxX, uint32_t mboxY, + uint32_t threadsHigh, uint32_t threadsLow, + uint16_t localKey) { + if (numChunks >= 4) nextBeat(); + uint8_t* ptr = currentRecord96(); + ptr[0] = threadsLow; + ptr[1] = threadsLow >> 8; + ptr[2] = threadsLow >> 16; + ptr[3] = threadsLow >> 24; + ptr[4] = threadsHigh; + ptr[5] = threadsHigh >> 8; + ptr[6] = threadsHigh >> 16; + ptr[7] = threadsHigh >> 24; + ptr[8] = localKey; + ptr[9] = localKey >> 8; + ptr[11] = (3 << 5) | (mboxY << 3) | (mboxX << 1); + numChunks += 2; + numRecords++; + } + + // Add an RR record to the table + void addRR(uint32_t dir, uint32_t key) { + if (numChunks == 5) nextBeat(); + uint8_t* ptr = currentRecord48(); + ptr[0] = key; + ptr[1] = key >> 8; + ptr[2] = key >> 16; + ptr[3] = key >> 24; + ptr[5] = (2 << 5) | (dir << 3); + numChunks++; + numRecords++; + } +}; + +// 
================================== +// Data type for routing destinations +// ================================== + +struct PRoutingDest { + // Destination mailbox + uint32_t mbox; + // Thread-level routing key + uint16_t key; + // Destination threads + uint32_t threadMaskLow; + uint32_t threadMaskHigh; +}; + +// Extract board X coord from routing dest +inline uint32_t destX(uint32_t mbox) { + uint32_t x = mbox >> (TinselMailboxMeshXBits + TinselMailboxMeshYBits); + return x & ((1<> (TinselMailboxMeshXBits + + TinselMailboxMeshYBits + TinselMeshXBits); + return y & ((1<> TinselMailboxMeshXBits) & + ((1<* dests) { + assert(dests->numElems > 0); + + // Categorise non-local dests into local, N, S, E, and W groups + Seq local(dests->numElems); + Seq north(dests->numElems); + Seq south(dests->numElems); + Seq east(dests->numElems); + Seq west(dests->numElems); + for (int i = 0; i < dests->numElems; i++) { + PRoutingDest dest = dests->elems[i]; + uint32_t receiverX = destX(dest.mbox); + uint32_t receiverY = destY(dest.mbox); + if (receiverX < senderX) east.append(dest); + else if (receiverX > senderX) west.append(dest); + else if (receiverY < senderY) south.append(dest); + else if (receiverY > senderY) north.append(dest); + else local.append(dest); + } + + // Recurse on non-local groups and add RR records on return + if (north.numElems > 0) { + uint32_t key = addDestsFromBoardXY(senderX, senderY+1, &north); + table[senderY][senderX].addRR(0, key); + } + if (south.numElems > 0) { + uint32_t key = addDestsFromBoardXY(senderX, senderY-1, &south); + table[senderY][senderX].addRR(1, key); + } + if (east.numElems > 0) { + uint32_t key = addDestsFromBoardXY(senderX+1, senderY, &east); + table[senderY][senderX].addRR(2, key); + } + if (west.numElems > 0) { + uint32_t key = addDestsFromBoardXY(senderX-1, senderY, &west); + table[senderY][senderX].addRR(3, key); + } + + // Add local records + for (int i = 0; i < local.numElems; i++) { + PRoutingDest dest = local.elems[i]; + 
table[senderY][senderX].addMRM(destMboxX(dest.mbox), + destMboxY(dest.mbox), dest.threadMaskHigh, + dest.threadMaskLow, dest.key); + } + + return table[senderY][senderX].genKey(); + } + + // Add routing destinations from given global mailbox id + uint32_t addDestsFromBoard(uint32_t mbox, Seq* dests) { + addDestsFromBoardXY(destX(mbox), destY(mbox), dests); + } + + // Write routing tables to memory via HostLink + void write(HostLink* hostLink) { + // Request to boot loader + BootReq req; + + // Compute number of cores per DRAM + const uint32_t coresPerDRAM = 1 << + (TinselLogCoresPerDCache + TinselLogDCachesPerDRAM); + + // Initialise write address for each routing table + for (int y = 0; y < boardsY; y++) { + for (int x = 0; x < boardsX; x++) { + for (int i = 0; i < TinselDRAMsPerBoard; i++) { + // Use one core to initialise each DRAM + uint32_t dest = hostLink->toAddr(x, y, coresPerDRAM * i, 0); + req.cmd = SetAddrCmd; + req.numArgs = 1; + req.args[0] = TinselPOLiteProgRouterBase; + hostLink->send(dest, 1, &req); + // Ensure space for an extra 32 bytes in each + // table so we don't have to check for overflow below + // when consuming the tables in chunks of 12 bytes + table[y][x].table[i]->ensureSpaceFor(32); + } + } + } + + // Write each routing table + bool allDone = false; + uint32_t offset = 0; + while (! 
allDone) { + allDone = true; + for (int y = 0; y < boardsY; y++) { + for (int x = 0; x < boardsX; x++) { + for (int i = 0; i < TinselDRAMsPerBoard; i++) { + Seq* seq = table[y][x].table[i]; + if (offset < seq->numElems) { + uint32_t dest = hostLink->toAddr(x, y, coresPerDRAM * i, 0); + allDone = false; + req.cmd = StoreCmd; + req.numArgs = 3; + req.args[0] = ((uint32_t*) seq->elems)[0]; + req.args[1] = ((uint32_t*) seq->elems)[1]; + req.args[2] = ((uint32_t*) seq->elems)[2]; + hostLink->send(dest, 1, &req); + } + } + } + } + offset += 12; + } + } + + // Destructor + ~ProgRouterMesh() { + for (int y = 0; y < boardsY; y++) + delete [] table[y]; + delete [] table; + } +}; + + +#endif diff --git a/include/POLite/Seq.h b/include/POLite/Seq.h index b6cb61f1..23a7616c 100644 --- a/include/POLite/Seq.h +++ b/include/POLite/Seq.h @@ -45,12 +45,26 @@ template class Seq elems = newElems; } + // Extend size of sequence by N + void extendBy(int n) + { + numElems += n; + if (numElems > maxElems) + setCapacity(numElems*2); + } + // Extend size of sequence by one void extend() { - numElems++; - if (numElems > maxElems) - setCapacity(maxElems*2); + extendBy(1); + } + + // Ensure space for a further N elements + void ensureSpaceFor(int n) + { + int newNumElems = numElems + n; + if (newNumElems > maxElems) + setCapacity(newNumElems*2); } // Append diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index f6712ba1..1cbdb53e 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -361,15 +361,21 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); Flit flit = flitInPort.value; if (flitInPort.canGet) begin flitInPort.get; - consumeKey <= getRoutingKey(flit.dest); + RoutingKey key = getRoutingKey(flit.dest); + consumeKey <= key // Write to flit buffer flitBuffer.write({chosenReg, consumeFlitCount}, flit); consumeFlitCount <= consumeFlitCount + 1; // On final flit, move to fetch state if (! 
flit.notFinalFlit) begin - consumeState <= 2; - // Claim chosen slot - flitBufferUsedSlots[chosenReg].set; + // Ignore keys with zero beats + if (key.numBeats == 0) begin + consumeState <= 0; + end else begin + consumeState <= 2; + // Claim chosen slot + flitBufferUsedSlots[chosenReg].set; + end end end endrule From cb708b8d47220b775a3fd7134ae1fbc455e93d3f Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 14 Apr 2020 20:50:39 +0100 Subject: [PATCH 28/78] Account for ProgRouters in termination detector (Compiles, but untested) --- config.py | 1 + rtl/DE5Top.bsv | 9 ++--- rtl/IdleDetector.bsv | 80 ++++++++++++++++++++++++++------------------ rtl/Makefile | 2 +- rtl/Network.bsv | 5 +++ rtl/ProgRouter.bsv | 34 +++++++++++++++++-- 6 files changed, 92 insertions(+), 39 deletions(-) diff --git a/config.py b/config.py index a4099e7e..d0a2491a 100755 --- a/config.py +++ b/config.py @@ -309,6 +309,7 @@ def quoted(s): return "'\"" + s + "\"'" # Cores per board p["LogCoresPerBoard"] = p["LogCoresPerMailbox"] + p["LogMailboxesPerBoard"] +p["LogCoresPerBoard1"] = p["LogCoresPerBoard"] + 1 p["CoresPerBoard"] = 2**p["LogCoresPerBoard"] # Threads per core diff --git a/rtl/DE5Top.bsv b/rtl/DE5Top.bsv index 0e5672fa..5c353542 100644 --- a/rtl/DE5Top.bsv +++ b/rtl/DE5Top.bsv @@ -140,10 +140,6 @@ module de5Top (DE5Top); // Create idle-detector IdleDetector idle <- mkIdleDetector; - // Connect cores to idle-detector - function idleClient(core) = core.idleClient; - connectCoresToIdleDetector(map(idleClient, vecOfCores), idle); - // Create mailboxes Vector#(`MailboxMeshYLen, Vector#(`MailboxMeshXLen, Mailbox)) mailboxes = @@ -172,6 +168,11 @@ module de5Top (DE5Top); map(map(mailboxNet), mailboxes), idle); + // Connect cores and ProgRouter fetchers to idle-detector + function idleClient(core) = core.idleClient; + connectClientsToIdleDetector( + map(idleClient, vecOfCores), noc.activities, idle); + // Connections to off-chip RAMs for (Integer i = 0; i < `DRAMsPerBoard; i=i+1) 
connectClientsToOffChipRAM(dcaches[i], diff --git a/rtl/IdleDetector.bsv b/rtl/IdleDetector.bsv index 4cb3ccc5..179a9f41 100644 --- a/rtl/IdleDetector.bsv +++ b/rtl/IdleDetector.bsv @@ -18,14 +18,16 @@ // The implementation below is based on Safra's termination detection // algorithm (EWD998). -import Mailbox :: *; -import Globals :: *; -import Interface :: *; -import Queue :: *; -import Vector :: *; -import ConfigReg :: *; -import Util :: *; -import DReg :: *; +import Mailbox :: *; +import Globals :: *; +import Interface :: *; +import Queue :: *; +import Vector :: *; +import ConfigReg :: *; +import Util :: *; +import DReg :: *; +import ProgRouter :: *; +import Assert :: *; // The total number of messages sent by all threads on an FPGA minus // the total number of messages received by all threads on an FPGA. @@ -343,22 +345,33 @@ interface IdleDetectorClient; method Bool idleStage1Ack; endinterface -// Connect cores to idle detector -module connectCoresToIdleDetector#( - Vector#(n, IdleDetectorClient) core, IdleDetector detector) () - provisos (Log#(n, log_n), Add#(log_n, 1, m), Add#(_a, m, 62)); +// Connect cores and fetchers to idle detector +module connectClientsToIdleDetector#( + Vector#(`CoresPerBoard, IdleDetectorClient) core, + Vector#(`FetchersPerProgRouter, FetcherActivity) fetcher, + IdleDetector detector) () + provisos (Mul#(2, `CoresPerBoard, n)); + + staticAssert(2**`LogCoresPerBoard1 > `CoresPerBoard+`FetchersPerProgRouter, + "connectCoresToIdleDetector: insufficient width"); // Sum "incSent" wires from each core - Vector#(n, Bit#(m)) incSents = newVector; - for (Integer i = 0; i < valueOf(n); i=i+1) + Vector#(n, Bit#(`LogCoresPerBoard1)) incSents = replicate(0); + for (Integer i = 0; i < `CoresPerBoard; i=i+1) incSents[i] = zeroExtend(core[i].incSent); - Bit#(m) incSent <- mkPipelinedReductionTree( \+ , 0, toList(incSents)); + for (Integer i = 0; i < `FetchersPerProgRouter; i=i+1) + incSents[`CoresPerBoard+i] = zeroExtend(fetcher[i].incSent); + 
Bit#(`LogCoresPerBoard1) incSent <- + mkPipelinedReductionTree( \+ , 0, toList(incSents)); // Sum "incRecv" wires from each core - Vector#(n, Bit#(m)) incRecvs = newVector; - for (Integer i = 0; i < valueOf(n); i=i+1) + Vector#(n, Bit#(`LogCoresPerBoard1)) incRecvs = replicate(0); + for (Integer i = 0; i < `CoresPerBoard; i=i+1) incRecvs[i] = zeroExtend(core[i].incReceived); - Bit#(m) incRecv <- mkPipelinedReductionTree( \+ , 0, toList(incRecvs)); + for (Integer i = 0; i < `FetchersPerProgRouter; i=i+1) + incRecvs[`CoresPerBoard+i] = zeroExtend(fetcher[i].incReceived); + Bit#(`LogCoresPerBoard1) incRecv <- + mkPipelinedReductionTree( \+ , 0, toList(incRecvs)); // Maintain the total count Reg#(MsgCount) count <- mkConfigReg(0); @@ -369,16 +382,18 @@ module connectCoresToIdleDetector#( endrule // OR the "active" wires from each core - Vector#(n, Bool) actives = newVector; - for (Integer i = 0; i < valueOf(n); i=i+1) + Vector#(n, Bool) actives = replicate(False); + for (Integer i = 0; i < `CoresPerBoard; i=i+1) actives[i] = core[i].active; + for (Integer i = 0; i < `FetchersPerProgRouter; i=i+1) + actives[`CoresPerBoard+i] = fetcher[i].active; Bool anyActive <- mkPipelinedReductionTree( \|| , True, toList(actives)); - // OR the "vote" wires from each core - Vector#(n, Bool) votes = newVector; - for (Integer i = 0; i < valueOf(n); i=i+1) + // AND the "vote" wires from each core + Vector#(n, Bool) votes = replicate(True); + for (Integer i = 0; i < `CoresPerBoard; i=i+1) votes[i] = core[i].vote; - Bool unanamous <- mkPipelinedReductionTree( \&& , False, toList(votes)); + Bool voteDecision <- mkPipelinedReductionTree( \&& , False, toList(votes)); // Register the result Reg#(Bool) active <- mkConfigReg(True); @@ -386,24 +401,25 @@ module connectCoresToIdleDetector#( rule updateActive; active <= anyActive; - vote <= unanamous; + vote <= voteDecision; endrule // Counter number of stage 1 acks - Reg#(Bit#(m)) numAcks <- mkConfigReg(0); + Reg#(Bit#(`LogCoresPerBoard1)) numAcks 
<- mkConfigReg(0); // Sum stage 1 ack wires from each core - Vector#(n, Bit#(m)) incAcks = newVector; - for (Integer i = 0; i < valueOf(n); i=i+1) + Vector#(`CoresPerBoard, Bit#(`LogCoresPerBoard1)) incAcks = newVector; + for (Integer i = 0; i < `CoresPerBoard; i=i+1) incAcks[i] = zeroExtend(pack(core[i].idleStage1Ack)); - Bit#(m) incAck <- mkPipelinedReductionTree( \+ , 0, toList(incAcks)); + Bit#(`LogCoresPerBoard1) incAck <- + mkPipelinedReductionTree( \+ , 0, toList(incAcks)); // Stage 1 output ack Wire#(Bool) stage1AckWire <- mkDWire(False); rule updateAcks; - Bit#(m) total = numAcks + incAck; - if (total == fromInteger(valueOf(n))) begin + Bit#(`LogCoresPerBoard1) total = numAcks + incAck; + if (total == `CoresPerBoard) begin numAcks <= 0; stage1AckWire <= True; end else begin @@ -419,7 +435,7 @@ module connectCoresToIdleDetector#( detector.idle.voteIn(vote); detector.idle.ackStage1(stage1AckWire); - for (Integer i = 0; i < valueOf(n); i=i+1) begin + for (Integer i = 0; i < `CoresPerBoard; i=i+1) begin core[i].idleDetectedStage1(detector.idle.detectedStage1); core[i].idleVoteStage1(detector.idle.voteStage1); core[i].idleDetectedStage2(detector.idle.detectedStage2); diff --git a/rtl/Makefile b/rtl/Makefile index cc521bae..57a2acf8 100644 --- a/rtl/Makefile +++ b/rtl/Makefile @@ -11,7 +11,7 @@ DEFS = $(shell python ../config.py defs) BSC = bsc BSCFLAGS = -wait-for-license -suppress-warnings S0015 \ -suppress-warnings G0023 \ - -steps-warn-interval 500000 -check-assert \ + -steps-warn-interval 750000 -check-assert \ +RTS -K32M -RTS # Top level module diff --git a/rtl/Network.bsv b/rtl/Network.bsv index 4ee2e69b..82fbbd6c 100644 --- a/rtl/Network.bsv +++ b/rtl/Network.bsv @@ -289,6 +289,8 @@ interface NoC; Vector#(`FetchersPerProgRouter, BOut#(DRAMReq))) dramReqs; interface Vector#(`DRAMsPerBoard, Vector#(`FetchersPerProgRouter, In#(DRAMResp))) dramResps; + // ProgRouter fetcher activities + interface Vector#(`FetchersPerProgRouter, FetcherActivity) activities; 
endinterface module mkNoC#( @@ -462,6 +464,9 @@ module mkNoC#( // Responses from off-chip memory interface dramResps = boardRouter.ramResps; + // Fetcher activities + interface activities = boardRouter.activities; + endmodule endpackage diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index 1cbdb53e..f100fd10 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -11,6 +11,7 @@ import Interface :: *; import BlockRam :: *; import Assert :: *; import Util :: *; +import DReg :: *; // ============================================================================= // Routing keys and beats @@ -253,6 +254,16 @@ interface Fetcher; // Off-chip RAM connections interface Vector#(`DRAMsPerBoard, BOut#(DRAMReq)) ramReqs; interface Vector#(`DRAMsPerBoard, In#(DRAMResp)) ramResps; + // Activity + interface FetcherActivity activity; +endinterface + +// Fetcher activity for performance counters and termination detection +(* always_ready *) +interface FetcherActivity; + method Bit#(1) incSent; + method Bit#(1) incReceived; + method Bool active; endinterface // Fetcher module @@ -293,6 +304,10 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); // Final output queue for flits Queue1#(RoutedFlit) flitOutQueue <- mkUGShiftQueue(QueueOptFmax); + // Activity + Reg#(Bit#(1)) incSentReg <- mkDReg(0); + Reg#(Bit#(1)) incReceivedReg <- mkDReg(0); + // Stage 1: consume input message // ------------------------------ @@ -362,7 +377,7 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); if (flitInPort.canGet) begin flitInPort.get; RoutingKey key = getRoutingKey(flit.dest); - consumeKey <= key + consumeKey <= key; // Write to flit buffer flitBuffer.write({chosenReg, consumeFlitCount}, flit); consumeFlitCount <= consumeFlitCount + 1; @@ -404,6 +419,7 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); ramReqQueue[consumeKey.ram].enq(req); fetchBeatCount <= fetchBeatCount + zeroExtend(req.burst); beatBufferLen.incBy(zeroExtend(req.burst)); + 
incReceivedReg <= 1; if (finished) consumeState <= 0; end endrule @@ -583,8 +599,10 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); // Is this the final flit in the message? if (flit.notFinalFlit) newFlitCount = emitFlitCount + 1; - else + else begin + incSentReg <= 1; newFlitCount = 0; + end end // Issue flit load request flitBuffer.read({info.msgAddr, newFlitCount}); @@ -626,6 +644,13 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); interface ramReqs = map(queueToBOut, ramReqQueue); interface ramResps = ramRespsOut; + interface FetcherActivity activity; + method Bit#(1) incSent = incSentReg; + method Bit#(1) incReceived = incReceivedReg; + method Bool active = + beatBufferLen.value == 0 && interpreterState == 0; + endinterface + endmodule // ============================================================================= @@ -761,6 +786,9 @@ interface ProgRouter; Vector#(`FetchersPerProgRouter, BOut#(DRAMReq))) ramReqs; interface Vector#(`DRAMsPerBoard, Vector#(`FetchersPerProgRouter, In#(DRAMResp))) ramResps; + + // Activities + interface Vector#(`FetchersPerProgRouter, FetcherActivity) activities; endinterface module mkProgRouter#(BoardId boardId) (ProgRouter); @@ -822,11 +850,13 @@ module mkProgRouter#(BoardId boardId) (ProgRouter); ramRespIfc[i][j] = fetchers[j].ramResps[i]; end + function FetcherActivity getActivity(Fetcher f) = f.activity; interface flitIn = flitInIfc; interface flitOut = flitOutIfc; interface nocFlitOut = nocFlitOutIfc; interface ramReqs = ramReqIfc; interface ramResps = ramRespIfc; + interface activities = map(getActivity, fetchers); endmodule From aec8e7b295a1e9cec38062151ca4db117cc5e1dc Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Wed, 15 Apr 2020 09:29:15 +0100 Subject: [PATCH 29/78] Fix to previous commit The 'active' condition in the fetcher was inverted, and also referred to the wrong state variable. 
--- rtl/ProgRouter.bsv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index f100fd10..63f9e1e8 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -648,7 +648,7 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); method Bit#(1) incSent = incSentReg; method Bit#(1) incReceived = incReceivedReg; method Bool active = - beatBufferLen.value == 0 && interpreterState == 0; + beatBufferLen.value != 0 || consumeState != 0; endinterface endmodule From 3276d487b8b93a33b6a5d0e4923f23722f51c412 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Wed, 15 Apr 2020 15:41:55 +0100 Subject: [PATCH 30/78] Some fixes to ProgRouters.h Missing 'return' statement, and missing use of 'offset' when uploading routing tables. The POLite heat grid now appears to work in simulation. Ready to try things on FPGA. --- include/POLite/ProgRouters.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/POLite/ProgRouters.h b/include/POLite/ProgRouters.h index 34f62694..8fe3c143 100644 --- a/include/POLite/ProgRouters.h +++ b/include/POLite/ProgRouters.h @@ -299,7 +299,7 @@ class ProgRouterMesh { // Add routing destinations from given global mailbox id uint32_t addDestsFromBoard(uint32_t mbox, Seq* dests) { - addDestsFromBoardXY(destX(mbox), destY(mbox), dests); + return addDestsFromBoardXY(destX(mbox), destY(mbox), dests); } // Write routing tables to memory via HostLink @@ -340,12 +340,13 @@ class ProgRouterMesh { Seq* seq = table[y][x].table[i]; if (offset < seq->numElems) { uint32_t dest = hostLink->toAddr(x, y, coresPerDRAM * i, 0); + uint8_t* base = &seq->elems[offset]; allDone = false; req.cmd = StoreCmd; req.numArgs = 3; - req.args[0] = ((uint32_t*) seq->elems)[0]; - req.args[1] = ((uint32_t*) seq->elems)[1]; - req.args[2] = ((uint32_t*) seq->elems)[2]; + req.args[0] = ((uint32_t*) base)[0]; + req.args[1] = ((uint32_t*) base)[1]; + req.args[2] = ((uint32_t*) base)[2]; 
hostLink->send(dest, 1, &req); } } From fcefa919a6abe5696e92ecf40a2fc176a838d512 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Thu, 16 Apr 2020 17:25:15 +0100 Subject: [PATCH 31/78] Better handling of IND records There was a cyclic dependency in the ProgRouter between trying to send an IND-routed flit and trying to consume one, which would surely lead to deadlock in some cases. This is solved by (1) removing the loopback route around the ProgRouter, and handling IND records within each fetcher, entirely independently of the other fetchers; and (2) restricting the number of IND records allowed per key lookup to one. For efficiency, we introduce a second restriction: IND records are only allowed in max-sized key lookups. This means that fetchers only block in the worst case (and even then we allow two outstanding indirections at a time, per fetcher, to keep utilisation high). These restrictions still permit the use case for which IND records are intended: overcoming the max beat limit of a key lookup. While doing this I also spotted and fixed a few bugs: an overflow bug in one of the fetcher's comparators, and incorrect handling of multi-flit messages inside the fetchers. --- README.md | 19 ++- apps/progrouter/progrouter.cpp | 14 +- config.py | 4 +- de5/S5_DDR3_QSYS.qsys | 4 +- doc/figures/logo.png | Bin 0 -> 7183 bytes rtl/Network.bsv | 3 - rtl/ProgRouter.bsv | 228 ++++++++++++++++++++------------- 7 files changed, 167 insertions(+), 105 deletions(-) create mode 100644 doc/figures/logo.png diff --git a/README.md b/README.md index b001abcd..a3340c3f 100644 --- a/README.md +++ b/README.md @@ -17,16 +17,21 @@ bool keySend(uint32_t key, uint32_t numFlits, void* msg, bool block = true); bool keyTrySend(uint32_t key, uint32_t numFlits, void* msg); ``` +New section on programmable routers: + * Routing record format, byte ordering etc. +
+ * Semantics of records + * Restrictions on IND records + * Avoiding deadlock: programmer has some added resposibility here + # Tinsel 0.7.1 Tinsel is a [RISC-V](https://riscv.org/)-based manythread message-passing architecture designed for FPGA clusters. It is being developed as part of the [POETS Project](https://poets-project.org/about) (Partial Ordered Event -Triggered Systems). This manual describes the architecture and -associated APIs. Further background can be found in our [FPL 2019 -paper](doc/fpl-2019-paper.pdf), which presents Tinsel 0.6. If you're -a POETS Partner, you can access a machine running Tinsel in the [POETS +Triggered Systems). Further background can be found in our [FPL 2019 +paper](doc/fpl-2019-paper.pdf). If you're a POETS Partner, you can +access a machine running Tinsel in the [POETS Cloud](https://github.com/POETSII/poets-cloud). ## Release Log @@ -46,7 +51,7 @@ Released on 10 Sep 2018 and maintained in the * [v0.5](https://github.com/POETSII/tinsel/releases/tag/v0.5): Released on 8 Jan 2019 and maintained in the [tinsel-0.5.1 branch](https://github.com/POETSII/tinsel/tree/tinsel-0.5.1). -(Hardware idle-detection.) +(Hardware termination-detection.) * [v0.6](https://github.com/POETSII/tinsel/releases/tag/v0.6): Released on 11 Apr 2019 and maintained in the [tinsel-0.6.3 branch](https://github.com/POETSII/tinsel/tree/tinsel-0.6.3). @@ -106,7 +111,7 @@ demands, but fairly modest compute requrements. The main features are: instructions for sending and receiving messages between any two threads in the cluster. - * **Hardware termination detection**. A global termination event is + * **Hardware termination-detection**. A global termination event is triggered when every thread indicates termination and no messages are in-flight. 
Termination can be interpreted as termination of a time step, or termination of the application, supporting @@ -563,7 +568,7 @@ Tinsel also provides a function int tinselIdle(bool vote); ``` -which blocks until either +for global termination detection, which blocks until either 1. a message is available to receive, or diff --git a/apps/progrouter/progrouter.cpp b/apps/progrouter/progrouter.cpp index 6b764ac5..28fb494c 100644 --- a/apps/progrouter/progrouter.cpp +++ b/apps/progrouter/progrouter.cpp @@ -189,9 +189,14 @@ int main() msgOut[1] = 0x20; msgOut[2] = 0x30; msgOut[3] = 0x40; + msgOut[4] = 0x50; + msgOut[5] = 0x60; + msgOut[6] = 0x70; + msgOut[7] = 0x80; // On thread 0 if (me == 0) { +tinselSetLen(1); // Add an URM1 record uint8_t* entry1 = table.currentPointer(); table.addURM1(0, 0, 10, 0xfff); @@ -199,12 +204,7 @@ int main() table.addURM2(0, 0, 60, 0xff3, 0xff2); table.addURM2(0, 0, 60, 0xff5, 0xff4); //table.addMRM(1, 0, 0x22222222, 0x11111111, 0x2222); - uint8_t* ind = table.addIND(); table.next(); - uint8_t* entry2 = table.currentPointer(); - table.addURM1(0, 0, 20, 0x111); - table.next(); - table.setIND(ind, 0, entry2, 1); // Cache flush, to write table into RAM tinselCacheFlush(); @@ -226,7 +226,9 @@ int main() while (me != 0) { tinselWaitUntil(TINSEL_CAN_RECV); volatile uint32_t* msgIn = (uint32_t*) tinselRecv(); - printf("%x %x %x %x\n", msgIn[0], msgIn[1], msgIn[2], msgIn[3]); + printf("%x %x %x %x %x %x %x %x\n", + msgIn[0], msgIn[1], msgIn[2], msgIn[3] + , msgIn[4], msgIn[5], msgIn[6], msgIn[7]); tinselFree(msgIn); } diff --git a/config.py b/config.py index d0a2491a..1d2e5b07 100755 --- a/config.py +++ b/config.py @@ -164,6 +164,8 @@ def quoted(s): return "'\"" + s + "\"'" # Programmable router parameters: p["LogRoutingEntryLen"] = 5 # Number of beats in a routing table entry p["ProgRouterMaxBurst"] = 4 +p["ProgRouterCrossbarOutputs"] = 4 +p["FetcherLogIndQueueSize"] = 1 p["FetcherLogBeatBufferSize"] = 5 p["FetcherLogFlitBufferSize"] = 5 
p["FetcherLogMsgsPerFlitBuffer"] = ( @@ -187,7 +189,7 @@ def quoted(s): return "'\"" + s + "\"'" p["UseCustomAccelerator"] = False # Clock frequency (in MHz) -p["ClockFreq"] = 225 +p["ClockFreq"] = 220 #============================================================================== # Derived Parameters diff --git a/de5/S5_DDR3_QSYS.qsys b/de5/S5_DDR3_QSYS.qsys index 0695a737..dc87cb4b 100644 --- a/de5/S5_DDR3_QSYS.qsys +++ b/de5/S5_DDR3_QSYS.qsys @@ -891,7 +891,7 @@ - + @@ -1214,7 +1214,7 @@ - + diff --git a/doc/figures/logo.png b/doc/figures/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..8271002b57533674602c876f50bfdb9e2f8a4dcd GIT binary patch literal 7183 zcmWkz1z1yU6dv6%N?KC7Q@T@9T0*27WOVoF2I(&8=100iLP8qph7r>BU$=YrJ^P;B zdr!XSJ!g^Xs`6Oqr05_J2uo2xMiW?bfKvw*8Tgz>ODO;r{5;V2i>=OAjLLc4kCv>*E%t^L6XhoY8pz9gG2zrNDQ2!bY`LYYM= zniTO`n?bBy|m0#ZOKyu=t16E9h{lz>Na*DMA zxRY8(;5CLK4ekb3;$J?5zcgmh3>-eWjWKmi-cw-dS9vZJyv}@u$Ft*)mCxV4QSqvz;svge+^4u6vC0 z=d&P%MzqptOM-%WvyXs6Nib}j10fisF98yHr-p2z0jKj00rXd|0r3(Jp~uJ;F zEz2QvzhG=6(6}P>5$5mlr={L(w%2+#gSAPB0i}ILbELmt(twTk{yqVR9CgMze{wSX z@oGx?lRC&qpTtluF7)WZ4K+44AuP0qiLrb(8`a2$iMjg$U9HLxZE_aB$}q09Qa4e+ zhrih0|6IR=IJTnVJTf+HV2z^6ke!GXQc{ecnW=t0#?r>+#yq#QgglBYJru<1390&E z$#px{-j)yPM>bdc7I^ghs8R1g*1(`DvlfY8{_tRDn8%OxaAcd}xET0pdq`4OGx%vy z5BMnvFj>yN%)xc}duPzV)p=Yb92)%hn=@Hs2=6Mx2Z{4s(S(QlUZ-6p1q67ii@&@B zl(LYVVl!gos>;fOr&W<*TL|8Vl0d+VxS+|;qzzx+(^<27rH^6;>04QG;wdV737c~( ztgafw_$fBJ&M<9w9k4K~Utcer8+8y5Trn=Sf!zQnF}yrk2_hi{xw(hmX21Wu`_)<0 zC)mZENOa*(U#VHl4cTyW4Tyhen=a|`sClW>?B{%K1%;EUagn?|Ic*{_bTxyCmnOkD z9taldaxG_cbgAA#*>Z6oqM$rqc-S{$&dJGCVp{{eiy7`S9hrQyHLf7QViT=P{+>h zKo?&B6v>sE#A&m0_%rMsymV;8L{n%48R>Vq zjh2ixH(Q!)E`m-wAL5PS<6&}O_Df`CMK|DxhqBTkUcNkU^0~g__42yg-H^luoA>rQ zQwribYG?q9y15iXVSPP#aBxtaIo0!K$%-wxW{`_BU^ccTH$z4KhMDi@si^Dx&j#et zMQoe0h2wg&VeIVpuE;J(|iOMRfqzzyPJTF+{eU-Iw>74K}m_N4_ilG zndnEz42D0jvz@-WUg-7mkv`f-snO+d?&{j;=#;f840#u;Ss|mNqciSu{F_ZnSGTC9 z2Ah0%^LOVF~njh353v1GWJXuf+t 
zNuHiz0mI=~WOa^fLV~xj_?}S21UWM*&3MXi;_MH|&g6#3Siv1mCLzmK`apOjw6y~P zhs=`Pe0oK=nIJ>PXG?&Nfq{jFSg7+ZBRDj7I1Pl%_h+cS!WIo$l$Cm{l* z{9c$#v%>UE$fVA7d`9u?@wLOqUV@61R&7-V9Zl>I)qfw}3?{`|D$i!}TaodQo$BrM z^+}P7`7R#0<)}u$UZ0(0rZSQA^TgP8_!;o~UBW%{OJbs-HSx2}QN8*u$Ob|C4#^X% zrH#|sSVbyURu@ixx*TrhS6SHQRE#Ky85sIaD+2l%J4xi|3|nb(=u?*swWan_>09iD z;_S#4cw+bYIDD;6L;Aicn}OHzAeh5JNIfIF$UZ7GHmN@Ju1pqE>OuQ3Wt=#ka_gcs ztCXZ6ioBRyJJ;ptM5SI*`eZ^(V$Pnk_=DLpoQ{r;fq7rOEl5=(54_j|DgZ z#`lS_N&8%VB32Y@`6W3~2P0lFIZzRxZkOlIZQ`#TLBv5p&e2iuAkXKQTeenMy?Nvs z$HCrScxWghFCX9L-Nklw&8RuI@BMB%G%k)v_tKOmC#1{e6ZcocbdnuONl8K9JIctUhG;)@AB7$?!&!JV17Q3ZTR{RdiFQGw>=<;BRzDBdu>Lw;Qv2qBx z`_##!8@szBEFUlnfZzL+` zznm^N-(8M#Y?>GJEv~Im2z#6jq%eJ~S}!UtMg;CX_@C>(^Zlg-W2zErFsRcQM-rC` zRVqshd%Ph*fo?+(+t}DxufvmTYioL}uq0!OOfWV%)PWPR+Z1nvic#(+x5jex8Dp{QVPj@B( z8l#vVx;4d>U*A_at{0z_<(1k|QqG-UO+oBd1(o2Sh(V5G*8~>g*92g&5geo4G}X+` z{%kE=vw=!&*8_7eF1x*}r>Ez1BCUK?oVfeRxB2>0(d_JOMlP=UweEkQ!|Qu@jy-j} zyDJ+O7KS~Qo|UDnp`k&MX}8|trKO>{f~h_) zHCVWptVO)_hpw)!qU3ULbK~I&PYEzEpw!pbTU%Qn@a~)auF^M>5mIJO-CsF7zJdP5 z=fp&)&_d7nfc&2!HgLo>!c4Svb+@mk#9Qw(sz0?Zcd(}_0V=`0ef#$A?yhHLg)BEW z*V)Z2tm*##;o*UXmG$S@ZHBN1dy_k9j-jF9#^E6v2L}h>fR%#);f*fXJ(S_?CekhhJV9h9I8_(+wK2s?;j6%f9#9C=Qxck zxv}RaQk7mkJdi2jm6a??%E~SW)718B?epKCOJ@)3>=xL6X7CIbys7VBx9e@NfK1FL z&!Hn3;oy|n85_?fWGcPa&`23_p*y&S6a@r~QHo|lp-FJXQe&Jb7hp>(m;bPQ!GWQ;QdltBq?bDhfuE89wDim=b&M=XgN( zy`r~Wzkam_ysR@Ue!r8Pl9V)qv|d?ZVDYU*g?H#{sXR!Cg@llSfuZB|<-9Ngrp?+ z)Ku}wz6MsypWjSd8Nb||Eb)scr=}wLG03s%Rr2SQgm&vQyl&UYzCgjW_Qo76ERCLG zyu1W8H8uRhHHQD;Nk?E0=8F0xBqz^0UZMpD<^{Ygm9hyqt_lK9ZZ>eZK0iN~n93Ec zUT5Ion5s#>?1_`96r^zO##*Y16sCvI~>lGxQi}URk1#8;si)2 zD4@#4Q^scSSpAa>o8&B*ii(A0{K&1-IDR5az-*<(fgg=6RjIm5Wav#L4?TsIS~d63 zMqPdW6&D8f3dY!xz1Gse;GiiWKzVg_v+z`ou!r;hBn6OzxGpX(cjG26!Rf}{qcz^0 z9$&0G<$%b+|-Kflg?l5cSpx-2g3I=de@?Xp`4zIe%;Qx(9gsj2ziu6dj?<5}C< zO1Zo96R{bDf@$S8&eneLD_SYfxD)Y`szjWs4hQ!eoIca2UPXRz5K416+{1f)N_6{> zD?$<(83~PvvA;SrBou+AT``T;#BAI?QxOp`gn z5%J053ot1udYF`HDr7vJqqWTl;9aD@ 
zs&5pUv@itTYKn^=kdZpmvSI>#9d1$zV*0r=0jo*eR`&~G!Rw2FH>*Cpg9g|!mL$Z0 z>@gn7`58!=yQzSekiUN+9d}#N?v2Fh943y^LZ4h@03sIfd~DLa1j9${{o<7}PopYF z182j#VP~h|bAapZw1z%qGn>!MK2&|NU7*Vhc0VHMuma5ZyLu!~00%CMpW1VZGC?OA zP&B2=goAq=4wIA@H3H{t%O@ps_FsBwgNyZM4oaC$-Oog2YI!g7bggr8mP(cLH1za? za4_w%km+fVk=J{vq$Chciv23;ZmuY*ijvgEa`Uu~ZqdeWx<-oyh7=BY;A@ss;oc&b zc|19qez!krw=>)fL!QjxBaGJLNDJWNl*-&j?GQBE=H<;PwH%@OJvZgsC@54yMkaKg zOIus}UkQ7;h!1Ew{$XWlxm=B8PCR7!@S#N0bUcmC+TOlNMQDnIC%>{%`lejtvjsP% z7URa;4*?~y7viYb2aY?ZRp^GLTJxn-K>ukZJ{b3H{Q6rzM4&DH%AgX=%!7|mV-&(F zLJrKEwr+qyg4y7iX9}0ox(HAgKA)}rr=X-HWoBlkjF50>%DdQIhYxoE_&m#%nwt8X zpiAL|i&6=mG7vlfw4X0oBgtn2k@z`Eft&piD^IxqNFGCGu1{AM&K|NwypoDFE2M9) zZf@S?s9JUAPFlVA${4nR7J5T8TFg)gT=)B&=9xn&1pm!DE{#3my2-_2&OerI++L2W zzaiLAwPyteVYDZAE`PuApp5xFCJ6fp84kxtE*{PT)4r6a>)N))&v19_3V)u`>a?GK zf$i5jmiDf9$J1%W#JWE2S5;Lt*(HlTo-`5*IA#Nq+}@%rbg<=UkI>SBz3#bvt)}G6 zl+Ybz{_3iHdo+Rj8a+N;|!ZiCZX#8~9kcON94g0UbiHSPS;u}g!{ z=&2@OvxKhh{@YFY>PjRYXai7;A>!iVRe)4k0!}R?nVeV!|B878Hbvh?Z^&C~0r;Y# z4TVQ@Pfx*_nHf&^6TPqgG`6me^B-&iq@0+?GDdpa`}Qmr;|==;Uo`^V8d$*#vud4C@-*1 zl)Nwu7pqC~xm-~U1{0Gzr)9)i{l)|U2mRms5ao71QUrUz!hspN7Z=yL_l0ceJ9_<} zOL*X;qod^k?+=QHySo;A0_DG~^R<|(qIMxAEp2OW zA`9pWt(Tf2wgvw+dAha6sKC_A+L}>BRCE)t3UGycTa_{{qLz+MVM_~O zpKXfdgL|>GvaYslYne(B3&;W)F@c#O!Zx0wh?jvuX;?;+Z|^j$%mMj{KJ*f7))lvuI#(gciwZMCiVn!Vaywn$ZuFb z$=PT!8D0G?Zg<@;KYVa&rzBNk0_}D`aicL4raCN1*>IB>Av3;~q$`+wQntwRmD1h) zs)SVT{pyd`WPk?|!n^PU;dqCP8U0}Fv{Gw_SND>Fw;3N-<%u$+IB^fnU zG2d@tWmwvyK)e#(=H2(fMTEpJnS{wGd1cv9)4tP?HgTbI-_CryE1VN+eY4x zYaNQQHa00sWjIF2SEg zetNLeT8Ehq;H!SS$cp`1e1<-)lW0FRD zR!!4CUZT9CqXIzSAVq1FTo4y?nELOAq-P9etdENeqyMc@y~Y+7{E9aI_0wz~uKy)n zaZu1m4nNlQ{uJ%WBG(<8QPdENE>%-ey2L<#tUJTgTY`r%C4!DiblW(}uv=I_{B$v0 zx#Mb}hzNOAbaWdH&7nU= 0; x=x-1) botOutList = Cons(routers[0][x].bottomOut, botOutList); - // Also include loopback connection to board router to implement IND records - botOutList = Cons(fromBOut(boardRouter.flitOut[4]), botOutList); - function In#(Flit) getFlitIn(BoardLink link) = link.flitIn; 
reduceConnect(mkFlitMerger, botOutList, single(boardRouter.flitIn[4])); // Connect board router to mailbox mesh south rim diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index 63f9e1e8..ecf6e927 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -126,7 +126,10 @@ typedef struct { Bit#(64) destMask; } MRMRecord deriving (Bits); -// 48-bit Indirection (IND) record: +// 48-bit Indirection (IND) record +// Note the restrictions on IND records: +// 1. At most one IND record per key lookup +// 2. A max-sized key lookup must contain an IND record typedef struct { // Record type RoutingRecordTag tag; @@ -136,6 +139,10 @@ typedef struct { Bit#(32) newKey; } INDRecord deriving (Bits); +// ============================================================================= +// Internal types +// ============================================================================= + // It is sometimes convenient (though redundant) to record a routing // decision for a flit internally within the programmable router typedef struct { @@ -151,10 +158,17 @@ typedef enum { RouteSouth, RouteEast, RouteWest, - RouteNoC, - RouteLoop + RouteNoC } RoutingDecision deriving (Bits, Eq, FShow); +// Elements of the indirection queue inside each fetcher +typedef struct { + // The indirection + RoutingKey key; + // The location of the message in the flit buffer + FetcherFlitBufferMsgAddr addr; +} IndQueueEntry deriving (Bits, FShow); + // ============================================================================= // Design // ============================================================================= @@ -165,17 +179,17 @@ typedef enum { // NoC edge, but the diagram assumes four. 
// -// N S E W L0..L3/Loop Input flits -// | | | | | | -// +---+ +---+ +---+ +---+ +---+ | -// | F | | F | | F | | F | | F | | Fetchers -// +---+ +---+ +---+ +---+ +---+ | -// | | | | | | -// +---------------------------+ | -// | Crossbar | | Routing -// +---------------------------+ | -// | | | | | | -// N/L0 S/L1 E/L2 W/L3 Ind-----+ Output queues +// N S E W L0..L3 Input flits +// | | | | | +// +---+ +---+ +---+ +---+ +---+ +// | F | | F | | F | | F | | F | Fetchers +// +---+ +---+ +---+ +---+ +---+ +// | | | | | +// +---------------------------+ +// | Crossbar | Routing +// +---------------------------+ +// | | | | +// N/L0 S/L1 E/L2 W/L3 Output queues // | | | | // +---------------------------+ // | Splitter | Final splitting @@ -192,15 +206,15 @@ typedef enum { // The key property of these fetchers is that they act entirely // indepdedently of each other: each one can make progress even if -// another is blocked. Unfortunately, this leads to a duplicated -// logic resources, but is necessary to avoid deadlock. +// another is blocked. This leads to duplicated logic resources, but +// is necessary to avoid deadlock. -// Note that, as the routers are fully programmable, it is possible -// for the programmer to introduce deadlock using an ill-defined -// routing scheme, e.g. where a flit arrives in on (say) link N and -// requires a flit to be sent back along the same direction N. -// However, the hardware does guarantee deadlock-freedom if the -// routing scheme is based on dimension-ordered routing. +// As the routers are fully programmable, it is possible for the +// programmer to introduce deadlock using an ill-defined routing +// scheme, e.g. where a flit arrives in on (say) link N and requires a +// flit to be sent back along the same direction N. However, the +// hardware does guarantee deadlock-freedom if the routing scheme is +// based on dimension-ordered routing. 
// After the fetchers have interpreted the flits, they are fed to a // fair crossbar which organises them by destination into output @@ -234,6 +248,8 @@ typedef struct { Bit#(`BeatBurstWidth) burst; // Is this the final burst of routing records for the current key? Bool finalBurst; + // Are we processing a max-sized key (which must contain an IND record)? + Bool isMaxSizedKey; } InflightFetcherReqInfo deriving (Bits, FShow); // Routing beat, tagged with the beat number in the DRAM burst @@ -304,6 +320,12 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); // Final output queue for flits Queue1#(RoutedFlit) flitOutQueue <- mkUGShiftQueue(QueueOptFmax); + // Indirection queue and size + SizedQueue#(`FetcherLogIndQueueSize, IndQueueEntry) indQueue <- + mkUGShiftQueue(QueueOptFmax); + Count#(TAdd#(`FetcherLogIndQueueSize, 1)) indQueueLen <- + mkCount(2 ** `FetcherLogIndQueueSize); + // Activity Reg#(Bit#(1)) incSentReg <- mkDReg(0); Reg#(Bit#(1)) incReceivedReg <- mkDReg(0); @@ -333,6 +355,9 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); // Maintain count of routing beats fetched so far Reg#(Bit#(`LogRoutingEntryLen)) fetchBeatCount <- mkReg(0); + // Track when messages are bypassing fetcher, to keep the bypass atomic + Reg#(Bool) bypassInProgress <- mkReg(False); + // State 0: pass through flits that don't contain routing keys rule consumeMessage0 (consumeState == 0); Flit flit = flitInPort.value; @@ -344,29 +369,54 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); found = True; chosen = fromInteger(i); end - chosenReg <= chosen; // Initialise counters for subsequent states consumeFlitCount <= 0; fetchBeatCount <= 0; - // Consume flit - if (flitInPort.canGet) begin - if (flit.dest.addr.isKey) begin - if (found) begin - consumeState <= 1; + // First, try to consume indirection + if (indQueue.canDeq && indQueue.canPeek && !bypassInProgress) begin + IndQueueEntry ind = indQueue.dataOut; + // Consume + indQueue.deq; + 
// Release space in indQueue, unless we have another max-sized key + if (!allHigh(ind.key.numBeats)) + indQueueLen.dec; + // Jump straight to fetch state, as message already in flit buffer + chosenReg <= ind.addr; + consumeKey <= ind.key; + // Proceed only if key size is non-zero + if (ind.key.numBeats != 0) + consumeState <= 2; + end else begin + chosenReg <= chosen; + // Otherwise, try to consume flit + if (flitInPort.canGet) begin + if (flit.dest.addr.isKey) begin + if (found) begin + RoutingKey key = getRoutingKey(flit.dest); + // For a full-size key, we must reserve space in the indQueue + if (allHigh(key.numBeats)) begin + if (indQueueLen.notFull) begin + indQueueLen.inc; + consumeState <= 1; + end + end else + consumeState <= 1; + end + end else if (flitBypassQueue.notFull) begin + flitInPort.get; + bypassInProgress <= flit.notFinalFlit; + // Make routing decision + RoutingDecision decision = RouteNoC; + MailboxNetAddr addr = flit.dest.addr; + if (addr.host.valid) + decision = addr.host.value == 0 ? RouteWest : RouteEast; + else if (addr.board.x < boardId.x) decision = RouteWest; + else if (addr.board.x > boardId.x) decision = RouteEast; + else if (addr.board.y < boardId.y) decision = RouteSouth; + else if (addr.board.y > boardId.y) decision = RouteNorth; + // Insert into bypass queue + flitBypassQueue.enq(RoutedFlit { decision: decision, flit: flit}); end - end else if (flitBypassQueue.notFull) begin - flitInPort.get; - // Make routing decision - RoutingDecision decision = RouteNoC; - MailboxNetAddr addr = flit.dest.addr; - if (addr.host.valid) - decision = addr.host.value == 0 ? 
RouteWest : RouteEast; - else if (addr.board.x < boardId.x) decision = RouteWest; - else if (addr.board.x > boardId.x) decision = RouteEast; - else if (addr.board.y < boardId.y) decision = RouteSouth; - else if (addr.board.y > boardId.y) decision = RouteNorth; - // Insert into bypass queue - flitBypassQueue.enq(RoutedFlit { decision: decision, flit: flit}); end end endrule @@ -398,7 +448,7 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); // State 2: fetch routing beats rule consumeMessage2 (consumeState == 2); // Have we finished fetching beats? - Bool finished = fetchBeatCount+`ProgRouterMaxBurst >= consumeKey.numBeats; + Bool finished = (consumeKey.numBeats-fetchBeatCount) <= `ProgRouterMaxBurst; // Prepare inflight RAM request info // (to handle out of order resps from the RAMs) InflightFetcherReqInfo info; @@ -406,6 +456,7 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); info.burst = truncate( min(consumeKey.numBeats - fetchBeatCount, `ProgRouterMaxBurst)); info.finalBurst = finished; + info.isMaxSizedKey = allHigh(consumeKey.numBeats); // Prepare RAM request DRAMReq req; req.isStore = False; @@ -559,20 +610,16 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); decision = RouteNoC; end // 48-bit Indirection - IND: begin - INDRecord rec = unpack(beat.chunks[4]); - flit.dest.threads = {?, rec.newKey}; - decision = RouteLoop; - end + IND: begin end endcase // Is output queue ready for new flit? 
Bool emit = flitProcessedQueue.notFull; let newFlitCount = emitFlitCount; // Consume routing record if (emit) begin - flitProcessedQueue.enq(RoutedFlit { decision: decision, flit: flit }); - // Move to next record - recordCount <= recordCount + 1; + // Only enqueue if not an IND record + if (tag != IND) + flitProcessedQueue.enq(RoutedFlit { decision: decision, flit: flit }); // Shift beat to point to next record RoutingBeat newBeat = beat; Bool doubleChunk = unpack(pack(tag)[0]); @@ -585,21 +632,32 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); for (Integer i = 4; i > 0; i=i-1) newBeat.chunks[i] = beat.chunks[i-1]; end - beatReg <= NumberedRoutingBeat { - beat: newBeat, beatNum: beatNum, info: info }; - // Is this the final record in the beat? - if ((recordCount+1) == truncate(beat.size)) begin - interpreterState <= 0; - // Have we finished with this message yet? - if (info.finalBurst && info.burst == (beatNum+1)) begin - // Reclaim message slot in flit buffer - flitBufferUsedSlots[info.msgAddr].clear; - end - end // Is this the final flit in the message? if (flit.notFinalFlit) newFlitCount = emitFlitCount + 1; else begin + // Move to next record + recordCount <= recordCount + 1; + beatReg <= NumberedRoutingBeat { + beat: newBeat, beatNum: beatNum, info: info }; + // Handle IND record: insert into indirection queue + if (tag == IND) begin + myAssert(indQueue.notFull, "Restrictions on IND records violated"); + INDRecord ind = unpack(beat.chunks[4]); + indQueue.enq(IndQueueEntry + { key: unpack(ind.newKey), addr: info.msgAddr }); + end + // Is this the final record in the beat? + if ((recordCount+1) == truncate(beat.size)) begin + interpreterState <= 0; + // Have we finished with this message yet? + if (info.finalBurst && info.burst == (beatNum+1)) begin + // Reclaim message slot in flit buffer + // (Don't do this when we have an indirection to process) + if (! 
info.isMaxSizedKey) + flitBufferUsedSlots[info.msgAddr].clear; + end + end incSentReg <= 1; newFlitCount = 0; end @@ -661,46 +719,46 @@ endmodule typedef function Bool selector(RoutedFlit flit) SelectorFunc; module mkProgRouterCrossbar#( - Vector#(n, SelectorFunc) f, - Vector#(n, BOut#(RoutedFlit)) out) - (Vector#(n, BOut#(RoutedFlit))) - provisos (Add#(a_, 1, n)); + Vector#(numOut, SelectorFunc) f, + Vector#(numIn, BOut#(RoutedFlit)) out) + (Vector#(numOut, BOut#(RoutedFlit))) + provisos(Add#(a__, 1, numIn)); // Input ports - Vector#(n, InPort#(RoutedFlit)) inPort <- replicateM(mkInPort); + Vector#(numIn, InPort#(RoutedFlit)) inPort <- replicateM(mkInPort); // Connect up input ports - for (Integer i = 0; i < valueOf(n); i=i+1) + for (Integer i = 0; i < valueOf(numIn); i=i+1) connectDirect(out[i], inPort[i].in); // Cosume wires, for each input port - Vector#(n, PulseWire) consumeWire<- replicateM(mkPulseWireOR); + Vector#(numIn, PulseWire) consumeWire <- replicateM(mkPulseWireOR); // Keep track of service history for flit sources (for fair selection) - Vector#(n, Reg#(Bit#(n))) hist <- replicateM(mkReg(0)); + Vector#(numOut, Reg#(Bit#(numIn))) hist <- replicateM(mkReg(0)); // Current choice of flit source - Vector#(n, Reg#(Bit#(n))) choiceReg <- replicateM(mkReg(0)); + Vector#(numOut, Reg#(Bit#(numIn))) choiceReg <- replicateM(mkReg(0)); // Output queue - Vector#(n, Queue1#(RoutedFlit)) outQueue <- + Vector#(numOut, Queue1#(RoutedFlit)) outQueue <- replicateM(mkUGShiftQueue(QueueOptFmax)); // Selector mux for each out queue - for (Integer i = 0; i < valueOf(n); i=i+1) begin + for (Integer i = 0; i < valueOf(numOut); i=i+1) begin rule select; // Vector of input flits and available flits - Vector#(n, RoutedFlit) flits = newVector; - Vector#(n, Bool) nextAvails = newVector; + Vector#(numIn, RoutedFlit) flits = newVector; + Vector#(numIn, Bool) nextAvails = newVector; Bool avail = False; - for (Integer j = 0; j < valueOf(n); j=j+1) begin + for (Integer j = 0; j < 
valueOf(numIn); j=j+1) begin flits[j] = inPort[j].value; nextAvails[j] = inPort[j].canGet && f[i](inPort[j].value) && choiceReg[i][j] == 0; avail = avail || (choiceReg[i][j] == 1 && inPort[j].canGet); end - Bit#(n) nextAvail = pack(nextAvails); + Bit#(numIn) nextAvail = pack(nextAvails); // Choose a new source using fair scheduler match {.newHist, .nextChoice} = sched(hist[i], nextAvail); // Select a flit @@ -721,7 +779,7 @@ module mkProgRouterCrossbar#( hist[i] <= newHist; end // Consume from chosen source - for (Integer j = 0; j < valueOf(n); j=j+1) + for (Integer j = 0; j < valueOf(numIn); j=j+1) if (inPort[j].canGet && choiceReg[i][j] == 1 && outQueue[i].notFull) consumeWire[j].send; endrule @@ -730,7 +788,7 @@ module mkProgRouterCrossbar#( // Consume from flit sources rule consumeFlitSources; - for (Integer j = 0; j < valueOf(n); j=j+1) + for (Integer j = 0; j < valueOf(numIn); j=j+1) if (consumeWire[j]) inPort[j].get; endrule @@ -778,7 +836,7 @@ endmodule interface ProgRouter; // Incoming and outgoing flits interface Vector#(`FetchersPerProgRouter, In#(Flit)) flitIn; - interface Vector#(`FetchersPerProgRouter, BOut#(Flit)) flitOut; + interface Vector#(`ProgRouterCrossbarOutputs, BOut#(Flit)) flitOut; interface Vector#(`MailboxMeshXLen, BOut#(Flit)) nocFlitOut; // Interface to off-chip memory @@ -809,15 +867,14 @@ module mkProgRouter#(BoardId boardId) (ProgRouter); rf.decision == RouteEast || (rf.decision == RouteNoC && xcoord(rf) == 2); function Bool routeW(RoutedFlit rf) = rf.decision == RouteWest || (rf.decision == RouteNoC && xcoord(rf) == 3); - function Bool routeLoop(RoutedFlit rf) = rf.decision == RouteLoop; - Vector#(`FetchersPerProgRouter, SelectorFunc) funcs = - vector(routeN, routeS, routeE, routeW, routeLoop); + Vector#(`ProgRouterCrossbarOutputs, SelectorFunc) funcs = + vector(routeN, routeS, routeE, routeW); // Crossbar function BOut#(RoutedFlit) getFetcherFlitOut(Fetcher f) = f.flitOut; Vector#(`FetchersPerProgRouter, BOut#(RoutedFlit)) 
fetcherOuts = map(getFetcherFlitOut, fetchers); - Vector#(`FetchersPerProgRouter, BOut#(RoutedFlit)) + Vector#(`ProgRouterCrossbarOutputs, BOut#(RoutedFlit)) crossbarOuts <- mkProgRouterCrossbar(funcs, fetcherOuts); // Flit input interfaces @@ -826,18 +883,17 @@ module mkProgRouter#(BoardId boardId) (ProgRouter); flitInIfc[i] = fetchers[i].flitIn; // Flit output interfaces - Vector#(`FetchersPerProgRouter, BOut#(Flit)) flitOutIfc = newVector; + Vector#(`ProgRouterCrossbarOutputs, BOut#(Flit)) flitOutIfc = newVector; Vector#(`MailboxMeshXLen, BOut#(Flit)) nocFlitOutIfc = newVector; // Strands function Bool forNoC(RoutedFlit rf) = rf.decision == RouteNoC; - for (Integer i = 0; i < 4; i=i+1) begin + for (Integer i = 0; i < `ProgRouterCrossbarOutputs; i=i+1) begin match {.noc, .other} <- splitFlits(forNoC, crossbarOuts[i]); flitOutIfc[i] = other; if (i < `MailboxMeshXLen) nocFlitOutIfc[i] = noc; end function Flit toFlit (RoutedFlit rf) = rf.flit; - flitOutIfc[4] <- onBOut(toFlit, crossbarOuts[4]); // RAM interfaces Vector#(`DRAMsPerBoard, Vector#(`FetchersPerProgRouter, In#(DRAMResp))) From db331bdd7597487344525460079acfc6f6d2f665 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Mon, 20 Apr 2020 08:49:35 +0000 Subject: [PATCH 32/78] Better ProgRouter test --- Makefile | 2 +- apps/POLite/progrouters/Makefile | 7 + apps/POLite/progrouters/ProgRoutersTest.cpp | 43 ++++ apps/POLite/progrouters/Run.cpp | 47 ++++ apps/progrouter/Makefile | 53 ----- apps/progrouter/entry.S | 3 - apps/progrouter/genld.sh | 32 --- apps/progrouter/progrouter.cpp | 236 -------------------- apps/progrouter/run.cpp | 12 - include/POLite/ProgRouters.h | 5 +- 10 files changed, 100 insertions(+), 340 deletions(-) create mode 100644 apps/POLite/progrouters/Makefile create mode 100644 apps/POLite/progrouters/ProgRoutersTest.cpp create mode 100644 apps/POLite/progrouters/Run.cpp delete mode 100644 apps/progrouter/Makefile delete mode 100644 apps/progrouter/entry.S delete mode 100755 
apps/progrouter/genld.sh delete mode 100644 apps/progrouter/progrouter.cpp delete mode 100644 apps/progrouter/run.cpp diff --git a/Makefile b/Makefile index d95602f9..133b1533 100644 --- a/Makefile +++ b/Makefile @@ -24,7 +24,6 @@ clean: make -C apps/multiprog clean make -C apps/sync clean make -C apps/temps clean - make -C apps/progrouter clean make -C apps/POLite/heat-gals clean make -C apps/POLite/heat-sync clean make -C apps/POLite/heat-cube-sync clean @@ -40,5 +39,6 @@ clean: make -C apps/POLite/izhikevich-sync clean make -C apps/POLite/pressure-sync clean make -C apps/POLite/hashmin-sync clean + make -C apps/POLite/progrouters clean make -C bin clean make -C tests clean diff --git a/apps/POLite/progrouters/Makefile b/apps/POLite/progrouters/Makefile new file mode 100644 index 00000000..9c0837be --- /dev/null +++ b/apps/POLite/progrouters/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: BSD-2-Clause +APP_CPP = ProgRoutersTest.cpp +APP_HDR = +RUN_CPP = Run.cpp +RUN_H = + +include ../util/polite.mk diff --git a/apps/POLite/progrouters/ProgRoutersTest.cpp b/apps/POLite/progrouters/ProgRoutersTest.cpp new file mode 100644 index 00000000..109565df --- /dev/null +++ b/apps/POLite/progrouters/ProgRoutersTest.cpp @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: BSD-2-Clause +#include + +int main() +{ + // Get thread id + int me = tinselId(); + + // Sample outgoing message + volatile uint32_t* msgOut = (uint32_t*) tinselSendSlot(); + msgOut[0] = 0x10; + msgOut[1] = 0x20; + msgOut[2] = 0x30; + msgOut[3] = 0x40; + msgOut[4] = 0x50; + msgOut[5] = 0x60; + msgOut[6] = 0x70; + msgOut[7] = 0x80; + + // On thread 0, send to key supplied by host + if (me == 0) { + tinselSetLen(1); + tinselWaitUntil(TINSEL_CAN_RECV); + volatile uint32_t* msgIn = (uint32_t*) tinselRecv(); + uint32_t key = msgIn[0]; + tinselFree(msgIn); + + tinselWaitUntil(TINSEL_CAN_SEND); + tinselKeySend(key, msgOut); + } + + // Print anything received + while (1) { + tinselWaitUntil(TINSEL_CAN_RECV); + 
volatile uint32_t* msgIn = (uint32_t*) tinselRecv(); + printf("%x %x %x %x %x %x %x %x\n", + msgIn[0], msgIn[1], msgIn[2], msgIn[3] + , msgIn[4], msgIn[5], msgIn[6], msgIn[7]); + tinselFree(msgIn); + } + + return 0; +} diff --git a/apps/POLite/progrouters/Run.cpp b/apps/POLite/progrouters/Run.cpp new file mode 100644 index 00000000..4a7ad6ed --- /dev/null +++ b/apps/POLite/progrouters/Run.cpp @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: BSD-2-Clause +#include +#include + +int main(int argc, char **argv) +{ + // Connection to tinsel machine + HostLink hostLink; + + // Create routing tables + ProgRouterMesh mesh(TinselMeshXLenWithinBox, TinselMeshYLenWithinBox); + + // Board (1, 0) + for (int i = 0; i < 60; i++) { + uint64_t mask = 1ul << i; + mesh.table[0][1].addMRM(1, 0, mask >> 32, mask, 0xf0f0); + } + uint32_t key01 = mesh.table[0][0].genKey(); + + // Board (0, 0) + for (int i = 0; i < 40; i++) { + uint64_t mask = 1ul << i; + mesh.table[0][0].addMRM(1, 0, mask >> 32, mask, 0xf0f0); + } + for (int i = 0; i < 30; i++) { + uint64_t mask = 1ul << i; + mesh.table[0][0].addMRM(1, 1, mask >> 32, mask, 0xf0f0); + } + mesh.table[0][0].addRR(2, key01); // East + uint32_t key00 = mesh.table[0][0].genKey(); + + // Transfer routing tables to FPGAs + mesh.write(&hostLink); + + // Load code and trigger execution + hostLink.boot("code.v", "data.v"); + hostLink.go(); + + // Send key + printf("Sending key %x\n", key00); + uint32_t msg[1 << TinselLogWordsPerMsg]; + msg[0] = key00; + hostLink.send(0, 1, msg); + + hostLink.dumpStdOut(); + return 0; +} diff --git a/apps/progrouter/Makefile b/apps/progrouter/Makefile deleted file mode 100644 index 76c728f5..00000000 --- a/apps/progrouter/Makefile +++ /dev/null @@ -1,53 +0,0 @@ -# Tinsel root -TINSEL_ROOT=../.. 
- -ifndef QUARTUS_ROOTDIR - $(error Please set QUARTUS_ROOTDIR) -endif - -include $(TINSEL_ROOT)/globals.mk - -# RISC-V compiler flags -CFLAGS = $(RV_CFLAGS) -O2 -I $(INC) -LDFLAGS = -melf32lriscv -G 0 - -.PHONY: all -all: code.v data.v run - -code.v: progrouter.elf - checkelf.sh progrouter.elf - $(RV_OBJCOPY) -O verilog --only-section=.text progrouter.elf code.v - -data.v: progrouter.elf - $(RV_OBJCOPY) -O verilog --remove-section=.text \ - --set-section-flags .bss=alloc,load,contents \ - progrouter.elf data.v - -progrouter.elf: progrouter.cpp link.ld $(INC)/config.h $(INC)/tinsel.h entry.o $(LIB)/lib.o - $(RV_CPPC) $(CFLAGS) -Wall -c -o progrouter.o progrouter.cpp - $(RV_LD) $(LDFLAGS) -T link.ld -o progrouter.elf entry.o progrouter.o $(LIB)/lib.o - -entry.o: - $(RV_CPPC) $(CFLAGS) -Wall -c -o entry.o entry.S - -link.ld: genld.sh - ./genld.sh > link.ld - -$(LIB)/lib.o: - make -C $(LIB) - -$(INC)/config.h: $(TINSEL_ROOT)/config.py - make -C $(INC) - -$(HL)/%.o: - make -C $(HL) - -run: run.cpp $(HL)/*.o - g++ -O2 -I $(INC) -I $(HL) -o run run.cpp $(HL)/*.o - -sim: run.cpp $(HL)/sim/*.o - g++ -O2 -I $(INC) -I $(HL) -o sim run.cpp $(HL)/sim/*.o - -.PHONY: clean -clean: - rm -f *.o *.elf link.ld *.v run sim diff --git a/apps/progrouter/entry.S b/apps/progrouter/entry.S deleted file mode 100644 index 18cd8d27..00000000 --- a/apps/progrouter/entry.S +++ /dev/null @@ -1,3 +0,0 @@ -# We assume the boot loader has already setup the stack. -# All we need to do is jump to main. -j main diff --git a/apps/progrouter/genld.sh b/apps/progrouter/genld.sh deleted file mode 100755 index cfe144c4..00000000 --- a/apps/progrouter/genld.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -# Load config parameters -while read -r EXPORT; do - eval $EXPORT -done <<< `python ../../config.py envs` - -# Compute space available for instructions -MaxInstrBytes=$((4 * 2**$LogInstrsPerCore - $MaxBootImageBytes)) - -cat - << EOF -/* THIS FILE HAS BEEN GENERATED AUTOMATICALLY. */ -/* DO NOT MODIFY. 
INSTEAD, MODIFY THE genld.sh SCRIPT. */ - -OUTPUT_ARCH( "riscv" ) - -MEMORY -{ - instrs : ORIGIN = $MaxBootImageBytes, LENGTH = $MaxInstrBytes - globals : ORIGIN = $DRAMBase, LENGTH = $DRAMGlobalsLength -} - -SECTIONS -{ - .text : { *.o(.text*) } > instrs - .bss : { *.o(.bss*) } > globals = 0 - .rodata : { *.o(.rodata*) } > globals - .sdata : { *.o(.sdata*) } > globals - .data : { *.o(.data*) } > globals - __heapBase = ALIGN(.); -} -EOF diff --git a/apps/progrouter/progrouter.cpp b/apps/progrouter/progrouter.cpp deleted file mode 100644 index 28fb494c..00000000 --- a/apps/progrouter/progrouter.cpp +++ /dev/null @@ -1,236 +0,0 @@ -#include - -// Simplest possible example involving programmable routers - -/* -Byte ordering in a routing beat: - - 31: Upper byte of length (i.e. number of records in beat) - 30: Lower byte of length - 29: Upper byte of first chunk - 28: - 27: - 26: - 25: - 24: Lower byte of first chunk - 23: Upper byte of second chunk - 22: - 21: - 20: - 19: - 18: Lower byte of second chunk - 17: Upper byte of third chunk - 16: - 15: - 14: - 13: - 12: Lower byte of third chunk - 11: Upper byte of fourth chunk - 10: - 9: - 8: - 7: - 6: Lower byte of fourth chunk - 5: Upper byte of fifth chunk - 4: - 3: - 2: - 1: - 0: Lower byte of fifth chunk - -Need to fold this into the docs eventually. 
-*/ - -// Use this to align on beat boundary -#define ALIGNED __attribute__((aligned(32))) - -// A single RAM beat -struct ALIGNED Beat { - uint8_t bytes[32]; -}; - -// Routing table, with methods to aid construction -template struct RoutingTable { - // Raw beats comprising the table - Beat beats[NumBeats]; - - // Number of chunks used so far in current beat - uint32_t numChunks; - - // Number of records used so far in current beat - uint32_t numRecords; - - // Index of beat currently being filled - uint32_t currentBeat; - - // Constructor - RoutingTable() { - currentBeat = 0; - numChunks = numRecords = 0; - } - - // Pointer to current beat being filled - uint8_t* currentPointer() { - return beats[currentBeat].bytes; - } - - // Move on to next the beat - void next() { - beats[currentBeat].bytes[31] = 0; - beats[currentBeat].bytes[30] = numRecords; - numChunks = 0; - numRecords = 0; - currentBeat++; - } - - // Add a URM1 record to the table - void addURM1(uint32_t mboxX, uint32_t mboxY, - uint32_t mboxThread, uint32_t localKey) { - if (numChunks == 5) next(); - uint8_t* ptr = beats[currentBeat].bytes + 6*(4-numChunks); - ptr[0] = localKey; - ptr[1] = localKey >> 8; - ptr[2] = localKey >> 16; - ptr[3] = localKey >> 24; - ptr[4] = (mboxThread&0x1f) << 3; - ptr[5] = (mboxY << 3) | (mboxX << 1) | (mboxThread >> 5); - numChunks++; - numRecords++; - } - - // Add a URM2 record to the table - void addURM2(uint32_t mboxX, uint32_t mboxY, uint32_t mboxThread, - uint32_t localKeyHigh, uint32_t localKeyLow) { - if (numChunks >= 4) next(); - uint8_t* ptr = beats[currentBeat].bytes + 6*(3-numChunks); - ptr[0] = localKeyLow; - ptr[1] = localKeyLow >> 8; - ptr[2] = localKeyLow >> 16; - ptr[3] = localKeyLow >> 24; - ptr[4] = localKeyHigh; - ptr[5] = localKeyHigh >> 8; - ptr[6] = localKeyHigh >> 16; - ptr[7] = localKeyHigh >> 24; - ptr[10] = (mboxThread&0x1f) << 3; - ptr[11] = (1 << 5) | (mboxY << 3) | (mboxX << 1) | (mboxThread >> 5); - numChunks += 2; - numRecords++; - } - - // Add 
an MRM record to the table - void addMRM(uint32_t mboxX, uint32_t mboxY, - uint32_t threadsHigh, uint32_t threadsLow, - uint16_t localKey) { - if (numChunks >= 4) next(); - uint8_t* ptr = beats[currentBeat].bytes + 6*(3-numChunks); - ptr[0] = threadsLow; - ptr[1] = threadsLow >> 8; - ptr[2] = threadsLow >> 16; - ptr[3] = threadsLow >> 24; - ptr[4] = threadsHigh; - ptr[5] = threadsHigh >> 8; - ptr[6] = threadsHigh >> 16; - ptr[7] = threadsHigh >> 24; - ptr[8] = localKey; - ptr[9] = localKey >> 8; - ptr[11] = (3 << 5) | (mboxY << 3) | (mboxX << 1); - numChunks += 2; - numRecords++; - } - - // Add an IND record to the table - // Return a pointer to the indirection key, - // so it can be set later by the caller - uint8_t* addIND() { - if (numChunks == 5) next(); - uint8_t* ptr = beats[currentBeat].bytes + 6*(4-numChunks); - ptr[5] = 4 << 5; - numChunks++; - numRecords++; - return ptr; - } - - // Set indirection key - void setIND(uint8_t* ind, bool upperRam, - uint8_t* beatPtr, uint32_t numBeats) { - uint32_t key = (uint32_t) beatPtr | numBeats; - if (upperRam) key |= 0x80000000; - ind[0] = key; - ind[1] = key >> 8; - ind[2] = key >> 16; - ind[3] = key >> 24; - } - - // Add an RR record to the table - void addRR(uint32_t dir, uint32_t key) { - if (numChunks == 5) next(); - uint8_t* ptr = beats[currentBeat].bytes + 6*(4-numChunks); - ptr[0] = key; - ptr[1] = key >> 8; - ptr[2] = key >> 16; - ptr[3] = key >> 24; - ptr[5] = (2 << 5) | (dir << 3); - numChunks++; - numRecords++; - } -}; - -// Create global routing table of 16 beats -RoutingTable<16> table; - -int main() -{ - // Get thread id - int me = tinselId(); - - // Sample outgoing message - volatile uint32_t* msgOut = (uint32_t*) tinselSendSlot(); - msgOut[0] = 0x10; - msgOut[1] = 0x20; - msgOut[2] = 0x30; - msgOut[3] = 0x40; - msgOut[4] = 0x50; - msgOut[5] = 0x60; - msgOut[6] = 0x70; - msgOut[7] = 0x80; - - // On thread 0 - if (me == 0) { -tinselSetLen(1); - // Add an URM1 record - uint8_t* entry1 = 
table.currentPointer(); - table.addURM1(0, 0, 10, 0xfff); - table.addURM2(0, 0, 60, 0xff1, 0xff0); - table.addURM2(0, 0, 60, 0xff3, 0xff2); - table.addURM2(0, 0, 60, 0xff5, 0xff4); - //table.addMRM(1, 0, 0x22222222, 0x11111111, 0x2222); - table.next(); - - // Cache flush, to write table into RAM - tinselCacheFlush(); - // Wait until flush done, by issuing a load - volatile uint32_t* dummyPtr = (uint32_t*) entry1; dummyPtr[0]; - - // Construct key - uint32_t key = (uint32_t) entry1; - key = key | 2; // Entry is 2 beats long - - // Send message to key - tinselWaitUntil(TINSEL_CAN_SEND); - tinselKeySend(key, msgOut); - - while (1); - } - - // On other threads, print anything received - while (me != 0) { - tinselWaitUntil(TINSEL_CAN_RECV); - volatile uint32_t* msgIn = (uint32_t*) tinselRecv(); - printf("%x %x %x %x %x %x %x %x\n", - msgIn[0], msgIn[1], msgIn[2], msgIn[3] - , msgIn[4], msgIn[5], msgIn[6], msgIn[7]); - tinselFree(msgIn); - } - - return 0; -} diff --git a/apps/progrouter/run.cpp b/apps/progrouter/run.cpp deleted file mode 100644 index a198a064..00000000 --- a/apps/progrouter/run.cpp +++ /dev/null @@ -1,12 +0,0 @@ -#include - -int main() -{ - HostLink hostLink; - - hostLink.boot("code.v", "data.v"); - hostLink.go(); - hostLink.dumpStdOut(); - - return 0; -} diff --git a/include/POLite/ProgRouters.h b/include/POLite/ProgRouters.h index 8fe3c143..45d12cbd 100644 --- a/include/POLite/ProgRouters.h +++ b/include/POLite/ProgRouters.h @@ -227,14 +227,13 @@ inline uint32_t destMboxY(uint32_t mbox) { // ============================ class ProgRouterMesh { - // 2D array of tables; - ProgRouter** table; - // Board mesh dimensions uint32_t boardsX; uint32_t boardsY; public: + // 2D array of tables; + ProgRouter** table; // Constructor ProgRouterMesh(uint32_t numBoardsX, uint32_t numBoardsY) { From 468f65f1d59bf11989bf343d154086b7e6ff2c21 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Mon, 20 Apr 2020 14:08:18 +0100 Subject: [PATCH 33/78] Fix mistake in beat 
shifter --- apps/POLite/progrouters/Run.cpp | 10 +++++----- rtl/ProgRouter.bsv | 4 +--- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/apps/POLite/progrouters/Run.cpp b/apps/POLite/progrouters/Run.cpp index 4a7ad6ed..c2b27bd2 100644 --- a/apps/POLite/progrouters/Run.cpp +++ b/apps/POLite/progrouters/Run.cpp @@ -8,21 +8,21 @@ int main(int argc, char **argv) HostLink hostLink; // Create routing tables - ProgRouterMesh mesh(TinselMeshXLenWithinBox, TinselMeshYLenWithinBox); + ProgRouterMesh mesh(2, 1); // Board (1, 0) - for (int i = 0; i < 60; i++) { + for (int i = 0; i < 2; i++) { uint64_t mask = 1ul << i; mesh.table[0][1].addMRM(1, 0, mask >> 32, mask, 0xf0f0); } - uint32_t key01 = mesh.table[0][0].genKey(); + uint32_t key01 = mesh.table[0][1].genKey(); // Board (0, 0) - for (int i = 0; i < 40; i++) { + for (int i = 0; i < 2; i++) { uint64_t mask = 1ul << i; mesh.table[0][0].addMRM(1, 0, mask >> 32, mask, 0xf0f0); } - for (int i = 0; i < 30; i++) { + for (int i = 0; i < 2; i++) { uint64_t mask = 1ul << i; mesh.table[0][0].addMRM(1, 1, mask >> 32, mask, 0xf0f0); } diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index ecf6e927..9c0b5f95 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -624,10 +624,8 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); RoutingBeat newBeat = beat; Bool doubleChunk = unpack(pack(tag)[0]); if (doubleChunk) begin - for (Integer i = 4; i > 2; i=i-2) begin + for (Integer i = 4; i > 1; i=i-1) newBeat.chunks[i] = beat.chunks[i-2]; - newBeat.chunks[i-1] = beat.chunks[i-3]; - end end else begin for (Integer i = 4; i > 0; i=i-1) newBeat.chunks[i] = beat.chunks[i-1]; From 249e0915da65f978e13fd2604a105e6be4d7f056 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 21 Apr 2020 13:16:40 +0000 Subject: [PATCH 34/78] Fix E/W mixup in POLite --- include/POLite/ProgRouters.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/POLite/ProgRouters.h b/include/POLite/ProgRouters.h 
index 45d12cbd..90083802 100644 --- a/include/POLite/ProgRouters.h +++ b/include/POLite/ProgRouters.h @@ -250,7 +250,7 @@ class ProgRouterMesh { Seq* dests) { assert(dests->numElems > 0); - // Categorise non-local dests into local, N, S, E, and W groups + // Categorise dests into local, N, S, E, and W groups Seq local(dests->numElems); Seq north(dests->numElems); Seq south(dests->numElems); @@ -260,8 +260,8 @@ class ProgRouterMesh { PRoutingDest dest = dests->elems[i]; uint32_t receiverX = destX(dest.mbox); uint32_t receiverY = destY(dest.mbox); - if (receiverX < senderX) east.append(dest); - else if (receiverX > senderX) west.append(dest); + if (receiverX < senderX) west.append(dest); + else if (receiverX > senderX) east.append(dest); else if (receiverY < senderY) south.append(dest); else if (receiverY > senderY) north.append(dest); else local.append(dest); From d4d4da8854a480c6f6d22b41f948587ed0624ea8 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 21 Apr 2020 22:38:01 +0100 Subject: [PATCH 35/78] Fix another bug --- rtl/ProgRouter.bsv | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index 9c0b5f95..b9cdc469 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -470,8 +470,10 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); ramReqQueue[consumeKey.ram].enq(req); fetchBeatCount <= fetchBeatCount + zeroExtend(req.burst); beatBufferLen.incBy(zeroExtend(req.burst)); - incReceivedReg <= 1; - if (finished) consumeState <= 0; + if (finished) begin + consumeState <= 0; + incReceivedReg <= 1; + end end endrule From 5f6eafa9ff4d07a5b2b2479f2cf959cb53bddb24 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Wed, 22 Apr 2020 09:13:00 +0100 Subject: [PATCH 36/78] Set board id in flits emitted by ProgRouter This was a sutble problem due to the way tinselKeySend works: it uses a zero address with the isKey bit set as the message destination. 
This means that when a message reaches the ProgRouter on a board other than (0, 0), the board-id of any genenerated local messages is incorrect. This could be easily solved in software by tweaking tinselKeySend, but it's simpler (from a docs/semantics perspective) and safer to solve it in hardware. --- rtl/ProgRouter.bsv | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index b9cdc469..1ea7a8c6 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -545,6 +545,9 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); // Modify flit by interpreting routing key RoutingDecision decision = ?; Flit flit = flitBuffer.dataOut; + // Unless otherwise stated (e.g. RR records), + // flits emitted will be destined for this board + flit.dest.addr.board = boardId; case (tag) // 48-bit Unicast Router-to-Mailbox URM1: begin From ce65fc5d1abc2c4f11c117e713d842e210eeafd2 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Wed, 22 Apr 2020 20:37:45 +0100 Subject: [PATCH 37/78] Exploring impact of an extra fetcher Introduce an additional fetcher for messages arriving locally. How does this impact performance (area, throughput)? 
--- config.py | 2 +- rtl/Network.bsv | 12 ++++++++---- rtl/ProgRouter.bsv | 18 +++++++++--------- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/config.py b/config.py index 1d2e5b07..e9917ae5 100755 --- a/config.py +++ b/config.py @@ -380,7 +380,7 @@ def quoted(s): return "'\"" + s + "\"'" # Parameters for programmable routers # (and the routing-record fetchers they contain) -p["FetchersPerProgRouter"] = 5 +p["FetchersPerProgRouter"] = 6 p["LogFetcherFlitBufferSize"] = 5 #============================================================================== diff --git a/rtl/Network.bsv b/rtl/Network.bsv index f8396509..baf36d63 100644 --- a/rtl/Network.bsv +++ b/rtl/Network.bsv @@ -401,10 +401,14 @@ module mkNoC#( // Connect mailbox mesh south rim to board router function List#(t) single(t elem) = List::cons(elem, Nil); - List#(Out#(Flit)) botOutList = Nil; - for (Integer x = `MailboxMeshXLen-1; x >= 0; x=x-1) - botOutList = Cons(routers[0][x].bottomOut, botOutList); - reduceConnect(mkFlitMerger, botOutList, single(boardRouter.flitIn[4])); + List#(Out#(Flit)) botOutList0 = Nil; + List#(Out#(Flit)) botOutList1 = Nil; + for (Integer x = `MailboxMeshXLen-1; x >= 0; x=x-2) begin + botOutList0 = Cons(routers[0][x].bottomOut, botOutList0); + botOutList1 = Cons(routers[0][x-1].bottomOut, botOutList1); + end + reduceConnect(mkFlitMerger, botOutList0, single(boardRouter.flitIn[4])); + reduceConnect(mkFlitMerger, botOutList1, single(boardRouter.flitIn[5])); // Connect board router to mailbox mesh south rim function In#(Flit) getBottomIn(MeshRouter r) = r.bottomIn; diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index 1ea7a8c6..0cfd6e16 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -179,15 +179,15 @@ typedef struct { // NoC edge, but the diagram assumes four. 
// -// N S E W L0..L3 Input flits -// | | | | | -// +---+ +---+ +---+ +---+ +---+ -// | F | | F | | F | | F | | F | Fetchers -// +---+ +---+ +---+ +---+ +---+ -// | | | | | -// +---------------------------+ -// | Crossbar | Routing -// +---------------------------+ +// N S E W L0/L1 L2/L3 Input flits +// | | | | | | +// +---+ +---+ +---+ +---+ +---+ +---+ +// | F | | F | | F | | F | | F | | F | Fetchers +// +---+ +---+ +---+ +---+ +---+ +---+ +// | | | | | | +// +---------------------------------+ +// | Crossbar | Routing +// +---------------------------------+ // | | | | // N/L0 S/L1 E/L2 W/L3 Output queues // | | | | From 100b18147044d137454d61f09759f4905cb16bd8 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Sun, 26 Apr 2020 23:02:09 +0100 Subject: [PATCH 38/78] Throttle ProgRouter's RAM access --- rtl/Connections.bsv | 65 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 59 insertions(+), 6 deletions(-) diff --git a/rtl/Connections.bsv b/rtl/Connections.bsv index 013224c9..2e472c37 100644 --- a/rtl/Connections.bsv +++ b/rtl/Connections.bsv @@ -7,6 +7,7 @@ import DRAM :: *; import Queue :: *; import DCache :: *; import DCacheTypes :: *; +import Util :: *; // ============================================================================ // DCache <-> Core connections @@ -54,17 +55,54 @@ endmodule module connectClientsToOffChipRAM#( // Data caches Vector#(`DCachesPerDRAM, DCache) caches, - // Programmable per-board router, reqs and resps + // Reqs and resps from ProgRouter's fetchers Vector#(`FetchersPerProgRouter, BOut#(DRAMReq)) routerReqs, Vector#(`FetchersPerProgRouter, In#(DRAMResp)) routerResps, // Off-chip memory OffChipRAM ram) (); - // Connect requests + // Count the number of outstanding fetcher requests + // Used to throttle the fetcher requests to avoid starving/blocking + // the cache requests + Integer throttleCount = 2 ** (`DRAMLogMaxInFlight - 1); + Count#(`DRAMLogMaxInFlight) fetcherCount <- mkCount(throttleCount); + + // Merge cache 
requests function getReqOut(cache) = cache.reqOut; - let reqs <- mkMergeTreeB(Fair, - mkUGShiftQueue1(QueueOptFmax), - append(map(getReqOut, caches), routerReqs)); + Out#(DRAMReq) cacheReqs <- + mkMergeTreeB(Fair, + mkUGShiftQueue1(QueueOptFmax), + map(getReqOut, caches)); + Queue1#(DRAMReq) cacheReqsQueue <- mkUGShiftQueue1(QueueOptFmax); + connectToQueue(cacheReqs, cacheReqsQueue); + BOut#(DRAMReq) cacheReqsB = queueToBOut(cacheReqsQueue); + + // Merge router requests + Out#(DRAMReq) fetcherReqs <- + mkMergeTreeB(Fair, + mkUGShiftQueue1(QueueOptFmax), + routerReqs); + Queue1#(DRAMReq) fetcherReqsQueue <- mkUGShiftQueue1(QueueOptFmax); + connectToQueue(fetcherReqs, fetcherReqsQueue); + BOut#(DRAMReq) fetcherReqsB = queueToBOut(fetcherReqsQueue); + + // Update count on router request + BOut#(DRAMReq) fetcherReqsIncCountB = + interface BOut + method Action get = + action + fetcherReqsB.get; + fetcherCount.incBy(zeroExtend(fetcherReqsB.value.burst)); + endaction; + method Bool valid = fetcherReqsB.valid && + (fetcherCount.available + + zeroExtend(fetcherReqsB.value.burst) <= + fromInteger(throttleCount)); + method DRAMReq value = fetcherReqsB.value; + endinterface; + + // Merge cache and router requests, and connect to off-chip RAM + let reqs <- mkMergeTwoB(Fair, cacheReqsB, fetcherReqsIncCountB); connectUsing(mkUGQueue, reqs, ram.reqIn); // Connect load responses @@ -74,7 +112,22 @@ module connectClientsToOffChipRAM#( getRespKey, mkUGShiftQueue2(QueueOptFmax), append(map(getRespIn, caches), routerResps)); - connectDirect(ram.respOut, ramResps); + + // Update count on respose + BOut#(DRAMResp) ramRespOutDecCount = + interface BOut + method Action get = + action + ram.respOut.get; + if (ram.respOut.value.id >= fromInteger(`DCachesPerDRAM)) + fetcherCount.dec; + endaction; + method Bool valid = ram.respOut.valid; + method DRAMResp value = ram.respOut.value; + endinterface; + + // Connect responses from off-chip RAM + connectDirect(ramRespOutDecCount, ramResps); 
endmodule From cc785bdcc01bcf6253752467777c94715cfac087 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Sun, 26 Apr 2020 23:07:08 +0100 Subject: [PATCH 39/78] Fix throttle condition --- rtl/Connections.bsv | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/rtl/Connections.bsv b/rtl/Connections.bsv index 2e472c37..cd762f66 100644 --- a/rtl/Connections.bsv +++ b/rtl/Connections.bsv @@ -95,9 +95,7 @@ module connectClientsToOffChipRAM#( fetcherCount.incBy(zeroExtend(fetcherReqsB.value.burst)); endaction; method Bool valid = fetcherReqsB.valid && - (fetcherCount.available + - zeroExtend(fetcherReqsB.value.burst) <= - fromInteger(throttleCount)); + zeroExtend(fetcherReqsB.value.burst) <= fetcherCount.available; method DRAMReq value = fetcherReqsB.value; endinterface; From e58f70d7aa7cade6ef1fe2b03da3aed1b38948f9 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Sun, 26 Apr 2020 23:16:34 +0100 Subject: [PATCH 40/78] Preserve order in mkMergeTreeB --- rtl/Interface.bsv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rtl/Interface.bsv b/rtl/Interface.bsv index 0484cb41..a7cd0e91 100644 --- a/rtl/Interface.bsv +++ b/rtl/Interface.bsv @@ -412,7 +412,7 @@ module mkMergeTreeB#(MergeMethod m, module#(SizedQueue#(d, t)) mkQ, xs = List::cons(x, xs); end - let out <- mkMergeTreeList(m, mkQ, xs); + let out <- mkMergeTreeList(m, mkQ, List::reverse(xs)); return out; endmodule From 2c76edc82887a2e16a185842789e1de27c699834 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Wed, 29 Apr 2020 15:14:48 +0100 Subject: [PATCH 41/78] Use full-rate queue at root of cache request tree When adding the ProgRouter to the RAM request tree, the max rate of requests from caches was halved even though the ProgRouter may not be busy. Not desirable, but easy to fix: keep a full-rate queue at the root of the cache request tree. 
--- rtl/Connections.bsv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rtl/Connections.bsv b/rtl/Connections.bsv index cd762f66..214c30f7 100644 --- a/rtl/Connections.bsv +++ b/rtl/Connections.bsv @@ -73,7 +73,7 @@ module connectClientsToOffChipRAM#( mkMergeTreeB(Fair, mkUGShiftQueue1(QueueOptFmax), map(getReqOut, caches)); - Queue1#(DRAMReq) cacheReqsQueue <- mkUGShiftQueue1(QueueOptFmax); + Queue#(DRAMReq) cacheReqsQueue <- mkUGQueue; connectToQueue(cacheReqs, cacheReqsQueue); BOut#(DRAMReq) cacheReqsB = queueToBOut(cacheReqsQueue); From 5a4beabfb681913286c3473312823e35134ba2bb Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Thu, 30 Apr 2020 15:43:52 +0100 Subject: [PATCH 42/78] ProgRouter performance counters Accessible from core zero on each board only. --- README.md | 3 ++ rtl/Connections.bsv | 19 ++++++++++++ rtl/Core.bsv | 69 +++++++++++++++++++++++++++++++++----------- rtl/DE5Top.bsv | 4 +++ rtl/IdleDetector.bsv | 27 ----------------- rtl/Network.bsv | 7 ++++- rtl/ProgRouter.bsv | 47 +++++++++++++++++++++++++++++- rtl/Util.bsv | 27 +++++++++++++++++ 8 files changed, 157 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index a3340c3f..871c9e5e 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,9 @@ New section on programmable routers: * Restrictions on IND records * Avoiding deadlock: programmer has some added resposibility here +New performance counters accessible from core zero on each board only: + * `ProgRouterSent` and `ProgRouterSentInterBoard` + # Tinsel 0.7.1 Tinsel is a [RISC-V](https://riscv.org/)-based manythread diff --git a/rtl/Connections.bsv b/rtl/Connections.bsv index 214c30f7..d093001b 100644 --- a/rtl/Connections.bsv +++ b/rtl/Connections.bsv @@ -8,6 +8,8 @@ import Queue :: *; import DCache :: *; import DCacheTypes :: *; import Util :: *; +import ProgRouter :: *; +import Core :: *; // ============================================================================ // DCache <-> Core connections @@ -129,4 
+131,21 @@ module connectClientsToOffChipRAM#( endmodule +// ============================================================================ +// ProgRouter performance counter connections +// ============================================================================ + +module connectProgRouterPerfCountersToCores#( + ProgRouterPerfCounters counters, Vector#(n, Core) cores) (Empty); + rule connect; + // Only core zero can access the ProgRouter perf counters + cores[0].progRouterPerfClient.incSent(counters.incSent); + cores[0].progRouterPerfClient.incSentInterBoard(counters.incSentInterBoard); + for (Integer i = 1; i < valueOf(n); i=i+1) begin + cores[i].progRouterPerfClient.incSent(?); + cores[i].progRouterPerfClient.incSentInterBoard(?); + end + endrule +endmodule + endpackage diff --git a/rtl/Core.bsv b/rtl/Core.bsv index 1d35d278..4c454c98 100644 --- a/rtl/Core.bsv +++ b/rtl/Core.bsv @@ -25,6 +25,7 @@ import FPUOps :: *; import InstrMem :: *; import DCacheTypes :: *; import IdleDetector :: *; +import ProgRouter :: *; // ============================================================================ // Control/status registers (CSRs) supported @@ -60,15 +61,17 @@ import IdleDetector :: *; // Performance Counter CSRs (Optional) // ============================================================================ -// Name | CSR | R/W | Function -// --------------- | ------ | --- | -------- -// PerfCount | 0xc07 | W | Reset(0)/Start(1)/Stop(2) all counters -// MissCount | 0xc08 | R | Cache miss count -// HitCount | 0xc09 | R | Cache hit count -// WritebackCount | 0xc0a | R | Cache writeback count -// CPUIdleCount | 0xc0b | R | CPU idle-cycle count (lower 32 bits) -// CPUIdleCountU | 0xc0c | R | CPU idle-cycle count (upper 8 bits) -// CycleU | 0xc0d | R | Cycle counter (upper 8 bits) +// Name | CSR | R/W | Function +// ------------------- | ------ | --- | -------- +// PerfCount | 0xc07 | W | Reset(0)/Start(1)/Stop(2) all counters +// MissCount | 0xc08 | R | Cache miss count +// 
HitCount | 0xc09 | R | Cache hit count +// WritebackCount | 0xc0a | R | Cache writeback count +// CPUIdleCount | 0xc0b | R | CPU idle-cycle count (lower 32 bits) +// CPUIdleCountU | 0xc0c | R | CPU idle-cycle count (upper 8 bits) +// CycleU | 0xc0d | R | Cycle counter (upper 8 bits) +// ProgRouterSent | 0xc0e | R | Msgs sent by ProgRouter +// ProgRouterSentInter | 0xc0f | R | Inter-board msgs sent by ProgRouter // ============================================================================ // Types @@ -505,12 +508,13 @@ endfunction // ============================================================================ interface Core; - interface DCacheClient dcacheClient; - interface MailboxClient mailboxClient; - interface DebugLinkClient debugLinkClient; - interface FPUClient fpuClient; - interface InstrMemClient instrMemClient; - interface IdleDetectorClient idleClient; + interface DCacheClient dcacheClient; + interface MailboxClient mailboxClient; + interface DebugLinkClient debugLinkClient; + interface FPUClient fpuClient; + interface InstrMemClient instrMemClient; + interface IdleDetectorClient idleClient; + interface ProgRouterPerfClient progRouterPerfClient; // Each core can see its board id (* always_ready, always_enabled *) @@ -676,18 +680,27 @@ module mkCore#(CoreId myId) (Core); Reg#(Bit#(32)) hitCount <- mkConfigReg(0); Reg#(Bit#(32)) writebackCount <- mkConfigReg(0); Reg#(Bit#(40)) cpuIdleCount <- mkConfigReg(0); + // Only core zero maintains the following two counters + Reg#(Bit#(32)) progRouterSent <- mkConfigReg(0); + Reg#(Bit#(32)) progRouterSentInterBoard <- mkConfigReg(0); // Indexable vector of performance counters - Vector#(6, Bit#(32)) perfCounters = + Vector#(8, Bit#(32)) perfCounters = vector(missCount, hitCount, writebackCount, cpuIdleCount[31:0], zeroExtend(cpuIdleCount[39:32]), - zeroExtend(cycleCount[39:32])); + zeroExtend(cycleCount[39:32]), + myId == 0 ? progRouterSent : ?, + myId == 0 ? 
progRouterSentInterBoard : ?); // Increment wires Wire#(Bool) incMissCountWire <- mkDWire(False); Wire#(Bool) incHitCountWire <- mkDWire(False); Wire#(Bool) incWritebackCountWire <- mkDWire(False); Wire#(Bool) incCPUIdleCountWire <- mkDWire(False); + Wire#(Bit#(LogFetchersPerProgRouter)) + incProgRouterSent <- mkBypassWire; + Wire#(Bit#(LogFetchersPerProgRouter)) + incProgRouterSentInterBoard <- mkBypassWire; // Update performance counters rule updatePerfCounters; @@ -696,11 +709,20 @@ module mkCore#(CoreId myId) (Core); hitCount <= 0; writebackCount <= 0; cpuIdleCount <= 0; + if (myId == 0) begin + progRouterSent <= 0; + progRouterSentInterBoard <= 0; + end end else if (perfCountEnabled) begin if (incMissCountWire) missCount <= missCount+1; if (incHitCountWire) hitCount <= hitCount+1; if (incWritebackCountWire) writebackCount <= writebackCount+1; if (incCPUIdleCountWire) cpuIdleCount <= cpuIdleCount+1; + if (myId == 0) begin + progRouterSent <= progRouterSent + zeroExtend(incProgRouterSent); + progRouterSentInterBoard <= progRouterSentInterBoard + + zeroExtend(incProgRouterSentInterBoard); + end end endrule `endif @@ -1321,6 +1343,19 @@ module mkCore#(CoreId myId) (Core); method Bool idleStage1Ack = mailbox.idleStage1Ack; endinterface + interface ProgRouterPerfClient progRouterPerfClient; + method Action incSent(Bit#(LogFetchersPerProgRouter) amount); + `ifdef EnablePerfCount + incProgRouterSent <= amount; + `endif + endmethod + method Action incSentInterBoard(Bit#(LogFetchersPerProgRouter) amount); + `ifdef EnablePerfCount + incProgRouterSentInterBoard <= amount; + `endif + endmethod + endinterface + endmodule endpackage diff --git a/rtl/DE5Top.bsv b/rtl/DE5Top.bsv index 5c353542..b522284d 100644 --- a/rtl/DE5Top.bsv +++ b/rtl/DE5Top.bsv @@ -178,6 +178,10 @@ module de5Top (DE5Top); connectClientsToOffChipRAM(dcaches[i], noc.dramReqs[i], noc.dramResps[i], rams[i]); + // Connects ProgRouter performance counters to cores + 
connectProgRouterPerfCountersToCores(noc.progRouterPerfCounters, + concat(concat(cores))); + // Set board ids rule setBoardIds; for (Integer i = 0; i < `DRAMsPerBoard; i=i+1) diff --git a/rtl/IdleDetector.bsv b/rtl/IdleDetector.bsv index 179a9f41..59e4b530 100644 --- a/rtl/IdleDetector.bsv +++ b/rtl/IdleDetector.bsv @@ -304,33 +304,6 @@ module mkIdleDetector (IdleDetector); endmodule -// Pipelined reduction tree -module mkPipelinedReductionTree#( - function a reduce(a x, a y), - a init, - List#(a) xs) - (a) provisos(Bits#(a, _)); - Integer len = List::length(xs); - if (len == 0) - return error("mkSumList applied to empty list"); - else if (len == 1) - return xs[0]; - else begin - List#(a) ys = xs; - List#(a) reduced = Nil; - for (Integer i = 0; i < len; i=i+2) begin - Reg#(a) r <- mkConfigReg(init); - rule assignOut; - r <= reduce(ys[0], ys[1]); - endrule - ys = List::drop(2, ys); - reduced = Cons(readReg(r), reduced); - end - a res <- mkPipelinedReductionTree(reduce, init, reduced); - return res; - end -endmodule - interface IdleDetectorClient; method Bit#(1) incSent; method Bit#(1) incReceived; diff --git a/rtl/Network.bsv b/rtl/Network.bsv index baf36d63..136d327c 100644 --- a/rtl/Network.bsv +++ b/rtl/Network.bsv @@ -289,8 +289,9 @@ interface NoC; Vector#(`FetchersPerProgRouter, BOut#(DRAMReq))) dramReqs; interface Vector#(`DRAMsPerBoard, Vector#(`FetchersPerProgRouter, In#(DRAMResp))) dramResps; - // ProgRouter fetcher activities + // ProgRouter fetcher activities & performance counters interface Vector#(`FetchersPerProgRouter, FetcherActivity) activities; + interface ProgRouterPerfCounters progRouterPerfCounters; endinterface module mkNoC#( @@ -468,6 +469,10 @@ module mkNoC#( // Fetcher activities interface activities = boardRouter.activities; + // Performance counters + interface ProgRouterPerfCounters progRouterPerfCounters = + boardRouter.perfCounters; + endmodule endpackage diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index 0cfd6e16..2ab0a0d6 
100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -277,8 +277,13 @@ endinterface // Fetcher activity for performance counters and termination detection (* always_ready *) interface FetcherActivity; + // Increment number of sent messages method Bit#(1) incSent; + // Increment number of messages sent to another board + method Bit#(1) incSentInterBoard; + // Increment number of received messages method Bit#(1) incReceived; + // Active (in the termination-detection sense)? method Bool active; endinterface @@ -328,6 +333,7 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); // Activity Reg#(Bit#(1)) incSentReg <- mkDReg(0); + Reg#(Bit#(1)) incSentInterBoardReg <- mkDReg(0); Reg#(Bit#(1)) incReceivedReg <- mkDReg(0); // Stage 1: consume input message @@ -662,6 +668,7 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); end end incSentReg <= 1; + if (tag == RR) incSentInterBoardReg <= 1; newFlitCount = 0; end end @@ -707,6 +714,7 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); interface FetcherActivity activity; method Bit#(1) incSent = incSentReg; + method Bit#(1) incSentInterBoard = incSentInterBoardReg; method Bit#(1) incReceived = incReceivedReg; method Bool active = beatBufferLen.value != 0 || consumeState != 0; @@ -836,6 +844,16 @@ endmodule // Programmable router // ============================================================================= +// Enough bits to store a count of the number of fetchers +typedef TLog#(TAdd#(`FetchersPerProgRouter, 1)) LogFetchersPerProgRouter; + +// ProgRouter's performance counters +(* always_ready, always_enabled *) +interface ProgRouterPerfCounters; + method Bit#(LogFetchersPerProgRouter) incSent; + method Bit#(LogFetchersPerProgRouter) incSentInterBoard; +endinterface + interface ProgRouter; // Incoming and outgoing flits interface Vector#(`FetchersPerProgRouter, In#(Flit)) flitIn; @@ -848,8 +866,9 @@ interface ProgRouter; interface Vector#(`DRAMsPerBoard, 
Vector#(`FetchersPerProgRouter, In#(DRAMResp))) ramResps; - // Activities + // Activities & performance counters interface Vector#(`FetchersPerProgRouter, FetcherActivity) activities; + interface ProgRouterPerfCounters perfCounters; endinterface module mkProgRouter#(BoardId boardId) (ProgRouter); @@ -909,6 +928,21 @@ module mkProgRouter#(BoardId boardId) (ProgRouter); ramRespIfc[i][j] = fetchers[j].ramResps[i]; end + // Performance counters + Vector#(`FetchersPerProgRouter, + Bit#(LogFetchersPerProgRouter)) incSents; + Vector#(`FetchersPerProgRouter, + Bit#(LogFetchersPerProgRouter)) incSentsInterBoard; + for (Integer i = 0; i < `FetchersPerProgRouter; i=i+1) begin + incSents[i] = zeroExtend(fetchers[i].activity.incSent); + incSentsInterBoard[i] = + zeroExtend(fetchers[i].activity.incSentInterBoard); + end + Bit#(LogFetchersPerProgRouter) numSent <- + mkPipelinedReductionTree( \+ , 0, toList(incSents)); + Bit#(LogFetchersPerProgRouter) numSentInterBoard <- + mkPipelinedReductionTree( \+ , 0, toList(incSentsInterBoard)); + function FetcherActivity getActivity(Fetcher f) = f.activity; interface flitIn = flitInIfc; interface flitOut = flitOutIfc; @@ -916,7 +950,18 @@ module mkProgRouter#(BoardId boardId) (ProgRouter); interface ramReqs = ramReqIfc; interface ramResps = ramRespIfc; interface activities = map(getActivity, fetchers); + interface ProgRouterPerfCounters perfCounters; + method incSent = numSent; + method incSentInterBoard = numSentInterBoard; + endinterface endmodule +// For core(s) to access ProgRouter's performance counters +(* always_ready, always_enabled *) +interface ProgRouterPerfClient; + method Action incSent(Bit#(LogFetchersPerProgRouter) amount); + method Action incSentInterBoard(Bit#(LogFetchersPerProgRouter) amount); +endinterface + endpackage diff --git a/rtl/Util.bsv b/rtl/Util.bsv index 507d1ef2..f45ece48 100644 --- a/rtl/Util.bsv +++ b/rtl/Util.bsv @@ -274,4 +274,31 @@ function Tuple2#(Bit#(n), Bit#(n)) sched(Bit#(n) hist, Bit#(n) avail); 
end endfunction +// Pipelined reduction tree +module mkPipelinedReductionTree#( + function a reduce(a x, a y), + a init, + List#(a) xs) + (a) provisos(Bits#(a, _)); + Integer len = List::length(xs); + if (len == 0) + return error("mkSumList applied to empty list"); + else if (len == 1) + return xs[0]; + else begin + List#(a) ys = xs; + List#(a) reduced = Nil; + for (Integer i = 0; i < len; i=i+2) begin + Reg#(a) r <- mkConfigReg(init); + rule assignOut; + r <= reduce(ys[0], ys[1]); + endrule + ys = List::drop(2, ys); + reduced = Cons(readReg(r), reduced); + end + a res <- mkPipelinedReductionTree(reduce, init, reduced); + return res; + end +endmodule + endpackage From 1c8e863fe4f7159974eee8377988d2185c6c786b Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Thu, 30 Apr 2020 18:12:16 +0100 Subject: [PATCH 43/78] Satisfy limitations of mkPipelinedReductionTree --- rtl/ProgRouter.bsv | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index 2ab0a0d6..ae361ed1 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -929,10 +929,10 @@ module mkProgRouter#(BoardId boardId) (ProgRouter); end // Performance counters - Vector#(`FetchersPerProgRouter, - Bit#(LogFetchersPerProgRouter)) incSents; - Vector#(`FetchersPerProgRouter, - Bit#(LogFetchersPerProgRouter)) incSentsInterBoard; + Vector#(TExp#(TLog#(`FetchersPerProgRouter)), + Bit#(LogFetchersPerProgRouter)) incSents = replicate(0); + Vector#(TExp#(TLog#(`FetchersPerProgRouter)), + Bit#(LogFetchersPerProgRouter)) incSentsInterBoard = replicate(0); for (Integer i = 0; i < `FetchersPerProgRouter; i=i+1) begin incSents[i] = zeroExtend(fetchers[i].activity.incSent); incSentsInterBoard[i] = From 324e03a14973fb69837ed7b7d71e2bc6f256ba1b Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Fri, 1 May 2020 09:40:52 +0100 Subject: [PATCH 44/78] Tinsel API for ProgRouter perf counters --- README.md | 10 ++++++++++ include/tinsel.h | 34 
++++++++++++++++++++++++++-------- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 871c9e5e..c6101804 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,16 @@ New section on programmable routers: New performance counters accessible from core zero on each board only: * `ProgRouterSent` and `ProgRouterSentInterBoard` +Document the following: + +```c++ +// Performance counter: number of messages emitted by ProgRouter +INLINE uint32_t tinselProgRouterSent(); + +// Performance counter: number of inter-board messages emitted by ProgRouter +INLINE uint32_t tinselProgRouterSentInterBoard(); +``` + # Tinsel 0.7.1 Tinsel is a [RISC-V](https://riscv.org/)-based manythread diff --git a/include/tinsel.h b/include/tinsel.h index ec26b849..d06e2bfd 100644 --- a/include/tinsel.h +++ b/include/tinsel.h @@ -28,13 +28,15 @@ #define CSR_FLUSH "0xc01" // Performance counter CSRs -#define CSR_PERFCOUNT "0xc07" -#define CSR_MISSCOUNT "0xc08" -#define CSR_HITCOUNT "0xc09" -#define CSR_WBCOUNT "0xc0a" -#define CSR_CPUIDLECOUNT "0xc0b" -#define CSR_CPUIDLECOUNTU "0xc0c" -#define CSR_CYCLEU "0xc0d" +#define CSR_PERFCOUNT "0xc07" +#define CSR_MISSCOUNT "0xc08" +#define CSR_HITCOUNT "0xc09" +#define CSR_WBCOUNT "0xc0a" +#define CSR_CPUIDLECOUNT "0xc0b" +#define CSR_CPUIDLECOUNTU "0xc0c" +#define CSR_CYCLEU "0xc0d" +#define CSR_PROGROUTERSENT "0xc0e" +#define CSR_PROGROUTERSENTINTER "0xc0f" // Get globally unique thread id of caller INLINE uint32_t tinselId() @@ -280,7 +282,7 @@ INLINE uint32_t tinselWritebackCount() return n; } -// Performance counter:: get the CPU-idle count +// Performance counter: get the CPU-idle count INLINE uint32_t tinselCPUIdleCount() { uint32_t n; @@ -304,6 +306,22 @@ INLINE uint32_t tinselCycleCountU() return n; } +// Performance counter: number of messages emitted by ProgRouter +INLINE uint32_t tinselProgRouterSent() +{ + uint32_t n; + asm volatile ("csrrw %0, " CSR_PROGROUTERSENT ", zero" : "=r"(n)); + return n; +} + +// 
Performance counter: number of inter-board messages emitted by ProgRouter +INLINE uint32_t tinselProgRouterSentInterBoard() +{ + uint32_t n; + asm volatile ("csrrw %0, " CSR_PROGROUTERSENTINTER ", zero" : "=r"(n)); + return n; +} + // Get address of any specified host // (This Y coordinate specifies the row of the FPGA mesh that the // host is connected to, and the X coordinate specifies whether it is From 65dc133cd7aa7db373f93d884b772bb0f20eb6d4 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Fri, 1 May 2020 10:14:07 +0100 Subject: [PATCH 45/78] Update POLite perf counter script (Untested) --- apps/POLite/util/sumstats.awk | 37 ++++++++++++++++++++++------------- include/POLite/PDevice.h | 21 +++++++++++--------- 2 files changed, 35 insertions(+), 23 deletions(-) diff --git a/apps/POLite/util/sumstats.awk b/apps/POLite/util/sumstats.awk index f1f70329..f29e32ea 100755 --- a/apps/POLite/util/sumstats.awk +++ b/apps/POLite/util/sumstats.awk @@ -10,10 +10,11 @@ BEGIN { cacheCount = 0; coreCount = 0; cacheLineSize = 32; - intraThreadSendCount = 0; - interThreadSendCount = 0; - interBoardSendCount = 0; - fmax = 250000000; + msgsReceived = 0; + msgsSent = 0; + progRouterSent = 0; + progRouterSentInter = 0; + fmax = 220000000; if (boardsX == "" || boardsY == "") { boardsX = 3; boardsY = 2; @@ -48,13 +49,15 @@ BEGIN { coreCount = coreCount+1; } # Per-thread message counts - else if (match($0, /(.*) LS:(.*),TS:(.*),BS:(.*)/, fields)) { - ls=strtonum("0x"fields[2]); - ts=strtonum("0x"fields[3]); - bs=strtonum("0x"fields[4]); - intraThreadSendCount = intraThreadSendCount+ls; - interThreadSendCount = interThreadSendCount+ts; - interBoardSendCount = interBoardSendCount+bs; + else if (match($0, /(.*) MS:(.*),MR:(.*),PR:(.*),PRI:(.*)/, fields)) { + ms=strtonum("0x"fields[2]); + mr=strtonum("0x"fields[3]); + pr=strtonum("0x"fields[4]); + pri=strtonum("0x"fields[5]); + msgsSent = msgsSent + mr; + msgsReceived = msgsReceived + mr; + progRouterSent = progRouterSent + pr; + 
progRouterSentInter = progRouterSentInter + pri; } } } @@ -70,7 +73,13 @@ END { bytes = cacheLineSize * (missCount + writebackCount) print "Off-chip memory (GBytes/s): ", ((1/time) * bytes)/1000000000 print "CPU util (%): ", (1-(cpuIdleCount/cycleCount))*100 - print "Intra-thread messages: ", intraThreadSendCount - print "Inter-thread messages: ", interThreadSendCount - print "Inter-board messages: ", interBoardSendCount + print "Msgs received: ", msgsReceived + print "Msgs sent by threads: ", msgsSent + print "Msgs injected by ProgRouter:", progRouterSent + print "Inter-board msgs:", progRouterSentInter + print "" + print "Notes:" + print " * ProgRouter injections includes inter-board msgs" + print " * Memory bandwidth does not include lookups by ProgRouter" + print " * If runtime > 40s approx, hit/miss counts may overflow" } diff --git a/include/POLite/PDevice.h b/include/POLite/PDevice.h index b5f99340..41de7ac9 100644 --- a/include/POLite/PDevice.h +++ b/include/POLite/PDevice.h @@ -147,11 +147,9 @@ template Date: Fri, 1 May 2020 11:02:19 +0100 Subject: [PATCH 46/78] Typo --- apps/POLite/util/sumstats.awk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/POLite/util/sumstats.awk b/apps/POLite/util/sumstats.awk index f29e32ea..4a79a2f9 100755 --- a/apps/POLite/util/sumstats.awk +++ b/apps/POLite/util/sumstats.awk @@ -54,7 +54,7 @@ BEGIN { mr=strtonum("0x"fields[3]); pr=strtonum("0x"fields[4]); pri=strtonum("0x"fields[5]); - msgsSent = msgsSent + mr; + msgsSent = msgsSent + ms; msgsReceived = msgsReceived + mr; progRouterSent = progRouterSent + pr; progRouterSentInter = progRouterSentInter + pri; From d6f29800e942796da90401ebb5abcd258881175f Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Mon, 4 May 2020 08:53:26 +0000 Subject: [PATCH 47/78] Count blocked sends in POLite softswitch --- apps/POLite/util/sumstats.awk | 7 ++++++- include/POLite/PDevice.h | 17 ++++++++++++----- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git 
a/apps/POLite/util/sumstats.awk b/apps/POLite/util/sumstats.awk index 4a79a2f9..9ccfac8c 100755 --- a/apps/POLite/util/sumstats.awk +++ b/apps/POLite/util/sumstats.awk @@ -14,6 +14,7 @@ BEGIN { msgsSent = 0; progRouterSent = 0; progRouterSentInter = 0; + blockedSends = 0; fmax = 220000000; if (boardsX == "" || boardsY == "") { boardsX = 3; @@ -49,15 +50,18 @@ BEGIN { coreCount = coreCount+1; } # Per-thread message counts - else if (match($0, /(.*) MS:(.*),MR:(.*),PR:(.*),PRI:(.*)/, fields)) { + else if (match($0, /(.*) MS:(.*),MR:(.*),PR:(.*),PRI:(.*),BL:(.*)/, + fields)) { ms=strtonum("0x"fields[2]); mr=strtonum("0x"fields[3]); pr=strtonum("0x"fields[4]); pri=strtonum("0x"fields[5]); + bl=strtonum("0x"fields[6]); msgsSent = msgsSent + ms; msgsReceived = msgsReceived + mr; progRouterSent = progRouterSent + pr; progRouterSentInter = progRouterSentInter + pri; + blockedSends = blockedSends + bl; } } } @@ -77,6 +81,7 @@ END { print "Msgs sent by threads: ", msgsSent print "Msgs injected by ProgRouter:", progRouterSent print "Inter-board msgs:", progRouterSentInter + print "Blocked sends:", blockedSends print "" print "Notes:" print " * ProgRouter injections includes inter-board msgs" diff --git a/include/POLite/PDevice.h b/include/POLite/PDevice.h index 41de7ac9..d46bd7b4 100644 --- a/include/POLite/PDevice.h +++ b/include/POLite/PDevice.h @@ -150,6 +150,8 @@ template Date: Mon, 4 May 2020 14:47:28 +0000 Subject: [PATCH 48/78] Control chattyness from environment var --- include/POLite/PGraph.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/POLite/PGraph.h b/include/POLite/PGraph.h index b6a1245a..20cc4b5b 100644 --- a/include/POLite/PGraph.h +++ b/include/POLite/PGraph.h @@ -98,6 +98,10 @@ template Date: Tue, 5 May 2020 16:14:34 +0100 Subject: [PATCH 49/78] URM1 record support in POLite --- include/POLite/ProgRouters.h | 60 +++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 7 deletions(-) diff --git a/include/POLite/ProgRouters.h 
b/include/POLite/ProgRouters.h index 90083802..a1c5942d 100644 --- a/include/POLite/ProgRouters.h +++ b/include/POLite/ProgRouters.h @@ -182,22 +182,58 @@ class ProgRouter { numChunks++; numRecords++; } + + // Add a URM1 record to the table + void addURM1(uint32_t mboxX, uint32_t mboxY, + uint32_t threadId, uint32_t key) { + if (numChunks == 5) nextBeat(); + uint8_t* ptr = currentRecord48(); + ptr[0] = key; + ptr[1] = key >> 8; + ptr[2] = key >> 16; + ptr[3] = key >> 24; + ptr[4] = (threadId << 3); + ptr[5] = (mboxY << 3) | (mboxX << 1) | (threadId >> 5); + numChunks++; + numRecords++; + } }; // ================================== // Data type for routing destinations // ================================== -struct PRoutingDest { - // Destination mailbox - uint32_t mbox; - // Thread-level routing key +enum PRoutingDestKind { PRDestKindURM1, PRDestKindMRM }; + +// URM1 routing destination +struct PRoutingDestURM1 { + // Mailbox-local thread + uint16_t threadId; + // Thread-local routing key + uint32_t key; +}; + +// MRM routing destination +struct PRoutingDestMRM { + // Thread-local routing key uint16_t key; // Destination threads uint32_t threadMaskLow; uint32_t threadMaskHigh; }; +// Routing destination +struct PRoutingDest { + PRoutingDestKind kind; + // Destination mailbox + uint32_t mbox; + // URM1 or MRM destination + union { + PRoutingDestURM1 urm1; + PRoutingDestMRM mrm; + }; +}; + // Extract board X coord from routing dest inline uint32_t destX(uint32_t mbox) { uint32_t x = mbox >> (TinselMailboxMeshXBits + TinselMailboxMeshYBits); @@ -288,9 +324,19 @@ class ProgRouterMesh { // Add local records for (int i = 0; i < local.numElems; i++) { PRoutingDest dest = local.elems[i]; - table[senderY][senderX].addMRM(destMboxX(dest.mbox), - destMboxY(dest.mbox), dest.threadMaskHigh, - dest.threadMaskLow, dest.key); + if (dest.kind == PRDestKindMRM) { + table[senderY][senderX].addMRM(destMboxX(dest.mbox), + destMboxY(dest.mbox), dest.mrm.threadMaskHigh, + 
dest.mrm.threadMaskLow, dest.mrm.key); + } + else if (dest.kind == PRDestKindURM1) { + table[senderY][senderX].addURM1(destMboxX(dest.mbox), + destMboxY(dest.mbox), dest.urm1.threadId, dest.urm1.key); + } + else { + fprintf(stderr, "ProgRouters.h: unknown routing record kind\n"); + exit(EXIT_FAILURE); + } } return table[senderY][senderX].genKey(); From 2ca458933a9c2f0e0dbc7d484d0427a5b1c7b8b6 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 5 May 2020 17:05:49 +0100 Subject: [PATCH 50/78] This should have been in previous commit --- include/POLite/PGraph.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/POLite/PGraph.h b/include/POLite/PGraph.h index 20cc4b5b..57b3172e 100644 --- a/include/POLite/PGraph.h +++ b/include/POLite/PGraph.h @@ -512,10 +512,11 @@ template append(edge); // Prepare for new output table entry dests.numElems = destsRemaining; From ba1e8b49978b63794286e85902708a0f5092902c Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 5 May 2020 17:06:01 +0100 Subject: [PATCH 51/78] First attempt at fast mapper Just exploring... there's probably a better way to do this with less duplicated code. 
--- include/POLite.h | 15 +- include/POLite/FastMap/PDevice.h | 302 +++++++++++++ include/POLite/FastMap/PGraph.h | 710 +++++++++++++++++++++++++++++++ 3 files changed, 1024 insertions(+), 3 deletions(-) create mode 100644 include/POLite/FastMap/PDevice.h create mode 100644 include/POLite/FastMap/PGraph.h diff --git a/include/POLite.h b/include/POLite.h index d12a0e73..858b865e 100644 --- a/include/POLite.h +++ b/include/POLite.h @@ -6,13 +6,22 @@ #ifdef TINSEL #include - #include + #ifdef POLITE_FAST_MAP + #include + #else + #include + #endif #else - #include + #ifdef POLITE_FAST_MAP + #include + #include + #else + #include + #include + #endif #include #include #include - #include #endif #endif diff --git a/include/POLite/FastMap/PDevice.h b/include/POLite/FastMap/PDevice.h new file mode 100644 index 00000000..f095eba6 --- /dev/null +++ b/include/POLite/FastMap/PDevice.h @@ -0,0 +1,302 @@ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef _PDEVICE_H_ +#define _PDEVICE_H_ + +#include +#include +#include + +#ifdef TINSEL + #include + #define PTR(t) t* +#else + #include + #define PTR(t) uint32_t +#endif + +// Use this to align on half-cache-line boundary +#define ALIGNED __attribute__((aligned(1<<(TinselLogBytesPerLine-1)))) + +// This is a static limit on the number of pins per device +#ifndef POLITE_NUM_PINS +#define POLITE_NUM_PINS 1 +#endif + +// Macros for performance stats +// POLITE_DUMP_STATS - dump performance stats on termination +// POLITE_COUNT_MSGS - include message counts of performance stats + +// Thread-local device id +typedef uint16_t PLocalDeviceId; + +// Thread id +typedef uint32_t PThreadId; + +// Device address +// Bits 17->0: thread id +// Bit 18: invalid address +// Bits 31->19: thread-local device id +typedef uint32_t PDeviceAddr; + +// Device address constructors +inline PDeviceAddr invalidDeviceAddr() { return 0x40000; } +inline PDeviceAddr makeDeviceAddr(PThreadId t, PLocalDeviceId d) { + return (d << 19) | t; +} + +// Device address 
deconstructors +inline bool isValidDeviceAddr(PDeviceAddr addr) { return !(addr & 0x40000); } +inline PThreadId getThreadId(PDeviceAddr addr) { return addr & 0x3ffff; } +inline PLocalDeviceId getLocalDeviceId(PDeviceAddr addr) { return addr >> 19; } + +// What's the max allowed local device address? +inline uint32_t maxLocalDeviceId() { return 8192; } + +// Pins +// No - means 'not ready to send' +// HostPin - means 'send to host' +// Pin(n) - means 'send to application pin number n' +typedef uint8_t PPin; +#define No 0 +#define HostPin 1 +#define Pin(n) ((n)+2) + +// For template arguments that are not used +struct None {}; + +// Generic device structure +// Type parameters: +// S - State +// E - Edge label +// M - Message structure +template struct PDevice { + // State + S* s; + PPin* readyToSend; + uint32_t numVertices; + uint16_t time; + + // Handlers + void init(); + void send(volatile M* msg); + void recv(M* msg, E* edge); + bool step(); + bool finish(volatile M* msg); +}; + +// Generic device state structure +template struct ALIGNED PState { + // Board-level routing key for each outgoing pin + uint32_t pin[POLITE_NUM_PINS]; + // Ready-to-send status + PPin readyToSend; + // Custom state + S state; +}; + +// Message structure +template struct PMessage { + // Destination thread-local device id + uint16_t devId; + // Id of incoming edge + uint16_t edgeId; + // Application message + M payload; +}; + +// An incoming edge to a device +template struct PInEdge { + E edge; +}; + +// Generic thread structure +template struct PThread { + + // Number of devices handled by thread + PLocalDeviceId numDevices; + // Number of times step handler has been called + uint16_t time; + // Number of devices in graph + uint32_t numVertices; + // Pointer to array of device states + PTR(PState) devices; + // Pointer to base of edge table + PTR(PInEdge) inTableBase; + // Array of local device ids are ready to send + PTR(PLocalDeviceId) senders; + // This array is accessed in a LIFO 
manner + PTR(PLocalDeviceId) sendersTop; + + // Count number of messages sent + #ifdef POLITE_COUNT_MSGS + // Total messages sent + uint32_t msgsSent; + // Total messages received + uint32_t msgsReceived; + // Number of times we wanted to send but couldn't + uint32_t blockedSends; + #endif + + #ifdef TINSEL + + // Helper function to construct a device + INLINE DeviceType getDevice(uint32_t id) { + DeviceType dev; + dev.s = &devices[id].state; + dev.readyToSend = &devices[id].readyToSend; + dev.numVertices = numVertices; + dev.time = time; + return dev; + } + + // Dump performance counter stats over UART + void dumpStats() { + tinselPerfCountStop(); + uint32_t me = tinselId(); + // Per-cache performance counters + uint32_t cacheMask = (1 << + (TinselLogThreadsPerCore + TinselLogCoresPerDCache)) - 1; + if ((me & cacheMask) == 0) { + printf("H:%x,M:%x,W:%x\n", + tinselHitCount(), + tinselMissCount(), + tinselWritebackCount()); + } + // Per-core performance counters + uint32_t coreMask = (1 << (TinselLogThreadsPerCore)) - 1; + if ((me & coreMask) == 0) { + printf("C:%x %x,I:%x %x\n", + tinselCycleCountU(), tinselCycleCount(), + tinselCPUIdleCountU(), tinselCPUIdleCount()); + } + // Per-thread performance counters + #ifdef POLITE_COUNT_MSGS + uint32_t intraBoardId = me & ((1<)-1) >> TinselLogBytesPerFlit); + + // Event loop + while (1) { + // Try to send + if (sendersTop != senders) { + if (tinselCanSend()) { + // Get next sender + PLocalDeviceId src = *(--sendersTop); + // Lookup device + DeviceType dev = getDevice(src); + PPin pin = *dev.readyToSend; + // Invoke send handler + PMessage* m = (PMessage*) tinselSendSlot(); + dev.send(&m->payload); + // Reinsert sender, if it still wants to send + if (*dev.readyToSend != No) sendersTop++; + // Is it a send to the host pin or a user pin? 
+ if (pin == HostPin) + tinselSend(tinselHostId(), m); + else + tinselKeySend(devices[src].pin[pin-2], m); + #ifdef POLITE_COUNT_MSGS + msgsSent++; + #endif + } + else { + #ifdef POLITE_COUNT_MSGS + blockedSends++; + #endif + tinselWaitUntil(TINSEL_CAN_SEND|TINSEL_CAN_RECV); + } + } + else { + // Idle detection + int idle = tinselIdle(!active); + if (idle > 1) + break; + else if (idle) { + active = false; + for (uint32_t i = 0; i < numDevices; i++) { + DeviceType dev = getDevice(i); + // Invoke the step handler for each device + active = dev.step() || active; + // Device ready to send? + if (*dev.readyToSend != No) { + *(sendersTop++) = i; + } + } + time++; + } + } + + // Step 2: try to receive + while (tinselCanRecv()) { + PMessage* inMsg = (PMessage*) tinselRecv(); + PInEdge* inEdge = &inTableBase[inMsg->edgeId]; + // Lookup destination device + PLocalDeviceId id = inMsg->devId; + DeviceType dev = getDevice(id); + // Was it ready to send? + PPin oldReadyToSend = *dev.readyToSend; + // Invoke receive handler + dev.recv(&inMsg->payload, &inEdge->edge); + // Insert device into a senders array, if not already there + if (*dev.readyToSend != No && oldReadyToSend == No) + *(sendersTop++) = id; + #ifdef POLITE_COUNT_MSGS + msgsReceived++; + #endif + tinselFree(inMsg); + } + } + + // Termination + #ifdef POLITE_DUMP_STATS + dumpStats(); + #endif + + // Invoke finish handler for each device + for (uint32_t i = 0; i < numDevices; i++) { + DeviceType dev = getDevice(i); + tinselWaitUntil(TINSEL_CAN_SEND); + PMessage* m = (PMessage*) tinselSendSlot(); + if (dev.finish(&m->payload)) tinselSend(tinselHostId(), m); + } + + // Sleep + tinselWaitUntil(TINSEL_CAN_RECV); while (1); + } + + #endif + +}; + +#endif diff --git a/include/POLite/FastMap/PGraph.h b/include/POLite/FastMap/PGraph.h new file mode 100644 index 00000000..8ac0c84d --- /dev/null +++ b/include/POLite/FastMap/PGraph.h @@ -0,0 +1,710 @@ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef _PGRAPH_H_ +#define 
_PGRAPH_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Nodes of a POETS graph are devices +typedef NodeId PDeviceId; + +// POETS graph +template class PGraph { + private: + // Align address to 2^n byte boundary + inline uint32_t align(uint32_t n, uint32_t addr) { + if ((addr & (1<> n) + 1) << n; + } + + // Align address to 32-bit word boundary + uint32_t wordAlign(uint32_t addr) { return align(2, addr); } + + // Align address to cache-line boundary + uint32_t cacheAlign(uint32_t addr) { + return align(TinselLogBytesPerLine, addr); + } + + // Helper function + inline uint32_t min(uint32_t x, uint32_t y) { return x < y ? x : y; } + + // Number of FPGA boards available + uint32_t meshLenX; + uint32_t meshLenY; + + // Number of FPGA boards to use + uint32_t numBoardsX; + uint32_t numBoardsY; + + // Out table (sender-side edge tables) + // Sequence of destinations for every (device, pin) pair + Seq*** outTable; + + // Key table (sender-side key tables) + // Global routing key for every (device, pin) pair + uint32_t** keyTable; + + // In table (receiver-side edge tables) + // Sequence of incoming edges for every thread + Seq>** inTable; + + // Mesh of per-board programmable routers + ProgRouterMesh* routingTables; + + // Generic constructor + void constructor(uint32_t lenX, uint32_t lenY) { + meshLenX = lenX; + meshLenY = lenY; + char* str = getenv("POLITE_BOARDS_X"); + int nx = str ? atoi(str) : meshLenX; + str = getenv("POLITE_BOARDS_Y"); + int ny = str ? 
atoi(str) : meshLenY; + setNumBoards(nx, ny); + numDevices = 0; + devices = NULL; + toDeviceAddr = NULL; + numDevicesOnThread = NULL; + fromDeviceAddr = NULL; + vertexMem = NULL; + vertexMemSize = NULL; + vertexMemBase = NULL; + inEdgeMem = NULL; + inEdgeMemSize = NULL; + inEdgeMemBase = NULL; + mapVerticesToDRAM = false; + mapInEdgesToDRAM = true; + outTable = NULL; + keyTable = NULL; + inTable = NULL; + routingTables = NULL; + chatty = 0; + str = getenv("POLITE_CHATTY"); + if (str != NULL) { + chatty = !strcmp(str, "0") ? 0 : 1; + } + } + + public: + // Number of devices + uint32_t numDevices; + + // Graph containing device ids and connections + Graph graph; + + // Edge labels: has same structure as graph.outgoing + Seq*> edgeLabels; + + // Mapping from device id to device state + // (Not valid until the mapper is called) + PState** devices; + + // Mapping from thread id to number of devices on that thread + // (Not valid until the mapper is called) + uint32_t* numDevicesOnThread; + + // Mapping from device id to device address and back + // (Not valid until the mapper is called) + PDeviceAddr* toDeviceAddr; // Device id -> device address + PDeviceId** fromDeviceAddr; // Device address -> device id + + // Each thread's vertex mem and thread mem regions + // (Not valid until the mapper is called) + uint8_t** vertexMem; uint8_t** threadMem; + uint32_t* vertexMemSize; uint32_t* threadMemSize; + uint32_t* vertexMemBase; uint32_t* threadMemBase; + + // Each thread's in-edge tables + // (Not valid until the mapper is called) + uint8_t** inEdgeMem; + uint32_t* inEdgeMemSize; + uint32_t* inEdgeMemBase; + + // Where to map the various regions + // (If false, map to SRAM instead) + bool mapVerticesToDRAM; + bool mapInEdgesToDRAM; + + // Allow mapper to print useful information to stdout + uint32_t chatty; + + // Setter for number of boards to use + void setNumBoards(uint32_t x, uint32_t y) { + if (x > meshLenX || y > meshLenY) { + printf("Mapper: %d x %d boards requested, 
%d x %d available\n", + numBoardsX, numBoardsY, meshLenX, meshLenY); + exit(EXIT_FAILURE); + } + numBoardsX = x; + numBoardsY = y; + } + + // Create new device + inline PDeviceId newDevice() { + edgeLabels.append(new SmallSeq); + numDevices++; + return graph.newNode(); + } + + // Add a connection between devices + inline void addEdge(PDeviceId from, PinId pin, PDeviceId to) { + if (pin >= POLITE_NUM_PINS) { + printf("addEdge: pin exceeds POLITE_NUM_PINS\n"); + exit(EXIT_FAILURE); + } + graph.addEdge(from, pin, to); + E edge; + edgeLabels.elems[from]->append(edge); + } + + // Add labelled edge using given output pin + void addLabelledEdge(E edge, PDeviceId x, PinId pin, PDeviceId y) { + graph.addEdge(x, pin, y); + edgeLabels.elems[x]->append(edge); + } + + // Allocate SRAM and DRAM partitions + void allocatePartitions() { + // Decide a maximum partition size that is reasonable + // SRAM: Partition size minus 2048 bytes for the stack + uint32_t maxSRAMSize = (1<)); + // Add space for devices + uint32_t numDevs = numDevicesOnThread[threadId]; + for (uint32_t devNum = 0; devNum < numDevs; devNum++) { + // Add space for device + sizeVMem = sizeVMem + sizeof(PState); + } + // Add space for incoming edge table + if (inTable[threadId]) { + sizeEIMem = inTable[threadId]->numElems * sizeof(PInEdge); + sizeEIMem = wordAlign(sizeEIMem); + } + // The total partition size including uninitialised portions + uint32_t totalSizeVMem = + sizeVMem + wordAlign(sizeof(PLocalDeviceId) * numDevs); + // Check that total size is reasonable + uint32_t totalSizeSRAM = sizeTMem; + uint32_t totalSizeDRAM = 0; + if (mapVerticesToDRAM) totalSizeDRAM += totalSizeVMem; + else totalSizeSRAM += totalSizeVMem; + if (mapInEdgesToDRAM) totalSizeDRAM += sizeEIMem; + else totalSizeSRAM += sizeEIMem; + if (totalSizeDRAM > maxDRAMSize) { + printf("Error: max DRAM partition size exceeded\n"); + exit(EXIT_FAILURE); + } + if (totalSizeSRAM > maxSRAMSize) { + printf("Error: max SRAM partition size exceeded\n"); 
+ exit(EXIT_FAILURE); + } + // Allocate space for the initialised portion of the partition + assert((sizeVMem%4) == 0); + assert((sizeTMem%4) == 0); + assert((sizeEIMem%4) == 0); + vertexMem[threadId] = (uint8_t*) calloc(sizeVMem, 1); + vertexMemSize[threadId] = sizeVMem; + threadMem[threadId] = (uint8_t*) calloc(sizeTMem, 1); + threadMemSize[threadId] = sizeTMem; + inEdgeMem[threadId] = (uint8_t*) calloc(sizeEIMem, 1); + inEdgeMemSize[threadId] = sizeEIMem; + // Tinsel address of base of partition + uint32_t partId = threadId & (TinselThreadsPerDRAM-1); + uint32_t sramBase = (1 << TinselLogBytesPerSRAM) + + (partId << TinselLogBytesPerSRAMPartition); + uint32_t dramBase = TinselBytesPerDRAM - + ((partId+1) << TinselLogBytesPerDRAMPartition); + // Use partition-interleaved region for DRAM + dramBase |= 0x80000000; + threadMemBase[threadId] = sramBase; + sramBase += threadMemSize[threadId]; + // Determine base addresses of each region + if (mapVerticesToDRAM) { + vertexMemBase[threadId] = dramBase; + dramBase += totalSizeVMem; + } + else { + vertexMemBase[threadId] = sramBase; + sramBase += totalSizeVMem; + } + if (mapInEdgesToDRAM) { + inEdgeMemBase[threadId] = dramBase; + dramBase += sizeEIMem; + } + else { + inEdgeMemBase[threadId] = sramBase; + sramBase += sizeEIMem; + } + } + } + + // Initialise partitions + void initialisePartitions() { + for (uint32_t threadId = 0; threadId < TinselMaxThreads; threadId++) { + // Next pointers for each partition + uint32_t nextVMem = 0; + // Pointer to thread structure + PThread* thread = + (PThread*) &threadMem[threadId][0]; + // Set number of devices on thread + thread->numDevices = numDevicesOnThread[threadId]; + // Set number of devices in graph + thread->numVertices = numDevices; + // Set tinsel address of array of device states + thread->devices = vertexMemBase[threadId]; + // Set tinsel address of base of in-edge table + thread->inTableBase = inEdgeMemBase[threadId]; + // Add space for each device on thread + uint32_t 
numDevs = numDevicesOnThread[threadId]; + for (uint32_t devNum = 0; devNum < numDevs; devNum++) { + PState* dev = (PState*) &vertexMem[threadId][nextVMem]; + PDeviceId id = fromDeviceAddr[threadId][devNum]; + devices[id] = dev; + // Add space for device + nextVMem = nextVMem + sizeof(PState); + } + // Initialise each device and the thread's out edges + for (uint32_t devNum = 0; devNum < numDevs; devNum++) { + PDeviceId id = fromDeviceAddr[threadId][devNum]; + PState* dev = devices[id]; + // Initialise + for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) { + dev->pin[p] = keyTable[id][p]; + } + } + // Intialise thread's in edges + PInEdge* inEdgeArray = (PInEdge*) inEdgeMem[threadId]; + Seq>* edges = inTable[threadId]; + if (edges) + for (uint32_t i = 0; i < edges->numElems; i++) { + inEdgeArray[i] = edges->elems[i]; + } + // At this point, check that next pointers line up with heap sizes + if (nextVMem != vertexMemSize[threadId]) { + printf("Error: vertex mem size does not match pre-computed size\n"); + exit(EXIT_FAILURE); + } + // Set tinsel address of senders array + thread->senders = vertexMemBase[threadId] + nextVMem; + } + } + + // Allocate mapping structures + void allocateMapping() { + devices = (PState**) calloc(numDevices, sizeof(PState*)); + toDeviceAddr = (PDeviceAddr*) calloc(numDevices, sizeof(PDeviceAddr)); + fromDeviceAddr = (PDeviceId**) calloc(TinselMaxThreads, sizeof(PDeviceId*)); + numDevicesOnThread = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t)); + } + + // Allocate thread edge input and output tables + // (Only valid after mapper is called) + void allocateInOutTables() { + // Receiver-side tables + inTable = (Seq>**) + calloc(TinselMaxThreads,sizeof(Seq>*)); + for (uint32_t t = 0; t < TinselMaxThreads; t++) { + if (numDevicesOnThread[t] != 0) + inTable[t] = new SmallSeq>; + } + + // Sender-side tables + outTable = (Seq***) + calloc(numDevices, sizeof(Seq**)); + for (uint32_t d = 0; d < numDevices; d++) { + outTable[d] = (Seq**) + 
calloc(POLITE_NUM_PINS, sizeof(Seq*)); + for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) + outTable[d][p] = new SmallSeq; + } + + keyTable = new uint32_t* [numDevices]; + for (uint32_t d = 0; d < numDevices; d++) + keyTable[d] = new uint32_t [POLITE_NUM_PINS]; + } + + // Compute thread edge input and output tables + // (Only valid after mapper is called) + void computeInOutTables() { + // For each device + for (uint32_t d = 0; d < numDevices; d++) { + // For each pin + for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) { + Seq* dests = graph.outgoing->elems[d]; + Seq* edges = edgeLabels.elems[d]; + for (uint32_t i = 0; i < dests->numElems; i++) { + PDeviceId destId = dests->elems[i]; + // Destination thread id + uint32_t threadId = getThreadId(toDeviceAddr[destId]); + // Thread-local device id + uint32_t devId = getLocalDeviceId(toDeviceAddr[destId]); + // Add edge to thread's input table + uint32_t edgeId = inTable[threadId]->numElems; + if (i < inTable[threadId]->numElems) { + PInEdge edge; + edge.edge = edges->elems[i]; + inTable[threadId]->append(edge); + } + // Add output table entry + PRoutingDest rdest; + rdest.kind = PRDestKindURM1; + rdest.mbox = threadId >> TinselLogThreadsPerMailbox; + rdest.urm1.key = devId | (edgeId << 16); + rdest.urm1.threadId = threadId & + ((1<append(rdest); + } + } + } + } + + // Release all structures + void releaseAll() { + if (devices != NULL) { + free(devices); + free(toDeviceAddr); + free(numDevicesOnThread); + for (uint32_t t = 0; t < TinselMaxThreads; t++) + if (fromDeviceAddr[t] != NULL) free(fromDeviceAddr[t]); + free(fromDeviceAddr); + for (uint32_t t = 0; t < TinselMaxThreads; t++) + if (vertexMem[t] != NULL) free(vertexMem[t]); + free(vertexMem); + free(vertexMemSize); + free(vertexMemBase); + for (uint32_t t = 0; t < TinselMaxThreads; t++) + if (threadMem[t] != NULL) free(threadMem[t]); + free(threadMem); + free(threadMemSize); + free(threadMemBase); + for (uint32_t t = 0; t < TinselMaxThreads; t++) + if (inEdgeMem[t] != 
NULL) free(inEdgeMem[t]); + free(inEdgeMem); + free(inEdgeMemSize); + free(inEdgeMemBase); + } + if (inTable != NULL) { + for (uint32_t t = 0; t < TinselMaxThreads; t++) + if (inTable[t] != NULL) delete inTable[t]; + free(inTable); + inTable = NULL; + } + if (outTable != NULL) { + for (uint32_t d = 0; d < numDevices; d++) { + if (outTable[d] == NULL) continue; + for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) + delete outTable[d][p]; + free(outTable[d]); + } + free(outTable); + outTable = NULL; + } + if (keyTable != NULL) { + for (uint32_t d = 0; d < numDevices; d++) delete [] keyTable[d]; + delete [] keyTable; + keyTable = NULL; + } + if (routingTables != NULL) delete routingTables; + } + + // Implement mapping to tinsel threads + void map() { + // Let's measure some times + struct timeval placementStart, placementFinish; + struct timeval routingStart, routingFinish; + struct timeval initStart, initFinish; + + // Release all mapping and heap structures + releaseAll(); + + // Reallocate mapping structures + allocateMapping(); + + // Start placement timer + gettimeofday(&placementStart, NULL); + + // Partition into subgraphs, one per board + Placer boards(&graph, numBoardsX, numBoardsY); + + // Place subgraphs onto 2D mesh + const uint32_t placerEffort = 8; + boards.place(placerEffort); + + // For each board + for (uint32_t boardY = 0; boardY < numBoardsY; boardY++) { + for (uint32_t boardX = 0; boardX < numBoardsX; boardX++) { + // Partition into subgraphs, one per mailbox + PartitionId b = boards.mapping[boardY][boardX]; + Placer boxes(&boards.subgraphs[b], + TinselMailboxMeshXLen, TinselMailboxMeshYLen); + boxes.place(placerEffort); + + // For each mailbox + for (uint32_t boxX = 0; boxX < TinselMailboxMeshXLen; boxX++) { + for (uint32_t boxY = 0; boxY < TinselMailboxMeshYLen; boxY++) { + // Partition into subgraphs, one per thread + uint32_t numThreads = 1<incoming->numElems; + numDevicesOnThread[threadId] = numDevs; + fromDeviceAddr[threadId] = (PDeviceId*) + 
malloc(sizeof(PDeviceId) * numDevs); + for (uint32_t devNum = 0; devNum < numDevs; devNum++) + fromDeviceAddr[threadId][devNum] = g->labels->elems[devNum]; + + // Populate toDeviceAddr mapping + assert(numDevs < maxLocalDeviceId()); + for (uint32_t devNum = 0; devNum < numDevs; devNum++) { + PDeviceAddr devAddr = + makeDeviceAddr(threadId, devNum); + toDeviceAddr[g->labels->elems[devNum]] = devAddr; + } + } + } + } + } + } + + // Stop placement timer and start In/Out table timer + gettimeofday(&placementFinish, NULL); + gettimeofday(&routingStart, NULL); + + // Compute send and receive side routing tables + allocateInOutTables(); + computeInOutTables(); + + // Compute per-board programmable routing tables + routingTables = new ProgRouterMesh(numBoardsX, numBoardsY); + for (uint32_t d = 0; d < numDevices; d++) { + uint32_t src = getThreadId(toDeviceAddr[d]) >> + TinselLogThreadsPerMailbox; + for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) + keyTable[d][p] = routingTables->addDestsFromBoard(src, outTable[d][p]); + } + + // Stop routing timer and start init timer + gettimeofday(&routingFinish, NULL); + gettimeofday(&initStart, NULL); + + // Reallocate and initialise heap structures + allocatePartitions(); + initialisePartitions(); + + // Display times, if chatty + gettimeofday(&initFinish, NULL); + if (chatty > 0) { + struct timeval diff; + + timersub(&placementFinish, &placementStart, &diff); + double duration = (double) diff.tv_sec + + (double) diff.tv_usec / 1000000.0; + printf("POLite mapper profile:\n"); + printf(" Partitioning and placement: %lfs\n", duration); + + timersub(&routingFinish, &routingStart, &diff); + duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + printf(" In/Out table construction: %lfs\n", duration); + + timersub(&initFinish, &initStart, &diff); + duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + printf(" Thread state initialisation: %lfs\n", duration); + } + } + + // Constructor + PGraph() { + char* str 
= getenv("HOSTLINK_BOXES_X"); + int x = str ? atoi(str) : 1; + x = x * TinselMeshXLenWithinBox; + str = getenv("HOSTLINK_BOXES_Y"); + int y = str ? atoi(str) : 1; + y = y * TinselMeshYLenWithinBox; + constructor(x, y); + } + PGraph(uint32_t numBoxesX, uint32_t numBoxesY) { + int x = numBoxesX * TinselMeshXLenWithinBox; + int y = numBoxesY * TinselMeshYLenWithinBox; + constructor(x, y); + } + + // Deconstructor + ~PGraph() { + releaseAll(); + for (uint32_t i = 0; i < edgeLabels.numElems; i++) + delete edgeLabels.elems[i]; + } + + // Write partition to tinsel machine + void writeRAM(HostLink* hostLink, + uint8_t** heap, uint32_t* heapSize, uint32_t* heapBase) { + // Number of bytes written by each thread + uint32_t* writeCount = (uint32_t*) + calloc(TinselMaxThreads, sizeof(uint32_t)); + + // Number of threads completed by each core + uint32_t*** threadCount = (uint32_t***) + calloc(meshLenX, sizeof(uint32_t**)); + for (uint32_t x = 0; x < meshLenX; x++) { + threadCount[x] = (uint32_t**) + calloc(meshLenY, sizeof(uint32_t*)); + for (uint32_t y = 0; y < meshLenY; y++) + threadCount[x][y] = (uint32_t*) + calloc(TinselCoresPerBoard, sizeof(uint32_t)); + } + + // Initialise write addresses + for (int x = 0; x < meshLenX; x++) + for (int y = 0; y < meshLenY; y++) + for (int c = 0; c < TinselCoresPerBoard; c++) + hostLink->setAddr(x, y, c, heapBase[hostLink->toAddr(x, y, c, 0)]); + + // Write heaps + uint32_t done = false; + while (! 
done) { + done = true; + for (int x = 0; x < meshLenX; x++) { + for (int y = 0; y < meshLenY; y++) { + for (int c = 0; c < TinselCoresPerBoard; c++) { + uint32_t t = threadCount[x][y][c]; + if (t < TinselThreadsPerCore) { + done = false; + uint32_t threadId = hostLink->toAddr(x, y, c, t); + uint32_t written = writeCount[threadId]; + if (written == heapSize[threadId]) { + threadCount[x][y][c] = t+1; + if ((t+1) < TinselThreadsPerCore) + hostLink->setAddr(x, y, c, + heapBase[hostLink->toAddr(x, y, c, t+1)]); + } else { + uint32_t send = min((heapSize[threadId] - written)>>2, 15); + hostLink->store(x, y, c, send, + (uint32_t*) &heap[threadId][written]); + writeCount[threadId] = written + send * sizeof(uint32_t); + } + } + } + } + } + } + + // Release memory + free(writeCount); + for (uint32_t x = 0; x < meshLenX; x++) { + for (uint32_t y = 0; y < meshLenY; y++) + free(threadCount[x][y]); + free(threadCount[x]); + } + free(threadCount); + } + + // Write graph to tinsel machine + void write(HostLink* hostLink) { + // Start timer + struct timeval start, finish; + gettimeofday(&start, NULL); + + bool useSendBufferOld = hostLink->useSendBuffer; + hostLink->useSendBuffer = true; + writeRAM(hostLink, vertexMem, vertexMemSize, vertexMemBase); + writeRAM(hostLink, threadMem, threadMemSize, threadMemBase); + writeRAM(hostLink, inEdgeMem, inEdgeMemSize, inEdgeMemBase); + routingTables->write(hostLink); + hostLink->flush(); + hostLink->useSendBuffer = useSendBufferOld; + + // Display time if chatty + gettimeofday(&finish, NULL); + if (chatty > 0) { + struct timeval diff; + timersub(&finish, &start, &diff); + double duration = (double) diff.tv_sec + + (double) diff.tv_usec / 1000000.0; + printf("POLite graph upload time: %lfs\n", duration); + } + } + + // Determine fan-in of given device + uint32_t fanIn(PDeviceId id) { + return graph.fanIn(id); + } + + // Determine fan-out of given device + uint32_t fanOut(PDeviceId id) { + return graph.fanOut(id); + } + +}; + +// Read 
performance stats and store in file +inline void politeSaveStats(HostLink* hostLink, const char* filename) { + #ifdef POLITE_DUMP_STATS + // Open file for performance counters + FILE* statsFile = fopen(filename, "wt"); + if (statsFile == NULL) { + printf("Error creating stats file\n"); + exit(EXIT_FAILURE); + } + uint32_t meshLenX = hostLink->meshXLen; + uint32_t meshLenY = hostLink->meshYLen; + // Number of caches + uint32_t numLines = meshLenX * meshLenY * + TinselDCachesPerDRAM * TinselDRAMsPerBoard; + // Add on number of cores + numLines += meshLenX * meshLenY * TinselCoresPerBoard; + // Add on number of threads + #ifdef POLITE_COUNT_MSGS + numLines += meshLenX * meshLenY * TinselThreadsPerBoard; + #endif + hostLink->dumpStdOut(statsFile, numLines); + fclose(statsFile); + #endif +} + +#endif From ce3894962fbff386925def486ec2c675ae439093 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 5 May 2020 20:34:20 +0000 Subject: [PATCH 52/78] Allow random placement --- include/POLite/Placer.h | 60 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/include/POLite/Placer.h b/include/POLite/Placer.h index 32aec831..1468f0c7 100644 --- a/include/POLite/Placer.h +++ b/include/POLite/Placer.h @@ -10,6 +10,14 @@ typedef uint32_t PartitionId; // Partition and place a graph on a 2D mesh struct Placer { + // Select between different methods + enum Method { + Default, + Metis, + Random + }; + const Method defaultMethod=Metis; + // The graph being placed Graph* graph; @@ -41,8 +49,31 @@ struct Placer { uint32_t* yCoordSaved; uint64_t savedCost; + // Controls which strategy is used + Method method = Default; + + // Select placer method + void chooseMethod() + { + auto e = getenv("POLITE_PLACER"); + if (e) { + if (!strcmp(e, "metis")) + method=Metis; + else if (!strcmp(e, "random")) + method=Random; + else if (!strcmp(e, "default") || *e == '\0') + method=Default; + else { + fprintf(stderr, "Don't understand placer method : 
%s\n", e); + exit(EXIT_FAILURE); + } + } + if (method == Default) + method = defaultMethod; + } + // Partition the graph using Metis - void partition() { + void partitionMetis() { // Compute total number of edges uint32_t numEdges = 0; for (uint32_t i = 0; i < graph->incoming->numElems; i++) { @@ -116,6 +147,31 @@ struct Placer { free(parts); } + // Partition the graph randomly + void partitionRandom() { + uint32_t numVertices = graph->incoming->numElems; + uint32_t numParts = width * height; + + // Populate result array + srand(0); + for (uint32_t i = 0; i < numVertices; i++) { + partitions[i] = rand() % numParts; + } + } + + void partition() + { + switch(method){ + case Default: + case Metis: + partitionMetis(); + break; + case Random: + partitionRandom(); + break; + } + } + // Create subgraph for each partition void computeSubgraphs() { uint32_t numPartitions = width*height; @@ -316,6 +372,8 @@ struct Placer { yCoord = new uint32_t [width*height]; xCoordSaved = new uint32_t [width*height]; yCoordSaved = new uint32_t [width*height]; + // Pick a placement method, or select default + chooseMethod(); // Partition the graph using Metis partition(); // Compute subgraphs, one per partition From a679245e9559c473de2640c77b48ca87ec7cd7b5 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 5 May 2020 21:58:50 +0100 Subject: [PATCH 53/78] Full rate ProgRouter --- config.py | 3 +- rtl/Network.bsv | 14 +++----- rtl/ProgRouter.bsv | 86 ++++++++++++++++++---------------------------- 3 files changed, 38 insertions(+), 65 deletions(-) diff --git a/config.py b/config.py index e9917ae5..6faca950 100755 --- a/config.py +++ b/config.py @@ -164,7 +164,6 @@ def quoted(s): return "'\"" + s + "\"'" # Programmable router parameters: p["LogRoutingEntryLen"] = 5 # Number of beats in a routing table entry p["ProgRouterMaxBurst"] = 4 -p["ProgRouterCrossbarOutputs"] = 4 p["FetcherLogIndQueueSize"] = 1 p["FetcherLogBeatBufferSize"] = 5 p["FetcherLogFlitBufferSize"] = 5 @@ -380,7 +379,7 @@ 
def quoted(s): return "'\"" + s + "\"'" # Parameters for programmable routers # (and the routing-record fetchers they contain) -p["FetchersPerProgRouter"] = 6 +p["FetchersPerProgRouter"] = 4 + p["MailboxMeshXLen"] p["LogFetcherFlitBufferSize"] = 5 #============================================================================== diff --git a/rtl/Network.bsv b/rtl/Network.bsv index 136d327c..07d9adfd 100644 --- a/rtl/Network.bsv +++ b/rtl/Network.bsv @@ -401,22 +401,16 @@ module mkNoC#( westLink[0].flitOut, boardRouter.flitIn[3]); // Connect mailbox mesh south rim to board router - function List#(t) single(t elem) = List::cons(elem, Nil); - List#(Out#(Flit)) botOutList0 = Nil; - List#(Out#(Flit)) botOutList1 = Nil; - for (Integer x = `MailboxMeshXLen-1; x >= 0; x=x-2) begin - botOutList0 = Cons(routers[0][x].bottomOut, botOutList0); - botOutList1 = Cons(routers[0][x-1].bottomOut, botOutList1); - end - reduceConnect(mkFlitMerger, botOutList0, single(boardRouter.flitIn[4])); - reduceConnect(mkFlitMerger, botOutList1, single(boardRouter.flitIn[5])); + for (Integer i = 0; i < `MailboxMeshXLen; i=i+1) + connectUsing(mkUGShiftQueue1(QueueOptFmax), + routers[0][i].bottomOut, boardRouter.flitIn[4+i]); // Connect board router to mailbox mesh south rim function In#(Flit) getBottomIn(MeshRouter r) = r.bottomIn; Vector#(`MailboxMeshXLen, In#(Flit)) southRimInPorts = map(getBottomIn, routers[0]); for (Integer i = 0; i < `MailboxMeshXLen; i=i+1) - connectDirect(boardRouter.nocFlitOut[i], southRimInPorts[i]); + connectDirect(boardRouter.flitOut[4+i], southRimInPorts[i]); // Detect inter-board activity // --------------------------- diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index ae361ed1..e8a7d97c 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -179,24 +179,17 @@ typedef struct { // NoC edge, but the diagram assumes four. 
// -// N S E W L0/L1 L2/L3 Input flits -// | | | | | | -// +---+ +---+ +---+ +---+ +---+ +---+ -// | F | | F | | F | | F | | F | | F | Fetchers -// +---+ +---+ +---+ +---+ +---+ +---+ -// | | | | | | -// +---------------------------------+ -// | Crossbar | Routing -// +---------------------------------+ -// | | | | -// N/L0 S/L1 E/L2 W/L3 Output queues -// | | | | -// +---------------------------+ -// | Splitter | Final splitting -// +---------------------------+ -// | | | | | | | | -// N S E W L0 L1 L2 L3 Output flits -// +// N S E W L0 L1 L2 L3 Input flits +// | | | | | | | | +// +---+ +---+ +---+ +---+ +---+ +---+ +---+ +---+ +// | F | | F | | F | | F | | F | | F | | F | | F | Fetchers +// +---+ +---+ +---+ +---+ +---+ +---+ +---+ +---+ +// | | | | | | | | +// +-------------------------------------------+ +// | Crossbar | Routing +// +-------------------------------------------+ +// | | | | | | | | +// N S E W L0 L1 L2 L3 Output queues // The core functionality is implemented in the fetchers, which: // (1) extract routing keys from incoming flits; @@ -218,10 +211,7 @@ typedef struct { // After the fetchers have interpreted the flits, they are fed to a // fair crossbar which organises them by destination into output -// queues. To reduce logic, we allow each inter-board link to share -// an output queue with a local link, as this does not compromise -// forward progress. Finally the queues are split to provide an -// output stream for each possible destination. +// queues. 
// ============================================================================= // Fetcher @@ -751,8 +741,8 @@ module mkProgRouterCrossbar#( // Current choice of flit source Vector#(numOut, Reg#(Bit#(numIn))) choiceReg <- replicateM(mkReg(0)); - // Output queue - Vector#(numOut, Queue1#(RoutedFlit)) outQueue <- + // Output queues + Vector#(numOut, Queue#(RoutedFlit)) outQueue <- replicateM(mkUGShiftQueue(QueueOptFmax)); // Selector mux for each out queue @@ -857,8 +847,7 @@ endinterface interface ProgRouter; // Incoming and outgoing flits interface Vector#(`FetchersPerProgRouter, In#(Flit)) flitIn; - interface Vector#(`ProgRouterCrossbarOutputs, BOut#(Flit)) flitOut; - interface Vector#(`MailboxMeshXLen, BOut#(Flit)) nocFlitOut; + interface Vector#(`FetchersPerProgRouter, BOut#(Flit)) flitOut; // Interface to off-chip memory interface Vector#(`DRAMsPerBoard, @@ -879,44 +868,36 @@ module mkProgRouter#(BoardId boardId) (ProgRouter); fetchers[i] <- mkFetcher(boardId, i); // Crossbar routing functions - function Bit#(2) xcoord(RoutedFlit rf) = + function Bit#(`MailboxMeshXBits) xcoord(RoutedFlit rf) = zeroExtend(rf.flit.dest.addr.mbox.x); - function Bool routeN(RoutedFlit rf) = - rf.decision == RouteNorth || (rf.decision == RouteNoC && xcoord(rf) == 0); - function Bool routeS(RoutedFlit rf) = - rf.decision == RouteSouth || (rf.decision == RouteNoC && xcoord(rf) == 1); - function Bool routeE(RoutedFlit rf) = - rf.decision == RouteEast || (rf.decision == RouteNoC && xcoord(rf) == 2); - function Bool routeW(RoutedFlit rf) = - rf.decision == RouteWest || (rf.decision == RouteNoC && xcoord(rf) == 3); - Vector#(`ProgRouterCrossbarOutputs, SelectorFunc) funcs = - vector(routeN, routeS, routeE, routeW); + function Bool routeN(RoutedFlit rf) = rf.decision == RouteNorth; + function Bool routeS(RoutedFlit rf) = rf.decision == RouteSouth; + function Bool routeE(RoutedFlit rf) = rf.decision == RouteEast; + function Bool routeW(RoutedFlit rf) = rf.decision == RouteWest; + function 
Bool routeL(Bit#(`MailboxMeshXBits) x, RoutedFlit rf) = + rf.decision == RouteNoC && xcoord(rf) == x; + Vector#(`FetchersPerProgRouter, SelectorFunc) funcs; + funcs[0] = routeN; funcs[1] = routeS; + funcs[2] = routeE; funcs[3] = routeW; + for (Integer i = 0; i < `MailboxMeshXLen; i=i+1) + funcs[4+i] = routeL(fromInteger(i)); // Crossbar function BOut#(RoutedFlit) getFetcherFlitOut(Fetcher f) = f.flitOut; Vector#(`FetchersPerProgRouter, BOut#(RoutedFlit)) fetcherOuts = map(getFetcherFlitOut, fetchers); - Vector#(`ProgRouterCrossbarOutputs, BOut#(RoutedFlit)) + Vector#(`FetchersPerProgRouter, BOut#(RoutedFlit)) crossbarOuts <- mkProgRouterCrossbar(funcs, fetcherOuts); + Vector#(`FetchersPerProgRouter, BOut#(Flit)) crossbarOutFlits; + function Flit toFlit (RoutedFlit rf) = rf.flit; + for (Integer i = 0; i < `FetchersPerProgRouter; i=i+1) + crossbarOutFlits[i] <- onBOut(toFlit, crossbarOuts[i]); // Flit input interfaces Vector#(`FetchersPerProgRouter, In#(Flit)) flitInIfc = newVector; for (Integer i = 0; i < `FetchersPerProgRouter; i=i+1) flitInIfc[i] = fetchers[i].flitIn; - // Flit output interfaces - Vector#(`ProgRouterCrossbarOutputs, BOut#(Flit)) flitOutIfc = newVector; - Vector#(`MailboxMeshXLen, BOut#(Flit)) nocFlitOutIfc = newVector; - - // Strands - function Bool forNoC(RoutedFlit rf) = rf.decision == RouteNoC; - for (Integer i = 0; i < `ProgRouterCrossbarOutputs; i=i+1) begin - match {.noc, .other} <- splitFlits(forNoC, crossbarOuts[i]); - flitOutIfc[i] = other; - if (i < `MailboxMeshXLen) nocFlitOutIfc[i] = noc; - end - function Flit toFlit (RoutedFlit rf) = rf.flit; - // RAM interfaces Vector#(`DRAMsPerBoard, Vector#(`FetchersPerProgRouter, In#(DRAMResp))) ramRespIfc = replicate(newVector); @@ -945,8 +926,7 @@ module mkProgRouter#(BoardId boardId) (ProgRouter); function FetcherActivity getActivity(Fetcher f) = f.activity; interface flitIn = flitInIfc; - interface flitOut = flitOutIfc; - interface nocFlitOut = nocFlitOutIfc; + interface flitOut = 
crossbarOutFlits; interface ramReqs = ramReqIfc; interface ramResps = ramRespIfc; interface activities = map(getActivity, fetchers); From 5c72ead39232f13b892177410c7394ba1ffd5707 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Wed, 6 May 2020 14:22:53 +0100 Subject: [PATCH 54/78] Set clock to 215MHz --- README.md | 2 +- apps/POLite/util/sumstats.awk | 2 +- config.py | 2 +- de5/S5_DDR3_QSYS.qsys | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index c6101804..c148323a 100644 --- a/README.md +++ b/README.md @@ -1180,7 +1180,7 @@ the DE5-Net. `MeshXLenWithinBox` | 3 | Boards in X dimension within box `MeshYLenWithinBox` | 2 | Boards in Y dimension within box `EnablePerfCount` | True | Enable performance counters - `ClockFreq` | 225 | Clock frequency in MHz + `ClockFreq` | 215 | Clock frequency in MHz Further parameters can be found in [config.py](config.py). diff --git a/apps/POLite/util/sumstats.awk b/apps/POLite/util/sumstats.awk index 9ccfac8c..719699aa 100755 --- a/apps/POLite/util/sumstats.awk +++ b/apps/POLite/util/sumstats.awk @@ -15,7 +15,7 @@ BEGIN { progRouterSent = 0; progRouterSentInter = 0; blockedSends = 0; - fmax = 220000000; + fmax = 215000000; if (boardsX == "" || boardsY == "") { boardsX = 3; boardsY = 2; diff --git a/config.py b/config.py index 6faca950..6500be58 100755 --- a/config.py +++ b/config.py @@ -188,7 +188,7 @@ def quoted(s): return "'\"" + s + "\"'" p["UseCustomAccelerator"] = False # Clock frequency (in MHz) -p["ClockFreq"] = 220 +p["ClockFreq"] = 215 #============================================================================== # Derived Parameters diff --git a/de5/S5_DDR3_QSYS.qsys b/de5/S5_DDR3_QSYS.qsys index dc87cb4b..4d8e3a49 100644 --- a/de5/S5_DDR3_QSYS.qsys +++ b/de5/S5_DDR3_QSYS.qsys @@ -891,7 +891,7 @@ - + @@ -1214,7 +1214,7 @@ - + From b8ad3b4f85bec7b32bcf07b931922522fac45b34 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Thu, 7 May 2020 13:25:16 +0100 Subject: [PATCH 
55/78] 2nd attempt at optional extra send slot --- README.md | 7 ++++- hostlink/DebugLink.cpp | 14 ++++++---- hostlink/DebugLink.h | 9 +++++- hostlink/HostLink.cpp | 24 ++++++++++++---- hostlink/HostLink.h | 13 ++++++++- include/tinsel.h | 12 ++++++++ rtl/DE5Top.bsv | 7 +++++ rtl/DebugLink.bsv | 35 +++++++++++++++++------ rtl/GenInit.sh | 19 ------------- rtl/Mailbox.bsv | 63 ++++++++++++++++++++++++++++++++++-------- rtl/Makefile | 11 ++------ 11 files changed, 152 insertions(+), 62 deletions(-) delete mode 100755 rtl/GenInit.sh diff --git a/README.md b/README.md index c148323a..acc9d10c 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ New section on programmable routers: New performance counters accessible from core zero on each board only: * `ProgRouterSent` and `ProgRouterSentInterBoard` -Document the following: +Document the following new perf counters: ```c++ // Performance counter: number of messages emitted by ProgRouter @@ -36,6 +36,11 @@ INLINE uint32_t tinselProgRouterSent(); INLINE uint32_t tinselProgRouterSentInterBoard(); ``` +Document extra send slot option: + + * `HostLinkParams`, `DebugLinkParams` + * `tinselSendSlotExtra()` + # Tinsel 0.7.1 Tinsel is a [RISC-V](https://riscv.org/)-based manythread diff --git a/hostlink/DebugLink.cpp b/hostlink/DebugLink.cpp index f838441d..0031969c 100644 --- a/hostlink/DebugLink.cpp +++ b/hostlink/DebugLink.cpp @@ -60,10 +60,10 @@ void DebugLink::putPacket(int x, int y, BoardCtrlPkt* pkt) } // Constructor -DebugLink::DebugLink(uint32_t numBoxesX, uint32_t numBoxesY) +DebugLink::DebugLink(DebugLinkParams p) { - boxMeshXLen = numBoxesX; - boxMeshYLen = numBoxesY; + boxMeshXLen = p.numBoxesX; + boxMeshYLen = p.numBoxesY; get_tryNextX = 0; get_tryNextY = 0; @@ -105,11 +105,11 @@ DebugLink::DebugLink(uint32_t numBoxesX, uint32_t numBoxesY) "But is has a box X coordinate of %i\n", thisBoxX); exit(EXIT_FAILURE); } - if ((thisBoxX+numBoxesX-1) >= TinselBoxMeshXLen || - (thisBoxY+numBoxesY-1) >= TinselBoxMeshYLen) 
{ + if ((thisBoxX+p.numBoxesX-1) >= TinselBoxMeshXLen || + (thisBoxY+p.numBoxesY-1) >= TinselBoxMeshYLen) { fprintf(stderr, "Requested box sub-mesh of size %ix%i " "is not valid from box %s\n", - numBoxesX, numBoxesY, hostname); + p.numBoxesX, p.numBoxesY, hostname); exit(EXIT_FAILURE); } @@ -187,6 +187,8 @@ DebugLink::DebugLink(uint32_t numBoxesX, uint32_t numBoxesY) if (y == 0) pkt.payload[2] |= 2; if (thisBoxX == 0 && boxMeshXLen == 1) pkt.payload[2] |= 4; if (thisBoxX == 1 && boxMeshXLen == 1) pkt.payload[2] |= 8; + // Reserve extra send slot? + pkt.payload[2] |= p.useExtraSendSlot ? 0x10 : 0; // Send commands to each board for (int b = 0; b < TinselBoardsPerBox; b++) { pkt.linkId = b; diff --git a/hostlink/DebugLink.h b/hostlink/DebugLink.h index fd3c8291..18d352dc 100644 --- a/hostlink/DebugLink.h +++ b/hostlink/DebugLink.h @@ -8,6 +8,13 @@ #include "BoardCtrl.h" #include "DebugLinkFormat.h" +// DebugLinkH parameters +struct DebugLinkParams { + uint32_t numBoxesX; + uint32_t numBoxesY; + bool useExtraSendSlot; +}; + class DebugLink { // Location of this box with full box mesh @@ -46,7 +53,7 @@ class DebugLink { int meshYLen; // Constructor - DebugLink(uint32_t numBoxesX, uint32_t numBoxesY); + DebugLink(DebugLinkParams params); // On given board, set destination core and thread void setDest(uint32_t boardX, uint32_t boardY, diff --git a/hostlink/HostLink.cpp b/hostlink/HostLink.cpp index 4708457e..725c673b 100644 --- a/hostlink/HostLink.cpp +++ b/hostlink/HostLink.cpp @@ -60,9 +60,11 @@ static int connectToPCIeStream(const char* socketPath) } // Internal constructor -void HostLink::constructor(uint32_t numBoxesX, uint32_t numBoxesY) +void HostLink::constructor(HostLinkParams p) { - if (numBoxesX > TinselBoxMeshXLen || numBoxesY > TinselBoxMeshYLen) { + useExtraSendSlot = p.useExtraSendSlot; + + if (p.numBoxesX > TinselBoxMeshXLen || p.numBoxesY > TinselBoxMeshYLen) { fprintf(stderr, "Number of boxes requested exceeds those available\n"); exit(EXIT_FAILURE); } 
@@ -92,7 +94,11 @@ void HostLink::constructor(uint32_t numBoxesX, uint32_t numBoxesY) #endif // Create DebugLink - debugLink = new DebugLink(numBoxesX, numBoxesY); + DebugLinkParams debugLinkParams; + debugLinkParams.numBoxesX = p.numBoxesX; + debugLinkParams.numBoxesY = p.numBoxesY; + debugLinkParams.useExtraSendSlot = p.useExtraSendSlot; + debugLink = new DebugLink(debugLinkParams); // Set board mesh dimensions meshXLen = debugLink->meshXLen; @@ -145,12 +151,20 @@ HostLink::HostLink() int x = str ? atoi(str) : 1; str = getenv("HOSTLINK_BOXES_Y"); int y = str ? atoi(str) : 1; - constructor(x, y); + HostLinkParams params; + params.numBoxesX = x; + params.numBoxesY = y; + params.useExtraSendSlot = false; + constructor(params); } HostLink::HostLink(uint32_t numBoxesX, uint32_t numBoxesY) { - constructor(numBoxesX, numBoxesY); + HostLinkParams params; + params.numBoxesX = numBoxesX; + params.numBoxesY = numBoxesY; + params.useExtraSendSlot = false; + constructor(params); } // Destructor diff --git a/hostlink/HostLink.h b/hostlink/HostLink.h index f6a7a71c..41d78303 100644 --- a/hostlink/HostLink.h +++ b/hostlink/HostLink.h @@ -16,6 +16,13 @@ #define PCIESTREAM "pciestream" #define PCIESTREAM_SIM "tinsel.b-1.1" +// HostLink parameters +struct HostLinkParams { + uint32_t numBoxesX; + uint32_t numBoxesY; + bool useExtraSendSlot; +}; + class HostLink { // Lock file for acquring exclusive access to PCIeStream int lockFile; @@ -33,8 +40,11 @@ class HostLink { char* sendBuffer; int sendBufferLen; + // Request an extra send slot when bringing up Tinsel FPGAs + bool useExtraSendSlot; + // Internal constructor - void constructor(uint32_t numBoxesX, uint32_t numBoxesY); + void constructor(HostLinkParams params); // Internal helper for sending messages bool sendHelper(uint32_t dest, uint32_t numFlits, void* payload, @@ -47,6 +57,7 @@ class HostLink { // Constructors HostLink(); HostLink(uint32_t numBoxesX, uint32_t numBoxesY); + HostLink(HostLinkParams params); // Destructor 
~HostLink(); diff --git a/include/tinsel.h b/include/tinsel.h index d06e2bfd..ab29fbae 100644 --- a/include/tinsel.h +++ b/include/tinsel.h @@ -129,6 +129,18 @@ INLINE volatile void* tinselSendSlot() return mb_scratchpad_base + (threadId << TinselLogBytesPerMsg); } +// Get pointer to thread's extra message slot reserved for sending +// (Assumes that HostLink has requested the extra slot) +INLINE volatile void* tinselSendSlotExtra() +{ + volatile char* mb_scratchpad_base = + (volatile char*) (1 << TinselLogBytesPerMailbox); + uint32_t threadId = tinselId() & + ((1<> FreeSlots.hex - -# Emit MIF file -../bin/hex-to-mif.py FreeSlots.hex $LogMsgsPerMailbox > ../de5/FreeSlots.mif diff --git a/rtl/Mailbox.bsv b/rtl/Mailbox.bsv index 0398b0e2..e08b1b9a 100644 --- a/rtl/Mailbox.bsv +++ b/rtl/Mailbox.bsv @@ -260,6 +260,9 @@ interface Mailbox; (* always_ready *) method Bit#(1) freeDone; // Network-side interface interface MailboxNet net; + // Initialise send slots (use extra send slot?) + (* always_ready, always_enabled *) + method Action initSendSlots(Option#(Bool) useExtraSendSlot); endinterface // Combined receive request/response interface @@ -292,6 +295,45 @@ module mkMailbox (Mailbox); Vector#(`CoresPerMailbox, InPort#(ReceiveReq)) rxReqPorts <- replicateM(mkInPort); + // Initialise free slots + // ===================== + + // Set of currently-unused message slots + // By default, the first ThreadsPerMailbox slots are reserved for sending + // Optionally, the first 2*ThreadsPerMailbox slots are reserved for sending + SizedQueue#(`LogMsgsPerMailbox, Bit#(`LogMsgsPerMailbox)) + freeSlots <- mkUGSizedQueuePrefetch; + + // Reserve extra send slot? + Wire#(Option#(Bool)) useExtraSendSlot <- mkBypassWire; + + // State of free slot initialiser + Reg#(Bit#(1)) freeSlotsInitState <- mkConfigReg(0); + + // Have the free slots been initialised yet? 
+ Reg#(Bool) freeSlotsInitDone <- mkConfigReg(False); + + // Next slot to insert into free slot queue + Reg#(Bit#(`LogMsgsPerMailbox)) freeSlotsInitNext <- mkConfigRegU; + + // Wait until config option available, which tells us how + // many slots to reserve for sending + rule initFreeSlots0 (freeSlotsInitState == 0); + if (useExtraSendSlot.valid) begin + freeSlotsInitNext <= useExtraSendSlot.value ? + fromInteger(2*`ThreadsPerMailbox) : `ThreadsPerMailbox; + freeSlotsInitState <= 1; + end + endrule + + // Initialise free slots + rule initFreeSlots1 (!freeSlotsInitDone && freeSlotsInitState == 1); + freeSlots.enq(freeSlotsInitNext); + freeSlotsInitNext <= freeSlotsInitNext + 1; + if (freeSlotsInitNext == fromInteger(2**`LogMsgsPerMailbox - 1)) + freeSlotsInitDone <= True; + endrule + // Message access unit // =================== @@ -336,15 +378,6 @@ module mkMailbox (Mailbox); Reg#(RefCount) refCountReg <- mkConfigRegU; Reg#(Bit#(`LogMsgsPerMailbox)) refCountSlot <- mkConfigRegU; - // Set of currently-unused message slots - // (The first ThreadsPerMailbox slots are reserved for sending) - QueueOpts freeSlotsOpts; - freeSlotsOpts.style = "AUTO"; - freeSlotsOpts.size = 2**`LogMsgsPerMailbox - `ThreadsPerMailbox; - freeSlotsOpts.file = Valid("FreeSlots"); - SizedQueue#(`LogMsgsPerMailbox, Bit#(`LogMsgsPerMailbox)) - freeSlots <- mkUGSizedQueuePrefetchOpts(freeSlotsOpts); - // Multicast buffer Vector#(`CoresPerMailbox, SizedQueue#(`LogMulticastBufferSize, MulticastBufferEntry)) @@ -598,7 +631,7 @@ module mkMailbox (Mailbox); // to a message slot is freed Reg#(Bit#(1)) freeDoneReg <- mkDReg(0); - rule free (freeReqPort.canGet); + rule free (freeReqPort.canGet && freeSlotsInitDone); FreeReq req = freeReqPort.value; // Process request in two cycles let count = refCount.dataOutB; @@ -667,6 +700,10 @@ module mkMailbox (Mailbox); endinterface endinterface + method Action initSendSlots(Option#(Bool) useExtra); + useExtraSendSlot <= useExtra; + endmethod + endmodule // 
============================================================================= @@ -1138,14 +1175,16 @@ import "BVI" ExternalTinselAccelerator = `ifndef UseCustomAccelerator -module mkMailboxAcc#(BoardId boardId, Integer tileX, Integer tileY) (Mailbox); +module mkMailboxAcc#(BoardId boardId, + Integer tileX, Integer tileY) (Mailbox); Mailbox mbox <- mkMailbox; return mbox; endmodule `else -module mkMailboxAcc#(BoardId boardId, Integer tileX, Integer tileY) (Mailbox); +module mkMailboxAcc#(BoardId boardId, + Integer tileX, Integer tileY) (Mailbox); // Instantiate standard mailbox Mailbox mbox <- mkMailbox; diff --git a/rtl/Makefile b/rtl/Makefile index 57a2acf8..e938b015 100644 --- a/rtl/Makefile +++ b/rtl/Makefile @@ -28,13 +28,13 @@ sim: $(TOPMOD) $(HOSTTOPMOD) .PHONY: verilog verilog: $(TOPMOD).v $(HOSTTOPMOD).v -$(TOPMOD): *.bsv *.c InstrMem.hex FreeSlots.hex +$(TOPMOD): *.bsv *.c InstrMem.hex make -C $(TINSEL_ROOT)/apps/boot make -C $(TINSEL_ROOT)/hostlink udsock $(BSC) $(BSCFLAGS) $(DEFS) -D SIMULATE -sim -g $(TOPMOD) -u $(TOPFILE) $(BSC) $(BSCFLAGS) -sim -o $(TOPMOD) -e $(TOPMOD) *.c -$(TOPMOD).v: *.bsv $(QP)/InstrMem.mif $(QP)/FreeSlots.mif +$(TOPMOD).v: *.bsv $(QP)/InstrMem.mif make -C $(TINSEL_ROOT)/apps/boot $(BSC) $(BSCFLAGS) -opt-undetermined-vals -unspecified-to X \ $(DEFS) -u -verilog -g $(TOPMOD) $(TOPFILE) @@ -63,12 +63,6 @@ InstrMem.hex: $(QP)/InstrMem.mif: make -C $(TINSEL_ROOT)/apps/boot -FreeSlots.hex: GenInit.sh - ./GenInit.sh - -$(QP)/FreeSlots.mif: GenInit.sh - ./GenInit.sh - .PHONY: test-mem test-mem: testMem @@ -83,7 +77,6 @@ clean: rm -f de5Top.v mkCore.v mkDCache.v mkMailbox.v mkDebugLinkRouter.v rm -f mkFPU.v mkMeshRouter.v rm -f de5BridgeTop.v - rm -f FreeSlots.hex ../de5/FreeSlots.mif rm -rf test-mem-log rm -rf test-mailbox-log rm -rf test-array-of-queue-log From a49ee33b42620e75d6c85a4898886be6674f9a72 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Thu, 7 May 2020 14:08:08 +0100 Subject: [PATCH 56/78] Forgot to add new HostLink 
constructor --- hostlink/HostLink.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hostlink/HostLink.cpp b/hostlink/HostLink.cpp index 725c673b..dd896f4d 100644 --- a/hostlink/HostLink.cpp +++ b/hostlink/HostLink.cpp @@ -167,6 +167,11 @@ HostLink::HostLink(uint32_t numBoxesX, uint32_t numBoxesY) constructor(params); } +HostLink::HostLink(HostLinkParams params) +{ + constructor(params); +} + // Destructor HostLink::~HostLink() { From 590cb373b32f68b157b60fb885fdcab2b2d1733a Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Sun, 10 May 2020 11:09:53 +0100 Subject: [PATCH 57/78] Full throughput on ProgRouter side of ram req tree --- rtl/Connections.bsv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rtl/Connections.bsv b/rtl/Connections.bsv index d093001b..7f542acc 100644 --- a/rtl/Connections.bsv +++ b/rtl/Connections.bsv @@ -84,7 +84,7 @@ module connectClientsToOffChipRAM#( mkMergeTreeB(Fair, mkUGShiftQueue1(QueueOptFmax), routerReqs); - Queue1#(DRAMReq) fetcherReqsQueue <- mkUGShiftQueue1(QueueOptFmax); + Queue#(DRAMReq) fetcherReqsQueue <- mkUGQueue; connectToQueue(fetcherReqs, fetcherReqsQueue); BOut#(DRAMReq) fetcherReqsB = queueToBOut(fetcherReqsQueue); From 78842a7e6fde4141874c48f4d916c6947c7c9e25 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Mon, 11 May 2020 14:09:01 +0100 Subject: [PATCH 58/78] Handle 0-size routing keys properly --- include/POLite/ProgRouters.h | 2 +- rtl/ProgRouter.bsv | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/POLite/ProgRouters.h b/include/POLite/ProgRouters.h index a1c5942d..9890c43e 100644 --- a/include/POLite/ProgRouters.h +++ b/include/POLite/ProgRouters.h @@ -284,7 +284,7 @@ class ProgRouterMesh { // Returns routing key uint32_t addDestsFromBoardXY(uint32_t senderX, uint32_t senderY, Seq* dests) { - assert(dests->numElems > 0); + if (dests->numElems == 0) return 0; // Categorise dests into local, N, S, E, and W groups Seq local(dests->numElems); diff --git 
a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index e8a7d97c..9570ff09 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -432,6 +432,7 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); // Ignore keys with zero beats if (key.numBeats == 0) begin consumeState <= 0; + incReceivedReg <= 1; end else begin consumeState <= 2; // Claim chosen slot From b6f98ad5a8ea4a7574c8d363869b11aa6d8c9c8d Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Mon, 11 May 2020 14:56:15 +0000 Subject: [PATCH 59/78] Faster edge list reader --- include/EdgeList.h | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/include/EdgeList.h b/include/EdgeList.h index efb65c73..a4bca0fe 100644 --- a/include/EdgeList.h +++ b/include/EdgeList.h @@ -3,8 +3,11 @@ #define _NETWORK_H_ #include -#include #include +#include +#include +#include +#include struct EdgeList { // Number of nodes and edges @@ -18,50 +21,42 @@ struct EdgeList { // Read network from file void read(const char* filename, bool warn = true) { - // Read edges - FILE* fp = fopen(filename, "rt"); - if (fp == NULL) { - fprintf(stderr, "Can't open '%s'\n", filename); - exit(EXIT_FAILURE); - } + std::fstream file(filename, std::ios_base::in); + std::vector vec; // Count number of nodes and edges numEdges = 0; numNodes = 0; - int ret; - while (1) { - uint32_t src, dst; - ret = fscanf(fp, "%d %d", &src, &dst); - if (ret == EOF) break; + uint32_t numInts = 0; + uint32_t val; + while (file >> val) { + vec.push_back(val); + numNodes = val >= numNodes ? val+1 : numNodes; numEdges++; - numNodes = src >= numNodes ? src+1 : numNodes; - numNodes = dst >= numNodes ? 
dst+1 : numNodes; } - rewind(fp); + assert((numEdges&1) == 0); + numEdges >>= 1; uint32_t* count = (uint32_t*) calloc(numNodes, sizeof(uint32_t)); - for (int i = 0; i < numEdges; i++) { - uint32_t src, dst; - ret = fscanf(fp, "%d %d", &src, &dst); - count[src]++; + for (int i = 0; i < vec.size(); i+=2) { + count[vec[i]]++; } // Create mapping from node id to neighbours neighbours = (uint32_t**) calloc(numNodes, sizeof(uint32_t*)); - rewind(fp); for (int i = 0; i < numNodes; i++) { neighbours[i] = (uint32_t*) calloc(count[i]+1, sizeof(uint32_t)); neighbours[i][0] = count[i]; } - for (int i = 0; i < numEdges; i++) { - uint32_t src, dst; - ret = fscanf(fp, "%d %d", &src, &dst); + for (int i = 0; i < vec.size(); i+=2) { + uint32_t src = vec[i]; + uint32_t dst = vec[i+1]; neighbours[src][count[src]--] = dst; } // Release free(count); - fclose(fp); + file.close(); if (warn && minFanOut() == 0) { printf("Warning: some vertices have no outgoing edges and\n"); From 35dd55ad555be2bd544f8d17384d24bb3562b1a8 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Mon, 11 May 2020 15:46:28 +0000 Subject: [PATCH 60/78] Add direct-mapped placer --- include/POLite/Placer.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/include/POLite/Placer.h b/include/POLite/Placer.h index 1468f0c7..4178af50 100644 --- a/include/POLite/Placer.h +++ b/include/POLite/Placer.h @@ -14,7 +14,8 @@ struct Placer { enum Method { Default, Metis, - Random + Random, + Direct }; const Method defaultMethod=Metis; @@ -61,6 +62,8 @@ struct Placer { method=Metis; else if (!strcmp(e, "random")) method=Random; + else if (!strcmp(e, "direct")) + method=Direct; else if (!strcmp(e, "default") || *e == '\0') method=Default; else { @@ -159,6 +162,18 @@ struct Placer { } } + // Partition the graph using direct mapping + void partitionDirect() { + uint32_t numVertices = graph->incoming->numElems; + uint32_t numParts = width * height; + uint32_t partSize = (numVertices + numParts) / 
numParts; + + // Populate result array + for (uint32_t i = 0; i < numVertices; i++) { + partitions[i] = i / partSize; + } + } + void partition() { switch(method){ @@ -169,6 +184,9 @@ struct Placer { case Random: partitionRandom(); break; + case Direct: + partitionDirect(); + break; } } From 8f4d197e980cbdcf31bcffd5c8f77b4eca7f87d8 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Thu, 14 May 2020 10:41:45 +0100 Subject: [PATCH 61/78] Bit of work on the docs --- README.md | 127 +++++++++++------ doc/PIP-0024-global-multicast.md | 226 +++++++++++++++++++++++++++++++ 2 files changed, 312 insertions(+), 41 deletions(-) create mode 100644 doc/PIP-0024-global-multicast.md diff --git a/README.md b/README.md index acc9d10c..502a2e29 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ TODO, document the following: // ========== // Send message at addr using given routing key -inline void tinselKeySend(int key, volatile void* addr); +inline void tinselKeySend(uint32_t key, volatile void* addr); // HostLink API // ============ @@ -36,12 +36,7 @@ INLINE uint32_t tinselProgRouterSent(); INLINE uint32_t tinselProgRouterSentInterBoard(); ``` -Document extra send slot option: - - * `HostLinkParams`, `DebugLinkParams` - * `tinselSendSlotExtra()` - -# Tinsel 0.7.1 +# Tinsel 0.8 Tinsel is a [RISC-V](https://riscv.org/)-based manythread message-passing architecture designed for FPGA clusters. It is being @@ -76,8 +71,12 @@ Released on 11 Apr 2019 and maintained in the (Multi-box cluster.) * [v0.7](https://github.com/POETSII/tinsel/releases/tag/v0.7): Released on 2 Dec 2019 and maintained in the +[tinsel-0.7.1 branch](https://github.com/POETSII/tinsel/tree/tinsel-0.7.1). +(Local hardware multicast.) +* [v0.8](https://github.com/POETSII/tinsel/releases/tag/v0.8): +Released on 18 May 2020 and maintained in the [master branch](https://github.com/POETSII/tinsel/). -(Localised hardware multicast.) +(Global hardware multicast.) 
## Contents @@ -87,8 +86,9 @@ Released on 2 Dec 2019 and maintained in the * [4. Tinsel Cache](#4-tinsel-cache) * [5. Tinsel Mailbox](#5-tinsel-mailbox) * [6. Tinsel Network](#6-tinsel-network) -* [7. Tinsel HostLink](#7-tinsel-hostlink) -* [8. POLite API](#8-polite-api) +* [7. Tinsel Router](#7-tinsel-router) +* [8. Tinsel HostLink](#8-tinsel-hostlink) +* [9. POLite API](#9-polite-api) ## Appendices @@ -135,11 +135,16 @@ demands, but fairly modest compute requrements. The main features are: time step, or termination of the application, supporting both synchronous and asynchronous event-driven systems. - * **Localised hardware multicast**. Threads can send a message to - multiple colocated destination threads simultaneously, greatly reducing + * **Local hardware multicast**. Threads can send a message to + multiple collocated destination threads simultaneously, greatly reducing the number of inter-thread messages in applications exhibiting good locality of communication. + * **Global hardware multicast**. Programmable routers + automatically propagate messages to any number of destination + threads distributed throughout the cluster, minimising inter-FPGA + bandwidth usage for distributed fanouts. + * **Host communication**. Tinsel threads communicate with x86 machines distributed throughout the FPGA cluster, for command and control, via PCI Express and USB. @@ -148,7 +153,7 @@ demands, but fairly modest compute requrements. The main features are: include custom accelerators written in SystemVerilog. This repository also includes a prototype high-level vertex-centric -programming API for Tinsel, called [POLite](#8-polite-api). +programming API for Tinsel, called [POLite](#9-polite-api). ## 2. High-Level Structure @@ -175,11 +180,13 @@ accelerators](doc/custom) in tiles. 
#### Tinsel FPGA -Each FPGA contains two *Tinsel Slices*, with each slice typically +Each FPGA contains two *Tinsel Slices*, with each slice by default comprising eight tiles connected to one 4GB DDR3 DIMM and two 8MB QDRII+ SRAMs. All tiles are connected together via a routers to form a 2D NoC. The NoC is connected to the inter-FPGA links using a -per-board router. +*per-board programmable router*. Note that the per-board router also +has connections to off-chip memory: this is where the programmable +routing tables are stored. @@ -460,16 +467,22 @@ has reached the destination or none of it has. As one would expect, shorter messages consume less bandwidth than longer ones. The size of a flit is defined by `LogWordsPerFlit`. -At the heart of a mailbox is a memory-mapped *scratchpad* that -stores both incoming and outgoing messages. The capacity of the -scratchpad is defined by `LogMsgsPerMailbox`. Each thread connected -to the mailbox has one message slot reserved for sending messages. -The address of this slot is obtained using the following Tinsel API -call. +At the heart of a mailbox is a memory-mapped *scratchpad* that stores +both incoming and outgoing messages. The capacity of the scratchpad +is defined by `LogMsgsPerMailbox`. Each thread connected to the +mailbox has one or two message slots reserved for sending messages. +(By default, only a single send slot is reserved; the extra send slot +may be optionally reserved at power-up via a parameter to the +[HostLink](#8-tinsel-hostlink) constructor.) The addresses of these +slots are obtained using the following Tinsel API calls. ```c -// Get pointer to thread's message slot reserved for sending. 
+// Get pointer to thread's message slot reserved for sending volatile void* tinselSendSlot(); + +// Get pointer to thread's extra message slot reserved for sending +// (Assumes that HostLink has requested the extra slot) +volatile void* tinselSendSlotExtra(); ``` Once a thread has written a message to the scratchpad, it can trigger @@ -681,7 +694,11 @@ communication. And since we are using the links point-to-point, almost all of the ethernet header fields can be used for our own purposes, resulting in very little overhead on the wire. -## 7. Tinsel HostLink +## 7. Tinsel Router + +TODO + +## 8. Tinsel HostLink *HostLink* is the means by which Tinsel cores running on a mesh of FPGA boards communicate with a *host PC*. It comprises three main @@ -689,7 +706,7 @@ communication channels: * An FPGA *bridge board* that connects the host PC inside a POETS box (PCI Express) to the FPGA mesh (SFP+). Using this high-bandwidth -channel (10Gbps), the host PC can efficiently send messages to any +channel (2 x 10Gbps), the host PC can efficiently send messages to any Tinsel thread and vice-versa. * A set of *debug links* connecting the host PC inside a POETS box to @@ -704,34 +721,45 @@ each FPGA's *power management module* via separate USB UART cables. These connections can be used to power-on/power-off each FPGA and to monitor power consumption, temperature, and fan tachometer. -HostLink supports multiple POETS boxes, but requires that one of these -boxes is designated as the **master box**. Currently, all messages -are injected/extracted to/from the FPGA network via the master box's -bridge board. - -A Tinsel application typically consists of two programs: one which -runs on the RISC-V cores, linked against the [Tinsel +HostLink allows multiple POETS boxes to be used to run an application, +but requires that one of these boxes is designated as the **master +box**. 
A Tinsel application typically consists of two programs: one +which runs on the RISC-V cores, linked against the [Tinsel API](#f-tinsel-api), and the other which runs on the host PC of the master box, linked against the [HostLink API](#g-hostlink-api). The HostLink API is implemented as a C++ class called `HostLink`. The constructor for this class first powers up all the worker FPGAs (which -are by default powered down). On power-up the FPGAs are automatically -programmed using the Tinsel bit-file residing in flash memory, and are -ready to be used within a few seconds, as soon as the `HostLink` -constructor returns. +are by default powered down). On power-up, the FPGAs are +automatically programmed using the Tinsel bit-file residing in flash +memory, and are ready to be used within a few seconds, as soon as the +`HostLink` constructor returns. The `HostLink` constructor is overloaded: ```cpp HostLink::HostLink(); HostLink::HostLink(uint32_t numBoxesX, uint32_t numBoxesY); +HostLink::HostLink(HostLinkParams params); ``` If it is called without any arguments, then it assumes that a single -box is to be used. Alternatively, the user may request multiple -boxes by specifying the width and height of the box sub-mesh they -wish to use. (The box from which the application is started is -considered as the origin of this sub-mesh.) +box is to be used. Alternatively, the user may request multiple boxes +by specifying the width and height of the box sub-mesh they wish to +use. (The box from which the application is started, i.e. the master +box, is considered as the origin of this sub-mesh.) The most +general constructor takes a `HostLinkParams` structure as an argument, +which allows additional options to be specified.
+ +```cpp +// HostLink parameters +struct HostLinkParams { + // Number of boxes to use (default is 1x1) + uint32_t numBoxesX; + uint32_t numBoxesY; + // Enable use of tinselSendSlotExtra() on threads (default is false) + bool useExtraSendSlot; +}; +``` HostLink methods for sending and receiving messages on the host PC are as follows. @@ -937,7 +965,7 @@ not be called. When the application returns from `main()`, all but one thread on each core are killed and the remaining threads reenter the boot loader. -## 8. POLite API +## 9. POLite API POLite is a layer of abstraction that takes care of mapping arbitrary task graphs onto the Tinsel overlay, completely hiding architectural @@ -1300,6 +1328,13 @@ inline void tinselFlushLine(uint32_t lineNum, uint32_t way); // (A message of length n is comprised of n+1 flits) inline void tinselSetLen(uint32_t n); +// Get pointer to thread's message slot reserved for sending +volatile void* tinselSendSlot(); + +// Get pointer to thread's extra message slot reserved for sending +// (Assumes that HostLink has requested the extra slot) +volatile void* tinselSendSlotExtra(); + // Determine if calling thread can send a message inline uint32_t tinselCanSend(); @@ -1518,14 +1553,24 @@ class HostLink { // Trigger application execution on all started threads on given core void goOne(uint32_t meshX, uint32_t meshY, uint32_t coreId); }; + +// HostLink parameters (used by the most general HostLink constructor) +struct HostLinkParams { + // Number of boxes to use (default is 1x1) + uint32_t numBoxesX; + uint32_t numBoxesY; + // Enable use of tinselSendSlotExtra() on threads (default is false) + bool useExtraSendSlot; +}; ``` ```cpp class DebugLink { public: - // Constructor + // Constructors DebugLink(uint32_t numBoxesX, uint32_t numBoxesY); + DebugLink(DebugLinkParams params); // On given board, set destination core and thread void setDest(uint32_t boardX, uint32_t boardY, diff --git a/doc/PIP-0024-global-multicast.md 
b/doc/PIP-0024-global-multicast.md new file mode 100644 index 00000000..65105f71 --- /dev/null +++ b/doc/PIP-0024-global-multicast.md @@ -0,0 +1,226 @@ +# PIP-0024: Programmable routers and global multicast + +Author: Matthew Naylor + +This proposal replaces PIP 21. + +## Proposal + +We propose to generalise the destination component of a message so +that it can be (1) a thread id; or (2) a **routing key**. A message, +sent by a thread, containing a routing key as a destination will go to +a **per-board router** on the same FPGA. The router will use the key +as an index into a DRAM-based routing table and automatically +propagate the message towards all the destinations associated with +that key. + +## Motivation/Rationale + +PIP 22 resulted in a *mailbox-level* multicast feature, implemented in +Tinsel 0.7. It enables each thread to send a message +simultaneously to any subset of the 64 threads on a destination +mailbox. It works well when graphs exhibit good locality, with +destination vertices often collocated on the same mailbox. + +However, it has a few drawbacks: + + 1. Costly graph partitioning algorithms are needed to identify + locality. This is problematic for graphs with billions of edges + and vertices, because mapping time may significantly outweigh + execution time. (Indeed, graph partitioning is itself an + interesting application for the hardware.) + + 2. In some graphs there are limits to how well destination vertices + can be collocated after partitioning. For example, *small-world + graphs* contain some extremely large, highly-distributed fanouts. + +A *global multicast* feature should reduce the need to find optimal +partitions for very large graphs, and support distributed fanouts. It +should also move work away from the cores and into the hardware +routers: the softswitch no longer needs to iterate over the outgoing +edges of a pin.
While providing these improvements, it is also +important to maintain the advantages of the existing mailbox-level +multicast, for applications in which the mapping time is not a +concern. + +## Functional overview + +A **routing key** is a 32-bit value consisting of a *ram id*, an +*address*, and a *size*: + +```sv +// 32-bit routing key (MSB to LSB) +typedef struct { + // Which off-chip RAM on this board? + Bit#(`LogDRAMsPerBoard) ram; + // Pointer to array of routing beats containing routing records + Bit#(`LogBeatsPerDRAM) ptr; + // Number of beats in the array + Bit#(`LogRoutingEntryLen) numBeats; +} RoutingKey; +``` + +When a message reaches the per-board router, the `ptr` field of the +routing key is used as an index into DRAM, where a sequence of 256-bit +**routing beats** are found. The `numBeats` field of the routing key +indicates how many contiguous routing beats there are. Knowing the +size before the lookup makes the hardware simpler and more efficient, +e.g. it can avoid blocking on responses and issue a burst of an +appropriate size. The value of `numBeats` may be zero. + +A routing beat consists of a *size* and a sequence of five 48-bit +*routing chunks*: + +```sv +// 256-bit routing beat (aligned, MSB to LSB) +typedef struct { + // Number of routing records present in this beat + Bit#(16) size; + // Five 48-bit record chunks + Vector#(5, Bit#(48)) chunks; +} RoutingBeat; +``` + +The *size* must lie in the range 1 to 5 inclusive (0 is disallowed). +A **routing record** consists of one or two routing chunks, depending +on the **record type**. + +All byte orderings are little endian. For example, the order of bytes +in a routing beat is as follows. + +``` +Byte Contents +---- -------- +31: Upper byte of length (i.e. number of records in beat) +30: Lower byte of length +29: Upper byte of first chunk + ... +24: Lower byte of first chunk +23: Upper byte of second chunk + ... +18: Lower byte of second chunk +17: Upper byte of third chunk + ... 
+12: Lower byte of third chunk +11: Upper byte of fourth chunk + ... + 6: Lower byte of fourth chunk + 5: Upper byte of fifth chunk + ... + 0: Lower byte of fifth chunk +``` + +Clearly, both routing keys and routing beats have a maximum size. +However, in principle there is no limit to the number of records +associated with a key, due to the possibility of *indirection records* +(see below). + +There are five types of routing record, defined below. + +**48-bit Unicast Router-to-Mailbox (URM1).** + +```sv +typedef struct { + // Record type (URM1 == 0) + Bit#(3) tag; + // Mailbox destination + Bit#(4) mbox; + // Mailbox-local thread identifier + Bit#(6) thread; + // Unused + Bit#(3) unused; + // Local key. The first word of the message + // payload is overwritten with this. + Bit#(32) localKey; +} URM1Record; +``` + +The `localKey` can be used for anything, but might encode the +destination thread-local device identifier, or edge identifier, or +both. The `mbox` field is currently 4 bits (two Y bits followed by +two X bits), but there are spare bits available to increase the size +of this field in future if necessary. + +**96-bit Unicast Router-to-Mailbox (URM2).** + +```sv +typedef struct { + // Record type (URM2 == 1) + Bit#(3) tag; + // Mailbox destination + Bit#(4) mbox; + // Mailbox-local thread identifier + Bit#(6) thread; + // Currently unused + Bit#(19) unused; + // Local key. The first two words of the message + // payload is overwritten with this. + Bit#(64) localKey; +} URM2Record; +``` + +This is the same as a URM1 record except the local key is 64-bits in +size. 
+ +**48-bit Router-to-Router (RR).** + +```sv +typedef struct { + // Record type (RR == 2) + Bit#(3) tag; + // Direction (N,S,E,W == 0,1,2,3) + Bit#(2) dir; + // Currently unused + Bit#(11) unused; + // New 32-bit routing key that will replace the one in the + // current message for the next hop of the message's journey + Bit#(32) newKey; +} RRRecord; +``` + +The `newKey` field will replace the key in the current message for the +next hop of the message's journey. Introducing a new key at each hop +simplifies the mapping process (keeping it quick). + +**96-bit Multicast Router-to-Mailbox (MRM).** + +```sv +typedef struct { + // Record type (MRM == 3) + Bit#(3) tag; + // Mailbox destination + Bit#(4) mbox; + // Currently unused + Bit#(9) unused; + // Local key. The least-significant half-word + // of the message is replaced with this + Bit#(16) localKey; + // Mailbox-local destination mask + Bit#(64) destMask; +} MRMRecord; +``` + +**48-bit Indirection (IND).** + +```sv +// 48-bit Indirection (IND) record +// Note the restrictions on IND records: +// 1. At most one IND record per key lookup +// 2. A max-sized key lookup must contain an IND record +typedef struct { + // Record type (IND == 4) + Bit#(3) tag; + // Currently unused + Bit#(13) unused; + // New 32-bit routing key for new set of records on current router + Bit#(32) newKey; +} INDRecord; +``` + +Indirection records can be used to handle large fanouts, which exceed +the number of bits available in the size portion of the routing key. + +## Impact + +Since use of routing keys is optional, existing applications will +continue to work unmodified. 
From bd03ed85f2097f4d15f5adbe04f3571e488fdb01 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Thu, 14 May 2020 10:36:45 +0000 Subject: [PATCH 62/78] Relax POLite constraint on min fan out POLite now allows empty pins, so fan outs of some vertices may be zero --- apps/POLite/asp-sync/Run.cpp | 1 + apps/POLite/heat-sync/Run.cpp | 1 - apps/POLite/izhikevich-sync/Run.cpp | 3 ++- apps/POLite/pagerank-sync/Run.cpp | 4 ++-- apps/POLite/sssp-async/Run.cpp | 4 ++-- apps/POLite/sssp-sync/Run.cpp | 4 ++-- include/EdgeList.h | 7 +------ 7 files changed, 10 insertions(+), 14 deletions(-) diff --git a/apps/POLite/asp-sync/Run.cpp b/apps/POLite/asp-sync/Run.cpp index 3264d114..287f32db 100644 --- a/apps/POLite/asp-sync/Run.cpp +++ b/apps/POLite/asp-sync/Run.cpp @@ -22,6 +22,7 @@ int main(int argc, char**argv) // Print max fan-out printf("Max fan-out = %d\n", net.maxFanOut()); + printf("Min fan-out = %d\n", net.minFanOut()); assert(net.minFanOut() > 0); // Check that parameters make sense diff --git a/apps/POLite/heat-sync/Run.cpp b/apps/POLite/heat-sync/Run.cpp index c3db2fbf..91652712 100644 --- a/apps/POLite/heat-sync/Run.cpp +++ b/apps/POLite/heat-sync/Run.cpp @@ -25,7 +25,6 @@ int main(int argc, char **argv) // Print max fan-out printf("Min fan-out = %d\n", net.minFanOut()); printf("Max fan-out = %d\n", net.maxFanOut()); - assert(net.minFanOut() > 0); // Connection to tinsel machine HostLink hostLink; diff --git a/apps/POLite/izhikevich-sync/Run.cpp b/apps/POLite/izhikevich-sync/Run.cpp index dd1ac79e..09efb701 100644 --- a/apps/POLite/izhikevich-sync/Run.cpp +++ b/apps/POLite/izhikevich-sync/Run.cpp @@ -21,7 +21,8 @@ int main(int argc, char**argv) // Read network EdgeList net; net.read(argv[1]); - assert(net.minFanOut() > 0); + printf("Max fan-out = %d\n", net.maxFanOut()); + printf("Min fan-out = %d\n", net.minFanOut()); // Connection to tinsel machine HostLink hostLink; diff --git a/apps/POLite/pagerank-sync/Run.cpp b/apps/POLite/pagerank-sync/Run.cpp index 
3ce786b5..1b0eb356 100644 --- a/apps/POLite/pagerank-sync/Run.cpp +++ b/apps/POLite/pagerank-sync/Run.cpp @@ -27,9 +27,9 @@ int main(int argc, char **argv) EdgeList net; net.read(argv[1]); printf(" done\n"); - assert(net.minFanOut() > 0); - // Print max fan-out + // Print fan-out + printf("Min fan-out = %d\n", net.minFanOut()); printf("Max fan-out = %d\n", net.maxFanOut()); // Create nodes in POETS graph diff --git a/apps/POLite/sssp-async/Run.cpp b/apps/POLite/sssp-async/Run.cpp index b9c174a3..b78ccec4 100644 --- a/apps/POLite/sssp-async/Run.cpp +++ b/apps/POLite/sssp-async/Run.cpp @@ -20,9 +20,9 @@ int main(int argc, char**argv) EdgeList net; net.read(argv[1]); - // Print max fan-out + // Print fan-out printf("Max fan-out = %d\n", net.maxFanOut()); - assert(net.minFanOut() > 0); + printf("Min fan-out = %d\n", net.minFanOut()); // Connection to tinsel machine HostLink hostLink; diff --git a/apps/POLite/sssp-sync/Run.cpp b/apps/POLite/sssp-sync/Run.cpp index b9c174a3..b78ccec4 100644 --- a/apps/POLite/sssp-sync/Run.cpp +++ b/apps/POLite/sssp-sync/Run.cpp @@ -20,9 +20,9 @@ int main(int argc, char**argv) EdgeList net; net.read(argv[1]); - // Print max fan-out + // Print fan-out printf("Max fan-out = %d\n", net.maxFanOut()); - assert(net.minFanOut() > 0); + printf("Min fan-out = %d\n", net.minFanOut()); // Connection to tinsel machine HostLink hostLink; diff --git a/include/EdgeList.h b/include/EdgeList.h index a4bca0fe..ebd5d37f 100644 --- a/include/EdgeList.h +++ b/include/EdgeList.h @@ -19,7 +19,7 @@ struct EdgeList { uint32_t** neighbours; // Read network from file - void read(const char* filename, bool warn = true) + void read(const char* filename) { std::fstream file(filename, std::ios_base::in); std::vector vec; @@ -57,11 +57,6 @@ struct EdgeList { // Release free(count); file.close(); - - if (warn && minFanOut() == 0) { - printf("Warning: some vertices have no outgoing edges and\n"); - printf(" some POLite apps do not handle this case.\n"); - } } // Determine 
max fan-out From 0e52f73e0cef7948b87b6320661f4ce75e1fd19f Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Thu, 14 May 2020 10:43:42 +0000 Subject: [PATCH 63/78] Tweak --- include/POLite/PGraph.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/POLite/PGraph.h b/include/POLite/PGraph.h index 57b3172e..126eaa97 100644 --- a/include/POLite/PGraph.h +++ b/include/POLite/PGraph.h @@ -178,6 +178,10 @@ template = POLITE_NUM_PINS) { + printf("addEdge: pin exceeds POLITE_NUM_PINS\n"); + exit(EXIT_FAILURE); + } graph.addEdge(x, pin, y); edgeLabels.elems[x]->append(edge); } From 8d8751bfa03db45c6daab889aa9800b958ca35a9 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Thu, 14 May 2020 17:37:43 +0100 Subject: [PATCH 64/78] More updates to the docs --- README.md | 317 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 262 insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index 502a2e29..465243ca 100644 --- a/README.md +++ b/README.md @@ -1,41 +1,3 @@ -TODO, document the following: - -```c++ -// Tinsel API -// ========== - -// Send message at addr using given routing key -inline void tinselKeySend(uint32_t key, volatile void* addr); - -// HostLink API -// ============ - -// Send a message using routing key (blocking by default) -bool keySend(uint32_t key, uint32_t numFlits, void* msg, bool block = true); - -// Try to send using routing key (non-blocking, returns true on success) -bool keyTrySend(uint32_t key, uint32_t numFlits, void* msg); -``` - -New section on programmable routers: - * Routing record format, byte ordering etc. 
- * Semantics of records - * Restrictions on IND records - * Avoiding deadlock: programmer has some added resposibility here - -New performance counters accessible from core zero on each board only: - * `ProgRouterSent` and `ProgRouterSentInterBoard` - -Document the following new perf counters: - -```c++ -// Performance counter: number of messages emitted by ProgRouter -INLINE uint32_t tinselProgRouterSent(); - -// Performance counter: number of inter-board messages emitted by ProgRouter -INLINE uint32_t tinselProgRouterSentInterBoard(); -``` - # Tinsel 0.8 Tinsel is a [RISC-V](https://riscv.org/)-based manythread @@ -696,7 +658,203 @@ purposes, resulting in very little overhead on the wire. ## 7. Tinsel Router -TODO +The Tinsel overlay provides a programmable router on each FPGA board +to support *global* multicasting. Programmable routers automatically +propagate messages to any number of destination threads distributed +throughout the cluster, minimising inter-FPGA bandwidth usage for +distributed fanouts, and offloading the work from the cores. Further +background can be found in [PIP 24](doc/PIP-0024-global-multicast.md). + +To support programmable routers, the destination component of a +message is generalised so that it can be (1) a thread id; or (2) a +*routing key*. A message, sent by a thread, containing a routing +key as a destination will go to a per-board router on the same +FPGA. The router will use the key as an index into a DRAM-based +routing table and automatically propagate the message towards all the +destinations associated with that key. + +A **routing key** is a 32-bit value consisting of a board-local *ram +id*, a *pointer*, and a *size*: + +```sv +// 32-bit routing key (MSB to LSB) +typedef struct { + // Which off-chip RAM on this board? 
+ Bit#(`LogDRAMsPerBoard) ram; + // Pointer to array of routing beats containing routing records + Bit#(`LogBeatsPerDRAM) ptr; + // Number of beats in the array + Bit#(`LogRoutingEntryLen) numBeats; +} RoutingKey; +``` + +To send a message to a routing key, a new Tinsel API call is provided: + +```c +// Send message at addr using given routing key +inline void tinselKeySend(uint32_t key, volatile void* addr); +``` + +When a message reaches the per-board router, the `ptr` field of the +routing key is used as an index into DRAM, where a sequence of 256-bit +**routing beats** are found. The `numBeats` field of the routing key +indicates how many contiguous routing beats there are. The value of +`numBeats` may be zero, in which case there are no destinations +associated with the key. + +A routing beat consists of a *size* and a sequence of five 48-bit +*routing chunks*: + +```sv +// 256-bit routing beat (aligned, MSB to LSB) +typedef struct { + // Number of routing records present in this beat + Bit#(16) size; + // Five 48-bit record chunks + Vector#(5, Bit#(48)) chunks; +} RoutingBeat; +``` + +The *size* must lie in the range 1 to 5 inclusive (0 is disallowed). +A **routing record** consists of one or two routing chunks, depending +on the **record type**. + +All byte orderings are little endian. For example, the order of bytes +in a routing beat is as follows. + +Byte | Contents +---- | -------- +31: | Upper byte of size (i.e. number of records in beat) +30: | Lower byte of size +29: | Upper byte of first chunk +... | ... +24: | Lower byte of first chunk +23: | Upper byte of second chunk +... | ... +18: | Lower byte of second chunk +17: | Upper byte of third chunk +... | ... +12: | Lower byte of third chunk +11: | Upper byte of fourth chunk +... | ... + 6: | Lower byte of fourth chunk + 5: | Upper byte of fifth chunk +... | ... + 0: | Lower byte of fifth chunk + +Clearly, both routing keys and routing beats have a maximum size. 
+However, in principle there is no limit to the number of records +associated with a key, due to the possibility of *indirection records* +(see below). + +There are five types of routing record, defined below. + +**48-bit Unicast Router-to-Mailbox (URM1):** + +```sv +typedef struct { + // Record type (URM1 == 0) + Bit#(3) tag; + // Mailbox destination + Bit#(4) mbox; + // Mailbox-local thread identifier + Bit#(6) thread; + // Unused + Bit#(3) unused; + // Local key. The first word of the message + // payload is overwritten with this. + Bit#(32) localKey; +} URM1Record; +``` + +The `localKey` can be used for anything, but might encode the +destination thread-local device identifier, or edge identifier, or +both. The `mbox` field is currently 4 bits (two Y bits followed by +two X bits), but there are spare bits available to increase the size +of this field in future if necessary. + +**96-bit Unicast Router-to-Mailbox (URM2):** + +```sv +typedef struct { + // Record type (URM2 == 1) + Bit#(3) tag; + // Mailbox destination + Bit#(4) mbox; + // Mailbox-local thread identifier + Bit#(6) thread; + // Currently unused + Bit#(19) unused; + // Local key. The first two words of the message + // payload is overwritten with this. + Bit#(64) localKey; +} URM2Record; +``` + +This is the same as a URM1 record except the local key is 64-bits in +size. + +**48-bit Router-to-Router (RR):** + +```sv +typedef struct { + // Record type (RR == 2) + Bit#(3) tag; + // Direction (N,S,E,W == 0,1,2,3) + Bit#(2) dir; + // Currently unused + Bit#(11) unused; + // New 32-bit routing key that will replace the one in the + // current message for the next hop of the message's journey + Bit#(32) newKey; +} RRRecord; +``` + +The `newKey` field will replace the key in the current message for the +next hop of the message's journey. Introducing a new key at each hop +simplifies the mapping process (keeping it quick). 
+ +**96-bit Multicast Router-to-Mailbox (MRM):** + +```sv +typedef struct { + // Record type (MRM == 3) + Bit#(3) tag; + // Mailbox destination + Bit#(4) mbox; + // Currently unused + Bit#(9) unused; + // Local key. The least-significant half-word + // of the message is replaced with this + Bit#(16) localKey; + // Mailbox-local destination mask + Bit#(64) destMask; +} MRMRecord; +``` + +**48-bit Indirection (IND):** + +```sv +// 48-bit Indirection (IND) record +// Note the restrictions on IND records: +// 1. At most one IND record per key lookup +// 2. A max-sized key lookup must contain an IND record +typedef struct { + // Record type (IND == 4) + Bit#(3) tag; + // Currently unused + Bit#(13) unused; + // New 32-bit routing key for new set of records on current router + Bit#(32) newKey; +} INDRecord; +``` + +Indirection records can be used to handle large fanouts, which exceed +the number of bits available in the size portion of the routing key. + +Finally, it is worth noting that when using programmable routers, +there is an added responsibility for the programmer to use a +deadlock-free routing scheme, such as dimension-ordered routing. ## 8. Tinsel HostLink @@ -781,6 +939,12 @@ bool HostLink::canRecv(); // Receive a message (blocking), given size of message in bytes // Any bytes beyond numBytes up to the next message boundary will be ignored void HostLink::recvMsg(void* msg, uint32_t numBytes); + +// Send a message using routing key (blocking) +bool HostLink::keySend(uint32_t key, uint32_t numFlits, void* msg); + +// Try to send using routing key (non-blocking, returns true on success) +bool HostLink::keyTrySend(uint32_t key, uint32_t numFlits, void* msg); ``` The `send` method allows a message consisting of multiple flits to be @@ -1148,7 +1312,7 @@ vertex can send messages to the host via the `HostPin` or the `finish` handler, and the host can send messages to any vertex. **Softswitch**. 
Central to POLite is an event loop running on each -Tinsel thread, which we call **the softswitch** as it effectively +Tinsel thread, which we call the softswitch as it effectively context-switches between vertices mapped to the same thread. The softswitch has four main responsibilities: (1) to maintain a queue of vertices wanting to send; (2) to implement multicast sends over a pin @@ -1157,14 +1321,34 @@ messages efficiently between vertices running on the same thread and on different threads; and (4) to invoke the vertex handlers when required, to meet the semantics of the POLite library. +**POLite static parameters**. The following macros can be defined, +before the first instance of `#include <POLite.h>`, to control some +aspects of POLite behaviour. + + Macro | Meaning + --------- | ------- + `POLITE_NUM_PINS` | Max number of pins per vertex (default 1) + `POLITE_DUMP_STATS` | Dump stats upon completion + `POLITE_COUNT_MSGS` | Include message counts in stats dump + `POLITE_FAST_MAP` | Use fast mapper (at the expense of application performance) + +**POLite dynamic parameters**. The following environment variables can +be set, to control some aspects of POLite behaviour. + + Environment variable | Meaning + -------------------- | ------- + `HOSTLINK_BOXES_X` | Size of box mesh to use in X dimension + `HOSTLINK_BOXES_Y` | Size of box mesh to use in Y dimension + `POLITE_BOARDS_X` | Size of board mesh to use in X dimension + `POLITE_BOARDS_Y` | Size of board mesh to use in Y dimension + `POLITE_CHATTY` | Set to `1` to enable emission of mapper stats + `POLITE_PLACER` | Use `metis`, `random`, or `direct` placement **Limitations**. POLite provides several important features of the vertex-centric paradigm, but there are some limitations. One of the features of the Pregel framework is the ability for vertices to add and remove vertices and edges at runtime -- but currently, POLite only -supports static graphs.
And for large *non-localised* fan-outs, a -hierarchical hardware or software multicast feature may be desirable -(where messages get forked at intermediate stages along the way to the -destinations). +supports static graphs. ## A. DE5-Net Synthesis Report @@ -1181,9 +1365,10 @@ The default Tinsel configuration on a single DE5-Net board contains: * four QDRII+ SRAM controllers * four 10Gbps reliable links * one termination/idle detector + * one 8x8 programmable router * a JTAG UART -The clock frequency is 225MHz and the resource utilisation is 74% of +The clock frequency is 215MHz and the resource utilisation is 84% of the DE5-Net. ## B. Tinsel Parameters @@ -1215,7 +1400,7 @@ the DE5-Net. `EnablePerfCount` | True | Enable performance counters `ClockFreq` | 215 | Clock frequency in MHz -Further parameters can be found in [config.py](config.py). +A full list of parameters can be found in [config.py](config.py). ## C. Tinsel Memory Map @@ -1274,15 +1459,20 @@ separate memory regions (which they are not). 
Optional performance-counter CSRs (when `EnablePerfCount` is `True`): - Name | CSR | R/W | Function - ---------------- | ------ | --- | -------- - `PerfCount` | 0xc07 | W | Reset(0)/Start(1)/Stop(2) all counters - `MissCount` | 0xc08 | R | Cache miss count - `HitCount` | 0xc09 | R | Cache hit count - `WritebackCount` | 0xc0a | R | Cache writeback count - `CPUIdleCount` | 0xc0b | R | CPU idle-cycle count (lower 32 bits) - `CPUIdleCountU` | 0xc0c | R | CPU idle-cycle count (upper 8 bits) - `CycleU` | 0xc0d | R | Cycle counter (upper 8 bits) + Name | CSR | R/W | Function + ---------------- | ------ | --- | -------- + `PerfCount` | 0xc07 | W | Reset(0)/Start(1)/Stop(2) all counters + `MissCount` | 0xc08 | R | Cache miss count + `HitCount` | 0xc09 | R | Cache hit count + `WritebackCount` | 0xc0a | R | Cache writeback count + `CPUIdleCount` | 0xc0b | R | CPU idle-cycle count (lower 32 bits) + `CPUIdleCountU` | 0xc0c | R | CPU idle-cycle count (upper 8 bits) + `CycleU` | 0xc0d | R | Cycle counter (upper 8 bits) + `ProgRouterSent` | 0xc0e | R | Total msgs sent by ProgRouter + `ProgRouterSentInter` | 0xc0f | R | Inter-board msgs sent by ProgRouter + +Note that `ProgRouterSent` and `ProgRouterSentInter` are only valid +from thread zero on each board. Tinsel also supports the following custom instructions. 
@@ -1350,6 +1540,9 @@ inline void tinselMulticast( // (Address must be aligned on message boundary) inline void tinselSend(uint32_t dest, volatile void* addr); +// Send message at address using given routing key +inline void tinselKeySend(uint32_t key, volatile void* addr); + // Determine if calling thread can receive a message inline uint32_t tinselCanRecv(); @@ -1429,6 +1622,14 @@ inline uint32_t tinselCPUIdleCountU(); // Read cycle counter (upper 8 bits) inline uint32_t tinselCycleCountU(); +// Performance counter: number of messages emitted by ProgRouter +// (Only valid from thread zero on each board) +inline uint32_t tinselProgRouterSent(); + +// Performance counter: number of inter-board messages emitted by ProgRouter +// (Only valid from thread zero on each board) +inline uint32_t tinselProgRouterSentInterBoard(); + // Address construction inline uint32_t tinselToAddr( uint32_t boardX, uint32_t boardY, @@ -1487,6 +1688,12 @@ class HostLink { // Any bytes beyond numBytes up to the next message boundary will be ignored void recvMsg(void* msg, uint32_t numBytes); + // Send a message using routing key (blocking by default) + bool keySend(uint32_t key, uint32_t numFlits, void* msg, bool block = true); + + // Try to send using routing key (non-blocking, returns true on success) + bool keyTrySend(uint32_t key, uint32_t numFlits, void* msg); + // Bulk send and receive // --------------------- From 4da990df5e9677decf756ab1ec88692602ab7026 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Fri, 15 May 2020 11:06:50 +0100 Subject: [PATCH 65/78] Tweaks --- README.md | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 465243ca..631a4a82 100644 --- a/README.md +++ b/README.md @@ -66,12 +66,11 @@ Released on 18 May 2020 and maintained in the ## 1. 
Overview On the [POETS Project](https://poets-project.org/about), we are -looking at ways to accelerate applications that can be expressed as -large numbers of small processes communicating by message-passing. -Our first attempt is based around a manythread RISC-V architecture -called Tinsel running on an FPGA cluster. Tinsel aims to support -irregular applications that have heavy memory and communication -demands, but fairly modest compute requrements. The main features are: +looking at ways to accelerate applications that are naturally +expressed as a large number of small processes communicating by +message-passing. Our first attempt is based around a manythread +RISC-V architecture called Tinsel, running on an FPGA cluster. The +main features are: * **Multithreading**. A critical aspect of the design is to tolerate latency as cleanly as possible. This includes the @@ -80,10 +79,6 @@ demands, but fairly modest compute requrements. The main features are: (keeping Fmax high); and sharing of resources between cores (such as caches, mailboxes, and FPUs). - * **Caches**. To keep the programming model simple, we have opted - to use thread-partitioned data caches to optimise access to - off-chip memory rather than DMA. - * **Message-passing**. Although there is a requirement to support a large amount of memory, it is not necessary to provide the illusion of a single shared memory space: message-passing is intended @@ -658,12 +653,12 @@ purposes, resulting in very little overhead on the wire. ## 7. Tinsel Router -The Tinsel overlay provides a programmable router on each FPGA board -to support *global* multicasting. Programmable routers automatically -propagate messages to any number of destination threads distributed -throughout the cluster, minimising inter-FPGA bandwidth usage for -distributed fanouts, and offloading the work from the cores. Further -background can be found in [PIP 24](doc/PIP-0024-global-multicast.md). 
+Tinsel provides a programmable router on each FPGA board to support +*global* multicasting. Programmable routers automatically propagate +messages to any number of destination threads distributed throughout +the cluster, minimising inter-FPGA bandwidth usage for distributed +fanouts, and offloading work from the cores. Further background can +be found in [PIP 24](doc/PIP-0024-global-multicast.md). To support programmable routers, the destination component of a message is generalised so that it can be (1) a thread id; or (2) a @@ -688,7 +683,8 @@ typedef struct { } RoutingKey; ``` -To send a message to a routing key, a new Tinsel API call is provided: +To send a message using a routing key as the destination, a new Tinsel +API call is provided: ```c // Send message at addr using given routing key From 1bc368f4a6e6c4c1113bb441c1b9315cdde44ac3 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Mon, 18 May 2020 20:32:00 +0100 Subject: [PATCH 66/78] Route in box Y dir before considering host bit Using multiple boxes in the Y direction only has stopped working. Looking back at the first commit in the 0.8 branch, it looks like I accidentally changed the behaviour of inter-board routing by considering the host bit before knowing that we're on the correct Y coordinate. Hopefully, this is the fix... --- rtl/ProgRouter.bsv | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rtl/ProgRouter.bsv b/rtl/ProgRouter.bsv index 9570ff09..6e531261 100644 --- a/rtl/ProgRouter.bsv +++ b/rtl/ProgRouter.bsv @@ -404,12 +404,12 @@ module mkFetcher#(BoardId boardId, Integer fetcherId) (Fetcher); // Make routing decision RoutingDecision decision = RouteNoC; MailboxNetAddr addr = flit.dest.addr; - if (addr.host.valid) + if (addr.board.y < boardId.y) decision = RouteSouth; + else if (addr.board.y > boardId.y) decision = RouteNorth; + else if (addr.host.valid) decision = addr.host.value == 0 ? 
RouteWest : RouteEast; else if (addr.board.x < boardId.x) decision = RouteWest; else if (addr.board.x > boardId.x) decision = RouteEast; - else if (addr.board.y < boardId.y) decision = RouteSouth; - else if (addr.board.y > boardId.y) decision = RouteNorth; // Insert into bypass queue flitBypassQueue.enq(RoutedFlit { decision: decision, flit: flit}); end From ff6deed7e52b881f1919d71aee1fc3be2dd0e134 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Wed, 27 May 2020 09:14:05 +0100 Subject: [PATCH 67/78] Fix fast-mapping of weights --- include/POLite/FastMap/PGraph.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/include/POLite/FastMap/PGraph.h b/include/POLite/FastMap/PGraph.h index 8ac0c84d..923c34d3 100644 --- a/include/POLite/FastMap/PGraph.h +++ b/include/POLite/FastMap/PGraph.h @@ -371,11 +371,9 @@ template numElems; - if (i < inTable[threadId]->numElems) { - PInEdge edge; - edge.edge = edges->elems[i]; - inTable[threadId]->append(edge); - } + PInEdge edge; + edge.edge = edges->elems[i]; + inTable[threadId]->append(edge); // Add output table entry PRoutingDest rdest; rdest.kind = PRDestKindURM1; From 8a19d07b5c8857b5a06fd08446be285d40f5ac6b Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Wed, 3 Jun 2020 10:39:04 +0100 Subject: [PATCH 68/78] POLite examples: display time conditionally If POLITE_DUMP_STATS is defined then measuring time from the host isn't very accurate because it will include the time to transfer a large number of stats over the slow UART. To help users become aware of this, we now don't display the time if POLITE_DUMP_STATS is enabled. 
--- apps/POLite/asp-gals/ASP.h | 4 ++-- apps/POLite/asp-gals/Run.cpp | 5 ++++- apps/POLite/asp-sync/Run.cpp | 2 ++ apps/POLite/asp-tiles-sync/Run.cpp | 4 ++-- apps/POLite/clocktree-async/Run.cpp | 2 ++ apps/POLite/hashmin-sync/Run.cpp | 2 ++ apps/POLite/heat-cube-sync/Run.cpp | 2 ++ apps/POLite/heat-gals/Run.cpp | 2 ++ apps/POLite/heat-sync/Run.cpp | 2 ++ apps/POLite/izhikevich-gals/Run.cpp | 2 ++ apps/POLite/izhikevich-sync/Run.cpp | 2 ++ apps/POLite/sssp-async/Run.cpp | 2 ++ apps/POLite/sssp-sync/Run.cpp | 2 ++ 13 files changed, 28 insertions(+), 5 deletions(-) diff --git a/apps/POLite/asp-gals/ASP.h b/apps/POLite/asp-gals/ASP.h index 42462622..f69dfa3d 100644 --- a/apps/POLite/asp-gals/ASP.h +++ b/apps/POLite/asp-gals/ASP.h @@ -9,8 +9,8 @@ #ifndef _ASP_H_ #define _ASP_H_ -//#define POLITE_DUMP_STATS -//#define POLITE_COUNT_MSGS +#define POLITE_DUMP_STATS +#define POLITE_COUNT_MSGS // Lightweight POETS frontend #include diff --git a/apps/POLite/asp-gals/Run.cpp b/apps/POLite/asp-gals/Run.cpp index d50821ce..4c00e1da 100644 --- a/apps/POLite/asp-gals/Run.cpp +++ b/apps/POLite/asp-gals/Run.cpp @@ -51,7 +51,8 @@ int main(int argc, char**argv) // Create random set of source nodes uint32_t numSources = NUM_SOURCES*32; uint32_t sources[numSources]; - randomSet(numSources, sources, graph.numDevices); + //randomSet(numSources, sources, graph.numDevices); + for (int i = 0; i < numSources; i++) sources[i] = i; // Initialise devices for (PDeviceId i = 0; i < graph.numDevices; i++) { @@ -102,7 +103,9 @@ int main(int argc, char**argv) // Display time timersub(&finish, &start, &diff); double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + #ifndef POLITE_DUMP_STATS printf("Time = %lf\n", duration); + #endif return 0; } diff --git a/apps/POLite/asp-sync/Run.cpp b/apps/POLite/asp-sync/Run.cpp index 287f32db..518a33b5 100644 --- a/apps/POLite/asp-sync/Run.cpp +++ b/apps/POLite/asp-sync/Run.cpp @@ -99,7 +99,9 @@ int main(int argc, char**argv) // Display time 
timersub(&finish, &start, &diff); double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + #ifndef POLITE_DUMP_STATS printf("Time = %lf\n", duration); + #endif return 0; } diff --git a/apps/POLite/asp-tiles-sync/Run.cpp b/apps/POLite/asp-tiles-sync/Run.cpp index 049d83a8..cdc2bb14 100644 --- a/apps/POLite/asp-tiles-sync/Run.cpp +++ b/apps/POLite/asp-tiles-sync/Run.cpp @@ -135,11 +135,11 @@ int main(int argc, char**argv) double duration; timersub(&finishCompute, &startCompute, &diff); duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; - printf("Time (compute) = %lf\n", duration); + printf("Time (compute, including stats transfer over UART) = %lf\n", duration); gettimeofday(&finishAll, NULL); timersub(&finishAll, &startAll, &diff); duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; - printf("Time (all) = %lf\n", duration); + printf("Time (all, including stats transfer over UART) = %lf\n", duration); return 0; } diff --git a/apps/POLite/clocktree-async/Run.cpp b/apps/POLite/clocktree-async/Run.cpp index 270c9b48..02f76723 100644 --- a/apps/POLite/clocktree-async/Run.cpp +++ b/apps/POLite/clocktree-async/Run.cpp @@ -93,7 +93,9 @@ int main(int argc, char** argv) // Display time timersub(&finish, &start, &diff); double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + #ifndef POLITE_DUMP_STATS printf("Time = %lf\n", duration); + #endif return 0; } diff --git a/apps/POLite/hashmin-sync/Run.cpp b/apps/POLite/hashmin-sync/Run.cpp index cb6a7ced..eab92eff 100644 --- a/apps/POLite/hashmin-sync/Run.cpp +++ b/apps/POLite/hashmin-sync/Run.cpp @@ -82,7 +82,9 @@ int main(int argc, char**argv) // Display time timersub(&finish, &start, &diff); double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + #ifndef POLITE_DUMP_STATS printf("Time = %lf\n", duration); + #endif return 0; } diff --git a/apps/POLite/heat-cube-sync/Run.cpp b/apps/POLite/heat-cube-sync/Run.cpp index 
aaa42c39..1163f01b 100644 --- a/apps/POLite/heat-cube-sync/Run.cpp +++ b/apps/POLite/heat-cube-sync/Run.cpp @@ -76,7 +76,9 @@ int main() // Display time timersub(&finish, &start, &diff); double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + #ifndef POLITE_DUMP_STATS printf("Time = %lf\n", duration); + #endif return 0; } diff --git a/apps/POLite/heat-gals/Run.cpp b/apps/POLite/heat-gals/Run.cpp index eacf449f..44c2f921 100644 --- a/apps/POLite/heat-gals/Run.cpp +++ b/apps/POLite/heat-gals/Run.cpp @@ -98,7 +98,9 @@ int main(int argc, char **argv) // Display time timersub(&finish, &start, &diff); double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + #ifndef POLITE_DUMP_STATS printf("Time = %lf\n", duration); + #endif return 0; } diff --git a/apps/POLite/heat-sync/Run.cpp b/apps/POLite/heat-sync/Run.cpp index 91652712..ed978e39 100644 --- a/apps/POLite/heat-sync/Run.cpp +++ b/apps/POLite/heat-sync/Run.cpp @@ -97,7 +97,9 @@ int main(int argc, char **argv) // Display time timersub(&finish, &start, &diff); double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + #ifndef POLITE_DUMP_STATS printf("Time = %lf\n", duration); + #endif return 0; } diff --git a/apps/POLite/izhikevich-gals/Run.cpp b/apps/POLite/izhikevich-gals/Run.cpp index 43fb3d4d..e542881f 100644 --- a/apps/POLite/izhikevich-gals/Run.cpp +++ b/apps/POLite/izhikevich-gals/Run.cpp @@ -124,7 +124,9 @@ int main(int argc, char**argv) // Display time timersub(&finish, &start, &diff); double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + #ifndef POLITE_DUMP_STATS printf("Time = %lf\n", duration); + #endif return 0; } diff --git a/apps/POLite/izhikevich-sync/Run.cpp b/apps/POLite/izhikevich-sync/Run.cpp index 09efb701..0693b8c3 100644 --- a/apps/POLite/izhikevich-sync/Run.cpp +++ b/apps/POLite/izhikevich-sync/Run.cpp @@ -112,7 +112,9 @@ int main(int argc, char**argv) // Display time timersub(&finish, &start, &diff); double 
duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + #ifndef POLITE_DUMP_STATS printf("Time = %lf\n", duration); + #endif return 0; } diff --git a/apps/POLite/sssp-async/Run.cpp b/apps/POLite/sssp-async/Run.cpp index b78ccec4..37ffcb4e 100644 --- a/apps/POLite/sssp-async/Run.cpp +++ b/apps/POLite/sssp-async/Run.cpp @@ -87,7 +87,9 @@ int main(int argc, char**argv) // Display time timersub(&finish, &start, &diff); double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + #ifndef POLITE_DUMP_STATS printf("Time = %lf\n", duration); + #endif return 0; } diff --git a/apps/POLite/sssp-sync/Run.cpp b/apps/POLite/sssp-sync/Run.cpp index b78ccec4..37ffcb4e 100644 --- a/apps/POLite/sssp-sync/Run.cpp +++ b/apps/POLite/sssp-sync/Run.cpp @@ -87,7 +87,9 @@ int main(int argc, char**argv) // Display time timersub(&finish, &start, &diff); double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + #ifndef POLITE_DUMP_STATS printf("Time = %lf\n", duration); + #endif return 0; } From d70c212309952f774b2782c010be32b8cacd0590 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Wed, 3 Jun 2020 15:16:06 +0100 Subject: [PATCH 69/78] Support three mapper variants in POLite --- README.md | 19 +- include/POLite.h | 33 +- include/POLite/{FastMap => Dist}/PDevice.h | 0 include/POLite/{FastMap => Dist}/PGraph.h | 0 include/POLite/{ => Hybrid}/PDevice.h | 0 include/POLite/{ => Hybrid}/PGraph.h | 0 include/POLite/Local/PDevice.h | 370 +++++++++ include/POLite/Local/PGraph.h | 866 +++++++++++++++++++++ 8 files changed, 1273 insertions(+), 15 deletions(-) rename include/POLite/{FastMap => Dist}/PDevice.h (100%) rename include/POLite/{FastMap => Dist}/PGraph.h (100%) rename include/POLite/{ => Hybrid}/PDevice.h (100%) rename include/POLite/{ => Hybrid}/PGraph.h (100%) create mode 100644 include/POLite/Local/PDevice.h create mode 100644 include/POLite/Local/PGraph.h diff --git a/README.md b/README.md index 631a4a82..e4fdf72c 100644 --- 
a/README.md +++ b/README.md @@ -1326,7 +1326,15 @@ aspects of POLite behaviour. `POLITE_NUM_PINS` | Max number of pins per vertex (default 1) `POLITE_DUMP_STATS` | Dump stats upon completion `POLITE_COUNT_MSGS` | Include message counts in stats dump - `POLITE_FAST_MAP` | Use fast mapper (at the expense of application performance) + +POLite supports three mapping modes, also controlled via macros: + + + Macro | Use when graphs have... + --------- | ----------------------- + `POLITE_MAP_LOCAL` | ...lots of local connections and few distributed connections + `POLITE_MAP_DIST` | ...lots of distributed connections and few local connections (this mapper is fast) + `POLITE_MAP_HYBRID` | ...a mix of local and distributed connections (default) **POLite dynamic parameters**. The following environment variables can be set, to control some aspects of POLite behaviour. @@ -1341,10 +1349,11 @@ be set, to control some aspects of POLite behaviour. `POLITE_PLACER` | Use `metis`, `random`, or `direct` placement **Limitations**. POLite provides several important features of the -vertex-centric paradigm, but there are some limitations. One of the -features of the Pregel framework is the ability for vertices to add -and remove vertices and edges at runtime -- but currently, POLite only -supports static graphs. +vertex-centric paradigm, but there are lots of limitations and quirks; +it is only intended as a prototype library for hardware evaluation +purposes. One of the features of the Pregel framework is the ability +for vertices to add and remove vertices and edges at runtime -- but +currently, POLite only supports static graphs. ## A. 
DE5-Net Synthesis Report diff --git a/include/POLite.h b/include/POLite.h index 858b865e..d1d5fbc6 100644 --- a/include/POLite.h +++ b/include/POLite.h @@ -4,20 +4,33 @@ #include +// Select default mapper +#if !defined(POLITE_MAP_LOCAL) || \ + !defined(POLITE_MAP_DIST) || \ + !defined(POLITE_MAP_HYBRID) + // Default mapper + #define POLITE_MAP_HYBRID +#endif + #ifdef TINSEL #include - #ifdef POLITE_FAST_MAP - #include - #else - #include + #if defined(POLITE_MAP_LOCAL) + #include + #elif defined(POLITE_MAP_DIST) + #include + #elif defined(POLITE_MAP_HYBRID) + #include #endif #else - #ifdef POLITE_FAST_MAP - #include - #include - #else - #include - #include + #if defined(POLITE_FAST_LOCAL) + #include + #include + #elif defined(POLITE_MAP_DIST) + #include + #include + #elif defined (POLITE_MAP_HYBRID) + #include + #include #endif #include #include diff --git a/include/POLite/FastMap/PDevice.h b/include/POLite/Dist/PDevice.h similarity index 100% rename from include/POLite/FastMap/PDevice.h rename to include/POLite/Dist/PDevice.h diff --git a/include/POLite/FastMap/PGraph.h b/include/POLite/Dist/PGraph.h similarity index 100% rename from include/POLite/FastMap/PGraph.h rename to include/POLite/Dist/PGraph.h diff --git a/include/POLite/PDevice.h b/include/POLite/Hybrid/PDevice.h similarity index 100% rename from include/POLite/PDevice.h rename to include/POLite/Hybrid/PDevice.h diff --git a/include/POLite/PGraph.h b/include/POLite/Hybrid/PGraph.h similarity index 100% rename from include/POLite/PGraph.h rename to include/POLite/Hybrid/PGraph.h diff --git a/include/POLite/Local/PDevice.h b/include/POLite/Local/PDevice.h new file mode 100644 index 00000000..ca806a58 --- /dev/null +++ b/include/POLite/Local/PDevice.h @@ -0,0 +1,370 @@ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef _PDEVICE_H_ +#define _PDEVICE_H_ + +#include +#include +#include + +#ifdef TINSEL + #include + #define PTR(t) t* +#else + #include + #define PTR(t) uint32_t +#endif + +// Use this to align on 
// Use this to align on half-cache-line boundary
#define ALIGNED __attribute__((aligned(1<<(TinselLogBytesPerLine-1))))

// This is a static limit on the number of pins per device
#ifndef POLITE_NUM_PINS
#define POLITE_NUM_PINS 1
#endif

// Macros for performance stats
//   POLITE_DUMP_STATS - dump performance stats on termination
//   POLITE_COUNT_MSGS - include message counts of performance stats

// Thread-local device id
typedef uint16_t PLocalDeviceId;
#define InvalidLocalDevId 0xffff
#define UnusedLocalDevId 0xfffe

// Thread id
typedef uint32_t PThreadId;

// Device address
// Bits 17->0: thread id
// Bit 18: invalid address
// Bits 31->19: thread-local device id
typedef uint32_t PDeviceAddr;

// Device address constructors

// The address with only the "invalid" bit (bit 18) set
inline PDeviceAddr invalidDeviceAddr() { return 0x40000; }

// Pack thread id t (low bits) and thread-local device id d (bits 31->19)
inline PDeviceAddr makeDeviceAddr(PThreadId t, PLocalDeviceId d) {
  // Widen before shifting: a uint16_t operand is promoted to signed int,
  // so shifting an id >= 4096 by 19 would move a set bit into the sign bit
  return (static_cast<PDeviceAddr>(d) << 19) | t;
}

// Device address deconstructors

// True when the "invalid" bit (bit 18) is clear
inline bool isValidDeviceAddr(PDeviceAddr addr) { return !(addr & 0x40000); }

// Extract the thread id (bits 17->0)
inline PThreadId getThreadId(PDeviceAddr addr) { return addr & 0x3ffff; }

// Extract the thread-local device id (bits 31->19)
inline PLocalDeviceId getLocalDeviceId(PDeviceAddr addr) { return addr >> 19; }
+inline uint32_t maxLocalDeviceId() { return 8192; } + +// Routing key +typedef uint16_t Key; +#define InvalidKey 0xffff + +// Pins +// No - means 'not ready to send' +// HostPin - means 'send to host' +// Pin(n) - means 'send to application pin number n' +typedef uint8_t PPin; +#define No 0 +#define HostPin 1 +#define Pin(n) ((n)+2) + +// For template arguments that are not used +struct None {}; + +// Generic device structure +// Type parameters: +// S - State +// E - Edge label +// M - Message structure +template struct PDevice { + // State + S* s; + PPin* readyToSend; + uint32_t numVertices; + uint16_t time; + + // Handlers + void init(); + void send(volatile M* msg); + void recv(M* msg, E* edge); + bool step(); + bool finish(volatile M* msg); +}; + +// Generic device state structure +template struct ALIGNED PState { + // Pointer to base of neighbours arrays + uint16_t pinBase[POLITE_NUM_PINS]; + // Ready-to-send status + PPin readyToSend; + // Custom state + S state; +}; + +// Message structure +template struct PMessage { + // Source-based routing key + Key key; + // Application message + M payload; +}; + +// An outgoing edge from a device +struct POutEdge { + // Destination mailbox + uint16_t mbox; + // Routing key + uint16_t key; + // Destination threads + uint32_t threadMaskLow; + uint32_t threadMaskHigh; +}; + +// An incoming edge to a device (labelleled) +template struct PInEdge { + // Destination device + PLocalDeviceId devId; + // Edge info + E edge; +}; + +// An incoming edge to a device (unlabelleled) +template <> struct PInEdge { + union { + // Destination device + PLocalDeviceId devId; + // Unused + None edge; + }; +}; + +// Helper function: Count board hops between two threads +inline uint32_t hopsBetween(uint32_t t0, uint32_t t1) { + uint32_t xmask = ((1<> (TinselLogThreadsPerBoard + TinselMeshXBits); + int32_t x0 = (t0 >> TinselLogThreadsPerBoard) & xmask; + int32_t y1 = t1 >> (TinselLogThreadsPerBoard + TinselMeshXBits); + int32_t x1 = (t1 >> 
TinselLogThreadsPerBoard) & xmask; + return (abs(x0-x1) + abs(y0-y1)); +} + +// Generic thread structure +template struct PThread { + + // Number of devices handled by thread + PLocalDeviceId numDevices; + // Number of times step handler has been called + uint16_t time; + // Number of devices in graph + uint32_t numVertices; + // Pointer to array of device states + PTR(PState) devices; + // Pointer to base of routing tables + PTR(POutEdge) outTableBase; + PTR(PInEdge) inTableBase; + // Array of local device ids are ready to send + PTR(PLocalDeviceId) senders; + // This array is accessed in a LIFO manner + PTR(PLocalDeviceId) sendersTop; + + // Count number of messages sent + #ifdef POLITE_COUNT_MSGS + // Total message received + uint32_t msgsReceived; + // Number of times we wanted to send but couldn't + uint32_t blockedSends; + // Total messages sent between threads + uint32_t interThreadSendCount; + // Messages sent between threads on different boards + uint32_t interBoardSendCount; + #endif + + #ifdef TINSEL + + // Helper function to construct a device + INLINE DeviceType getDevice(uint32_t id) { + DeviceType dev; + dev.s = &devices[id].state; + dev.readyToSend = &devices[id].readyToSend; + dev.numVertices = numVertices; + dev.time = time; + return dev; + } + + // Dump performance counter stats over UART + void dumpStats() { + tinselPerfCountStop(); + uint32_t me = tinselId(); + // Per-cache performance counters + uint32_t cacheMask = (1 << + (TinselLogThreadsPerCore + TinselLogCoresPerDCache)) - 1; + if ((me & cacheMask) == 0) { + printf("H:%x,M:%x,W:%x\n", + tinselHitCount(), + tinselMissCount(), + tinselWritebackCount()); + } + // Per-core performance counters + uint32_t coreMask = (1 << (TinselLogThreadsPerCore)) - 1; + if ((me & coreMask) == 0) { + printf("C:%x %x,I:%x %x\n", + tinselCycleCountU(), tinselCycleCount(), + tinselCPUIdleCountU(), tinselCPUIdleCount()); + } + // Per-thread performance counters + #ifdef POLITE_COUNT_MSGS + 
printf("MS:%x,MR:%x,PR:%x,PRI:%x,BL:%x\n", + interThreadSendCount, msgsReceived, 0, + interBoardSendCount, blockedSends); + #endif + } + + // Invoke device handlers + void run() { + // Current out-going edge in multicast + POutEdge* outEdge; + + // Outgoing edge to host + POutEdge outHost[2]; + outHost[0].mbox = tinselHostId() >> TinselLogThreadsPerMailbox; + outHost[0].key = 0; + outHost[1].key = InvalidKey; + // Initialise outEdge to null terminator + outEdge = &outHost[1]; + + // Did last call to step handler request a new time step? + bool active = true; + + // Reset performance counters + tinselPerfCountReset(); + + // Initialisation + sendersTop = senders; + for (uint32_t i = 0; i < numDevices; i++) { + DeviceType dev = getDevice(i); + // Invoke the initialiser for each device + dev.init(); + // Device ready to send? + if (*dev.readyToSend != No) { + *(sendersTop++) = i; + } + } + + // Set number of flits per message + tinselSetLen((sizeof(PMessage)-1) >> TinselLogBytesPerFlit); + + // Event loop + while (1) { + // Step 1: try to send + if (outEdge->key != InvalidKey) { + if (tinselCanSend()) { + PMessage* m = (PMessage*) tinselSendSlot(); + // Send message + m->key = outEdge->key; + tinselMulticast(outEdge->mbox, outEdge->threadMaskHigh, + outEdge->threadMaskLow, m); + #ifdef POLITE_COUNT_MSGS + interThreadSendCount++; + interBoardSendCount += + hopsBetween(outEdge->mbox << TinselLogThreadsPerMailbox, + tinselId()); + #endif + // Move to next neighbour + outEdge++; + } + else { + blockedSends++; + tinselWaitUntil(TINSEL_CAN_SEND|TINSEL_CAN_RECV); + } + } + else if (sendersTop != senders) { + if (tinselCanSend()) { + // Start new multicast + PLocalDeviceId src = *(--sendersTop); + // Lookup device + DeviceType dev = getDevice(src); + PPin pin = *dev.readyToSend; + // Invoke send handler + PMessage* m = (PMessage*) tinselSendSlot(); + dev.send(&m->payload); + // Reinsert sender, if it still wants to send + if (*dev.readyToSend != No) sendersTop++; + // 
Determine out-edge array for sender + if (pin == HostPin) + outEdge = outHost; + else + outEdge = (POutEdge*) &outTableBase[ + devices[src].pinBase[pin-2] + ]; + } + else { + blockedSends++; + tinselWaitUntil(TINSEL_CAN_SEND|TINSEL_CAN_RECV); + } + } + else { + // Idle detection + int idle = tinselIdle(!active); + if (idle > 1) + break; + else if (idle) { + active = false; + for (uint32_t i = 0; i < numDevices; i++) { + DeviceType dev = getDevice(i); + // Invoke the step handler for each device + active = dev.step() || active; + // Device ready to send? + if (*dev.readyToSend != No) { + *(sendersTop++) = i; + } + } + time++; + } + } + + // Step 2: try to receive + while (tinselCanRecv()) { + PMessage* inMsg = (PMessage*) tinselRecv(); + PInEdge* inEdge = &inTableBase[inMsg->key]; + while (inEdge->devId != InvalidLocalDevId) { + // Lookup destination device + PLocalDeviceId id = inEdge->devId; + DeviceType dev = getDevice(id); + // Was it ready to send? + PPin oldReadyToSend = *dev.readyToSend; + // Invoke receive handler + dev.recv(&inMsg->payload, &inEdge->edge); + // Insert device into a senders array, if not already there + if (*dev.readyToSend != No && oldReadyToSend == No) + *(sendersTop++) = id; + inEdge++; + #ifdef POLITE_COUNT_MSGS + msgsReceived++; + #endif + } + tinselFree(inMsg); + } + } + + // Termination + #ifdef POLITE_DUMP_STATS + dumpStats(); + #endif + + // Invoke finish handler for each device + for (uint32_t i = 0; i < numDevices; i++) { + DeviceType dev = getDevice(i); + tinselWaitUntil(TINSEL_CAN_SEND); + PMessage* m = (PMessage*) tinselSendSlot(); + if (dev.finish(&m->payload)) tinselSend(tinselHostId(), m); + } + + // Sleep + tinselWaitUntil(TINSEL_CAN_RECV); while (1); + } + + #endif + +}; + +#endif diff --git a/include/POLite/Local/PGraph.h b/include/POLite/Local/PGraph.h new file mode 100644 index 00000000..4181c3da --- /dev/null +++ b/include/POLite/Local/PGraph.h @@ -0,0 +1,866 @@ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef 
_PGRAPH_H_ +#define _PGRAPH_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "Seq.h" + +// Nodes of a POETS graph are devices +typedef NodeId PDeviceId; + +// This structure holds a group of receiving edges on a thread. +// All of the edges originate from the same output pin. +template struct PReceiverGroup { + // Thread id where all the receivers reside + uint32_t threadId; + // A sequence of receiving devices on that thread + Seq>* receivers; +}; + +// POETS graph +template class PGraph { + private: + // Align address to 2^n byte boundary + inline uint32_t align(uint32_t n, uint32_t addr) { + if ((addr & (1<> n) + 1) << n; + } + + // Align address to 32-bit word boundary + uint32_t wordAlign(uint32_t addr) { return align(2, addr); } + + // Align address to cache-line boundary + uint32_t cacheAlign(uint32_t addr) { + return align(TinselLogBytesPerLine, addr); + } + + // Helper function + inline uint32_t min(uint32_t x, uint32_t y) { return x < y ? x : y; } + + // Number of FPGA boards available + uint32_t meshLenX; + uint32_t meshLenY; + + // Number of FPGA boards to use + uint32_t numBoardsX; + uint32_t numBoardsY; + + // Multicast routing tables: + // Sequence of outgoing edges for every (device, pin) pair + Seq*** outTable; + // Sequence of incoming edges for every thread + Seq>** inTable; + + // Generic constructor + void constructor(uint32_t lenX, uint32_t lenY) { + meshLenX = lenX; + meshLenY = lenY; + char* str = getenv("POLITE_BOARDS_X"); + int nx = str ? atoi(str) : meshLenX; + str = getenv("POLITE_BOARDS_Y"); + int ny = str ? 
atoi(str) : meshLenY; + setNumBoards(nx, ny); + numDevices = 0; + devices = NULL; + toDeviceAddr = NULL; + numDevicesOnThread = NULL; + fromDeviceAddr = NULL; + vertexMem = NULL; + vertexMemSize = NULL; + vertexMemBase = NULL; + inEdgeMem = NULL; + inEdgeMemSize = NULL; + inEdgeMemBase = NULL; + outEdgeMem = NULL; + outEdgeMemSize = NULL; + outEdgeMemBase = NULL; + mapVerticesToDRAM = false; + mapInEdgesToDRAM = true; + mapOutEdgesToDRAM = true; + outTable = NULL; + inTable = NULL; + chatty = 0; + } + + public: + // Number of devices + uint32_t numDevices; + + // Graph containing device ids and connections + Graph graph; + + // Edge labels: has same structure as graph.outgoing + Seq*> edgeLabels; + + // Mapping from device id to device state + // (Not valid until the mapper is called) + PState** devices; + + // Mapping from thread id to number of devices on that thread + // (Not valid until the mapper is called) + uint32_t* numDevicesOnThread; + + // Mapping from device id to device address and back + // (Not valid until the mapper is called) + PDeviceAddr* toDeviceAddr; // Device id -> device address + PDeviceId** fromDeviceAddr; // Device address -> device id + + // Each thread's vertex mem and thread mem regions + // (Not valid until the mapper is called) + uint8_t** vertexMem; uint8_t** threadMem; + uint32_t* vertexMemSize; uint32_t* threadMemSize; + uint32_t* vertexMemBase; uint32_t* threadMemBase; + + // Each thread's in-edge and out-edge regions + // (Not valid until the mapper is called) + uint8_t** inEdgeMem; uint8_t** outEdgeMem; + uint32_t* inEdgeMemSize; uint32_t* outEdgeMemSize; + uint32_t* inEdgeMemBase; uint32_t* outEdgeMemBase; + + // Where to map the various regions + // (If false, map to SRAM instead) + bool mapVerticesToDRAM; + bool mapInEdgesToDRAM; + bool mapOutEdgesToDRAM; + + // Allow mapper to print useful information to stdout + uint32_t chatty; + + // Setter for number of boards to use + void setNumBoards(uint32_t x, uint32_t y) { + if (x 
> meshLenX || y > meshLenY) { + printf("Mapper: %d x %d boards requested, %d x %d available\n", + numBoardsX, numBoardsY, meshLenX, meshLenY); + exit(EXIT_FAILURE); + } + numBoardsX = x; + numBoardsY = y; + } + + // Create new device + inline PDeviceId newDevice() { + edgeLabels.append(new SmallSeq); + numDevices++; + return graph.newNode(); + } + + // Add a connection between devices + inline void addEdge(PDeviceId from, PinId pin, PDeviceId to) { + if (pin >= POLITE_NUM_PINS) { + printf("addEdge: pin exceeds POLITE_NUM_PINS\n"); + exit(EXIT_FAILURE); + } + graph.addEdge(from, pin, to); + E edge; + edgeLabels.elems[from]->append(edge); + } + + // Add labelled edge using given output pin + void addLabelledEdge(E edge, PDeviceId x, PinId pin, PDeviceId y) { + graph.addEdge(x, pin, y); + edgeLabels.elems[x]->append(edge); + } + + // Allocate SRAM and DRAM partitions + void allocatePartitions() { + // Decide a maximum partition size that is reasonable + // SRAM: Partition size minus 2048 bytes for the stack + uint32_t maxSRAMSize = (1<)); + // Add space for devices + uint32_t numDevs = numDevicesOnThread[threadId]; + for (uint32_t devNum = 0; devNum < numDevs; devNum++) { + // Add space for device + sizeVMem = sizeVMem + sizeof(PState); + } + // Add space for incoming edge table + if (inTable[threadId]) { + sizeEIMem = inTable[threadId]->numElems * sizeof(PInEdge); + sizeEIMem = wordAlign(sizeEIMem); + } + // Add space for outgoing edge table + for (uint32_t devNum = 0; devNum < numDevs; devNum++) { + PDeviceId id = fromDeviceAddr[threadId][devNum]; + for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) { + Seq* edges = outTable[id][p]; + sizeEOMem += sizeof(POutEdge) * edges->numElems; + } + } + sizeEOMem = wordAlign(sizeEOMem); + // The total partition size including uninitialised portions + uint32_t totalSizeVMem = + sizeVMem + wordAlign(sizeof(PLocalDeviceId) * numDevs); + // Check that total size is reasonable + uint32_t totalSizeSRAM = sizeTMem; + uint32_t 
totalSizeDRAM = 0; + if (mapVerticesToDRAM) totalSizeDRAM += totalSizeVMem; + else totalSizeSRAM += totalSizeVMem; + if (mapInEdgesToDRAM) totalSizeDRAM += sizeEIMem; + else totalSizeSRAM += sizeEIMem; + if (mapOutEdgesToDRAM) totalSizeDRAM += sizeEOMem; + else totalSizeSRAM += sizeEOMem; + if (totalSizeDRAM > maxDRAMSize) { + printf("Error: max DRAM partition size exceeded\n"); + exit(EXIT_FAILURE); + } + if (totalSizeSRAM > maxSRAMSize) { + printf("Error: max SRAM partition size exceeded\n"); + exit(EXIT_FAILURE); + } + // Allocate space for the initialised portion of the partition + assert((sizeVMem%4) == 0); + assert((sizeTMem%4) == 0); + assert((sizeEIMem%4) == 0); + assert((sizeEOMem%4) == 0); + vertexMem[threadId] = (uint8_t*) calloc(sizeVMem, 1); + vertexMemSize[threadId] = sizeVMem; + threadMem[threadId] = (uint8_t*) calloc(sizeTMem, 1); + threadMemSize[threadId] = sizeTMem; + inEdgeMem[threadId] = (uint8_t*) calloc(sizeEIMem, 1); + inEdgeMemSize[threadId] = sizeEIMem; + outEdgeMem[threadId] = (uint8_t*) calloc(sizeEOMem, 1); + outEdgeMemSize[threadId] = sizeEOMem; + // Tinsel address of base of partition + uint32_t partId = threadId & (TinselThreadsPerDRAM-1); + uint32_t sramBase = (1 << TinselLogBytesPerSRAM) + + (partId << TinselLogBytesPerSRAMPartition); + uint32_t dramBase = TinselBytesPerDRAM - + ((partId+1) << TinselLogBytesPerDRAMPartition); + // Use partition-interleaved region for DRAM + dramBase |= 0x80000000; + threadMemBase[threadId] = sramBase; + sramBase += threadMemSize[threadId]; + // Determine base addresses of each region + if (mapVerticesToDRAM) { + vertexMemBase[threadId] = dramBase; + dramBase += totalSizeVMem; + } + else { + vertexMemBase[threadId] = sramBase; + sramBase += totalSizeVMem; + } + if (mapInEdgesToDRAM) { + inEdgeMemBase[threadId] = dramBase; + dramBase += sizeEIMem; + } + else { + inEdgeMemBase[threadId] = sramBase; + sramBase += sizeEIMem; + } + if (mapOutEdgesToDRAM) { + outEdgeMemBase[threadId] = dramBase; + dramBase 
+= sizeEOMem; + } + else { + outEdgeMemBase[threadId] = sramBase; + sramBase += sizeEOMem; + } + } + } + + // Initialise partitions + void initialisePartitions() { + for (uint32_t threadId = 0; threadId < TinselMaxThreads; threadId++) { + // Next pointers for each partition + uint32_t nextVMem = 0; + uint32_t nextOutIndex = 0; + // Pointer to thread structure + PThread* thread = + (PThread*) &threadMem[threadId][0]; + // Set number of devices on thread + thread->numDevices = numDevicesOnThread[threadId]; + // Set number of devices in graph + thread->numVertices = numDevices; + // Set tinsel address of array of device states + thread->devices = vertexMemBase[threadId]; + // Set tinsel address of base of edge tables + thread->outTableBase = outEdgeMemBase[threadId]; + thread->inTableBase = inEdgeMemBase[threadId]; + // Add space for each device on thread + uint32_t numDevs = numDevicesOnThread[threadId]; + for (uint32_t devNum = 0; devNum < numDevs; devNum++) { + PState* dev = (PState*) &vertexMem[threadId][nextVMem]; + PDeviceId id = fromDeviceAddr[threadId][devNum]; + devices[id] = dev; + // Add space for device + nextVMem = nextVMem + sizeof(PState); + } + // Initialise each device and the thread's out edges + for (uint32_t devNum = 0; devNum < numDevs; devNum++) { + PDeviceId id = fromDeviceAddr[threadId][devNum]; + PState* dev = devices[id]; + // Initialise + POutEdge* outEdgeArray = (POutEdge*) outEdgeMem[threadId]; + for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) { + dev->pinBase[p] = nextOutIndex; + Seq* edges = outTable[id][p]; + for (uint32_t i = 0; i < edges->numElems; i++) { + outEdgeArray[nextOutIndex] = edges->elems[i]; + nextOutIndex++; + } + } + } + // Intialise thread's in edges + PInEdge* inEdgeArray = (PInEdge*) inEdgeMem[threadId]; + Seq>* edges = inTable[threadId]; + if (edges) + for (uint32_t i = 0; i < edges->numElems; i++) { + inEdgeArray[i] = edges->elems[i]; + } + // At this point, check that next pointers line up with heap sizes + if 
(nextVMem != vertexMemSize[threadId]) { + printf("Error: vertex mem size does not match pre-computed size\n"); + exit(EXIT_FAILURE); + } + if ((nextOutIndex * sizeof(POutEdge)) != outEdgeMemSize[threadId]) { + printf("Error: out edge mem size does not match pre-computed size\n"); + exit(EXIT_FAILURE); + } + // Set tinsel address of senders array + thread->senders = vertexMemBase[threadId] + nextVMem; + } + } + + // Allocate mapping structures + void allocateMapping() { + devices = (PState**) calloc(numDevices, sizeof(PState*)); + toDeviceAddr = (PDeviceAddr*) calloc(numDevices, sizeof(PDeviceAddr)); + fromDeviceAddr = (PDeviceId**) calloc(TinselMaxThreads, sizeof(PDeviceId*)); + numDevicesOnThread = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t)); + } + + // Allocate routing tables + // (Only valid after mapper is called) + void allocateRoutingTables() { + // Receiver-side tables + inTable = (Seq>**) + calloc(TinselMaxThreads,sizeof(Seq>*)); + for (uint32_t t = 0; t < TinselMaxThreads; t++) { + if (numDevicesOnThread[t] != 0) + inTable[t] = new SmallSeq>; + } + + // Sender-side tables + outTable = (Seq***) calloc(numDevices, sizeof(Seq**)); + for (uint32_t d = 0; d < numDevices; d++) { + outTable[d] = (Seq**) + calloc(POLITE_NUM_PINS, sizeof(Seq*)); + for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) + outTable[d][p] = new SmallSeq; + } + } + + // Pack a receivers array + // Input: an in-edge sequence for each thread in a mailbox. 
+ // Input array may contain lots of holes (0-element sequences) + // Output: a sequence of receiver groups + // Output array contains no empty receiver groups + void createReceiverGroups( + uint32_t mbox, + Seq>* receivers, + Seq>* groups) { + groups->clear(); + for (uint32_t i = 0; i < 64; i++) { + if (receivers[i].numElems > 0) { + // Add receiver group + PReceiverGroup g; + g.threadId = (mbox << TinselLogThreadsPerMailbox) | i; + g.receivers = &receivers[i]; + groups->append(g); + } + } + } + + // Determine routing key for given set of receivers + // (The key must be the same for all receivers) + uint32_t findKey(Seq>* receivers) { + uint32_t key = 0; + + bool found = false; + while (!found) { + found = true; + for (uint32_t i = 0; i < receivers->numElems; i++) { + PReceiverGroup g = receivers->elems[i]; + uint32_t numReceivers = g.receivers->numElems; + if (numReceivers > 0) { + // Lookup thread id of receiver + uint32_t t = g.threadId; + // Lookup table size for this thread + uint32_t tableSize = inTable[t]->numElems; + // Move to next receiver when we find a space + if (key >= tableSize) continue; + // Is there space at the current key? 
+ // (Need space for numReceivers plus null terminator) + bool space = true; + for (int j = 0; j < numReceivers+1; j++) { + if ((key+j) >= tableSize) break; + if (inTable[t]->elems[key+j].devId != UnusedLocalDevId) { + found = false; + key = key+j+1; + break; + } + } + } + } + } + return key; + } + + // Add entries to the input tables for the given receivers + // (Only valid after mapper is called) + uint32_t addInTableEntries(Seq>* receivers) { + uint32_t key = findKey(receivers); + if (key >= 0xfffe) { + printf("Routing key exceeds 16 bits\n"); + exit(EXIT_FAILURE); + } + PInEdge null, unused; + null.devId = InvalidLocalDevId; + unused.devId = UnusedLocalDevId; + // Now that a key with sufficient space has been found, populate the tables + for (uint32_t i = 0; i < receivers->numElems; i++) { + PReceiverGroup g = receivers->elems[i]; + uint32_t numReceivers = g.receivers->numElems; + if (numReceivers > 0) { + // Lookup thread id of receiver + uint32_t t = g.threadId; + // Lookup table size for this thread + uint32_t tableSize = inTable[t]->numElems; + // Make sure inTable is big enough for new entries + for (uint32_t j = tableSize; j < (key+numReceivers+1); j++) + inTable[t]->append(unused); + // Add receivers to thread's inTable + for (uint32_t j = 0; j < numReceivers; j++) { + inTable[t]->elems[key+j] = g.receivers->elems[j]; + } + inTable[t]->elems[key+numReceivers] = null; + } + } + return key; + } + + // Compute routing tables + // (Only valid after mapper is called) + void computeRoutingTables() { + // Routing table stats + uint64_t totalOutEdges = 0; + + // Sequence of local device ids, for each multicast destiation + SmallSeq> receivers[64]; + + // Sequence of receiver groups + // (A more compact representation of the receivers array) + SmallSeq> groups; + + // For each device + for (uint32_t d = 0; d < numDevices; d++) { + // For each pin + for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) { + Seq dests = *(graph.outgoing->elems[d]); + Seq edges = 
*(edgeLabels.elems[d]); + // While destinations are remaining + while (dests.numElems > 0) { + // Clear receivers + for (uint32_t i = 0; i < 64; i++) receivers[i].clear(); + uint32_t threadMaskLow = 0; + uint32_t threadMaskHigh = 0; + // Current mailbox being considered + PDeviceAddr mbox = getThreadId(toDeviceAddr[dests.elems[0]]) >> + TinselLogThreadsPerMailbox; + // For each destination + uint32_t destsRemaining = 0; + for (uint32_t i = 0; i < dests.numElems; i++) { + // Determine destination mailbox address and mailbox-local thread + PDeviceId destId = dests.elems[i]; + PDeviceAddr destAddr = toDeviceAddr[destId]; + uint32_t destMailbox = getThreadId(destAddr) >> + TinselLogThreadsPerMailbox; + uint32_t destThread = getThreadId(destAddr) & + ((1< edge; + edge.devId = getLocalDeviceId(destAddr); + if (! std::is_same::value) edge.edge = edges.elems[i]; + receivers[destThread].append(edge); + if (destThread < 32) threadMaskLow |= 1 << destThread; + if (destThread >= 32) threadMaskHigh |= 1 << (destThread-32); + } + else { + // Add destination back into sequence + dests.elems[destsRemaining] = dests.elems[i]; + edges.elems[destsRemaining] = edges.elems[i]; + destsRemaining++; + } + } + // Create receiver groups + createReceiverGroups(mbox, receivers, &groups); + // Add input table entries + uint32_t key = addInTableEntries(&groups); + // Add output table entry + POutEdge edge; + edge.mbox = mbox; + edge.key = key; + edge.threadMaskLow = threadMaskLow; + edge.threadMaskHigh = threadMaskHigh; + outTable[d][p]->append(edge); + // Prepare for new output table entry + dests.numElems = destsRemaining; + edges.numElems = destsRemaining; + totalOutEdges++; + } + // Add output edge terminator + POutEdge term; + term.key = InvalidKey; + outTable[d][p]->append(term); + } + } + //printf("Average edges per pin: %lu\n", + // totalOutEdges / (numDevices * POLITE_NUM_PINS); + } + + // Release all structures + void releaseAll() { + if (devices != NULL) { + free(devices); + 
free(toDeviceAddr); + free(numDevicesOnThread); + for (uint32_t t = 0; t < TinselMaxThreads; t++) + if (fromDeviceAddr[t] != NULL) free(fromDeviceAddr[t]); + free(fromDeviceAddr); + for (uint32_t t = 0; t < TinselMaxThreads; t++) + if (vertexMem[t] != NULL) free(vertexMem[t]); + free(vertexMem); + free(vertexMemSize); + free(vertexMemBase); + for (uint32_t t = 0; t < TinselMaxThreads; t++) + if (threadMem[t] != NULL) free(threadMem[t]); + free(threadMem); + free(threadMemSize); + free(threadMemBase); + for (uint32_t t = 0; t < TinselMaxThreads; t++) + if (inEdgeMem[t] != NULL) free(inEdgeMem[t]); + free(inEdgeMem); + free(inEdgeMemSize); + free(inEdgeMemBase); + for (uint32_t t = 0; t < TinselMaxThreads; t++) + if (outEdgeMem[t] != NULL) free(outEdgeMem[t]); + free(outEdgeMem); + free(outEdgeMemSize); + free(outEdgeMemBase); + } + if (inTable != NULL) { + for (uint32_t t = 0; t < TinselMaxThreads; t++) + if (inTable[t] != NULL) delete inTable[t]; + free(inTable); + inTable = NULL; + } + if (outTable != NULL) { + for (uint32_t d = 0; d < numDevices; d++) { + if (outTable[d] == NULL) continue; + for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) + delete outTable[d][p]; + free(outTable[d]); + } + free(outTable); + outTable = NULL; + } + } + + // Implement mapping to tinsel threads + void map() { + // Let's measure some times + struct timeval placementStart, placementFinish; + struct timeval routingStart, routingFinish; + struct timeval initStart, initFinish; + + // Release all mapping and heap structures + releaseAll(); + + // Reallocate mapping structures + allocateMapping(); + + // Start placement timer + gettimeofday(&placementStart, NULL); + + // Partition into subgraphs, one per board + Placer boards(&graph, numBoardsX, numBoardsY); + + // Place subgraphs onto 2D mesh + const uint32_t placerEffort = 8; + boards.place(placerEffort); + + // For each board + for (uint32_t boardY = 0; boardY < numBoardsY; boardY++) { + for (uint32_t boardX = 0; boardX < numBoardsX; 
boardX++) { + // Partition into subgraphs, one per mailbox + PartitionId b = boards.mapping[boardY][boardX]; + Placer boxes(&boards.subgraphs[b], + TinselMailboxMeshXLen, TinselMailboxMeshYLen); + boxes.place(placerEffort); + + // For each mailbox + for (uint32_t boxX = 0; boxX < TinselMailboxMeshXLen; boxX++) { + for (uint32_t boxY = 0; boxY < TinselMailboxMeshYLen; boxY++) { + // Partition into subgraphs, one per thread + uint32_t numThreads = 1<incoming->numElems; + numDevicesOnThread[threadId] = numDevs; + fromDeviceAddr[threadId] = (PDeviceId*) + malloc(sizeof(PDeviceId) * numDevs); + for (uint32_t devNum = 0; devNum < numDevs; devNum++) + fromDeviceAddr[threadId][devNum] = g->labels->elems[devNum]; + + // Populate toDeviceAddr mapping + assert(numDevs < maxLocalDeviceId()); + for (uint32_t devNum = 0; devNum < numDevs; devNum++) { + PDeviceAddr devAddr = + makeDeviceAddr(threadId, devNum); + toDeviceAddr[g->labels->elems[devNum]] = devAddr; + } + } + } + } + } + } + + // Stop placement timer and start routing timer + gettimeofday(&placementFinish, NULL); + gettimeofday(&routingStart, NULL); + + // Compute send and receive side routing tables + allocateRoutingTables(); + computeRoutingTables(); + + // Stop routing timer and start init timer + gettimeofday(&routingFinish, NULL); + gettimeofday(&initStart, NULL); + + // Reallocate and initialise heap structures + allocatePartitions(); + initialisePartitions(); + + // Display times, if chatty + gettimeofday(&initFinish, NULL); + if (chatty > 0) { + struct timeval diff; + + timersub(&placementFinish, &placementStart, &diff); + double duration = (double) diff.tv_sec + + (double) diff.tv_usec / 1000000.0; + printf("POLite mapper profile:\n"); + printf(" Partitioning and placement: %lfs\n", duration); + + timersub(&routingFinish, &routingStart, &diff); + duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + printf(" Routing table construction: %lfs\n", duration); + + timersub(&initFinish, &initStart, 
&diff); + duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + printf(" Thread state initialisation: %lfs\n", duration); + } + } + + // Constructor + PGraph() { + char* str = getenv("HOSTLINK_BOXES_X"); + int x = str ? atoi(str) : 1; + x = x * TinselMeshXLenWithinBox; + str = getenv("HOSTLINK_BOXES_Y"); + int y = str ? atoi(str) : 1; + y = y * TinselMeshYLenWithinBox; + constructor(x, y); + } + PGraph(uint32_t numBoxesX, uint32_t numBoxesY) { + int x = numBoxesX * TinselMeshXLenWithinBox; + int y = numBoxesY * TinselMeshYLenWithinBox; + constructor(x, y); + } + + // Deconstructor + ~PGraph() { + releaseAll(); + for (uint32_t i = 0; i < edgeLabels.numElems; i++) + delete edgeLabels.elems[i]; + } + + // Write partition to tinsel machine + void writeRAM(HostLink* hostLink, + uint8_t** heap, uint32_t* heapSize, uint32_t* heapBase) { + // Number of bytes written by each thread + uint32_t* writeCount = (uint32_t*) + calloc(TinselMaxThreads, sizeof(uint32_t)); + + // Number of threads completed by each core + uint32_t*** threadCount = (uint32_t***) + calloc(meshLenX, sizeof(uint32_t**)); + for (uint32_t x = 0; x < meshLenX; x++) { + threadCount[x] = (uint32_t**) + calloc(meshLenY, sizeof(uint32_t*)); + for (uint32_t y = 0; y < meshLenY; y++) + threadCount[x][y] = (uint32_t*) + calloc(TinselCoresPerBoard, sizeof(uint32_t)); + } + + // Initialise write addresses + for (int x = 0; x < meshLenX; x++) + for (int y = 0; y < meshLenY; y++) + for (int c = 0; c < TinselCoresPerBoard; c++) + hostLink->setAddr(x, y, c, heapBase[hostLink->toAddr(x, y, c, 0)]); + + // Write heaps + uint32_t done = false; + while (! 
done) { + done = true; + for (int x = 0; x < meshLenX; x++) { + for (int y = 0; y < meshLenY; y++) { + for (int c = 0; c < TinselCoresPerBoard; c++) { + uint32_t t = threadCount[x][y][c]; + if (t < TinselThreadsPerCore) { + done = false; + uint32_t threadId = hostLink->toAddr(x, y, c, t); + uint32_t written = writeCount[threadId]; + if (written == heapSize[threadId]) { + threadCount[x][y][c] = t+1; + if ((t+1) < TinselThreadsPerCore) + hostLink->setAddr(x, y, c, + heapBase[hostLink->toAddr(x, y, c, t+1)]); + } else { + uint32_t send = min((heapSize[threadId] - written)>>2, 15); + hostLink->store(x, y, c, send, + (uint32_t*) &heap[threadId][written]); + writeCount[threadId] = written + send * sizeof(uint32_t); + } + } + } + } + } + } + + // Release memory + free(writeCount); + for (uint32_t x = 0; x < meshLenX; x++) { + for (uint32_t y = 0; y < meshLenY; y++) + free(threadCount[x][y]); + free(threadCount[x]); + } + free(threadCount); + } + + // Write graph to tinsel machine + void write(HostLink* hostLink) { + // Start timer + struct timeval start, finish; + gettimeofday(&start, NULL); + + bool useSendBufferOld = hostLink->useSendBuffer; + hostLink->useSendBuffer = true; + writeRAM(hostLink, vertexMem, vertexMemSize, vertexMemBase); + writeRAM(hostLink, threadMem, threadMemSize, threadMemBase); + writeRAM(hostLink, inEdgeMem, inEdgeMemSize, inEdgeMemBase); + writeRAM(hostLink, outEdgeMem, outEdgeMemSize, outEdgeMemBase); + hostLink->flush(); + hostLink->useSendBuffer = useSendBufferOld; + + // Display time if chatty + gettimeofday(&finish, NULL); + if (chatty > 0) { + struct timeval diff; + timersub(&finish, &start, &diff); + double duration = (double) diff.tv_sec + + (double) diff.tv_usec / 1000000.0; + printf("POLite graph upload time: %lfs\n", duration); + } + } + + // Determine fan-in of given device + uint32_t fanIn(PDeviceId id) { + return graph.fanIn(id); + } + + // Determine fan-out of given device + uint32_t fanOut(PDeviceId id) { + return graph.fanOut(id); 
+ } + +}; + +// Read performance stats and store in file +inline void politeSaveStats(HostLink* hostLink, const char* filename) { + #ifdef POLITE_DUMP_STATS + // Open file for performance counters + FILE* statsFile = fopen(filename, "wt"); + if (statsFile == NULL) { + printf("Error creating stats file\n"); + exit(EXIT_FAILURE); + } + uint32_t meshLenX = hostLink->meshXLen; + uint32_t meshLenY = hostLink->meshYLen; + // Number of caches + uint32_t numLines = meshLenX * meshLenY * + TinselDCachesPerDRAM * TinselDRAMsPerBoard; + // Add on number of cores + numLines += meshLenX * meshLenY * TinselCoresPerBoard; + // Add on number of threads + #ifdef POLITE_COUNT_MSGS + numLines += meshLenX * meshLenY * TinselThreadsPerBoard; + #endif + hostLink->dumpStdOut(statsFile, numLines); + fclose(statsFile); + #endif +} + +#endif From 7b0529d03a16d7c16059c15657116a001022a84a Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Mon, 8 Jun 2020 08:30:36 +0000 Subject: [PATCH 70/78] POLite: fixes to local mapper --- include/POLite.h | 8 ++++---- include/POLite/Local/PDevice.h | 4 ++++ include/POLite/Local/PGraph.h | 5 ++++- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/include/POLite.h b/include/POLite.h index d1d5fbc6..735f8bd3 100644 --- a/include/POLite.h +++ b/include/POLite.h @@ -5,8 +5,8 @@ #include // Select default mapper -#if !defined(POLITE_MAP_LOCAL) || \ - !defined(POLITE_MAP_DIST) || \ +#if !defined(POLITE_MAP_LOCAL) && \ + !defined(POLITE_MAP_DIST) && \ !defined(POLITE_MAP_HYBRID) // Default mapper #define POLITE_MAP_HYBRID @@ -22,13 +22,13 @@ #include #endif #else - #if defined(POLITE_FAST_LOCAL) + #if defined(POLITE_MAP_LOCAL) #include #include #elif defined(POLITE_MAP_DIST) #include #include - #elif defined (POLITE_MAP_HYBRID) + #elif defined(POLITE_MAP_HYBRID) #include #include #endif diff --git a/include/POLite/Local/PDevice.h b/include/POLite/Local/PDevice.h index ca806a58..9408cfae 100644 --- a/include/POLite/Local/PDevice.h +++ 
b/include/POLite/Local/PDevice.h @@ -273,7 +273,9 @@ template #include #include -#include "Seq.h" // Nodes of a POETS graph are devices typedef NodeId PDeviceId; @@ -91,6 +90,10 @@ template Date: Wed, 10 Jun 2020 14:38:58 +0000 Subject: [PATCH 71/78] New BFS-based partitioner This is faster than METIS and gives similar results for a few graphs that I've tried. It works by picking an unvisited vertex, and then doing a size-bounded BFS from that vertex with the size-bound equal to the partition size. This process is repeated until the partition is full, at which point we move to a new partition and repeat the process until all vertices have been visited. It is just a few lines of code, and could be easily parallelised in future. --- include/POLite/Placer.h | 57 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/include/POLite/Placer.h b/include/POLite/Placer.h index 4178af50..d2f2378a 100644 --- a/include/POLite/Placer.h +++ b/include/POLite/Placer.h @@ -5,6 +5,7 @@ #include #include #include +#include typedef uint32_t PartitionId; @@ -15,7 +16,8 @@ struct Placer { Default, Metis, Random, - Direct + Direct, + BFS }; const Method defaultMethod=Metis; @@ -64,6 +66,8 @@ struct Placer { method=Random; else if (!strcmp(e, "direct")) method=Direct; + else if (!strcmp(e, "bfs")) + method=BFS; else if (!strcmp(e, "default") || *e == '\0') method=Default; else { @@ -174,6 +178,54 @@ struct Placer { } } + // Partition the graph using repeated BFS + void partitionBFS() { + uint32_t numVertices = graph->incoming->numElems; + uint32_t numParts = width * height; + uint32_t partSize = (numVertices + numParts) / numParts; + + // Visited bit for each vertex + bool* seen = new bool [numVertices]; + memset(seen, 0, numVertices); + + // Next vertex to visit + uint32_t nextUnseen = 0; + + // Next partition id + uint32_t nextPart = 0; + + while (nextUnseen < numVertices) { + // Frontier + std::queue frontier; + uint32_t count = 0; + + 
while (nextUnseen < numVertices && count < partSize) { + // Sized-bounded BFS from nextUnseen + frontier.push(nextUnseen); + while (count < partSize && !frontier.empty()) { + uint32_t v = frontier.front(); + frontier.pop(); + if (!seen[v]) { + seen[v] = true; + partitions[v] = nextPart; + count++; + // Add unvisited neighbours of v to the frontier + Seq* dests = graph->outgoing->elems[v]; + for (uint32_t i = 0; i < dests->numElems; i++) { + uint32_t w = dests->elems[i]; + if (!seen[w]) frontier.push(w); + } + } + } + while (nextUnseen < numVertices && seen[nextUnseen]) nextUnseen++; + } + + nextPart++; + } + + delete [] seen; + } + void partition() { switch(method){ @@ -187,6 +239,9 @@ struct Placer { case Direct: partitionDirect(); break; + case BFS: + partitionBFS(); + break; } } From cb7f663a28298d740478bab8ae499797b5444a64 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 23 Jun 2020 09:12:40 +0100 Subject: [PATCH 72/78] Attempt to unify POLite mappers Compiles but untested... 
--- include/POLite.h | 28 +- include/POLite/Bitmap.h | 58 ++ include/POLite/Dist/PDevice.h | 302 ---------- include/POLite/Dist/PGraph.h | 708 ---------------------- include/POLite/Hybrid/PDevice.h | 321 ---------- include/POLite/Hybrid/PGraph.h | 854 --------------------------- include/POLite/{Local => }/PDevice.h | 91 +-- include/POLite/{Local => }/PGraph.h | 529 +++++++++++------ include/tinsel-interface.h | 9 + include/tinsel.h | 6 +- 10 files changed, 469 insertions(+), 2437 deletions(-) create mode 100644 include/POLite/Bitmap.h delete mode 100644 include/POLite/Dist/PDevice.h delete mode 100644 include/POLite/Dist/PGraph.h delete mode 100644 include/POLite/Hybrid/PDevice.h delete mode 100644 include/POLite/Hybrid/PGraph.h rename include/POLite/{Local => }/PDevice.h (80%) rename include/POLite/{Local => }/PGraph.h (62%) diff --git a/include/POLite.h b/include/POLite.h index 735f8bd3..f053e440 100644 --- a/include/POLite.h +++ b/include/POLite.h @@ -4,34 +4,12 @@ #include -// Select default mapper -#if !defined(POLITE_MAP_LOCAL) && \ - !defined(POLITE_MAP_DIST) && \ - !defined(POLITE_MAP_HYBRID) - // Default mapper - #define POLITE_MAP_HYBRID -#endif - #ifdef TINSEL #include - #if defined(POLITE_MAP_LOCAL) - #include - #elif defined(POLITE_MAP_DIST) - #include - #elif defined(POLITE_MAP_HYBRID) - #include - #endif + #include #else - #if defined(POLITE_MAP_LOCAL) - #include - #include - #elif defined(POLITE_MAP_DIST) - #include - #include - #elif defined(POLITE_MAP_HYBRID) - #include - #include - #endif + #include + #include #include #include #include diff --git a/include/POLite/Bitmap.h b/include/POLite/Bitmap.h new file mode 100644 index 00000000..262f99af --- /dev/null +++ b/include/POLite/Bitmap.h @@ -0,0 +1,58 @@ +#ifndef _BITMAP_H_ +#define _BITMAP_H_ + +#include +#include +#include + +struct Bitmap { + // Bitmap contents (sequence of 64-bit words) + Seq* contents; + + // Index of first non-full word in bitmap + uint32_t firstFree; + + // Constructor + 
Bitmap() { + contents = new Seq (16); + firstFree = 0; + } + + // Destructor + ~Bitmap() { + if (contents) delete contents; + } + + // Get value of word at given index, return 0 if out-of-bounds + inline uint64_t getWord(uint32_t index) { + return index >= contents->numElems ? 0ul : contents->elems[index]; + } + + // Find index of next free word in bitmap starting from given word index + inline uint32_t nextFreeWordFrom(uint32_t start) { + for (uint32_t i = start; i < contents->numElems; i++) + if (~contents->elems[i] != 0ul) return i; + return contents->numElems; + } + + // Set bit at given index and bit offset in bitmap + inline void setBit(uint32_t wordIndex, uint32_t bitIndex) { + for (uint32_t i = contents->numElems; i <= wordIndex; i++) + contents->append(0ul); + contents->elems[wordIndex] |= 1ul << bitIndex; + if (wordIndex == firstFree) { + firstFree = nextFreeWordFrom(firstFree); + } + } + + // Find index of next zero bit, and flip that bit + inline uint32_t grabNextBit() { + uint64_t word = getWord(firstFree); + assert(word != 0ul); + uint32_t bit = __builtin_ctzll(~word); + setBit(firstFree, bit); + return 64*firstFree + bit; + } +}; + +#endif diff --git a/include/POLite/Dist/PDevice.h b/include/POLite/Dist/PDevice.h deleted file mode 100644 index f095eba6..00000000 --- a/include/POLite/Dist/PDevice.h +++ /dev/null @@ -1,302 +0,0 @@ -// SPDX-License-Identifier: BSD-2-Clause -#ifndef _PDEVICE_H_ -#define _PDEVICE_H_ - -#include -#include -#include - -#ifdef TINSEL - #include - #define PTR(t) t* -#else - #include - #define PTR(t) uint32_t -#endif - -// Use this to align on half-cache-line boundary -#define ALIGNED __attribute__((aligned(1<<(TinselLogBytesPerLine-1)))) - -// This is a static limit on the number of pins per device -#ifndef POLITE_NUM_PINS -#define POLITE_NUM_PINS 1 -#endif - -// Macros for performance stats -// POLITE_DUMP_STATS - dump performance stats on termination -// POLITE_COUNT_MSGS - include message counts of performance stats - -// 
Thread-local device id -typedef uint16_t PLocalDeviceId; - -// Thread id -typedef uint32_t PThreadId; - -// Device address -// Bits 17->0: thread id -// Bit 18: invalid address -// Bits 31->19: thread-local device id -typedef uint32_t PDeviceAddr; - -// Device address constructors -inline PDeviceAddr invalidDeviceAddr() { return 0x40000; } -inline PDeviceAddr makeDeviceAddr(PThreadId t, PLocalDeviceId d) { - return (d << 19) | t; -} - -// Device address deconstructors -inline bool isValidDeviceAddr(PDeviceAddr addr) { return !(addr & 0x40000); } -inline PThreadId getThreadId(PDeviceAddr addr) { return addr & 0x3ffff; } -inline PLocalDeviceId getLocalDeviceId(PDeviceAddr addr) { return addr >> 19; } - -// What's the max allowed local device address? -inline uint32_t maxLocalDeviceId() { return 8192; } - -// Pins -// No - means 'not ready to send' -// HostPin - means 'send to host' -// Pin(n) - means 'send to application pin number n' -typedef uint8_t PPin; -#define No 0 -#define HostPin 1 -#define Pin(n) ((n)+2) - -// For template arguments that are not used -struct None {}; - -// Generic device structure -// Type parameters: -// S - State -// E - Edge label -// M - Message structure -template struct PDevice { - // State - S* s; - PPin* readyToSend; - uint32_t numVertices; - uint16_t time; - - // Handlers - void init(); - void send(volatile M* msg); - void recv(M* msg, E* edge); - bool step(); - bool finish(volatile M* msg); -}; - -// Generic device state structure -template struct ALIGNED PState { - // Board-level routing key for each outgoing pin - uint32_t pin[POLITE_NUM_PINS]; - // Ready-to-send status - PPin readyToSend; - // Custom state - S state; -}; - -// Message structure -template struct PMessage { - // Destination thread-local device id - uint16_t devId; - // Id of incoming edge - uint16_t edgeId; - // Application message - M payload; -}; - -// An incoming edge to a device -template struct PInEdge { - E edge; -}; - -// Generic thread structure -template 
struct PThread { - - // Number of devices handled by thread - PLocalDeviceId numDevices; - // Number of times step handler has been called - uint16_t time; - // Number of devices in graph - uint32_t numVertices; - // Pointer to array of device states - PTR(PState) devices; - // Pointer to base of edge table - PTR(PInEdge) inTableBase; - // Array of local device ids are ready to send - PTR(PLocalDeviceId) senders; - // This array is accessed in a LIFO manner - PTR(PLocalDeviceId) sendersTop; - - // Count number of messages sent - #ifdef POLITE_COUNT_MSGS - // Total messages sent - uint32_t msgsSent; - // Total messages received - uint32_t msgsReceived; - // Number of times we wanted to send but couldn't - uint32_t blockedSends; - #endif - - #ifdef TINSEL - - // Helper function to construct a device - INLINE DeviceType getDevice(uint32_t id) { - DeviceType dev; - dev.s = &devices[id].state; - dev.readyToSend = &devices[id].readyToSend; - dev.numVertices = numVertices; - dev.time = time; - return dev; - } - - // Dump performance counter stats over UART - void dumpStats() { - tinselPerfCountStop(); - uint32_t me = tinselId(); - // Per-cache performance counters - uint32_t cacheMask = (1 << - (TinselLogThreadsPerCore + TinselLogCoresPerDCache)) - 1; - if ((me & cacheMask) == 0) { - printf("H:%x,M:%x,W:%x\n", - tinselHitCount(), - tinselMissCount(), - tinselWritebackCount()); - } - // Per-core performance counters - uint32_t coreMask = (1 << (TinselLogThreadsPerCore)) - 1; - if ((me & coreMask) == 0) { - printf("C:%x %x,I:%x %x\n", - tinselCycleCountU(), tinselCycleCount(), - tinselCPUIdleCountU(), tinselCPUIdleCount()); - } - // Per-thread performance counters - #ifdef POLITE_COUNT_MSGS - uint32_t intraBoardId = me & ((1<)-1) >> TinselLogBytesPerFlit); - - // Event loop - while (1) { - // Try to send - if (sendersTop != senders) { - if (tinselCanSend()) { - // Get next sender - PLocalDeviceId src = *(--sendersTop); - // Lookup device - DeviceType dev = getDevice(src); - 
PPin pin = *dev.readyToSend; - // Invoke send handler - PMessage* m = (PMessage*) tinselSendSlot(); - dev.send(&m->payload); - // Reinsert sender, if it still wants to send - if (*dev.readyToSend != No) sendersTop++; - // Is it a send to the host pin or a user pin? - if (pin == HostPin) - tinselSend(tinselHostId(), m); - else - tinselKeySend(devices[src].pin[pin-2], m); - #ifdef POLITE_COUNT_MSGS - msgsSent++; - #endif - } - else { - #ifdef POLITE_COUNT_MSGS - blockedSends++; - #endif - tinselWaitUntil(TINSEL_CAN_SEND|TINSEL_CAN_RECV); - } - } - else { - // Idle detection - int idle = tinselIdle(!active); - if (idle > 1) - break; - else if (idle) { - active = false; - for (uint32_t i = 0; i < numDevices; i++) { - DeviceType dev = getDevice(i); - // Invoke the step handler for each device - active = dev.step() || active; - // Device ready to send? - if (*dev.readyToSend != No) { - *(sendersTop++) = i; - } - } - time++; - } - } - - // Step 2: try to receive - while (tinselCanRecv()) { - PMessage* inMsg = (PMessage*) tinselRecv(); - PInEdge* inEdge = &inTableBase[inMsg->edgeId]; - // Lookup destination device - PLocalDeviceId id = inMsg->devId; - DeviceType dev = getDevice(id); - // Was it ready to send? 
- PPin oldReadyToSend = *dev.readyToSend; - // Invoke receive handler - dev.recv(&inMsg->payload, &inEdge->edge); - // Insert device into a senders array, if not already there - if (*dev.readyToSend != No && oldReadyToSend == No) - *(sendersTop++) = id; - #ifdef POLITE_COUNT_MSGS - msgsReceived++; - #endif - tinselFree(inMsg); - } - } - - // Termination - #ifdef POLITE_DUMP_STATS - dumpStats(); - #endif - - // Invoke finish handler for each device - for (uint32_t i = 0; i < numDevices; i++) { - DeviceType dev = getDevice(i); - tinselWaitUntil(TINSEL_CAN_SEND); - PMessage* m = (PMessage*) tinselSendSlot(); - if (dev.finish(&m->payload)) tinselSend(tinselHostId(), m); - } - - // Sleep - tinselWaitUntil(TINSEL_CAN_RECV); while (1); - } - - #endif - -}; - -#endif diff --git a/include/POLite/Dist/PGraph.h b/include/POLite/Dist/PGraph.h deleted file mode 100644 index 923c34d3..00000000 --- a/include/POLite/Dist/PGraph.h +++ /dev/null @@ -1,708 +0,0 @@ -// SPDX-License-Identifier: BSD-2-Clause -#ifndef _PGRAPH_H_ -#define _PGRAPH_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// Nodes of a POETS graph are devices -typedef NodeId PDeviceId; - -// POETS graph -template class PGraph { - private: - // Align address to 2^n byte boundary - inline uint32_t align(uint32_t n, uint32_t addr) { - if ((addr & (1<> n) + 1) << n; - } - - // Align address to 32-bit word boundary - uint32_t wordAlign(uint32_t addr) { return align(2, addr); } - - // Align address to cache-line boundary - uint32_t cacheAlign(uint32_t addr) { - return align(TinselLogBytesPerLine, addr); - } - - // Helper function - inline uint32_t min(uint32_t x, uint32_t y) { return x < y ? 
x : y; } - - // Number of FPGA boards available - uint32_t meshLenX; - uint32_t meshLenY; - - // Number of FPGA boards to use - uint32_t numBoardsX; - uint32_t numBoardsY; - - // Out table (sender-side edge tables) - // Sequence of destinations for every (device, pin) pair - Seq*** outTable; - - // Key table (sender-side key tables) - // Global routing key for every (device, pin) pair - uint32_t** keyTable; - - // In table (receiver-side edge tables) - // Sequence of incoming edges for every thread - Seq>** inTable; - - // Mesh of per-board programmable routers - ProgRouterMesh* routingTables; - - // Generic constructor - void constructor(uint32_t lenX, uint32_t lenY) { - meshLenX = lenX; - meshLenY = lenY; - char* str = getenv("POLITE_BOARDS_X"); - int nx = str ? atoi(str) : meshLenX; - str = getenv("POLITE_BOARDS_Y"); - int ny = str ? atoi(str) : meshLenY; - setNumBoards(nx, ny); - numDevices = 0; - devices = NULL; - toDeviceAddr = NULL; - numDevicesOnThread = NULL; - fromDeviceAddr = NULL; - vertexMem = NULL; - vertexMemSize = NULL; - vertexMemBase = NULL; - inEdgeMem = NULL; - inEdgeMemSize = NULL; - inEdgeMemBase = NULL; - mapVerticesToDRAM = false; - mapInEdgesToDRAM = true; - outTable = NULL; - keyTable = NULL; - inTable = NULL; - routingTables = NULL; - chatty = 0; - str = getenv("POLITE_CHATTY"); - if (str != NULL) { - chatty = !strcmp(str, "0") ? 
0 : 1; - } - } - - public: - // Number of devices - uint32_t numDevices; - - // Graph containing device ids and connections - Graph graph; - - // Edge labels: has same structure as graph.outgoing - Seq*> edgeLabels; - - // Mapping from device id to device state - // (Not valid until the mapper is called) - PState** devices; - - // Mapping from thread id to number of devices on that thread - // (Not valid until the mapper is called) - uint32_t* numDevicesOnThread; - - // Mapping from device id to device address and back - // (Not valid until the mapper is called) - PDeviceAddr* toDeviceAddr; // Device id -> device address - PDeviceId** fromDeviceAddr; // Device address -> device id - - // Each thread's vertex mem and thread mem regions - // (Not valid until the mapper is called) - uint8_t** vertexMem; uint8_t** threadMem; - uint32_t* vertexMemSize; uint32_t* threadMemSize; - uint32_t* vertexMemBase; uint32_t* threadMemBase; - - // Each thread's in-edge tables - // (Not valid until the mapper is called) - uint8_t** inEdgeMem; - uint32_t* inEdgeMemSize; - uint32_t* inEdgeMemBase; - - // Where to map the various regions - // (If false, map to SRAM instead) - bool mapVerticesToDRAM; - bool mapInEdgesToDRAM; - - // Allow mapper to print useful information to stdout - uint32_t chatty; - - // Setter for number of boards to use - void setNumBoards(uint32_t x, uint32_t y) { - if (x > meshLenX || y > meshLenY) { - printf("Mapper: %d x %d boards requested, %d x %d available\n", - numBoardsX, numBoardsY, meshLenX, meshLenY); - exit(EXIT_FAILURE); - } - numBoardsX = x; - numBoardsY = y; - } - - // Create new device - inline PDeviceId newDevice() { - edgeLabels.append(new SmallSeq); - numDevices++; - return graph.newNode(); - } - - // Add a connection between devices - inline void addEdge(PDeviceId from, PinId pin, PDeviceId to) { - if (pin >= POLITE_NUM_PINS) { - printf("addEdge: pin exceeds POLITE_NUM_PINS\n"); - exit(EXIT_FAILURE); - } - graph.addEdge(from, pin, to); - E edge; 
- edgeLabels.elems[from]->append(edge); - } - - // Add labelled edge using given output pin - void addLabelledEdge(E edge, PDeviceId x, PinId pin, PDeviceId y) { - graph.addEdge(x, pin, y); - edgeLabels.elems[x]->append(edge); - } - - // Allocate SRAM and DRAM partitions - void allocatePartitions() { - // Decide a maximum partition size that is reasonable - // SRAM: Partition size minus 2048 bytes for the stack - uint32_t maxSRAMSize = (1<)); - // Add space for devices - uint32_t numDevs = numDevicesOnThread[threadId]; - for (uint32_t devNum = 0; devNum < numDevs; devNum++) { - // Add space for device - sizeVMem = sizeVMem + sizeof(PState); - } - // Add space for incoming edge table - if (inTable[threadId]) { - sizeEIMem = inTable[threadId]->numElems * sizeof(PInEdge); - sizeEIMem = wordAlign(sizeEIMem); - } - // The total partition size including uninitialised portions - uint32_t totalSizeVMem = - sizeVMem + wordAlign(sizeof(PLocalDeviceId) * numDevs); - // Check that total size is reasonable - uint32_t totalSizeSRAM = sizeTMem; - uint32_t totalSizeDRAM = 0; - if (mapVerticesToDRAM) totalSizeDRAM += totalSizeVMem; - else totalSizeSRAM += totalSizeVMem; - if (mapInEdgesToDRAM) totalSizeDRAM += sizeEIMem; - else totalSizeSRAM += sizeEIMem; - if (totalSizeDRAM > maxDRAMSize) { - printf("Error: max DRAM partition size exceeded\n"); - exit(EXIT_FAILURE); - } - if (totalSizeSRAM > maxSRAMSize) { - printf("Error: max SRAM partition size exceeded\n"); - exit(EXIT_FAILURE); - } - // Allocate space for the initialised portion of the partition - assert((sizeVMem%4) == 0); - assert((sizeTMem%4) == 0); - assert((sizeEIMem%4) == 0); - vertexMem[threadId] = (uint8_t*) calloc(sizeVMem, 1); - vertexMemSize[threadId] = sizeVMem; - threadMem[threadId] = (uint8_t*) calloc(sizeTMem, 1); - threadMemSize[threadId] = sizeTMem; - inEdgeMem[threadId] = (uint8_t*) calloc(sizeEIMem, 1); - inEdgeMemSize[threadId] = sizeEIMem; - // Tinsel address of base of partition - uint32_t partId = 
threadId & (TinselThreadsPerDRAM-1); - uint32_t sramBase = (1 << TinselLogBytesPerSRAM) + - (partId << TinselLogBytesPerSRAMPartition); - uint32_t dramBase = TinselBytesPerDRAM - - ((partId+1) << TinselLogBytesPerDRAMPartition); - // Use partition-interleaved region for DRAM - dramBase |= 0x80000000; - threadMemBase[threadId] = sramBase; - sramBase += threadMemSize[threadId]; - // Determine base addresses of each region - if (mapVerticesToDRAM) { - vertexMemBase[threadId] = dramBase; - dramBase += totalSizeVMem; - } - else { - vertexMemBase[threadId] = sramBase; - sramBase += totalSizeVMem; - } - if (mapInEdgesToDRAM) { - inEdgeMemBase[threadId] = dramBase; - dramBase += sizeEIMem; - } - else { - inEdgeMemBase[threadId] = sramBase; - sramBase += sizeEIMem; - } - } - } - - // Initialise partitions - void initialisePartitions() { - for (uint32_t threadId = 0; threadId < TinselMaxThreads; threadId++) { - // Next pointers for each partition - uint32_t nextVMem = 0; - // Pointer to thread structure - PThread* thread = - (PThread*) &threadMem[threadId][0]; - // Set number of devices on thread - thread->numDevices = numDevicesOnThread[threadId]; - // Set number of devices in graph - thread->numVertices = numDevices; - // Set tinsel address of array of device states - thread->devices = vertexMemBase[threadId]; - // Set tinsel address of base of in-edge table - thread->inTableBase = inEdgeMemBase[threadId]; - // Add space for each device on thread - uint32_t numDevs = numDevicesOnThread[threadId]; - for (uint32_t devNum = 0; devNum < numDevs; devNum++) { - PState* dev = (PState*) &vertexMem[threadId][nextVMem]; - PDeviceId id = fromDeviceAddr[threadId][devNum]; - devices[id] = dev; - // Add space for device - nextVMem = nextVMem + sizeof(PState); - } - // Initialise each device and the thread's out edges - for (uint32_t devNum = 0; devNum < numDevs; devNum++) { - PDeviceId id = fromDeviceAddr[threadId][devNum]; - PState* dev = devices[id]; - // Initialise - for (uint32_t p 
= 0; p < POLITE_NUM_PINS; p++) { - dev->pin[p] = keyTable[id][p]; - } - } - // Intialise thread's in edges - PInEdge* inEdgeArray = (PInEdge*) inEdgeMem[threadId]; - Seq>* edges = inTable[threadId]; - if (edges) - for (uint32_t i = 0; i < edges->numElems; i++) { - inEdgeArray[i] = edges->elems[i]; - } - // At this point, check that next pointers line up with heap sizes - if (nextVMem != vertexMemSize[threadId]) { - printf("Error: vertex mem size does not match pre-computed size\n"); - exit(EXIT_FAILURE); - } - // Set tinsel address of senders array - thread->senders = vertexMemBase[threadId] + nextVMem; - } - } - - // Allocate mapping structures - void allocateMapping() { - devices = (PState**) calloc(numDevices, sizeof(PState*)); - toDeviceAddr = (PDeviceAddr*) calloc(numDevices, sizeof(PDeviceAddr)); - fromDeviceAddr = (PDeviceId**) calloc(TinselMaxThreads, sizeof(PDeviceId*)); - numDevicesOnThread = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t)); - } - - // Allocate thread edge input and output tables - // (Only valid after mapper is called) - void allocateInOutTables() { - // Receiver-side tables - inTable = (Seq>**) - calloc(TinselMaxThreads,sizeof(Seq>*)); - for (uint32_t t = 0; t < TinselMaxThreads; t++) { - if (numDevicesOnThread[t] != 0) - inTable[t] = new SmallSeq>; - } - - // Sender-side tables - outTable = (Seq***) - calloc(numDevices, sizeof(Seq**)); - for (uint32_t d = 0; d < numDevices; d++) { - outTable[d] = (Seq**) - calloc(POLITE_NUM_PINS, sizeof(Seq*)); - for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) - outTable[d][p] = new SmallSeq; - } - - keyTable = new uint32_t* [numDevices]; - for (uint32_t d = 0; d < numDevices; d++) - keyTable[d] = new uint32_t [POLITE_NUM_PINS]; - } - - // Compute thread edge input and output tables - // (Only valid after mapper is called) - void computeInOutTables() { - // For each device - for (uint32_t d = 0; d < numDevices; d++) { - // For each pin - for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) { - Seq* 
dests = graph.outgoing->elems[d]; - Seq* edges = edgeLabels.elems[d]; - for (uint32_t i = 0; i < dests->numElems; i++) { - PDeviceId destId = dests->elems[i]; - // Destination thread id - uint32_t threadId = getThreadId(toDeviceAddr[destId]); - // Thread-local device id - uint32_t devId = getLocalDeviceId(toDeviceAddr[destId]); - // Add edge to thread's input table - uint32_t edgeId = inTable[threadId]->numElems; - PInEdge edge; - edge.edge = edges->elems[i]; - inTable[threadId]->append(edge); - // Add output table entry - PRoutingDest rdest; - rdest.kind = PRDestKindURM1; - rdest.mbox = threadId >> TinselLogThreadsPerMailbox; - rdest.urm1.key = devId | (edgeId << 16); - rdest.urm1.threadId = threadId & - ((1<append(rdest); - } - } - } - } - - // Release all structures - void releaseAll() { - if (devices != NULL) { - free(devices); - free(toDeviceAddr); - free(numDevicesOnThread); - for (uint32_t t = 0; t < TinselMaxThreads; t++) - if (fromDeviceAddr[t] != NULL) free(fromDeviceAddr[t]); - free(fromDeviceAddr); - for (uint32_t t = 0; t < TinselMaxThreads; t++) - if (vertexMem[t] != NULL) free(vertexMem[t]); - free(vertexMem); - free(vertexMemSize); - free(vertexMemBase); - for (uint32_t t = 0; t < TinselMaxThreads; t++) - if (threadMem[t] != NULL) free(threadMem[t]); - free(threadMem); - free(threadMemSize); - free(threadMemBase); - for (uint32_t t = 0; t < TinselMaxThreads; t++) - if (inEdgeMem[t] != NULL) free(inEdgeMem[t]); - free(inEdgeMem); - free(inEdgeMemSize); - free(inEdgeMemBase); - } - if (inTable != NULL) { - for (uint32_t t = 0; t < TinselMaxThreads; t++) - if (inTable[t] != NULL) delete inTable[t]; - free(inTable); - inTable = NULL; - } - if (outTable != NULL) { - for (uint32_t d = 0; d < numDevices; d++) { - if (outTable[d] == NULL) continue; - for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) - delete outTable[d][p]; - free(outTable[d]); - } - free(outTable); - outTable = NULL; - } - if (keyTable != NULL) { - for (uint32_t d = 0; d < numDevices; d++) 
delete [] keyTable[d]; - delete [] keyTable; - keyTable = NULL; - } - if (routingTables != NULL) delete routingTables; - } - - // Implement mapping to tinsel threads - void map() { - // Let's measure some times - struct timeval placementStart, placementFinish; - struct timeval routingStart, routingFinish; - struct timeval initStart, initFinish; - - // Release all mapping and heap structures - releaseAll(); - - // Reallocate mapping structures - allocateMapping(); - - // Start placement timer - gettimeofday(&placementStart, NULL); - - // Partition into subgraphs, one per board - Placer boards(&graph, numBoardsX, numBoardsY); - - // Place subgraphs onto 2D mesh - const uint32_t placerEffort = 8; - boards.place(placerEffort); - - // For each board - for (uint32_t boardY = 0; boardY < numBoardsY; boardY++) { - for (uint32_t boardX = 0; boardX < numBoardsX; boardX++) { - // Partition into subgraphs, one per mailbox - PartitionId b = boards.mapping[boardY][boardX]; - Placer boxes(&boards.subgraphs[b], - TinselMailboxMeshXLen, TinselMailboxMeshYLen); - boxes.place(placerEffort); - - // For each mailbox - for (uint32_t boxX = 0; boxX < TinselMailboxMeshXLen; boxX++) { - for (uint32_t boxY = 0; boxY < TinselMailboxMeshYLen; boxY++) { - // Partition into subgraphs, one per thread - uint32_t numThreads = 1<incoming->numElems; - numDevicesOnThread[threadId] = numDevs; - fromDeviceAddr[threadId] = (PDeviceId*) - malloc(sizeof(PDeviceId) * numDevs); - for (uint32_t devNum = 0; devNum < numDevs; devNum++) - fromDeviceAddr[threadId][devNum] = g->labels->elems[devNum]; - - // Populate toDeviceAddr mapping - assert(numDevs < maxLocalDeviceId()); - for (uint32_t devNum = 0; devNum < numDevs; devNum++) { - PDeviceAddr devAddr = - makeDeviceAddr(threadId, devNum); - toDeviceAddr[g->labels->elems[devNum]] = devAddr; - } - } - } - } - } - } - - // Stop placement timer and start In/Out table timer - gettimeofday(&placementFinish, NULL); - gettimeofday(&routingStart, NULL); - - // Compute 
send and receive side routing tables - allocateInOutTables(); - computeInOutTables(); - - // Compute per-board programmable routing tables - routingTables = new ProgRouterMesh(numBoardsX, numBoardsY); - for (uint32_t d = 0; d < numDevices; d++) { - uint32_t src = getThreadId(toDeviceAddr[d]) >> - TinselLogThreadsPerMailbox; - for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) - keyTable[d][p] = routingTables->addDestsFromBoard(src, outTable[d][p]); - } - - // Stop routing timer and start init timer - gettimeofday(&routingFinish, NULL); - gettimeofday(&initStart, NULL); - - // Reallocate and initialise heap structures - allocatePartitions(); - initialisePartitions(); - - // Display times, if chatty - gettimeofday(&initFinish, NULL); - if (chatty > 0) { - struct timeval diff; - - timersub(&placementFinish, &placementStart, &diff); - double duration = (double) diff.tv_sec + - (double) diff.tv_usec / 1000000.0; - printf("POLite mapper profile:\n"); - printf(" Partitioning and placement: %lfs\n", duration); - - timersub(&routingFinish, &routingStart, &diff); - duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; - printf(" In/Out table construction: %lfs\n", duration); - - timersub(&initFinish, &initStart, &diff); - duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; - printf(" Thread state initialisation: %lfs\n", duration); - } - } - - // Constructor - PGraph() { - char* str = getenv("HOSTLINK_BOXES_X"); - int x = str ? atoi(str) : 1; - x = x * TinselMeshXLenWithinBox; - str = getenv("HOSTLINK_BOXES_Y"); - int y = str ? 
atoi(str) : 1; - y = y * TinselMeshYLenWithinBox; - constructor(x, y); - } - PGraph(uint32_t numBoxesX, uint32_t numBoxesY) { - int x = numBoxesX * TinselMeshXLenWithinBox; - int y = numBoxesY * TinselMeshYLenWithinBox; - constructor(x, y); - } - - // Deconstructor - ~PGraph() { - releaseAll(); - for (uint32_t i = 0; i < edgeLabels.numElems; i++) - delete edgeLabels.elems[i]; - } - - // Write partition to tinsel machine - void writeRAM(HostLink* hostLink, - uint8_t** heap, uint32_t* heapSize, uint32_t* heapBase) { - // Number of bytes written by each thread - uint32_t* writeCount = (uint32_t*) - calloc(TinselMaxThreads, sizeof(uint32_t)); - - // Number of threads completed by each core - uint32_t*** threadCount = (uint32_t***) - calloc(meshLenX, sizeof(uint32_t**)); - for (uint32_t x = 0; x < meshLenX; x++) { - threadCount[x] = (uint32_t**) - calloc(meshLenY, sizeof(uint32_t*)); - for (uint32_t y = 0; y < meshLenY; y++) - threadCount[x][y] = (uint32_t*) - calloc(TinselCoresPerBoard, sizeof(uint32_t)); - } - - // Initialise write addresses - for (int x = 0; x < meshLenX; x++) - for (int y = 0; y < meshLenY; y++) - for (int c = 0; c < TinselCoresPerBoard; c++) - hostLink->setAddr(x, y, c, heapBase[hostLink->toAddr(x, y, c, 0)]); - - // Write heaps - uint32_t done = false; - while (! 
done) { - done = true; - for (int x = 0; x < meshLenX; x++) { - for (int y = 0; y < meshLenY; y++) { - for (int c = 0; c < TinselCoresPerBoard; c++) { - uint32_t t = threadCount[x][y][c]; - if (t < TinselThreadsPerCore) { - done = false; - uint32_t threadId = hostLink->toAddr(x, y, c, t); - uint32_t written = writeCount[threadId]; - if (written == heapSize[threadId]) { - threadCount[x][y][c] = t+1; - if ((t+1) < TinselThreadsPerCore) - hostLink->setAddr(x, y, c, - heapBase[hostLink->toAddr(x, y, c, t+1)]); - } else { - uint32_t send = min((heapSize[threadId] - written)>>2, 15); - hostLink->store(x, y, c, send, - (uint32_t*) &heap[threadId][written]); - writeCount[threadId] = written + send * sizeof(uint32_t); - } - } - } - } - } - } - - // Release memory - free(writeCount); - for (uint32_t x = 0; x < meshLenX; x++) { - for (uint32_t y = 0; y < meshLenY; y++) - free(threadCount[x][y]); - free(threadCount[x]); - } - free(threadCount); - } - - // Write graph to tinsel machine - void write(HostLink* hostLink) { - // Start timer - struct timeval start, finish; - gettimeofday(&start, NULL); - - bool useSendBufferOld = hostLink->useSendBuffer; - hostLink->useSendBuffer = true; - writeRAM(hostLink, vertexMem, vertexMemSize, vertexMemBase); - writeRAM(hostLink, threadMem, threadMemSize, threadMemBase); - writeRAM(hostLink, inEdgeMem, inEdgeMemSize, inEdgeMemBase); - routingTables->write(hostLink); - hostLink->flush(); - hostLink->useSendBuffer = useSendBufferOld; - - // Display time if chatty - gettimeofday(&finish, NULL); - if (chatty > 0) { - struct timeval diff; - timersub(&finish, &start, &diff); - double duration = (double) diff.tv_sec + - (double) diff.tv_usec / 1000000.0; - printf("POLite graph upload time: %lfs\n", duration); - } - } - - // Determine fan-in of given device - uint32_t fanIn(PDeviceId id) { - return graph.fanIn(id); - } - - // Determine fan-out of given device - uint32_t fanOut(PDeviceId id) { - return graph.fanOut(id); - } - -}; - -// Read 
performance stats and store in file -inline void politeSaveStats(HostLink* hostLink, const char* filename) { - #ifdef POLITE_DUMP_STATS - // Open file for performance counters - FILE* statsFile = fopen(filename, "wt"); - if (statsFile == NULL) { - printf("Error creating stats file\n"); - exit(EXIT_FAILURE); - } - uint32_t meshLenX = hostLink->meshXLen; - uint32_t meshLenY = hostLink->meshYLen; - // Number of caches - uint32_t numLines = meshLenX * meshLenY * - TinselDCachesPerDRAM * TinselDRAMsPerBoard; - // Add on number of cores - numLines += meshLenX * meshLenY * TinselCoresPerBoard; - // Add on number of threads - #ifdef POLITE_COUNT_MSGS - numLines += meshLenX * meshLenY * TinselThreadsPerBoard; - #endif - hostLink->dumpStdOut(statsFile, numLines); - fclose(statsFile); - #endif -} - -#endif diff --git a/include/POLite/Hybrid/PDevice.h b/include/POLite/Hybrid/PDevice.h deleted file mode 100644 index d46bd7b4..00000000 --- a/include/POLite/Hybrid/PDevice.h +++ /dev/null @@ -1,321 +0,0 @@ -// SPDX-License-Identifier: BSD-2-Clause -#ifndef _PDEVICE_H_ -#define _PDEVICE_H_ - -#include -#include -#include - -#ifdef TINSEL - #include - #define PTR(t) t* -#else - #include - #define PTR(t) uint32_t -#endif - -// Use this to align on half-cache-line boundary -#define ALIGNED __attribute__((aligned(1<<(TinselLogBytesPerLine-1)))) - -// This is a static limit on the number of pins per device -#ifndef POLITE_NUM_PINS -#define POLITE_NUM_PINS 1 -#endif - -// Macros for performance stats -// POLITE_DUMP_STATS - dump performance stats on termination -// POLITE_COUNT_MSGS - include message counts of performance stats - -// Thread-local device id -typedef uint16_t PLocalDeviceId; -#define InvalidLocalDevId 0xffff -#define UnusedLocalDevId 0xfffe - -// Thread id -typedef uint32_t PThreadId; - -// Device address -// Bits 17->0: thread id -// Bit 18: invalid address -// Bits 31->19: thread-local device id -typedef uint32_t PDeviceAddr; - -// Device address constructors -inline 
PDeviceAddr invalidDeviceAddr() { return 0x40000; } -inline PDeviceAddr makeDeviceAddr(PThreadId t, PLocalDeviceId d) { - return (d << 19) | t; -} - -// Device address deconstructors -inline bool isValidDeviceAddr(PDeviceAddr addr) { return !(addr & 0x40000); } -inline PThreadId getThreadId(PDeviceAddr addr) { return addr & 0x3ffff; } -inline PLocalDeviceId getLocalDeviceId(PDeviceAddr addr) { return addr >> 19; } - -// What's the max allowed local device address? -inline uint32_t maxLocalDeviceId() { return 8192; } - -// Index into the per-thread in-edge table -typedef uint16_t InTableKey; - -// Pins -// No - means 'not ready to send' -// HostPin - means 'send to host' -// Pin(n) - means 'send to application pin number n' -typedef uint8_t PPin; -#define No 0 -#define HostPin 1 -#define Pin(n) ((n)+2) - -// For template arguments that are not used -struct None {}; - -// Generic device structure -// Type parameters: -// S - State -// E - Edge label -// M - Message structure -template struct PDevice { - // State - S* s; - PPin* readyToSend; - uint32_t numVertices; - uint16_t time; - - // Handlers - void init(); - void send(volatile M* msg); - void recv(M* msg, E* edge); - bool step(); - bool finish(volatile M* msg); -}; - -// Generic device state structure -template struct ALIGNED PState { - // Board-level routing key for each outgoing pin - uint32_t pin[POLITE_NUM_PINS]; - // Ready-to-send status - PPin readyToSend; - // Custom state - S state; -}; - -// Message structure -template struct PMessage { - // Source-based routing key - InTableKey key; - // Application message - M payload; -}; - -// An incoming edge to a device (labelleled) -template struct PInEdge { - // Destination device - PLocalDeviceId devId; - // Edge info - E edge; -}; - -// An incoming edge to a device (unlabelleled) -template <> struct PInEdge { - union { - // Destination device - PLocalDeviceId devId; - // Unused - None edge; - }; -}; - -// Generic thread structure -template struct PThread { - - 
// Number of devices handled by thread - PLocalDeviceId numDevices; - // Number of times step handler has been called - uint16_t time; - // Number of devices in graph - uint32_t numVertices; - // Pointer to array of device states - PTR(PState) devices; - // Pointer to base of in table - PTR(PInEdge) inTableBase; - // Array of local device ids are ready to send - PTR(PLocalDeviceId) senders; - // This array is accessed in a LIFO manner - PTR(PLocalDeviceId) sendersTop; - - // Count number of messages sent - #ifdef POLITE_COUNT_MSGS - // Total messages sent - uint32_t msgsSent; - // Total messages received - uint32_t msgsReceived; - // Number of times we wanted to send but couldn't - uint32_t blockedSends; - #endif - - #ifdef TINSEL - - // Helper function to construct a device - INLINE DeviceType getDevice(uint32_t id) { - DeviceType dev; - dev.s = &devices[id].state; - dev.readyToSend = &devices[id].readyToSend; - dev.numVertices = numVertices; - dev.time = time; - return dev; - } - - // Dump performance counter stats over UART - void dumpStats() { - tinselPerfCountStop(); - uint32_t me = tinselId(); - // Per-cache performance counters - uint32_t cacheMask = (1 << - (TinselLogThreadsPerCore + TinselLogCoresPerDCache)) - 1; - if ((me & cacheMask) == 0) { - printf("H:%x,M:%x,W:%x\n", - tinselHitCount(), - tinselMissCount(), - tinselWritebackCount()); - } - // Per-core performance counters - uint32_t coreMask = (1 << (TinselLogThreadsPerCore)) - 1; - if ((me & coreMask) == 0) { - printf("C:%x %x,I:%x %x\n", - tinselCycleCountU(), tinselCycleCount(), - tinselCPUIdleCountU(), tinselCPUIdleCount()); - } - // Per-thread performance counters - #ifdef POLITE_COUNT_MSGS - uint32_t intraBoardId = me & ((1<)-1) >> TinselLogBytesPerFlit); - - // Event loop - while (1) { - // Try to send - if (sendersTop != senders) { - if (tinselCanSend()) { - // Get next sender - PLocalDeviceId src = *(--sendersTop); - // Lookup device - DeviceType dev = getDevice(src); - PPin pin = 
*dev.readyToSend; - // Invoke send handler - PMessage* m = (PMessage*) tinselSendSlot(); - dev.send(&m->payload); - // Reinsert sender, if it still wants to send - if (*dev.readyToSend != No) sendersTop++; - // Is it a send to the host pin or a user pin? - if (pin == HostPin) - tinselSend(tinselHostId(), m); - else - tinselKeySend(devices[src].pin[pin-2], m); - #ifdef POLITE_COUNT_MSGS - msgsSent++; - #endif - } - else { - #ifdef POLITE_COUNT_MSGS - blockedSends++; - #endif - tinselWaitUntil(TINSEL_CAN_SEND|TINSEL_CAN_RECV); - } - } - else { - // Idle detection - int idle = tinselIdle(!active); - if (idle > 1) - break; - else if (idle) { - active = false; - for (uint32_t i = 0; i < numDevices; i++) { - DeviceType dev = getDevice(i); - // Invoke the step handler for each device - active = dev.step() || active; - // Device ready to send? - if (*dev.readyToSend != No) { - *(sendersTop++) = i; - } - } - time++; - } - } - - // Step 2: try to receive - while (tinselCanRecv()) { - PMessage* inMsg = (PMessage*) tinselRecv(); - PInEdge* inEdge = &inTableBase[inMsg->key]; - while (inEdge->devId != InvalidLocalDevId) { - // Lookup destination device - PLocalDeviceId id = inEdge->devId; - DeviceType dev = getDevice(id); - // Was it ready to send? 
- PPin oldReadyToSend = *dev.readyToSend; - // Invoke receive handler - dev.recv(&inMsg->payload, &inEdge->edge); - // Insert device into a senders array, if not already there - if (*dev.readyToSend != No && oldReadyToSend == No) - *(sendersTop++) = id; - inEdge++; - #ifdef POLITE_COUNT_MSGS - msgsReceived++; - #endif - } - tinselFree(inMsg); - } - } - - // Termination - #ifdef POLITE_DUMP_STATS - dumpStats(); - #endif - - // Invoke finish handler for each device - for (uint32_t i = 0; i < numDevices; i++) { - DeviceType dev = getDevice(i); - tinselWaitUntil(TINSEL_CAN_SEND); - PMessage* m = (PMessage*) tinselSendSlot(); - if (dev.finish(&m->payload)) tinselSend(tinselHostId(), m); - } - - // Sleep - tinselWaitUntil(TINSEL_CAN_RECV); while (1); - } - - #endif - -}; - -#endif diff --git a/include/POLite/Hybrid/PGraph.h b/include/POLite/Hybrid/PGraph.h deleted file mode 100644 index 126eaa97..00000000 --- a/include/POLite/Hybrid/PGraph.h +++ /dev/null @@ -1,854 +0,0 @@ -// SPDX-License-Identifier: BSD-2-Clause -#ifndef _PGRAPH_H_ -#define _PGRAPH_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// Nodes of a POETS graph are devices -typedef NodeId PDeviceId; - -// This structure holds a group of receiving edges on a thread. -// All of the edges originate from the same output pin. 
-template struct PReceiverGroup { - // Thread id where all the receivers reside - uint32_t threadId; - // A sequence of receiving devices on that thread - Seq>* receivers; -}; - -// POETS graph -template class PGraph { - private: - // Align address to 2^n byte boundary - inline uint32_t align(uint32_t n, uint32_t addr) { - if ((addr & (1<> n) + 1) << n; - } - - // Align address to 32-bit word boundary - uint32_t wordAlign(uint32_t addr) { return align(2, addr); } - - // Align address to cache-line boundary - uint32_t cacheAlign(uint32_t addr) { - return align(TinselLogBytesPerLine, addr); - } - - // Helper function - inline uint32_t min(uint32_t x, uint32_t y) { return x < y ? x : y; } - - // Number of FPGA boards available - uint32_t meshLenX; - uint32_t meshLenY; - - // Number of FPGA boards to use - uint32_t numBoardsX; - uint32_t numBoardsY; - - // Out table (sender-side edge tables) - // Sequence of destinations for every (device, pin) pair - Seq*** outTable; - - // Key table (sender-side key tables) - // Global routing key for every (device, pin) pair - uint32_t** keyTable; - - // In table (receiver-side edge tables) - // Sequence of incoming edges for every thread - Seq>** inTable; - - // Mesh of per-board programmable routers - ProgRouterMesh* routingTables; - - // Generic constructor - void constructor(uint32_t lenX, uint32_t lenY) { - meshLenX = lenX; - meshLenY = lenY; - char* str = getenv("POLITE_BOARDS_X"); - int nx = str ? atoi(str) : meshLenX; - str = getenv("POLITE_BOARDS_Y"); - int ny = str ? 
atoi(str) : meshLenY; - setNumBoards(nx, ny); - numDevices = 0; - devices = NULL; - toDeviceAddr = NULL; - numDevicesOnThread = NULL; - fromDeviceAddr = NULL; - vertexMem = NULL; - vertexMemSize = NULL; - vertexMemBase = NULL; - inEdgeMem = NULL; - inEdgeMemSize = NULL; - inEdgeMemBase = NULL; - mapVerticesToDRAM = false; - mapInEdgesToDRAM = true; - outTable = NULL; - keyTable = NULL; - inTable = NULL; - routingTables = NULL; - chatty = 0; - str = getenv("POLITE_CHATTY"); - if (str != NULL) { - chatty = !strcmp(str, "0") ? 0 : 1; - } - } - - public: - // Number of devices - uint32_t numDevices; - - // Graph containing device ids and connections - Graph graph; - - // Edge labels: has same structure as graph.outgoing - Seq*> edgeLabels; - - // Mapping from device id to device state - // (Not valid until the mapper is called) - PState** devices; - - // Mapping from thread id to number of devices on that thread - // (Not valid until the mapper is called) - uint32_t* numDevicesOnThread; - - // Mapping from device id to device address and back - // (Not valid until the mapper is called) - PDeviceAddr* toDeviceAddr; // Device id -> device address - PDeviceId** fromDeviceAddr; // Device address -> device id - - // Each thread's vertex mem and thread mem regions - // (Not valid until the mapper is called) - uint8_t** vertexMem; uint8_t** threadMem; - uint32_t* vertexMemSize; uint32_t* threadMemSize; - uint32_t* vertexMemBase; uint32_t* threadMemBase; - - // Each thread's in-edge tables - // (Not valid until the mapper is called) - uint8_t** inEdgeMem; - uint32_t* inEdgeMemSize; - uint32_t* inEdgeMemBase; - - // Where to map the various regions - // (If false, map to SRAM instead) - bool mapVerticesToDRAM; - bool mapInEdgesToDRAM; - - // Allow mapper to print useful information to stdout - uint32_t chatty; - - // Setter for number of boards to use - void setNumBoards(uint32_t x, uint32_t y) { - if (x > meshLenX || y > meshLenY) { - printf("Mapper: %d x %d boards requested, 
%d x %d available\n", - numBoardsX, numBoardsY, meshLenX, meshLenY); - exit(EXIT_FAILURE); - } - numBoardsX = x; - numBoardsY = y; - } - - // Create new device - inline PDeviceId newDevice() { - edgeLabels.append(new SmallSeq); - numDevices++; - return graph.newNode(); - } - - // Add a connection between devices - inline void addEdge(PDeviceId from, PinId pin, PDeviceId to) { - if (pin >= POLITE_NUM_PINS) { - printf("addEdge: pin exceeds POLITE_NUM_PINS\n"); - exit(EXIT_FAILURE); - } - graph.addEdge(from, pin, to); - E edge; - edgeLabels.elems[from]->append(edge); - } - - // Add labelled edge using given output pin - void addLabelledEdge(E edge, PDeviceId x, PinId pin, PDeviceId y) { - if (pin >= POLITE_NUM_PINS) { - printf("addEdge: pin exceeds POLITE_NUM_PINS\n"); - exit(EXIT_FAILURE); - } - graph.addEdge(x, pin, y); - edgeLabels.elems[x]->append(edge); - } - - // Allocate SRAM and DRAM partitions - void allocatePartitions() { - // Decide a maximum partition size that is reasonable - // SRAM: Partition size minus 2048 bytes for the stack - uint32_t maxSRAMSize = (1<)); - // Add space for devices - uint32_t numDevs = numDevicesOnThread[threadId]; - for (uint32_t devNum = 0; devNum < numDevs; devNum++) { - // Add space for device - sizeVMem = sizeVMem + sizeof(PState); - } - // Add space for incoming edge table - if (inTable[threadId]) { - sizeEIMem = inTable[threadId]->numElems * sizeof(PInEdge); - sizeEIMem = wordAlign(sizeEIMem); - } - // The total partition size including uninitialised portions - uint32_t totalSizeVMem = - sizeVMem + wordAlign(sizeof(PLocalDeviceId) * numDevs); - // Check that total size is reasonable - uint32_t totalSizeSRAM = sizeTMem; - uint32_t totalSizeDRAM = 0; - if (mapVerticesToDRAM) totalSizeDRAM += totalSizeVMem; - else totalSizeSRAM += totalSizeVMem; - if (mapInEdgesToDRAM) totalSizeDRAM += sizeEIMem; - else totalSizeSRAM += sizeEIMem; - if (totalSizeDRAM > maxDRAMSize) { - printf("Error: max DRAM partition size exceeded\n"); - 
exit(EXIT_FAILURE); - } - if (totalSizeSRAM > maxSRAMSize) { - printf("Error: max SRAM partition size exceeded\n"); - exit(EXIT_FAILURE); - } - // Allocate space for the initialised portion of the partition - assert((sizeVMem%4) == 0); - assert((sizeTMem%4) == 0); - assert((sizeEIMem%4) == 0); - vertexMem[threadId] = (uint8_t*) calloc(sizeVMem, 1); - vertexMemSize[threadId] = sizeVMem; - threadMem[threadId] = (uint8_t*) calloc(sizeTMem, 1); - threadMemSize[threadId] = sizeTMem; - inEdgeMem[threadId] = (uint8_t*) calloc(sizeEIMem, 1); - inEdgeMemSize[threadId] = sizeEIMem; - // Tinsel address of base of partition - uint32_t partId = threadId & (TinselThreadsPerDRAM-1); - uint32_t sramBase = (1 << TinselLogBytesPerSRAM) + - (partId << TinselLogBytesPerSRAMPartition); - uint32_t dramBase = TinselBytesPerDRAM - - ((partId+1) << TinselLogBytesPerDRAMPartition); - // Use partition-interleaved region for DRAM - dramBase |= 0x80000000; - threadMemBase[threadId] = sramBase; - sramBase += threadMemSize[threadId]; - // Determine base addresses of each region - if (mapVerticesToDRAM) { - vertexMemBase[threadId] = dramBase; - dramBase += totalSizeVMem; - } - else { - vertexMemBase[threadId] = sramBase; - sramBase += totalSizeVMem; - } - if (mapInEdgesToDRAM) { - inEdgeMemBase[threadId] = dramBase; - dramBase += sizeEIMem; - } - else { - inEdgeMemBase[threadId] = sramBase; - sramBase += sizeEIMem; - } - } - } - - // Initialise partitions - void initialisePartitions() { - for (uint32_t threadId = 0; threadId < TinselMaxThreads; threadId++) { - // Next pointers for each partition - uint32_t nextVMem = 0; - // Pointer to thread structure - PThread* thread = - (PThread*) &threadMem[threadId][0]; - // Set number of devices on thread - thread->numDevices = numDevicesOnThread[threadId]; - // Set number of devices in graph - thread->numVertices = numDevices; - // Set tinsel address of array of device states - thread->devices = vertexMemBase[threadId]; - // Set tinsel address of base of 
in-edge table - thread->inTableBase = inEdgeMemBase[threadId]; - // Add space for each device on thread - uint32_t numDevs = numDevicesOnThread[threadId]; - for (uint32_t devNum = 0; devNum < numDevs; devNum++) { - PState* dev = (PState*) &vertexMem[threadId][nextVMem]; - PDeviceId id = fromDeviceAddr[threadId][devNum]; - devices[id] = dev; - // Add space for device - nextVMem = nextVMem + sizeof(PState); - } - // Initialise each device and the thread's out edges - for (uint32_t devNum = 0; devNum < numDevs; devNum++) { - PDeviceId id = fromDeviceAddr[threadId][devNum]; - PState* dev = devices[id]; - // Initialise - for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) { - dev->pin[p] = keyTable[id][p]; - } - } - // Intialise thread's in edges - PInEdge* inEdgeArray = (PInEdge*) inEdgeMem[threadId]; - Seq>* edges = inTable[threadId]; - if (edges) - for (uint32_t i = 0; i < edges->numElems; i++) { - inEdgeArray[i] = edges->elems[i]; - } - // At this point, check that next pointers line up with heap sizes - if (nextVMem != vertexMemSize[threadId]) { - printf("Error: vertex mem size does not match pre-computed size\n"); - exit(EXIT_FAILURE); - } - // Set tinsel address of senders array - thread->senders = vertexMemBase[threadId] + nextVMem; - } - } - - // Allocate mapping structures - void allocateMapping() { - devices = (PState**) calloc(numDevices, sizeof(PState*)); - toDeviceAddr = (PDeviceAddr*) calloc(numDevices, sizeof(PDeviceAddr)); - fromDeviceAddr = (PDeviceId**) calloc(TinselMaxThreads, sizeof(PDeviceId*)); - numDevicesOnThread = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t)); - } - - // Allocate thread edge input and output tables - // (Only valid after mapper is called) - void allocateInOutTables() { - // Receiver-side tables - inTable = (Seq>**) - calloc(TinselMaxThreads,sizeof(Seq>*)); - for (uint32_t t = 0; t < TinselMaxThreads; t++) { - if (numDevicesOnThread[t] != 0) - inTable[t] = new SmallSeq>; - } - - // Sender-side tables - outTable = (Seq***) - 
calloc(numDevices, sizeof(Seq**)); - for (uint32_t d = 0; d < numDevices; d++) { - outTable[d] = (Seq**) - calloc(POLITE_NUM_PINS, sizeof(Seq*)); - for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) - outTable[d][p] = new SmallSeq; - } - - keyTable = new uint32_t* [numDevices]; - for (uint32_t d = 0; d < numDevices; d++) - keyTable[d] = new uint32_t [POLITE_NUM_PINS]; - } - - // Pack a receivers array - // Input: an in-edge sequence for each thread in a mailbox. - // Input array may contain lots of holes (0-element sequences) - // Output: a sequence of receiver groups - // Output array contains no empty receiver groups - void createReceiverGroups( - uint32_t mbox, - Seq>* receivers, - Seq>* groups) { - groups->clear(); - for (uint32_t i = 0; i < 64; i++) { - if (receivers[i].numElems > 0) { - // Add receiver group - PReceiverGroup g; - g.threadId = (mbox << TinselLogThreadsPerMailbox) | i; - g.receivers = &receivers[i]; - groups->append(g); - } - } - } - - // Determine in-table key for given set of receivers - // (The key must be the same for all receivers) - uint32_t findInTableKey(Seq>* receivers) { - uint32_t key = 0; - - bool found = false; - while (!found) { - found = true; - for (uint32_t i = 0; i < receivers->numElems; i++) { - PReceiverGroup g = receivers->elems[i]; - uint32_t numReceivers = g.receivers->numElems; - if (numReceivers > 0) { - // Lookup thread id of receiver - uint32_t t = g.threadId; - // Lookup table size for this thread - uint32_t tableSize = inTable[t]->numElems; - // Move to next receiver when we find a space - if (key >= tableSize) continue; - // Is there space at the current key? 
- // (Need space for numReceivers plus null terminator) - bool space = true; - for (int j = 0; j < numReceivers+1; j++) { - if ((key+j) >= tableSize) break; - if (inTable[t]->elems[key+j].devId != UnusedLocalDevId) { - found = false; - key = key+j+1; - break; - } - } - } - } - } - return key; - } - - // Add entries to the input tables for the given receivers - // (Only valid after mapper is called) - uint32_t addInTableEntries(Seq>* receivers) { - uint32_t key = findInTableKey(receivers); - if (key >= 0xfffe) { - printf("In-table routing key exceeds 16 bits\n"); - exit(EXIT_FAILURE); - } - PInEdge null, unused; - null.devId = InvalidLocalDevId; - unused.devId = UnusedLocalDevId; - // Now that a key with sufficient space has been found, populate the tables - for (uint32_t i = 0; i < receivers->numElems; i++) { - PReceiverGroup g = receivers->elems[i]; - uint32_t numReceivers = g.receivers->numElems; - if (numReceivers > 0) { - // Lookup thread id of receiver - uint32_t t = g.threadId; - // Lookup table size for this thread - uint32_t tableSize = inTable[t]->numElems; - // Make sure inTable is big enough for new entries - for (uint32_t j = tableSize; j < (key+numReceivers+1); j++) - inTable[t]->append(unused); - // Add receivers to thread's inTable - for (uint32_t j = 0; j < numReceivers; j++) { - inTable[t]->elems[key+j] = g.receivers->elems[j]; - } - inTable[t]->elems[key+numReceivers] = null; - } - } - return key; - } - - // Compute thread edge input and output tables - // (Only valid after mapper is called) - void computeInOutTables() { - // Routing table stats - uint64_t totalOutEdges = 0; - - // Sequence of local device ids, for each multicast destiation - SmallSeq> receivers[64]; - - // Sequence of receiver groups - // (A more compact representation of the receivers array) - SmallSeq> groups; - - // For each device - for (uint32_t d = 0; d < numDevices; d++) { - // For each pin - for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) { - Seq dests = 
*(graph.outgoing->elems[d]); - Seq edges = *(edgeLabels.elems[d]); - // While destinations are remaining - while (dests.numElems > 0) { - // Clear receivers - for (uint32_t i = 0; i < 64; i++) receivers[i].clear(); - uint32_t threadMaskLow = 0; - uint32_t threadMaskHigh = 0; - // Current mailbox being considered - PDeviceAddr mbox = getThreadId(toDeviceAddr[dests.elems[0]]) >> - TinselLogThreadsPerMailbox; - // For each destination - uint32_t destsRemaining = 0; - for (uint32_t i = 0; i < dests.numElems; i++) { - // Determine destination mailbox address and mailbox-local thread - PDeviceId destId = dests.elems[i]; - PDeviceAddr destAddr = toDeviceAddr[destId]; - uint32_t destMailbox = getThreadId(destAddr) >> - TinselLogThreadsPerMailbox; - uint32_t destThread = getThreadId(destAddr) & - ((1< edge; - edge.devId = getLocalDeviceId(destAddr); - if (! std::is_same::value) edge.edge = edges.elems[i]; - receivers[destThread].append(edge); - if (destThread < 32) threadMaskLow |= 1 << destThread; - if (destThread >= 32) threadMaskHigh |= 1 << (destThread-32); - } - else { - // Add destination back into sequence - dests.elems[destsRemaining] = dests.elems[i]; - edges.elems[destsRemaining] = edges.elems[i]; - destsRemaining++; - } - } - // Create receiver groups - createReceiverGroups(mbox, receivers, &groups); - // Add input table entries - uint32_t key = addInTableEntries(&groups); - // Add output table entry - PRoutingDest edge; - edge.kind = PRDestKindMRM; - edge.mbox = mbox; - edge.mrm.key = key; - edge.mrm.threadMaskLow = threadMaskLow; - edge.mrm.threadMaskHigh = threadMaskHigh; - outTable[d][p]->append(edge); - // Prepare for new output table entry - dests.numElems = destsRemaining; - edges.numElems = destsRemaining; - totalOutEdges++; - } - } - } - //printf("Average edges per pin: %lu\n", - // totalOutEdges / (numDevices * POLITE_NUM_PINS); - } - - // Release all structures - void releaseAll() { - if (devices != NULL) { - free(devices); - free(toDeviceAddr); - 
free(numDevicesOnThread); - for (uint32_t t = 0; t < TinselMaxThreads; t++) - if (fromDeviceAddr[t] != NULL) free(fromDeviceAddr[t]); - free(fromDeviceAddr); - for (uint32_t t = 0; t < TinselMaxThreads; t++) - if (vertexMem[t] != NULL) free(vertexMem[t]); - free(vertexMem); - free(vertexMemSize); - free(vertexMemBase); - for (uint32_t t = 0; t < TinselMaxThreads; t++) - if (threadMem[t] != NULL) free(threadMem[t]); - free(threadMem); - free(threadMemSize); - free(threadMemBase); - for (uint32_t t = 0; t < TinselMaxThreads; t++) - if (inEdgeMem[t] != NULL) free(inEdgeMem[t]); - free(inEdgeMem); - free(inEdgeMemSize); - free(inEdgeMemBase); - } - if (inTable != NULL) { - for (uint32_t t = 0; t < TinselMaxThreads; t++) - if (inTable[t] != NULL) delete inTable[t]; - free(inTable); - inTable = NULL; - } - if (outTable != NULL) { - for (uint32_t d = 0; d < numDevices; d++) { - if (outTable[d] == NULL) continue; - for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) - delete outTable[d][p]; - free(outTable[d]); - } - free(outTable); - outTable = NULL; - } - if (keyTable != NULL) { - for (uint32_t d = 0; d < numDevices; d++) delete [] keyTable[d]; - delete [] keyTable; - keyTable = NULL; - } - if (routingTables != NULL) delete routingTables; - } - - // Implement mapping to tinsel threads - void map() { - // Let's measure some times - struct timeval placementStart, placementFinish; - struct timeval routingStart, routingFinish; - struct timeval initStart, initFinish; - - // Release all mapping and heap structures - releaseAll(); - - // Reallocate mapping structures - allocateMapping(); - - // Start placement timer - gettimeofday(&placementStart, NULL); - - // Partition into subgraphs, one per board - Placer boards(&graph, numBoardsX, numBoardsY); - - // Place subgraphs onto 2D mesh - const uint32_t placerEffort = 8; - boards.place(placerEffort); - - // For each board - for (uint32_t boardY = 0; boardY < numBoardsY; boardY++) { - for (uint32_t boardX = 0; boardX < numBoardsX; 
boardX++) { - // Partition into subgraphs, one per mailbox - PartitionId b = boards.mapping[boardY][boardX]; - Placer boxes(&boards.subgraphs[b], - TinselMailboxMeshXLen, TinselMailboxMeshYLen); - boxes.place(placerEffort); - - // For each mailbox - for (uint32_t boxX = 0; boxX < TinselMailboxMeshXLen; boxX++) { - for (uint32_t boxY = 0; boxY < TinselMailboxMeshYLen; boxY++) { - // Partition into subgraphs, one per thread - uint32_t numThreads = 1<incoming->numElems; - numDevicesOnThread[threadId] = numDevs; - fromDeviceAddr[threadId] = (PDeviceId*) - malloc(sizeof(PDeviceId) * numDevs); - for (uint32_t devNum = 0; devNum < numDevs; devNum++) - fromDeviceAddr[threadId][devNum] = g->labels->elems[devNum]; - - // Populate toDeviceAddr mapping - assert(numDevs < maxLocalDeviceId()); - for (uint32_t devNum = 0; devNum < numDevs; devNum++) { - PDeviceAddr devAddr = - makeDeviceAddr(threadId, devNum); - toDeviceAddr[g->labels->elems[devNum]] = devAddr; - } - } - } - } - } - } - - // Stop placement timer and start In/Out table timer - gettimeofday(&placementFinish, NULL); - gettimeofday(&routingStart, NULL); - - // Compute send and receive side routing tables - allocateInOutTables(); - computeInOutTables(); - - // Compute per-board programmable routing tables - routingTables = new ProgRouterMesh(numBoardsX, numBoardsY); - for (uint32_t d = 0; d < numDevices; d++) { - uint32_t src = getThreadId(toDeviceAddr[d]) >> - TinselLogThreadsPerMailbox; - for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) - keyTable[d][p] = routingTables->addDestsFromBoard(src, outTable[d][p]); - } - - // Stop routing timer and start init timer - gettimeofday(&routingFinish, NULL); - gettimeofday(&initStart, NULL); - - // Reallocate and initialise heap structures - allocatePartitions(); - initialisePartitions(); - - // Display times, if chatty - gettimeofday(&initFinish, NULL); - if (chatty > 0) { - struct timeval diff; - - timersub(&placementFinish, &placementStart, &diff); - double duration = (double) 
diff.tv_sec + - (double) diff.tv_usec / 1000000.0; - printf("POLite mapper profile:\n"); - printf(" Partitioning and placement: %lfs\n", duration); - - timersub(&routingFinish, &routingStart, &diff); - duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; - printf(" In/Out table construction: %lfs\n", duration); - - timersub(&initFinish, &initStart, &diff); - duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; - printf(" Thread state initialisation: %lfs\n", duration); - } - } - - // Constructor - PGraph() { - char* str = getenv("HOSTLINK_BOXES_X"); - int x = str ? atoi(str) : 1; - x = x * TinselMeshXLenWithinBox; - str = getenv("HOSTLINK_BOXES_Y"); - int y = str ? atoi(str) : 1; - y = y * TinselMeshYLenWithinBox; - constructor(x, y); - } - PGraph(uint32_t numBoxesX, uint32_t numBoxesY) { - int x = numBoxesX * TinselMeshXLenWithinBox; - int y = numBoxesY * TinselMeshYLenWithinBox; - constructor(x, y); - } - - // Deconstructor - ~PGraph() { - releaseAll(); - for (uint32_t i = 0; i < edgeLabels.numElems; i++) - delete edgeLabels.elems[i]; - } - - // Write partition to tinsel machine - void writeRAM(HostLink* hostLink, - uint8_t** heap, uint32_t* heapSize, uint32_t* heapBase) { - // Number of bytes written by each thread - uint32_t* writeCount = (uint32_t*) - calloc(TinselMaxThreads, sizeof(uint32_t)); - - // Number of threads completed by each core - uint32_t*** threadCount = (uint32_t***) - calloc(meshLenX, sizeof(uint32_t**)); - for (uint32_t x = 0; x < meshLenX; x++) { - threadCount[x] = (uint32_t**) - calloc(meshLenY, sizeof(uint32_t*)); - for (uint32_t y = 0; y < meshLenY; y++) - threadCount[x][y] = (uint32_t*) - calloc(TinselCoresPerBoard, sizeof(uint32_t)); - } - - // Initialise write addresses - for (int x = 0; x < meshLenX; x++) - for (int y = 0; y < meshLenY; y++) - for (int c = 0; c < TinselCoresPerBoard; c++) - hostLink->setAddr(x, y, c, heapBase[hostLink->toAddr(x, y, c, 0)]); - - // Write heaps - uint32_t done = false; - 
while (! done) { - done = true; - for (int x = 0; x < meshLenX; x++) { - for (int y = 0; y < meshLenY; y++) { - for (int c = 0; c < TinselCoresPerBoard; c++) { - uint32_t t = threadCount[x][y][c]; - if (t < TinselThreadsPerCore) { - done = false; - uint32_t threadId = hostLink->toAddr(x, y, c, t); - uint32_t written = writeCount[threadId]; - if (written == heapSize[threadId]) { - threadCount[x][y][c] = t+1; - if ((t+1) < TinselThreadsPerCore) - hostLink->setAddr(x, y, c, - heapBase[hostLink->toAddr(x, y, c, t+1)]); - } else { - uint32_t send = min((heapSize[threadId] - written)>>2, 15); - hostLink->store(x, y, c, send, - (uint32_t*) &heap[threadId][written]); - writeCount[threadId] = written + send * sizeof(uint32_t); - } - } - } - } - } - } - - // Release memory - free(writeCount); - for (uint32_t x = 0; x < meshLenX; x++) { - for (uint32_t y = 0; y < meshLenY; y++) - free(threadCount[x][y]); - free(threadCount[x]); - } - free(threadCount); - } - - // Write graph to tinsel machine - void write(HostLink* hostLink) { - // Start timer - struct timeval start, finish; - gettimeofday(&start, NULL); - - bool useSendBufferOld = hostLink->useSendBuffer; - hostLink->useSendBuffer = true; - writeRAM(hostLink, vertexMem, vertexMemSize, vertexMemBase); - writeRAM(hostLink, threadMem, threadMemSize, threadMemBase); - writeRAM(hostLink, inEdgeMem, inEdgeMemSize, inEdgeMemBase); - routingTables->write(hostLink); - hostLink->flush(); - hostLink->useSendBuffer = useSendBufferOld; - - // Display time if chatty - gettimeofday(&finish, NULL); - if (chatty > 0) { - struct timeval diff; - timersub(&finish, &start, &diff); - double duration = (double) diff.tv_sec + - (double) diff.tv_usec / 1000000.0; - printf("POLite graph upload time: %lfs\n", duration); - } - } - - // Determine fan-in of given device - uint32_t fanIn(PDeviceId id) { - return graph.fanIn(id); - } - - // Determine fan-out of given device - uint32_t fanOut(PDeviceId id) { - return graph.fanOut(id); - } - -}; - -// Read 
performance stats and store in file -inline void politeSaveStats(HostLink* hostLink, const char* filename) { - #ifdef POLITE_DUMP_STATS - // Open file for performance counters - FILE* statsFile = fopen(filename, "wt"); - if (statsFile == NULL) { - printf("Error creating stats file\n"); - exit(EXIT_FAILURE); - } - uint32_t meshLenX = hostLink->meshXLen; - uint32_t meshLenY = hostLink->meshYLen; - // Number of caches - uint32_t numLines = meshLenX * meshLenY * - TinselDCachesPerDRAM * TinselDRAMsPerBoard; - // Add on number of cores - numLines += meshLenX * meshLenY * TinselCoresPerBoard; - // Add on number of threads - #ifdef POLITE_COUNT_MSGS - numLines += meshLenX * meshLenY * TinselThreadsPerBoard; - #endif - hostLink->dumpStdOut(statsFile, numLines); - fclose(statsFile); - #endif -} - -#endif diff --git a/include/POLite/Local/PDevice.h b/include/POLite/PDevice.h similarity index 80% rename from include/POLite/Local/PDevice.h rename to include/POLite/PDevice.h index 9408cfae..6ba3be83 100644 --- a/include/POLite/Local/PDevice.h +++ b/include/POLite/PDevice.h @@ -22,14 +22,22 @@ #define POLITE_NUM_PINS 1 #endif -// Macros for performance stats +// The local-multicast key points to a list of incoming edges. Some +// of those edges are stored in a header, the rest in an array at a +// different location. The number stored in the header is controlled +// by the following parameter. If it's too low, we risk wasting +// memory bandwidth. If it's too high, we risk wasting memory. +// The minimum value is 0. For large edge state sizes, use 0. 
+#ifndef POLITE_EDGES_PER_HEADER +#define POLITE_EDGES_PER_HEADER 6 +#endif + +// Macros for performance stats: // POLITE_DUMP_STATS - dump performance stats on termination -// POLITE_COUNT_MSGS - include message counts of performance stats +// POLITE_COUNT_MSGS - include message counts in performance stats // Thread-local device id typedef uint16_t PLocalDeviceId; -#define InvalidLocalDevId 0xffff -#define UnusedLocalDevId 0xfffe // Thread id typedef uint32_t PThreadId; @@ -54,7 +62,7 @@ inline PLocalDeviceId getLocalDeviceId(PDeviceAddr addr) { return addr >> 19; } // What's the max allowed local device address? inline uint32_t maxLocalDeviceId() { return 8192; } -// Routing key +// Local multicast key typedef uint16_t Key; #define InvalidKey 0xffff @@ -102,8 +110,8 @@ template struct ALIGNED PState { // Message structure template struct PMessage { - // Source-based routing key - Key key; + // Destination key + uint16_t destKey; // Application message M payload; }; @@ -119,34 +127,26 @@ struct POutEdge { uint32_t threadMaskHigh; }; -// An incoming edge to a device (labelleled) +// An incoming edge to a device template struct PInEdge { // Destination device PLocalDeviceId devId; - // Edge info + // Edge data E edge; }; -// An incoming edge to a device (unlabelleled) -template <> struct PInEdge { - union { - // Destination device - PLocalDeviceId devId; - // Unused - None edge; - }; +// Header for a list of incoming edges (fixed size structure to +// support fast construction/packing of local-multicast tables) +template struct PInHeader { + // Number of receivers + uint16_t numReceivers; + // Pointer to remaining edges in inTableRest, + // if they don't all fit in the header + uint16_t restIndex; + // Edges stored in the header, to make good use of cached data + PInEdge edges[POLITE_EDGES_PER_HEADER]; }; -// Helper function: Count board hops between two threads -inline uint32_t hopsBetween(uint32_t t0, uint32_t t1) { - uint32_t xmask = ((1<> 
(TinselLogThreadsPerBoard + TinselMeshXBits); - int32_t x0 = (t0 >> TinselLogThreadsPerBoard) & xmask; - int32_t y1 = t1 >> (TinselLogThreadsPerBoard + TinselMeshXBits); - int32_t x1 = (t1 >> TinselLogThreadsPerBoard) & xmask; - return (abs(x0-x1) + abs(y0-y1)); -} - // Generic thread structure template struct PThread { @@ -161,7 +161,8 @@ template ) devices; // Pointer to base of routing tables PTR(POutEdge) outTableBase; - PTR(PInEdge) inTableBase; + PTR(PInHeader) inTableHeaderBase; + PTR(PInEdge) inTableRestBase; // Array of local device ids are ready to send PTR(PLocalDeviceId) senders; // This array is accessed in a LIFO manner @@ -169,14 +170,12 @@ template * m = (PMessage*) tinselSendSlot(); // Send message - m->key = outEdge->key; + m->destKey = outEdge->key; tinselMulticast(outEdge->mbox, outEdge->threadMaskHigh, outEdge->threadMaskLow, m); #ifdef POLITE_COUNT_MSGS - interThreadSendCount++; - interBoardSendCount += - hopsBetween(outEdge->mbox << TinselLogThreadsPerMailbox, - tinselId()); + msgsSent++; #endif // Move to next neighbour outEdge++; @@ -329,8 +330,14 @@ template * inMsg = (PMessage*) tinselRecv(); - PInEdge* inEdge = &inTableBase[inMsg->key]; - while (inEdge->devId != InvalidLocalDevId) { + PInHeader* inHeader = &inTableHeaderBase[inMsg->destKey]; + // Determine number and location of edges/receivers + uint32_t numReceivers = inHeader->numReceivers; + PInEdge* inEdge = inHeader->edges; + // For each receiver + for (uint32_t i = 0; i < numReceivers; i++) { + if (i == POLITE_EDGES_PER_HEADER) + inEdge = &inTableRestBase[inHeader->restIndex]; // Lookup destination device PLocalDeviceId id = inEdge->devId; DeviceType dev = getDevice(id); diff --git a/include/POLite/Local/PGraph.h b/include/POLite/PGraph.h similarity index 62% rename from include/POLite/Local/PGraph.h rename to include/POLite/PGraph.h index 5ded656a..c5e5b41f 100644 --- a/include/POLite/Local/PGraph.h +++ b/include/POLite/PGraph.h @@ -12,7 +12,10 @@ #include #include #include 
+#include +#include #include +#include // Nodes of a POETS graph are devices typedef NodeId PDeviceId; @@ -23,9 +26,27 @@ template struct PReceiverGroup { // Thread id where all the receivers reside uint32_t threadId; // A sequence of receiving devices on that thread - Seq>* receivers; + SmallSeq> receivers; }; +// This structure holds info about an edge destination +struct PEdgeDest { + // Index of edge in outgoing edge list + uint32_t index; + // Destination device + PDeviceId dest; + // Address where destination is located + PDeviceAddr addr; +}; + +// Comparison function for PEdgeDest +// (Useful to sort destinations by thread id of destination) +inline int cmpEdgeDest(const void* e0, const void* e1) { + PEdgeDest* d0 = (PEdgeDest*) e0; + PEdgeDest* d1 = (PEdgeDest*) e1; + return getThreadId(d0->addr) < getThreadId(d1->addr); +} + // POETS graph template class PGraph { @@ -58,8 +79,19 @@ template *** outTable; - // Sequence of incoming edges for every thread - Seq>** inTable; + // Sequence of in-edge headers, for each thread + Seq>** inTableHeaders; + // Remaining in-edges that don't fit in the header table, for each thread + Seq>** inTableRest; + // Bitmap denoting used space in header table, for each thread + Bitmap** inTableBitmaps; + + // Programmable routing tables + ProgRouterMesh* progRouterTables; + + // Receiver groups (used internally by some methods, but declared once + // to avoid repeated allocation) + PReceiverGroup groups[TinselThreadsPerMailbox]; // Generic constructor void constructor(uint32_t lenX, uint32_t lenY) { @@ -78,17 +110,24 @@ template ); } - // Add space for incoming edge table - if (inTable[threadId]) { - sizeEIMem = inTable[threadId]->numElems * sizeof(PInEdge); - sizeEIMem = wordAlign(sizeEIMem); + // Add space for incoming edge tables + if (inTableHeaders[threadId]) { + sizeEIHeaderMem = inTableHeaders[threadId]->numElems * + sizeof(PInHeader); + sizeEIHeaderMem = wordAlign(sizeEIHeaderMem); + } + if (inTableRest[threadId]) { + 
sizeEIRestMem = inTableRest[threadId]->numElems * sizeof(PInEdge); + sizeEIRestMem = wordAlign(sizeEIRestMem); } // Add space for outgoing edge table for (uint32_t devNum = 0; devNum < numDevs; devNum++) { @@ -234,8 +288,10 @@ template maxDRAMSize) { @@ -249,14 +305,17 @@ template devices = vertexMemBase[threadId]; // Set tinsel address of base of edge tables thread->outTableBase = outEdgeMemBase[threadId]; - thread->inTableBase = inEdgeMemBase[threadId]; + thread->inTableHeaderBase = inEdgeHeaderMemBase[threadId]; + thread->inTableRestBase = inEdgeRestMemBase[threadId]; // Add space for each device on thread uint32_t numDevs = numDevicesOnThread[threadId]; for (uint32_t devNum = 0; devNum < numDevs; devNum++) { @@ -340,11 +408,18 @@ template * inEdgeArray = (PInEdge*) inEdgeMem[threadId]; - Seq>* edges = inTable[threadId]; + PInHeader* inEdgeHeaderArray = + (PInHeader*) inEdgeHeaderMem[threadId]; + Seq>* headers = inTableHeaders[threadId]; + if (headers) + for (uint32_t i = 0; i < headers->numElems; i++) { + inEdgeHeaderArray[i] = headers->elems[i]; + } + PInEdge* inEdgeRestArray = (PInEdge*) inEdgeRestMem[threadId]; + Seq>* edges = inTableRest[threadId]; if (edges) for (uint32_t i = 0; i < edges->numElems; i++) { - inEdgeArray[i] = edges->elems[i]; + inEdgeRestArray[i] = edges->elems[i]; } // At this point, check that next pointers line up with heap sizes if (nextVMem != vertexMemSize[threadId]) { @@ -371,12 +446,27 @@ template >**) + // Receiver-side tables (headers) + inTableHeaders = (Seq>**) + calloc(TinselMaxThreads,sizeof(Seq>*)); + for (uint32_t t = 0; t < TinselMaxThreads; t++) { + if (numDevicesOnThread[t] != 0) + inTableHeaders[t] = new SmallSeq>; + } + + // Receiver-side tables (rest) + inTableRest = (Seq>**) calloc(TinselMaxThreads,sizeof(Seq>*)); for (uint32_t t = 0; t < TinselMaxThreads; t++) { if (numDevicesOnThread[t] != 0) - inTable[t] = new SmallSeq>; + inTableRest[t] = new SmallSeq>; + } + + // Receiver-side tables (bitmaps) + inTableBitmaps = 
(Bitmap**) calloc(TinselMaxThreads,sizeof(Bitmap*)); + for (uint32_t t = 0; t < TinselMaxThreads; t++) { + if (numDevicesOnThread[t] != 0) + inTableBitmaps[t] = new Bitmap; } // Sender-side tables @@ -389,174 +479,233 @@ template >* receivers, - Seq>* groups) { - groups->clear(); - for (uint32_t i = 0; i < 64; i++) { - if (receivers[i].numElems > 0) { - // Add receiver group - PReceiverGroup g; - g.threadId = (mbox << TinselLogThreadsPerMailbox) | i; - g.receivers = &receivers[i]; - groups->append(g); - } + // Determine local-multicast routing key for given set of receivers + // (The key must be the same for all receivers) + uint32_t findKey(uint32_t numGroups) { + // Fast path (single receiver) + if (numGroups == 1) { + Bitmap* bm = inTableBitmaps[groups[0].threadId]; + return bm->grabNextBit(); } - } - // Determine routing key for given set of receivers - // (The key must be the same for all receivers) - uint32_t findKey(Seq>* receivers) { - uint32_t key = 0; - - bool found = false; - while (!found) { - found = true; - for (uint32_t i = 0; i < receivers->numElems; i++) { - PReceiverGroup g = receivers->elems[i]; - uint32_t numReceivers = g.receivers->numElems; - if (numReceivers > 0) { - // Lookup thread id of receiver - uint32_t t = g.threadId; - // Lookup table size for this thread - uint32_t tableSize = inTable[t]->numElems; - // Move to next receiver when we find a space - if (key >= tableSize) continue; - // Is there space at the current key? 
- // (Need space for numReceivers plus null terminator) - bool space = true; - for (int j = 0; j < numReceivers+1; j++) { - if ((key+j) >= tableSize) break; - if (inTable[t]->elems[key+j].devId != UnusedLocalDevId) { - found = false; - key = key+j+1; - break; - } - } - } + // Determine starting index for key search + uint32_t index = 0; + for (uint32_t i = 0; i < numGroups; i++) { + PReceiverGroup* g = &groups[i]; + Bitmap* bm = inTableBitmaps[g->threadId]; + if (bm->firstFree > index) index = bm->firstFree; + } + + // Find key that is available for all receivers + uint64_t mask; + retry: + mask = 0ul; + for (uint32_t i = 0; i < numGroups; i++) { + PReceiverGroup* g = &groups[i]; + Bitmap* bm = inTableBitmaps[g->threadId]; + mask |= bm->getWord(index); + if (~mask == 0ul) { index++; goto retry; } } + + // Mark key as taken in each bitmap + uint32_t bit = __builtin_ctzll(~mask); + for (uint32_t i = 0; i < numGroups; i++) { + PReceiverGroup* g = &groups[i]; + Bitmap* bm = inTableBitmaps[g->threadId]; + bm->setBit(index, bit); } - return key; + return 64*index + bit; } // Add entries to the input tables for the given receivers // (Only valid after mapper is called) - uint32_t addInTableEntries(Seq>* receivers) { - uint32_t key = findKey(receivers); - if (key >= 0xfffe) { + uint32_t addInTableEntries(uint32_t numGroups) { + uint32_t key = findKey(numGroups); + if (key >= 0xffff) { printf("Routing key exceeds 16 bits\n"); exit(EXIT_FAILURE); } - PInEdge null, unused; - null.devId = InvalidLocalDevId; - unused.devId = UnusedLocalDevId; - // Now that a key with sufficient space has been found, populate the tables - for (uint32_t i = 0; i < receivers->numElems; i++) { - PReceiverGroup g = receivers->elems[i]; - uint32_t numReceivers = g.receivers->numElems; - if (numReceivers > 0) { - // Lookup thread id of receiver - uint32_t t = g.threadId; - // Lookup table size for this thread - uint32_t tableSize = inTable[t]->numElems; - // Make sure inTable is big enough for new 
entries - for (uint32_t j = tableSize; j < (key+numReceivers+1); j++) - inTable[t]->append(unused); - // Add receivers to thread's inTable - for (uint32_t j = 0; j < numReceivers; j++) { - inTable[t]->elems[key+j] = g.receivers->elems[j]; + // Populate inTableHeaders and inTableRest using the key + for (uint32_t i = 0; i < numGroups; i++) { + PReceiverGroup* g = &groups[i]; + uint32_t numEdges = g->receivers.numElems; + PInEdge* edgePtr = g->receivers.elems; + if (numEdges > 0) { + // Determine thread id of receiver + uint32_t t = g->threadId; + // Extend table + Seq>* headers = inTableHeaders[t]; + if (key >= headers->numElems) + headers->extendBy(key + 1 - headers->numElems); + // Fill in header + PInHeader* header = &inTableHeaders[t]->elems[key]; + header->numReceivers = numEdges; + if (inTableRest[t]->numElems > 0xffff) { + printf("In-table index exceeds 16 bits\n"); + exit(EXIT_FAILURE); + } + header->restIndex = inTableRest[t]->numElems; + uint32_t numHeaderEdges = numEdges < POLITE_EDGES_PER_HEADER ? 
+ numEdges : POLITE_EDGES_PER_HEADER; + for (uint32_t j = 0; j < numHeaderEdges; j++) { + header->edges[j] = *edgePtr; + edgePtr++; + } + numEdges -= numHeaderEdges; + // Overflow into rest memory if header not big enough + for (uint32_t j = 0; j < numEdges; j++) { + inTableRest[t]->append(*edgePtr); + edgePtr++; } - inTable[t]->elems[key+numReceivers] = null; } } return key; } + // Split edge list into board-local and non-board-local destinations + // And sort each list by destination thread id + // (Only valid after mapper is called) + void splitDests(PDeviceId devId, PinId pinId, + Seq* local, Seq* nonLocal) { + local->clear(); + nonLocal->clear(); + PDeviceAddr devAddr = toDeviceAddr[devId]; + uint32_t devBoard = getThreadId(devAddr) >> TinselLogThreadsPerBoard; + // Split destinations into local/non-local + Seq* dests = graph.outgoing->elems[devId]; + Seq* pinIds = graph.pins->elems[devId]; + uint32_t index = 0; + for (uint32_t d = 0; d < dests->numElems; d++) { + if (pinIds->elems[d] == pinId) { + PEdgeDest e; + e.index = index++; + e.dest = dests->elems[d]; + e.addr = toDeviceAddr[e.dest]; + uint32_t destBoard = getThreadId(e.addr) >> TinselLogThreadsPerBoard; + if (devBoard == destBoard) + local->append(e); + else + nonLocal->append(e); + } + } + // Sort local list + qsort(local->elems, local->numElems, sizeof(PEdgeDest), cmpEdgeDest); + // Sort non-local list + qsort(nonLocal->elems, nonLocal->numElems, sizeof(PEdgeDest), cmpEdgeDest); + } + + // Compute table updates for destinations for given device + // (Only valid after mapper is called) + void computeTables(Seq* local, uint32_t d, + Seq* out) { + out->clear(); + uint32_t index = 0; + while (index < local->numElems) { + // New set of receiver groups on same mailbox + uint32_t threadMaskLow = 0; + uint32_t threadMaskHigh = 0; + uint32_t nextGroup = 0; + // Current mailbox & thread being considered + PDeviceAddr mbox = getThreadId(local->elems[index].addr) >> + TinselLogThreadsPerMailbox; + uint32_t 
thread = getThreadId(local->elems[index].addr) & + ((1<numElems) { + PEdgeDest* edge = &local->elems[index]; + // Determine destination mailbox address and mailbox-local thread + uint32_t destMailbox = getThreadId(edge->addr) >> + TinselLogThreadsPerMailbox; + uint32_t destThread = getThreadId(edge->addr) & + ((1< in; + in.devId = getLocalDeviceId(edge->addr); + Seq* edges = edgeLabels.elems[d]; + if (! std::is_same::value) + in.edge = edges->elems[edge->index]; + // Update current receiver group + groups[nextGroup].receivers.append(in); + groups[nextGroup].threadId = thread; + if (thread < 32) threadMaskLow |= 1 << thread; + if (thread >= 32) threadMaskHigh |= 1 << (thread-32); + index++; + } + else { + // Start new receiver group + thread = destThread; + nextGroup++; + assert(nextGroup < TinselThreadsPerMailbox); + } + } + else break; + } + // Add input table entries + uint32_t key = addInTableEntries(nextGroup+1); + // Add output entry + PRoutingDest dest; + dest.kind = PRDestKindMRM; + dest.mbox = mbox; + dest.mrm.key = key; + dest.mrm.threadMaskLow = threadMaskLow; + dest.mrm.threadMaskHigh = threadMaskHigh; + out->append(dest); + // Clear receiver groups, for a new iteration + for (uint32_t i = 0; i <= nextGroup; i++) groups[i].receivers.clear(); + } + } + // Compute routing tables // (Only valid after mapper is called) void computeRoutingTables() { - // Routing table stats - uint64_t totalOutEdges = 0; + // Edge destinations (local to sender board, or not) + Seq local; + Seq nonLocal; - // Sequence of local device ids, for each multicast destiation - SmallSeq> receivers[64]; + // Routing destinations + Seq dests; - // Sequence of receiver groups - // (A more compact representation of the receivers array) - SmallSeq> groups; + // Allocate per-board programmable routing tables + progRouterTables = new ProgRouterMesh(numBoardsX, numBoardsY); // For each device for (uint32_t d = 0; d < numDevices; d++) { // For each pin for (uint32_t p = 0; p < POLITE_NUM_PINS; 
p++) { - Seq dests = *(graph.outgoing->elems[d]); - Seq edges = *(edgeLabels.elems[d]); - // While destinations are remaining - while (dests.numElems > 0) { - // Clear receivers - for (uint32_t i = 0; i < 64; i++) receivers[i].clear(); - uint32_t threadMaskLow = 0; - uint32_t threadMaskHigh = 0; - // Current mailbox being considered - PDeviceAddr mbox = getThreadId(toDeviceAddr[dests.elems[0]]) >> - TinselLogThreadsPerMailbox; - // For each destination - uint32_t destsRemaining = 0; - for (uint32_t i = 0; i < dests.numElems; i++) { - // Determine destination mailbox address and mailbox-local thread - PDeviceId destId = dests.elems[i]; - PDeviceAddr destAddr = toDeviceAddr[destId]; - uint32_t destMailbox = getThreadId(destAddr) >> - TinselLogThreadsPerMailbox; - uint32_t destThread = getThreadId(destAddr) & - ((1< edge; - edge.devId = getLocalDeviceId(destAddr); - if (! std::is_same::value) edge.edge = edges.elems[i]; - receivers[destThread].append(edge); - if (destThread < 32) threadMaskLow |= 1 << destThread; - if (destThread >= 32) threadMaskHigh |= 1 << (destThread-32); - } - else { - // Add destination back into sequence - dests.elems[destsRemaining] = dests.elems[i]; - edges.elems[destsRemaining] = edges.elems[i]; - destsRemaining++; - } - } - // Create receiver groups - createReceiverGroups(mbox, receivers, &groups); - // Add input table entries - uint32_t key = addInTableEntries(&groups); - // Add output table entry + // Split edge lists into local/non-local and sort by target thread id + splitDests(d, p, &local, &nonLocal); + // Deal with board-local connections + computeTables(&local, d, &dests); + for (uint32_t i = 0; i < dests.numElems; i++) { + PRoutingDest dest = dests.elems[i]; POutEdge edge; - edge.mbox = mbox; - edge.key = key; - edge.threadMaskLow = threadMaskLow; - edge.threadMaskHigh = threadMaskHigh; + edge.mbox = dest.mbox; + edge.key = dest.mrm.key; + edge.threadMaskLow = dest.mrm.threadMaskLow; + edge.threadMaskHigh = dest.mrm.threadMaskHigh; 
outTable[d][p]->append(edge); - // Prepare for new output table entry - dests.numElems = destsRemaining; - edges.numElems = destsRemaining; - totalOutEdges++; } - // Add output edge terminator + // Deal with non-board-local connections + computeTables(&nonLocal, d, &dests); + uint32_t src = getThreadId(toDeviceAddr[d]) >> + TinselLogThreadsPerMailbox; + uint32_t key = progRouterTables->addDestsFromBoard(src, &dests); + POutEdge edge; + edge.mbox = tinselUseRoutingKey(); + edge.key = 0; + edge.threadMaskLow = key; + edge.threadMaskHigh = 0; + outTable[d][p]->append(edge); + // Add output list terminator POutEdge term; term.key = InvalidKey; outTable[d][p]->append(term); } } - //printf("Average edges per pin: %lu\n", - // totalOutEdges / (numDevices * POLITE_NUM_PINS); - } + } // Release all structures void releaseAll() { @@ -578,21 +727,38 @@ template useSendBuffer = true; writeRAM(hostLink, vertexMem, vertexMemSize, vertexMemBase); writeRAM(hostLink, threadMem, threadMemSize, threadMemBase); - writeRAM(hostLink, inEdgeMem, inEdgeMemSize, inEdgeMemBase); + writeRAM(hostLink, inEdgeHeaderMem, + inEdgeHeaderMemSize, inEdgeHeaderMemBase); + writeRAM(hostLink, inEdgeRestMem, inEdgeRestMemSize, inEdgeRestMemBase); writeRAM(hostLink, outEdgeMem, outEdgeMemSize, outEdgeMemBase); + progRouterTables->write(hostLink); hostLink->flush(); hostLink->useSendBuffer = useSendBufferOld; @@ -838,7 +1008,6 @@ template Date: Tue, 23 Jun 2020 09:04:34 +0000 Subject: [PATCH 73/78] Silly mistakes --- include/POLite/Bitmap.h | 4 ++-- include/POLite/PGraph.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/POLite/Bitmap.h b/include/POLite/Bitmap.h index 262f99af..0a165896 100644 --- a/include/POLite/Bitmap.h +++ b/include/POLite/Bitmap.h @@ -47,9 +47,9 @@ struct Bitmap { // Find index of next zero bit, and flip that bit inline uint32_t grabNextBit() { - uint64_t word = getWord(firstFree); + uint64_t word = ~getWord(firstFree); assert(word != 0ul); - uint32_t bit 
= __builtin_ctzll(~word); + uint32_t bit = __builtin_ctzll(word); setBit(firstFree, bit); return 64*firstFree + bit; } diff --git a/include/POLite/PGraph.h b/include/POLite/PGraph.h index c5e5b41f..5efaa15a 100644 --- a/include/POLite/PGraph.h +++ b/include/POLite/PGraph.h @@ -629,7 +629,7 @@ template elems[edge->index]; // Update current receiver group groups[nextGroup].receivers.append(in); - groups[nextGroup].threadId = thread; + groups[nextGroup].threadId = getThreadId(edge->addr); if (thread < 32) threadMaskLow |= 1 << thread; if (thread >= 32) threadMaskHigh |= 1 << (thread-32); index++; From 8d3807f8fc5ea63c08a6c07f28c6c4655f9f15c5 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 23 Jun 2020 10:13:14 +0000 Subject: [PATCH 74/78] Another mistak --- include/POLite/PGraph.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/POLite/PGraph.h b/include/POLite/PGraph.h index 5efaa15a..909db3ec 100644 --- a/include/POLite/PGraph.h +++ b/include/POLite/PGraph.h @@ -574,11 +574,10 @@ template * dests = graph.outgoing->elems[devId]; Seq* pinIds = graph.pins->elems[devId]; - uint32_t index = 0; for (uint32_t d = 0; d < dests->numElems; d++) { if (pinIds->elems[d] == pinId) { PEdgeDest e; - e.index = index++; + e.index = d; e.dest = dests->elems[d]; e.addr = toDeviceAddr[e.dest]; uint32_t destBoard = getThreadId(e.addr) >> TinselLogThreadsPerBoard; From d309559f37895d5ea7b0f228910b2faec5373060 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 23 Jun 2020 15:53:55 +0000 Subject: [PATCH 75/78] More fixes and tweaks --- include/POLite/Bitmap.h | 9 +++++---- include/POLite/PDevice.h | 10 ++++++++++ include/POLite/PGraph.h | 12 ++++++------ 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/include/POLite/Bitmap.h b/include/POLite/Bitmap.h index 0a165896..9271bc07 100644 --- a/include/POLite/Bitmap.h +++ b/include/POLite/Bitmap.h @@ -47,11 +47,12 @@ struct Bitmap { // Find index of next zero bit, and flip that bit inline 
uint32_t grabNextBit() { - uint64_t word = ~getWord(firstFree); - assert(word != 0ul); - uint32_t bit = __builtin_ctzll(word); + uint64_t word = getWord(firstFree); + assert(~word != 0ul); + uint32_t bit = __builtin_ctzll(~word); + uint32_t result = 64*firstFree + bit; setBit(firstFree, bit); - return 64*firstFree + bit; + return result; } }; diff --git a/include/POLite/PDevice.h b/include/POLite/PDevice.h index 6ba3be83..508207bd 100644 --- a/include/POLite/PDevice.h +++ b/include/POLite/PDevice.h @@ -135,6 +135,16 @@ template struct PInEdge { E edge; }; +// An incoming edge to a device (unlabelled) +template <> struct PInEdge { + union { + // Destination device + PLocalDeviceId devId; + // Unused + None edge; + }; +}; + // Header for a list of incoming edges (fixed size structure to // support fast construction/packing of local-multicast tables) template struct PInHeader { diff --git a/include/POLite/PGraph.h b/include/POLite/PGraph.h index 909db3ec..a1ecc739 100644 --- a/include/POLite/PGraph.h +++ b/include/POLite/PGraph.h @@ -595,23 +595,23 @@ template * local, uint32_t d, + void computeTables(Seq* dests, uint32_t d, Seq* out) { out->clear(); uint32_t index = 0; - while (index < local->numElems) { + while (index < dests->numElems) { // New set of receiver groups on same mailbox uint32_t threadMaskLow = 0; uint32_t threadMaskHigh = 0; uint32_t nextGroup = 0; // Current mailbox & thread being considered - PDeviceAddr mbox = getThreadId(local->elems[index].addr) >> + PDeviceAddr mbox = getThreadId(dests->elems[index].addr) >> TinselLogThreadsPerMailbox; - uint32_t thread = getThreadId(local->elems[index].addr) & + uint32_t thread = getThreadId(dests->elems[index].addr) & ((1<numElems) { - PEdgeDest* edge = &local->elems[index]; + while (index < dests->numElems) { + PEdgeDest* edge = &dests->elems[index]; // Determine destination mailbox address and mailbox-local thread uint32_t destMailbox = getThreadId(edge->addr) >> TinselLogThreadsPerMailbox; From 
0bb89acac23dabbfcef8fa9e4934eabf7da8c5cb Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 23 Jun 2020 20:24:35 +0100 Subject: [PATCH 76/78] Update README --- README.md | 57 +++++++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index e4fdf72c..77883461 100644 --- a/README.md +++ b/README.md @@ -1299,13 +1299,21 @@ by each thread. After mapping, POLite writes the graph into cluster memory and triggers execution. By default, vertex states are written into the off-chip QDRII+ SRAMs, and edge lists are written in the DDR3 DRAMs. -This default behaviour can be modified by setting the boolean flags -`graph.mapVerticesToDRAM`, `graph.mapInEdgesToDRAM`, -`graph.mapOutEdgesToDRAM` accordingly (true means "map to DRAM" and -false means "map to SRAM"). Once the application is up and running, -the host and the graph vertices can continue to communicate: any -vertex can send messages to the host via the `HostPin` or the `finish` -handler, and the host can send messages to any vertex. +This default behaviour can be modified by adjusting the following +flags of the `PGraph` class. + + Flag | Default + ------------------------ | ------- + `mapVerticesToDRAM` | `false` + `mapInEdgeHeadersToDRAM` | `true` + `mapInEdgeRestToDRAM` | `true` + `mapOutEdgesToDRAM` | `true` + +A value of `true` means "map to DRAM", while `false` means "map to +(off-chip) SRAM". Once the application is up and running, the host +and the graph vertices can continue to communicate: any vertex can +send messages to the host via the `HostPin` or the `finish` handler, +and the host can send messages to any vertex. **Softswitch**. Central to POLite is an event loop running on each Tinsel thread, which we call the softswitch as it effectively @@ -1321,20 +1329,12 @@ required, to meet the semantics of the POLite library. before the first instance of `#include `, to control some aspects of POLite behaviour. 
- Macro | Meaning - --------- | ------- - `POLITE_NUM_PINS` | Max number of pins per vertex (default 1) - `POLITE_DUMP_STATS` | Dump stats upon completion - `POLITE_COUNT_MSGS` | Include message counts in stats dump - -POLite supports three mapping modes, also controlled via macros: - - - Macro | Use when graphs have... - --------- | ----------------------- - `POLITE_MAP_LOCAL` | ...lots of local connections and few distributed connections - `POLITE_MAP_DIST` | ...lots of distributed connections and few local connections (this mapper is fast) - `POLITE_MAP_HYBRID` | ...a mix of local and distributed connections (default) + Macro | Meaning + --------- | ------- + `POLITE_NUM_PINS` | Max number of pins per vertex (default 1) + `POLITE_DUMP_STATS` | Dump stats upon completion + `POLITE_COUNT_MSGS` | Include message counts in stats dump + `POLITE_EDGES_PER_HEADER` | Lower this for large edge states (default 6) **POLite dynamic parameters**. The following environment variables can be set, to control some aspects of POLite behaviour. @@ -1346,14 +1346,13 @@ be set, to control some aspects of POLite behaviour. `POLITE_BOARDS_X` | Size of board mesh to use in X dimension `POLITE_BOARDS_Y` | Size of board mesh to use in Y dimension `POLITE_CHATTY` | Set to `1` to enable emission of mapper stats - `POLITE_PLACER` | Use `metis`, `random`, or `direct` placement - -**Limitations**. POLite provides several important features of the -vertex-centric paradigm, but there are lots of limitations and quirks; -it is only intended as a prototype library for hardware evaluation -purposes. One of the features of the Pregel framework is the ability -for vertices to add and remove vertices and edges at runtime -- but -currently, POLite only supports static graphs. + `POLITE_PLACER` | Use `metis`, `random`, `bfs`, or `direct` placement + +**Limitations**. POLite is primarily intended as a prototype library +for hardware evaluation purposes. 
It occupies a single, simple point +in a wider, richer design space. In particular, it doesn't support +dynamic creation of vertices and edges, and it hasn't been optimised +to deal with highly non-uniform fanouts. ## A. DE5-Net Synthesis Report From f2fda2c8598c3a7a9e02d475d3a1fd85a3f5d1ab Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Wed, 24 Jun 2020 08:25:55 +0000 Subject: [PATCH 77/78] Use OpenMP during hierarchical partitioning --- apps/POLite/util/polite.mk | 2 +- include/POLite/PGraph.h | 1 + include/POLite/Placer.h | 13 ++++++++++--- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/apps/POLite/util/polite.mk b/apps/POLite/util/polite.mk index a1d96f83..4abe32ee 100644 --- a/apps/POLite/util/polite.mk +++ b/apps/POLite/util/polite.mk @@ -51,7 +51,7 @@ $(HL)/%.o: $(BUILD)/run: $(RUN_CPP) $(RUN_H) $(HL)/*.o g++ -std=c++11 -O2 -I $(INC) -I $(HL) -o $(BUILD)/run $(RUN_CPP) $(HL)/*.o \ - -lmetis -fno-exceptions + -lmetis -fno-exceptions -fopenmp $(BUILD)/sim: $(RUN_CPP) $(RUN_H) $(HL)/sim/*.o g++ -O2 -I $(INC) -I $(HL) -o $(BUILD)/sim $(RUN_CPP) $(HL)/sim/*.o \ diff --git a/include/POLite/PGraph.h b/include/POLite/PGraph.h index a1ecc739..1a67e2ef 100644 --- a/include/POLite/PGraph.h +++ b/include/POLite/PGraph.h @@ -796,6 +796,7 @@ template #include #include +#include typedef uint32_t PartitionId; @@ -52,6 +53,11 @@ struct Placer { uint32_t* yCoordSaved; uint64_t savedCost; + // Random numbers + unsigned int seed; + void setRand(unsigned int s) { seed = s; }; + int getRand() { return rand_r(&seed); } + // Controls which strategy is used Method method = Default; @@ -160,9 +166,8 @@ struct Placer { uint32_t numParts = width * height; // Populate result array - srand(0); for (uint32_t i = 0; i < numVertices; i++) { - partitions[i] = rand() % numParts; + partitions[i] = getRand() % numParts; } } @@ -308,7 +313,7 @@ struct Placer { // Random mapping for (uint32_t y = 0; y < height; y++) { for (uint32_t x = 0; x < width; x++) { - int index = rand() 
% numPartitions; + int index = getRand() % numPartitions; PartitionId p = pids[index]; mapping[y][x] = p; xCoord[p] = x; @@ -424,6 +429,8 @@ struct Placer { graph = g; width = w; height = h; + // Random seed + setRand(1 + omp_get_thread_num()); // Allocate the partitions array partitions = new PartitionId [g->incoming->numElems]; // Allocate subgraphs From 473d9061d9772e04d9978cf3d8aba6277cb65de9 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Wed, 24 Jun 2020 12:27:13 +0000 Subject: [PATCH 78/78] Set release date --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 77883461..a66aed56 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ Released on 2 Dec 2019 and maintained in the [tinsel-0.7.1 branch](https://github.com/POETSII/tinsel/tree/tinsel-0.7.1). (Local hardware multicast.) * [v0.8](https://github.com/POETSII/tinsel/releases/tag/v0.8): -Released on 18 May 2020 and maintained in the +Released on 24 Jun 2020 and maintained in the [master branch](https://github.com/POETSII/tinsel/). (Global hardware multicast.) @@ -74,9 +74,9 @@ main features are: * **Multithreading**. A critical aspect of the design is to tolerate latency as cleanly as possible. This includes the - latencies arising from: floating-point on Stratix V FPGAs - (tens of cycles); off-chip memories; deep pipelines - (keeping Fmax high); and sharing of resources between cores + latencies arising from floating-point on Stratix V FPGAs + (tens of cycles), off-chip memories, deep pipelines + (keeping Fmax high), and sharing of resources between cores (such as caches, mailboxes, and FPUs). * **Message-passing**. Although there is a requirement to support a