diff --git a/include/POLite.h b/include/POLite.h
index d12a0e73..858b865e 100644
--- a/include/POLite.h
+++ b/include/POLite.h
@@ -6,13 +6,22 @@
 #ifdef TINSEL
   #include 
-  #include 
+  #ifdef POLITE_FAST_MAP
+    #include 
+  #else
+    #include 
+  #endif
 #else
-  #include 
+  #ifdef POLITE_FAST_MAP
+    #include 
+    #include 
+  #else
+    #include 
+    #include 
+  #endif
   #include 
   #include 
   #include 
-  #include 
 #endif
 #endif
diff --git a/include/POLite/FastMap/PDevice.h b/include/POLite/FastMap/PDevice.h
new file mode 100644
index 00000000..f095eba6
--- /dev/null
+++ b/include/POLite/FastMap/PDevice.h
@@ -0,0 +1,302 @@
+// SPDX-License-Identifier: BSD-2-Clause
+#ifndef _PDEVICE_H_
+#define _PDEVICE_H_
+
+#include 
+#include 
+#include 
+
+#ifdef TINSEL
+  #include 
+  #define PTR(t) t*
+#else
+  #include 
+  #define PTR(t) uint32_t
+#endif
+
+// Use this to align on a half-cache-line boundary
+#define ALIGNED __attribute__((aligned(1<<(TinselLogBytesPerLine-1))))
+
+// This is a static limit on the number of pins per device
+#ifndef POLITE_NUM_PINS
+#define POLITE_NUM_PINS 1
+#endif
+
+// Macros for performance stats:
+//   POLITE_DUMP_STATS - dump performance stats on termination
+//   POLITE_COUNT_MSGS - include message counts in performance stats
+
+// Thread-local device id
+typedef uint16_t PLocalDeviceId;
+
+// Thread id
+typedef uint32_t PThreadId;
+
+// Device address
+// Bits 17->0: thread id
+// Bit 18: invalid address
+// Bits 31->19: thread-local device id
+typedef uint32_t PDeviceAddr;
+
+// Device address constructors
+inline PDeviceAddr invalidDeviceAddr() { return 0x40000; }
+inline PDeviceAddr makeDeviceAddr(PThreadId t, PLocalDeviceId d) {
+  return (d << 19) | t;
+}
+
+// Device address deconstructors
+inline bool isValidDeviceAddr(PDeviceAddr addr) { return !(addr & 0x40000); }
+inline PThreadId getThreadId(PDeviceAddr addr) { return addr & 0x3ffff; }
+inline PLocalDeviceId getLocalDeviceId(PDeviceAddr addr) { return addr >> 19; }
+
+// What's the max allowed local device address?
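+// (Bits 31->19 of a device address hold the thread-local device id,
+// i.e. 13 bits, giving a limit of 2^13 = 8192 devices per thread.)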
+inline uint32_t maxLocalDeviceId() { return 8192; }
+
+// Pins
+//   No      - means 'not ready to send'
+//   HostPin - means 'send to host'
+//   Pin(n)  - means 'send to application pin number n'
+typedef uint8_t PPin;
+#define No 0
+#define HostPin 1
+#define Pin(n) ((n)+2)
+
+// For template arguments that are not used
+struct None {};
+
+// Generic device structure
+// Type parameters:
+//   S - State
+//   E - Edge label
+//   M - Message structure
+template <typename S, typename E, typename M> struct PDevice {
+  // State
+  S* s;
+  PPin* readyToSend;
+  uint32_t numVertices;
+  uint16_t time;
+
+  // Handlers
+  void init();
+  void send(volatile M* msg);
+  void recv(M* msg, E* edge);
+  bool step();
+  bool finish(volatile M* msg);
+};
+
+// Generic device state structure
+template <typename S> struct ALIGNED PState {
+  // Board-level routing key for each outgoing pin
+  uint32_t pin[POLITE_NUM_PINS];
+  // Ready-to-send status
+  PPin readyToSend;
+  // Custom state
+  S state;
+};
+
+// Message structure
+template <typename M> struct PMessage {
+  // Destination thread-local device id
+  uint16_t devId;
+  // Id of incoming edge
+  uint16_t edgeId;
+  // Application message
+  M payload;
+};
+
+// An incoming edge to a device
+template <typename E> struct PInEdge {
+  E edge;
+};
+
+// Generic thread structure
+template <typename DeviceType, typename S, typename E, typename M> struct PThread {
+
+  // Number of devices handled by thread
+  PLocalDeviceId numDevices;
+  // Number of times step handler has been called
+  uint16_t time;
+  // Number of devices in graph
+  uint32_t numVertices;
+  // Pointer to array of device states
+  PTR(PState<S>) devices;
+  // Pointer to base of edge table
+  PTR(PInEdge<E>) inTableBase;
+  // Array of local device ids that are ready to send
+  PTR(PLocalDeviceId) senders;
+  // This array is accessed in a LIFO manner
+  PTR(PLocalDeviceId) sendersTop;
+
+  // Count number of messages sent
+  #ifdef POLITE_COUNT_MSGS
+  // Total messages sent
+  uint32_t msgsSent;
+  // Total messages received
+  uint32_t msgsReceived;
+  // Number of times we wanted to send but couldn't
+  uint32_t blockedSends;
+  #endif
+
+  #ifdef TINSEL
+
+  // Helper function to construct a device
+  INLINE DeviceType getDevice(uint32_t id) {
+    DeviceType dev;
+    dev.s = &devices[id].state;
+    dev.readyToSend = &devices[id].readyToSend;
+    dev.numVertices = numVertices;
+    dev.time = time;
+    return dev;
+  }
+
+  // Dump performance counter stats over UART
+  void dumpStats() {
+    tinselPerfCountStop();
+    uint32_t me = tinselId();
+    // Per-cache performance counters
+    uint32_t cacheMask = (1 <<
+      (TinselLogThreadsPerCore + TinselLogCoresPerDCache)) - 1;
+    if ((me & cacheMask) == 0) {
+      printf("H:%x,M:%x,W:%x\n",
+        tinselHitCount(),
+        tinselMissCount(),
+        tinselWritebackCount());
+    }
+    // Per-core performance counters
+    uint32_t coreMask = (1 << (TinselLogThreadsPerCore)) - 1;
+    if ((me & coreMask) == 0) {
+      printf("C:%x %x,I:%x %x\n",
+        tinselCycleCountU(), tinselCycleCount(),
+        tinselCPUIdleCountU(), tinselCPUIdleCount());
+    }
+    // Per-thread performance counters
+    #ifdef POLITE_COUNT_MSGS
+    uint32_t intraBoardId = me & ((1<)-1) >> TinselLogBytesPerFlit);
+
+    // Event loop
+    while (1) {
+      // Try to send
+      if (sendersTop != senders) {
+        if (tinselCanSend()) {
+          // Get next sender
+          PLocalDeviceId src = *(--sendersTop);
+          // Lookup device
+          DeviceType dev = getDevice(src);
+          PPin pin = *dev.readyToSend;
+          // Invoke send handler
+          PMessage<M>* m = (PMessage<M>*) tinselSendSlot();
+          dev.send(&m->payload);
+          // Reinsert sender, if it still wants to send
+          if (*dev.readyToSend != No) sendersTop++;
+          // Is it a send to the host pin or a user pin?
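+          // (HostPin messages go straight to the host; for application pins,
+          // pin-2 recovers the pin number, since Pin(n) is defined as (n)+2,
+          // and selects the board-level routing key for that pin.)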
+ if (pin == HostPin) + tinselSend(tinselHostId(), m); + else + tinselKeySend(devices[src].pin[pin-2], m); + #ifdef POLITE_COUNT_MSGS + msgsSent++; + #endif + } + else { + #ifdef POLITE_COUNT_MSGS + blockedSends++; + #endif + tinselWaitUntil(TINSEL_CAN_SEND|TINSEL_CAN_RECV); + } + } + else { + // Idle detection + int idle = tinselIdle(!active); + if (idle > 1) + break; + else if (idle) { + active = false; + for (uint32_t i = 0; i < numDevices; i++) { + DeviceType dev = getDevice(i); + // Invoke the step handler for each device + active = dev.step() || active; + // Device ready to send? + if (*dev.readyToSend != No) { + *(sendersTop++) = i; + } + } + time++; + } + } + + // Step 2: try to receive + while (tinselCanRecv()) { + PMessage* inMsg = (PMessage*) tinselRecv(); + PInEdge* inEdge = &inTableBase[inMsg->edgeId]; + // Lookup destination device + PLocalDeviceId id = inMsg->devId; + DeviceType dev = getDevice(id); + // Was it ready to send? + PPin oldReadyToSend = *dev.readyToSend; + // Invoke receive handler + dev.recv(&inMsg->payload, &inEdge->edge); + // Insert device into a senders array, if not already there + if (*dev.readyToSend != No && oldReadyToSend == No) + *(sendersTop++) = id; + #ifdef POLITE_COUNT_MSGS + msgsReceived++; + #endif + tinselFree(inMsg); + } + } + + // Termination + #ifdef POLITE_DUMP_STATS + dumpStats(); + #endif + + // Invoke finish handler for each device + for (uint32_t i = 0; i < numDevices; i++) { + DeviceType dev = getDevice(i); + tinselWaitUntil(TINSEL_CAN_SEND); + PMessage* m = (PMessage*) tinselSendSlot(); + if (dev.finish(&m->payload)) tinselSend(tinselHostId(), m); + } + + // Sleep + tinselWaitUntil(TINSEL_CAN_RECV); while (1); + } + + #endif + +}; + +#endif diff --git a/include/POLite/FastMap/PGraph.h b/include/POLite/FastMap/PGraph.h new file mode 100644 index 00000000..8ac0c84d --- /dev/null +++ b/include/POLite/FastMap/PGraph.h @@ -0,0 +1,710 @@ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef _PGRAPH_H_ +#define _PGRAPH_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Nodes of a POETS graph are devices +typedef NodeId PDeviceId; + +// POETS graph +template class PGraph { + private: + // Align address to 2^n byte boundary + inline uint32_t align(uint32_t n, uint32_t addr) { + if ((addr & (1<> n) + 1) << n; + } + + // Align address to 32-bit word boundary + uint32_t wordAlign(uint32_t addr) { return align(2, addr); } + + // Align address to cache-line boundary + uint32_t cacheAlign(uint32_t addr) { + return align(TinselLogBytesPerLine, addr); + } + + // Helper function + inline uint32_t min(uint32_t x, uint32_t y) { return x < y ? x : y; } + + // Number of FPGA boards available + uint32_t meshLenX; + uint32_t meshLenY; + + // Number of FPGA boards to use + uint32_t numBoardsX; + uint32_t numBoardsY; + + // Out table (sender-side edge tables) + // Sequence of destinations for every (device, pin) pair + Seq*** outTable; + + // Key table (sender-side key tables) + // Global routing key for every (device, pin) pair + uint32_t** keyTable; + + // In table (receiver-side edge tables) + // Sequence of incoming edges for every thread + Seq>** inTable; + + // Mesh of per-board programmable routers + ProgRouterMesh* routingTables; + + // Generic constructor + void constructor(uint32_t lenX, uint32_t lenY) { + meshLenX = lenX; + meshLenY = lenY; + char* str = getenv("POLITE_BOARDS_X"); + int nx = str ? atoi(str) : meshLenX; + str = getenv("POLITE_BOARDS_Y"); + int ny = str ? 
atoi(str) : meshLenY; + setNumBoards(nx, ny); + numDevices = 0; + devices = NULL; + toDeviceAddr = NULL; + numDevicesOnThread = NULL; + fromDeviceAddr = NULL; + vertexMem = NULL; + vertexMemSize = NULL; + vertexMemBase = NULL; + inEdgeMem = NULL; + inEdgeMemSize = NULL; + inEdgeMemBase = NULL; + mapVerticesToDRAM = false; + mapInEdgesToDRAM = true; + outTable = NULL; + keyTable = NULL; + inTable = NULL; + routingTables = NULL; + chatty = 0; + str = getenv("POLITE_CHATTY"); + if (str != NULL) { + chatty = !strcmp(str, "0") ? 0 : 1; + } + } + + public: + // Number of devices + uint32_t numDevices; + + // Graph containing device ids and connections + Graph graph; + + // Edge labels: has same structure as graph.outgoing + Seq*> edgeLabels; + + // Mapping from device id to device state + // (Not valid until the mapper is called) + PState** devices; + + // Mapping from thread id to number of devices on that thread + // (Not valid until the mapper is called) + uint32_t* numDevicesOnThread; + + // Mapping from device id to device address and back + // (Not valid until the mapper is called) + PDeviceAddr* toDeviceAddr; // Device id -> device address + PDeviceId** fromDeviceAddr; // Device address -> device id + + // Each thread's vertex mem and thread mem regions + // (Not valid until the mapper is called) + uint8_t** vertexMem; uint8_t** threadMem; + uint32_t* vertexMemSize; uint32_t* threadMemSize; + uint32_t* vertexMemBase; uint32_t* threadMemBase; + + // Each thread's in-edge tables + // (Not valid until the mapper is called) + uint8_t** inEdgeMem; + uint32_t* inEdgeMemSize; + uint32_t* inEdgeMemBase; + + // Where to map the various regions + // (If false, map to SRAM instead) + bool mapVerticesToDRAM; + bool mapInEdgesToDRAM; + + // Allow mapper to print useful information to stdout + uint32_t chatty; + + // Setter for number of boards to use + void setNumBoards(uint32_t x, uint32_t y) { + if (x > meshLenX || y > meshLenY) { + printf("Mapper: %d x %d boards requested, %d x %d available\n", + numBoardsX, numBoardsY, meshLenX, meshLenY); + exit(EXIT_FAILURE); + } + numBoardsX = x; + numBoardsY = y; + } + + // Create new device + inline PDeviceId newDevice() { + edgeLabels.append(new SmallSeq); + numDevices++; + return graph.newNode(); + } + + // Add a connection between devices + inline void addEdge(PDeviceId from, PinId pin, PDeviceId to) { + if (pin >= POLITE_NUM_PINS) { + printf("addEdge: pin exceeds POLITE_NUM_PINS\n"); + exit(EXIT_FAILURE); + } + graph.addEdge(from, pin, to); + E edge; + edgeLabels.elems[from]->append(edge); + } + + // Add labelled edge using given output pin + void addLabelledEdge(E edge, PDeviceId x, PinId pin, PDeviceId y) { + graph.addEdge(x, pin, y); + edgeLabels.elems[x]->append(edge); + } + + // Allocate SRAM and DRAM partitions + void allocatePartitions() { + // Decide a maximum partition size that is reasonable + // SRAM: Partition size minus 2048 bytes for the stack + uint32_t maxSRAMSize = (1<)); + // Add space for devices + uint32_t numDevs = numDevicesOnThread[threadId]; + for (uint32_t devNum = 0; devNum < numDevs; devNum++) { + // Add space for device + sizeVMem = sizeVMem + sizeof(PState); + } + // Add space for incoming edge table + if (inTable[threadId]) { + sizeEIMem = inTable[threadId]->numElems * sizeof(PInEdge); + sizeEIMem = wordAlign(sizeEIMem); + } + // The total partition size including uninitialised portions + uint32_t totalSizeVMem = + sizeVMem + wordAlign(sizeof(PLocalDeviceId) * numDevs); + // Check that total size is reasonable + uint32_t 
totalSizeSRAM = sizeTMem; + uint32_t totalSizeDRAM = 0; + if (mapVerticesToDRAM) totalSizeDRAM += totalSizeVMem; + else totalSizeSRAM += totalSizeVMem; + if (mapInEdgesToDRAM) totalSizeDRAM += sizeEIMem; + else totalSizeSRAM += sizeEIMem; + if (totalSizeDRAM > maxDRAMSize) { + printf("Error: max DRAM partition size exceeded\n"); + exit(EXIT_FAILURE); + } + if (totalSizeSRAM > maxSRAMSize) { + printf("Error: max SRAM partition size exceeded\n"); + exit(EXIT_FAILURE); + } + // Allocate space for the initialised portion of the partition + assert((sizeVMem%4) == 0); + assert((sizeTMem%4) == 0); + assert((sizeEIMem%4) == 0); + vertexMem[threadId] = (uint8_t*) calloc(sizeVMem, 1); + vertexMemSize[threadId] = sizeVMem; + threadMem[threadId] = (uint8_t*) calloc(sizeTMem, 1); + threadMemSize[threadId] = sizeTMem; + inEdgeMem[threadId] = (uint8_t*) calloc(sizeEIMem, 1); + inEdgeMemSize[threadId] = sizeEIMem; + // Tinsel address of base of partition + uint32_t partId = threadId & (TinselThreadsPerDRAM-1); + uint32_t sramBase = (1 << TinselLogBytesPerSRAM) + + (partId << TinselLogBytesPerSRAMPartition); + uint32_t dramBase = TinselBytesPerDRAM - + ((partId+1) << TinselLogBytesPerDRAMPartition); + // Use partition-interleaved region for DRAM + dramBase |= 0x80000000; + threadMemBase[threadId] = sramBase; + sramBase += threadMemSize[threadId]; + // Determine base addresses of each region + if (mapVerticesToDRAM) { + vertexMemBase[threadId] = dramBase; + dramBase += totalSizeVMem; + } + else { + vertexMemBase[threadId] = sramBase; + sramBase += totalSizeVMem; + } + if (mapInEdgesToDRAM) { + inEdgeMemBase[threadId] = dramBase; + dramBase += sizeEIMem; + } + else { + inEdgeMemBase[threadId] = sramBase; + sramBase += sizeEIMem; + } + } + } + + // Initialise partitions + void initialisePartitions() { + for (uint32_t threadId = 0; threadId < TinselMaxThreads; threadId++) { + // Next pointers for each partition + uint32_t nextVMem = 0; + // Pointer to thread structure + PThread* thread = + (PThread*) &threadMem[threadId][0]; + // Set number of devices on thread + thread->numDevices = numDevicesOnThread[threadId]; + // Set number of devices in graph + thread->numVertices = numDevices; + // Set tinsel address of array of device states + thread->devices = vertexMemBase[threadId]; + // Set tinsel address of base of in-edge table + thread->inTableBase = inEdgeMemBase[threadId]; + // Add space for each device on thread + uint32_t numDevs = numDevicesOnThread[threadId]; + for (uint32_t devNum = 0; devNum < numDevs; devNum++) { + PState* dev = (PState*) &vertexMem[threadId][nextVMem]; + PDeviceId id = fromDeviceAddr[threadId][devNum]; + devices[id] = dev; + // Add space for device + nextVMem = nextVMem + sizeof(PState); + } + // Initialise each device and the thread's out edges + for (uint32_t devNum = 0; devNum < numDevs; devNum++) { + PDeviceId id = fromDeviceAddr[threadId][devNum]; + PState* dev = devices[id]; + // Initialise + for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) { + dev->pin[p] = keyTable[id][p]; + } + } + // Intialise thread's in edges + PInEdge* inEdgeArray = (PInEdge*) inEdgeMem[threadId]; + Seq>* edges = inTable[threadId]; + if (edges) + for (uint32_t i = 0; i < edges->numElems; i++) { + inEdgeArray[i] = edges->elems[i]; + } + // At this point, check that next pointers line up with heap sizes + if (nextVMem != vertexMemSize[threadId]) { + printf("Error: vertex mem size does not match pre-computed size\n"); + exit(EXIT_FAILURE); + } + // Set tinsel address of senders array + thread->senders = 
vertexMemBase[threadId] + nextVMem; + } + } + + // Allocate mapping structures + void allocateMapping() { + devices = (PState**) calloc(numDevices, sizeof(PState*)); + toDeviceAddr = (PDeviceAddr*) calloc(numDevices, sizeof(PDeviceAddr)); + fromDeviceAddr = (PDeviceId**) calloc(TinselMaxThreads, sizeof(PDeviceId*)); + numDevicesOnThread = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t)); + } + + // Allocate thread edge input and output tables + // (Only valid after mapper is called) + void allocateInOutTables() { + // Receiver-side tables + inTable = (Seq>**) + calloc(TinselMaxThreads,sizeof(Seq>*)); + for (uint32_t t = 0; t < TinselMaxThreads; t++) { + if (numDevicesOnThread[t] != 0) + inTable[t] = new SmallSeq>; + } + + // Sender-side tables + outTable = (Seq***) + calloc(numDevices, sizeof(Seq**)); + for (uint32_t d = 0; d < numDevices; d++) { + outTable[d] = (Seq**) + calloc(POLITE_NUM_PINS, sizeof(Seq*)); + for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) + outTable[d][p] = new SmallSeq; + } + + keyTable = new uint32_t* [numDevices]; + for (uint32_t d = 0; d < numDevices; d++) + keyTable[d] = new uint32_t [POLITE_NUM_PINS]; + } + + // Compute thread edge input and output tables + // (Only valid after mapper is called) + void computeInOutTables() { + // For each device + for (uint32_t d = 0; d < numDevices; d++) { + // For each pin + for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) { + Seq* dests = graph.outgoing->elems[d]; + Seq* edges = edgeLabels.elems[d]; + for (uint32_t i = 0; i < dests->numElems; i++) { + PDeviceId destId = dests->elems[i]; + // Destination thread id + uint32_t threadId = getThreadId(toDeviceAddr[destId]); + // Thread-local device id + uint32_t devId = getLocalDeviceId(toDeviceAddr[destId]); + // Add edge to thread's input table + uint32_t edgeId = inTable[threadId]->numElems; + if (i < inTable[threadId]->numElems) { + PInEdge edge; + edge.edge = edges->elems[i]; + inTable[threadId]->append(edge); + } + // Add output table entry + PRoutingDest rdest; + rdest.kind = PRDestKindURM1; + rdest.mbox = threadId >> TinselLogThreadsPerMailbox; + rdest.urm1.key = devId | (edgeId << 16); + rdest.urm1.threadId = threadId & + ((1<append(rdest); + } + } + } + } + + // Release all structures + void releaseAll() { + if (devices != NULL) { + free(devices); + free(toDeviceAddr); + free(numDevicesOnThread); + for (uint32_t t = 0; t < TinselMaxThreads; t++) + if (fromDeviceAddr[t] != NULL) free(fromDeviceAddr[t]); + free(fromDeviceAddr); + for (uint32_t t = 0; t < TinselMaxThreads; t++) + if (vertexMem[t] != NULL) free(vertexMem[t]); + free(vertexMem); + free(vertexMemSize); + free(vertexMemBase); + for (uint32_t t = 0; t < TinselMaxThreads; t++) + if (threadMem[t] != NULL) free(threadMem[t]); + free(threadMem); + free(threadMemSize); + free(threadMemBase); + for (uint32_t t = 0; t < TinselMaxThreads; t++) + if (inEdgeMem[t] != NULL) free(inEdgeMem[t]); + free(inEdgeMem); + free(inEdgeMemSize); + free(inEdgeMemBase); + } + if (inTable != NULL) { + for (uint32_t t = 0; t < TinselMaxThreads; t++) + if (inTable[t] != NULL) delete inTable[t]; + free(inTable); + inTable = NULL; + } + if (outTable != NULL) { + for (uint32_t d = 0; d < numDevices; d++) { + if (outTable[d] == NULL) continue; + for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) + delete outTable[d][p]; + free(outTable[d]); + } + free(outTable); + outTable = NULL; + } + if (keyTable != NULL) { + for (uint32_t d = 0; d < numDevices; d++) delete [] keyTable[d]; + delete [] keyTable; + keyTable = NULL; + } + if (routingTables != 
NULL) delete routingTables; + } + + // Implement mapping to tinsel threads + void map() { + // Let's measure some times + struct timeval placementStart, placementFinish; + struct timeval routingStart, routingFinish; + struct timeval initStart, initFinish; + + // Release all mapping and heap structures + releaseAll(); + + // Reallocate mapping structures + allocateMapping(); + + // Start placement timer + gettimeofday(&placementStart, NULL); + + // Partition into subgraphs, one per board + Placer boards(&graph, numBoardsX, numBoardsY); + + // Place subgraphs onto 2D mesh + const uint32_t placerEffort = 8; + boards.place(placerEffort); + + // For each board + for (uint32_t boardY = 0; boardY < numBoardsY; boardY++) { + for (uint32_t boardX = 0; boardX < numBoardsX; boardX++) { + // Partition into subgraphs, one per mailbox + PartitionId b = boards.mapping[boardY][boardX]; + Placer boxes(&boards.subgraphs[b], + TinselMailboxMeshXLen, TinselMailboxMeshYLen); + boxes.place(placerEffort); + + // For each mailbox + for (uint32_t boxX = 0; boxX < TinselMailboxMeshXLen; boxX++) { + for (uint32_t boxY = 0; boxY < TinselMailboxMeshYLen; boxY++) { + // Partition into subgraphs, one per thread + uint32_t numThreads = 1<incoming->numElems; + numDevicesOnThread[threadId] = numDevs; + fromDeviceAddr[threadId] = (PDeviceId*) + malloc(sizeof(PDeviceId) * numDevs); + for (uint32_t devNum = 0; devNum < numDevs; devNum++) + fromDeviceAddr[threadId][devNum] = g->labels->elems[devNum]; + + // Populate toDeviceAddr mapping + assert(numDevs < maxLocalDeviceId()); + for (uint32_t devNum = 0; devNum < numDevs; devNum++) { + PDeviceAddr devAddr = + makeDeviceAddr(threadId, devNum); + toDeviceAddr[g->labels->elems[devNum]] = devAddr; + } + } + } + } + } + } + + // Stop placement timer and start In/Out table timer + gettimeofday(&placementFinish, NULL); + gettimeofday(&routingStart, NULL); + + // Compute send and receive side routing tables + allocateInOutTables(); + computeInOutTables(); + + // Compute per-board programmable routing tables + routingTables = new ProgRouterMesh(numBoardsX, numBoardsY); + for (uint32_t d = 0; d < numDevices; d++) { + uint32_t src = getThreadId(toDeviceAddr[d]) >> + TinselLogThreadsPerMailbox; + for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) + keyTable[d][p] = routingTables->addDestsFromBoard(src, outTable[d][p]); + } + + // Stop routing timer and start init timer + gettimeofday(&routingFinish, NULL); + gettimeofday(&initStart, NULL); + + // Reallocate and initialise heap structures + allocatePartitions(); + initialisePartitions(); + + // Display times, if chatty + gettimeofday(&initFinish, NULL); + if (chatty > 0) { + struct timeval diff; + + timersub(&placementFinish, &placementStart, &diff); + double duration = (double) diff.tv_sec + + (double) diff.tv_usec / 1000000.0; + printf("POLite mapper profile:\n"); + printf(" Partitioning and placement: %lfs\n", duration); + + timersub(&routingFinish, &routingStart, &diff); + duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + printf(" In/Out table construction: %lfs\n", duration); + + timersub(&initFinish, &initStart, &diff); + duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + printf(" Thread state initialisation: %lfs\n", duration); + } + } + + // Constructor + PGraph() { + char* str = getenv("HOSTLINK_BOXES_X"); + int x = str ? atoi(str) : 1; + x = x * TinselMeshXLenWithinBox; + str = getenv("HOSTLINK_BOXES_Y"); + int y = str ? 
atoi(str) : 1; + y = y * TinselMeshYLenWithinBox; + constructor(x, y); + } + PGraph(uint32_t numBoxesX, uint32_t numBoxesY) { + int x = numBoxesX * TinselMeshXLenWithinBox; + int y = numBoxesY * TinselMeshYLenWithinBox; + constructor(x, y); + } + + // Deconstructor + ~PGraph() { + releaseAll(); + for (uint32_t i = 0; i < edgeLabels.numElems; i++) + delete edgeLabels.elems[i]; + } + + // Write partition to tinsel machine + void writeRAM(HostLink* hostLink, + uint8_t** heap, uint32_t* heapSize, uint32_t* heapBase) { + // Number of bytes written by each thread + uint32_t* writeCount = (uint32_t*) + calloc(TinselMaxThreads, sizeof(uint32_t)); + + // Number of threads completed by each core + uint32_t*** threadCount = (uint32_t***) + calloc(meshLenX, sizeof(uint32_t**)); + for (uint32_t x = 0; x < meshLenX; x++) { + threadCount[x] = (uint32_t**) + calloc(meshLenY, sizeof(uint32_t*)); + for (uint32_t y = 0; y < meshLenY; y++) + threadCount[x][y] = (uint32_t*) + calloc(TinselCoresPerBoard, sizeof(uint32_t)); + } + + // Initialise write addresses + for (int x = 0; x < meshLenX; x++) + for (int y = 0; y < meshLenY; y++) + for (int c = 0; c < TinselCoresPerBoard; c++) + hostLink->setAddr(x, y, c, heapBase[hostLink->toAddr(x, y, c, 0)]); + + // Write heaps + uint32_t done = false; + while (! done) { + done = true; + for (int x = 0; x < meshLenX; x++) { + for (int y = 0; y < meshLenY; y++) { + for (int c = 0; c < TinselCoresPerBoard; c++) { + uint32_t t = threadCount[x][y][c]; + if (t < TinselThreadsPerCore) { + done = false; + uint32_t threadId = hostLink->toAddr(x, y, c, t); + uint32_t written = writeCount[threadId]; + if (written == heapSize[threadId]) { + threadCount[x][y][c] = t+1; + if ((t+1) < TinselThreadsPerCore) + hostLink->setAddr(x, y, c, + heapBase[hostLink->toAddr(x, y, c, t+1)]); + } else { + uint32_t send = min((heapSize[threadId] - written)>>2, 15); + hostLink->store(x, y, c, send, + (uint32_t*) &heap[threadId][written]); + writeCount[threadId] = written + send * sizeof(uint32_t); + } + } + } + } + } + } + + // Release memory + free(writeCount); + for (uint32_t x = 0; x < meshLenX; x++) { + for (uint32_t y = 0; y < meshLenY; y++) + free(threadCount[x][y]); + free(threadCount[x]); + } + free(threadCount); + } + + // Write graph to tinsel machine + void write(HostLink* hostLink) { + // Start timer + struct timeval start, finish; + gettimeofday(&start, NULL); + + bool useSendBufferOld = hostLink->useSendBuffer; + hostLink->useSendBuffer = true; + writeRAM(hostLink, vertexMem, vertexMemSize, vertexMemBase); + writeRAM(hostLink, threadMem, threadMemSize, threadMemBase); + writeRAM(hostLink, inEdgeMem, inEdgeMemSize, inEdgeMemBase); + routingTables->write(hostLink); + hostLink->flush(); + hostLink->useSendBuffer = useSendBufferOld; + + // Display time if chatty + gettimeofday(&finish, NULL); + if (chatty > 0) { + struct timeval diff; + timersub(&finish, &start, &diff); + double duration = (double) diff.tv_sec + + (double) diff.tv_usec / 1000000.0; + printf("POLite graph upload time: %lfs\n", duration); + } + } + + // Determine fan-in of given device + uint32_t fanIn(PDeviceId id) { + return graph.fanIn(id); + } + + // Determine fan-out of given device + uint32_t fanOut(PDeviceId id) { + return graph.fanOut(id); + } + +}; + +// Read performance stats and store in file +inline void politeSaveStats(HostLink* hostLink, const char* filename) { + #ifdef POLITE_DUMP_STATS + // Open file for performance counters + FILE* statsFile = fopen(filename, "wt"); + if (statsFile == NULL) { + printf("Error 
creating stats file\n"); + exit(EXIT_FAILURE); + } + uint32_t meshLenX = hostLink->meshXLen; + uint32_t meshLenY = hostLink->meshYLen; + // Number of caches + uint32_t numLines = meshLenX * meshLenY * + TinselDCachesPerDRAM * TinselDRAMsPerBoard; + // Add on number of cores + numLines += meshLenX * meshLenY * TinselCoresPerBoard; + // Add on number of threads + #ifdef POLITE_COUNT_MSGS + numLines += meshLenX * meshLenY * TinselThreadsPerBoard; + #endif + hostLink->dumpStdOut(statsFile, numLines); + fclose(statsFile); + #endif +} + +#endif diff --git a/include/POLite/PGraph.h b/include/POLite/PGraph.h index 20cc4b5b..57b3172e 100644 --- a/include/POLite/PGraph.h +++ b/include/POLite/PGraph.h @@ -512,10 +512,11 @@ template append(edge); // Prepare for new output table entry dests.numElems = destsRemaining; diff --git a/include/POLite/Placer.h b/include/POLite/Placer.h index 32aec831..1468f0c7 100644 --- a/include/POLite/Placer.h +++ b/include/POLite/Placer.h @@ -10,6 +10,14 @@ typedef uint32_t PartitionId; // Partition and place a graph on a 2D mesh struct Placer { + // Select between different methods + enum Method { + Default, + Metis, + Random + }; + const Method defaultMethod=Metis; + // The graph being placed Graph* graph; @@ -41,8 +49,31 @@ struct Placer { uint32_t* yCoordSaved; uint64_t savedCost; + // Controls which strategy is used + Method method = Default; + + // Select placer method + void chooseMethod() + { + auto e = getenv("POLITE_PLACER"); + if (e) { + if (!strcmp(e, "metis")) + method=Metis; + else if (!strcmp(e, "random")) + method=Random; + else if (!strcmp(e, "default") || *e == '\0') + method=Default; + else { + fprintf(stderr, "Don't understand placer method : %s\n", e); + exit(EXIT_FAILURE); + } + } + if (method == Default) + method = defaultMethod; + } + // Partition the graph using Metis - void partition() { + void partitionMetis() { // Compute total number of edges uint32_t numEdges = 0; for (uint32_t i = 0; i < graph->incoming->numElems; i++) { @@ -116,6 +147,31 @@ struct Placer { free(parts); } + // Partition the graph randomly + void partitionRandom() { + uint32_t numVertices = graph->incoming->numElems; + uint32_t numParts = width * height; + + // Populate result array + srand(0); + for (uint32_t i = 0; i < numVertices; i++) { + partitions[i] = rand() % numParts; + } + } + + void partition() + { + switch(method){ + case Default: + case Metis: + partitionMetis(); + break; + case Random: + partitionRandom(); + break; + } + } + // Create subgraph for each partition void computeSubgraphs() { uint32_t numPartitions = width*height; @@ -316,6 +372,8 @@ struct Placer { yCoord = new uint32_t [width*height]; xCoordSaved = new uint32_t [width*height]; yCoordSaved = new uint32_t [width*height]; + // Pick a placement method, or select default + chooseMethod(); // Partition the graph using Metis partition(); // Compute subgraphs, one per partition diff --git a/include/POLite/ProgRouters.h b/include/POLite/ProgRouters.h index 90083802..a1c5942d 100644 --- a/include/POLite/ProgRouters.h +++ b/include/POLite/ProgRouters.h @@ -182,22 +182,58 @@ class ProgRouter { numChunks++; numRecords++; } + + // Add a URM1 record to the table + void addURM1(uint32_t mboxX, uint32_t mboxY, + uint32_t threadId, uint32_t key) { + if (numChunks == 5) nextBeat(); + uint8_t* ptr = currentRecord48(); + ptr[0] = key; + ptr[1] = key >> 8; + ptr[2] = key >> 16; + ptr[3] = key >> 24; + ptr[4] = (threadId << 3); + ptr[5] = (mboxY << 3) | (mboxX << 1) | (threadId >> 5); + numChunks++; + numRecords++; + } 
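+
+  // (For reference: addURM1 emits a 48-bit record, like addMRM above, with
+  // the 32-bit key in bytes 0-3 and the mailbox-local thread id packed into
+  // bytes 4 and 5 alongside the destination mailbox coordinates.)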
}; // ================================== // Data type for routing destinations // ================================== -struct PRoutingDest { - // Destination mailbox - uint32_t mbox; - // Thread-level routing key +enum PRoutingDestKind { PRDestKindURM1, PRDestKindMRM }; + +// URM1 routing destination +struct PRoutingDestURM1 { + // Mailbox-local thread + uint16_t threadId; + // Thread-local routing key + uint32_t key; +}; + +// MRM routing destination +struct PRoutingDestMRM { + // Thread-local routing key uint16_t key; // Destination threads uint32_t threadMaskLow; uint32_t threadMaskHigh; }; +// Routing destination +struct PRoutingDest { + PRoutingDestKind kind; + // Destination mailbox + uint32_t mbox; + // URM1 or MRM destination + union { + PRoutingDestURM1 urm1; + PRoutingDestMRM mrm; + }; +}; + // Extract board X coord from routing dest inline uint32_t destX(uint32_t mbox) { uint32_t x = mbox >> (TinselMailboxMeshXBits + TinselMailboxMeshYBits); @@ -288,9 +324,19 @@ class ProgRouterMesh { // Add local records for (int i = 0; i < local.numElems; i++) { PRoutingDest dest = local.elems[i]; - table[senderY][senderX].addMRM(destMboxX(dest.mbox), - destMboxY(dest.mbox), dest.threadMaskHigh, - dest.threadMaskLow, dest.key); + if (dest.kind == PRDestKindMRM) { + table[senderY][senderX].addMRM(destMboxX(dest.mbox), + destMboxY(dest.mbox), dest.mrm.threadMaskHigh, + dest.mrm.threadMaskLow, dest.mrm.key); + } + else if (dest.kind == PRDestKindURM1) { + table[senderY][senderX].addURM1(destMboxX(dest.mbox), + destMboxY(dest.mbox), dest.urm1.threadId, dest.urm1.key); + } + else { + fprintf(stderr, "ProgRouters.h: unknown routing record kind\n"); + exit(EXIT_FAILURE); + } } return table[senderY][senderX].genKey();