diff --git a/include/POLite.h b/include/POLite.h
index d12a0e73..858b865e 100644
--- a/include/POLite.h
+++ b/include/POLite.h
@@ -6,13 +6,22 @@
 #ifdef TINSEL
   #include 
-  #include 
+  #ifdef POLITE_FAST_MAP
+    #include 
+  #else
+    #include 
+  #endif
 #else
-  #include 
+  #ifdef POLITE_FAST_MAP
+    #include 
+    #include 
+  #else
+    #include 
+    #include 
+  #endif
   #include 
   #include 
   #include 
-  #include 
 #endif
 #endif
diff --git a/include/POLite/FastMap/PDevice.h b/include/POLite/FastMap/PDevice.h
new file mode 100644
index 00000000..f095eba6
--- /dev/null
+++ b/include/POLite/FastMap/PDevice.h
@@ -0,0 +1,302 @@
+// SPDX-License-Identifier: BSD-2-Clause
+#ifndef _PDEVICE_H_
+#define _PDEVICE_H_
+
+#include 
+#include 
+#include 
+
+#ifdef TINSEL
+  #include 
+  #define PTR(t) t*
+#else
+  #include 
+  #define PTR(t) uint32_t
+#endif
+
+// Use this to align on a half-cache-line boundary
+#define ALIGNED __attribute__((aligned(1<<(TinselLogBytesPerLine-1))))
+
+// This is a static limit on the number of pins per device
+#ifndef POLITE_NUM_PINS
+#define POLITE_NUM_PINS 1
+#endif
+
+// Macros for performance stats:
+//   POLITE_DUMP_STATS - dump performance stats on termination
+//   POLITE_COUNT_MSGS - include message counts in performance stats
+
+// Thread-local device id
+typedef uint16_t PLocalDeviceId;
+
+// Thread id
+typedef uint32_t PThreadId;
+
+// Device address
+// Bits 17->0: thread id
+// Bit 18: invalid address
+// Bits 31->19: thread-local device id
+typedef uint32_t PDeviceAddr;
+
+// Device address constructors
+inline PDeviceAddr invalidDeviceAddr() { return 0x40000; }
+inline PDeviceAddr makeDeviceAddr(PThreadId t, PLocalDeviceId d) {
+  return (d << 19) | t;
+}
+
+// Device address deconstructors
+inline bool isValidDeviceAddr(PDeviceAddr addr) { return !(addr & 0x40000); }
+inline PThreadId getThreadId(PDeviceAddr addr) { return addr & 0x3ffff; }
+inline PLocalDeviceId getLocalDeviceId(PDeviceAddr addr) { return addr >> 19; }
+
+// What's the max allowed local device address?
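+// (Bits 31->19 of a device address hold the thread-local device id,
+// i.e. 13 bits, giving a limit of 2^13 = 8192 devices per thread.)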
+inline uint32_t maxLocalDeviceId() { return 8192; }
+
+// Pins
+//   No      - means 'not ready to send'
+//   HostPin - means 'send to host'
+//   Pin(n)  - means 'send to application pin number n'
+typedef uint8_t PPin;
+#define No 0
+#define HostPin 1
+#define Pin(n) ((n)+2)
+
+// For template arguments that are not used
+struct None {};
+
+// Generic device structure
+// Type parameters:
+//   S - State
+//   E - Edge label
+//   M - Message structure
+template <typename S, typename E, typename M> struct PDevice {
+  // State
+  S* s;
+  PPin* readyToSend;
+  uint32_t numVertices;
+  uint16_t time;
+
+  // Handlers
+  void init();
+  void send(volatile M* msg);
+  void recv(M* msg, E* edge);
+  bool step();
+  bool finish(volatile M* msg);
+};
+
+// Generic device state structure
+template <typename S> struct ALIGNED PState {
+  // Board-level routing key for each outgoing pin
+  uint32_t pin[POLITE_NUM_PINS];
+  // Ready-to-send status
+  PPin readyToSend;
+  // Custom state
+  S state;
+};
+
+// Message structure
+template <typename M> struct PMessage {
+  // Destination thread-local device id
+  uint16_t devId;
+  // Id of incoming edge
+  uint16_t edgeId;
+  // Application message
+  M payload;
+};
+
+// An incoming edge to a device
+template <typename E> struct PInEdge {
+  E edge;
+};
+
+// Generic thread structure
+template <typename DeviceType, typename S, typename E, typename M> struct PThread {
+
+  // Number of devices handled by thread
+  PLocalDeviceId numDevices;
+  // Number of times step handler has been called
+  uint16_t time;
+  // Number of devices in graph
+  uint32_t numVertices;
+  // Pointer to array of device states
+  PTR(PState<S>) devices;
+  // Pointer to base of edge table
+  PTR(PInEdge<E>) inTableBase;
+  // Array of local device ids that are ready to send
+  PTR(PLocalDeviceId) senders;
+  // This array is accessed in a LIFO manner
+  PTR(PLocalDeviceId) sendersTop;
+
+  // Count number of messages sent
+  #ifdef POLITE_COUNT_MSGS
+  // Total messages sent
+  uint32_t msgsSent;
+  // Total messages received
+  uint32_t msgsReceived;
+  // Number of times we wanted to send but couldn't
+  uint32_t blockedSends;
+  #endif
+
+  #ifdef TINSEL
+
+  // Helper function to construct a device
+  INLINE DeviceType getDevice(uint32_t id) {
+    DeviceType dev;
+    dev.s = &devices[id].state;
+    dev.readyToSend = &devices[id].readyToSend;
+    dev.numVertices = numVertices;
+    dev.time = time;
+    return dev;
+  }
+
+  // Dump performance counter stats over UART
+  void dumpStats() {
+    tinselPerfCountStop();
+    uint32_t me = tinselId();
+    // Per-cache performance counters
+    uint32_t cacheMask = (1 <<
+      (TinselLogThreadsPerCore + TinselLogCoresPerDCache)) - 1;
+    if ((me & cacheMask) == 0) {
+      printf("H:%x,M:%x,W:%x\n",
+        tinselHitCount(),
+        tinselMissCount(),
+        tinselWritebackCount());
+    }
+    // Per-core performance counters
+    uint32_t coreMask = (1 << (TinselLogThreadsPerCore)) - 1;
+    if ((me & coreMask) == 0) {
+      printf("C:%x %x,I:%x %x\n",
+        tinselCycleCountU(), tinselCycleCount(),
+        tinselCPUIdleCountU(), tinselCPUIdleCount());
+    }
+    // Per-thread performance counters
+    #ifdef POLITE_COUNT_MSGS
+    uint32_t intraBoardId = me & ((1<)-1) >> TinselLogBytesPerFlit);
+
+    // Event loop
+    while (1) {
+      // Try to send
+      if (sendersTop != senders) {
+        if (tinselCanSend()) {
+          // Get next sender
+          PLocalDeviceId src = *(--sendersTop);
+          // Lookup device
+          DeviceType dev = getDevice(src);
+          PPin pin = *dev.readyToSend;
+          // Invoke send handler
+          PMessage<M>* m = (PMessage<M>*) tinselSendSlot();
+          dev.send(&m->payload);
+          // Reinsert sender, if it still wants to send
+          if (*dev.readyToSend != No) sendersTop++;
+          // Is it a send to the host pin or a user pin?
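+          // (HostPin messages go straight to the host; for application pins,
+          // pin-2 recovers the pin number, since Pin(n) is defined as (n)+2,
+          // and selects the board-level routing key for that pin.)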
+ if (pin == HostPin) + tinselSend(tinselHostId(), m); + else + tinselKeySend(devices[src].pin[pin-2], m); + #ifdef POLITE_COUNT_MSGS + msgsSent++; + #endif + } + else { + #ifdef POLITE_COUNT_MSGS + blockedSends++; + #endif + tinselWaitUntil(TINSEL_CAN_SEND|TINSEL_CAN_RECV); + } + } + else { + // Idle detection + int idle = tinselIdle(!active); + if (idle > 1) + break; + else if (idle) { + active = false; + for (uint32_t i = 0; i < numDevices; i++) { + DeviceType dev = getDevice(i); + // Invoke the step handler for each device + active = dev.step() || active; + // Device ready to send? + if (*dev.readyToSend != No) { + *(sendersTop++) = i; + } + } + time++; + } + } + + // Step 2: try to receive + while (tinselCanRecv()) { + PMessage* inMsg = (PMessage*) tinselRecv(); + PInEdge* inEdge = &inTableBase[inMsg->edgeId]; + // Lookup destination device + PLocalDeviceId id = inMsg->devId; + DeviceType dev = getDevice(id); + // Was it ready to send? + PPin oldReadyToSend = *dev.readyToSend; + // Invoke receive handler + dev.recv(&inMsg->payload, &inEdge->edge); + // Insert device into a senders array, if not already there + if (*dev.readyToSend != No && oldReadyToSend == No) + *(sendersTop++) = id; + #ifdef POLITE_COUNT_MSGS + msgsReceived++; + #endif + tinselFree(inMsg); + } + } + + // Termination + #ifdef POLITE_DUMP_STATS + dumpStats(); + #endif + + // Invoke finish handler for each device + for (uint32_t i = 0; i < numDevices; i++) { + DeviceType dev = getDevice(i); + tinselWaitUntil(TINSEL_CAN_SEND); + PMessage* m = (PMessage*) tinselSendSlot(); + if (dev.finish(&m->payload)) tinselSend(tinselHostId(), m); + } + + // Sleep + tinselWaitUntil(TINSEL_CAN_RECV); while (1); + } + + #endif + +}; + +#endif diff --git a/include/POLite/FastMap/PGraph.h b/include/POLite/FastMap/PGraph.h new file mode 100644 index 00000000..8ac0c84d --- /dev/null +++ b/include/POLite/FastMap/PGraph.h @@ -0,0 +1,710 @@ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef _PGRAPH_H_ +#define _PGRAPH_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Nodes of a POETS graph are devices +typedef NodeId PDeviceId; + +// POETS graph +template class PGraph { + private: + // Align address to 2^n byte boundary + inline uint32_t align(uint32_t n, uint32_t addr) { + if ((addr & (1<> n) + 1) << n; + } + + // Align address to 32-bit word boundary + uint32_t wordAlign(uint32_t addr) { return align(2, addr); } + + // Align address to cache-line boundary + uint32_t cacheAlign(uint32_t addr) { + return align(TinselLogBytesPerLine, addr); + } + + // Helper function + inline uint32_t min(uint32_t x, uint32_t y) { return x < y ? x : y; } + + // Number of FPGA boards available + uint32_t meshLenX; + uint32_t meshLenY; + + // Number of FPGA boards to use + uint32_t numBoardsX; + uint32_t numBoardsY; + + // Out table (sender-side edge tables) + // Sequence of destinations for every (device, pin) pair + Seq*** outTable; + + // Key table (sender-side key tables) + // Global routing key for every (device, pin) pair + uint32_t** keyTable; + + // In table (receiver-side edge tables) + // Sequence of incoming edges for every thread + Seq>** inTable; + + // Mesh of per-board programmable routers + ProgRouterMesh* routingTables; + + // Generic constructor + void constructor(uint32_t lenX, uint32_t lenY) { + meshLenX = lenX; + meshLenY = lenY; + char* str = getenv("POLITE_BOARDS_X"); + int nx = str ? atoi(str) : meshLenX; + str = getenv("POLITE_BOARDS_Y"); + int ny = str ? 
atoi(str) : meshLenY; + setNumBoards(nx, ny); + numDevices = 0; + devices = NULL; + toDeviceAddr = NULL; + numDevicesOnThread = NULL; + fromDeviceAddr = NULL; + vertexMem = NULL; + vertexMemSize = NULL; + vertexMemBase = NULL; + inEdgeMem = NULL; + inEdgeMemSize = NULL; + inEdgeMemBase = NULL; + mapVerticesToDRAM = false; + mapInEdgesToDRAM = true; + outTable = NULL; + keyTable = NULL; + inTable = NULL; + routingTables = NULL; + chatty = 0; + str = getenv("POLITE_CHATTY"); + if (str != NULL) { + chatty = !strcmp(str, "0") ? 0 : 1; + } + } + + public: + // Number of devices + uint32_t numDevices; + + // Graph containing device ids and connections + Graph graph; + + // Edge labels: has same structure as graph.outgoing + Seq*> edgeLabels; + + // Mapping from device id to device state + // (Not valid until the mapper is called) + PState** devices; + + // Mapping from thread id to number of devices on that thread + // (Not valid until the mapper is called) + uint32_t* numDevicesOnThread; + + // Mapping from device id to device address and back + // (Not valid until the mapper is called) + PDeviceAddr* toDeviceAddr; // Device id -> device address + PDeviceId** fromDeviceAddr; // Device address -> device id + + // Each thread's vertex mem and thread mem regions + // (Not valid until the mapper is called) + uint8_t** vertexMem; uint8_t** threadMem; + uint32_t* vertexMemSize; uint32_t* threadMemSize; + uint32_t* vertexMemBase; uint32_t* threadMemBase; + + // Each thread's in-edge tables + // (Not valid until the mapper is called) + uint8_t** inEdgeMem; + uint32_t* inEdgeMemSize; + uint32_t* inEdgeMemBase; + + // Where to map the various regions + // (If false, map to SRAM instead) + bool mapVerticesToDRAM; + bool mapInEdgesToDRAM; + + // Allow mapper to print useful information to stdout + uint32_t chatty; + + // Setter for number of boards to use + void setNumBoards(uint32_t x, uint32_t y) { + if (x > meshLenX || y > meshLenY) { + printf("Mapper: %d x %d boards requested, %d x %d available\n", + numBoardsX, numBoardsY, meshLenX, meshLenY); + exit(EXIT_FAILURE); + } + numBoardsX = x; + numBoardsY = y; + } + + // Create new device + inline PDeviceId newDevice() { + edgeLabels.append(new SmallSeq); + numDevices++; + return graph.newNode(); + } + + // Add a connection between devices + inline void addEdge(PDeviceId from, PinId pin, PDeviceId to) { + if (pin >= POLITE_NUM_PINS) { + printf("addEdge: pin exceeds POLITE_NUM_PINS\n"); + exit(EXIT_FAILURE); + } + graph.addEdge(from, pin, to); + E edge; + edgeLabels.elems[from]->append(edge); + } + + // Add labelled edge using given output pin + void addLabelledEdge(E edge, PDeviceId x, PinId pin, PDeviceId y) { + graph.addEdge(x, pin, y); + edgeLabels.elems[x]->append(edge); + } + + // Allocate SRAM and DRAM partitions + void allocatePartitions() { + // Decide a maximum partition size that is reasonable + // SRAM: Partition size minus 2048 bytes for the stack + uint32_t maxSRAMSize = (1<)); + // Add space for devices + uint32_t numDevs = numDevicesOnThread[threadId]; + for (uint32_t devNum = 0; devNum < numDevs; devNum++) { + // Add space for device + sizeVMem = sizeVMem + sizeof(PState); + } + // Add space for incoming edge table + if (inTable[threadId]) { + sizeEIMem = inTable[threadId]->numElems * sizeof(PInEdge); + sizeEIMem = wordAlign(sizeEIMem); + } + // The total partition size including uninitialised portions + uint32_t totalSizeVMem = + sizeVMem + wordAlign(sizeof(PLocalDeviceId) * numDevs); + // Check that total size is reasonable + uint32_t 
totalSizeSRAM = sizeTMem; + uint32_t totalSizeDRAM = 0; + if (mapVerticesToDRAM) totalSizeDRAM += totalSizeVMem; + else totalSizeSRAM += totalSizeVMem; + if (mapInEdgesToDRAM) totalSizeDRAM += sizeEIMem; + else totalSizeSRAM += sizeEIMem; + if (totalSizeDRAM > maxDRAMSize) { + printf("Error: max DRAM partition size exceeded\n"); + exit(EXIT_FAILURE); + } + if (totalSizeSRAM > maxSRAMSize) { + printf("Error: max SRAM partition size exceeded\n"); + exit(EXIT_FAILURE); + } + // Allocate space for the initialised portion of the partition + assert((sizeVMem%4) == 0); + assert((sizeTMem%4) == 0); + assert((sizeEIMem%4) == 0); + vertexMem[threadId] = (uint8_t*) calloc(sizeVMem, 1); + vertexMemSize[threadId] = sizeVMem; + threadMem[threadId] = (uint8_t*) calloc(sizeTMem, 1); + threadMemSize[threadId] = sizeTMem; + inEdgeMem[threadId] = (uint8_t*) calloc(sizeEIMem, 1); + inEdgeMemSize[threadId] = sizeEIMem; + // Tinsel address of base of partition + uint32_t partId = threadId & (TinselThreadsPerDRAM-1); + uint32_t sramBase = (1 << TinselLogBytesPerSRAM) + + (partId << TinselLogBytesPerSRAMPartition); + uint32_t dramBase = TinselBytesPerDRAM - + ((partId+1) << TinselLogBytesPerDRAMPartition); + // Use partition-interleaved region for DRAM + dramBase |= 0x80000000; + threadMemBase[threadId] = sramBase; + sramBase += threadMemSize[threadId]; + // Determine base addresses of each region + if (mapVerticesToDRAM) { + vertexMemBase[threadId] = dramBase; + dramBase += totalSizeVMem; + } + else { + vertexMemBase[threadId] = sramBase; + sramBase += totalSizeVMem; + } + if (mapInEdgesToDRAM) { + inEdgeMemBase[threadId] = dramBase; + dramBase += sizeEIMem; + } + else { + inEdgeMemBase[threadId] = sramBase; + sramBase += sizeEIMem; + } + } + } + + // Initialise partitions + void initialisePartitions() { + for (uint32_t threadId = 0; threadId < TinselMaxThreads; threadId++) { + // Next pointers for each partition + uint32_t nextVMem = 0; + // Pointer to thread structure + PThread* thread = + (PThread*) &threadMem[threadId][0]; + // Set number of devices on thread + thread->numDevices = numDevicesOnThread[threadId]; + // Set number of devices in graph + thread->numVertices = numDevices; + // Set tinsel address of array of device states + thread->devices = vertexMemBase[threadId]; + // Set tinsel address of base of in-edge table + thread->inTableBase = inEdgeMemBase[threadId]; + // Add space for each device on thread + uint32_t numDevs = numDevicesOnThread[threadId]; + for (uint32_t devNum = 0; devNum < numDevs; devNum++) { + PState* dev = (PState*) &vertexMem[threadId][nextVMem]; + PDeviceId id = fromDeviceAddr[threadId][devNum]; + devices[id] = dev; + // Add space for device + nextVMem = nextVMem + sizeof(PState); + } + // Initialise each device and the thread's out edges + for (uint32_t devNum = 0; devNum < numDevs; devNum++) { + PDeviceId id = fromDeviceAddr[threadId][devNum]; + PState* dev = devices[id]; + // Initialise + for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) { + dev->pin[p] = keyTable[id][p]; + } + } + // Intialise thread's in edges + PInEdge* inEdgeArray = (PInEdge*) inEdgeMem[threadId]; + Seq>* edges = inTable[threadId]; + if (edges) + for (uint32_t i = 0; i < edges->numElems; i++) { + inEdgeArray[i] = edges->elems[i]; + } + // At this point, check that next pointers line up with heap sizes + if (nextVMem != vertexMemSize[threadId]) { + printf("Error: vertex mem size does not match pre-computed size\n"); + exit(EXIT_FAILURE); + } + // Set tinsel address of senders array + thread->senders = 
vertexMemBase[threadId] + nextVMem; + } + } + + // Allocate mapping structures + void allocateMapping() { + devices = (PState**) calloc(numDevices, sizeof(PState*)); + toDeviceAddr = (PDeviceAddr*) calloc(numDevices, sizeof(PDeviceAddr)); + fromDeviceAddr = (PDeviceId**) calloc(TinselMaxThreads, sizeof(PDeviceId*)); + numDevicesOnThread = (uint32_t*) calloc(TinselMaxThreads, sizeof(uint32_t)); + } + + // Allocate thread edge input and output tables + // (Only valid after mapper is called) + void allocateInOutTables() { + // Receiver-side tables + inTable = (Seq>**) + calloc(TinselMaxThreads,sizeof(Seq>*)); + for (uint32_t t = 0; t < TinselMaxThreads; t++) { + if (numDevicesOnThread[t] != 0) + inTable[t] = new SmallSeq>; + } + + // Sender-side tables + outTable = (Seq***) + calloc(numDevices, sizeof(Seq**)); + for (uint32_t d = 0; d < numDevices; d++) { + outTable[d] = (Seq**) + calloc(POLITE_NUM_PINS, sizeof(Seq*)); + for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) + outTable[d][p] = new SmallSeq; + } + + keyTable = new uint32_t* [numDevices]; + for (uint32_t d = 0; d < numDevices; d++) + keyTable[d] = new uint32_t [POLITE_NUM_PINS]; + } + + // Compute thread edge input and output tables + // (Only valid after mapper is called) + void computeInOutTables() { + // For each device + for (uint32_t d = 0; d < numDevices; d++) { + // For each pin + for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) { + Seq* dests = graph.outgoing->elems[d]; + Seq* edges = edgeLabels.elems[d]; + for (uint32_t i = 0; i < dests->numElems; i++) { + PDeviceId destId = dests->elems[i]; + // Destination thread id + uint32_t threadId = getThreadId(toDeviceAddr[destId]); + // Thread-local device id + uint32_t devId = getLocalDeviceId(toDeviceAddr[destId]); + // Add edge to thread's input table + uint32_t edgeId = inTable[threadId]->numElems; + if (i < inTable[threadId]->numElems) { + PInEdge edge; + edge.edge = edges->elems[i]; + inTable[threadId]->append(edge); + } + // Add output table entry + PRoutingDest rdest; + rdest.kind = PRDestKindURM1; + rdest.mbox = threadId >> TinselLogThreadsPerMailbox; + rdest.urm1.key = devId | (edgeId << 16); + rdest.urm1.threadId = threadId & + ((1<append(rdest); + } + } + } + } + + // Release all structures + void releaseAll() { + if (devices != NULL) { + free(devices); + free(toDeviceAddr); + free(numDevicesOnThread); + for (uint32_t t = 0; t < TinselMaxThreads; t++) + if (fromDeviceAddr[t] != NULL) free(fromDeviceAddr[t]); + free(fromDeviceAddr); + for (uint32_t t = 0; t < TinselMaxThreads; t++) + if (vertexMem[t] != NULL) free(vertexMem[t]); + free(vertexMem); + free(vertexMemSize); + free(vertexMemBase); + for (uint32_t t = 0; t < TinselMaxThreads; t++) + if (threadMem[t] != NULL) free(threadMem[t]); + free(threadMem); + free(threadMemSize); + free(threadMemBase); + for (uint32_t t = 0; t < TinselMaxThreads; t++) + if (inEdgeMem[t] != NULL) free(inEdgeMem[t]); + free(inEdgeMem); + free(inEdgeMemSize); + free(inEdgeMemBase); + } + if (inTable != NULL) { + for (uint32_t t = 0; t < TinselMaxThreads; t++) + if (inTable[t] != NULL) delete inTable[t]; + free(inTable); + inTable = NULL; + } + if (outTable != NULL) { + for (uint32_t d = 0; d < numDevices; d++) { + if (outTable[d] == NULL) continue; + for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) + delete outTable[d][p]; + free(outTable[d]); + } + free(outTable); + outTable = NULL; + } + if (keyTable != NULL) { + for (uint32_t d = 0; d < numDevices; d++) delete [] keyTable[d]; + delete [] keyTable; + keyTable = NULL; + } + if (routingTables != 
NULL) delete routingTables; + } + + // Implement mapping to tinsel threads + void map() { + // Let's measure some times + struct timeval placementStart, placementFinish; + struct timeval routingStart, routingFinish; + struct timeval initStart, initFinish; + + // Release all mapping and heap structures + releaseAll(); + + // Reallocate mapping structures + allocateMapping(); + + // Start placement timer + gettimeofday(&placementStart, NULL); + + // Partition into subgraphs, one per board + Placer boards(&graph, numBoardsX, numBoardsY); + + // Place subgraphs onto 2D mesh + const uint32_t placerEffort = 8; + boards.place(placerEffort); + + // For each board + for (uint32_t boardY = 0; boardY < numBoardsY; boardY++) { + for (uint32_t boardX = 0; boardX < numBoardsX; boardX++) { + // Partition into subgraphs, one per mailbox + PartitionId b = boards.mapping[boardY][boardX]; + Placer boxes(&boards.subgraphs[b], + TinselMailboxMeshXLen, TinselMailboxMeshYLen); + boxes.place(placerEffort); + + // For each mailbox + for (uint32_t boxX = 0; boxX < TinselMailboxMeshXLen; boxX++) { + for (uint32_t boxY = 0; boxY < TinselMailboxMeshYLen; boxY++) { + // Partition into subgraphs, one per thread + uint32_t numThreads = 1<incoming->numElems; + numDevicesOnThread[threadId] = numDevs; + fromDeviceAddr[threadId] = (PDeviceId*) + malloc(sizeof(PDeviceId) * numDevs); + for (uint32_t devNum = 0; devNum < numDevs; devNum++) + fromDeviceAddr[threadId][devNum] = g->labels->elems[devNum]; + + // Populate toDeviceAddr mapping + assert(numDevs < maxLocalDeviceId()); + for (uint32_t devNum = 0; devNum < numDevs; devNum++) { + PDeviceAddr devAddr = + makeDeviceAddr(threadId, devNum); + toDeviceAddr[g->labels->elems[devNum]] = devAddr; + } + } + } + } + } + } + + // Stop placement timer and start In/Out table timer + gettimeofday(&placementFinish, NULL); + gettimeofday(&routingStart, NULL); + + // Compute send and receive side routing tables + allocateInOutTables(); + computeInOutTables(); + + // Compute per-board programmable routing tables + routingTables = new ProgRouterMesh(numBoardsX, numBoardsY); + for (uint32_t d = 0; d < numDevices; d++) { + uint32_t src = getThreadId(toDeviceAddr[d]) >> + TinselLogThreadsPerMailbox; + for (uint32_t p = 0; p < POLITE_NUM_PINS; p++) + keyTable[d][p] = routingTables->addDestsFromBoard(src, outTable[d][p]); + } + + // Stop routing timer and start init timer + gettimeofday(&routingFinish, NULL); + gettimeofday(&initStart, NULL); + + // Reallocate and initialise heap structures + allocatePartitions(); + initialisePartitions(); + + // Display times, if chatty + gettimeofday(&initFinish, NULL); + if (chatty > 0) { + struct timeval diff; + + timersub(&placementFinish, &placementStart, &diff); + double duration = (double) diff.tv_sec + + (double) diff.tv_usec / 1000000.0; + printf("POLite mapper profile:\n"); + printf(" Partitioning and placement: %lfs\n", duration); + + timersub(&routingFinish, &routingStart, &diff); + duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + printf(" In/Out table construction: %lfs\n", duration); + + timersub(&initFinish, &initStart, &diff); + duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + printf(" Thread state initialisation: %lfs\n", duration); + } + } + + // Constructor + PGraph() { + char* str = getenv("HOSTLINK_BOXES_X"); + int x = str ? atoi(str) : 1; + x = x * TinselMeshXLenWithinBox; + str = getenv("HOSTLINK_BOXES_Y"); + int y = str ? 
atoi(str) : 1; + y = y * TinselMeshYLenWithinBox; + constructor(x, y); + } + PGraph(uint32_t numBoxesX, uint32_t numBoxesY) { + int x = numBoxesX * TinselMeshXLenWithinBox; + int y = numBoxesY * TinselMeshYLenWithinBox; + constructor(x, y); + } + + // Deconstructor + ~PGraph() { + releaseAll(); + for (uint32_t i = 0; i < edgeLabels.numElems; i++) + delete edgeLabels.elems[i]; + } + + // Write partition to tinsel machine + void writeRAM(HostLink* hostLink, + uint8_t** heap, uint32_t* heapSize, uint32_t* heapBase) { + // Number of bytes written by each thread + uint32_t* writeCount = (uint32_t*) + calloc(TinselMaxThreads, sizeof(uint32_t)); + + // Number of threads completed by each core + uint32_t*** threadCount = (uint32_t***) + calloc(meshLenX, sizeof(uint32_t**)); + for (uint32_t x = 0; x < meshLenX; x++) { + threadCount[x] = (uint32_t**) + calloc(meshLenY, sizeof(uint32_t*)); + for (uint32_t y = 0; y < meshLenY; y++) + threadCount[x][y] = (uint32_t*) + calloc(TinselCoresPerBoard, sizeof(uint32_t)); + } + + // Initialise write addresses + for (int x = 0; x < meshLenX; x++) + for (int y = 0; y < meshLenY; y++) + for (int c = 0; c < TinselCoresPerBoard; c++) + hostLink->setAddr(x, y, c, heapBase[hostLink->toAddr(x, y, c, 0)]); + + // Write heaps + uint32_t done = false; + while (! done) { + done = true; + for (int x = 0; x < meshLenX; x++) { + for (int y = 0; y < meshLenY; y++) { + for (int c = 0; c < TinselCoresPerBoard; c++) { + uint32_t t = threadCount[x][y][c]; + if (t < TinselThreadsPerCore) { + done = false; + uint32_t threadId = hostLink->toAddr(x, y, c, t); + uint32_t written = writeCount[threadId]; + if (written == heapSize[threadId]) { + threadCount[x][y][c] = t+1; + if ((t+1) < TinselThreadsPerCore) + hostLink->setAddr(x, y, c, + heapBase[hostLink->toAddr(x, y, c, t+1)]); + } else { + uint32_t send = min((heapSize[threadId] - written)>>2, 15); + hostLink->store(x, y, c, send, + (uint32_t*) &heap[threadId][written]); + writeCount[threadId] = written + send * sizeof(uint32_t); + } + } + } + } + } + } + + // Release memory + free(writeCount); + for (uint32_t x = 0; x < meshLenX; x++) { + for (uint32_t y = 0; y < meshLenY; y++) + free(threadCount[x][y]); + free(threadCount[x]); + } + free(threadCount); + } + + // Write graph to tinsel machine + void write(HostLink* hostLink) { + // Start timer + struct timeval start, finish; + gettimeofday(&start, NULL); + + bool useSendBufferOld = hostLink->useSendBuffer; + hostLink->useSendBuffer = true; + writeRAM(hostLink, vertexMem, vertexMemSize, vertexMemBase); + writeRAM(hostLink, threadMem, threadMemSize, threadMemBase); + writeRAM(hostLink, inEdgeMem, inEdgeMemSize, inEdgeMemBase); + routingTables->write(hostLink); + hostLink->flush(); + hostLink->useSendBuffer = useSendBufferOld; + + // Display time if chatty + gettimeofday(&finish, NULL); + if (chatty > 0) { + struct timeval diff; + timersub(&finish, &start, &diff); + double duration = (double) diff.tv_sec + + (double) diff.tv_usec / 1000000.0; + printf("POLite graph upload time: %lfs\n", duration); + } + } + + // Determine fan-in of given device + uint32_t fanIn(PDeviceId id) { + return graph.fanIn(id); + } + + // Determine fan-out of given device + uint32_t fanOut(PDeviceId id) { + return graph.fanOut(id); + } + +}; + +// Read performance stats and store in file +inline void politeSaveStats(HostLink* hostLink, const char* filename) { + #ifdef POLITE_DUMP_STATS + // Open file for performance counters + FILE* statsFile = fopen(filename, "wt"); + if (statsFile == NULL) { + printf("Error 
creating stats file\n"); + exit(EXIT_FAILURE); + } + uint32_t meshLenX = hostLink->meshXLen; + uint32_t meshLenY = hostLink->meshYLen; + // Number of caches + uint32_t numLines = meshLenX * meshLenY * + TinselDCachesPerDRAM * TinselDRAMsPerBoard; + // Add on number of cores + numLines += meshLenX * meshLenY * TinselCoresPerBoard; + // Add on number of threads + #ifdef POLITE_COUNT_MSGS + numLines += meshLenX * meshLenY * TinselThreadsPerBoard; + #endif + hostLink->dumpStdOut(statsFile, numLines); + fclose(statsFile); + #endif +} + +#endif diff --git a/include/POLite/PGraph.h b/include/POLite/PGraph.h index 20cc4b5b..57b3172e 100644 --- a/include/POLite/PGraph.h +++ b/include/POLite/PGraph.h @@ -512,10 +512,11 @@ template append(edge); // Prepare for new output table entry dests.numElems = destsRemaining; diff --git a/include/POLite/Placer.h b/include/POLite/Placer.h index 32aec831..1468f0c7 100644 --- a/include/POLite/Placer.h +++ b/include/POLite/Placer.h @@ -10,6 +10,14 @@ typedef uint32_t PartitionId; // Partition and place a graph on a 2D mesh struct Placer { + // Select between different methods + enum Method { + Default, + Metis, + Random + }; + const Method defaultMethod=Metis; + // The graph being placed Graph* graph; @@ -41,8 +49,31 @@ struct Placer { uint32_t* yCoordSaved; uint64_t savedCost; + // Controls which strategy is used + Method method = Default; + + // Select placer method + void chooseMethod() + { + auto e = getenv("POLITE_PLACER"); + if (e) { + if (!strcmp(e, "metis")) + method=Metis; + else if (!strcmp(e, "random")) + method=Random; + else if (!strcmp(e, "default") || *e == '\0') + method=Default; + else { + fprintf(stderr, "Don't understand placer method : %s\n", e); + exit(EXIT_FAILURE); + } + } + if (method == Default) + method = defaultMethod; + } + // Partition the graph using Metis - void partition() { + void partitionMetis() { // Compute total number of edges uint32_t numEdges = 0; for (uint32_t i = 0; i < graph->incoming->numElems; i++) { @@ -116,6 +147,31 @@ struct Placer { free(parts); } + // Partition the graph randomly + void partitionRandom() { + uint32_t numVertices = graph->incoming->numElems; + uint32_t numParts = width * height; + + // Populate result array + srand(0); + for (uint32_t i = 0; i < numVertices; i++) { + partitions[i] = rand() % numParts; + } + } + + void partition() + { + switch(method){ + case Default: + case Metis: + partitionMetis(); + break; + case Random: + partitionRandom(); + break; + } + } + // Create subgraph for each partition void computeSubgraphs() { uint32_t numPartitions = width*height; @@ -316,6 +372,8 @@ struct Placer { yCoord = new uint32_t [width*height]; xCoordSaved = new uint32_t [width*height]; yCoordSaved = new uint32_t [width*height]; + // Pick a placement method, or select default + chooseMethod(); // Partition the graph using Metis partition(); // Compute subgraphs, one per partition diff --git a/include/POLite/ProgRouters.h b/include/POLite/ProgRouters.h index 90083802..a1c5942d 100644 --- a/include/POLite/ProgRouters.h +++ b/include/POLite/ProgRouters.h @@ -182,22 +182,58 @@ class ProgRouter { numChunks++; numRecords++; } + + // Add a URM1 record to the table + void addURM1(uint32_t mboxX, uint32_t mboxY, + uint32_t threadId, uint32_t key) { + if (numChunks == 5) nextBeat(); + uint8_t* ptr = currentRecord48(); + ptr[0] = key; + ptr[1] = key >> 8; + ptr[2] = key >> 16; + ptr[3] = key >> 24; + ptr[4] = (threadId << 3); + ptr[5] = (mboxY << 3) | (mboxX << 1) | (threadId >> 5); + numChunks++; + numRecords++; + } 
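+
+  // (For reference: addURM1 emits a 48-bit record, like addMRM above, with
+  // the 32-bit key in bytes 0-3 and the mailbox-local thread id packed into
+  // bytes 4 and 5 alongside the destination mailbox coordinates.)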
}; // ================================== // Data type for routing destinations // ================================== -struct PRoutingDest { - // Destination mailbox - uint32_t mbox; - // Thread-level routing key +enum PRoutingDestKind { PRDestKindURM1, PRDestKindMRM }; + +// URM1 routing destination +struct PRoutingDestURM1 { + // Mailbox-local thread + uint16_t threadId; + // Thread-local routing key + uint32_t key; +}; + +// MRM routing destination +struct PRoutingDestMRM { + // Thread-local routing key uint16_t key; // Destination threads uint32_t threadMaskLow; uint32_t threadMaskHigh; }; +// Routing destination +struct PRoutingDest { + PRoutingDestKind kind; + // Destination mailbox + uint32_t mbox; + // URM1 or MRM destination + union { + PRoutingDestURM1 urm1; + PRoutingDestMRM mrm; + }; +}; + // Extract board X coord from routing dest inline uint32_t destX(uint32_t mbox) { uint32_t x = mbox >> (TinselMailboxMeshXBits + TinselMailboxMeshYBits); @@ -288,9 +324,19 @@ class ProgRouterMesh { // Add local records for (int i = 0; i < local.numElems; i++) { PRoutingDest dest = local.elems[i]; - table[senderY][senderX].addMRM(destMboxX(dest.mbox), - destMboxY(dest.mbox), dest.threadMaskHigh, - dest.threadMaskLow, dest.key); + if (dest.kind == PRDestKindMRM) { + table[senderY][senderX].addMRM(destMboxX(dest.mbox), + destMboxY(dest.mbox), dest.mrm.threadMaskHigh, + dest.mrm.threadMaskLow, dest.mrm.key); + } + else if (dest.kind == PRDestKindURM1) { + table[senderY][senderX].addURM1(destMboxX(dest.mbox), + destMboxY(dest.mbox), dest.urm1.threadId, dest.urm1.key); + } + else { + fprintf(stderr, "ProgRouters.h: unknown routing record kind\n"); + exit(EXIT_FAILURE); + } } return table[senderY][senderX].genKey();